| Index: third_party/re2/re2/prefilter_tree.h
|
| diff --git a/third_party/re2/re2/prefilter_tree.h b/third_party/re2/re2/prefilter_tree.h
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..596b734ff6a5f05f84780ce693089351bcc7e92b
|
| --- /dev/null
|
| +++ b/third_party/re2/re2/prefilter_tree.h
|
| @@ -0,0 +1,130 @@
|
| +// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
| +// Use of this source code is governed by a BSD-style
|
| +// license that can be found in the LICENSE file.
|
| +
|
| +// The PrefilterTree class is used to form an AND-OR tree of strings
|
| +// that would trigger each regexp. The 'prefilter' of each regexp is
|
| +// added tp PrefilterTree, and then PrefilterTree is used to find all
|
| +// the unique strings across the prefilters. During search, by using
|
| +// matches from a string matching engine, PrefilterTree deduces the
|
| +// set of regexps that are to be triggered. The 'string matching
|
| +// engine' itself is outside of this class, and the caller can use any
|
| +// favorite engine. PrefilterTree provides a set of strings (called
|
| +// atoms) that the user of this class should use to do the string
|
| +// matching.
|
| +//
|
| +#ifndef RE2_PREFILTER_TREE_H_
|
| +#define RE2_PREFILTER_TREE_H_
|
| +
|
| +#include "util/util.h"
|
| +#include "util/sparse_array.h"
|
| +
|
| +namespace re2 {
|
| +
|
| +typedef SparseArray<int> IntMap;
|
| +
|
| +class Prefilter;
|
| +
|
| +class PrefilterTree {
|
| + public:
|
| + PrefilterTree();
|
| + ~PrefilterTree();
|
| +
|
| + // Adds the prefilter for the next regexp. Note that we assume that
|
| + // Add called sequentially for all regexps. All Add calls
|
| + // must precede Compile.
|
| + void Add(Prefilter* prefilter);
|
| +
|
| + // The Compile returns a vector of string in atom_vec.
|
| + // Call this after all the prefilters are added through Add.
|
| + // No calls to Add after Compile are allowed.
|
| + // The caller should use the returned set of strings to do string matching.
|
| + // Each time a string matches, the corresponding index then has to be
|
| + // and passed to RegexpsGivenStrings below.
|
| + void Compile(vector<string>* atom_vec);
|
| +
|
| + // Given the indices of the atoms that matched, returns the indexes
|
| + // of regexps that should be searched. The matched_atoms should
|
| + // contain all the ids of string atoms that were found to match the
|
| + // content. The caller can use any string match engine to perform
|
| + // this function. This function is thread safe.
|
| + void RegexpsGivenStrings(const vector<int>& matched_atoms,
|
| + vector<int>* regexps) const;
|
| +
|
| + // Print debug prefilter. Also prints unique ids associated with
|
| + // nodes of the prefilter of the regexp.
|
| + void PrintPrefilter(int regexpid);
|
| +
|
| +
|
| + // Each unique node has a corresponding Entry that helps in
|
| + // passing the matching trigger information along the tree.
|
| + struct Entry {
|
| + public:
|
| + // How many children should match before this node triggers the
|
| + // parent. For an atom and an OR node, this is 1 and for an AND
|
| + // node, it is the number of unique children.
|
| + int propagate_up_at_count;
|
| +
|
| + // When this node is ready to trigger the parent, what are the indices
|
| + // of the parent nodes to trigger. The reason there may be more than
|
| + // one is because of sharing. For example (abc | def) and (xyz | def)
|
| + // are two different nodes, but they share the atom 'def'. So when
|
| + // 'def' matches, it triggers two parents, corresponding to the two
|
| + // different OR nodes.
|
| + IntMap* parents;
|
| +
|
| + // When this node is ready to trigger the parent, what are the
|
| + // regexps that are triggered.
|
| + vector<int> regexps;
|
| + };
|
| +
|
| + private:
|
| + // This function assigns unique ids to various parts of the
|
| + // prefilter, by looking at if these nodes are already in the
|
| + // PrefilterTree.
|
| + void AssignUniqueIds(vector<string>* atom_vec);
|
| +
|
| + // Given the matching atoms, find the regexps to be triggered.
|
| + void PropagateMatch(const vector<int>& atom_ids,
|
| + IntMap* regexps) const;
|
| +
|
| + // Returns the prefilter node that has the same NodeString as this
|
| + // node. For the canonical node, returns node.
|
| + Prefilter* CanonicalNode(Prefilter* node);
|
| +
|
| + // A string that uniquely identifies the node. Assumes that the
|
| + // children of node has already been assigned unique ids.
|
| + string NodeString(Prefilter* node) const;
|
| +
|
| + // Recursively constructs a readable prefilter string.
|
| + string DebugNodeString(Prefilter* node) const;
|
| +
|
| + // Used for debugging.
|
| + void PrintDebugInfo();
|
| +
|
| + // These are all the nodes formed by Compile. Essentially, there is
|
| + // one node for each unique atom and each unique AND/OR node.
|
| + vector<Entry> entries_;
|
| +
|
| + // Map node string to canonical Prefilter node.
|
| + map<string, Prefilter*> node_map_;
|
| +
|
| + // indices of regexps that always pass through the filter (since we
|
| + // found no required literals in these regexps).
|
| + vector<int> unfiltered_;
|
| +
|
| + // vector of Prefilter for all regexps.
|
| + vector<Prefilter*> prefilter_vec_;
|
| +
|
| + // Atom index in returned strings to entry id mapping.
|
| + vector<int> atom_index_to_id_;
|
| +
|
| + // Has the prefilter tree been compiled.
|
| + bool compiled_;
|
| +
|
| + DISALLOW_EVIL_CONSTRUCTORS(PrefilterTree);
|
| +};
|
| +
|
| +} // namespace
|
| +
|
| +#endif // RE2_PREFILTER_TREE_H_
|
|
|