Index: third_party/re2/re2/prefilter.cc |
diff --git a/third_party/re2/re2/prefilter.cc b/third_party/re2/re2/prefilter.cc |
new file mode 100644 |
index 0000000000000000000000000000000000000000..30e4c01bd1002a5abe349ef421c2feaf0ca54988 |
--- /dev/null |
+++ b/third_party/re2/re2/prefilter.cc |
@@ -0,0 +1,671 @@ |
+// Copyright 2009 The RE2 Authors. All Rights Reserved. |
+// Use of this source code is governed by a BSD-style |
+// license that can be found in the LICENSE file. |
+ |
+#include "util/util.h" |
+#include "re2/prefilter.h" |
+#include "re2/re2.h" |
+#include "re2/unicode_casefold.h" |
+#include "re2/walker-inl.h" |
+ |
+namespace re2 { |
+ |
+static const int Trace = false; |
+ |
+typedef set<string>::iterator SSIter; |
+typedef set<string>::const_iterator ConstSSIter; |
+ |
+static int alloc_id = 100000; // Used for debugging. |
+// Initializes a Prefilter, allocating subs_ as necessary. |
+Prefilter::Prefilter(Op op) { |
+ op_ = op; |
+ subs_ = NULL; |
+ if (op_ == AND || op_ == OR) |
+ subs_ = new vector<Prefilter*>; |
+ |
+ alloc_id_ = alloc_id++; |
+ VLOG(10) << "alloc_id: " << alloc_id_; |
+} |
+ |
+// Destroys a Prefilter. |
+Prefilter::~Prefilter() { |
+ VLOG(10) << "Deleted: " << alloc_id_; |
+ if (subs_) { |
+ for (int i = 0; i < subs_->size(); i++) |
+ delete (*subs_)[i]; |
+ delete subs_; |
+ subs_ = NULL; |
+ } |
+} |
+ |
+// Simplify if the node is an empty Or or And. |
+Prefilter* Prefilter::Simplify() { |
+ if (op_ != AND && op_ != OR) { |
+ return this; |
+ } |
+ |
+ // Nothing left in the AND/OR. |
+ if (subs_->size() == 0) { |
+ if (op_ == AND) |
+ op_ = ALL; // AND of nothing is true |
+ else |
+ op_ = NONE; // OR of nothing is false |
+ |
+ return this; |
+ } |
+ |
+ // Just one subnode: throw away wrapper. |
+ if (subs_->size() == 1) { |
+ Prefilter* a = (*subs_)[0]; |
+ subs_->clear(); |
+ delete this; |
+ return a->Simplify(); |
+ } |
+ |
+ return this; |
+} |
+ |
+// Combines two Prefilters together to create an "op" (AND or OR). |
+// The passed Prefilters will be part of the returned Prefilter or deleted. |
+// Does lots of work to avoid creating unnecessarily complicated structures. |
+Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) { |
+ // If a, b can be rewritten as op, do so. |
+ a = a->Simplify(); |
+ b = b->Simplify(); |
+ |
+ // Canonicalize: a->op <= b->op. |
+ if (a->op() > b->op()) { |
+ Prefilter* t = a; |
+ a = b; |
+ b = t; |
+ } |
+ |
+ // Trivial cases. |
+ // ALL AND b = b |
+ // NONE OR b = b |
+ // ALL OR b = ALL |
+ // NONE AND b = NONE |
+ // Don't need to look at b, because of canonicalization above. |
+ // ALL and NONE are smallest opcodes. |
+ if (a->op() == ALL || a->op() == NONE) { |
+ if ((a->op() == ALL && op == AND) || |
+ (a->op() == NONE && op == OR)) { |
+ delete a; |
+ return b; |
+ } else { |
+ delete b; |
+ return a; |
+ } |
+ } |
+ |
+ // If a and b match op, merge their contents. |
+ if (a->op() == op && b->op() == op) { |
+ for (int i = 0; i < b->subs()->size(); i++) { |
+ Prefilter* bb = (*b->subs())[i]; |
+ a->subs()->push_back(bb); |
+ } |
+ b->subs()->clear(); |
+ delete b; |
+ return a; |
+ } |
+ |
+ // If a already has the same op as the op that is under construction |
+ // add in b (similarly if b already has the same op, add in a). |
+ if (b->op() == op) { |
+ Prefilter* t = a; |
+ a = b; |
+ b = t; |
+ } |
+ if (a->op() == op) { |
+ a->subs()->push_back(b); |
+ return a; |
+ } |
+ |
+ // Otherwise just return the op. |
+ Prefilter* c = new Prefilter(op); |
+ c->subs()->push_back(a); |
+ c->subs()->push_back(b); |
+ return c; |
+} |
+ |
+Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) { |
+ return AndOr(AND, a, b); |
+} |
+ |
+Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) { |
+ return AndOr(OR, a, b); |
+} |
+ |
+static void SimplifyStringSet(set<string> *ss) { |
+ // Now make sure that the strings aren't redundant. For example, if |
+ // we know "ab" is a required string, then it doesn't help at all to |
+ // know that "abc" is also a required string, so delete "abc". This |
+ // is because, when we are performing a string search to filter |
+ // regexps, matching ab will already allow this regexp to be a |
+ // candidate for match, so further matching abc is redundant. |
+ |
+ for (SSIter i = ss->begin(); i != ss->end(); ++i) { |
+ SSIter j = i; |
+ ++j; |
+ while (j != ss->end()) { |
+ // Increment j early so that we can erase the element it points to. |
+ SSIter old_j = j; |
+ ++j; |
+ if (old_j->find(*i) != string::npos) |
+ ss->erase(old_j); |
+ } |
+ } |
+} |
+ |
+Prefilter* Prefilter::OrStrings(set<string>* ss) { |
+ SimplifyStringSet(ss); |
+ Prefilter* or_prefilter = NULL; |
+ if (!ss->empty()) { |
+ or_prefilter = new Prefilter(NONE); |
+ for (SSIter i = ss->begin(); i != ss->end(); ++i) |
+ or_prefilter = Or(or_prefilter, FromString(*i)); |
+ } |
+ return or_prefilter; |
+} |
+ |
+static Rune ToLowerRune(Rune r) { |
+ if (r < Runeself) { |
+ if ('A' <= r && r <= 'Z') |
+ r += 'a' - 'A'; |
+ return r; |
+ } |
+ |
+ CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r); |
+ if (f == NULL || r < f->lo) |
+ return r; |
+ return ApplyFold(f, r); |
+} |
+ |
+Prefilter* Prefilter::FromString(const string& str) { |
+ Prefilter* m = new Prefilter(Prefilter::ATOM); |
+ m->atom_ = str; |
+ return m; |
+} |
+ |
+// Information about a regexp used during computation of Prefilter. |
+// Can be thought of as information about the set of strings matching |
+// the given regular expression. |
+class Prefilter::Info { |
+ public: |
+ Info(); |
+ ~Info(); |
+ |
+ // More constructors. They delete their Info* arguments. |
+ static Info* Alt(Info* a, Info* b); |
+ static Info* Concat(Info* a, Info* b); |
+ static Info* And(Info* a, Info* b); |
+ static Info* Star(Info* a); |
+ static Info* Plus(Info* a); |
+ static Info* Quest(Info* a); |
+ static Info* EmptyString(); |
+ static Info* NoMatch(); |
+ static Info* AnyChar(); |
+ static Info* CClass(CharClass* cc); |
+ static Info* Literal(Rune r); |
+ static Info* AnyMatch(); |
+ |
+ // Format Info as a string. |
+ string ToString(); |
+ |
+ // Caller takes ownership of the Prefilter. |
+ Prefilter* TakeMatch(); |
+ |
+ set<string>& exact() { return exact_; } |
+ |
+ bool is_exact() const { return is_exact_; } |
+ |
+ class Walker; |
+ |
+ private: |
+ set<string> exact_; |
+ |
+ // When is_exact_ is true, the strings that match |
+ // are placed in exact_. When it is no longer an exact |
+ // set of strings that match this RE, then is_exact_ |
+ // is false and the match_ contains the required match |
+ // criteria. |
+ bool is_exact_; |
+ |
+ // Accumulated Prefilter query that any |
+ // match for this regexp is guaranteed to match. |
+ Prefilter* match_; |
+}; |
+ |
+ |
+Prefilter::Info::Info() |
+ : is_exact_(false), |
+ match_(NULL) { |
+} |
+ |
+Prefilter::Info::~Info() { |
+ delete match_; |
+} |
+ |
+Prefilter* Prefilter::Info::TakeMatch() { |
+ if (is_exact_) { |
+ match_ = Prefilter::OrStrings(&exact_); |
+ is_exact_ = false; |
+ } |
+ Prefilter* m = match_; |
+ match_ = NULL; |
+ return m; |
+} |
+ |
+// Format a Info in string form. |
+string Prefilter::Info::ToString() { |
+ if (this == NULL) { |
+ // Sometimes when iterating on children of a node, |
+ // some children might have NULL Info. Adding |
+ // the check here for NULL to take care of cases where |
+ // the caller is not checking. |
+ return ""; |
+ } |
+ |
+ if (is_exact_) { |
+ int n = 0; |
+ string s; |
+ for (set<string>::iterator i = exact_.begin(); i != exact_.end(); ++i) { |
+ if (n++ > 0) |
+ s += ","; |
+ s += *i; |
+ } |
+ return s; |
+ } |
+ |
+ if (match_) |
+ return match_->DebugString(); |
+ |
+ return ""; |
+} |
+ |
+// Add the strings from src to dst. |
+static void CopyIn(const set<string>& src, set<string>* dst) { |
+ for (ConstSSIter i = src.begin(); i != src.end(); ++i) |
+ dst->insert(*i); |
+} |
+ |
+// Add the cross-product of a and b to dst. |
+// (For each string i in a and j in b, add i+j.) |
+static void CrossProduct(const set<string>& a, |
+ const set<string>& b, |
+ set<string>* dst) { |
+ for (ConstSSIter i = a.begin(); i != a.end(); ++i) |
+ for (ConstSSIter j = b.begin(); j != b.end(); ++j) |
+ dst->insert(*i + *j); |
+} |
+ |
+// Concats a and b. Requires that both are exact sets. |
+// Forms an exact set that is a crossproduct of a and b. |
+Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) { |
+ if (a == NULL) |
+ return b; |
+ DCHECK(a->is_exact_); |
+ DCHECK(b && b->is_exact_); |
+ Info *ab = new Info(); |
+ |
+ CrossProduct(a->exact_, b->exact_, &ab->exact_); |
+ ab->is_exact_ = true; |
+ |
+ delete a; |
+ delete b; |
+ return ab; |
+} |
+ |
+// Constructs an inexact Info for ab given a and b. |
+// Used only when a or b is not exact or when the |
+// exact cross product is likely to be too big. |
+Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) { |
+ if (a == NULL) |
+ return b; |
+ if (b == NULL) |
+ return a; |
+ |
+ Info *ab = new Info(); |
+ |
+ ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch()); |
+ ab->is_exact_ = false; |
+ delete a; |
+ delete b; |
+ return ab; |
+} |
+ |
+// Constructs Info for a|b given a and b. |
+Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) { |
+ Info *ab = new Info(); |
+ |
+ if (a->is_exact_ && b->is_exact_) { |
+ CopyIn(a->exact_, &ab->exact_); |
+ CopyIn(b->exact_, &ab->exact_); |
+ ab->is_exact_ = true; |
+ } else { |
+ // Either a or b has is_exact_ = false. If the other |
+ // one has is_exact_ = true, we move it to match_ and |
+ // then create a OR of a,b. The resulting Info has |
+ // is_exact_ = false. |
+ ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch()); |
+ ab->is_exact_ = false; |
+ } |
+ |
+ delete a; |
+ delete b; |
+ return ab; |
+} |
+ |
+// Constructs Info for a? given a. |
+Prefilter::Info* Prefilter::Info::Quest(Info *a) { |
+ Info *ab = new Info(); |
+ |
+ ab->is_exact_ = false; |
+ ab->match_ = new Prefilter(ALL); |
+ delete a; |
+ return ab; |
+} |
+ |
+// Constructs Info for a* given a. |
+// Same as a? -- not much to do. |
+Prefilter::Info* Prefilter::Info::Star(Info *a) { |
+ return Quest(a); |
+} |
+ |
+// Constructs Info for a+ given a. If a was exact set, it isn't |
+// anymore. |
+Prefilter::Info* Prefilter::Info::Plus(Info *a) { |
+ Info *ab = new Info(); |
+ |
+ ab->match_ = a->TakeMatch(); |
+ ab->is_exact_ = false; |
+ |
+ delete a; |
+ return ab; |
+} |
+ |
+static string RuneToString(Rune r) { |
+ char buf[UTFmax]; |
+ int n = runetochar(buf, &r); |
+ return string(buf, n); |
+} |
+ |
+// Constructs Info for literal rune. |
+Prefilter::Info* Prefilter::Info::Literal(Rune r) { |
+ Info* info = new Info(); |
+ info->exact_.insert(RuneToString(ToLowerRune(r))); |
+ info->is_exact_ = true; |
+ return info; |
+} |
+ |
+// Constructs Info for dot (any character). |
+Prefilter::Info* Prefilter::Info::AnyChar() { |
+ Prefilter::Info* info = new Prefilter::Info(); |
+ info->match_ = new Prefilter(ALL); |
+ return info; |
+} |
+ |
+// Constructs Prefilter::Info for no possible match. |
+Prefilter::Info* Prefilter::Info::NoMatch() { |
+ Prefilter::Info* info = new Prefilter::Info(); |
+ info->match_ = new Prefilter(NONE); |
+ return info; |
+} |
+ |
+// Constructs Prefilter::Info for any possible match. |
+// This Prefilter::Info is valid for any regular expression, |
+// since it makes no assertions whatsoever about the |
+// strings being matched. |
+Prefilter::Info* Prefilter::Info::AnyMatch() { |
+ Prefilter::Info *info = new Prefilter::Info(); |
+ info->match_ = new Prefilter(ALL); |
+ return info; |
+} |
+ |
+// Constructs Prefilter::Info for just the empty string. |
+Prefilter::Info* Prefilter::Info::EmptyString() { |
+ Prefilter::Info* info = new Prefilter::Info(); |
+ info->is_exact_ = true; |
+ info->exact_.insert(""); |
+ return info; |
+} |
+ |
+// Constructs Prefilter::Info for a character class. |
+typedef CharClass::iterator CCIter; |
+Prefilter::Info* Prefilter::Info::CClass(CharClass *cc) { |
+ if (Trace) { |
+ VLOG(0) << "CharClassInfo:"; |
+ for (CCIter i = cc->begin(); i != cc->end(); ++i) |
+ VLOG(0) << " " << i->lo << "-" << i->hi; |
+ } |
+ |
+ // If the class is too large, it's okay to overestimate. |
+ if (cc->size() > 10) |
+ return AnyChar(); |
+ |
+ Prefilter::Info *a = new Prefilter::Info(); |
+ for (CCIter i = cc->begin(); i != cc->end(); ++i) |
+ for (Rune r = i->lo; r <= i->hi; r++) |
+ a->exact_.insert(RuneToString(ToLowerRune(r))); |
+ |
+ a->is_exact_ = true; |
+ |
+ if (Trace) { |
+ VLOG(0) << " = " << a->ToString(); |
+ } |
+ |
+ return a; |
+} |
+ |
+class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> { |
+ public: |
+ Walker() {} |
+ |
+ virtual Info* PostVisit( |
+ Regexp* re, Info* parent_arg, |
+ Info* pre_arg, |
+ Info** child_args, int nchild_args); |
+ |
+ virtual Info* ShortVisit( |
+ Regexp* re, |
+ Info* parent_arg); |
+ |
+ private: |
+ DISALLOW_EVIL_CONSTRUCTORS(Walker); |
+}; |
+ |
+Prefilter::Info* Prefilter::BuildInfo(Regexp* re) { |
+ if (Trace) { |
+ LOG(INFO) << "BuildPrefilter::Info: " << re->ToString(); |
+ } |
+ Prefilter::Info::Walker w; |
+ Prefilter::Info* info = w.WalkExponential(re, NULL, 100000); |
+ |
+ if (w.stopped_early()) { |
+ delete info; |
+ return NULL; |
+ } |
+ |
+ return info; |
+} |
+ |
+Prefilter::Info* Prefilter::Info::Walker::ShortVisit( |
+ Regexp* re, Prefilter::Info* parent_arg) { |
+ return AnyMatch(); |
+} |
+ |
+// Constructs the Prefilter::Info for the given regular expression. |
+// Assumes re is simplified. |
+Prefilter::Info* Prefilter::Info::Walker::PostVisit( |
+ Regexp* re, Prefilter::Info* parent_arg, |
+ Prefilter::Info* pre_arg, Prefilter::Info** child_args, |
+ int nchild_args) { |
+ Prefilter::Info *info; |
+ switch (re->op()) { |
+ default: |
+ case kRegexpRepeat: |
+ LOG(DFATAL) << "Bad regexp op " << re->op(); |
+ info = EmptyString(); |
+ break; |
+ |
+ case kRegexpNoMatch: |
+ info = NoMatch(); |
+ break; |
+ |
+ // These ops match the empty string: |
+ case kRegexpEmptyMatch: // anywhere |
+ case kRegexpBeginLine: // at beginning of line |
+ case kRegexpEndLine: // at end of line |
+ case kRegexpBeginText: // at beginning of text |
+ case kRegexpEndText: // at end of text |
+ case kRegexpWordBoundary: // at word boundary |
+ case kRegexpNoWordBoundary: // not at word boundary |
+ info = EmptyString(); |
+ break; |
+ |
+ case kRegexpLiteral: |
+ info = Literal(re->rune()); |
+ break; |
+ |
+ case kRegexpLiteralString: |
+ if (re->nrunes() == 0) { |
+ info = NoMatch(); |
+ break; |
+ } |
+ info = Literal(re->runes()[0]); |
+ for (int i = 1; i < re->nrunes(); i++) |
+ info = Concat(info, Literal(re->runes()[i])); |
+ break; |
+ |
+ case kRegexpConcat: { |
+ // Accumulate in info. |
+ // Exact is concat of recent contiguous exact nodes. |
+ info = NULL; |
+ Info* exact = NULL; |
+ for (int i = 0; i < nchild_args; i++) { |
+ Info* ci = child_args[i]; // child info |
+ if (!ci->is_exact() || |
+ (exact && ci->exact().size() * exact->exact().size() > 16)) { |
+ // Exact run is over. |
+ info = And(info, exact); |
+ exact = NULL; |
+ // Add this child's info. |
+ info = And(info, ci); |
+ } else { |
+ // Append to exact run. |
+ exact = Concat(exact, ci); |
+ } |
+ } |
+ info = And(info, exact); |
+ } |
+ break; |
+ |
+ case kRegexpAlternate: |
+ info = child_args[0]; |
+ for (int i = 1; i < nchild_args; i++) |
+ info = Alt(info, child_args[i]); |
+ VLOG(10) << "Alt: " << info->ToString(); |
+ break; |
+ |
+ case kRegexpStar: |
+ info = Star(child_args[0]); |
+ break; |
+ |
+ case kRegexpQuest: |
+ info = Quest(child_args[0]); |
+ break; |
+ |
+ case kRegexpPlus: |
+ info = Plus(child_args[0]); |
+ break; |
+ |
+ case kRegexpAnyChar: |
+ // Claim nothing, except that it's not empty. |
+ info = AnyChar(); |
+ break; |
+ |
+ case kRegexpCharClass: |
+ info = CClass(re->cc()); |
+ break; |
+ |
+ case kRegexpCapture: |
+ // These don't affect the set of matching strings. |
+ info = child_args[0]; |
+ break; |
+ } |
+ |
+ if (Trace) { |
+ VLOG(0) << "BuildInfo " << re->ToString() |
+ << ": " << info->ToString(); |
+ } |
+ |
+ return info; |
+} |
+ |
+ |
+Prefilter* Prefilter::FromRegexp(Regexp* re) { |
+ if (re == NULL) |
+ return NULL; |
+ |
+ Regexp* simple = re->Simplify(); |
+ Prefilter::Info *info = BuildInfo(simple); |
+ |
+ simple->Decref(); |
+ if (info == NULL) |
+ return NULL; |
+ |
+ Prefilter* m = info->TakeMatch(); |
+ |
+ delete info; |
+ return m; |
+} |
+ |
+string Prefilter::DebugString() const { |
+ if (this == NULL) |
+ return "<nil>"; |
+ |
+ switch (op_) { |
+ default: |
+ LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_; |
+ return StringPrintf("op%d", op_); |
+ case NONE: |
+ return "*no-matches*"; |
+ case ATOM: |
+ return atom_; |
+ case ALL: |
+ return ""; |
+ case AND: { |
+ string s = ""; |
+ for (int i = 0; i < subs_->size(); i++) { |
+ if (i > 0) |
+ s += " "; |
+ s += (*subs_)[i]->DebugString(); |
+ } |
+ return s; |
+ } |
+ case OR: { |
+ string s = "("; |
+ for (int i = 0; i < subs_->size(); i++) { |
+ if (i > 0) |
+ s += "|"; |
+ s += (*subs_)[i]->DebugString(); |
+ } |
+ s += ")"; |
+ return s; |
+ } |
+ } |
+} |
+ |
+Prefilter* Prefilter::FromRE2(const RE2* re2) { |
+ if (re2 == NULL) |
+ return NULL; |
+ |
+ Regexp* regexp = re2->Regexp(); |
+ if (regexp == NULL) |
+ return NULL; |
+ |
+ return FromRegexp(regexp); |
+} |
+ |
+ |
+} // namespace re2 |