Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1575)

Unified Diff: third_party/re2/re2/prefilter.cc

Issue 10575037: Include RE2 library (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Less intrusive fix for Android Created 8 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « third_party/re2/re2/prefilter.h ('k') | third_party/re2/re2/prefilter_tree.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: third_party/re2/re2/prefilter.cc
diff --git a/third_party/re2/re2/prefilter.cc b/third_party/re2/re2/prefilter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..30e4c01bd1002a5abe349ef421c2feaf0ca54988
--- /dev/null
+++ b/third_party/re2/re2/prefilter.cc
@@ -0,0 +1,671 @@
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "util/util.h"
+#include "re2/prefilter.h"
+#include "re2/re2.h"
+#include "re2/unicode_casefold.h"
+#include "re2/walker-inl.h"
+
+namespace re2 {
+
+static const int Trace = false;
+
+typedef set<string>::iterator SSIter;
+typedef set<string>::const_iterator ConstSSIter;
+
+static int alloc_id = 100000; // Used for debugging.
+// Initializes a Prefilter, allocating subs_ as necessary.
+Prefilter::Prefilter(Op op) {
+ op_ = op;
+ subs_ = NULL;
+ if (op_ == AND || op_ == OR)
+ subs_ = new vector<Prefilter*>;
+
+ alloc_id_ = alloc_id++;
+ VLOG(10) << "alloc_id: " << alloc_id_;
+}
+
+// Destroys a Prefilter.
+Prefilter::~Prefilter() {
+ VLOG(10) << "Deleted: " << alloc_id_;
+ if (subs_) {
+ for (int i = 0; i < subs_->size(); i++)
+ delete (*subs_)[i];
+ delete subs_;
+ subs_ = NULL;
+ }
+}
+
+// Simplify if the node is an empty Or or And.
+Prefilter* Prefilter::Simplify() {
+ if (op_ != AND && op_ != OR) {
+ return this;
+ }
+
+ // Nothing left in the AND/OR.
+ if (subs_->size() == 0) {
+ if (op_ == AND)
+ op_ = ALL; // AND of nothing is true
+ else
+ op_ = NONE; // OR of nothing is false
+
+ return this;
+ }
+
+ // Just one subnode: throw away wrapper.
+ if (subs_->size() == 1) {
+ Prefilter* a = (*subs_)[0];
+ subs_->clear();
+ delete this;
+ return a->Simplify();
+ }
+
+ return this;
+}
+
+// Combines two Prefilters together to create an "op" (AND or OR).
+// The passed Prefilters will be part of the returned Prefilter or deleted.
+// Does lots of work to avoid creating unnecessarily complicated structures.
+Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
+ // If a, b can be rewritten as op, do so.
+ a = a->Simplify();
+ b = b->Simplify();
+
+ // Canonicalize: a->op <= b->op.
+ if (a->op() > b->op()) {
+ Prefilter* t = a;
+ a = b;
+ b = t;
+ }
+
+ // Trivial cases.
+ // ALL AND b = b
+ // NONE OR b = b
+ // ALL OR b = ALL
+ // NONE AND b = NONE
+ // Don't need to look at b, because of canonicalization above.
+ // ALL and NONE are smallest opcodes.
+ if (a->op() == ALL || a->op() == NONE) {
+ if ((a->op() == ALL && op == AND) ||
+ (a->op() == NONE && op == OR)) {
+ delete a;
+ return b;
+ } else {
+ delete b;
+ return a;
+ }
+ }
+
+ // If a and b match op, merge their contents.
+ if (a->op() == op && b->op() == op) {
+ for (int i = 0; i < b->subs()->size(); i++) {
+ Prefilter* bb = (*b->subs())[i];
+ a->subs()->push_back(bb);
+ }
+ b->subs()->clear();
+ delete b;
+ return a;
+ }
+
+ // If a already has the same op as the op that is under construction
+ // add in b (similarly if b already has the same op, add in a).
+ if (b->op() == op) {
+ Prefilter* t = a;
+ a = b;
+ b = t;
+ }
+ if (a->op() == op) {
+ a->subs()->push_back(b);
+ return a;
+ }
+
+ // Otherwise just return the op.
+ Prefilter* c = new Prefilter(op);
+ c->subs()->push_back(a);
+ c->subs()->push_back(b);
+ return c;
+}
+
+Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
+ return AndOr(AND, a, b);
+}
+
+Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
+ return AndOr(OR, a, b);
+}
+
+static void SimplifyStringSet(set<string> *ss) {
+ // Now make sure that the strings aren't redundant. For example, if
+ // we know "ab" is a required string, then it doesn't help at all to
+ // know that "abc" is also a required string, so delete "abc". This
+ // is because, when we are performing a string search to filter
+ // regexps, matching ab will already allow this regexp to be a
+ // candidate for match, so further matching abc is redundant.
+
+ for (SSIter i = ss->begin(); i != ss->end(); ++i) {
+ SSIter j = i;
+ ++j;
+ while (j != ss->end()) {
+ // Increment j early so that we can erase the element it points to.
+ SSIter old_j = j;
+ ++j;
+ if (old_j->find(*i) != string::npos)
+ ss->erase(old_j);
+ }
+ }
+}
+
+Prefilter* Prefilter::OrStrings(set<string>* ss) {
+ SimplifyStringSet(ss);
+ Prefilter* or_prefilter = NULL;
+ if (!ss->empty()) {
+ or_prefilter = new Prefilter(NONE);
+ for (SSIter i = ss->begin(); i != ss->end(); ++i)
+ or_prefilter = Or(or_prefilter, FromString(*i));
+ }
+ return or_prefilter;
+}
+
+static Rune ToLowerRune(Rune r) {
+ if (r < Runeself) {
+ if ('A' <= r && r <= 'Z')
+ r += 'a' - 'A';
+ return r;
+ }
+
+ CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
+ if (f == NULL || r < f->lo)
+ return r;
+ return ApplyFold(f, r);
+}
+
+Prefilter* Prefilter::FromString(const string& str) {
+ Prefilter* m = new Prefilter(Prefilter::ATOM);
+ m->atom_ = str;
+ return m;
+}
+
+// Information about a regexp used during computation of Prefilter.
+// Can be thought of as information about the set of strings matching
+// the given regular expression.
+class Prefilter::Info {
+ public:
+ Info();
+ ~Info();
+
+ // More constructors. They delete their Info* arguments.
+ static Info* Alt(Info* a, Info* b);
+ static Info* Concat(Info* a, Info* b);
+ static Info* And(Info* a, Info* b);
+ static Info* Star(Info* a);
+ static Info* Plus(Info* a);
+ static Info* Quest(Info* a);
+ static Info* EmptyString();
+ static Info* NoMatch();
+ static Info* AnyChar();
+ static Info* CClass(CharClass* cc);
+ static Info* Literal(Rune r);
+ static Info* AnyMatch();
+
+ // Format Info as a string.
+ string ToString();
+
+ // Caller takes ownership of the Prefilter.
+ Prefilter* TakeMatch();
+
+ set<string>& exact() { return exact_; }
+
+ bool is_exact() const { return is_exact_; }
+
+ class Walker;
+
+ private:
+ set<string> exact_;
+
+ // When is_exact_ is true, the strings that match
+ // are placed in exact_. When it is no longer an exact
+ // set of strings that match this RE, then is_exact_
+ // is false and the match_ contains the required match
+ // criteria.
+ bool is_exact_;
+
+ // Accumulated Prefilter query that any
+ // match for this regexp is guaranteed to match.
+ Prefilter* match_;
+};
+
+
+Prefilter::Info::Info()
+ : is_exact_(false),
+ match_(NULL) {
+}
+
+Prefilter::Info::~Info() {
+ delete match_;
+}
+
+Prefilter* Prefilter::Info::TakeMatch() {
+ if (is_exact_) {
+ match_ = Prefilter::OrStrings(&exact_);
+ is_exact_ = false;
+ }
+ Prefilter* m = match_;
+ match_ = NULL;
+ return m;
+}
+
+// Format a Info in string form.
+string Prefilter::Info::ToString() {
+ if (this == NULL) {
+ // Sometimes when iterating on children of a node,
+ // some children might have NULL Info. Adding
+ // the check here for NULL to take care of cases where
+ // the caller is not checking.
+ return "";
+ }
+
+ if (is_exact_) {
+ int n = 0;
+ string s;
+ for (set<string>::iterator i = exact_.begin(); i != exact_.end(); ++i) {
+ if (n++ > 0)
+ s += ",";
+ s += *i;
+ }
+ return s;
+ }
+
+ if (match_)
+ return match_->DebugString();
+
+ return "";
+}
+
+// Add the strings from src to dst.
+static void CopyIn(const set<string>& src, set<string>* dst) {
+ for (ConstSSIter i = src.begin(); i != src.end(); ++i)
+ dst->insert(*i);
+}
+
+// Add the cross-product of a and b to dst.
+// (For each string i in a and j in b, add i+j.)
+static void CrossProduct(const set<string>& a,
+ const set<string>& b,
+ set<string>* dst) {
+ for (ConstSSIter i = a.begin(); i != a.end(); ++i)
+ for (ConstSSIter j = b.begin(); j != b.end(); ++j)
+ dst->insert(*i + *j);
+}
+
+// Concats a and b. Requires that both are exact sets.
+// Forms an exact set that is a crossproduct of a and b.
+Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
+ if (a == NULL)
+ return b;
+ DCHECK(a->is_exact_);
+ DCHECK(b && b->is_exact_);
+ Info *ab = new Info();
+
+ CrossProduct(a->exact_, b->exact_, &ab->exact_);
+ ab->is_exact_ = true;
+
+ delete a;
+ delete b;
+ return ab;
+}
+
+// Constructs an inexact Info for ab given a and b.
+// Used only when a or b is not exact or when the
+// exact cross product is likely to be too big.
+Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
+ if (a == NULL)
+ return b;
+ if (b == NULL)
+ return a;
+
+ Info *ab = new Info();
+
+ ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
+ ab->is_exact_ = false;
+ delete a;
+ delete b;
+ return ab;
+}
+
+// Constructs Info for a|b given a and b.
+Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
+ Info *ab = new Info();
+
+ if (a->is_exact_ && b->is_exact_) {
+ CopyIn(a->exact_, &ab->exact_);
+ CopyIn(b->exact_, &ab->exact_);
+ ab->is_exact_ = true;
+ } else {
+ // Either a or b has is_exact_ = false. If the other
+ // one has is_exact_ = true, we move it to match_ and
+ // then create a OR of a,b. The resulting Info has
+ // is_exact_ = false.
+ ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
+ ab->is_exact_ = false;
+ }
+
+ delete a;
+ delete b;
+ return ab;
+}
+
+// Constructs Info for a? given a.
+Prefilter::Info* Prefilter::Info::Quest(Info *a) {
+ Info *ab = new Info();
+
+ ab->is_exact_ = false;
+ ab->match_ = new Prefilter(ALL);
+ delete a;
+ return ab;
+}
+
+// Constructs Info for a* given a.
+// Same as a? -- not much to do.
+Prefilter::Info* Prefilter::Info::Star(Info *a) {
+ return Quest(a);
+}
+
+// Constructs Info for a+ given a. If a was exact set, it isn't
+// anymore.
+Prefilter::Info* Prefilter::Info::Plus(Info *a) {
+ Info *ab = new Info();
+
+ ab->match_ = a->TakeMatch();
+ ab->is_exact_ = false;
+
+ delete a;
+ return ab;
+}
+
+static string RuneToString(Rune r) {
+ char buf[UTFmax];
+ int n = runetochar(buf, &r);
+ return string(buf, n);
+}
+
+// Constructs Info for literal rune.
+Prefilter::Info* Prefilter::Info::Literal(Rune r) {
+ Info* info = new Info();
+ info->exact_.insert(RuneToString(ToLowerRune(r)));
+ info->is_exact_ = true;
+ return info;
+}
+
+// Constructs Info for dot (any character).
+Prefilter::Info* Prefilter::Info::AnyChar() {
+ Prefilter::Info* info = new Prefilter::Info();
+ info->match_ = new Prefilter(ALL);
+ return info;
+}
+
+// Constructs Prefilter::Info for no possible match.
+Prefilter::Info* Prefilter::Info::NoMatch() {
+ Prefilter::Info* info = new Prefilter::Info();
+ info->match_ = new Prefilter(NONE);
+ return info;
+}
+
+// Constructs Prefilter::Info for any possible match.
+// This Prefilter::Info is valid for any regular expression,
+// since it makes no assertions whatsoever about the
+// strings being matched.
+Prefilter::Info* Prefilter::Info::AnyMatch() {
+ Prefilter::Info *info = new Prefilter::Info();
+ info->match_ = new Prefilter(ALL);
+ return info;
+}
+
+// Constructs Prefilter::Info for just the empty string.
+Prefilter::Info* Prefilter::Info::EmptyString() {
+ Prefilter::Info* info = new Prefilter::Info();
+ info->is_exact_ = true;
+ info->exact_.insert("");
+ return info;
+}
+
+// Constructs Prefilter::Info for a character class.
+typedef CharClass::iterator CCIter;
+Prefilter::Info* Prefilter::Info::CClass(CharClass *cc) {
+ if (Trace) {
+ VLOG(0) << "CharClassInfo:";
+ for (CCIter i = cc->begin(); i != cc->end(); ++i)
+ VLOG(0) << " " << i->lo << "-" << i->hi;
+ }
+
+ // If the class is too large, it's okay to overestimate.
+ if (cc->size() > 10)
+ return AnyChar();
+
+ Prefilter::Info *a = new Prefilter::Info();
+ for (CCIter i = cc->begin(); i != cc->end(); ++i)
+ for (Rune r = i->lo; r <= i->hi; r++)
+ a->exact_.insert(RuneToString(ToLowerRune(r)));
+
+ a->is_exact_ = true;
+
+ if (Trace) {
+ VLOG(0) << " = " << a->ToString();
+ }
+
+ return a;
+}
+
+class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
+ public:
+ Walker() {}
+
+ virtual Info* PostVisit(
+ Regexp* re, Info* parent_arg,
+ Info* pre_arg,
+ Info** child_args, int nchild_args);
+
+ virtual Info* ShortVisit(
+ Regexp* re,
+ Info* parent_arg);
+
+ private:
+ DISALLOW_EVIL_CONSTRUCTORS(Walker);
+};
+
+Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
+ if (Trace) {
+ LOG(INFO) << "BuildPrefilter::Info: " << re->ToString();
+ }
+ Prefilter::Info::Walker w;
+ Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
+
+ if (w.stopped_early()) {
+ delete info;
+ return NULL;
+ }
+
+ return info;
+}
+
+Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
+ Regexp* re, Prefilter::Info* parent_arg) {
+ return AnyMatch();
+}
+
+// Constructs the Prefilter::Info for the given regular expression.
+// Assumes re is simplified.
+Prefilter::Info* Prefilter::Info::Walker::PostVisit(
+ Regexp* re, Prefilter::Info* parent_arg,
+ Prefilter::Info* pre_arg, Prefilter::Info** child_args,
+ int nchild_args) {
+ Prefilter::Info *info;
+ switch (re->op()) {
+ default:
+ case kRegexpRepeat:
+ LOG(DFATAL) << "Bad regexp op " << re->op();
+ info = EmptyString();
+ break;
+
+ case kRegexpNoMatch:
+ info = NoMatch();
+ break;
+
+ // These ops match the empty string:
+ case kRegexpEmptyMatch: // anywhere
+ case kRegexpBeginLine: // at beginning of line
+ case kRegexpEndLine: // at end of line
+ case kRegexpBeginText: // at beginning of text
+ case kRegexpEndText: // at end of text
+ case kRegexpWordBoundary: // at word boundary
+ case kRegexpNoWordBoundary: // not at word boundary
+ info = EmptyString();
+ break;
+
+ case kRegexpLiteral:
+ info = Literal(re->rune());
+ break;
+
+ case kRegexpLiteralString:
+ if (re->nrunes() == 0) {
+ info = NoMatch();
+ break;
+ }
+ info = Literal(re->runes()[0]);
+ for (int i = 1; i < re->nrunes(); i++)
+ info = Concat(info, Literal(re->runes()[i]));
+ break;
+
+ case kRegexpConcat: {
+ // Accumulate in info.
+ // Exact is concat of recent contiguous exact nodes.
+ info = NULL;
+ Info* exact = NULL;
+ for (int i = 0; i < nchild_args; i++) {
+ Info* ci = child_args[i]; // child info
+ if (!ci->is_exact() ||
+ (exact && ci->exact().size() * exact->exact().size() > 16)) {
+ // Exact run is over.
+ info = And(info, exact);
+ exact = NULL;
+ // Add this child's info.
+ info = And(info, ci);
+ } else {
+ // Append to exact run.
+ exact = Concat(exact, ci);
+ }
+ }
+ info = And(info, exact);
+ }
+ break;
+
+ case kRegexpAlternate:
+ info = child_args[0];
+ for (int i = 1; i < nchild_args; i++)
+ info = Alt(info, child_args[i]);
+ VLOG(10) << "Alt: " << info->ToString();
+ break;
+
+ case kRegexpStar:
+ info = Star(child_args[0]);
+ break;
+
+ case kRegexpQuest:
+ info = Quest(child_args[0]);
+ break;
+
+ case kRegexpPlus:
+ info = Plus(child_args[0]);
+ break;
+
+ case kRegexpAnyChar:
+ // Claim nothing, except that it's not empty.
+ info = AnyChar();
+ break;
+
+ case kRegexpCharClass:
+ info = CClass(re->cc());
+ break;
+
+ case kRegexpCapture:
+ // These don't affect the set of matching strings.
+ info = child_args[0];
+ break;
+ }
+
+ if (Trace) {
+ VLOG(0) << "BuildInfo " << re->ToString()
+ << ": " << info->ToString();
+ }
+
+ return info;
+}
+
+
+Prefilter* Prefilter::FromRegexp(Regexp* re) {
+ if (re == NULL)
+ return NULL;
+
+ Regexp* simple = re->Simplify();
+ Prefilter::Info *info = BuildInfo(simple);
+
+ simple->Decref();
+ if (info == NULL)
+ return NULL;
+
+ Prefilter* m = info->TakeMatch();
+
+ delete info;
+ return m;
+}
+
+string Prefilter::DebugString() const {
+ if (this == NULL)
+ return "<nil>";
+
+ switch (op_) {
+ default:
+ LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
+ return StringPrintf("op%d", op_);
+ case NONE:
+ return "*no-matches*";
+ case ATOM:
+ return atom_;
+ case ALL:
+ return "";
+ case AND: {
+ string s = "";
+ for (int i = 0; i < subs_->size(); i++) {
+ if (i > 0)
+ s += " ";
+ s += (*subs_)[i]->DebugString();
+ }
+ return s;
+ }
+ case OR: {
+ string s = "(";
+ for (int i = 0; i < subs_->size(); i++) {
+ if (i > 0)
+ s += "|";
+ s += (*subs_)[i]->DebugString();
+ }
+ s += ")";
+ return s;
+ }
+ }
+}
+
+Prefilter* Prefilter::FromRE2(const RE2* re2) {
+ if (re2 == NULL)
+ return NULL;
+
+ Regexp* regexp = re2->Regexp();
+ if (regexp == NULL)
+ return NULL;
+
+ return FromRegexp(regexp);
+}
+
+
+} // namespace re2
« no previous file with comments | « third_party/re2/re2/prefilter.h ('k') | third_party/re2/re2/prefilter_tree.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698