third_party/re2/re2/simplify.cc - Issue 10575037: Include RE2 library

Unified Diff: third_party/re2/re2/simplify.cc

Issue 10575037: Include RE2 library (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Less intrusive fix for Android Created 8 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: third_party/re2/re2/simplify.cc

diff --git a/third_party/re2/re2/simplify.cc b/third_party/re2/re2/simplify.cc

new file mode 100644

index 0000000000000000000000000000000000000000..faf32084e05f9f171b33fdc076d465e0ec693600

--- /dev/null

+++ b/third_party/re2/re2/simplify.cc

@@ -0,0 +1,393 @@

+// Use of this source code is governed by a BSD-style

+// license that can be found in the LICENSE file.

+// Rewrite POSIX and other features in re

+// to use simple extended regular expression features.

+// Also sort and simplify character classes.

+#include "util/util.h"

+#include "re2/regexp.h"

+#include "re2/walker-inl.h"

+namespace re2 {

+// Parses the regexp src and then simplifies it and sets *dst to the

+// string representation of the simplified form. Returns true on success.

+// Returns false and sets *error (if error != NULL) on error.

+bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,

+ string* dst,

+ RegexpStatus* status) {

+ Regexp* re = Parse(src, flags, status);

+ if (re == NULL)

+ return false;

+ Regexp* sre = re->Simplify();

+ re->Decref();

+ if (sre == NULL) {

+ // Should not happen, since Simplify never fails.

+ LOG(ERROR) << "Simplify failed on " << src;

+ if (status) {

+ status->set_code(kRegexpInternalError);

+ status->set_error_arg(src);

+ }

+ return false;

+ }

+ *dst = sre->ToString();

+ sre->Decref();

+ return true;

+// Assuming the simple_ flags on the children are accurate,

+// is this Regexp* simple?

+bool Regexp::ComputeSimple() {

+ Regexp** subs;

+ switch (op_) {

+ case kRegexpNoMatch:

+ case kRegexpEmptyMatch:

+ case kRegexpLiteral:

+ case kRegexpLiteralString:

+ case kRegexpBeginLine:

+ case kRegexpEndLine:

+ case kRegexpBeginText:

+ case kRegexpWordBoundary:

+ case kRegexpNoWordBoundary:

+ case kRegexpEndText:

+ case kRegexpAnyChar:

+ case kRegexpAnyByte:

+ case kRegexpHaveMatch:

+ return true;

+ case kRegexpConcat:

+ case kRegexpAlternate:

+ // These are simple as long as the subpieces are simple.

+ subs = sub();

+ for (int i = 0; i < nsub_; i++)

+ if (!subs[i]->simple_)

+ return false;

+ return true;

+ case kRegexpCharClass:

+ // Simple as long as the char class is not empty, not full.

+ if (ccb_ != NULL)

+ return !ccb_->empty() && !ccb_->full();

+ return !cc_->empty() && !cc_->full();

+ case kRegexpCapture:

+ subs = sub();

+ return subs[0]->simple_;

+ case kRegexpStar:

+ case kRegexpPlus:

+ case kRegexpQuest:

+ subs = sub();

+ if (!subs[0]->simple_)

+ return false;

+ switch (subs[0]->op_) {

+ case kRegexpStar:

+ case kRegexpPlus:

+ case kRegexpQuest:

+ case kRegexpEmptyMatch:

+ case kRegexpNoMatch:

+ return false;

+ default:

+ break;

+ }

+ return true;

+ case kRegexpRepeat:

+ return false;

+ }

+ LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;

+ return false;

+// Walker subclass used by Simplify.

+// The simplify walk is purely post-recursive: given the simplified children,

+// PostVisit creates the simplified result.

+// The child_args are simplified Regexp*s.

+class SimplifyWalker : public Regexp::Walker<Regexp*> {

+ public:

+ SimplifyWalker() {}

+ virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);

+ virtual Regexp* PostVisit(Regexp* re,

+ Regexp* parent_arg,

+ Regexp* pre_arg,

+ Regexp** child_args, int nchild_args);

+ virtual Regexp* Copy(Regexp* re);

+ virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);

+ private:

+ // These functions are declared inside SimplifyWalker so that

+ // they can edit the private fields of the Regexps they construct.

+ // Creates a concatenation of two Regexp, consuming refs to re1 and re2.

+ // Caller must Decref return value when done with it.

+ static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);

+ // Simplifies the expression re{min,max} in terms of *, +, and ?.

+ // Returns a new regexp. Does not edit re. Does not consume reference to re.

+ // Caller must Decref return value when done with it.

+ static Regexp* SimplifyRepeat(Regexp* re, int min, int max,

+ Regexp::ParseFlags parse_flags);

+ // Simplifies a character class by expanding any named classes

+ // into rune ranges. Does not edit re. Does not consume ref to re.

+ // Caller must Decref return value when done with it.

+ static Regexp* SimplifyCharClass(Regexp* re);

+ DISALLOW_EVIL_CONSTRUCTORS(SimplifyWalker);

+};

+// Simplifies a regular expression, returning a new regexp.

+// The new regexp uses traditional Unix egrep features only,

+// plus the Perl (?:) non-capturing parentheses.

+// Otherwise, no POSIX or Perl additions. The new regexp

+// captures exactly the same subexpressions (with the same indices)

+// as the original.

+// Does not edit current object.

+// Caller must Decref() return value when done with it.

+Regexp* Regexp::Simplify() {

+ if (simple_)

+ return Incref();

+ SimplifyWalker w;

+ return w.Walk(this, NULL);

+#define Simplify DontCallSimplify // Avoid accidental recursion

+Regexp* SimplifyWalker::Copy(Regexp* re) {

+ return re->Incref();

+Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {

+ // This should never be called, since we use Walk and not

+ // WalkExponential.

+ LOG(DFATAL) << "SimplifyWalker::ShortVisit called";

+ return re->Incref();

+Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {

+ if (re->simple_) {

+ *stop = true;

+ return re->Incref();

+ }

+ return NULL;

+Regexp* SimplifyWalker::PostVisit(Regexp* re,

+ Regexp* parent_arg,

+ Regexp* pre_arg,

+ Regexp** child_args,

+ int nchild_args) {

+ switch (re->op()) {

+ case kRegexpNoMatch:

+ case kRegexpEmptyMatch:

+ case kRegexpLiteral:

+ case kRegexpLiteralString:

+ case kRegexpBeginLine:

+ case kRegexpEndLine:

+ case kRegexpBeginText:

+ case kRegexpWordBoundary:

+ case kRegexpNoWordBoundary:

+ case kRegexpEndText:

+ case kRegexpAnyChar:

+ case kRegexpAnyByte:

+ case kRegexpHaveMatch:

+ // All these are always simple.

+ re->simple_ = true;

+ return re->Incref();

+ case kRegexpConcat:

+ case kRegexpAlternate: {

+ // These are simple as long as the subpieces are simple.

+ // Two passes to avoid allocation in the common case.

+ bool changed = false;

+ Regexp** subs = re->sub();

+ for (int i = 0; i < re->nsub_; i++) {

+ Regexp* sub = subs[i];

+ Regexp* newsub = child_args[i];

+ if (newsub != sub) {

+ changed = true;

+ break;

+ }

+ if (!changed) {

+ for (int i = 0; i < re->nsub_; i++) {

+ Regexp* newsub = child_args[i];

+ newsub->Decref();

+ }

+ re->simple_ = true;

+ return re->Incref();

+ }

+ Regexp* nre = new Regexp(re->op(), re->parse_flags());

+ nre->AllocSub(re->nsub_);

+ Regexp** nre_subs = nre->sub();

+ for (int i = 0; i <re->nsub_; i++)

+ nre_subs[i] = child_args[i];

+ nre->simple_ = true;

+ return nre;

+ }

+ case kRegexpCapture: {

+ Regexp* newsub = child_args[0];

+ if (newsub == re->sub()[0]) {

+ newsub->Decref();

+ re->simple_ = true;

+ return re->Incref();

+ }

+ Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());

+ nre->AllocSub(1);

+ nre->sub()[0] = newsub;

+ nre->cap_ = re->cap_;

+ nre->simple_ = true;

+ return nre;

+ }

+ case kRegexpStar:

+ case kRegexpPlus:

+ case kRegexpQuest: {

+ Regexp* newsub = child_args[0];

+ // Special case: repeat the empty string as much as

+ // you want, but it's still the empty string.

+ if (newsub->op() == kRegexpEmptyMatch)

+ return newsub;

+ // These are simple as long as the subpiece is simple.

+ if (newsub == re->sub()[0]) {

+ newsub->Decref();

+ re->simple_ = true;

+ return re->Incref();

+ }

+ // These are also idempotent if flags are constant.

+ if (re->op() == newsub->op() &&

+ re->parse_flags() == newsub->parse_flags())

+ return newsub;

+ Regexp* nre = new Regexp(re->op(), re->parse_flags());

+ nre->AllocSub(1);

+ nre->sub()[0] = newsub;

+ nre->simple_ = true;

+ return nre;

+ }

+ case kRegexpRepeat: {

+ Regexp* newsub = child_args[0];

+ // Special case: repeat the empty string as much as

+ // you want, but it's still the empty string.

+ if (newsub->op() == kRegexpEmptyMatch)

+ return newsub;

+ Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,

+ re->parse_flags());

+ newsub->Decref();

+ nre->simple_ = true;

+ return nre;

+ }

+ case kRegexpCharClass: {

+ Regexp* nre = SimplifyCharClass(re);

+ nre->simple_ = true;

+ return nre;

+ }

+ LOG(ERROR) << "Simplify case not handled: " << re->op();

+ return re->Incref();

+// Creates a concatenation of two Regexp, consuming refs to re1 and re2.

+// Returns a new Regexp, handing the ref to the caller.

+Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,

+ Regexp::ParseFlags parse_flags) {

+ Regexp* re = new Regexp(kRegexpConcat, parse_flags);

+ re->AllocSub(2);

+ Regexp** subs = re->sub();

+ subs[0] = re1;

+ subs[1] = re2;

+ return re;

+// Simplifies the expression re{min,max} in terms of *, +, and ?.

+// Returns a new regexp. Does not edit re. Does not consume reference to re.

+// Caller must Decref return value when done with it.

+// The result will *not* necessarily have the right capturing parens

+// if you call ToString() and re-parse it: (x){2} becomes (x)(x),

+// but in the Regexp* representation, both (x) are marked as $1.

+Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,

+ Regexp::ParseFlags f) {

+ // x{n,} means at least n matches of x.

+ if (max == -1) {

+ // Special case: x{0,} is x*

+ if (min == 0)

+ return Regexp::Star(re->Incref(), f);

+ // Special case: x{1,} is x+

+ if (min == 1)

+ return Regexp::Plus(re->Incref(), f);

+ // General case: x{4,} is xxxx+

+ Regexp* nre = new Regexp(kRegexpConcat, f);

+ nre->AllocSub(min);

+ VLOG(1) << "Simplify " << min;

+ Regexp** nre_subs = nre->sub();

+ for (int i = 0; i < min-1; i++)

+ nre_subs[i] = re->Incref();

+ nre_subs[min-1] = Regexp::Plus(re->Incref(), f);

+ return nre;

+ }

+ // Special case: (x){0} matches only empty string.

+ if (min == 0 && max == 0)

+ return new Regexp(kRegexpEmptyMatch, f);

+ // Special case: x{1} is just x.

+ if (min == 1 && max == 1)

+ return re->Incref();

+ // General case: x{n,m} means n copies of x and m copies of x?.

+ // The machine will do less work if we nest the final m copies,

+ // so that x{2,5} = xx(x(x(x)?)?)?

+ // Build leading prefix: xx. Capturing only on the last one.

+ Regexp* nre = NULL;

+ if (min > 0) {

+ nre = new Regexp(kRegexpConcat, f);

+ nre->AllocSub(min);

+ Regexp** nre_subs = nre->sub();

+ for (int i = 0; i < min; i++)

+ nre_subs[i] = re->Incref();

+ }

+ // Build and attach suffix: (x(x(x)?)?)?

+ if (max > min) {

+ Regexp* suf = Regexp::Quest(re->Incref(), f);

+ for (int i = min+1; i < max; i++)

+ suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);

+ if (nre == NULL)

+ nre = suf;

+ else

+ nre = Concat2(nre, suf, f);

+ }

+ if (nre == NULL) {

+ // Some degenerate case, like min > max, or min < max < 0.

+ // This shouldn't happen, because the parser rejects such regexps.

+ LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;

+ return new Regexp(kRegexpNoMatch, f);

+ }

+ return nre;

+// Simplifies a character class.

+// Caller must Decref return value when done with it.

+Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {

+ CharClass* cc = re->cc();

+ // Special cases

+ if (cc->empty())

+ return new Regexp(kRegexpNoMatch, re->parse_flags());

+ if (cc->full())

+ return new Regexp(kRegexpAnyChar, re->parse_flags());

+ return re->Incref();

+} // namespace re2

« no previous file with comments | « third_party/re2/re2/set.cc ('k') | third_party/re2/re2/stringpiece.h » ('j') | no next file with comments »