Index: third_party/re2/re2/simplify.cc |
diff --git a/third_party/re2/re2/simplify.cc b/third_party/re2/re2/simplify.cc |
new file mode 100644 |
index 0000000000000000000000000000000000000000..faf32084e05f9f171b33fdc076d465e0ec693600 |
--- /dev/null |
+++ b/third_party/re2/re2/simplify.cc |
@@ -0,0 +1,393 @@ |
+// Copyright 2006 The RE2 Authors. All Rights Reserved. |
+// Use of this source code is governed by a BSD-style |
+// license that can be found in the LICENSE file. |
+ |
+// Rewrite POSIX and other features in re |
+// to use simple extended regular expression features. |
+// Also sort and simplify character classes. |
+ |
+#include "util/util.h" |
+#include "re2/regexp.h" |
+#include "re2/walker-inl.h" |
+ |
+namespace re2 { |
+ |
+// Parses the regexp src and then simplifies it and sets *dst to the |
+// string representation of the simplified form. Returns true on success. |
+// Returns false and sets *error (if error != NULL) on error. |
+bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags, |
+ string* dst, |
+ RegexpStatus* status) { |
+ Regexp* re = Parse(src, flags, status); |
+ if (re == NULL) |
+ return false; |
+ Regexp* sre = re->Simplify(); |
+ re->Decref(); |
+ if (sre == NULL) { |
+ // Should not happen, since Simplify never fails. |
+ LOG(ERROR) << "Simplify failed on " << src; |
+ if (status) { |
+ status->set_code(kRegexpInternalError); |
+ status->set_error_arg(src); |
+ } |
+ return false; |
+ } |
+ *dst = sre->ToString(); |
+ sre->Decref(); |
+ return true; |
+} |
+ |
+// Assuming the simple_ flags on the children are accurate, |
+// is this Regexp* simple? |
+bool Regexp::ComputeSimple() { |
+ Regexp** subs; |
+ switch (op_) { |
+ case kRegexpNoMatch: |
+ case kRegexpEmptyMatch: |
+ case kRegexpLiteral: |
+ case kRegexpLiteralString: |
+ case kRegexpBeginLine: |
+ case kRegexpEndLine: |
+ case kRegexpBeginText: |
+ case kRegexpWordBoundary: |
+ case kRegexpNoWordBoundary: |
+ case kRegexpEndText: |
+ case kRegexpAnyChar: |
+ case kRegexpAnyByte: |
+ case kRegexpHaveMatch: |
+ return true; |
+ case kRegexpConcat: |
+ case kRegexpAlternate: |
+ // These are simple as long as the subpieces are simple. |
+ subs = sub(); |
+ for (int i = 0; i < nsub_; i++) |
+ if (!subs[i]->simple_) |
+ return false; |
+ return true; |
+ case kRegexpCharClass: |
+ // Simple as long as the char class is not empty, not full. |
+ if (ccb_ != NULL) |
+ return !ccb_->empty() && !ccb_->full(); |
+ return !cc_->empty() && !cc_->full(); |
+ case kRegexpCapture: |
+ subs = sub(); |
+ return subs[0]->simple_; |
+ case kRegexpStar: |
+ case kRegexpPlus: |
+ case kRegexpQuest: |
+ subs = sub(); |
+ if (!subs[0]->simple_) |
+ return false; |
+ switch (subs[0]->op_) { |
+ case kRegexpStar: |
+ case kRegexpPlus: |
+ case kRegexpQuest: |
+ case kRegexpEmptyMatch: |
+ case kRegexpNoMatch: |
+ return false; |
+ default: |
+ break; |
+ } |
+ return true; |
+ case kRegexpRepeat: |
+ return false; |
+ } |
+ LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_; |
+ return false; |
+} |
+ |
+// Walker subclass used by Simplify. |
+// The simplify walk is purely post-recursive: given the simplified children, |
+// PostVisit creates the simplified result. |
+// The child_args are simplified Regexp*s. |
+class SimplifyWalker : public Regexp::Walker<Regexp*> { |
+ public: |
+ SimplifyWalker() {} |
+ virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop); |
+ virtual Regexp* PostVisit(Regexp* re, |
+ Regexp* parent_arg, |
+ Regexp* pre_arg, |
+ Regexp** child_args, int nchild_args); |
+ virtual Regexp* Copy(Regexp* re); |
+ virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg); |
+ |
+ private: |
+ // These functions are declared inside SimplifyWalker so that |
+ // they can edit the private fields of the Regexps they construct. |
+ |
+ // Creates a concatenation of two Regexp, consuming refs to re1 and re2. |
+ // Caller must Decref return value when done with it. |
+ static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags); |
+ |
+ // Simplifies the expression re{min,max} in terms of *, +, and ?. |
+ // Returns a new regexp. Does not edit re. Does not consume reference to re. |
+ // Caller must Decref return value when done with it. |
+ static Regexp* SimplifyRepeat(Regexp* re, int min, int max, |
+ Regexp::ParseFlags parse_flags); |
+ |
+ // Simplifies a character class by expanding any named classes |
+ // into rune ranges. Does not edit re. Does not consume ref to re. |
+ // Caller must Decref return value when done with it. |
+ static Regexp* SimplifyCharClass(Regexp* re); |
+ |
+ DISALLOW_EVIL_CONSTRUCTORS(SimplifyWalker); |
+}; |
+ |
+// Simplifies a regular expression, returning a new regexp. |
+// The new regexp uses traditional Unix egrep features only, |
+// plus the Perl (?:) non-capturing parentheses. |
+// Otherwise, no POSIX or Perl additions. The new regexp |
+// captures exactly the same subexpressions (with the same indices) |
+// as the original. |
+// Does not edit current object. |
+// Caller must Decref() return value when done with it. |
+ |
+Regexp* Regexp::Simplify() { |
+ if (simple_) |
+ return Incref(); |
+ SimplifyWalker w; |
+ return w.Walk(this, NULL); |
+} |
+ |
+#define Simplify DontCallSimplify // Avoid accidental recursion |
+ |
+Regexp* SimplifyWalker::Copy(Regexp* re) { |
+ return re->Incref(); |
+} |
+ |
+Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { |
+ // This should never be called, since we use Walk and not |
+ // WalkExponential. |
+ LOG(DFATAL) << "SimplifyWalker::ShortVisit called"; |
+ return re->Incref(); |
+} |
+ |
+Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) { |
+ if (re->simple_) { |
+ *stop = true; |
+ return re->Incref(); |
+ } |
+ return NULL; |
+} |
+ |
+Regexp* SimplifyWalker::PostVisit(Regexp* re, |
+ Regexp* parent_arg, |
+ Regexp* pre_arg, |
+ Regexp** child_args, |
+ int nchild_args) { |
+ switch (re->op()) { |
+ case kRegexpNoMatch: |
+ case kRegexpEmptyMatch: |
+ case kRegexpLiteral: |
+ case kRegexpLiteralString: |
+ case kRegexpBeginLine: |
+ case kRegexpEndLine: |
+ case kRegexpBeginText: |
+ case kRegexpWordBoundary: |
+ case kRegexpNoWordBoundary: |
+ case kRegexpEndText: |
+ case kRegexpAnyChar: |
+ case kRegexpAnyByte: |
+ case kRegexpHaveMatch: |
+ // All these are always simple. |
+ re->simple_ = true; |
+ return re->Incref(); |
+ |
+ case kRegexpConcat: |
+ case kRegexpAlternate: { |
+ // These are simple as long as the subpieces are simple. |
+ // Two passes to avoid allocation in the common case. |
+ bool changed = false; |
+ Regexp** subs = re->sub(); |
+ for (int i = 0; i < re->nsub_; i++) { |
+ Regexp* sub = subs[i]; |
+ Regexp* newsub = child_args[i]; |
+ if (newsub != sub) { |
+ changed = true; |
+ break; |
+ } |
+ } |
+ if (!changed) { |
+ for (int i = 0; i < re->nsub_; i++) { |
+ Regexp* newsub = child_args[i]; |
+ newsub->Decref(); |
+ } |
+ re->simple_ = true; |
+ return re->Incref(); |
+ } |
+ Regexp* nre = new Regexp(re->op(), re->parse_flags()); |
+ nre->AllocSub(re->nsub_); |
+ Regexp** nre_subs = nre->sub(); |
+ for (int i = 0; i <re->nsub_; i++) |
+ nre_subs[i] = child_args[i]; |
+ nre->simple_ = true; |
+ return nre; |
+ } |
+ |
+ case kRegexpCapture: { |
+ Regexp* newsub = child_args[0]; |
+ if (newsub == re->sub()[0]) { |
+ newsub->Decref(); |
+ re->simple_ = true; |
+ return re->Incref(); |
+ } |
+ Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags()); |
+ nre->AllocSub(1); |
+ nre->sub()[0] = newsub; |
+ nre->cap_ = re->cap_; |
+ nre->simple_ = true; |
+ return nre; |
+ } |
+ |
+ case kRegexpStar: |
+ case kRegexpPlus: |
+ case kRegexpQuest: { |
+ Regexp* newsub = child_args[0]; |
+ // Special case: repeat the empty string as much as |
+ // you want, but it's still the empty string. |
+ if (newsub->op() == kRegexpEmptyMatch) |
+ return newsub; |
+ |
+ // These are simple as long as the subpiece is simple. |
+ if (newsub == re->sub()[0]) { |
+ newsub->Decref(); |
+ re->simple_ = true; |
+ return re->Incref(); |
+ } |
+ |
+ // These are also idempotent if flags are constant. |
+ if (re->op() == newsub->op() && |
+ re->parse_flags() == newsub->parse_flags()) |
+ return newsub; |
+ |
+ Regexp* nre = new Regexp(re->op(), re->parse_flags()); |
+ nre->AllocSub(1); |
+ nre->sub()[0] = newsub; |
+ nre->simple_ = true; |
+ return nre; |
+ } |
+ |
+ case kRegexpRepeat: { |
+ Regexp* newsub = child_args[0]; |
+ // Special case: repeat the empty string as much as |
+ // you want, but it's still the empty string. |
+ if (newsub->op() == kRegexpEmptyMatch) |
+ return newsub; |
+ |
+ Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_, |
+ re->parse_flags()); |
+ newsub->Decref(); |
+ nre->simple_ = true; |
+ return nre; |
+ } |
+ |
+ case kRegexpCharClass: { |
+ Regexp* nre = SimplifyCharClass(re); |
+ nre->simple_ = true; |
+ return nre; |
+ } |
+ } |
+ |
+ LOG(ERROR) << "Simplify case not handled: " << re->op(); |
+ return re->Incref(); |
+} |
+ |
+// Creates a concatenation of two Regexp, consuming refs to re1 and re2. |
+// Returns a new Regexp, handing the ref to the caller. |
+Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2, |
+ Regexp::ParseFlags parse_flags) { |
+ Regexp* re = new Regexp(kRegexpConcat, parse_flags); |
+ re->AllocSub(2); |
+ Regexp** subs = re->sub(); |
+ subs[0] = re1; |
+ subs[1] = re2; |
+ return re; |
+} |
+ |
+// Simplifies the expression re{min,max} in terms of *, +, and ?. |
+// Returns a new regexp. Does not edit re. Does not consume reference to re. |
+// Caller must Decref return value when done with it. |
+// The result will *not* necessarily have the right capturing parens |
+// if you call ToString() and re-parse it: (x){2} becomes (x)(x), |
+// but in the Regexp* representation, both (x) are marked as $1. |
+Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max, |
+ Regexp::ParseFlags f) { |
+ // x{n,} means at least n matches of x. |
+ if (max == -1) { |
+ // Special case: x{0,} is x* |
+ if (min == 0) |
+ return Regexp::Star(re->Incref(), f); |
+ |
+ // Special case: x{1,} is x+ |
+ if (min == 1) |
+ return Regexp::Plus(re->Incref(), f); |
+ |
+ // General case: x{4,} is xxxx+ |
+ Regexp* nre = new Regexp(kRegexpConcat, f); |
+ nre->AllocSub(min); |
+ VLOG(1) << "Simplify " << min; |
+ Regexp** nre_subs = nre->sub(); |
+ for (int i = 0; i < min-1; i++) |
+ nre_subs[i] = re->Incref(); |
+ nre_subs[min-1] = Regexp::Plus(re->Incref(), f); |
+ return nre; |
+ } |
+ |
+ // Special case: (x){0} matches only empty string. |
+ if (min == 0 && max == 0) |
+ return new Regexp(kRegexpEmptyMatch, f); |
+ |
+ // Special case: x{1} is just x. |
+ if (min == 1 && max == 1) |
+ return re->Incref(); |
+ |
+ // General case: x{n,m} means n copies of x and m copies of x?. |
+ // The machine will do less work if we nest the final m copies, |
+ // so that x{2,5} = xx(x(x(x)?)?)? |
+ |
+ // Build leading prefix: xx. Capturing only on the last one. |
+ Regexp* nre = NULL; |
+ if (min > 0) { |
+ nre = new Regexp(kRegexpConcat, f); |
+ nre->AllocSub(min); |
+ Regexp** nre_subs = nre->sub(); |
+ for (int i = 0; i < min; i++) |
+ nre_subs[i] = re->Incref(); |
+ } |
+ |
+ // Build and attach suffix: (x(x(x)?)?)? |
+ if (max > min) { |
+ Regexp* suf = Regexp::Quest(re->Incref(), f); |
+ for (int i = min+1; i < max; i++) |
+ suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f); |
+ if (nre == NULL) |
+ nre = suf; |
+ else |
+ nre = Concat2(nre, suf, f); |
+ } |
+ |
+ if (nre == NULL) { |
+ // Some degenerate case, like min > max, or min < max < 0. |
+ // This shouldn't happen, because the parser rejects such regexps. |
+ LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max; |
+ return new Regexp(kRegexpNoMatch, f); |
+ } |
+ |
+ return nre; |
+} |
+ |
+// Simplifies a character class. |
+// Caller must Decref return value when done with it. |
+Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) { |
+ CharClass* cc = re->cc(); |
+ |
+ // Special cases |
+ if (cc->empty()) |
+ return new Regexp(kRegexpNoMatch, re->parse_flags()); |
+ if (cc->full()) |
+ return new Regexp(kRegexpAnyChar, re->parse_flags()); |
+ |
+ return re->Incref(); |
+} |
+ |
+} // namespace re2 |