Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(78)

Unified Diff: third_party/re2/re2/mimics_pcre.cc

Issue 10575037: Include RE2 library (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Less intrusive fix for Android Created 8 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « third_party/re2/re2/make_unicode_groups.py ('k') | third_party/re2/re2/nfa.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: third_party/re2/re2/mimics_pcre.cc
diff --git a/third_party/re2/re2/mimics_pcre.cc b/third_party/re2/re2/mimics_pcre.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fc6dd4ad5941f9b1810464a1f3155ccadc48aa60
--- /dev/null
+++ b/third_party/re2/re2/mimics_pcre.cc
@@ -0,0 +1,185 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Determine whether this library should match PCRE exactly
+// for a particular Regexp. (If so, the testing framework can
+// check that it does.)
+//
+// This library matches PCRE except in these cases:
+// * the regexp contains a repetition of an empty string,
+// like (a*)* or (a*)+. In this case, PCRE will treat
+// the repetition sequence as ending with an empty string,
+// while this library does not.
+// * Perl and PCRE differ on whether \v matches \n.
+// For historical reasons, this library implements the Perl behavior.
+// * Perl and PCRE allow $ in one-line mode to match either the very
+// end of the text or just before a \n at the end of the text.
+// This library requires it to match only the end of the text.
+// * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
+// match the end of the text if the last character is a \n.
+// This library does allow it.
+//
+// Regexp::MimicsPCRE checks for any of these conditions.
+
+#include "util/util.h"
+#include "re2/regexp.h"
+#include "re2/walker-inl.h"
+
+namespace re2 {
+
+// Returns whether re might match an empty string.
+static bool CanBeEmptyString(Regexp *re);
+
+// Walker class to compute whether library handles a regexp
+// exactly as PCRE would. See comment at top for conditions.
+
+class PCREWalker : public Regexp::Walker<bool> {
+ public:
+ PCREWalker() {}
+ bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args,
+ int nchild_args);
+
+ bool ShortVisit(Regexp* re, bool a) {
+ // Should never be called: we use Walk not WalkExponential.
+ LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
+ return a;
+ }
+};
+
+// Called after visiting each of re's children and accumulating
+// the return values in child_args. So child_args contains whether
+// this library mimics PCRE for those subexpressions.
+bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
+ bool* child_args, int nchild_args) {
+ // If children failed, so do we.
+ for (int i = 0; i < nchild_args; i++)
+ if (!child_args[i])
+ return false;
+
+ // Otherwise look for other reasons to fail.
+ switch (re->op()) {
+ // Look for repeated empty string.
+ case kRegexpStar:
+ case kRegexpPlus:
+ case kRegexpQuest:
+ if (CanBeEmptyString(re->sub()[0]))
+ return false;
+ break;
+ case kRegexpRepeat:
+ if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
+ return false;
+ break;
+
+ // Look for \v
+ case kRegexpLiteral:
+ if (re->rune() == '\v')
+ return false;
+ break;
+
+ // Look for $ in single-line mode.
+ case kRegexpEndText:
+ case kRegexpEmptyMatch:
+ if (re->parse_flags() & Regexp::WasDollar)
+ return false;
+ break;
+
+ // Look for ^ in multi-line mode.
+ case kRegexpBeginLine:
+ // No condition: in single-line mode ^ becomes kRegexpBeginText.
+ return false;
+
+ default:
+ break;
+ }
+
+ // Not proven guilty.
+ return true;
+}
+
+// Returns whether this regexp's behavior will mimic PCRE's exactly.
+bool Regexp::MimicsPCRE() {
+ PCREWalker w;
+ return w.Walk(this, true);
+}
+
+
+// Walker class to compute whether a Regexp can match an empty string.
+// It is okay to overestimate. For example, \b\B cannot match an empty
+// string, because \b and \B are mutually exclusive, but this isn't
+// that smart and will say it can. Spurious empty strings
+// will reduce the number of regexps we sanity check against PCRE,
+// but they won't break anything.
+
+class EmptyStringWalker : public Regexp::Walker<bool> {
+ public:
+ EmptyStringWalker() { }
+ bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
+ bool* child_args, int nchild_args);
+
+ bool ShortVisit(Regexp* re, bool a) {
+ // Should never be called: we use Walk not WalkExponential.
+ LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
+ return a;
+ }
+
+ private:
+ DISALLOW_EVIL_CONSTRUCTORS(EmptyStringWalker);
+};
+
+// Called after visiting re's children. child_args contains the return
+// value from each of the children's PostVisits (i.e., whether each child
+// can match an empty string). Returns whether this clause can match an
+// empty string.
+bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
+ bool* child_args, int nchild_args) {
+ switch (re->op()) {
+ case kRegexpNoMatch: // never empty
+ case kRegexpLiteral:
+ case kRegexpAnyChar:
+ case kRegexpAnyByte:
+ case kRegexpCharClass:
+ case kRegexpLiteralString:
+ return false;
+
+ case kRegexpEmptyMatch: // always empty
+ case kRegexpBeginLine: // always empty, when they match
+ case kRegexpEndLine:
+ case kRegexpNoWordBoundary:
+ case kRegexpWordBoundary:
+ case kRegexpBeginText:
+ case kRegexpEndText:
+ case kRegexpStar: // can always be empty
+ case kRegexpQuest:
+ case kRegexpHaveMatch:
+ return true;
+
+ case kRegexpConcat: // can be empty if all children can
+ for (int i = 0; i < nchild_args; i++)
+ if (!child_args[i])
+ return false;
+ return true;
+
+ case kRegexpAlternate: // can be empty if any child can
+ for (int i = 0; i < nchild_args; i++)
+ if (child_args[i])
+ return true;
+ return false;
+
+ case kRegexpPlus: // can be empty if the child can
+ case kRegexpCapture:
+ return child_args[0];
+
+ case kRegexpRepeat: // can be empty if child can or is x{0}
+ return child_args[0] || re->min() == 0;
+ }
+ return false;
+}
+
+// Returns whether re can match an empty string.
+static bool CanBeEmptyString(Regexp* re) {
+ EmptyStringWalker w;
+ return w.Walk(re, true);
+}
+
+} // namespace re2
« no previous file with comments | « third_party/re2/re2/make_unicode_groups.py ('k') | third_party/re2/re2/nfa.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698