Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(490)

Unified Diff: third_party/re2/re2/testing/tester.cc

Issue 10575037: Include RE2 library (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Less intrusive fix for Android Created 8 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « third_party/re2/re2/testing/tester.h ('k') | third_party/re2/re2/testing/unicode_test.py » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: third_party/re2/re2/testing/tester.cc
diff --git a/third_party/re2/re2/testing/tester.cc b/third_party/re2/re2/testing/tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..003dc5add05696f9dfcdb88936aa1230831a265e
--- /dev/null
+++ b/third_party/re2/re2/testing/tester.cc
@@ -0,0 +1,640 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Regular expression engine tester -- test all the implementations against each other.
+
+#include "util/util.h"
+#include "util/flags.h"
+#include "re2/testing/tester.h"
+#include "re2/prog.h"
+#include "re2/re2.h"
+#include "re2/regexp.h"
+
+DEFINE_bool(dump_prog, false, "dump regexp program");
+DEFINE_bool(log_okay, false, "log successful runs");
+DEFINE_bool(dump_rprog, false, "dump reversed regexp program");
+
+DEFINE_int32(max_regexp_failures, 100,
+ "maximum number of regexp test failures (-1 = unlimited)");
+
+DEFINE_string(regexp_engines, "", "pattern to select regexp engines to test");
+
+namespace re2 {
+
+enum {
+ kMaxSubmatch = 1+16, // $0...$16
+};
+
+const char* engine_types[kEngineMax] = {
+ "Backtrack",
+ "NFA",
+ "DFA",
+ "DFA1",
+ "OnePass",
+ "BitState",
+ "RE2",
+ "RE2a",
+ "RE2b",
+ "PCRE",
+};
+
+// Returns the name string for the type t.
+static string EngineString(Engine t) {
+ if (t < 0 || t >= arraysize(engine_types) || engine_types[t] == NULL) {
+ return StringPrintf("type%d", static_cast<int>(t));
+ }
+ return engine_types[t];
+}
+
+// Returns bit mask of engines to use.
+static uint32 Engines() {
+ static uint32 cached_engines;
+ static bool did_parse;
+
+ if (did_parse)
+ return cached_engines;
+
+ if (FLAGS_regexp_engines.empty()) {
+ cached_engines = ~0;
+ } else {
+ for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++)
+ if (strstr(EngineString(i).c_str(), FLAGS_regexp_engines.c_str()))
+ cached_engines |= 1<<i;
+ }
+
+ if (cached_engines == 0)
+ LOG(INFO) << "Warning: no engines enabled.";
+ if (!UsingPCRE)
+ cached_engines &= ~(1<<kEnginePCRE);
+ for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) {
+ if (cached_engines & (1<<i))
+ LOG(INFO) << EngineString(i) << " enabled";
+ }
+ did_parse = true;
+ return cached_engines;
+}
+
+// The result of running a match.
+struct TestInstance::Result {
+ bool skipped; // test skipped: wasn't applicable
+ bool matched; // found a match
+ bool untrusted; // don't really trust the answer
+ bool have_submatch; // computed all submatch info
+ bool have_submatch0; // computed just submatch[0]
+ StringPiece submatch[kMaxSubmatch];
+};
+
+typedef TestInstance::Result Result;
+
+// Formats a single capture range s in text in the form (a,b)
+// where a and b are the starting and ending offsets of s in text.
+static string FormatCapture(const StringPiece& text, const StringPiece& s) {
+ if (s.begin() == NULL)
+ return "(?,?)";
+ return StringPrintf("(%d,%d)",
+ static_cast<int>(s.begin() - text.begin()),
+ static_cast<int>(s.end() - text.begin()));
+}
+
+// Returns whether text contains non-ASCII (>= 0x80) bytes.
+static bool NonASCII(const StringPiece& text) {
+ for (int i = 0; i < text.size(); i++)
+ if ((uint8)text[i] >= 0x80)
+ return true;
+ return false;
+}
+
+// Returns string representation of match kind.
+static string FormatKind(Prog::MatchKind kind) {
+ switch (kind) {
+ case Prog::kFullMatch:
+ return "full match";
+ case Prog::kLongestMatch:
+ return "longest match";
+ case Prog::kFirstMatch:
+ return "first match";
+ case Prog::kManyMatch:
+ return "many match";
+ }
+ return "???";
+}
+
+// Returns string representation of anchor kind.
+static string FormatAnchor(Prog::Anchor anchor) {
+ switch (anchor) {
+ case Prog::kAnchored:
+ return "anchored";
+ case Prog::kUnanchored:
+ return "unanchored";
+ }
+ return "???";
+}
+
+struct ParseMode {
+ Regexp::ParseFlags parse_flags;
+ string desc;
+};
+
+static const Regexp::ParseFlags single_line =
+ Regexp::LikePerl;
+static const Regexp::ParseFlags multi_line =
+ static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine);
+
+static ParseMode parse_modes[] = {
+ { single_line, "single-line" },
+ { single_line|Regexp::Latin1, "single-line, latin1" },
+ { multi_line, "multiline" },
+ { multi_line|Regexp::NonGreedy, "multiline, nongreedy" },
+ { multi_line|Regexp::Latin1, "multiline, latin1" },
+};
+
+static string FormatMode(Regexp::ParseFlags flags) {
+ for (int i = 0; i < arraysize(parse_modes); i++)
+ if (parse_modes[i].parse_flags == flags)
+ return parse_modes[i].desc;
+ return StringPrintf("%#x", static_cast<uint>(flags));
+}
+
+// Constructs and saves all the matching engines that
+// will be required for the given tests.
+TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
+ Regexp::ParseFlags flags)
+ : regexp_str_(regexp_str),
+ kind_(kind),
+ flags_(flags),
+ error_(false),
+ regexp_(NULL),
+ num_captures_(0),
+ prog_(NULL),
+ rprog_(NULL),
+ re_(NULL),
+ re2_(NULL) {
+
+ VLOG(1) << CEscape(regexp_str);
+
+ // Compile regexp to prog.
+ // Always required - needed for backtracking (reference implementation).
+ RegexpStatus status;
+ regexp_ = Regexp::Parse(regexp_str, flags, &status);
+ if (regexp_ == NULL) {
+ LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
+ << " mode: " << FormatMode(flags);
+ error_ = true;
+ return;
+ }
+ num_captures_ = regexp_->NumCaptures();
+ prog_ = regexp_->CompileToProg(0);
+ if (prog_ == NULL) {
+ LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_);
+ error_ = true;
+ return;
+ }
+ if (FLAGS_dump_prog) {
+ LOG(INFO) << "Prog for "
+ << " regexp "
+ << CEscape(regexp_str_)
+ << " (" << FormatKind(kind_)
+ << ", " << FormatMode(flags_)
+ << ")\n"
+ << prog_->Dump();
+ }
+
+ // Compile regexp to reversed prog. Only needed for DFA engines.
+ if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) {
+ rprog_ = regexp_->CompileToReverseProg(0);
+ if (rprog_ == NULL) {
+ LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_);
+ error_ = true;
+ return;
+ }
+ if (FLAGS_dump_rprog)
+ LOG(INFO) << rprog_->Dump();
+ }
+
+ // Create re string that will be used for RE and RE2.
+ string re = regexp_str.as_string();
+ // Accomodate flags.
+ // Regexp::Latin1 will be accomodated below.
+ if (!(flags & Regexp::OneLine))
+ re = "(?m)" + re;
+ if (flags & Regexp::NonGreedy)
+ re = "(?U)" + re;
+ if (flags & Regexp::DotNL)
+ re = "(?s)" + re;
+
+ // Compile regexp to RE2.
+ if (Engines() & ((1<<kEngineRE2)|(1<<kEngineRE2a)|(1<<kEngineRE2b))) {
+ RE2::Options options;
+ if (flags & Regexp::Latin1)
+ options.set_encoding(RE2::Options::EncodingLatin1);
+ if (kind_ == Prog::kLongestMatch)
+ options.set_longest_match(true);
+ re2_ = new RE2(re, options);
+ if (!re2_->error().empty()) {
+ LOG(INFO) << "Cannot RE2: " << CEscape(re);
+ error_ = true;
+ return;
+ }
+ }
+
+ // Compile regexp to RE.
+ // PCRE as exposed by the RE interface isn't always usable.
+ // 1. It disagrees about handling of empty-string reptitions
+ // like matching (a*)* against "b". PCRE treats the (a*) as
+ // occurring once, while we treat it as occurring not at all.
+ // 2. It treats $ as this weird thing meaning end of string
+ // or before the \n at the end of the string.
+ // 3. It doesn't implement POSIX leftmost-longest matching.
+ // MimicsPCRE() detects 1 and 2.
+ if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() &&
+ kind_ != Prog::kLongestMatch) {
+ PCRE_Options o;
+ o.set_option(PCRE::UTF8);
+ if (flags & Regexp::Latin1)
+ o.set_option(PCRE::None);
+ // PCRE has interface bug keeping us from finding $0, so
+ // add one more layer of parens.
+ re_ = new PCRE("("+re+")", o);
+ if (!re_->error().empty()) {
+ LOG(INFO) << "Cannot PCRE: " << CEscape(re);
+ error_ = true;
+ return;
+ }
+ }
+}
+
+TestInstance::~TestInstance() {
+ if (regexp_)
+ regexp_->Decref();
+ delete prog_;
+ delete rprog_;
+ delete re_;
+ delete re2_;
+}
+
+// Runs a single search using the named engine type.
+// This interface hides all the irregularities of the various
+// engine interfaces from the rest of this file.
+void TestInstance::RunSearch(Engine type,
+ const StringPiece& orig_text,
+ const StringPiece& orig_context,
+ Prog::Anchor anchor,
+ Result *result) {
+ memset(result, 0, sizeof *result);
+ if (regexp_ == NULL) {
+ result->skipped = true;
+ return;
+ }
+ int nsubmatch = 1 + num_captures_; // NumCaptures doesn't count $0
+ if (nsubmatch > kMaxSubmatch)
+ nsubmatch = kMaxSubmatch;
+
+ StringPiece text = orig_text;
+ StringPiece context = orig_context;
+
+ switch (type) {
+ default:
+ LOG(FATAL) << "Bad RunSearch type: " << (int)type;
+
+ case kEngineBacktrack:
+ if (prog_ == NULL) {
+ result->skipped = true;
+ break;
+ }
+ result->matched =
+ prog_->UnsafeSearchBacktrack(text, context, anchor, kind_,
+ result->submatch, nsubmatch);
+ result->have_submatch = true;
+ break;
+
+ case kEngineNFA:
+ if (prog_ == NULL) {
+ result->skipped = true;
+ break;
+ }
+ result->matched =
+ prog_->SearchNFA(text, context, anchor, kind_,
+ result->submatch, nsubmatch);
+ result->have_submatch = true;
+ break;
+
+ case kEngineDFA:
+ if (prog_ == NULL) {
+ result->skipped = true;
+ break;
+ }
+ result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL,
+ &result->skipped, NULL);
+ break;
+
+ case kEngineDFA1:
+ if (prog_ == NULL || rprog_ == NULL) {
+ result->skipped = true;
+ break;
+ }
+ result->matched =
+ prog_->SearchDFA(text, context, anchor, kind_, result->submatch,
+ &result->skipped, NULL);
+ // If anchored, no need for second run,
+ // but do it anyway to find more bugs.
+ if (result->matched) {
+ if (!rprog_->SearchDFA(result->submatch[0], context,
+ Prog::kAnchored, Prog::kLongestMatch,
+ result->submatch,
+ &result->skipped, NULL)) {
+ LOG(ERROR) << "Reverse DFA inconsistency: " << CEscape(regexp_str_)
+ << " on " << CEscape(text);
+ result->matched = false;
+ }
+ }
+ result->have_submatch0 = true;
+ break;
+
+ case kEngineOnePass:
+ if (prog_ == NULL ||
+ anchor == Prog::kUnanchored ||
+ !prog_->IsOnePass() ||
+ nsubmatch > Prog::kMaxOnePassCapture) {
+ result->skipped = true;
+ break;
+ }
+ result->matched = prog_->SearchOnePass(text, context, anchor, kind_,
+ result->submatch, nsubmatch);
+ result->have_submatch = true;
+ break;
+
+ case kEngineBitState:
+ if (prog_ == NULL) {
+ result->skipped = true;
+ break;
+ }
+ result->matched = prog_->SearchBitState(text, context, anchor, kind_,
+ result->submatch, nsubmatch);
+ result->have_submatch = true;
+ break;
+
+ case kEngineRE2:
+ case kEngineRE2a:
+ case kEngineRE2b: {
+ if (!re2_ || text.end() != context.end()) {
+ result->skipped = true;
+ break;
+ }
+
+ RE2::Anchor re_anchor;
+ if (anchor == Prog::kAnchored)
+ re_anchor = RE2::ANCHOR_START;
+ else
+ re_anchor = RE2::UNANCHORED;
+ if (kind_ == Prog::kFullMatch)
+ re_anchor = RE2::ANCHOR_BOTH;
+
+ result->matched = re2_->Match(context,
+ text.begin() - context.begin(),
+ text.end() - context.begin(),
+ re_anchor, result->submatch, nsubmatch);
+ result->have_submatch = nsubmatch > 0;
+ break;
+ }
+
+ case kEnginePCRE: {
+ if (!re_ || text.begin() != context.begin() ||
+ text.end() != context.end()) {
+ result->skipped = true;
+ break;
+ }
+
+ const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch];
+ PCRE::Arg *a = new PCRE::Arg[nsubmatch];
+ for (int i = 0; i < nsubmatch; i++) {
+ a[i] = PCRE::Arg(&result->submatch[i]);
+ argptr[i] = &a[i];
+ }
+ int consumed;
+ PCRE::Anchor pcre_anchor;
+ if (anchor == Prog::kAnchored)
+ pcre_anchor = PCRE::ANCHOR_START;
+ else
+ pcre_anchor = PCRE::UNANCHORED;
+ if (kind_ == Prog::kFullMatch)
+ pcre_anchor = PCRE::ANCHOR_BOTH;
+ re_->ClearHitLimit();
+ result->matched =
+ re_->DoMatch(text,
+ pcre_anchor,
+ &consumed,
+ argptr, nsubmatch);
+ if (re_->HitLimit()) {
+ result->untrusted = true;
+ delete[] argptr;
+ delete[] a;
+ break;
+ }
+ result->have_submatch = true;
+
+ // Work around RE interface bug: PCRE returns -1 as the
+ // offsets for an unmatched subexpression, and RE should
+ // turn that into StringPiece(NULL) but in fact it uses
+ // StringPiece(text.begin() - 1, 0). Oops.
+ for (int i = 0; i < nsubmatch; i++)
+ if (result->submatch[i].begin() == text.begin() - 1)
+ result->submatch[i] = NULL;
+ delete[] argptr;
+ delete[] a;
+ break;
+ }
+ }
+
+ if (!result->matched)
+ memset(result->submatch, 0, sizeof result->submatch);
+}
+
+// Checks whether r is okay given that correct is the right answer.
+// Specifically, r's answers have to match (but it doesn't have to
+// claim to have all the answers).
+static bool ResultOkay(const Result& r, const Result& correct) {
+ if (r.skipped)
+ return true;
+ if (r.matched != correct.matched)
+ return false;
+ if (r.have_submatch || r.have_submatch0) {
+ for (int i = 0; i < kMaxSubmatch; i++) {
+ if (correct.submatch[i].begin() != r.submatch[i].begin() ||
+ correct.submatch[i].size() != r.submatch[i].size())
+ return false;
+ if (!r.have_submatch)
+ break;
+ }
+ }
+ return true;
+}
+
+// Runs a single test.
+bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
+ Prog::Anchor anchor) {
+ // Backtracking is the gold standard.
+ Result correct;
+ RunSearch(kEngineBacktrack, text, context, anchor, &correct);
+ if (correct.skipped) {
+ if (regexp_ == NULL)
+ return true;
+ LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
+ << " " << FormatMode(flags_);
+ return false;
+ }
+ VLOG(1) << "Try: regexp " << CEscape(regexp_str_)
+ << " text " << CEscape(text)
+ << " (" << FormatKind(kind_)
+ << ", " << FormatAnchor(anchor)
+ << ", " << FormatMode(flags_)
+ << ")";
+
+ // Compare the others.
+ bool all_okay = true;
+ for (Engine i = kEngineBacktrack+1; i < kEngineMax; i++) {
+ if (!(Engines() & (1<<i)))
+ continue;
+
+ Result r;
+ RunSearch(i, text, context, anchor, &r);
+ if (ResultOkay(r, correct)) {
+ if (FLAGS_log_okay)
+ LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor);
+ continue;
+ }
+
+ // We disagree with PCRE on the meaning of some Unicode matches.
+ // In particular, we treat all non-ASCII UTF-8 as word characters.
+ // We also treat "empty" character sets like [^\w\W] as being
+ // impossible to match, while PCRE apparently excludes some code
+ // points (e.g., 0x0080) from both \w and \W.
+ if (i == kEnginePCRE && NonASCII(text))
+ continue;
+
+ if (!r.untrusted)
+ all_okay = false;
+
+ LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text,
+ context, anchor);
+ if (r.matched != correct.matched) {
+ if (r.matched) {
+ LOG(INFO) << " Should not match (but does).";
+ } else {
+ LOG(INFO) << " Should match (but does not).";
+ continue;
+ }
+ }
+ for (int i = 0; i < 1+num_captures_; i++) {
+ if (r.submatch[i].begin() != correct.submatch[i].begin() ||
+ r.submatch[i].end() != correct.submatch[i].end()) {
+ LOG(INFO) <<
+ StringPrintf(" $%d: should be %s is %s",
+ i,
+ FormatCapture(text, correct.submatch[i]).c_str(),
+ FormatCapture(text, r.submatch[i]).c_str());
+ } else {
+ LOG(INFO) <<
+ StringPrintf(" $%d: %s ok", i,
+ FormatCapture(text, r.submatch[i]).c_str());
+ }
+ }
+ }
+
+ if (!all_okay) {
+ if (FLAGS_max_regexp_failures > 0 && --FLAGS_max_regexp_failures == 0)
+ LOG(QFATAL) << "Too many regexp failures.";
+ }
+
+ return all_okay;
+}
+
+void TestInstance::LogMatch(const char* prefix, Engine e,
+ const StringPiece& text, const StringPiece& context,
+ Prog::Anchor anchor) {
+ LOG(INFO) << prefix
+ << EngineString(e)
+ << " regexp "
+ << CEscape(regexp_str_)
+ << " "
+ << CEscape(regexp_->ToString())
+ << " text "
+ << CEscape(text)
+ << " ("
+ << text.begin() - context.begin()
+ << ","
+ << text.end() - context.begin()
+ << ") of context "
+ << CEscape(context)
+ << " (" << FormatKind(kind_)
+ << ", " << FormatAnchor(anchor)
+ << ", " << FormatMode(flags_)
+ << ")";
+}
+
+static Prog::MatchKind kinds[] = {
+ Prog::kFirstMatch,
+ Prog::kLongestMatch,
+ Prog::kFullMatch,
+};
+
+// Test all possible match kinds and parse modes.
+Tester::Tester(const StringPiece& regexp) {
+ error_ = false;
+ for (int i = 0; i < arraysize(kinds); i++) {
+ for (int j = 0; j < arraysize(parse_modes); j++) {
+ TestInstance* t = new TestInstance(regexp, kinds[i],
+ parse_modes[j].parse_flags);
+ error_ |= t->error();
+ v_.push_back(t);
+ }
+ }
+}
+
+Tester::~Tester() {
+ for (int i = 0; i < v_.size(); i++)
+ delete v_[i];
+}
+
+bool Tester::TestCase(const StringPiece& text, const StringPiece& context,
+ Prog::Anchor anchor) {
+ bool okay = true;
+ for (int i = 0; i < v_.size(); i++)
+ okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor));
+ return okay;
+}
+
+static Prog::Anchor anchors[] = {
+ Prog::kAnchored,
+ Prog::kUnanchored
+};
+
+bool Tester::TestInput(const StringPiece& text) {
+ bool okay = TestInputInContext(text, text);
+ if (text.size() > 0) {
+ StringPiece sp;
+ sp = text;
+ sp.remove_prefix(1);
+ okay &= TestInputInContext(sp, text);
+ sp = text;
+ sp.remove_suffix(1);
+ okay &= TestInputInContext(sp, text);
+ }
+ return okay;
+}
+
+bool Tester::TestInputInContext(const StringPiece& text,
+ const StringPiece& context) {
+ bool okay = true;
+ for (int i = 0; i < arraysize(anchors); i++)
+ okay &= TestCase(text, context, anchors[i]);
+ return okay;
+}
+
+bool TestRegexpOnText(const StringPiece& regexp,
+ const StringPiece& text) {
+ Tester t(regexp);
+ return t.TestInput(text);
+}
+
+} // namespace re2
« no previous file with comments | « third_party/re2/re2/testing/tester.h ('k') | third_party/re2/re2/testing/unicode_test.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698