third_party/re2/re2/testing/tester.cc - Issue 10575037: Include RE2 library

Unified Diff: third_party/re2/re2/testing/tester.cc

Issue 10575037: Include RE2 library (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Less intrusive fix for Android Created 8 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: third_party/re2/re2/testing/tester.cc

diff --git a/third_party/re2/re2/testing/tester.cc b/third_party/re2/re2/testing/tester.cc

new file mode 100644

index 0000000000000000000000000000000000000000..003dc5add05696f9dfcdb88936aa1230831a265e

--- /dev/null

+++ b/third_party/re2/re2/testing/tester.cc

@@ -0,0 +1,640 @@

+// Use of this source code is governed by a BSD-style

+// license that can be found in the LICENSE file.

+// Regular expression engine tester -- test all the implementations against each other.

+#include "util/util.h"

+#include "util/flags.h"

+#include "re2/testing/tester.h"

+#include "re2/prog.h"

+#include "re2/re2.h"

+#include "re2/regexp.h"

+DEFINE_bool(dump_prog, false, "dump regexp program");

+DEFINE_bool(log_okay, false, "log successful runs");

+DEFINE_bool(dump_rprog, false, "dump reversed regexp program");

+DEFINE_int32(max_regexp_failures, 100,

+ "maximum number of regexp test failures (-1 = unlimited)");

+DEFINE_string(regexp_engines, "", "pattern to select regexp engines to test");

+namespace re2 {

+enum {

+ kMaxSubmatch = 1+16, // $0...$16

+};

+const char* engine_types[kEngineMax] = {

+ "Backtrack",

+ "NFA",

+ "DFA",

+ "DFA1",

+ "OnePass",

+ "BitState",

+ "RE2",

+ "RE2a",

+ "RE2b",

+ "PCRE",

+};

+// Returns the name string for the type t.

+static string EngineString(Engine t) {

+ if (t < 0 || t >= arraysize(engine_types) || engine_types[t] == NULL) {

+ return StringPrintf("type%d", static_cast<int>(t));

+ }

+ return engine_types[t];

+// Returns bit mask of engines to use.

+static uint32 Engines() {

+ static uint32 cached_engines;

+ static bool did_parse;

+ if (did_parse)

+ return cached_engines;

+ if (FLAGS_regexp_engines.empty()) {

+ cached_engines = ~0;

+ } else {

+ for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++)

+ if (strstr(EngineString(i).c_str(), FLAGS_regexp_engines.c_str()))

+ cached_engines |= 1<<i;

+ }

+ if (cached_engines == 0)

+ LOG(INFO) << "Warning: no engines enabled.";

+ if (!UsingPCRE)

+ cached_engines &= ~(1<<kEnginePCRE);

+ for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) {

+ if (cached_engines & (1<<i))

+ LOG(INFO) << EngineString(i) << " enabled";

+ }

+ did_parse = true;

+ return cached_engines;

+// The result of running a match.

+struct TestInstance::Result {

+ bool skipped; // test skipped: wasn't applicable

+ bool matched; // found a match

+ bool untrusted; // don't really trust the answer

+ bool have_submatch; // computed all submatch info

+ bool have_submatch0; // computed just submatch[0]

+ StringPiece submatch[kMaxSubmatch];

+};

+typedef TestInstance::Result Result;

+// Formats a single capture range s in text in the form (a,b)

+// where a and b are the starting and ending offsets of s in text.

+static string FormatCapture(const StringPiece& text, const StringPiece& s) {

+ if (s.begin() == NULL)

+ return "(?,?)";

+ return StringPrintf("(%d,%d)",

+ static_cast<int>(s.begin() - text.begin()),

+ static_cast<int>(s.end() - text.begin()));

+// Returns whether text contains non-ASCII (>= 0x80) bytes.

+static bool NonASCII(const StringPiece& text) {

+ for (int i = 0; i < text.size(); i++)

+ if ((uint8)text[i] >= 0x80)

+ return true;

+ return false;

+// Returns string representation of match kind.

+static string FormatKind(Prog::MatchKind kind) {

+ switch (kind) {

+ case Prog::kFullMatch:

+ return "full match";

+ case Prog::kLongestMatch:

+ return "longest match";

+ case Prog::kFirstMatch:

+ return "first match";

+ case Prog::kManyMatch:

+ return "many match";

+ }

+ return "???";

+// Returns string representation of anchor kind.

+static string FormatAnchor(Prog::Anchor anchor) {

+ switch (anchor) {

+ case Prog::kAnchored:

+ return "anchored";

+ case Prog::kUnanchored:

+ return "unanchored";

+ }

+ return "???";

+struct ParseMode {

+ Regexp::ParseFlags parse_flags;

+ string desc;

+};

+static const Regexp::ParseFlags single_line =

+ Regexp::LikePerl;

+static const Regexp::ParseFlags multi_line =

+ static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine);

+static ParseMode parse_modes[] = {

+ { single_line, "single-line" },

+ { single_line|Regexp::Latin1, "single-line, latin1" },

+ { multi_line, "multiline" },

+ { multi_line|Regexp::NonGreedy, "multiline, nongreedy" },

+ { multi_line|Regexp::Latin1, "multiline, latin1" },

+};

+static string FormatMode(Regexp::ParseFlags flags) {

+ for (int i = 0; i < arraysize(parse_modes); i++)

+ if (parse_modes[i].parse_flags == flags)

+ return parse_modes[i].desc;

+ return StringPrintf("%#x", static_cast<uint>(flags));

+// Constructs and saves all the matching engines that

+// will be required for the given tests.

+TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,

+ Regexp::ParseFlags flags)

+ : regexp_str_(regexp_str),

+ kind_(kind),

+ flags_(flags),

+ error_(false),

+ regexp_(NULL),

+ num_captures_(0),

+ prog_(NULL),

+ rprog_(NULL),

+ re_(NULL),

+ re2_(NULL) {

+ VLOG(1) << CEscape(regexp_str);

+ // Compile regexp to prog.

+ // Always required - needed for backtracking (reference implementation).

+ RegexpStatus status;

+ regexp_ = Regexp::Parse(regexp_str, flags, &status);

+ if (regexp_ == NULL) {

+ LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)

+ << " mode: " << FormatMode(flags);

+ error_ = true;

+ return;

+ }

+ num_captures_ = regexp_->NumCaptures();

+ prog_ = regexp_->CompileToProg(0);

+ if (prog_ == NULL) {

+ LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_);

+ error_ = true;

+ return;

+ }

+ if (FLAGS_dump_prog) {

+ LOG(INFO) << "Prog for "

+ << " regexp "

+ << CEscape(regexp_str_)

+ << " (" << FormatKind(kind_)

+ << ", " << FormatMode(flags_)

+ << ")\n"

+ << prog_->Dump();

+ }

+ // Compile regexp to reversed prog. Only needed for DFA engines.

+ if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) {

+ rprog_ = regexp_->CompileToReverseProg(0);

+ if (rprog_ == NULL) {

+ LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_);

+ error_ = true;

+ return;

+ }

+ if (FLAGS_dump_rprog)

+ LOG(INFO) << rprog_->Dump();

+ }

+ // Create re string that will be used for RE and RE2.

+ string re = regexp_str.as_string();

+ // Accomodate flags.

+ // Regexp::Latin1 will be accomodated below.

+ if (!(flags & Regexp::OneLine))

+ re = "(?m)" + re;

+ if (flags & Regexp::NonGreedy)

+ re = "(?U)" + re;

+ if (flags & Regexp::DotNL)

+ re = "(?s)" + re;

+ // Compile regexp to RE2.

+ if (Engines() & ((1<<kEngineRE2)|(1<<kEngineRE2a)|(1<<kEngineRE2b))) {

+ RE2::Options options;

+ if (flags & Regexp::Latin1)

+ options.set_encoding(RE2::Options::EncodingLatin1);

+ if (kind_ == Prog::kLongestMatch)

+ options.set_longest_match(true);

+ re2_ = new RE2(re, options);

+ if (!re2_->error().empty()) {

+ LOG(INFO) << "Cannot RE2: " << CEscape(re);

+ error_ = true;

+ return;

+ }

+ // Compile regexp to RE.

+ // PCRE as exposed by the RE interface isn't always usable.

+ // 1. It disagrees about handling of empty-string reptitions

+ // like matching (a*)* against "b". PCRE treats the (a*) as

+ // occurring once, while we treat it as occurring not at all.

+ // 2. It treats $ as this weird thing meaning end of string

+ // or before the \n at the end of the string.

+ // 3. It doesn't implement POSIX leftmost-longest matching.

+ // MimicsPCRE() detects 1 and 2.

+ if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() &&

+ kind_ != Prog::kLongestMatch) {

+ PCRE_Options o;

+ o.set_option(PCRE::UTF8);

+ if (flags & Regexp::Latin1)

+ o.set_option(PCRE::None);

+ // PCRE has interface bug keeping us from finding $0, so

+ // add one more layer of parens.

+ re_ = new PCRE("("+re+")", o);

+ if (!re_->error().empty()) {

+ LOG(INFO) << "Cannot PCRE: " << CEscape(re);

+ error_ = true;

+ return;

+ }

+TestInstance::~TestInstance() {

+ if (regexp_)

+ regexp_->Decref();

+ delete prog_;

+ delete rprog_;

+ delete re_;

+ delete re2_;

+// Runs a single search using the named engine type.

+// This interface hides all the irregularities of the various

+// engine interfaces from the rest of this file.

+void TestInstance::RunSearch(Engine type,

+ const StringPiece& orig_text,

+ const StringPiece& orig_context,

+ Prog::Anchor anchor,

+ Result *result) {

+ memset(result, 0, sizeof *result);

+ if (regexp_ == NULL) {

+ result->skipped = true;

+ return;

+ }

+ int nsubmatch = 1 + num_captures_; // NumCaptures doesn't count $0

+ if (nsubmatch > kMaxSubmatch)

+ nsubmatch = kMaxSubmatch;

+ StringPiece text = orig_text;

+ StringPiece context = orig_context;

+ switch (type) {

+ default:

+ LOG(FATAL) << "Bad RunSearch type: " << (int)type;

+ case kEngineBacktrack:

+ if (prog_ == NULL) {

+ result->skipped = true;

+ break;

+ }

+ result->matched =

+ prog_->UnsafeSearchBacktrack(text, context, anchor, kind_,

+ result->submatch, nsubmatch);

+ result->have_submatch = true;

+ break;

+ case kEngineNFA:

+ if (prog_ == NULL) {

+ result->skipped = true;

+ break;

+ }

+ result->matched =

+ prog_->SearchNFA(text, context, anchor, kind_,

+ result->submatch, nsubmatch);

+ result->have_submatch = true;

+ break;

+ case kEngineDFA:

+ if (prog_ == NULL) {

+ result->skipped = true;

+ break;

+ }

+ result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL,

+ &result->skipped, NULL);

+ break;

+ case kEngineDFA1:

+ if (prog_ == NULL || rprog_ == NULL) {

+ result->skipped = true;

+ break;

+ }

+ result->matched =

+ prog_->SearchDFA(text, context, anchor, kind_, result->submatch,

+ &result->skipped, NULL);

+ // If anchored, no need for second run,

+ // but do it anyway to find more bugs.

+ if (result->matched) {

+ if (!rprog_->SearchDFA(result->submatch[0], context,

+ Prog::kAnchored, Prog::kLongestMatch,

+ result->submatch,

+ &result->skipped, NULL)) {

+ LOG(ERROR) << "Reverse DFA inconsistency: " << CEscape(regexp_str_)

+ << " on " << CEscape(text);

+ result->matched = false;

+ }

+ result->have_submatch0 = true;

+ break;

+ case kEngineOnePass:

+ if (prog_ == NULL ||

+ anchor == Prog::kUnanchored ||

+ !prog_->IsOnePass() ||

+ nsubmatch > Prog::kMaxOnePassCapture) {

+ result->skipped = true;

+ break;

+ }

+ result->matched = prog_->SearchOnePass(text, context, anchor, kind_,

+ result->submatch, nsubmatch);

+ result->have_submatch = true;

+ break;

+ case kEngineBitState:

+ if (prog_ == NULL) {

+ result->skipped = true;

+ break;

+ }

+ result->matched = prog_->SearchBitState(text, context, anchor, kind_,

+ result->submatch, nsubmatch);

+ result->have_submatch = true;

+ break;

+ case kEngineRE2:

+ case kEngineRE2a:

+ case kEngineRE2b: {

+ if (!re2_ || text.end() != context.end()) {

+ result->skipped = true;

+ break;

+ }

+ RE2::Anchor re_anchor;

+ if (anchor == Prog::kAnchored)

+ re_anchor = RE2::ANCHOR_START;

+ else

+ re_anchor = RE2::UNANCHORED;

+ if (kind_ == Prog::kFullMatch)

+ re_anchor = RE2::ANCHOR_BOTH;

+ result->matched = re2_->Match(context,

+ text.begin() - context.begin(),

+ text.end() - context.begin(),

+ re_anchor, result->submatch, nsubmatch);

+ result->have_submatch = nsubmatch > 0;

+ break;

+ }

+ case kEnginePCRE: {

+ if (!re_ || text.begin() != context.begin() ||

+ text.end() != context.end()) {

+ result->skipped = true;

+ break;

+ }

+ const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch];

+ PCRE::Arg *a = new PCRE::Arg[nsubmatch];

+ for (int i = 0; i < nsubmatch; i++) {

+ a[i] = PCRE::Arg(&result->submatch[i]);

+ argptr[i] = &a[i];

+ }

+ int consumed;

+ PCRE::Anchor pcre_anchor;

+ if (anchor == Prog::kAnchored)

+ pcre_anchor = PCRE::ANCHOR_START;

+ else

+ pcre_anchor = PCRE::UNANCHORED;

+ if (kind_ == Prog::kFullMatch)

+ pcre_anchor = PCRE::ANCHOR_BOTH;

+ re_->ClearHitLimit();

+ result->matched =

+ re_->DoMatch(text,

+ pcre_anchor,

+ &consumed,

+ argptr, nsubmatch);

+ if (re_->HitLimit()) {

+ result->untrusted = true;

+ delete[] argptr;

+ delete[] a;

+ break;

+ }

+ result->have_submatch = true;

+ // Work around RE interface bug: PCRE returns -1 as the

+ // offsets for an unmatched subexpression, and RE should

+ // turn that into StringPiece(NULL) but in fact it uses

+ // StringPiece(text.begin() - 1, 0). Oops.

+ for (int i = 0; i < nsubmatch; i++)

+ if (result->submatch[i].begin() == text.begin() - 1)

+ result->submatch[i] = NULL;

+ delete[] argptr;

+ delete[] a;

+ break;

+ }

+ if (!result->matched)

+ memset(result->submatch, 0, sizeof result->submatch);

+// Checks whether r is okay given that correct is the right answer.

+// Specifically, r's answers have to match (but it doesn't have to

+// claim to have all the answers).

+static bool ResultOkay(const Result& r, const Result& correct) {

+ if (r.skipped)

+ return true;

+ if (r.matched != correct.matched)

+ return false;

+ if (r.have_submatch || r.have_submatch0) {

+ for (int i = 0; i < kMaxSubmatch; i++) {

+ if (correct.submatch[i].begin() != r.submatch[i].begin() ||

+ correct.submatch[i].size() != r.submatch[i].size())

+ return false;

+ if (!r.have_submatch)

+ break;

+ }

+ return true;

+// Runs a single test.

+bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,

+ Prog::Anchor anchor) {

+ // Backtracking is the gold standard.

+ Result correct;

+ RunSearch(kEngineBacktrack, text, context, anchor, &correct);

+ if (correct.skipped) {

+ if (regexp_ == NULL)

+ return true;

+ LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)

+ << " " << FormatMode(flags_);

+ return false;

+ }

+ VLOG(1) << "Try: regexp " << CEscape(regexp_str_)

+ << " text " << CEscape(text)

+ << " (" << FormatKind(kind_)

+ << ", " << FormatAnchor(anchor)

+ << ", " << FormatMode(flags_)

+ << ")";

+ // Compare the others.

+ bool all_okay = true;

+ for (Engine i = kEngineBacktrack+1; i < kEngineMax; i++) {

+ if (!(Engines() & (1<<i)))

+ continue;

+ Result r;

+ RunSearch(i, text, context, anchor, &r);

+ if (ResultOkay(r, correct)) {

+ if (FLAGS_log_okay)

+ LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor);

+ continue;

+ }

+ // We disagree with PCRE on the meaning of some Unicode matches.

+ // In particular, we treat all non-ASCII UTF-8 as word characters.

+ // We also treat "empty" character sets like [^\w\W] as being

+ // impossible to match, while PCRE apparently excludes some code

+ // points (e.g., 0x0080) from both \w and \W.

+ if (i == kEnginePCRE && NonASCII(text))

+ continue;

+ if (!r.untrusted)

+ all_okay = false;

+ LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text,

+ context, anchor);

+ if (r.matched != correct.matched) {

+ if (r.matched) {

+ LOG(INFO) << " Should not match (but does).";

+ } else {

+ LOG(INFO) << " Should match (but does not).";

+ continue;

+ }

+ for (int i = 0; i < 1+num_captures_; i++) {

+ if (r.submatch[i].begin() != correct.submatch[i].begin() ||

+ r.submatch[i].end() != correct.submatch[i].end()) {

+ LOG(INFO) <<

+ StringPrintf(" $%d: should be %s is %s",

+ i,

+ FormatCapture(text, correct.submatch[i]).c_str(),

+ FormatCapture(text, r.submatch[i]).c_str());

+ } else {

+ LOG(INFO) <<

+ StringPrintf(" $%d: %s ok", i,

+ FormatCapture(text, r.submatch[i]).c_str());

+ }

+ if (!all_okay) {

+ if (FLAGS_max_regexp_failures > 0 && --FLAGS_max_regexp_failures == 0)

+ LOG(QFATAL) << "Too many regexp failures.";

+ }

+ return all_okay;

+void TestInstance::LogMatch(const char* prefix, Engine e,

+ const StringPiece& text, const StringPiece& context,

+ Prog::Anchor anchor) {

+ LOG(INFO) << prefix

+ << EngineString(e)

+ << " regexp "

+ << CEscape(regexp_str_)

+ << " "

+ << CEscape(regexp_->ToString())

+ << " text "

+ << CEscape(text)

+ << " ("

+ << text.begin() - context.begin()

+ << ","

+ << text.end() - context.begin()

+ << ") of context "

+ << CEscape(context)

+ << " (" << FormatKind(kind_)

+ << ", " << FormatAnchor(anchor)

+ << ", " << FormatMode(flags_)

+ << ")";

+static Prog::MatchKind kinds[] = {

+ Prog::kFirstMatch,

+ Prog::kLongestMatch,

+ Prog::kFullMatch,

+};

+// Test all possible match kinds and parse modes.

+Tester::Tester(const StringPiece& regexp) {

+ error_ = false;

+ for (int i = 0; i < arraysize(kinds); i++) {

+ for (int j = 0; j < arraysize(parse_modes); j++) {

+ TestInstance* t = new TestInstance(regexp, kinds[i],

+ parse_modes[j].parse_flags);

+ error_ |= t->error();

+ v_.push_back(t);

+ }

+Tester::~Tester() {

+ for (int i = 0; i < v_.size(); i++)

+ delete v_[i];

+bool Tester::TestCase(const StringPiece& text, const StringPiece& context,

+ Prog::Anchor anchor) {

+ bool okay = true;

+ for (int i = 0; i < v_.size(); i++)

+ okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor));

+ return okay;

+static Prog::Anchor anchors[] = {

+ Prog::kAnchored,

+ Prog::kUnanchored

+};

+bool Tester::TestInput(const StringPiece& text) {

+ bool okay = TestInputInContext(text, text);

+ if (text.size() > 0) {

+ StringPiece sp;

+ sp = text;

+ sp.remove_prefix(1);

+ okay &= TestInputInContext(sp, text);

+ sp = text;

+ sp.remove_suffix(1);

+ okay &= TestInputInContext(sp, text);

+ }

+ return okay;

+bool Tester::TestInputInContext(const StringPiece& text,

+ const StringPiece& context) {

+ bool okay = true;

+ for (int i = 0; i < arraysize(anchors); i++)

+ okay &= TestCase(text, context, anchors[i]);

+ return okay;

+bool TestRegexpOnText(const StringPiece& regexp,

+ const StringPiece& text) {

+ Tester t(regexp);

+ return t.TestInput(text);

+} // namespace re2

« no previous file with comments | « third_party/re2/re2/testing/tester.h ('k') | third_party/re2/re2/testing/unicode_test.py » ('j') | no next file with comments »