third_party/re2/re2/re2.cc - Issue 10575037: Include RE2 library

Unified Diff: third_party/re2/re2/re2.cc

Issue 10575037: Include RE2 library (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Less intrusive fix for Android Created 8 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: third_party/re2/re2/re2.cc

diff --git a/third_party/re2/re2/re2.cc b/third_party/re2/re2/re2.cc

new file mode 100644

index 0000000000000000000000000000000000000000..858b6a48dafa7f0be2ffa6ff2d2363d8b0782a7c

--- /dev/null

+++ b/third_party/re2/re2/re2.cc

@@ -0,0 +1,1186 @@

+// Use of this source code is governed by a BSD-style

+// license that can be found in the LICENSE file.

+// Regular expression interface RE2.

+//

+// Originally the PCRE C++ wrapper, but adapted to use

+// the new automata-based regular expression engines.

+#include "re2/re2.h"

+#include <ctype.h>

+#include <stdio.h>

+#include <string>

+#ifdef WIN32

+#define strtoll _strtoi64

+#define strtoull _strtoui64

+#define strtof strtod

+#else

+#include <pthread.h>

+#endif

+#include <errno.h>

+#include "util/util.h"

+#include "util/flags.h"

+#include "re2/prog.h"

+#include "re2/regexp.h"

+DEFINE_bool(trace_re2, false, "trace RE2 execution");

+namespace re2 {

+// Maximum number of args we can set

+static const int kMaxArgs = 16;

+static const int kVecSize = 1+kMaxArgs;

+const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::FullMatchN> RE2::FullMatch;

+const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::PartialMatchN> RE2::PartialMatch;

+const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::ConsumeN> RE2::Consume;

+const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::FindAndConsumeN> RE2::FindAndConsume;

+// Commonly-used option sets; arguments to constructor are:

+// utf8 input

+// posix syntax

+// longest match

+// log errors

+const RE2::Options RE2::DefaultOptions; // EncodingUTF8, false, false, true

+const RE2::Options RE2::Latin1(RE2::Options::EncodingLatin1, false, false, true);

+//const RE2::Options RE2::POSIX(RE2::Options::EncodingUTF8, true, true, true);

+const RE2::Options RE2::Quiet(RE2::Options::EncodingUTF8, false, false, false);

+// If a regular expression has no error, its error_ field points here

+static const string empty_string;

+// Converts from Regexp error code to RE2 error code.

+// Maybe some day they will diverge. In any event, this

+// hides the existence of Regexp from RE2 users.

+static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {

+ switch (code) {

+ case re2::kRegexpSuccess:

+ return RE2::NoError;

+ case re2::kRegexpInternalError:

+ return RE2::ErrorInternal;

+ case re2::kRegexpBadEscape:

+ return RE2::ErrorBadEscape;

+ case re2::kRegexpBadCharClass:

+ return RE2::ErrorBadCharClass;

+ case re2::kRegexpBadCharRange:

+ return RE2::ErrorBadCharRange;

+ case re2::kRegexpMissingBracket:

+ return RE2::ErrorMissingBracket;

+ case re2::kRegexpMissingParen:

+ return RE2::ErrorMissingParen;

+ case re2::kRegexpTrailingBackslash:

+ return RE2::ErrorTrailingBackslash;

+ case re2::kRegexpRepeatArgument:

+ return RE2::ErrorRepeatArgument;

+ case re2::kRegexpRepeatSize:

+ return RE2::ErrorRepeatSize;

+ case re2::kRegexpRepeatOp:

+ return RE2::ErrorRepeatOp;

+ case re2::kRegexpBadPerlOp:

+ return RE2::ErrorBadPerlOp;

+ case re2::kRegexpBadUTF8:

+ return RE2::ErrorBadUTF8;

+ case re2::kRegexpBadNamedCapture:

+ return RE2::ErrorBadNamedCapture;

+ }

+ return RE2::ErrorInternal;

+static string trunc(const StringPiece& pattern) {

+ if (pattern.size() < 100)

+ return pattern.as_string();

+ return pattern.substr(0, 100).as_string() + "...";

+RE2::RE2(const char* pattern) {

+ Init(pattern, DefaultOptions);

+RE2::RE2(const string& pattern) {

+ Init(pattern, DefaultOptions);

+RE2::RE2(const StringPiece& pattern) {

+ Init(pattern, DefaultOptions);

+RE2::RE2(const StringPiece& pattern, const Options& options) {

+ Init(pattern, options);

+int RE2::Options::ParseFlags() const {

+ int flags = Regexp::ClassNL;

+ switch (encoding()) {

+ default:

+ LOG(ERROR) << "Unknown encoding " << encoding();

+ break;

+ case RE2::Options::EncodingUTF8:

+ break;

+ case RE2::Options::EncodingLatin1:

+ flags |= Regexp::Latin1;

+ break;

+ }

+ if (!posix_syntax())

+ flags |= Regexp::LikePerl;

+ if (literal())

+ flags |= Regexp::Literal;

+ if (never_nl())

+ flags |= Regexp::NeverNL;

+ if (!case_sensitive())

+ flags |= Regexp::FoldCase;

+ if (perl_classes())

+ flags |= Regexp::PerlClasses;

+ if (word_boundary())

+ flags |= Regexp::PerlB;

+ if (one_line())

+ flags |= Regexp::OneLine;

+ return flags;

+void RE2::Init(const StringPiece& pattern, const Options& options) {

+ mutex_ = new Mutex;

+ pattern_ = pattern.as_string();

+ options_.Copy(options);

+ error_ = &empty_string;

+ error_code_ = NoError;

+ suffix_regexp_ = NULL;

+ entire_regexp_ = NULL;

+ prog_ = NULL;

+ rprog_ = NULL;

+ named_groups_ = NULL;

+ group_names_ = NULL;

+ num_captures_ = -1;

+ RegexpStatus status;

+ entire_regexp_ = Regexp::Parse(

+ pattern_,

+ static_cast<Regexp::ParseFlags>(options_.ParseFlags()),

+ &status);

+ if (entire_regexp_ == NULL) {

+ if (error_ == &empty_string)

+ error_ = new string(status.Text());

+ if (options_.log_errors()) {

+ LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "

+ << status.Text();

+ }

+ error_arg_ = status.error_arg().as_string();

+ error_code_ = RegexpErrorToRE2(status.code());

+ return;

+ }

+ prefix_.clear();

+ prefix_foldcase_ = false;

+ re2::Regexp* suffix;

+ if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))

+ suffix_regexp_ = suffix;

+ else

+ suffix_regexp_ = entire_regexp_->Incref();

+ // Two thirds of the memory goes to the forward Prog,

+ // one third to the reverse prog, because the forward

+ // Prog has two DFAs but the reverse prog has one.

+ prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);

+ if (prog_ == NULL) {

+ if (options_.log_errors())

+ LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";

+ error_ = new string("pattern too large - compile failed");

+ error_code_ = RE2::ErrorPatternTooLarge;

+ return;

+ }

+ // Could delay this until the first match call that

+ // cares about submatch information, but the one-pass

+ // machine's memory gets cut from the DFA memory budget,

+ // and that is harder to do if the DFA has already

+ // been built.

+ is_one_pass_ = prog_->IsOnePass();

+// Returns rprog_, computing it if needed.

+re2::Prog* RE2::ReverseProg() const {

+ MutexLock l(mutex_);

+ if (rprog_ == NULL && error_ == &empty_string) {

+ rprog_ = suffix_regexp_->CompileToReverseProg(options_.max_mem()/3);

+ if (rprog_ == NULL) {

+ if (options_.log_errors())

+ LOG(ERROR) << "Error reverse compiling '" << trunc(pattern_) << "'";

+ error_ = new string("pattern too large - reverse compile failed");

+ error_code_ = RE2::ErrorPatternTooLarge;

+ return NULL;

+ }

+ return rprog_;

+static const map<string, int> empty_named_groups;

+static const map<int, string> empty_group_names;

+RE2::~RE2() {

+ if (suffix_regexp_)

+ suffix_regexp_->Decref();

+ if (entire_regexp_)

+ entire_regexp_->Decref();

+ delete mutex_;

+ delete prog_;

+ delete rprog_;

+ if (error_ != &empty_string)

+ delete error_;

+ if (named_groups_ != NULL && named_groups_ != &empty_named_groups)

+ delete named_groups_;

+ if (group_names_ != NULL && group_names_ != &empty_group_names)

+ delete group_names_;

+int RE2::ProgramSize() const {

+ if (prog_ == NULL)

+ return -1;

+ return prog_->size();

+// Returns named_groups_, computing it if needed.

+const map<string, int>& RE2::NamedCapturingGroups() const {

+ MutexLock l(mutex_);

+ if (!ok())

+ return empty_named_groups;

+ if (named_groups_ == NULL) {

+ named_groups_ = suffix_regexp_->NamedCaptures();

+ if (named_groups_ == NULL)

+ named_groups_ = &empty_named_groups;

+ }

+ return *named_groups_;

+// Returns group_names_, computing it if needed.

+const map<int, string>& RE2::CapturingGroupNames() const {

+ MutexLock l(mutex_);

+ if (!ok())

+ return empty_group_names;

+ if (group_names_ == NULL) {

+ group_names_ = suffix_regexp_->CaptureNames();

+ if (group_names_ == NULL)

+ group_names_ = &empty_group_names;

+ }

+ return *group_names_;

+/***** Convenience interfaces *****/

+bool RE2::FullMatchN(const StringPiece& text, const RE2& re,

+ const Arg* const args[], int n) {

+ return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);

+bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,

+ const Arg* const args[], int n) {

+ return re.DoMatch(text, UNANCHORED, NULL, args, n);

+bool RE2::ConsumeN(StringPiece* input, const RE2& re,

+ const Arg* const args[], int n) {

+ int consumed;

+ if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {

+ input->remove_prefix(consumed);

+ return true;

+ } else {

+ return false;

+ }

+bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,

+ const Arg* const args[], int n) {

+ int consumed;

+ if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {

+ input->remove_prefix(consumed);

+ return true;

+ } else {

+ return false;

+ }

+// Returns the maximum submatch needed for the rewrite to be done by Replace().

+// E.g. if rewrite == "foo \\2,\\1", returns 2.

+static int MaxSubmatch(const StringPiece& rewrite) {

+ int max = 0;

+ for (const char *s = rewrite.data(), *end = s + rewrite.size();

+ s < end; s++) {

+ if (*s == '\\') {

+ s++;

+ int c = (s < end) ? *s : -1;

+ if (isdigit(c)) {

+ int n = (c - '0');

+ if (n > max)

+ max = n;

+ }

+ return max;

+bool RE2::Replace(string *str,

+ const RE2& re,

+ const StringPiece& rewrite) {

+ StringPiece vec[kVecSize];

+ int nvec = 1 + MaxSubmatch(rewrite);

+ if (nvec > arraysize(vec))

+ return false;

+ if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))

+ return false;

+ string s;

+ if (!re.Rewrite(&s, rewrite, vec, nvec))

+ return false;

+ assert(vec[0].begin() >= str->data());

+ assert(vec[0].end() <= str->data()+str->size());

+ str->replace(vec[0].data() - str->data(), vec[0].size(), s);

+ return true;

+int RE2::GlobalReplace(string *str,

+ const RE2& re,

+ const StringPiece& rewrite) {

+ StringPiece vec[kVecSize];

+ int nvec = 1 + MaxSubmatch(rewrite);

+ if (nvec > arraysize(vec))

+ return false;

+ const char* p = str->data();

+ const char* ep = p + str->size();

+ const char* lastend = NULL;

+ string out;

+ int count = 0;

+ while (p <= ep) {

+ if (!re.Match(*str, p - str->data(), str->size(), UNANCHORED, vec, nvec))

+ break;

+ if (p < vec[0].begin())

+ out.append(p, vec[0].begin() - p);

+ if (vec[0].begin() == lastend && vec[0].size() == 0) {

+ // Disallow empty match at end of last match: skip ahead.

+ if (p < ep)

+ out.append(p, 1);

+ p++;

+ continue;

+ }

+ re.Rewrite(&out, rewrite, vec, nvec);

+ p = vec[0].end();

+ lastend = p;

+ count++;

+ }

+ if (count == 0)

+ return 0;

+ if (p < ep)

+ out.append(p, ep - p);

+ swap(out, *str);

+ return count;

+bool RE2::Extract(const StringPiece &text,

+ const RE2& re,

+ const StringPiece &rewrite,

+ string *out) {

+ StringPiece vec[kVecSize];

+ int nvec = 1 + MaxSubmatch(rewrite);

+ if (nvec > arraysize(vec))

+ return false;

+ if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))

+ return false;

+ out->clear();

+ return re.Rewrite(out, rewrite, vec, nvec);

+string RE2::QuoteMeta(const StringPiece& unquoted) {

+ string result;

+ result.reserve(unquoted.size() << 1);

+ // Escape any ascii character not in [A-Za-z_0-9].

+ //

+ // Note that it's legal to escape a character even if it has no

+ // special meaning in a regular expression -- so this function does

+ // that. (This also makes it identical to the perl function of the

+ // same name except for the null-character special case;

+ // see `perldoc -f quotemeta`.)

+ for (int ii = 0; ii < unquoted.length(); ++ii) {

+ // Note that using 'isalnum' here raises the benchmark time from

+ // 32ns to 58ns:

+ if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&

+ (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&

+ (unquoted[ii] < '0' || unquoted[ii] > '9') &&

+ unquoted[ii] != '_' &&

+ // If this is the part of a UTF8 or Latin1 character, we need

+ // to copy this byte without escaping. Experimentally this is

+ // what works correctly with the regexp library.

+ !(unquoted[ii] & 128)) {

+ if (unquoted[ii] == '\0') { // Special handling for null chars.

+ // Note that this special handling is not strictly required for RE2,

+ // but this quoting is required for other regexp libraries such as

+ // PCRE.

+ // Can't use "\\0" since the next character might be a digit.

+ result += "\\x00";

+ continue;

+ }

+ result += '\\';

+ }

+ result += unquoted[ii];

+ }

+ return result;

+bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const {

+ if (prog_ == NULL)

+ return false;

+ int n = prefix_.size();

+ if (n > maxlen)

+ n = maxlen;

+ // Determine initial min max from prefix_ literal.

+ string pmin, pmax;

+ pmin = prefix_.substr(0, n);

+ pmax = prefix_.substr(0, n);

+ if (prefix_foldcase_) {

+ // prefix is ASCII lowercase; change pmin to uppercase.

+ for (int i = 0; i < n; i++) {

+ if ('a' <= pmin[i] && pmin[i] <= 'z')

+ pmin[i] += 'A' - 'a';

+ }

+ // Add to prefix min max using PossibleMatchRange on regexp.

+ string dmin, dmax;

+ maxlen -= n;

+ if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {

+ pmin += dmin;

+ pmax += dmax;

+ } else if (pmax.size() > 0) {

+ // prog_->PossibleMatchRange has failed us,

+ // but we still have useful information from prefix_.

+ // Round up pmax to allow any possible suffix.

+ pmax = PrefixSuccessor(pmax);

+ } else {

+ // Nothing useful.

+ *min = "";

+ *max = "";

+ return false;

+ }

+ *min = pmin;

+ *max = pmax;

+ return true;

+// Avoid possible locale nonsense in standard strcasecmp.

+// The string a is known to be all lowercase.

+static int ascii_strcasecmp(const char* a, const char* b, int len) {

+ const char *ae = a + len;

+ for (; a < ae; a++, b++) {

+ uint8 x = *a;

+ uint8 y = *b;

+ if ('A' <= y && y <= 'Z')

+ y += 'a' - 'A';

+ if (x != y)

+ return x - y;

+ }

+ return 0;

+/***** Actual matching and rewriting code *****/

+bool RE2::Match(const StringPiece& text,

+ int startpos,

+ int endpos,

+ Anchor re_anchor,

+ StringPiece* submatch,

+ int nsubmatch) const {

+ if (!ok() || suffix_regexp_ == NULL) {

+ if (options_.log_errors())

+ LOG(ERROR) << "Invalid RE2: " << *error_;

+ return false;

+ }

+ if (startpos < 0 || startpos > endpos || endpos > text.size()) {

+ LOG(ERROR) << "RE2: invalid startpos, endpos pair.";

+ return false;

+ }

+ StringPiece subtext = text;

+ subtext.remove_prefix(startpos);

+ subtext.remove_suffix(text.size() - endpos);

+ // Use DFAs to find exact location of match, filter out non-matches.

+ // Don't ask for the location if we won't use it.

+ // SearchDFA can do extra optimizations in that case.

+ StringPiece match;

+ StringPiece* matchp = &match;

+ if (nsubmatch == 0)

+ matchp = NULL;

+ int ncap = 1 + NumberOfCapturingGroups();

+ if (ncap > nsubmatch)

+ ncap = nsubmatch;

+ // If the regexp is anchored explicitly, must not be in middle of text.

+ if (prog_->anchor_start() && startpos != 0)

+ return false;

+ // If the regexp is anchored explicitly, update re_anchor

+ // so that we can potentially fall into a faster case below.

+ if (prog_->anchor_start() && prog_->anchor_end())

+ re_anchor = ANCHOR_BOTH;

+ else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)

+ re_anchor = ANCHOR_START;

+ // Check for the required prefix, if any.

+ int prefixlen = 0;

+ if (!prefix_.empty()) {

+ if (startpos != 0)

+ return false;

+ prefixlen = prefix_.size();

+ if (prefixlen > subtext.size())

+ return false;

+ if (prefix_foldcase_) {

+ if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)

+ return false;

+ } else {

+ if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)

+ return false;

+ }

+ subtext.remove_prefix(prefixlen);

+ // If there is a required prefix, the anchor must be at least ANCHOR_START.

+ if (re_anchor != ANCHOR_BOTH)

+ re_anchor = ANCHOR_START;

+ }

+ Prog::Anchor anchor = Prog::kUnanchored;

+ Prog::MatchKind kind = Prog::kFirstMatch;

+ if (options_.longest_match())

+ kind = Prog::kLongestMatch;

+ bool skipped_test = false;

+ bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);

+ // SearchBitState allocates a bit vector of size prog_->size() * text.size().

+ // It also allocates a stack of 3-word structures which could potentially

+ // grow as large as prog_->size() * text.size() but in practice is much

+ // smaller.

+ // Conditions for using SearchBitState:

+ const int MaxBitStateProg = 500; // prog_->size() <= Max.

+ const int MaxBitStateVector = 256*1024; // bit vector size <= Max (bits)

+ bool can_bit_state = prog_->size() <= MaxBitStateProg;

+ int bit_state_text_max = MaxBitStateVector / prog_->size();

+ bool dfa_failed = false;

+ switch (re_anchor) {

+ default:

+ case UNANCHORED: {

+ if (!prog_->SearchDFA(subtext, text, anchor, kind,

+ matchp, &dfa_failed, NULL)) {

+ if (dfa_failed) {

+ // Fall back to NFA below.

+ skipped_test = true;

+ if (FLAGS_trace_re2)

+ LOG(INFO) << "Match " << trunc(pattern_)

+ << " [" << CEscape(subtext) << "]"

+ << " DFA failed.";

+ break;

+ }

+ if (FLAGS_trace_re2)

+ LOG(INFO) << "Match " << trunc(pattern_)

+ << " [" << CEscape(subtext) << "]"

+ << " used DFA - no match.";

+ return false;

+ }

+ if (FLAGS_trace_re2)

+ LOG(INFO) << "Match " << trunc(pattern_)

+ << " [" << CEscape(subtext) << "]"

+ << " used DFA - match";

+ if (matchp == NULL) // Matched. Don't care where

+ return true;

+ // SearchDFA set match[0].end() but didn't know where the

+ // match started. Run the regexp backward from match[0].end()

+ // to find the longest possible match -- that's where it started.

+ Prog* prog = ReverseProg();

+ if (prog == NULL)

+ return false;

+ if (!prog->SearchDFA(match, text, Prog::kAnchored,

+ Prog::kLongestMatch, &match, &dfa_failed, NULL)) {

+ if (dfa_failed) {

+ // Fall back to NFA below.

+ skipped_test = true;

+ if (FLAGS_trace_re2)

+ LOG(INFO) << "Match " << trunc(pattern_)

+ << " [" << CEscape(subtext) << "]"

+ << " reverse DFA failed.";

+ break;

+ }

+ if (FLAGS_trace_re2)

+ LOG(INFO) << "Match " << trunc(pattern_)

+ << " [" << CEscape(subtext) << "]"

+ << " DFA inconsistency.";

+ LOG(ERROR) << "DFA inconsistency";

+ return false;

+ }

+ if (FLAGS_trace_re2)

+ LOG(INFO) << "Match " << trunc(pattern_)

+ << " [" << CEscape(subtext) << "]"

+ << " used reverse DFA.";

+ break;

+ }

+ case ANCHOR_BOTH:

+ case ANCHOR_START:

+ if (re_anchor == ANCHOR_BOTH)

+ kind = Prog::kFullMatch;

+ anchor = Prog::kAnchored;

+ // If only a small amount of text and need submatch

+ // information anyway and we're going to use OnePass or BitState

+ // to get it, we might as well not even bother with the DFA:

+ // OnePass or BitState will be fast enough.

+ // On tiny texts, OnePass outruns even the DFA, and

+ // it doesn't have the shared state and occasional mutex that

+ // the DFA does.

+ if (can_one_pass && text.size() <= 4096 &&

+ (ncap > 1 || text.size() <= 8)) {

+ if (FLAGS_trace_re2)

+ LOG(INFO) << "Match " << trunc(pattern_)

+ << " [" << CEscape(subtext) << "]"

+ << " skipping DFA for OnePass.";

+ skipped_test = true;

+ break;

+ }

+ if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) {

+ if (FLAGS_trace_re2)

+ LOG(INFO) << "Match " << trunc(pattern_)

+ << " [" << CEscape(subtext) << "]"

+ << " skipping DFA for BitState.";

+ skipped_test = true;

+ break;

+ }

+ if (!prog_->SearchDFA(subtext, text, anchor, kind,

+ &match, &dfa_failed, NULL)) {

+ if (dfa_failed) {

+ if (FLAGS_trace_re2)

+ LOG(INFO) << "Match " << trunc(pattern_)

+ << " [" << CEscape(subtext) << "]"

+ << " DFA failed.";

+ skipped_test = true;

+ break;

+ }

+ if (FLAGS_trace_re2)

+ LOG(INFO) << "Match " << trunc(pattern_)

+ << " [" << CEscape(subtext) << "]"

+ << " used DFA - no match.";

+ return false;

+ }

+ break;

+ }

+ if (!skipped_test && ncap <= 1) {

+ // We know exactly where it matches. That's enough.

+ if (ncap == 1)

+ submatch[0] = match;

+ } else {

+ StringPiece subtext1;

+ if (skipped_test) {

+ // DFA ran out of memory or was skipped:

+ // need to search in entire original text.

+ subtext1 = subtext;

+ } else {

+ // DFA found the exact match location:

+ // let NFA run an anchored, full match search

+ // to find submatch locations.

+ subtext1 = match;

+ anchor = Prog::kAnchored;

+ kind = Prog::kFullMatch;

+ }

+ if (can_one_pass && anchor != Prog::kUnanchored) {

+ if (FLAGS_trace_re2)

+ LOG(INFO) << "Match " << trunc(pattern_)

+ << " [" << CEscape(subtext) << "]"

+ << " using OnePass.";

+ if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {

+ if (!skipped_test)

+ LOG(ERROR) << "SearchOnePass inconsistency";

+ return false;

+ }

+ } else if (can_bit_state && subtext1.size() <= bit_state_text_max) {

+ if (FLAGS_trace_re2)

+ LOG(INFO) << "Match " << trunc(pattern_)

+ << " [" << CEscape(subtext) << "]"

+ << " using BitState.";

+ if (!prog_->SearchBitState(subtext1, text, anchor,

+ kind, submatch, ncap)) {

+ if (!skipped_test)

+ LOG(ERROR) << "SearchBitState inconsistency";

+ return false;

+ }

+ } else {

+ if (FLAGS_trace_re2)

+ LOG(INFO) << "Match " << trunc(pattern_)

+ << " [" << CEscape(subtext) << "]"

+ << " using NFA.";

+ if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {

+ if (!skipped_test)

+ LOG(ERROR) << "SearchNFA inconsistency";

+ return false;

+ }

+ // Adjust overall match for required prefix that we stripped off.

+ if (prefixlen > 0 && nsubmatch > 0)

+ submatch[0] = StringPiece(submatch[0].begin() - prefixlen,

+ submatch[0].size() + prefixlen);

+ // Zero submatches that don't exist in the regexp.

+ for (int i = ncap; i < nsubmatch; i++)

+ submatch[i] = NULL;

+ return true;

+// Internal matcher - like Match() but takes Args not StringPieces.

+bool RE2::DoMatch(const StringPiece& text,

+ Anchor anchor,

+ int* consumed,

+ const Arg* const* args,

+ int n) const {

+ if (!ok()) {

+ if (options_.log_errors())

+ LOG(ERROR) << "Invalid RE2: " << *error_;

+ return false;

+ }

+ // Count number of capture groups needed.

+ int nvec;

+ if (n == 0 && consumed == NULL)

+ nvec = 0;

+ else

+ nvec = n+1;

+ StringPiece* vec;

+ StringPiece stkvec[kVecSize];

+ StringPiece* heapvec = NULL;

+ if (nvec <= arraysize(stkvec)) {

+ vec = stkvec;

+ } else {

+ vec = new StringPiece[nvec];

+ heapvec = vec;

+ }

+ if (!Match(text, 0, text.size(), anchor, vec, nvec)) {

+ delete[] heapvec;

+ return false;

+ }

+ if(consumed != NULL)

+ *consumed = vec[0].end() - text.begin();

+ if (n == 0 || args == NULL) {

+ // We are not interested in results

+ delete[] heapvec;

+ return true;

+ }

+ int ncap = NumberOfCapturingGroups();

+ if (ncap < n) {

+ // RE has fewer capturing groups than number of arg pointers passed in

+ VLOG(1) << "Asked for " << n << " but only have " << ncap;

+ delete[] heapvec;

+ return false;

+ }

+ // If we got here, we must have matched the whole pattern.

+ for (int i = 0; i < n; i++) {

+ const StringPiece& s = vec[i+1];

+ if (!args[i]->Parse(s.data(), s.size())) {

+ // TODO: Should we indicate what the error was?

+ VLOG(1) << "Parse error on #" << i << " " << s << " "

+ << (void*)s.data() << "/" << s.size();

+ delete[] heapvec;

+ return false;

+ }

+ delete[] heapvec;

+ return true;

+// Append the "rewrite" string, with backslash subsitutions from "vec",

+// to string "out".

+bool RE2::Rewrite(string *out, const StringPiece &rewrite,

+ const StringPiece *vec, int veclen) const {

+ for (const char *s = rewrite.data(), *end = s + rewrite.size();

+ s < end; s++) {

+ int c = *s;

+ if (c == '\\') {

+ s++;

+ c = (s < end) ? *s : -1;

+ if (isdigit(c)) {

+ int n = (c - '0');

+ if (n >= veclen) {

+ LOG(ERROR) << "requested group " << n

+ << " in regexp " << rewrite.data();

+ return false;

+ }

+ StringPiece snip = vec[n];

+ if (snip.size() > 0)

+ out->append(snip.data(), snip.size());

+ } else if (c == '\\') {

+ out->push_back('\\');

+ } else {

+ LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();

+ return false;

+ }

+ } else {

+ out->push_back(c);

+ }

+ return true;

+// Return the number of capturing subpatterns, or -1 if the

+// regexp wasn't valid on construction.

+int RE2::NumberOfCapturingGroups() const {

+ if (suffix_regexp_ == NULL)

+ return -1;

+ ANNOTATE_BENIGN_RACE(&num_captures_, "benign race: in the worst case"

+ " multiple threads end up doing the same work in parallel.");

+ if (num_captures_ == -1)

+ num_captures_ = suffix_regexp_->NumCaptures();

+ return num_captures_;

+// Checks that the rewrite string is well-formed with respect to this

+// regular expression.

+bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const {

+ int max_token = -1;

+ for (const char *s = rewrite.data(), *end = s + rewrite.size();

+ s < end; s++) {

+ int c = *s;

+ if (c != '\\') {

+ continue;

+ }

+ if (++s == end) {

+ *error = "Rewrite schema error: '\\' not allowed at end.";

+ return false;

+ }

+ c = *s;

+ if (c == '\\') {

+ continue;

+ }

+ if (!isdigit(c)) {

+ *error = "Rewrite schema error: "

+ "'\\' must be followed by a digit or '\\'.";

+ return false;

+ }

+ int n = (c - '0');

+ if (max_token < n) {

+ max_token = n;

+ }

+ if (max_token > NumberOfCapturingGroups()) {

+ SStringPrintf(error, "Rewrite schema requests %d matches, "

+ "but the regexp only has %d parenthesized subexpressions.",

+ max_token, NumberOfCapturingGroups());

+ return false;

+ }

+ return true;

+/***** Parsers for various types *****/

+bool RE2::Arg::parse_null(const char* str, int n, void* dest) {

+ // We fail if somebody asked us to store into a non-NULL void* pointer

+ return (dest == NULL);

+bool RE2::Arg::parse_string(const char* str, int n, void* dest) {

+ if (dest == NULL) return true;

+ reinterpret_cast<string*>(dest)->assign(str, n);

+ return true;

+bool RE2::Arg::parse_stringpiece(const char* str, int n, void* dest) {

+ if (dest == NULL) return true;

+ reinterpret_cast<StringPiece*>(dest)->set(str, n);

+ return true;

+bool RE2::Arg::parse_char(const char* str, int n, void* dest) {

+ if (n != 1) return false;

+ if (dest == NULL) return true;

+ *(reinterpret_cast<char*>(dest)) = str[0];

+ return true;

+bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) {

+ if (n != 1) return false;

+ if (dest == NULL) return true;

+ *(reinterpret_cast<unsigned char*>(dest)) = str[0];

+ return true;

+// Largest number spec that we are willing to parse

+static const int kMaxNumberLength = 32;

+// REQUIRES "buf" must have length at least kMaxNumberLength+1

+// Copies "str" into "buf" and null-terminates.

+// Overwrites *np with the new length.

+static const char* TerminateNumber(char* buf, const char* str, int* np) {

+ int n = *np;

+ if (n <= 0) return "";

+ if (n > 0 && isspace(*str)) {

+ // We are less forgiving than the strtoxxx() routines and do not

+ // allow leading spaces.

+ return "";

+ }

+ // Although buf has a fixed maximum size, we can still handle

+ // arbitrarily large integers correctly by omitting leading zeros.

+ // (Numbers that are still too long will be out of range.)

+ // Before deciding whether str is too long,

+ // remove leading zeros with s/000+/00/.

+ // Leaving the leading two zeros in place means that

+ // we don't change 0000x123 (invalid) into 0x123 (valid).

+ // Skip over leading - before replacing.

+ bool neg = false;

+ if (n >= 1 && str[0] == '-') {

+ neg = true;

+ n--;

+ str++;

+ }

+ if (n >= 3 && str[0] == '0' && str[1] == '0') {

+ while (n >= 3 && str[2] == '0') {

+ n--;

+ str++;

+ }

+ if (neg) { // make room in buf for -

+ n++;

+ str--;

+ }

+ if (n > kMaxNumberLength) return "";

+ memmove(buf, str, n);

+ if (neg) {

+ buf[0] = '-';

+ }

+ buf[n] = '\0';

+ *np = n;

+ return buf;

+bool RE2::Arg::parse_long_radix(const char* str,

+ int n,

+ void* dest,

+ int radix) {

+ if (n == 0) return false;

+ char buf[kMaxNumberLength+1];

+ str = TerminateNumber(buf, str, &n);

+ char* end;

+ errno = 0;

+ long r = strtol(str, &end, radix);

+ if (end != str + n) return false; // Leftover junk

+ if (errno) return false;

+ if (dest == NULL) return true;

+ *(reinterpret_cast<long*>(dest)) = r;

+ return true;

+bool RE2::Arg::parse_ulong_radix(const char* str,

+ int n,

+ void* dest,

+ int radix) {

+ if (n == 0) return false;

+ char buf[kMaxNumberLength+1];

+ str = TerminateNumber(buf, str, &n);

+ if (str[0] == '-') {

+ // strtoul() will silently accept negative numbers and parse

+ // them. This module is more strict and treats them as errors.

+ return false;

+ }

+ char* end;

+ errno = 0;

+ unsigned long r = strtoul(str, &end, radix);

+ if (end != str + n) return false; // Leftover junk

+ if (errno) return false;

+ if (dest == NULL) return true;

+ *(reinterpret_cast<unsigned long*>(dest)) = r;

+ return true;

+bool RE2::Arg::parse_short_radix(const char* str,

+ int n,

+ void* dest,

+ int radix) {

+ long r;

+ if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse

+ if ((short)r != r) return false; // Out of range

+ if (dest == NULL) return true;

+ *(reinterpret_cast<short*>(dest)) = r;

+ return true;

+bool RE2::Arg::parse_ushort_radix(const char* str,

+ int n,

+ void* dest,

+ int radix) {

+ unsigned long r;

+ if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse

+ if ((ushort)r != r) return false; // Out of range

+ if (dest == NULL) return true;

+ *(reinterpret_cast<unsigned short*>(dest)) = r;

+ return true;

+bool RE2::Arg::parse_int_radix(const char* str,

+ int n,

+ void* dest,

+ int radix) {

+ long r;

+ if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse

+ if ((int)r != r) return false; // Out of range

+ if (dest == NULL) return true;

+ *(reinterpret_cast<int*>(dest)) = r;

+ return true;

+bool RE2::Arg::parse_uint_radix(const char* str,

+ int n,

+ void* dest,

+ int radix) {

+ unsigned long r;

+ if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse

+ if ((uint)r != r) return false; // Out of range

+ if (dest == NULL) return true;

+ *(reinterpret_cast<unsigned int*>(dest)) = r;

+ return true;

+bool RE2::Arg::parse_longlong_radix(const char* str,

+ int n,

+ void* dest,

+ int radix) {

+ if (n == 0) return false;

+ char buf[kMaxNumberLength+1];

+ str = TerminateNumber(buf, str, &n);

+ char* end;

+ errno = 0;

+ int64 r = strtoll(str, &end, radix);

+ if (end != str + n) return false; // Leftover junk

+ if (errno) return false;

+ if (dest == NULL) return true;

+ *(reinterpret_cast<int64*>(dest)) = r;

+ return true;

+bool RE2::Arg::parse_ulonglong_radix(const char* str,

+ int n,

+ void* dest,

+ int radix) {

+ if (n == 0) return false;

+ char buf[kMaxNumberLength+1];

+ str = TerminateNumber(buf, str, &n);

+ if (str[0] == '-') {

+ // strtoull() will silently accept negative numbers and parse

+ // them. This module is more strict and treats them as errors.

+ return false;

+ }

+ char* end;

+ errno = 0;

+ uint64 r = strtoull(str, &end, radix);

+ if (end != str + n) return false; // Leftover junk

+ if (errno) return false;

+ if (dest == NULL) return true;

+ *(reinterpret_cast<uint64*>(dest)) = r;

+ return true;

+static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) {

+ if (n == 0) return false;

+ static const int kMaxLength = 200;

+ char buf[kMaxLength];

+ if (n >= kMaxLength) return false;

+ memcpy(buf, str, n);

+ buf[n] = '\0';

+ errno = 0;

+ char* end;

+ double r;

+ if (isfloat) {

+ r = strtof(buf, &end);

+ } else {

+ r = strtod(buf, &end);

+ }

+ if (end != buf + n) return false; // Leftover junk

+ if (errno) return false;

+ if (dest == NULL) return true;

+ if (isfloat) {

+ *(reinterpret_cast<float*>(dest)) = r;

+ } else {

+ *(reinterpret_cast<double*>(dest)) = r;

+ }

+ return true;

+bool RE2::Arg::parse_double(const char* str, int n, void* dest) {

+ return parse_double_float(str, n, false, dest);

+bool RE2::Arg::parse_float(const char* str, int n, void* dest) {

+ return parse_double_float(str, n, true, dest);

+#define DEFINE_INTEGER_PARSERS(name) \

+ bool RE2::Arg::parse_##name(const char* str, int n, void* dest) { \

+ return parse_##name##_radix(str, n, dest, 10); \

+ } \

+ bool RE2::Arg::parse_##name##_hex(const char* str, int n, void* dest) { \

+ return parse_##name##_radix(str, n, dest, 16); \

+ } \

+ bool RE2::Arg::parse_##name##_octal(const char* str, int n, void* dest) { \

+ return parse_##name##_radix(str, n, dest, 8); \

+ } \

+ bool RE2::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \

+ return parse_##name##_radix(str, n, dest, 0); \

+ }

+DEFINE_INTEGER_PARSERS(short);

+DEFINE_INTEGER_PARSERS(ushort);

+DEFINE_INTEGER_PARSERS(int);

+DEFINE_INTEGER_PARSERS(uint);

+DEFINE_INTEGER_PARSERS(long);

+DEFINE_INTEGER_PARSERS(ulong);

+DEFINE_INTEGER_PARSERS(longlong);

+DEFINE_INTEGER_PARSERS(ulonglong);

+#undef DEFINE_INTEGER_PARSERS

+} // namespace re2

« no previous file with comments | « third_party/re2/re2/re2.h ('k') | third_party/re2/re2/regexp.h » ('j') | no next file with comments »