third_party/re2/re2/parse.cc - Issue 10575037: Include RE2 library

Side by Side Diff: third_party/re2/re2/parse.cc

Issue 10575037: Include RE2 library (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Less intrusive fix for Android Created 8 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 // Copyright 2006 The RE2 Authors. All Rights Reserved.

	2 // Use of this source code is governed by a BSD-style

	3 // license that can be found in the LICENSE file.

	4

	5 // Regular expression parser.

	6

	7 // The parser is a simple precedence-based parser with a

	8 // manual stack. The parsing work is done by the methods

	9 // of the ParseState class. The Regexp::Parse function is

	10 // essentially just a lexer that calls the ParseState method

	11 // for each token.

	12

	13 // The parser recognizes POSIX extended regular expressions

	14 // excluding backreferences, collating elements, and collating

	15 // classes. It also allows the empty string as a regular expression

	16 // and recognizes the Perl escape sequences \d, \s, \w, \D, \S, and \W.

	17 // See regexp.h for rationale.

	18

	19 #include <ctype.h>

	20

	21 #include "util/util.h"

	22 #include "re2/regexp.h"

	23 #include "re2/stringpiece.h"

	24 #include "re2/unicode_casefold.h"

	25 #include "re2/unicode_groups.h"

	26

	27 namespace re2 {

	28

	29 // Regular expression parse state.

	30 // The list of parsed regexps so far is maintained as a vector of

	31 // Regexp pointers called the stack. Left parenthesis and vertical

	32 // bar markers are also placed on the stack, as Regexps with

	33 // non-standard opcodes.

	34 // Scanning a left parenthesis causes the parser to push a left parenthesis

	35 // marker on the stack.

	36 // Scanning a vertical bar causes the parser to pop the stack until it finds a

	37 // vertical bar or left parenthesis marker (not popping the marker),

	38 // concatenate all the popped results, and push them back on

	39 // the stack (DoConcatenation).

	40 // Scanning a right parenthesis causes the parser to act as though it

	41 // has seen a vertical bar, which then leaves the top of the stack in the

	42 // form LeftParen regexp VerticalBar regexp VerticalBar ... regexp VerticalBar.

	43 // The parser pops all this off the stack and creates an alternation of the

	44 // regexps (DoAlternation).

	45

	46 class Regexp::ParseState {

	47 public:

	48 ParseState(ParseFlags flags, const StringPiece& whole_regexp,

	49 RegexpStatus* status);

	50 ~ParseState();

	51

	52 ParseFlags flags() { return flags_; }

	53 int rune_max() { return rune_max_; }

	54

	55 // Parse methods. All public methods return a bool saying

	56 // whether parsing should continue. If a method returns

	57 // false, it has set fields in *status_, and the parser

	58 // should return NULL.

	59

	60 // Pushes the given regular expression onto the stack.

	61 // Could check for too much memory used here.

	62 bool PushRegexp(Regexp* re);

	63

	64 // Pushes the literal rune r onto the stack.

	65 bool PushLiteral(Rune r);

	66

	67 // Pushes a regexp with the given op (and no args) onto the stack.

	68 bool PushSimpleOp(RegexpOp op);

	69

	70 // Pushes a ^ onto the stack.

	71 bool PushCarat();

	72

	73 // Pushes a \b (word == true) or \B (word == false) onto the stack.

	74 bool PushWordBoundary(bool word);

	75

	76 // Pushes a $ onto the stack.

	77 bool PushDollar();

	78

	79 // Pushes a . onto the stack

	80 bool PushDot();

	81

	82 // Pushes a repeat operator regexp onto the stack.

	83 // A valid argument for the operator must already be on the stack.

	84 // s is the name of the operator, for use in error messages.

	85 bool PushRepeatOp(RegexpOp op, const StringPiece& s, bool nongreedy);

	86

	87 // Pushes a repetition regexp onto the stack.

	88 // A valid argument for the operator must already be on the stack.

	89 bool PushRepetition(int min, int max, const StringPiece& s, bool nongreedy);

	90

	91 // Checks whether a particular regexp op is a marker.

	92 bool IsMarker(RegexpOp op);

	93

	94 // Processes a left parenthesis in the input.

	95 // Pushes a marker onto the stack.

	96 bool DoLeftParen(const StringPiece& name);

	97 bool DoLeftParenNoCapture();

	98

	99 // Processes a vertical bar in the input.

	100 bool DoVerticalBar();

	101

	102 // Processes a right parenthesis in the input.

	103 bool DoRightParen();

	104

	105 // Processes the end of input, returning the final regexp.

	106 Regexp* DoFinish();

	107

	108 // Finishes the regexp if necessary, preparing it for use

	109 // in a more complicated expression.

	110 // If it is a CharClassBuilder, converts into a CharClass.

	111 Regexp* FinishRegexp(Regexp*);

	112

	113 // These routines don't manipulate the parse stack

	114 // directly, but they do need to look at flags_.

	115 // ParseCharClass also manipulates the internals of Regexp

	116 // while creating *out_re.

	117

	118 // Parse a character class into *out_re.

	119 // Removes parsed text from s.

	120 bool ParseCharClass(StringPiece* s, Regexp** out_re,

	121 RegexpStatus* status);

	122

	123 // Parse a character class character into *rp.

	124 // Removes parsed text from s.

	125 bool ParseCCCharacter(StringPiece* s, Rune *rp,

	126 const StringPiece& whole_class,

	127 RegexpStatus* status);

	128

	129 // Parse a character class range into rr.

	130 // Removes parsed text from s.

	131 bool ParseCCRange(StringPiece* s, RuneRange* rr,

	132 const StringPiece& whole_class,

	133 RegexpStatus* status);

	134

	135 // Parse a Perl flag set or non-capturing group from s.

	136 bool ParsePerlFlags(StringPiece* s);

	137

	138

	139 // Finishes the current concatenation,

	140 // collapsing it into a single regexp on the stack.

	141 void DoConcatenation();

	142

	143 // Finishes the current alternation,

	144 // collapsing it to a single regexp on the stack.

	145 void DoAlternation();

	146

	147 // Generalized DoAlternation/DoConcatenation.

	148 void DoCollapse(RegexpOp op);

	149

	150 // Maybe concatenate Literals into LiteralString.

	151 bool MaybeConcatString(int r, ParseFlags flags);

	152

	153 private:

	154 ParseFlags flags_;

	155 StringPiece whole_regexp_;

	156 RegexpStatus* status_;

	157 Regexp* stacktop_;

	158 int ncap_; // number of capturing parens seen

	159 int rune_max_; // maximum char value for this encoding

	160

	161 DISALLOW_EVIL_CONSTRUCTORS(ParseState);

	162 };

	163

	164 // Pseudo-operators - only on parse stack.

	165 const RegexpOp kLeftParen = static_cast<RegexpOp>(kMaxRegexpOp+1);

	166 const RegexpOp kVerticalBar = static_cast<RegexpOp>(kMaxRegexpOp+2);

	167

	168 Regexp::ParseState::ParseState(ParseFlags flags,

	169 const StringPiece& whole_regexp,

	170 RegexpStatus* status)

	171 : flags_(flags), whole_regexp_(whole_regexp),

	172 status_(status), stacktop_(NULL), ncap_(0) {

	173 if (flags_ & Latin1)

	174 rune_max_ = 0xFF;

	175 else

	176 rune_max_ = Runemax;

	177 }

	178

	179 // Cleans up by freeing all the regexps on the stack.

	180 Regexp::ParseState::~ParseState() {

	181 Regexp* next;

	182 for (Regexp* re = stacktop_; re != NULL; re = next) {

	183 next = re->down_;

	184 re->down_ = NULL;

	185 if (re->op() == kLeftParen)

	186 delete re->name_;

	187 re->Decref();

	188 }

	189 }

	190

	191 // Finishes the regexp if necessary, preparing it for use in

	192 // a more complex expression.

	193 // If it is a CharClassBuilder, converts into a CharClass.

	194 Regexp* Regexp::ParseState::FinishRegexp(Regexp* re) {

	195 if (re == NULL)

	196 return NULL;

	197 re->down_ = NULL;

	198

	199 if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) {

	200 CharClassBuilder* ccb = re->ccb_;

	201 re->ccb_ = NULL;

	202 re->cc_ = ccb->GetCharClass();

	203 delete ccb;

	204 }

	205

	206 return re;

	207 }

	208

	209 // Pushes the given regular expression onto the stack.

	210 // Could check for too much memory used here.

	211 bool Regexp::ParseState::PushRegexp(Regexp* re) {

	212 MaybeConcatString(-1, NoParseFlags);

	213

	214 // Special case: a character class of one character is just

	215 // a literal. This is a common idiom for escaping

	216 // single characters (e.g., [.] instead of \.), and some

	217 // analysis does better with fewer character classes.

	218 // Similarly, [Aa] can be rewritten as a literal A with ASCII case folding.

	219 if (re->op_ == kRegexpCharClass) {

	220 if (re->ccb_->size() == 1) {

	221 Rune r = re->ccb_->begin()->lo;

	222 re->Decref();

	223 re = new Regexp(kRegexpLiteral, flags_);

	224 re->rune_ = r;

	225 } else if (re->ccb_->size() == 2) {

	226 Rune r = re->ccb_->begin()->lo;

	227 if ('A' <= r && r <= 'Z' && re->ccb_->Contains(r + 'a' - 'A')) {

	228 re->Decref();

	229 re = new Regexp(kRegexpLiteral, flags_ \| FoldCase);

	230 re->rune_ = r + 'a' - 'A';

	231 }

	232 }

	233 }

	234

	235 if (!IsMarker(re->op()))

	236 re->simple_ = re->ComputeSimple();

	237 re->down_ = stacktop_;

	238 stacktop_ = re;

	239 return true;

	240 }

	241

	242 // Searches the case folding tables and returns the CaseFold* that contains r.

	243 // If there isn't one, returns the CaseFold* with smallest f->lo bigger than r.

	244 // If there isn't one, returns NULL.

	245 CaseFold* LookupCaseFold(CaseFold *f, int n, Rune r) {

	246 CaseFold* ef = f + n;

	247

	248 // Binary search for entry containing r.

	249 while (n > 0) {

	250 int m = n/2;

	251 if (f[m].lo <= r && r <= f[m].hi)

	252 return &f[m];

	253 if (r < f[m].lo) {

	254 n = m;

	255 } else {

	256 f += m+1;

	257 n -= m+1;

	258 }

	259 }

	260

	261 // There is no entry that contains r, but f points

	262 // where it would have been. Unless f points at

	263 // the end of the array, it points at the next entry

	264 // after r.

	265 if (f < ef)

	266 return f;

	267

	268 // No entry contains r; no entry contains runes > r.

	269 return NULL;

	270 }

	271

	272 // Returns the result of applying the fold f to the rune r.

	273 Rune ApplyFold(CaseFold *f, Rune r) {

	274 switch (f->delta) {

	275 default:

	276 return r + f->delta;

	277

	278 case EvenOddSkip: // even <-> odd but only applies to every other

	279 if ((r - f->lo) % 2)

	280 return r;

	281 // fall through

	282 case EvenOdd: // even <-> odd

	283 if (r%2 == 0)

	284 return r + 1;

	285 return r - 1;

	286

	287 case OddEvenSkip: // odd <-> even but only applies to every other

	288 if ((r - f->lo) % 2)

	289 return r;

	290 // fall through

	291 case OddEven: // odd <-> even

	292 if (r%2 == 1)

	293 return r + 1;

	294 return r - 1;

	295 }

	296 }

	297

	298 // Returns the next Rune in r's folding cycle (see unicode_casefold.h).

	299 // Examples:

	300 // CycleFoldRune('A') = 'a'

	301 // CycleFoldRune('a') = 'A'

	302 //

	303 // CycleFoldRune('K') = 'k'

	304 // CycleFoldRune('k') = 0x212A (Kelvin)

	305 // CycleFoldRune(0x212A) = 'K'

	306 //

	307 // CycleFoldRune('?') = '?'

	308 Rune CycleFoldRune(Rune r) {

	309 CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, r);

	310 if (f == NULL \|\| r < f->lo)

	311 return r;

	312 return ApplyFold(f, r);

	313 }

	314

	315 // Add lo-hi to the class, along with their fold-equivalent characters.

	316 // If lo-hi is already in the class, assume that the fold-equivalent

	317 // chars are there too, so there's no work to do.

	318 static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) {

	319 // AddFoldedRange calls itself recursively for each rune in the fold cycle.

	320 // Most folding cycles are small: there aren't any bigger than four in the

	321 // current Unicode tables. make_unicode_casefold.py checks that

	322 // the cycles are not too long, and we double-check here using depth.

	323 if (depth > 10) {

	324 LOG(DFATAL) << "AddFoldedRange recurses too much.";

	325 return;

	326 }

	327

	328 if (!cc->AddRange(lo, hi)) // lo-hi was already there? we're done

	329 return;

	330

	331 while (lo <= hi) {

	332 CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, lo);

	333 if (f == NULL) // lo has no fold, nor does anything above lo

	334 break;

	335 if (lo < f->lo) { // lo has no fold; next rune with a fold is f->lo

	336 lo = f->lo;

	337 continue;

	338 }

	339

	340 // Add in the result of folding the range lo - f->hi

	341 // and that range's fold, recursively.

	342 Rune lo1 = lo;

	343 Rune hi1 = min<Rune>(hi, f->hi);

	344 switch (f->delta) {

	345 default:

	346 lo1 += f->delta;

	347 hi1 += f->delta;

	348 break;

	349 case EvenOdd:

	350 if (lo1%2 == 1)

	351 lo1--;

	352 if (hi1%2 == 0)

	353 hi1++;

	354 break;

	355 case OddEven:

	356 if (lo1%2 == 0)

	357 lo1--;

	358 if (hi1%2 == 1)

	359 hi1++;

	360 break;

	361 }

	362 AddFoldedRange(cc, lo1, hi1, depth+1);

	363

	364 // Pick up where this fold left off.

	365 lo = f->hi + 1;

	366 }

	367 }

	368

	369 // Pushes the literal rune r onto the stack.

	370 bool Regexp::ParseState::PushLiteral(Rune r) {

	371 // Do case folding if needed.

	372 if ((flags_ & FoldCase) && CycleFoldRune(r) != r) {

	373 Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);

	374 re->ccb_ = new CharClassBuilder;

	375 Rune r1 = r;

	376 do {

	377 if (!(flags_ & NeverNL) \|\| r != '\n') {

	378 re->ccb_->AddRange(r, r);

	379 }

	380 r = CycleFoldRune(r);

	381 } while (r != r1);

	382 re->ccb_->RemoveAbove(rune_max_);

	383 return PushRegexp(re);

	384 }

	385

	386 // Exclude newline if applicable.

	387 if ((flags_ & NeverNL) && r == '\n')

	388 return PushRegexp(new Regexp(kRegexpNoMatch, flags_));

	389

	390 // No fancy stuff worked. Ordinary literal.

	391 if (MaybeConcatString(r, flags_))

	392 return true;

	393

	394 Regexp* re = new Regexp(kRegexpLiteral, flags_);

	395 re->rune_ = r;

	396 return PushRegexp(re);

	397 }

	398

	399 // Pushes a ^ onto the stack.

	400 bool Regexp::ParseState::PushCarat() {

	401 if (flags_ & OneLine) {

	402 return PushSimpleOp(kRegexpBeginText);

	403 }

	404 return PushSimpleOp(kRegexpBeginLine);

	405 }

	406

	407 // Pushes a \b or \B onto the stack.

	408 bool Regexp::ParseState::PushWordBoundary(bool word) {

	409 if (word)

	410 return PushSimpleOp(kRegexpWordBoundary);

	411 return PushSimpleOp(kRegexpNoWordBoundary);

	412 }

	413

	414 // Pushes a $ onto the stack.

	415 bool Regexp::ParseState::PushDollar() {

	416 if (flags_ & OneLine) {

	417 // Clumsy marker so that MimicsPCRE() can tell whether

	418 // this kRegexpEndText was a $ and not a \z.

	419 Regexp::ParseFlags oflags = flags_;

	420 flags_ = flags_ \| WasDollar;

	421 bool ret = PushSimpleOp(kRegexpEndText);

	422 flags_ = oflags;

	423 return ret;

	424 }

	425 return PushSimpleOp(kRegexpEndLine);

	426 }

	427

	428 // Pushes a . onto the stack.

	429 bool Regexp::ParseState::PushDot() {

	430 if ((flags_ & DotNL) && !(flags_ & NeverNL))

	431 return PushSimpleOp(kRegexpAnyChar);

	432 // Rewrite . into [^\n]

	433 Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);

	434 re->ccb_ = new CharClassBuilder;

	435 re->ccb_->AddRange(0, '\n' - 1);

	436 re->ccb_->AddRange('\n' + 1, rune_max_);

	437 return PushRegexp(re);

	438 }

	439

	440 // Pushes a regexp with the given op (and no args) onto the stack.

	441 bool Regexp::ParseState::PushSimpleOp(RegexpOp op) {

	442 Regexp* re = new Regexp(op, flags_);

	443 return PushRegexp(re);

	444 }

	445

	446 // Pushes a repeat operator regexp onto the stack.

	447 // A valid argument for the operator must already be on the stack.

	448 // The char c is the name of the operator, for use in error messages.

	449 bool Regexp::ParseState::PushRepeatOp(RegexpOp op, const StringPiece& s,

	450 bool nongreedy) {

	451 if (stacktop_ == NULL \|\| IsMarker(stacktop_->op())) {

	452 status_->set_code(kRegexpRepeatArgument);

	453 status_->set_error_arg(s);

	454 return false;

	455 }

	456 Regexp::ParseFlags fl = flags_;

	457 if (nongreedy)

	458 fl = fl ^ NonGreedy;

	459 Regexp* re = new Regexp(op, fl);

	460 re->AllocSub(1);

	461 re->down_ = stacktop_->down_;

	462 re->sub()[0] = FinishRegexp(stacktop_);

	463 re->simple_ = re->ComputeSimple();

	464 stacktop_ = re;

	465 return true;

	466 }

	467

	468 // Pushes a repetition regexp onto the stack.

	469 // A valid argument for the operator must already be on the stack.

	470 bool Regexp::ParseState::PushRepetition(int min, int max,

	471 const StringPiece& s,

	472 bool nongreedy) {

	473 if ((max != -1 && max < min) \|\| min > 1000 \|\| max > 1000) {

	474 status_->set_code(kRegexpRepeatSize);

	475 status_->set_error_arg(s);

	476 return false;

	477 }

	478 if (stacktop_ == NULL \|\| IsMarker(stacktop_->op())) {

	479 status_->set_code(kRegexpRepeatArgument);

	480 status_->set_error_arg(s);

	481 return false;

	482 }

	483 Regexp::ParseFlags fl = flags_;

	484 if (nongreedy)

	485 fl = fl ^ NonGreedy;

	486 Regexp* re = new Regexp(kRegexpRepeat, fl);

	487 re->min_ = min;

	488 re->max_ = max;

	489 re->AllocSub(1);

	490 re->down_ = stacktop_->down_;

	491 re->sub()[0] = FinishRegexp(stacktop_);

	492 re->simple_ = re->ComputeSimple();

	493

	494 stacktop_ = re;

	495 return true;

	496 }

	497

	498 // Checks whether a particular regexp op is a marker.

	499 bool Regexp::ParseState::IsMarker(RegexpOp op) {

	500 return op >= kLeftParen;

	501 }

	502

	503 // Processes a left parenthesis in the input.

	504 // Pushes a marker onto the stack.

	505 bool Regexp::ParseState::DoLeftParen(const StringPiece& name) {

	506 Regexp* re = new Regexp(kLeftParen, flags_);

	507 re->cap_ = ++ncap_;

	508 if (name.data() != NULL)

	509 re->name_ = new string(name.as_string());

	510 return PushRegexp(re);

	511 }

	512

	513 // Pushes a non-capturing marker onto the stack.

	514 bool Regexp::ParseState::DoLeftParenNoCapture() {

	515 Regexp* re = new Regexp(kLeftParen, flags_);

	516 re->cap_ = -1;

	517 return PushRegexp(re);

	518 }

	519

	520 // Adds r to cc, along with r's upper case if foldascii is set.

	521 static void AddLiteral(CharClassBuilder* cc, Rune r, bool foldascii) {

	522 cc->AddRange(r, r);

	523 if (foldascii && 'a' <= r && r <= 'z')

	524 cc->AddRange(r + 'A' - 'a', r + 'A' - 'a');

	525 }

	526

	527 // Processes a vertical bar in the input.

	528 bool Regexp::ParseState::DoVerticalBar() {

	529 MaybeConcatString(-1, NoParseFlags);

	530 DoConcatenation();

	531

	532 // Below the vertical bar is a list to alternate.

	533 // Above the vertical bar is a list to concatenate.

	534 // We just did the concatenation, so either swap

	535 // the result below the vertical bar or push a new

	536 // vertical bar on the stack.

	537 Regexp* r1;

	538 Regexp* r2;

	539 if ((r1 = stacktop_) != NULL &&

	540 (r2 = stacktop_->down_) != NULL &&

	541 r2->op() == kVerticalBar) {

	542 // If above and below vertical bar are literal or char class,

	543 // can merge into a single char class.

	544 Regexp* r3;

	545 if ((r1->op() == kRegexpLiteral \|\|

	546 r1->op() == kRegexpCharClass \|\|

	547 r1->op() == kRegexpAnyChar) &&

	548 (r3 = r2->down_) != NULL) {

	549 Rune rune;

	550 switch (r3->op()) {

	551 case kRegexpLiteral: // convert to char class

	552 rune = r3->rune_;

	553 r3->op_ = kRegexpCharClass;

	554 r3->cc_ = NULL;

	555 r3->ccb_ = new CharClassBuilder;

	556 AddLiteral(r3->ccb_, rune, r3->parse_flags_ & Regexp::FoldCase);

	557 // fall through

	558 case kRegexpCharClass:

	559 if (r1->op() == kRegexpLiteral)

	560 AddLiteral(r3->ccb_, r1->rune_,

	561 r1->parse_flags_ & Regexp::FoldCase);

	562 else if (r1->op() == kRegexpCharClass)

	563 r3->ccb_->AddCharClass(r1->ccb_);

	564 if (r1->op() == kRegexpAnyChar \|\| r3->ccb_->full()) {

	565 delete r3->ccb_;

	566 r3->ccb_ = NULL;

	567 r3->op_ = kRegexpAnyChar;

	568 }

	569 // fall through

	570 case kRegexpAnyChar:

	571 // pop r1

	572 stacktop_ = r2;

	573 r1->Decref();

	574 return true;

	575 default:

	576 break;

	577 }

	578 }

	579

	580 // Swap r1 below vertical bar (r2).

	581 r1->down_ = r2->down_;

	582 r2->down_ = r1;

	583 stacktop_ = r2;

	584 return true;

	585 }

	586 return PushSimpleOp(kVerticalBar);

	587 }

	588

	589 // Processes a right parenthesis in the input.

	590 bool Regexp::ParseState::DoRightParen() {

	591 // Finish the current concatenation and alternation.

	592 DoAlternation();

	593

	594 // The stack should be: LeftParen regexp

	595 // Remove the LeftParen, leaving the regexp,

	596 // parenthesized.

	597 Regexp* r1;

	598 Regexp* r2;

	599 if ((r1 = stacktop_) == NULL \|\|

	600 (r2 = r1->down_) == NULL \|\|

	601 r2->op() != kLeftParen) {

	602 status_->set_code(kRegexpMissingParen);

	603 status_->set_error_arg(whole_regexp_);

	604 return false;

	605 }

	606

	607 // Pop off r1, r2. Will Decref or reuse below.

	608 stacktop_ = r2->down_;

	609

	610 // Restore flags from when paren opened.

	611 Regexp* re = r2;

	612 flags_ = re->parse_flags();

	613

	614 // Rewrite LeftParen as capture if needed.

	615 if (re->cap_ > 0) {

	616 re->op_ = kRegexpCapture;

	617 // re->cap_ is already set

	618 re->AllocSub(1);

	619 re->sub()[0] = FinishRegexp(r1);

	620 re->simple_ = re->ComputeSimple();

	621 } else {

	622 re->Decref();

	623 re = r1;

	624 }

	625 return PushRegexp(re);

	626 }

	627

	628 // Processes the end of input, returning the final regexp.

	629 Regexp* Regexp::ParseState::DoFinish() {

	630 DoAlternation();

	631 Regexp* re = stacktop_;

	632 if (re != NULL && re->down_ != NULL) {

	633 status_->set_code(kRegexpMissingParen);

	634 status_->set_error_arg(whole_regexp_);

	635 return NULL;

	636 }

	637 stacktop_ = NULL;

	638 return FinishRegexp(re);

	639 }

	640

	641 // Returns the leading regexp that re starts with.

	642 // The returned Regexp* points into a piece of re,

	643 // so it must not be used after the caller calls re->Decref().

	644 Regexp* Regexp::LeadingRegexp(Regexp* re) {

	645 if (re->op() == kRegexpEmptyMatch)

	646 return NULL;

	647 if (re->op() == kRegexpConcat && re->nsub() >= 2) {

	648 Regexp** sub = re->sub();

	649 if (sub[0]->op() == kRegexpEmptyMatch)

	650 return NULL;

	651 return sub[0];

	652 }

	653 return re;

	654 }

	655

	656 // Removes LeadingRegexp(re) from re and returns what's left.

	657 // Consumes the reference to re and may edit it in place.

	658 // If caller wants to hold on to LeadingRegexp(re),

	659 // must have already Incref'ed it.

	660 Regexp* Regexp::RemoveLeadingRegexp(Regexp* re) {

	661 if (re->op() == kRegexpEmptyMatch)

	662 return re;

	663 if (re->op() == kRegexpConcat && re->nsub() >= 2) {

	664 Regexp** sub = re->sub();

	665 if (sub[0]->op() == kRegexpEmptyMatch)

	666 return re;

	667 sub[0]->Decref();

	668 sub[0] = NULL;

	669 if (re->nsub() == 2) {

	670 // Collapse concatenation to single regexp.

	671 Regexp* nre = sub[1];

	672 sub[1] = NULL;

	673 re->Decref();

	674 return nre;

	675 }

	676 // 3 or more -> 2 or more.

	677 re->nsub_--;

	678 memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]);

	679 return re;

	680 }

	681 Regexp::ParseFlags pf = re->parse_flags();

	682 re->Decref();

	683 return new Regexp(kRegexpEmptyMatch, pf);

	684 }

	685

	686 // Returns the leading string that re starts with.

	687 // The returned Rune* points into a piece of re,

	688 // so it must not be used after the caller calls re->Decref().

	689 Rune* Regexp::LeadingString(Regexp* re, int *nrune,

	690 Regexp::ParseFlags *flags) {

	691 while (re->op() == kRegexpConcat && re->nsub() > 0)

	692 re = re->sub()[0];

	693

	694 *flags = static_cast<Regexp::ParseFlags>(re->parse_flags_ & Regexp::FoldCase);

	695

	696 if (re->op() == kRegexpLiteral) {

	697 *nrune = 1;

	698 return &re->rune_;

	699 }

	700

	701 if (re->op() == kRegexpLiteralString) {

	702 *nrune = re->nrunes_;

	703 return re->runes_;

	704 }

	705

	706 *nrune = 0;

	707 return NULL;

	708 }

	709

	710 // Removes the first n leading runes from the beginning of re.

	711 // Edits re in place.

	712 void Regexp::RemoveLeadingString(Regexp* re, int n) {

	713 // Chase down concats to find first string.

	714 // For regexps generated by parser, nested concats are

	715 // flattened except when doing so would overflow the 16-bit

	716 // limit on the size of a concatenation, so we should never

	717 // see more than two here.

	718 Regexp* stk[4];

	719 int d = 0;

	720 while (re->op() == kRegexpConcat) {

	721 if (d < arraysize(stk))

	722 stk[d++] = re;

	723 re = re->sub()[0];

	724 }

	725

	726 // Remove leading string from re.

	727 if (re->op() == kRegexpLiteral) {

	728 re->rune_ = 0;

	729 re->op_ = kRegexpEmptyMatch;

	730 } else if (re->op() == kRegexpLiteralString) {

	731 if (n >= re->nrunes_) {

	732 delete[] re->runes_;

	733 re->runes_ = NULL;

	734 re->nrunes_ = 0;

	735 re->op_ = kRegexpEmptyMatch;

	736 } else if (n == re->nrunes_ - 1) {

	737 Rune rune = re->runes_[re->nrunes_ - 1];

	738 delete[] re->runes_;

	739 re->runes_ = NULL;

	740 re->nrunes_ = 0;

	741 re->rune_ = rune;

	742 re->op_ = kRegexpLiteral;

	743 } else {

	744 re->nrunes_ -= n;

	745 memmove(re->runes_, re->runes_ + n, re->nrunes_ * sizeof re->runes_[0]);

	746 }

	747 }

	748

	749 // If re is now empty, concatenations might simplify too.

	750 while (d-- > 0) {

	751 re = stk[d];

	752 Regexp** sub = re->sub();

	753 if (sub[0]->op() == kRegexpEmptyMatch) {

	754 sub[0]->Decref();

	755 sub[0] = NULL;

	756 // Delete first element of concat.

	757 switch (re->nsub()) {

	758 case 0:

	759 case 1:

	760 // Impossible.

	761 LOG(DFATAL) << "Concat of " << re->nsub();

	762 re->submany_ = NULL;

	763 re->op_ = kRegexpEmptyMatch;

	764 break;

	765

	766 case 2: {

	767 // Replace re with sub[1].

	768 Regexp* old = sub[1];

	769 sub[1] = NULL;

	770 re->Swap(old);

	771 old->Decref();

	772 break;

	773 }

	774

	775 default:

	776 // Slide down.

	777 re->nsub_--;

	778 memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]);

	779 break;

	780 }

	781 }

	782 }

	783 }

	784

	785 // Factors common prefixes from alternation.

	786 // For example,

	787 // ABC\|ABD\|AEF\|BCX\|BCY

	788 // simplifies to

	789 // A(B(C\|D)\|EF)\|BC(X\|Y)

	790 // which the normal parse state routines will further simplify to

	791 // A(B[CD]\|EF)\|BC[XY]

	792 //

	793 // Rewrites sub to contain simplified list to alternate and returns

	794 // the new length of sub. Adjusts reference counts accordingly

	795 // (incoming sub[i] decremented, outgoing sub[i] incremented).

	796

	797 // It's too much of a pain to write this code with an explicit stack,

	798 // so instead we let the caller specify a maximum depth and

	799 // don't simplify beyond that. There are around 15 words of local

	800 // variables and parameters in the frame, so allowing 8 levels

	801 // on a 64-bit machine is still less than a kilobyte of stack and

	802 // probably enough benefit for practical uses.

	803 const int kFactorAlternationMaxDepth = 8;

	804

	805 int Regexp::FactorAlternation(

	806 Regexp** sub, int n,

	807 Regexp::ParseFlags altflags) {

	808 return FactorAlternationRecursive(sub, n, altflags,

	809 kFactorAlternationMaxDepth);

	810 }

	811

	812 int Regexp::FactorAlternationRecursive(

	813 Regexp** sub, int n,

	814 Regexp::ParseFlags altflags,

	815 int maxdepth) {

	816

	817 if (maxdepth <= 0)

	818 return n;

	819

	820 // Round 1: Factor out common literal prefixes.

	821 Rune *rune = NULL;

	822 int nrune = 0;

	823 Regexp::ParseFlags runeflags = Regexp::NoParseFlags;

	824 int start = 0;

	825 int out = 0;

	826 for (int i = 0; i <= n; i++) {

	827 // Invariant: what was in sub[0:start] has been Decref'ed

	828 // and that space has been reused for sub[0:out] (out <= start).

	829 //

	830 // Invariant: sub[start:i] consists of regexps that all begin

	831 // with the string rune[0:nrune].

	832

	833 Rune* rune_i = NULL;

	834 int nrune_i = 0;

	835 Regexp::ParseFlags runeflags_i = Regexp::NoParseFlags;

	836 if (i < n) {

	837 rune_i = LeadingString(sub[i], &nrune_i, &runeflags_i);

	838 if (runeflags_i == runeflags) {

	839 int same = 0;

	840 while (same < nrune && same < nrune_i && rune[same] == rune_i[same])

	841 same++;

	842 if (same > 0) {

	843 // Matches at least one rune in current range. Keep going around.

	844 nrune = same;

	845 continue;

	846 }

	847 }

	848 }

	849

	850 // Found end of a run with common leading literal string:

	851 // sub[start:i] all begin with rune[0:nrune] but sub[i]

	852 // does not even begin with rune[0].

	853 //

	854 // Factor out common string and append factored expression to sub[0:out].

	855 if (i == start) {

	856 // Nothing to do - first iteration.

	857 } else if (i == start+1) {

	858 // Just one: don't bother factoring.

	859 sub[out++] = sub[start];

	860 } else {

	861 // Construct factored form: prefix(suffix1\|suffix2\|...)

	862 Regexp* x[2]; // x[0] = prefix, x[1] = suffix1\|suffix2\|...

	863 x[0] = LiteralString(rune, nrune, runeflags);

	864 for (int j = start; j < i; j++)

	865 RemoveLeadingString(sub[j], nrune);

	866 int nn = FactorAlternationRecursive(sub + start, i - start, altflags,

	867 maxdepth - 1);

	868 x[1] = AlternateNoFactor(sub + start, nn, altflags);

	869 sub[out++] = Concat(x, 2, altflags);

	870 }

	871

	872 // Prepare for next round (if there is one).

	873 if (i < n) {

	874 start = i;

	875 rune = rune_i;

	876 nrune = nrune_i;

	877 runeflags = runeflags_i;

	878 }

	879 }

	880 n = out;

	881

	882 // Round 2: Factor out common complex prefixes,

	883 // just the first piece of each concatenation,

	884 // whatever it is. This is good enough a lot of the time.

	885 start = 0;

	886 out = 0;

	887 Regexp* first = NULL;

	888 for (int i = 0; i <= n; i++) {

	889 // Invariant: what was in sub[0:start] has been Decref'ed

	890 // and that space has been reused for sub[0:out] (out <= start).

	891 //

	892 // Invariant: sub[start:i] consists of regexps that all begin with first.

	893

	894 Regexp* first_i = NULL;

	895 if (i < n) {

	896 first_i = LeadingRegexp(sub[i]);

	897 if (first != NULL && Regexp::Equal(first, first_i)) {

	898 continue;

	899 }

	900 }

	901

	902 // Found end of a run with common leading regexp:

	903 // sub[start:i] all begin with first but sub[i] does not.

	904 //

	905 // Factor out common regexp and append factored expression to sub[0:out].

	906 if (i == start) {

	907 // Nothing to do - first iteration.

	908 } else if (i == start+1) {

	909 // Just one: don't bother factoring.

	910 sub[out++] = sub[start];

	911 } else {

	912 // Construct factored form: prefix(suffix1\|suffix2\|...)

	913 Regexp* x[2]; // x[0] = prefix, x[1] = suffix1\|suffix2\|...

	914 x[0] = first->Incref();

	915 for (int j = start; j < i; j++)

	916 sub[j] = RemoveLeadingRegexp(sub[j]);

	917 int nn = FactorAlternationRecursive(sub + start, i - start, altflags,

	918 maxdepth - 1);

	919 x[1] = AlternateNoFactor(sub + start, nn, altflags);

	920 sub[out++] = Concat(x, 2, altflags);

	921 }

	922

	923 // Prepare for next round (if there is one).

	924 if (i < n) {

	925 start = i;

	926 first = first_i;

	927 }

	928 }

	929 n = out;

	930

	931 // Round 3: Collapse runs of single literals into character classes.

	932 start = 0;

	933 out = 0;

	934 for (int i = 0; i <= n; i++) {

	935 // Invariant: what was in sub[0:start] has been Decref'ed

	936 // and that space has been reused for sub[0:out] (out <= start).

	937 //

	938 // Invariant: sub[start:i] consists of regexps that are either

	939 // literal runes or character classes.

	940

	941 if (i < n &&

	942 (sub[i]->op() == kRegexpLiteral \|\|

	943 sub[i]->op() == kRegexpCharClass))

	944 continue;

	945

	946 // sub[i] is not a char or char class;

	947 // emit char class for sub[start:i]...

	948 if (i == start) {

	949 // Nothing to do.

	950 } else if (i == start+1) {

	951 sub[out++] = sub[start];

	952 } else {

	953 // Make new char class.

	954 CharClassBuilder ccb;

	955 for (int j = start; j < i; j++) {

	956 Regexp* re = sub[j];

	957 if (re->op() == kRegexpCharClass) {

	958 CharClass* cc = re->cc();

	959 for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it)

	960 ccb.AddRange(it->lo, it->hi);

	961 } else if (re->op() == kRegexpLiteral) {

	962 ccb.AddRangeFlags(re->rune(), re->rune(), re->parse_flags());

	963 } else {

	964 LOG(DFATAL) << "RE2: unexpected op: " << re->op() << " "

	965 << re->ToString();

	966 }

	967 re->Decref();

	968 }

	969 sub[out++] = NewCharClass(ccb.GetCharClass(), altflags);

	970 }

	971

	972 // ... and then emit sub[i].

	973 if (i < n)

	974 sub[out++] = sub[i];

	975 start = i+1;

	976 }

	977 n = out;

	978

	979 // Round 4: Collapse runs of empty matches into single empty match.

	980 start = 0;

	981 out = 0;

	982 for (int i = 0; i < n; i++) {

	983 if (i + 1 < n &&

	984 sub[i]->op() == kRegexpEmptyMatch &&

	985 sub[i+1]->op() == kRegexpEmptyMatch) {

	986 sub[i]->Decref();

	987 continue;

	988 }

	989 sub[out++] = sub[i];

	990 }

	991 n = out;

	992

	993 return n;

	994 }

	995

	996 // Collapse the regexps on top of the stack, down to the

	997 // first marker, into a new op node (op == kRegexpAlternate

	998 // or op == kRegexpConcat).

	999 void Regexp::ParseState::DoCollapse(RegexpOp op) {

	1000 // Scan backward to marker, counting children of composite.

	1001 int n = 0;

	1002 Regexp* next = NULL;

	1003 Regexp* sub;

	1004 for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) {

	1005 next = sub->down_;

	1006 if (sub->op_ == op)

	1007 n += sub->nsub_;

	1008 else

	1009 n++;

	1010 }

	1011

	1012 // If there's just one child, leave it alone.

	1013 // (Concat of one thing is that one thing; alternate of one thing is same.)

	1014 if (stacktop_ != NULL && stacktop_->down_ == next)

	1015 return;

	1016

	1017 // Construct op (alternation or concatenation), flattening op of op.

	1018 Regexp** subs = new Regexp*[n];

	1019 next = NULL;

	1020 int i = n;

	1021 for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) {

	1022 next = sub->down_;

	1023 if (sub->op_ == op) {

	1024 Regexp** sub_subs = sub->sub();

	1025 for (int k = sub->nsub_ - 1; k >= 0; k--)

	1026 subs[--i] = sub_subs[k]->Incref();

	1027 sub->Decref();

	1028 } else {

	1029 subs[--i] = FinishRegexp(sub);

	1030 }

	1031 }

	1032

	1033 Regexp* re = ConcatOrAlternate(op, subs, n, flags_, true);

	1034 delete[] subs;

	1035 re->simple_ = re->ComputeSimple();

	1036 re->down_ = next;

	1037 stacktop_ = re;

	1038 }

	1039

	1040 // Finishes the current concatenation,

	1041 // collapsing it into a single regexp on the stack.

	1042 void Regexp::ParseState::DoConcatenation() {

	1043 Regexp* r1 = stacktop_;

	1044 if (r1 == NULL \|\| IsMarker(r1->op())) {

	1045 // empty concatenation is special case

	1046 Regexp* re = new Regexp(kRegexpEmptyMatch, flags_);

	1047 PushRegexp(re);

	1048 }

	1049 DoCollapse(kRegexpConcat);

	1050 }

	1051

	1052 // Finishes the current alternation,

	1053 // collapsing it to a single regexp on the stack.

	1054 void Regexp::ParseState::DoAlternation() {

	1055 DoVerticalBar();

	1056 // Now stack top is kVerticalBar.

	1057 Regexp* r1 = stacktop_;

	1058 stacktop_ = r1->down_;

	1059 r1->Decref();

	1060 DoCollapse(kRegexpAlternate);

	1061 }

	1062

	1063 // Incremental conversion of concatenated literals into strings.

	1064 // If top two elements on stack are both literal or string,

	1065 // collapse into single string.

	1066 // Don't walk down the stack -- the parser calls this frequently

	1067 // enough that below the bottom two is known to be collapsed.

	1068 // Only called when another regexp is about to be pushed

	1069 // on the stack, so that the topmost literal is not being considered.

	1070 // (Otherwise ab* would turn into (ab)*.)

	1071 // If r >= 0, consider pushing a literal r on the stack.

	1072 // Return whether that happened.

	1073 bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) {

	1074 Regexp* re1;

	1075 Regexp* re2;

	1076 if ((re1 = stacktop_) == NULL \|\| (re2 = re1->down_) == NULL)

	1077 return false;

	1078

	1079 if (re1->op_ != kRegexpLiteral && re1->op_ != kRegexpLiteralString)

	1080 return false;

	1081 if (re2->op_ != kRegexpLiteral && re2->op_ != kRegexpLiteralString)

	1082 return false;

	1083 if ((re1->parse_flags_ & FoldCase) != (re2->parse_flags_ & FoldCase))

	1084 return false;

	1085

	1086 if (re2->op_ == kRegexpLiteral) {

	1087 // convert into string

	1088 Rune rune = re2->rune_;

	1089 re2->op_ = kRegexpLiteralString;

	1090 re2->nrunes_ = 0;

	1091 re2->runes_ = NULL;

	1092 re2->AddRuneToString(rune);

	1093 }

	1094

	1095 // push re1 into re2.

	1096 if (re1->op_ == kRegexpLiteral) {

	1097 re2->AddRuneToString(re1->rune_);

	1098 } else {

	1099 for (int i = 0; i < re1->nrunes_; i++)

	1100 re2->AddRuneToString(re1->runes_[i]);

	1101 re1->nrunes_ = 0;

	1102 delete[] re1->runes_;

	1103 re1->runes_ = NULL;

	1104 }

	1105

	1106 // reuse re1 if possible

	1107 if (r >= 0) {

	1108 re1->op_ = kRegexpLiteral;

	1109 re1->rune_ = r;

	1110 re1->parse_flags_ = flags;

	1111 return true;

	1112 }

	1113

	1114 stacktop_ = re2;

	1115 re1->Decref();

	1116 return false;

	1117 }

	1118

	1119 // Lexing routines.

	1120

	1121 // Parses a decimal integer, storing it in *n.

	1122 // Sets *s to span the remainder of the string.

	1123 // Sets *out_re to the regexp for the class.

	1124 static bool ParseInteger(StringPiece* s, int* np) {

	1125 if (s->size() == 0 \|\| !isdigit((*s)[0] & 0xFF))

	1126 return false;

	1127 // Disallow leading zeros.

	1128 if (s->size() >= 2 && (s)[0] == '0' && isdigit((s)[1] & 0xFF))

	1129 return false;

	1130 int n = 0;

	1131 int c;

	1132 while (s->size() > 0 && isdigit(c = (*s)[0] & 0xFF)) {

	1133 // Avoid overflow.

	1134 if (n >= 100000000)

	1135 return false;

	1136 n = n*10 + c - '0';

	1137 s->remove_prefix(1); // digit

	1138 }

	1139 *np = n;

	1140 return true;

	1141 }

	1142

	1143 // Parses a repetition suffix like {1,2} or {2} or {2,}.

	1144 // Sets *s to span the remainder of the string on success.

	1145 // Sets lo and hi to the given range.

	1146 // In the case of {2,}, the high number is unbounded;

	1147 // sets *hi to -1 to signify this.

	1148 // {,2} is NOT a valid suffix.

	1149 // The Maybe in the name signifies that the regexp parse

	1150 // doesn't fail even if ParseRepetition does, so the StringPiece

	1151 // s must NOT be edited unless MaybeParseRepetition returns true.

	1152 static bool MaybeParseRepetition(StringPiece* sp, int* lo, int* hi) {

	1153 StringPiece s = *sp;

	1154 if (s.size() == 0 \|\| s[0] != '{')

	1155 return false;

	1156 s.remove_prefix(1); // '{'

	1157 if (!ParseInteger(&s, lo))

	1158 return false;

	1159 if (s.size() == 0)

	1160 return false;

	1161 if (s[0] == ',') {

	1162 s.remove_prefix(1); // ','

	1163 if (s.size() == 0)

	1164 return false;

	1165 if (s[0] == '}') {

	1166 // {2,} means at least 2

	1167 *hi = -1;

	1168 } else {

	1169 // {2,4} means 2, 3, or 4.

	1170 if (!ParseInteger(&s, hi))

	1171 return false;

	1172 }

	1173 } else {

	1174 // {2} means exactly two

	1175 hi = lo;

	1176 }

	1177 if (s.size() == 0 \|\| s[0] != '}')

	1178 return false;

	1179 s.remove_prefix(1); // '}'

	1180 *sp = s;

	1181 return true;

	1182 }

	1183

	1184 // Removes the next Rune from the StringPiece and stores it in *r.

	1185 // Returns number of bytes removed from sp.

	1186 // Behaves as though there is a terminating NUL at the end of sp.

	1187 // Argument order is backwards from usual Google style

	1188 // but consistent with chartorune.

	1189 static int StringPieceToRune(Rune r, StringPiece sp, RegexpStatus* status) {

	1190 int n;

	1191 if (fullrune(sp->data(), sp->size())) {

	1192 n = chartorune(r, sp->data());

	1193 if (!(n == 1 && *r == Runeerror)) { // no decoding error

	1194 sp->remove_prefix(n);

	1195 return n;

	1196 }

	1197 }

	1198

	1199 status->set_code(kRegexpBadUTF8);

	1200 status->set_error_arg(NULL);

	1201 return -1;

	1202 }

	1203

	1204 // Return whether name is valid UTF-8.

	1205 // If not, set status to kRegexpBadUTF8.

	1206 static bool IsValidUTF8(const StringPiece& s, RegexpStatus* status) {

	1207 StringPiece t = s;

	1208 Rune r;

	1209 while (t.size() > 0) {

	1210 if (StringPieceToRune(&r, &t, status) < 0)

	1211 return false;

	1212 }

	1213 return true;

	1214 }

	1215

	1216 // Is c a hex digit?

	1217 static int IsHex(int c) {

	1218 return ('0' <= c && c <= '9') \|\|

	1219 ('A' <= c && c <= 'F') \|\|

	1220 ('a' <= c && c <= 'f');

	1221 }

	1222

	1223 // Convert hex digit to value.

	1224 static int UnHex(int c) {

	1225 if ('0' <= c && c <= '9')

	1226 return c - '0';

	1227 if ('A' <= c && c <= 'F')

	1228 return c - 'A' + 10;

	1229 if ('a' <= c && c <= 'f')

	1230 return c - 'a' + 10;

	1231 LOG(DFATAL) << "Bad hex digit " << c;

	1232 return 0;

	1233 }

	1234

	1235 // Parse an escape sequence (e.g., \n, \{).

	1236 // Sets *s to span the remainder of the string.

	1237 // Sets *rp to the named character.

	1238 static bool ParseEscape(StringPiece* s, Rune* rp,

	1239 RegexpStatus* status, int rune_max) {

	1240 const char* begin = s->begin();

	1241 if (s->size() < 1 \|\| (*s)[0] != '\\') {

	1242 // Should not happen - caller always checks.

	1243 status->set_code(kRegexpInternalError);

	1244 status->set_error_arg(NULL);

	1245 return false;

	1246 }

	1247 if (s->size() < 2) {

	1248 status->set_code(kRegexpTrailingBackslash);

	1249 status->set_error_arg(NULL);

	1250 return false;

	1251 }

	1252 Rune c, c1;

	1253 s->remove_prefix(1); // backslash

	1254 if (StringPieceToRune(&c, s, status) < 0)

	1255 return false;

	1256 int code;

	1257 switch (c) {

	1258 default:

	1259 if (c < Runeself && !isalpha(c) && !isdigit(c)) {

	1260 // Escaped non-word characters are always themselves.

	1261 // PCRE is not quite so rigorous: it accepts things like

	1262 // \q, but we don't. We once rejected \_, but too many

	1263 // programs and people insist on using it, so allow \_.

	1264 *rp = c;

	1265 return true;

	1266 }

	1267 goto BadEscape;

	1268

	1269 // Octal escapes.

	1270 case '1':

	1271 case '2':

	1272 case '3':

	1273 case '4':

	1274 case '5':

	1275 case '6':

	1276 case '7':

	1277 // Single non-zero octal digit is a backreference; not supported.

	1278 if (s->size() == 0 \|\| (s)[0] < '0' \|\| (s)[0] > '7')

	1279 goto BadEscape;

	1280 // fall through

	1281 case '0':

	1282 // consume up to three octal digits; already have one.

	1283 code = c - '0';

	1284 if (s->size() > 0 && '0' <= (c = (*s)[0]) && c <= '7') {

	1285 code = code * 8 + c - '0';

	1286 s->remove_prefix(1); // digit

	1287 if (s->size() > 0) {

	1288 c = (*s)[0];

	1289 if ('0' <= c && c <= '7') {

	1290 code = code * 8 + c - '0';

	1291 s->remove_prefix(1); // digit

	1292 }

	1293 }

	1294 }

	1295 *rp = code;

	1296 return true;

	1297

	1298 // Hexadecimal escapes

	1299 case 'x':

	1300 if (s->size() == 0)

	1301 goto BadEscape;

	1302 if (StringPieceToRune(&c, s, status) < 0)

	1303 return false;

	1304 if (c == '{') {

	1305 // Any number of digits in braces.

	1306 // Update n as we consume the string, so that

	1307 // the whole thing gets shown in the error message.

	1308 // Perl accepts any text at all; it ignores all text

	1309 // after the first non-hex digit. We require only hex digits,

	1310 // and at least one.

	1311 if (StringPieceToRune(&c, s, status) < 0)

	1312 return false;

	1313 int nhex = 0;

	1314 code = 0;

	1315 while (IsHex(c)) {

	1316 nhex++;

	1317 code = code * 16 + UnHex(c);

	1318 if (code > rune_max)

	1319 goto BadEscape;

	1320 if (s->size() == 0)

	1321 goto BadEscape;

	1322 if (StringPieceToRune(&c, s, status) < 0)

	1323 return false;

	1324 }

	1325 if (c != '}' \|\| nhex == 0)

	1326 goto BadEscape;

	1327 *rp = code;

	1328 return true;

	1329 }

	1330 // Easy case: two hex digits.

	1331 if (s->size() == 0)

	1332 goto BadEscape;

	1333 if (StringPieceToRune(&c1, s, status) < 0)

	1334 return false;

	1335 if (!IsHex(c) \|\| !IsHex(c1))

	1336 goto BadEscape;

	1337 rp = UnHex(c) 16 + UnHex(c1);

	1338 return true;

	1339

	1340 // C escapes.

	1341 case 'n':

	1342 *rp = '\n';

	1343 return true;

	1344 case 'r':

	1345 *rp = '\r';

	1346 return true;

	1347 case 't':

	1348 *rp = '\t';

	1349 return true;

	1350

	1351 // Less common C escapes.

	1352 case 'a':

	1353 *rp = '\a';

	1354 return true;

	1355 case 'f':

	1356 *rp = '\f';

	1357 return true;

	1358 case 'v':

	1359 *rp = '\v';

	1360 return true;

	1361

	1362 // This code is disabled to avoid misparsing

	1363 // the Perl word-boundary \b as a backspace

	1364 // when in POSIX regexp mode. Surprisingly,

	1365 // in Perl, \b means word-boundary but [\b]

	1366 // means backspace. We don't support that:

	1367 // if you want a backspace embed a literal

	1368 // backspace character or use \x08.

	1369 //

	1370 // case 'b':

	1371 // *rp = '\b';

	1372 // return true;

	1373 }

	1374

	1375 LOG(DFATAL) << "Not reached in ParseEscape.";

	1376

	1377 BadEscape:

	1378 // Unrecognized escape sequence.

	1379 status->set_code(kRegexpBadEscape);

	1380 status->set_error_arg(StringPiece(begin, s->data() - begin));

	1381 return false;

	1382 }

	1383

	1384 // Add a range to the character class, but exclude newline if asked.

	1385 // Also handle case folding.

	1386 void CharClassBuilder::AddRangeFlags(

	1387 Rune lo, Rune hi, Regexp::ParseFlags parse_flags) {

	1388

	1389 // Take out \n if the flags say so.

	1390 bool cutnl = !(parse_flags & Regexp::ClassNL) \|\|

	1391 (parse_flags & Regexp::NeverNL);

	1392 if (cutnl && lo <= '\n' && '\n' <= hi) {

	1393 if (lo < '\n')

	1394 AddRangeFlags(lo, '\n' - 1, parse_flags);

	1395 if (hi > '\n')

	1396 AddRangeFlags('\n' + 1, hi, parse_flags);

	1397 return;

	1398 }

	1399

	1400 // If folding case, add fold-equivalent characters too.

	1401 if (parse_flags & Regexp::FoldCase)

	1402 AddFoldedRange(this, lo, hi, 0);

	1403 else

	1404 AddRange(lo, hi);

	1405 }

	1406

	1407 // Look for a group with the given name.

	1408 static UGroup* LookupGroup(const StringPiece& name,

	1409 UGroup *groups, int ngroups) {

	1410 // Simple name lookup.

	1411 for (int i = 0; i < ngroups; i++)

	1412 if (StringPiece(groups[i].name) == name)

	1413 return &groups[i];

	1414 return NULL;

	1415 }

	1416

	1417 // Fake UGroup containing all Runes

	1418 static URange16 any16[] = { { 0, 65535 } };

	1419 static URange32 any32[] = { { 65536, Runemax } };

	1420 static UGroup anygroup = { "Any", +1, any16, 1, any32, 1 };

	1421

	1422 // Look for a POSIX group with the given name (e.g., "[:^alpha:]")

	1423 static UGroup* LookupPosixGroup(const StringPiece& name) {

	1424 return LookupGroup(name, posix_groups, num_posix_groups);

	1425 }

	1426

	1427 static UGroup* LookupPerlGroup(const StringPiece& name) {

	1428 return LookupGroup(name, perl_groups, num_perl_groups);

	1429 }

	1430

	1431 // Look for a Unicode group with the given name (e.g., "Han")

	1432 static UGroup* LookupUnicodeGroup(const StringPiece& name) {

	1433 // Special case: "Any" means any.

	1434 if (name == StringPiece("Any"))

	1435 return &anygroup;

	1436 return LookupGroup(name, unicode_groups, num_unicode_groups);

	1437 }

	1438

	1439 // Add a UGroup or its negation to the character class.

	1440 static void AddUGroup(CharClassBuilder cc, UGroup g, int sign,

	1441 Regexp::ParseFlags parse_flags) {

	1442 if (sign == +1) {

	1443 for (int i = 0; i < g->nr16; i++) {

	1444 cc->AddRangeFlags(g->r16[i].lo, g->r16[i].hi, parse_flags);

	1445 }

	1446 for (int i = 0; i < g->nr32; i++) {

	1447 cc->AddRangeFlags(g->r32[i].lo, g->r32[i].hi, parse_flags);

	1448 }

	1449 } else {

	1450 if (parse_flags & Regexp::FoldCase) {

	1451 // Normally adding a case-folded group means

	1452 // adding all the extra fold-equivalent runes too.

	1453 // But if we're adding the negation of the group,

	1454 // we have to exclude all the runes that are fold-equivalent

	1455 // to what's already missing. Too hard, so do in two steps.

	1456 CharClassBuilder ccb1;

	1457 AddUGroup(&ccb1, g, +1, parse_flags);

	1458 ccb1.Negate();

	1459 cc->AddCharClass(&ccb1);

	1460 return;

	1461 }

	1462 int next = 0;

	1463 for (int i = 0; i < g->nr16; i++) {

	1464 if (next < g->r16[i].lo)

	1465 cc->AddRangeFlags(next, g->r16[i].lo - 1, parse_flags);

	1466 next = g->r16[i].hi + 1;

	1467 }

	1468 for (int i = 0; i < g->nr32; i++) {

	1469 if (next < g->r32[i].lo)

	1470 cc->AddRangeFlags(next, g->r32[i].lo - 1, parse_flags);

	1471 next = g->r32[i].hi + 1;

	1472 }

	1473 if (next <= Runemax)

	1474 cc->AddRangeFlags(next, Runemax, parse_flags);

	1475 }

	1476 }

	1477

	1478 // Maybe parse a Perl character class escape sequence.

	1479 // Only recognizes the Perl character classes (\d \s \w \D \S \W),

	1480 // not the Perl empty-string classes (\b \B \A \Z \z).

	1481 // On success, sets *s to span the remainder of the string

	1482 // and returns the corresponding UGroup.

	1483 // The StringPiece must NOT be edited unless the call succeeds.

	1484 UGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_flags) {

	1485 if (!(parse_flags & Regexp::PerlClasses))

	1486 return NULL;

	1487 if (s->size() < 2 \|\| (*s)[0] != '\\')

	1488 return NULL;

	1489 // Could use StringPieceToRune, but there aren't

	1490 // any non-ASCII Perl group names.

	1491 StringPiece name(s->begin(), 2);

	1492 UGroup *g = LookupPerlGroup(name);

	1493 if (g == NULL)

	1494 return NULL;

	1495 s->remove_prefix(name.size());

	1496 return g;

	1497 }

	1498

	1499 enum ParseStatus {

	1500 kParseOk, // Did some parsing.

	1501 kParseError, // Found an error.

	1502 kParseNothing, // Decided not to parse.

	1503 };

	1504

	1505 // Maybe parses a Unicode character group like \p{Han} or \P{Han}

	1506 // (the latter is a negated group).

	1507 ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags,

	1508 CharClassBuilder *cc,

	1509 RegexpStatus* status) {

	1510 // Decide whether to parse.

	1511 if (!(parse_flags & Regexp::UnicodeGroups))

	1512 return kParseNothing;

	1513 if (s->size() < 2 \|\| (*s)[0] != '\\')

	1514 return kParseNothing;

	1515 Rune c = (*s)[1];

	1516 if (c != 'p' && c != 'P')

	1517 return kParseNothing;

	1518

	1519 // Committed to parse. Results:

	1520 int sign = +1; // -1 = negated char class

	1521 if (c == 'P')

	1522 sign = -1;

	1523 StringPiece seq = *s; // \p{Han} or \pL

	1524 StringPiece name; // Han or L

	1525 s->remove_prefix(2); // '\\', 'p'

	1526

	1527 if (!StringPieceToRune(&c, s, status))

	1528 return kParseError;

	1529 if (c != '{') {

	1530 // Name is the bit of string we just skipped over for c.

	1531 const char* p = seq.begin() + 2;

	1532 name = StringPiece(p, s->begin() - p);

	1533 } else {

	1534 // Name is in braces. Look for closing }

	1535 int end = s->find('}', 0);

	1536 if (end == s->npos) {

	1537 if (!IsValidUTF8(seq, status))

	1538 return kParseError;

	1539 status->set_code(kRegexpBadCharRange);

	1540 status->set_error_arg(seq);

	1541 return kParseError;

	1542 }

	1543 name = StringPiece(s->begin(), end); // without '}'

	1544 s->remove_prefix(end + 1); // with '}'

	1545 if (!IsValidUTF8(name, status))

	1546 return kParseError;

	1547 }

	1548

	1549 // Chop seq where s now begins.

	1550 seq = StringPiece(seq.begin(), s->begin() - seq.begin());

	1551

	1552 // Look up group

	1553 if (name.size() > 0 && name[0] == '^') {

	1554 sign = -sign;

	1555 name.remove_prefix(1); // '^'

	1556 }

	1557 UGroup *g = LookupUnicodeGroup(name);

	1558 if (g == NULL) {

	1559 status->set_code(kRegexpBadCharRange);

	1560 status->set_error_arg(seq);

	1561 return kParseError;

	1562 }

	1563

	1564 AddUGroup(cc, g, sign, parse_flags);

	1565 return kParseOk;

	1566 }

	1567

	1568 // Parses a character class name like [:alnum:].

	1569 // Sets *s to span the remainder of the string.

	1570 // Adds the ranges corresponding to the class to ranges.

	1571 static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags,

	1572 CharClassBuilder *cc,

	1573 RegexpStatus* status) {

	1574 // Check begins with [:

	1575 const char* p = s->data();

	1576 const char* ep = s->data() + s->size();

	1577 if (ep - p < 2 \|\| p[0] != '[' \|\| p[1] != ':')

	1578 return kParseNothing;

	1579

	1580 // Look for closing :].

	1581 const char* q;

	1582 for (q = p+2; q <= ep-2 && (q != ':' \|\| (q+1) != ']'); q++)

	1583 ;

	1584

	1585 // If no closing :], then ignore.

	1586 if (q > ep-2)

	1587 return kParseNothing;

	1588

	1589 // Got it. Check that it's valid.

	1590 q += 2;

	1591 StringPiece name(p, q-p);

	1592

	1593 UGroup *g = LookupPosixGroup(name);

	1594 if (g == NULL) {

	1595 status->set_code(kRegexpBadCharRange);

	1596 status->set_error_arg(name);

	1597 return kParseError;

	1598 }

	1599

	1600 s->remove_prefix(name.size());

	1601 AddUGroup(cc, g, g->sign, parse_flags);

	1602 return kParseOk;

	1603 }

	1604

	1605 // Parses a character inside a character class.

	1606 // There are fewer special characters here than in the rest of the regexp.

	1607 // Sets *s to span the remainder of the string.

	1608 // Sets *rp to the character.

	1609 bool Regexp::ParseState::ParseCCCharacter(StringPiece* s, Rune *rp,

	1610 const StringPiece& whole_class,

	1611 RegexpStatus* status) {

	1612 if (s->size() == 0) {

	1613 status->set_code(kRegexpMissingBracket);

	1614 status->set_error_arg(whole_class);

	1615 return false;

	1616 }

	1617

	1618 // Allow regular escape sequences even though

	1619 // many need not be escaped in this context.

	1620 if (s->size() >= 1 && (*s)[0] == '\\')

	1621 return ParseEscape(s, rp, status, rune_max_);

	1622

	1623 // Otherwise take the next rune.

	1624 return StringPieceToRune(rp, s, status) >= 0;

	1625 }

	1626

	1627 // Parses a character class character, or, if the character

	1628 // is followed by a hyphen, parses a character class range.

	1629 // For single characters, rr->lo == rr->hi.

	1630 // Sets *s to span the remainder of the string.

	1631 // Sets *rp to the character.

	1632 bool Regexp::ParseState::ParseCCRange(StringPiece* s, RuneRange* rr,

	1633 const StringPiece& whole_class,

	1634 RegexpStatus* status) {

	1635 StringPiece os = *s;

	1636 if (!ParseCCCharacter(s, &rr->lo, whole_class, status))

	1637 return false;

	1638 // [a-] means (a\|-), so check for final ].

	1639 if (s->size() >= 2 && (s)[0] == '-' && (s)[1] != ']') {

	1640 s->remove_prefix(1); // '-'

	1641 if (!ParseCCCharacter(s, &rr->hi, whole_class, status))

	1642 return false;

	1643 if (rr->hi < rr->lo) {

	1644 status->set_code(kRegexpBadCharRange);

	1645 status->set_error_arg(StringPiece(os.data(), s->data() - os.data()));

	1646 return false;

	1647 }

	1648 } else {

	1649 rr->hi = rr->lo;

	1650 }

	1651 return true;

	1652 }

	1653

	1654 // Parses a possibly-negated character class expression like [^abx-z[:digit:]].

	1655 // Sets *s to span the remainder of the string.

	1656 // Sets *out_re to the regexp for the class.

	1657 bool Regexp::ParseState::ParseCharClass(StringPiece* s,

	1658 Regexp** out_re,

	1659 RegexpStatus* status) {

	1660 StringPiece whole_class = *s;

	1661 if (s->size() == 0 \|\| (*s)[0] != '[') {

	1662 // Caller checked this.

	1663 status->set_code(kRegexpInternalError);

	1664 status->set_error_arg(NULL);

	1665 return false;

	1666 }

	1667 bool negated = false;

	1668 Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);

	1669 re->ccb_ = new CharClassBuilder;

	1670 s->remove_prefix(1); // '['

	1671 if (s->size() > 0 && (*s)[0] == '^') {

	1672 s->remove_prefix(1); // '^'

	1673 negated = true;

	1674 if (!(flags_ & ClassNL) \|\| (flags_ & NeverNL)) {

	1675 // If NL can't match implicitly, then pretend

	1676 // negated classes include a leading \n.

	1677 re->ccb_->AddRange('\n', '\n');

	1678 }

	1679 }

	1680 bool first = true; // ] is okay as first char in class

	1681 while (s->size() > 0 && ((*s)[0] != ']' \|\| first)) {

	1682 // - is only okay unescaped as first or last in class.

	1683 // Except that Perl allows - anywhere.

	1684 if ((*s)[0] == '-' && !first && !(flags_&PerlX) &&

	1685 (s->size() == 1 \|\| (*s)[1] != ']')) {

	1686 StringPiece t = *s;

	1687 t.remove_prefix(1); // '-'

	1688 Rune r;

	1689 int n = StringPieceToRune(&r, &t, status);

	1690 if (n < 0) {

	1691 re->Decref();

	1692 return false;

	1693 }

	1694 status->set_code(kRegexpBadCharRange);

	1695 status->set_error_arg(StringPiece(s->data(), 1+n));

	1696 re->Decref();

	1697 return false;

	1698 }

	1699 first = false;

	1700

	1701 // Look for [:alnum:] etc.

	1702 if (s->size() > 2 && (s)[0] == '[' && (s)[1] == ':') {

	1703 switch (ParseCCName(s, flags_, re->ccb_, status)) {

	1704 case kParseOk:

	1705 continue;

	1706 case kParseError:

	1707 re->Decref();

	1708 return false;

	1709 case kParseNothing:

	1710 break;

	1711 }

	1712 }

	1713

	1714 // Look for Unicode character group like \p{Han}

	1715 if (s->size() > 2 &&

	1716 (*s)[0] == '\\' &&

	1717 ((s)[1] == 'p' \|\| (s)[1] == 'P')) {

	1718 switch (ParseUnicodeGroup(s, flags_, re->ccb_, status)) {

	1719 case kParseOk:

	1720 continue;

	1721 case kParseError:

	1722 re->Decref();

	1723 return false;

	1724 case kParseNothing:

	1725 break;

	1726 }

	1727 }

	1728

	1729 // Look for Perl character class symbols (extension).

	1730 UGroup *g = MaybeParsePerlCCEscape(s, flags_);

	1731 if (g != NULL) {

	1732 AddUGroup(re->ccb_, g, g->sign, flags_);

	1733 continue;

	1734 }

	1735

	1736 // Otherwise assume single character or simple range.

	1737 RuneRange rr;

	1738 if (!ParseCCRange(s, &rr, whole_class, status)) {

	1739 re->Decref();

	1740 return false;

	1741 }

	1742 // AddRangeFlags is usually called in response to a class like

	1743 // \p{Foo} or [[:foo:]]; for those, it filters \n out unless

	1744 // Regexp::ClassNL is set. In an explicit range or singleton

	1745 // like we just parsed, we do not filter \n out, so set ClassNL

	1746 // in the flags.

	1747 re->ccb_->AddRangeFlags(rr.lo, rr.hi, flags_ \| Regexp::ClassNL);

	1748 }

	1749 if (s->size() == 0) {

	1750 status->set_code(kRegexpMissingBracket);

	1751 status->set_error_arg(whole_class);

	1752 re->Decref();

	1753 return false;

	1754 }

	1755 s->remove_prefix(1); // ']'

	1756

	1757 if (negated)

	1758 re->ccb_->Negate();

	1759 re->ccb_->RemoveAbove(rune_max_);

	1760

	1761 *out_re = re;

	1762 return true;

	1763 }

	1764

	1765 // Is this a valid capture name? [A-Za-z0-9_]+

	1766 // PCRE limits names to 32 bytes.

	1767 // Python rejects names starting with digits.

	1768 // We don't enforce either of those.

	1769 static bool IsValidCaptureName(const StringPiece& name) {

	1770 if (name.size() == 0)

	1771 return false;

	1772 for (int i = 0; i < name.size(); i++) {

	1773 int c = name[i];

	1774 if (('0' <= c && c <= '9') \|\|

	1775 ('a' <= c && c <= 'z') \|\|

	1776 ('A' <= c && c <= 'Z') \|\|

	1777 c == '_')

	1778 continue;

	1779 return false;

	1780 }

	1781 return true;

	1782 }

	1783

	1784 // Parses a Perl flag setting or non-capturing group or both,

	1785 // like (?i) or (?: or (?i:. Removes from s, updates parse state.

	1786 // The caller must check that s begins with "(?".

	1787 // Returns true on success. If the Perl flag is not

	1788 // well-formed or not supported, sets status_ and returns false.

	1789 bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) {

	1790 StringPiece t = *s;

	1791

	1792 // Caller is supposed to check this.

	1793 if (!(flags_ & PerlX) \|\| t.size() < 2 \|\| t[0] != '(' \|\| t[1] != '?') {

	1794 LOG(DFATAL) << "Bad call to ParseState::ParsePerlFlags";

	1795 status_->set_code(kRegexpInternalError);

	1796 return false;

	1797 }

	1798

	1799 t.remove_prefix(2); // "(?"

	1800

	1801 // Check for named captures, first introduced in Python's regexp library.

	1802 // As usual, there are three slightly different syntaxes:

	1803 //

	1804 // (?P<name>expr) the original, introduced by Python

	1805 // (?<name>expr) the .NET alteration, adopted by Perl 5.10

	1806 // (?'name'expr) another .NET alteration, adopted by Perl 5.10

	1807 //

	1808 // Perl 5.10 gave in and implemented the Python version too,

	1809 // but they claim that the last two are the preferred forms.

	1810 // PCRE and languages based on it (specifically, PHP and Ruby)

	1811 // support all three as well. EcmaScript 4 uses only the Python form.

	1812 //

	1813 // In both the open source world (via Code Search) and the

	1814 // Google source tree, (?P<expr>name) is the dominant form,

	1815 // so that's the one we implement. One is enough.

	1816 if (t.size() > 2 && t[0] == 'P' && t[1] == '<') {

	1817 // Pull out name.

	1818 int end = t.find('>', 2);

	1819 if (end == t.npos) {

	1820 if (!IsValidUTF8(*s, status_))

	1821 return false;

	1822 status_->set_code(kRegexpBadNamedCapture);

	1823 status_->set_error_arg(*s);

	1824 return false;

	1825 }

	1826

	1827 // t is "P<name>...", t[end] == '>'

	1828 StringPiece capture(t.begin()-2, end+3); // "(?P<name>"

	1829 StringPiece name(t.begin()+2, end-2); // "name"

	1830 if (!IsValidUTF8(name, status_))

	1831 return false;

	1832 if (!IsValidCaptureName(name)) {

	1833 status_->set_code(kRegexpBadNamedCapture);

	1834 status_->set_error_arg(capture);

	1835 return false;

	1836 }

	1837

	1838 if (!DoLeftParen(name)) {

	1839 // DoLeftParen's failure set status_.

	1840 return false;

	1841 }

	1842

	1843 s->remove_prefix(capture.end() - s->begin());

	1844 return true;

	1845 }

	1846

	1847 bool negated = false;

	1848 bool sawflags = false;

	1849 int nflags = flags_;

	1850 Rune c;

	1851 for (bool done = false; !done; ) {

	1852 if (t.size() == 0)

	1853 goto BadPerlOp;

	1854 if (StringPieceToRune(&c, &t, status_) < 0)

	1855 return false;

	1856 switch (c) {

	1857 default:

	1858 goto BadPerlOp;

	1859

	1860 // Parse flags.

	1861 case 'i':

	1862 sawflags = true;

	1863 if (negated)

	1864 nflags &= ~FoldCase;

	1865 else

	1866 nflags \|= FoldCase;

	1867 break;

	1868

	1869 case 'm': // opposite of our OneLine

	1870 sawflags = true;

	1871 if (negated)

	1872 nflags \|= OneLine;

	1873 else

	1874 nflags &= ~OneLine;

	1875 break;

	1876

	1877 case 's':

	1878 sawflags = true;

	1879 if (negated)

	1880 nflags &= ~DotNL;

	1881 else

	1882 nflags \|= DotNL;

	1883 break;

	1884

	1885 case 'U':

	1886 sawflags = true;

	1887 if (negated)

	1888 nflags &= ~NonGreedy;

	1889 else

	1890 nflags \|= NonGreedy;

	1891 break;

	1892

	1893 // Negation

	1894 case '-':

	1895 if (negated)

	1896 goto BadPerlOp;

	1897 negated = true;

	1898 sawflags = false;

	1899 break;

	1900

	1901 // Open new group.

	1902 case ':':

	1903 if (!DoLeftParenNoCapture()) {

	1904 // DoLeftParenNoCapture's failure set status_.

	1905 return false;

	1906 }

	1907 done = true;

	1908 break;

	1909

	1910 // Finish flags.

	1911 case ')':

	1912 done = true;

	1913 break;

	1914 }

	1915 }

	1916

	1917 if (negated && !sawflags)

	1918 goto BadPerlOp;

	1919

	1920 flags_ = static_cast<Regexp::ParseFlags>(nflags);

	1921 *s = t;

	1922 return true;

	1923

	1924 BadPerlOp:

	1925 status_->set_code(kRegexpBadPerlOp);

	1926 status_->set_error_arg(StringPiece(s->begin(), t.begin() - s->begin()));

	1927 return false;

	1928 }

	1929

	1930 // Converts latin1 (assumed to be encoded as Latin1 bytes)

	1931 // into UTF8 encoding in string.

	1932 // Can't use EncodingUtils::EncodeLatin1AsUTF8 because it is

	1933 // deprecated and because it rejects code points 0x80-0x9F.

	1934 void ConvertLatin1ToUTF8(const StringPiece& latin1, string* utf) {

	1935 char buf[UTFmax];

	1936

	1937 utf->clear();

	1938 for (int i = 0; i < latin1.size(); i++) {

	1939 Rune r = latin1[i] & 0xFF;

	1940 int n = runetochar(buf, &r);

	1941 utf->append(buf, n);

	1942 }

	1943 }

	1944

	1945 // Parses the regular expression given by s,

	1946 // returning the corresponding Regexp tree.

	1947 // The caller must Decref the return value when done with it.

	1948 // Returns NULL on error.

	1949 Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags,

	1950 RegexpStatus* status) {

	1951 // Make status non-NULL (easier on everyone else).

	1952 RegexpStatus xstatus;

	1953 if (status == NULL)

	1954 status = &xstatus;

	1955

	1956 ParseState ps(global_flags, s, status);

	1957 StringPiece t = s;

	1958

	1959 // Convert regexp to UTF-8 (easier on the rest of the parser).

	1960 if (global_flags & Latin1) {

	1961 string* tmp = new string;

	1962 ConvertLatin1ToUTF8(t, tmp);

	1963 status->set_tmp(tmp);

	1964 t = *tmp;

	1965 }

	1966

	1967 if (global_flags & Literal) {

	1968 // Special parse loop for literal string.

	1969 while (t.size() > 0) {

	1970 Rune r;

	1971 if (StringPieceToRune(&r, &t, status) < 0)

	1972 return NULL;

	1973 if (!ps.PushLiteral(r))

	1974 return NULL;

	1975 }

	1976 return ps.DoFinish();

	1977 }

	1978

	1979 StringPiece lastunary = NULL;

	1980 while (t.size() > 0) {

	1981 StringPiece isunary = NULL;

	1982 switch (t[0]) {

	1983 default: {

	1984 Rune r;

	1985 if (StringPieceToRune(&r, &t, status) < 0)

	1986 return NULL;

	1987 if (!ps.PushLiteral(r))

	1988 return NULL;

	1989 break;

	1990 }

	1991

	1992 case '(':

	1993 // "(?" introduces Perl escape.

	1994 if ((ps.flags() & PerlX) && (t.size() >= 2 && t[1] == '?')) {

	1995 // Flag changes and non-capturing groups.

	1996 if (!ps.ParsePerlFlags(&t))

	1997 return NULL;

	1998 break;

	1999 }

	2000 if (!ps.DoLeftParen(NULL))

	2001 return NULL;

	2002 t.remove_prefix(1); // '('

	2003 break;

	2004

	2005 case '\|':

	2006 if (!ps.DoVerticalBar())

	2007 return NULL;

	2008 t.remove_prefix(1); // '\|'

	2009 break;

	2010

	2011 case ')':

	2012 if (!ps.DoRightParen())

	2013 return NULL;

	2014 t.remove_prefix(1); // ')'

	2015 break;

	2016

	2017 case '^': // Beginning of line.

	2018 if (!ps.PushCarat())

	2019 return NULL;

	2020 t.remove_prefix(1); // '^'

	2021 break;

	2022

	2023 case '$': // End of line.

	2024 if (!ps.PushDollar())

	2025 return NULL;

	2026 t.remove_prefix(1); // '$'

	2027 break;

	2028

	2029 case '.': // Any character (possibly except newline).

	2030 if (!ps.PushDot())

	2031 return NULL;

	2032 t.remove_prefix(1); // '.'

	2033 break;

	2034

	2035 case '[': { // Character class.

	2036 Regexp* re;

	2037 if (!ps.ParseCharClass(&t, &re, status))

	2038 return NULL;

	2039 if (!ps.PushRegexp(re))

	2040 return NULL;

	2041 break;

	2042 }

	2043

	2044 case '*': { // Zero or more.

	2045 RegexpOp op;

	2046 op = kRegexpStar;

	2047 goto Rep;

	2048 case '+': // One or more.

	2049 op = kRegexpPlus;

	2050 goto Rep;

	2051 case '?': // Zero or one.

	2052 op = kRegexpQuest;

	2053 goto Rep;

	2054 Rep:

	2055 StringPiece opstr = t;

	2056 bool nongreedy = false;

	2057 t.remove_prefix(1); // '*' or '+' or '?'

	2058 if (ps.flags() & PerlX) {

	2059 if (t.size() > 0 && t[0] == '?') {

	2060 nongreedy = true;

	2061 t.remove_prefix(1); // '?'

	2062 }

	2063 if (lastunary.size() > 0) {

	2064 // In Perl it is not allowed to stack repetition operators:

	2065 // a** is a syntax error, not a double-star.

	2066 // (and a++ means something else entirely, which we don't support!)

	2067 status->set_code(kRegexpRepeatOp);

	2068 status->set_error_arg(StringPiece(lastunary.begin(),

	2069 t.begin() - lastunary.begin()));

	2070 return NULL;

	2071 }

	2072 }

	2073 opstr.set(opstr.data(), t.data() - opstr.data());

	2074 if (!ps.PushRepeatOp(op, opstr, nongreedy))

	2075 return NULL;

	2076 isunary = opstr;

	2077 break;

	2078 }

	2079

	2080 case '{': { // Counted repetition.

	2081 int lo, hi;

	2082 StringPiece opstr = t;

	2083 if (!MaybeParseRepetition(&t, &lo, &hi)) {

	2084 // Treat like a literal.

	2085 if (!ps.PushLiteral('{'))

	2086 return NULL;

	2087 t.remove_prefix(1); // '{'

	2088 break;

	2089 }

	2090 bool nongreedy = false;

	2091 if (ps.flags() & PerlX) {

	2092 if (t.size() > 0 && t[0] == '?') {

	2093 nongreedy = true;

	2094 t.remove_prefix(1); // '?'

	2095 }

	2096 if (lastunary.size() > 0) {

	2097 // Not allowed to stack repetition operators.

	2098 status->set_code(kRegexpRepeatOp);

	2099 status->set_error_arg(StringPiece(lastunary.begin(),

	2100 t.begin() - lastunary.begin()));

	2101 return NULL;

	2102 }

	2103 }

	2104 opstr.set(opstr.data(), t.data() - opstr.data());

	2105 if (!ps.PushRepetition(lo, hi, opstr, nongreedy))

	2106 return NULL;

	2107 isunary = opstr;

	2108 break;

	2109 }

	2110

	2111 case '\\': { // Escaped character or Perl sequence.

	2112 // \b and \B: word boundary or not

	2113 if ((ps.flags() & Regexp::PerlB) &&

	2114 t.size() >= 2 && (t[1] == 'b' \|\| t[1] == 'B')) {

	2115 if (!ps.PushWordBoundary(t[1] == 'b'))

	2116 return NULL;

	2117 t.remove_prefix(2); // '\\', 'b'

	2118 break;

	2119 }

	2120

	2121 if ((ps.flags() & Regexp::PerlX) && t.size() >= 2) {

	2122 if (t[1] == 'A') {

	2123 if (!ps.PushSimpleOp(kRegexpBeginText))

	2124 return NULL;

	2125 t.remove_prefix(2); // '\\', 'A'

	2126 break;

	2127 }

	2128 if (t[1] == 'z') {

	2129 if (!ps.PushSimpleOp(kRegexpEndText))

	2130 return NULL;

	2131 t.remove_prefix(2); // '\\', 'z'

	2132 break;

	2133 }

	2134 // Do not recognize \Z, because this library can't

	2135 // implement the exact Perl/PCRE semantics.

	2136 // (This library treats "(?-m)$" as \z, even though

	2137 // in Perl and PCRE it is equivalent to \Z.)

	2138

	2139 if (t[1] == 'C') { // \C: any byte [sic]

	2140 if (!ps.PushSimpleOp(kRegexpAnyByte))

	2141 return NULL;

	2142 t.remove_prefix(2); // '\\', 'C'

	2143 break;

	2144 }

	2145

	2146 if (t[1] == 'Q') { // \Q ... \E: the ... is always literals

	2147 t.remove_prefix(2); // '\\', 'Q'

	2148 while (t.size() > 0) {

	2149 if (t.size() >= 2 && t[0] == '\\' && t[1] == 'E') {

	2150 t.remove_prefix(2); // '\\', 'E'

	2151 break;

	2152 }

	2153 Rune r;

	2154 if (StringPieceToRune(&r, &t, status) < 0)

	2155 return NULL;

	2156 if (!ps.PushLiteral(r))

	2157 return NULL;

	2158 }

	2159 break;

	2160 }

	2161 }

	2162

	2163 if (t.size() >= 2 && (t[1] == 'p' \|\| t[1] == 'P')) {

	2164 Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase);

	2165 re->ccb_ = new CharClassBuilder;

	2166 switch (ParseUnicodeGroup(&t, ps.flags(), re->ccb_, status)) {

	2167 case kParseOk:

	2168 if (!ps.PushRegexp(re))

	2169 return NULL;

	2170 goto Break2;

	2171 case kParseError:

	2172 re->Decref();

	2173 return NULL;

	2174 case kParseNothing:

	2175 re->Decref();

	2176 break;

	2177 }

	2178 }

	2179

	2180 UGroup *g = MaybeParsePerlCCEscape(&t, ps.flags());

	2181 if (g != NULL) {

	2182 Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase);

	2183 re->ccb_ = new CharClassBuilder;

	2184 AddUGroup(re->ccb_, g, g->sign, ps.flags());

	2185 if (!ps.PushRegexp(re))

	2186 return NULL;

	2187 break;

	2188 }

	2189

	2190 Rune r;

	2191 if (!ParseEscape(&t, &r, status, ps.rune_max()))

	2192 return NULL;

	2193 if (!ps.PushLiteral(r))

	2194 return NULL;

	2195 break;

	2196 }

	2197 }

	2198 Break2:

	2199 lastunary = isunary;

	2200 }

	2201 return ps.DoFinish();

	2202 }

	2203

	2204 } // namespace re2

OLD	NEW

« no previous file with comments | « third_party/re2/re2/onepass.cc ('k') | third_party/re2/re2/perl_groups.cc » ('j') | no next file with comments »