third_party/re2/ucs2.diff - Issue 10575037: Include RE2 library

Unified Diff: third_party/re2/ucs2.diff

Issue 10575037: Include RE2 library (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Less intrusive fix for Android Created 8 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: third_party/re2/ucs2.diff

diff --git a/third_party/re2/ucs2.diff b/third_party/re2/ucs2.diff

new file mode 100644

index 0000000000000000000000000000000000000000..57aec04a15cfd2fc93dd75468841076a7042fc97

--- /dev/null

+++ b/third_party/re2/ucs2.diff

@@ -0,0 +1,567 @@

+This is a dump from Google's source control system of the change

+that removed UCS-2 support from RE2. As the explanation below

+says, UCS-2 mode is fundamentally at odds with things like ^ and $,

+so it never really worked very well. But if you are interested in using

+it without those operators, it did work for that. It assumed that the

+UCS-2 data was in the native host byte order.

+If you are interested in adding UCS-2 mode back, this patch might

+be a good starting point.

+Change 12780686 by rsc@rsc-re2 on 2009/09/16 15:30:15

+ Retire UCS-2 mode.

+ I added it as an experiment for V8, but it

+ requires 2-byte lookahead to do completely,

+ and RE2 has 1-byte lookahead (enough for UTF-8)

+ as a fairly deep fundamental assumption,

+ so it did not support ^ or $.

+==== re2/bitstate.cc#2 - re2/bitstate.cc#3 ====

+re2/bitstate.cc#2:314,321 - re2/bitstate.cc#3:314,319

+ cap_[0] = p;

+ if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.

+ return true;

+- if (prog_->flags() & Regexp::UCS2)

+- p++;

+ }

+ return false;

+ }

+==== re2/compile.cc#17 - re2/compile.cc#18 ====

+re2/compile.cc#17:95,101 - re2/compile.cc#18:95,100

+ // Input encodings.

+ enum Encoding {

+ kEncodingUTF8 = 1, // UTF-8 (0-10FFFF)

+- kEncodingUCS2, // UCS-2 (0-FFFF), native byte order

+ kEncodingLatin1, // Latin1 (0-FF)

+ };

+re2/compile.cc#17:168,176 - re2/compile.cc#18:167,172

+ void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);

+ void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);

+ void Add_80_10ffff();

+- void AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase);

+- void AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1,

+- uint8 lo2, uint8 hi2, bool fold2);

+ // New suffix that matches the byte range lo-hi, then goes to next.

+ Inst* RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, Inst* next);

+re2/compile.cc#17:475,481 - re2/compile.cc#18:471,477

+ // Converts rune range lo-hi into a fragment that recognizes

+ // the bytes that would make up those runes in the current

+- // encoding (Latin 1, UTF-8, or UCS-2).

++ // encoding (Latin 1 or UTF-8).

+ // This lets the machine work byte-by-byte even when

+ // using multibyte encodings.

+re2/compile.cc#17:488,496 - re2/compile.cc#18:484,489

+ case kEncodingLatin1:

+ AddRuneRangeLatin1(lo, hi, foldcase);

+ break;

+- case kEncodingUCS2:

+- AddRuneRangeUCS2(lo, hi, foldcase);

+- break;

+ }

+re2/compile.cc#17:503,581 - re2/compile.cc#18:496,501

+ AddSuffix(RuneByteSuffix(lo, hi, foldcase, NULL));

+ }

+- // Test whether 16-bit values are big or little endian.

+- static bool BigEndian() {

+- union {

+- char byte[2];

+- int16 endian;

+- } u;

+- u.byte[0] = 1;

+- u.byte[1] = 2;

+- return u.endian == 0x0102;

+- }

+- void Compiler::AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1,

+- uint8 lo2, uint8 hi2, bool fold2) {

+- Inst* ip;

+- if (reversed_) {

+- ip = RuneByteSuffix(lo1, hi1, fold1, NULL);

+- ip = RuneByteSuffix(lo2, hi2, fold2, ip);

+- } else {

+- ip = RuneByteSuffix(lo2, hi2, fold2, NULL);

+- ip = RuneByteSuffix(lo1, hi1, fold1, ip);

+- }

+- AddSuffix(ip);

+- }

+- void Compiler::AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase) {

+- if (lo > hi || lo > 0xFFFF)

+- return;

+- if (hi > 0xFFFF)

+- hi = 0xFFFF;

+- // We'll assemble a pattern assuming big endian.

+- // If the machine isn't, tell Cat to reverse its arguments.

+- bool oldreversed = reversed_;

+- if (!BigEndian()) {

+- reversed_ = !oldreversed;

+- }

+- // Split into bytes.

+- int lo1 = lo >> 8;

+- int lo2 = lo & 0xFF;

+- int hi1 = hi >> 8;

+- int hi2 = hi & 0xFF;

+- if (lo1 == hi1) {

+- // Easy case: high bits are same in both.

+- // Only do ASCII case folding on the second byte if the top byte is 00.

+- AddUCS2Pair(lo1, lo1, false, lo2, hi2, lo1==0 && foldcase);

+- } else {

+- // Harder case: different second byte ranges depending on first byte.

+- // Initial fragment.

+- if (lo2 > 0) {

+- AddUCS2Pair(lo1, lo1, false, lo2, 0xFF, lo1==0 && foldcase);

+- lo1++;

+- }

+- // Trailing fragment.

+- if (hi2 < 0xFF) {

+- AddUCS2Pair(hi1, hi1, false, 0, hi2, false);

+- hi1--;

+- }

+- // Inner ranges.

+- if (lo1 <= hi1) {

+- AddUCS2Pair(lo1, hi1, false, 0, 0xFF, false);

+- }

+- // Restore reverse setting.

+- reversed_ = oldreversed;

+- }

+ // Table describing how to make a UTF-8 matching machine

+ // for the rune range 80-10FFFF (Runeself-Runemax).

+ // This range happens frequently enough (for example /./ and /[^a-z]/)

+re2/compile.cc#17:707,716 - re2/compile.cc#18:627,634

+ Frag Compiler::Literal(Rune r, bool foldcase) {

+ switch (encoding_) {

+- default: // UCS-2 or something new

+- BeginRange();

+- AddRuneRange(r, r, foldcase);

+- return EndRange();

++ default:

++ return kNullFrag;

+ case kEncodingLatin1:

+ return ByteRange(r, r, foldcase);

+re2/compile.cc#17:927,934 - re2/compile.cc#18:845,850

+ if (re->parse_flags() & Regexp::Latin1)

+ c.encoding_ = kEncodingLatin1;

+- else if (re->parse_flags() & Regexp::UCS2)

+- c.encoding_ = kEncodingUCS2;

+ c.reversed_ = reversed;

+ if (max_mem <= 0) {

+ c.max_inst_ = 100000; // more than enough

+re2/compile.cc#17:983,993 - re2/compile.cc#18:899,905

+ c.prog_->set_start_unanchored(c.prog_->start());

+ } else {

+ Frag dot;

+- if (c.encoding_ == kEncodingUCS2) {

+- dot = c.Cat(c.ByteRange(0x00, 0xFF, false), c.ByteRange(0x00, 0xFF, false));

+- } else {

+- dot = c.ByteRange(0x00, 0xFF, false);

+- }

++ dot = c.ByteRange(0x00, 0xFF, false);

+ Frag dotloop = c.Star(dot, true);

+ Frag unanchored = c.Cat(dotloop, all);

+ c.prog_->set_start_unanchored(unanchored.begin);

+==== re2/nfa.cc#8 - re2/nfa.cc#9 ====

+re2/nfa.cc#8:426,432 - re2/nfa.cc#9:426,431

+ const char* bp = context.begin();

+ int c = -1;

+ int wasword = 0;

+- bool ucs2 = prog_->flags() & Regexp::UCS2;

+ if (text.begin() > context.begin()) {

+ c = text.begin()[-1] & 0xFF;

+re2/nfa.cc#8:492,498 - re2/nfa.cc#9:491,497

+ // If there's a required first byte for an unanchored search

+ // and we're not in the middle of any possible matches,

+ // use memchr to search for the byte quickly.

+- if (!ucs2 && !anchored && first_byte_ >= 0 && runq->size() == 0 &&

++ if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&

+ p < text.end() && (p[0] & 0xFF) != first_byte_) {

+ p = reinterpret_cast<const char*>(memchr(p, first_byte_,

+ text.end() - p));

+re2/nfa.cc#8:505,526 - re2/nfa.cc#9:504,514

+ flag = Prog::EmptyFlags(context, p);

+ }

+- // In UCS-2 mode, if we need to start a new thread,

+- // make sure to do it on an even boundary.

+- if(ucs2 && runq->size() == 0 &&

+- (p - context.begin()) % 2 && p < text.end()) {

+- p++;

+- flag = Prog::EmptyFlags(context, p);

+- }

+ // Steal match storage (cleared but unused as of yet)

+ // temporarily to hold match boundaries for new thread.

+- // In UCS-2 mode, only start the thread on a 2-byte boundary.

+- if(!ucs2 || (p - context.begin()) % 2 == 0) {

+- match_[0] = p;

+- AddToThreadq(runq, start_, flag, p, match_);

+- match_[0] = NULL;

+- }

++ match_[0] = p;

++ AddToThreadq(runq, start_, flag, p, match_);

++ match_[0] = NULL;

+ }

+ // If all the threads have died, stop early.

+==== re2/parse.cc#22 - re2/parse.cc#23 ====

+re2/parse.cc#22:160,167 - re2/parse.cc#23:160,165

+ status_(status), stacktop_(NULL), ncap_(0) {

+ if (flags_ & Latin1)

+ rune_max_ = 0xFF;

+- else if (flags & UCS2)

+- rune_max_ = 0xFFFF;

+ else

+ rune_max_ = Runemax;

+ }

+re2/parse.cc#22:365,387 - re2/parse.cc#23:363,374

+ bool Regexp::ParseState::PushCarat() {

+ if (flags_ & OneLine) {

+ return PushSimpleOp(kRegexpBeginText);

+- } else {

+- if (flags_ & UCS2) {

+- status_->set_code(kRegexpUnsupported);

+- status_->set_error_arg("multiline ^ in UCS-2 mode");

+- return false;

+- }

+- return PushSimpleOp(kRegexpBeginLine);

+ }

++ return PushSimpleOp(kRegexpBeginLine);

+ }

+ // Pushes a \b or \B onto the stack.

+ bool Regexp::ParseState::PushWordBoundary(bool word) {

+- if (flags_ & UCS2) {

+- status_->set_code(kRegexpUnsupported);

+- status_->set_error_arg("\\b or \\B in UCS-2 mode");

+- return false;

+- }

+ if (word)

+ return PushSimpleOp(kRegexpWordBoundary);

+ return PushSimpleOp(kRegexpNoWordBoundary);

+re2/parse.cc#22:397,407 - re2/parse.cc#23:384,389

+ bool ret = PushSimpleOp(kRegexpEndText);

+ flags_ = oflags;

+ return ret;

+- }

+- if (flags_ & UCS2) {

+- status_->set_code(kRegexpUnsupported);

+- status_->set_error_arg("multiline $ in UCS-2 mode");

+- return false;

+ }

+ return PushSimpleOp(kRegexpEndLine);

+ }

+==== re2/re2.cc#34 - re2/re2.cc#35 ====

+re2/re2.cc#34:79,86 - re2/re2.cc#35:79,84

+ return RE2::ErrorBadUTF8;

+ case re2::kRegexpBadNamedCapture:

+ return RE2::ErrorBadNamedCapture;

+- case re2::kRegexpUnsupported:

+- return RE2::ErrorUnsupported;

+ }

+ return RE2::ErrorInternal;

+ }

+re2/re2.cc#34:122,130 - re2/re2.cc#35:120,125

+ break;

+ case RE2::Options::EncodingLatin1:

+ flags |= Regexp::Latin1;

+- break;

+- case RE2::Options::EncodingUCS2:

+- flags |= Regexp::UCS2;

+ break;

+ }

+==== re2/re2.h#36 - re2/re2.h#37 ====

+re2/re2.h#36:246,252 - re2/re2.h#37:246,251

+ ErrorBadUTF8, // invalid UTF-8 in regexp

+ ErrorBadNamedCapture, // bad named capture group

+ ErrorPatternTooLarge, // pattern too large (compile failed)

+- ErrorUnsupported, // unsupported feature (in UCS-2 mode)

+ };

+ // Predefined common options.

+re2/re2.h#36:570,576 - re2/re2.h#37:569,574

+ enum Encoding {

+ EncodingUTF8 = 1,

+- EncodingUCS2, // 16-bit Unicode 0-FFFF only

+ EncodingLatin1

+ };

+==== re2/regexp.cc#15 - re2/regexp.cc#16 ====

+re2/regexp.cc#15:324,333 - re2/regexp.cc#16:324,329

+ // the regexp that remains after the prefix. The prefix might

+ // be ASCII case-insensitive.

+ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {

+- // Don't even bother for UCS-2; it's time to throw that code away.

+- if (parse_flags_ & UCS2)

+- return false;

+ // No need for a walker: the regexp must be of the form

+ // 1. some number of ^ anchors

+ // 2. a literal char or string

+==== re2/regexp.h#20 - re2/regexp.h#21 ====

+re2/regexp.h#20:187,193 - re2/regexp.h#21:187,192

+ kRegexpBadPerlOp, // bad perl operator

+ kRegexpBadUTF8, // invalid UTF-8 in regexp

+ kRegexpBadNamedCapture, // bad named capture

+- kRegexpUnsupported, // unsupported operator

+ };

+ // Error status for certain operations.

+re2/regexp.h#20:307,316 - re2/regexp.h#21:306,314

+ // \Q and \E to disable/enable metacharacters

+ // (?P<name>expr) for named captures

+ // \C to match any single byte

+- UCS2 = 1<<10, // Text is in UCS-2, regexp is in UTF-8.

+- UnicodeGroups = 1<<11, // Allow \p{Han} for Unicode Han group

++ UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group

+ // and \P{Han} for its negation.

+- NeverNL = 1<<12, // Never match NL, even if the regexp mentions

++ NeverNL = 1<<11, // Never match NL, even if the regexp mentions

+ // it explicitly.

+ // As close to Perl as we can get.

+==== re2/testing/backtrack.cc#4 - re2/testing/backtrack.cc#5 ====

+re2/testing/backtrack.cc#4:134,141 - re2/testing/backtrack.cc#5:134,139

+ cap_[0] = p;

+ if (Visit(prog_->start(), p)) // Match must be leftmost; done.

+ return true;

+- if (prog_->flags() & Regexp::UCS2)

+- p++;

+ }

+ return false;

+ }

+==== re2/testing/tester.cc#12 - re2/testing/tester.cc#13 ====

+re2/testing/tester.cc#12:144,154 - re2/testing/tester.cc#13:144,152

+ static ParseMode parse_modes[] = {

+ { single_line, "single-line" },

+ { single_line|Regexp::Latin1, "single-line, latin1" },

+- { single_line|Regexp::UCS2, "single-line, ucs2" },

+ { multi_line, "multiline" },

+ { multi_line|Regexp::NonGreedy, "multiline, nongreedy" },

+ { multi_line|Regexp::Latin1, "multiline, latin1" },

+- { multi_line|Regexp::UCS2, "multiline, ucs2" },

+ };

+ static string FormatMode(Regexp::ParseFlags flags) {

+re2/testing/tester.cc#12:179,189 - re2/testing/tester.cc#13:177,185

+ RegexpStatus status;

+ regexp_ = Regexp::Parse(regexp_str, flags, &status);

+ if (regexp_ == NULL) {

+- if (status.code() != kRegexpUnsupported) {

+- LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)

+- << " mode: " << FormatMode(flags);

+- error_ = true;

+- }

++ LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)

++ << " mode: " << FormatMode(flags);

++ error_ = true;

+ return;

+ }

+ prog_ = regexp_->CompileToProg(0);

+re2/testing/tester.cc#12:230,237 - re2/testing/tester.cc#13:226,231

+ RE2::Options options;

+ if (flags & Regexp::Latin1)

+ options.set_encoding(RE2::Options::EncodingLatin1);

+- else if (flags & Regexp::UCS2)

+- options.set_encoding(RE2::Options::EncodingUCS2);

+ if (kind_ == Prog::kLongestMatch)

+ options.set_longest_match(true);

+ re2_ = new RE2(re, options);

+re2/testing/tester.cc#12:281,379 - re2/testing/tester.cc#13:275,280

+ delete re2_;

+ }

+- // Converts UTF-8 string in text into UCS-2 string in new_text.

+- static bool ConvertUTF8ToUCS2(const StringPiece& text, StringPiece* new_text) {

+- const char* p = text.begin();

+- const char* ep = text.end();

+- uint16* q = new uint16[ep - p];

+- uint16* q0 = q;

+- int n;

+- Rune r;

+- for (; p < ep; p += n) {

+- if (!fullrune(p, ep - p)) {

+- delete[] q0;

+- return false;

+- }

+- n = chartorune(&r, p);

+- if (r > 0xFFFF) {

+- delete[] q0;

+- return false;

+- }

+- *q++ = r;

+- }

+- *new_text = StringPiece(reinterpret_cast<char*>(q0), 2*(q - q0));

+- return true;

+- }

+- // Rewrites *sp from being a pointer into text8 (UTF-8)

+- // to being a pointer into text16 (equivalent text but in UCS-2).

+- static void AdjustUTF8ToUCS2(const StringPiece& text8, const StringPiece& text16,

+- StringPiece *sp) {

+- if (sp->begin() == NULL && text8.begin() != NULL)

+- return;

+- int nrune = 0;

+- int n;

+- Rune r;

+- const char* p = text8.begin();

+- const char* ep = text8.end();

+- const char* spbegin = NULL;

+- const char* spend = NULL;

+- for (;;) {

+- if (p == sp->begin())

+- spbegin = text16.begin() + sizeof(uint16)*nrune;

+- if (p == sp->end())

+- spend = text16.begin() + sizeof(uint16)*nrune;

+- if (p >= ep)

+- break;

+- n = chartorune(&r, p);

+- p += n;

+- nrune++;

+- }

+- if (spbegin == NULL || spend == NULL) {

+- LOG(FATAL) << "Error in AdjustUTF8ToUCS2 "

+- << CEscape(text8) << " "

+- << (int)(sp->begin() - text8.begin()) << " "

+- << (int)(sp->end() - text8.begin());

+- }

+- *sp = StringPiece(spbegin, spend - spbegin);

+- }

+- // Rewrites *sp from being a pointer into text16 (UCS-2)

+- // to being a pointer into text8 (equivalent text but in UTF-8).

+- static void AdjustUCS2ToUTF8(const StringPiece& text16, const StringPiece& text8,

+- StringPiece* sp) {

+- if (sp->begin() == NULL)

+- return;

+- int nrune = 0;

+- int n;

+- Rune r;

+- const char* p = text8.begin();

+- const char* ep = text8.end();

+- const char* spbegin = NULL;

+- const char* spend = NULL;

+- for (;;) {

+- if (nrune == (sp->begin() - text16.begin())/2)

+- spbegin = p;

+- if (nrune == (sp->end() - text16.begin())/2)

+- spend = p;

+- if (p >= ep)

+- break;

+- n = chartorune(&r, p);

+- p += n;

+- nrune++;

+- }

+- if (text8.begin() != NULL && (spbegin == NULL || spend == NULL)) {

+- LOG(FATAL) << "Error in AdjustUCS2ToUTF8 "

+- << CEscape(text16) << " "

+- << (int)(sp->begin() - text16.begin()) << " "

+- << (int)(sp->end() - text16.begin());

+- }

+- *sp = StringPiece(spbegin, spend - spbegin);

+- }

+ // Runs a single search using the named engine type.

+ // This interface hides all the irregularities of the various

+ // engine interfaces from the rest of this file.

+re2/testing/tester.cc#12:393,411 - re2/testing/tester.cc#13:294,300

+ StringPiece text = orig_text;

+ StringPiece context = orig_context;

+- bool ucs2 = false;

+- if ((flags() & Regexp::UCS2) && type != kEnginePCRE) {

+- if (!ConvertUTF8ToUCS2(orig_context, &context)) {

+- result->skipped = true;

+- return;

+- }

+- // Rewrite context to refer to new text.

+- AdjustUTF8ToUCS2(orig_context, context, &text);

+- ucs2 = true;

+- }

+ switch (type) {

+ default:

+ LOG(FATAL) << "Bad RunSearch type: " << (int)type;

+re2/testing/tester.cc#12:557,577 - re2/testing/tester.cc#13:446,451

+ }

+- // If we did UCS-2 matching, rewrite the matches to refer

+- // to the original UTF-8 text.

+- if (ucs2) {

+- if (result->matched) {

+- if (result->have_submatch0) {

+- AdjustUCS2ToUTF8(context, orig_context, &result->submatch[0]);

+- } else if (result->have_submatch) {

+- for (int i = 0; i < nsubmatch; i++) {

+- AdjustUCS2ToUTF8(context, orig_context, &result->submatch[i]);

+- }

+- delete[] context.begin();

+- }

+ if (!result->matched)

+ memset(result->submatch, 0, sizeof result->submatch);

+ }

+re2/testing/tester.cc#12:596,617 - re2/testing/tester.cc#13:470,475

+ return true;

+ }

+- // Check whether text uses only Unicode points <= 0xFFFF

+- // (in the BMP).

+- static bool IsBMP(const StringPiece& text) {

+- const char* p = text.begin();

+- const char* ep = text.end();

+- while (p < ep) {

+- if (!fullrune(p, ep - p))

+- return false;

+- Rune r;

+- p += chartorune(&r, p);

+- if (r > 0xFFFF)

+- return false;

+- }

+- return true;

+- }

+ // Runs a single test.

+ bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,

+ Prog::Anchor anchor) {

+re2/testing/tester.cc#12:619,625 - re2/testing/tester.cc#13:477,483

+ Result correct;

+ RunSearch(kEngineBacktrack, text, context, anchor, &correct);

+ if (correct.skipped) {

+- if (regexp_ == NULL || !IsBMP(context)) // okay to skip in UCS-2 mode

++ if (regexp_ == NULL)

+ return true;

+ LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)

+ << " " << FormatMode(flags_);

« no previous file with comments | « third_party/re2/testinstall.cc ('k') | third_party/re2/util/arena.h » ('j') | no next file with comments »