| Index: third_party/re2/ucs2.diff
 | 
| diff --git a/third_party/re2/ucs2.diff b/third_party/re2/ucs2.diff
 | 
| new file mode 100644
 | 
| index 0000000000000000000000000000000000000000..57aec04a15cfd2fc93dd75468841076a7042fc97
 | 
| --- /dev/null
 | 
| +++ b/third_party/re2/ucs2.diff
 | 
| @@ -0,0 +1,567 @@
 | 
| +This is a dump from Google's source control system of the change
 | 
| +that removed UCS-2 support from RE2.  As the explanation below
 | 
| +says, UCS-2 mode is fundamentally at odds with things like ^ and $,
 | 
| +so it never really worked very well.  If you only need patterns that
 | 
| +avoid those operators, however, it did work.  It assumed that the
 | 
| +UCS-2 data was in the native host byte order.
 | 
| +
 | 
| +If you are interested in adding UCS-2 mode back, this patch might
 | 
| +be a good starting point.
 | 
| +
 | 
| +
 | 
| +Change 12780686 by rsc@rsc-re2 on 2009/09/16 15:30:15
 | 
| +
 | 
| +	Retire UCS-2 mode.
 | 
| +	
 | 
| +	I added it as an experiment for V8, but it
 | 
| +	requires 2-byte lookahead to do completely,
 | 
| +	and RE2 has 1-byte lookahead (enough for UTF-8)
 | 
| +	as a fairly deep fundamental assumption,
 | 
| +	so it did not support ^ or $.
 | 
| +
 | 
| +==== re2/bitstate.cc#2 - re2/bitstate.cc#3 ====
 | 
| +re2/bitstate.cc#2:314,321 - re2/bitstate.cc#3:314,319
 | 
| +      cap_[0] = p;
 | 
| +      if (TrySearch(prog_->start(), p))  // Match must be leftmost; done.
 | 
| +        return true;
 | 
| +-     if (prog_->flags() & Regexp::UCS2)
 | 
| +-       p++;
 | 
| +    }
 | 
| +    return false;
 | 
| +  }
 | 
| +==== re2/compile.cc#17 - re2/compile.cc#18 ====
 | 
| +re2/compile.cc#17:95,101 - re2/compile.cc#18:95,100
 | 
| +  // Input encodings.
 | 
| +  enum Encoding {
 | 
| +    kEncodingUTF8 = 1,  // UTF-8 (0-10FFFF)
 | 
| +-   kEncodingUCS2,     // UCS-2 (0-FFFF), native byte order
 | 
| +    kEncodingLatin1,    // Latin1 (0-FF)
 | 
| +  };
 | 
| +  
 | 
| +re2/compile.cc#17:168,176 - re2/compile.cc#18:167,172
 | 
| +    void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
 | 
| +    void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
 | 
| +    void Add_80_10ffff();
 | 
| +-   void AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase);
 | 
| +-   void AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1,
 | 
| +-                    uint8 lo2, uint8 hi2, bool fold2);
 | 
| +  
 | 
| +    // New suffix that matches the byte range lo-hi, then goes to next.
 | 
| +    Inst* RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, Inst* next);
 | 
| +re2/compile.cc#17:475,481 - re2/compile.cc#18:471,477
 | 
| +  
 | 
| +  // Converts rune range lo-hi into a fragment that recognizes
 | 
| +  // the bytes that would make up those runes in the current
 | 
| +- // encoding (Latin 1, UTF-8, or UCS-2).
 | 
| ++ // encoding (Latin 1 or UTF-8).
 | 
| +  // This lets the machine work byte-by-byte even when
 | 
| +  // using multibyte encodings.
 | 
| +  
 | 
| +re2/compile.cc#17:488,496 - re2/compile.cc#18:484,489
 | 
| +      case kEncodingLatin1:
 | 
| +        AddRuneRangeLatin1(lo, hi, foldcase);
 | 
| +        break;
 | 
| +-     case kEncodingUCS2:
 | 
| +-       AddRuneRangeUCS2(lo, hi, foldcase);
 | 
| +-       break;
 | 
| +    }
 | 
| +  }
 | 
| +  
 | 
| +re2/compile.cc#17:503,581 - re2/compile.cc#18:496,501
 | 
| +    AddSuffix(RuneByteSuffix(lo, hi, foldcase, NULL));
 | 
| +  }
 | 
| +  
 | 
| +- // Test whether 16-bit values are big or little endian.
 | 
| +- static bool BigEndian() {
 | 
| +-   union {
 | 
| +-     char byte[2];
 | 
| +-     int16 endian;
 | 
| +-   } u;
 | 
| +- 
 | 
| +-   u.byte[0] = 1;
 | 
| +-   u.byte[1] = 2;
 | 
| +-   return u.endian == 0x0102;
 | 
| +- }
 | 
| +- 
 | 
| +- void Compiler::AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1,
 | 
| +-                            uint8 lo2, uint8 hi2, bool fold2) {
 | 
| +-   Inst* ip;
 | 
| +-   if (reversed_) {
 | 
| +-     ip = RuneByteSuffix(lo1, hi1, fold1, NULL);
 | 
| +-     ip = RuneByteSuffix(lo2, hi2, fold2, ip);
 | 
| +-   } else {
 | 
| +-     ip = RuneByteSuffix(lo2, hi2, fold2, NULL);
 | 
| +-     ip = RuneByteSuffix(lo1, hi1, fold1, ip);
 | 
| +-   }
 | 
| +-   AddSuffix(ip);
 | 
| +- }
 | 
| +- 
 | 
| +- void Compiler::AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase) {
 | 
| +-   if (lo > hi || lo > 0xFFFF)
 | 
| +-     return;
 | 
| +-   if (hi > 0xFFFF)
 | 
| +-     hi = 0xFFFF;
 | 
| +- 
 | 
| +-   // We'll assemble a pattern assuming big endian.
 | 
| +-   // If the machine isn't, tell Cat to reverse its arguments.
 | 
| +-   bool oldreversed = reversed_;
 | 
| +-   if (!BigEndian()) {
 | 
| +-     reversed_ = !oldreversed;
 | 
| +-   }
 | 
| +- 
 | 
| +-   // Split into bytes.
 | 
| +-   int lo1 = lo >> 8;
 | 
| +-   int lo2 = lo & 0xFF;
 | 
| +-   int hi1 = hi >> 8;
 | 
| +-   int hi2 = hi & 0xFF;
 | 
| +- 
 | 
| +-   if (lo1 == hi1) {
 | 
| +-     // Easy case: high bits are same in both.
 | 
| +-     // Only do ASCII case folding on the second byte if the top byte is 00.
 | 
| +-     AddUCS2Pair(lo1, lo1, false, lo2, hi2, lo1==0 && foldcase);
 | 
| +-   } else {
 | 
| +-     // Harder case: different second byte ranges depending on first byte.
 | 
| +- 
 | 
| +-     // Initial fragment.
 | 
| +-     if (lo2 > 0) {
 | 
| +-       AddUCS2Pair(lo1, lo1, false, lo2, 0xFF, lo1==0 && foldcase);
 | 
| +-       lo1++;
 | 
| +-     }
 | 
| +- 
 | 
| +-     // Trailing fragment.
 | 
| +-     if (hi2 < 0xFF) {
 | 
| +-       AddUCS2Pair(hi1, hi1, false, 0, hi2, false);
 | 
| +-       hi1--;
 | 
| +-     }
 | 
| +- 
 | 
| +-     // Inner ranges.
 | 
| +-     if (lo1 <= hi1) {
 | 
| +-       AddUCS2Pair(lo1, hi1, false, 0, 0xFF, false);
 | 
| +-     }
 | 
| +-   }
 | 
| +- 
 | 
| +-   // Restore reverse setting.
 | 
| +-   reversed_ = oldreversed;
 | 
| +- }
 | 
| +- 
 | 
| +  // Table describing how to make a UTF-8 matching machine
 | 
| +  // for the rune range 80-10FFFF (Runeself-Runemax).
 | 
| +  // This range happens frequently enough (for example /./ and /[^a-z]/)
 | 
| +re2/compile.cc#17:707,716 - re2/compile.cc#18:627,634
 | 
| +  
 | 
| +  Frag Compiler::Literal(Rune r, bool foldcase) {
 | 
| +    switch (encoding_) {
 | 
| +-     default:  // UCS-2 or something new
 | 
| +-       BeginRange();
 | 
| +-       AddRuneRange(r, r, foldcase);
 | 
| +-       return EndRange();
 | 
| ++     default:
 | 
| ++       return kNullFrag;
 | 
| +  
 | 
| +      case kEncodingLatin1:
 | 
| +        return ByteRange(r, r, foldcase);
 | 
| +re2/compile.cc#17:927,934 - re2/compile.cc#18:845,850
 | 
| +  
 | 
| +    if (re->parse_flags() & Regexp::Latin1)
 | 
| +      c.encoding_ = kEncodingLatin1;
 | 
| +-   else if (re->parse_flags() & Regexp::UCS2)
 | 
| +-     c.encoding_ = kEncodingUCS2;
 | 
| +    c.reversed_ = reversed;
 | 
| +    if (max_mem <= 0) {
 | 
| +      c.max_inst_ = 100000;  // more than enough
 | 
| +re2/compile.cc#17:983,993 - re2/compile.cc#18:899,905
 | 
| +      c.prog_->set_start_unanchored(c.prog_->start());
 | 
| +    } else {
 | 
| +      Frag dot;
 | 
| +-     if (c.encoding_ == kEncodingUCS2) {
 | 
| +-       dot = c.Cat(c.ByteRange(0x00, 0xFF, false), c.ByteRange(0x00, 0xFF, false));
 | 
| +-     } else {
 | 
| +-       dot = c.ByteRange(0x00, 0xFF, false);
 | 
| +-     }
 | 
| ++     dot = c.ByteRange(0x00, 0xFF, false);
 | 
| +      Frag dotloop = c.Star(dot, true);
 | 
| +      Frag unanchored = c.Cat(dotloop, all);
 | 
| +      c.prog_->set_start_unanchored(unanchored.begin);
 | 
| +==== re2/nfa.cc#8 - re2/nfa.cc#9 ====
 | 
| +re2/nfa.cc#8:426,432 - re2/nfa.cc#9:426,431
 | 
| +    const char* bp = context.begin();
 | 
| +    int c = -1;
 | 
| +    int wasword = 0;
 | 
| +-   bool ucs2 = prog_->flags() & Regexp::UCS2;
 | 
| +  
 | 
| +    if (text.begin() > context.begin()) {
 | 
| +      c = text.begin()[-1] & 0xFF;
 | 
| +re2/nfa.cc#8:492,498 - re2/nfa.cc#9:491,497
 | 
| +        // If there's a required first byte for an unanchored search
 | 
| +        // and we're not in the middle of any possible matches,
 | 
| +        // use memchr to search for the byte quickly.
 | 
| +-       if (!ucs2 && !anchored && first_byte_ >= 0 && runq->size() == 0 &&
 | 
| ++       if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
 | 
| +            p < text.end() && (p[0] & 0xFF) != first_byte_) {
 | 
| +          p = reinterpret_cast<const char*>(memchr(p, first_byte_,
 | 
| +                                                   text.end() - p));
 | 
| +re2/nfa.cc#8:505,526 - re2/nfa.cc#9:504,514
 | 
| +          flag = Prog::EmptyFlags(context, p);
 | 
| +        }
 | 
| +  
 | 
| +-       // In UCS-2 mode, if we need to start a new thread,
 | 
| +-       // make sure to do it on an even boundary.
 | 
| +-       if(ucs2 && runq->size() == 0 &&
 | 
| +-           (p - context.begin()) % 2 && p < text.end()) {
 | 
| +-         p++;
 | 
| +-         flag = Prog::EmptyFlags(context, p);
 | 
| +-       }
 | 
| +- 
 | 
| +        // Steal match storage (cleared but unused as of yet)
 | 
| +        // temporarily to hold match boundaries for new thread.
 | 
| +-       // In UCS-2 mode, only start the thread on a 2-byte boundary.
 | 
| +-       if(!ucs2 || (p - context.begin()) % 2 == 0) {
 | 
| +-         match_[0] = p;
 | 
| +-         AddToThreadq(runq, start_, flag, p, match_);
 | 
| +-         match_[0] = NULL;
 | 
| +-       }
 | 
| ++       match_[0] = p;
 | 
| ++       AddToThreadq(runq, start_, flag, p, match_);
 | 
| ++       match_[0] = NULL;
 | 
| +      }
 | 
| +  
 | 
| +      // If all the threads have died, stop early.
 | 
| +==== re2/parse.cc#22 - re2/parse.cc#23 ====
 | 
| +re2/parse.cc#22:160,167 - re2/parse.cc#23:160,165
 | 
| +      status_(status), stacktop_(NULL), ncap_(0) {
 | 
| +    if (flags_ & Latin1)
 | 
| +      rune_max_ = 0xFF;
 | 
| +-   else if (flags & UCS2)
 | 
| +-     rune_max_ = 0xFFFF;
 | 
| +    else
 | 
| +      rune_max_ = Runemax;
 | 
| +  }
 | 
| +re2/parse.cc#22:365,387 - re2/parse.cc#23:363,374
 | 
| +  bool Regexp::ParseState::PushCarat() {
 | 
| +    if (flags_ & OneLine) {
 | 
| +      return PushSimpleOp(kRegexpBeginText);
 | 
| +-   } else {
 | 
| +-     if (flags_ & UCS2) {
 | 
| +-       status_->set_code(kRegexpUnsupported);
 | 
| +-       status_->set_error_arg("multiline ^ in UCS-2 mode");
 | 
| +-       return false;
 | 
| +-     }
 | 
| +-     return PushSimpleOp(kRegexpBeginLine);
 | 
| +    }
 | 
| ++   return PushSimpleOp(kRegexpBeginLine);
 | 
| +  }
 | 
| +  
 | 
| +  // Pushes a \b or \B onto the stack.
 | 
| +  bool Regexp::ParseState::PushWordBoundary(bool word) {
 | 
| +-   if (flags_ & UCS2) {
 | 
| +-     status_->set_code(kRegexpUnsupported);
 | 
| +-     status_->set_error_arg("\\b or \\B in UCS-2 mode");
 | 
| +-     return false;
 | 
| +-   }
 | 
| +    if (word)
 | 
| +      return PushSimpleOp(kRegexpWordBoundary);
 | 
| +    return PushSimpleOp(kRegexpNoWordBoundary);
 | 
| +re2/parse.cc#22:397,407 - re2/parse.cc#23:384,389
 | 
| +      bool ret = PushSimpleOp(kRegexpEndText);
 | 
| +      flags_ = oflags;
 | 
| +      return ret;
 | 
| +-   }
 | 
| +-   if (flags_ & UCS2) {
 | 
| +-     status_->set_code(kRegexpUnsupported);
 | 
| +-     status_->set_error_arg("multiline $ in UCS-2 mode");
 | 
| +-     return false;
 | 
| +    }
 | 
| +    return PushSimpleOp(kRegexpEndLine);
 | 
| +  }
 | 
| +==== re2/re2.cc#34 - re2/re2.cc#35 ====
 | 
| +re2/re2.cc#34:79,86 - re2/re2.cc#35:79,84
 | 
| +        return RE2::ErrorBadUTF8;
 | 
| +      case re2::kRegexpBadNamedCapture:
 | 
| +        return RE2::ErrorBadNamedCapture;
 | 
| +-     case re2::kRegexpUnsupported:
 | 
| +-       return RE2::ErrorUnsupported;
 | 
| +    }
 | 
| +    return RE2::ErrorInternal;
 | 
| +  }
 | 
| +re2/re2.cc#34:122,130 - re2/re2.cc#35:120,125
 | 
| +        break;
 | 
| +      case RE2::Options::EncodingLatin1:
 | 
| +        flags |= Regexp::Latin1;
 | 
| +-       break;
 | 
| +-     case RE2::Options::EncodingUCS2:
 | 
| +-       flags |= Regexp::UCS2;
 | 
| +        break;
 | 
| +    }
 | 
| +  
 | 
| +==== re2/re2.h#36 - re2/re2.h#37 ====
 | 
| +re2/re2.h#36:246,252 - re2/re2.h#37:246,251
 | 
| +      ErrorBadUTF8,            // invalid UTF-8 in regexp
 | 
| +      ErrorBadNamedCapture,    // bad named capture group
 | 
| +      ErrorPatternTooLarge,    // pattern too large (compile failed)
 | 
| +-     ErrorUnsupported,        // unsupported feature (in UCS-2 mode)
 | 
| +    };
 | 
| +  
 | 
| +    // Predefined common options.
 | 
| +re2/re2.h#36:570,576 - re2/re2.h#37:569,574
 | 
| +  
 | 
| +      enum Encoding {
 | 
| +        EncodingUTF8 = 1,
 | 
| +-       EncodingUCS2,      // 16-bit Unicode 0-FFFF only
 | 
| +        EncodingLatin1
 | 
| +      };
 | 
| +  
 | 
| +==== re2/regexp.cc#15 - re2/regexp.cc#16 ====
 | 
| +re2/regexp.cc#15:324,333 - re2/regexp.cc#16:324,329
 | 
| +  // the regexp that remains after the prefix.  The prefix might
 | 
| +  // be ASCII case-insensitive.
 | 
| +  bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
 | 
| +-   // Don't even bother for UCS-2; it's time to throw that code away.
 | 
| +-   if (parse_flags_ & UCS2)
 | 
| +-     return false;
 | 
| +- 
 | 
| +    // No need for a walker: the regexp must be of the form
 | 
| +    // 1. some number of ^ anchors
 | 
| +    // 2. a literal char or string
 | 
| +==== re2/regexp.h#20 - re2/regexp.h#21 ====
 | 
| +re2/regexp.h#20:187,193 - re2/regexp.h#21:187,192
 | 
| +    kRegexpBadPerlOp,          // bad perl operator
 | 
| +    kRegexpBadUTF8,            // invalid UTF-8 in regexp
 | 
| +    kRegexpBadNamedCapture,    // bad named capture
 | 
| +-   kRegexpUnsupported,        // unsupported operator
 | 
| +  };
 | 
| +  
 | 
| +  // Error status for certain operations.
 | 
| +re2/regexp.h#20:307,316 - re2/regexp.h#21:306,314
 | 
| +                             //   \Q and \E to disable/enable metacharacters
 | 
| +                             //   (?P<name>expr) for named captures
 | 
| +                             //   \C to match any single byte
 | 
| +-     UCS2         = 1<<10,  // Text is in UCS-2, regexp is in UTF-8.
 | 
| +-     UnicodeGroups = 1<<11, // Allow \p{Han} for Unicode Han group
 | 
| ++     UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
 | 
| +                             //   and \P{Han} for its negation.
 | 
| +-     NeverNL      = 1<<12,  // Never match NL, even if the regexp mentions
 | 
| ++     NeverNL      = 1<<11,  // Never match NL, even if the regexp mentions
 | 
| +                             //   it explicitly.
 | 
| +  
 | 
| +      // As close to Perl as we can get.
 | 
| +==== re2/testing/backtrack.cc#4 - re2/testing/backtrack.cc#5 ====
 | 
| +re2/testing/backtrack.cc#4:134,141 - re2/testing/backtrack.cc#5:134,139
 | 
| +      cap_[0] = p;
 | 
| +      if (Visit(prog_->start(), p))  // Match must be leftmost; done.
 | 
| +        return true;
 | 
| +-     if (prog_->flags() & Regexp::UCS2)
 | 
| +-       p++;
 | 
| +    }
 | 
| +    return false;
 | 
| +  }
 | 
| +==== re2/testing/tester.cc#12 - re2/testing/tester.cc#13 ====
 | 
| +re2/testing/tester.cc#12:144,154 - re2/testing/tester.cc#13:144,152
 | 
| +  static ParseMode parse_modes[] = {
 | 
| +    { single_line,                   "single-line"          },
 | 
| +    { single_line|Regexp::Latin1,    "single-line, latin1"  },
 | 
| +-   { single_line|Regexp::UCS2,     "single-line, ucs2"   },
 | 
| +    { multi_line,                    "multiline"            },
 | 
| +    { multi_line|Regexp::NonGreedy,  "multiline, nongreedy" },
 | 
| +    { multi_line|Regexp::Latin1,     "multiline, latin1"    },
 | 
| +-   { multi_line|Regexp::UCS2,      "multiline, ucs2"     },
 | 
| +  };
 | 
| +  
 | 
| +  static string FormatMode(Regexp::ParseFlags flags) {
 | 
| +re2/testing/tester.cc#12:179,189 - re2/testing/tester.cc#13:177,185
 | 
| +    RegexpStatus status;
 | 
| +    regexp_ = Regexp::Parse(regexp_str, flags, &status);
 | 
| +    if (regexp_ == NULL) {
 | 
| +-     if (status.code() != kRegexpUnsupported) {
 | 
| +-       LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
 | 
| +-                 << " mode: " << FormatMode(flags);
 | 
| +-       error_ = true;
 | 
| +-     }
 | 
| ++     LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
 | 
| ++               << " mode: " << FormatMode(flags);
 | 
| ++     error_ = true;
 | 
| +      return;
 | 
| +    }
 | 
| +    prog_ = regexp_->CompileToProg(0);
 | 
| +re2/testing/tester.cc#12:230,237 - re2/testing/tester.cc#13:226,231
 | 
| +      RE2::Options options;
 | 
| +      if (flags & Regexp::Latin1)
 | 
| +        options.set_encoding(RE2::Options::EncodingLatin1);
 | 
| +-     else if (flags & Regexp::UCS2)
 | 
| +-       options.set_encoding(RE2::Options::EncodingUCS2);
 | 
| +      if (kind_ == Prog::kLongestMatch)
 | 
| +        options.set_longest_match(true);
 | 
| +      re2_ = new RE2(re, options);
 | 
| +re2/testing/tester.cc#12:281,379 - re2/testing/tester.cc#13:275,280
 | 
| +      delete re2_;
 | 
| +  }
 | 
| +  
 | 
| +- // Converts UTF-8 string in text into UCS-2 string in new_text.
 | 
| +- static bool ConvertUTF8ToUCS2(const StringPiece& text, StringPiece* new_text) {
 | 
| +-   const char* p = text.begin();
 | 
| +-   const char* ep = text.end();
 | 
| +-   uint16* q = new uint16[ep - p];
 | 
| +-   uint16* q0 = q;
 | 
| +- 
 | 
| +-   int n;
 | 
| +-   Rune r;
 | 
| +-   for (; p < ep; p += n) {
 | 
| +-     if (!fullrune(p, ep - p)) {
 | 
| +-       delete[] q0;
 | 
| +-       return false;
 | 
| +-     }
 | 
| +-     n = chartorune(&r, p);
 | 
| +-     if (r > 0xFFFF) {
 | 
| +-       delete[] q0;
 | 
| +-       return false;
 | 
| +-     }
 | 
| +-     *q++ = r;
 | 
| +-   }
 | 
| +-   *new_text = StringPiece(reinterpret_cast<char*>(q0), 2*(q - q0));
 | 
| +-   return true;
 | 
| +- }
 | 
| +- 
 | 
| +- // Rewrites *sp from being a pointer into text8 (UTF-8)
 | 
| +- // to being a pointer into text16 (equivalent text but in UCS-2).
 | 
| +- static void AdjustUTF8ToUCS2(const StringPiece& text8, const StringPiece& text16,
 | 
| +-                               StringPiece *sp) {
 | 
| +-   if (sp->begin() == NULL && text8.begin() != NULL)
 | 
| +-     return;
 | 
| +- 
 | 
| +-   int nrune = 0;
 | 
| +-   int n;
 | 
| +-   Rune r;
 | 
| +-   const char* p = text8.begin();
 | 
| +-   const char* ep = text8.end();
 | 
| +-   const char* spbegin = NULL;
 | 
| +-   const char* spend = NULL;
 | 
| +-   for (;;) {
 | 
| +-     if (p == sp->begin())
 | 
| +-       spbegin = text16.begin() + sizeof(uint16)*nrune;
 | 
| +-     if (p == sp->end())
 | 
| +-       spend = text16.begin() + sizeof(uint16)*nrune;
 | 
| +-     if (p >= ep)
 | 
| +-       break;
 | 
| +-     n = chartorune(&r, p);
 | 
| +-     p += n;
 | 
| +-     nrune++;
 | 
| +-   }
 | 
| +-   if (spbegin == NULL || spend == NULL) {
 | 
| +-     LOG(FATAL) << "Error in AdjustUTF8ToUCS2 "
 | 
| +-                << CEscape(text8) << " "
 | 
| +-                << (int)(sp->begin() - text8.begin()) << " "
 | 
| +-                << (int)(sp->end() - text8.begin());
 | 
| +-   }
 | 
| +-   *sp = StringPiece(spbegin, spend - spbegin);
 | 
| +- }
 | 
| +- 
 | 
| +- // Rewrites *sp from begin a pointer into text16 (UCS-2)
 | 
| +- // to being a pointer into text8 (equivalent text but in UTF-8).
 | 
| +- static void AdjustUCS2ToUTF8(const StringPiece& text16, const StringPiece& text8,
 | 
| +-                               StringPiece* sp) {
 | 
| +-   if (sp->begin() == NULL)
 | 
| +-     return;
 | 
| +- 
 | 
| +-   int nrune = 0;
 | 
| +-   int n;
 | 
| +-   Rune r;
 | 
| +-   const char* p = text8.begin();
 | 
| +-   const char* ep = text8.end();
 | 
| +-   const char* spbegin = NULL;
 | 
| +-   const char* spend = NULL;
 | 
| +-   for (;;) {
 | 
| +-     if (nrune == (sp->begin() - text16.begin())/2)
 | 
| +-       spbegin = p;
 | 
| +-     if (nrune == (sp->end() - text16.begin())/2)
 | 
| +-       spend = p;
 | 
| +-     if (p >= ep)
 | 
| +-       break;
 | 
| +-     n = chartorune(&r, p);
 | 
| +-     p += n;
 | 
| +-     nrune++;
 | 
| +-   }
 | 
| +-   if (text8.begin() != NULL && (spbegin == NULL || spend == NULL)) {
 | 
| +-     LOG(FATAL) << "Error in AdjustUCS2ToUTF8 "
 | 
| +-                << CEscape(text16) << " "
 | 
| +-                << (int)(sp->begin() - text16.begin()) << " "
 | 
| +-                << (int)(sp->end() - text16.begin());
 | 
| +-   }
 | 
| +-   *sp = StringPiece(spbegin, spend - spbegin);
 | 
| +- }
 | 
| +- 
 | 
| +  // Runs a single search using the named engine type.
 | 
| +  // This interface hides all the irregularities of the various
 | 
| +  // engine interfaces from the rest of this file.
 | 
| +re2/testing/tester.cc#12:393,411 - re2/testing/tester.cc#13:294,300
 | 
| +  
 | 
| +    StringPiece text = orig_text;
 | 
| +    StringPiece context = orig_context;
 | 
| +-   bool ucs2 = false;
 | 
| +  
 | 
| +-   if ((flags() & Regexp::UCS2) && type != kEnginePCRE) {
 | 
| +-     if (!ConvertUTF8ToUCS2(orig_context, &context)) {
 | 
| +-       result->skipped = true;
 | 
| +-       return;
 | 
| +-     }
 | 
| +- 
 | 
| +-     // Rewrite context to refer to new text.
 | 
| +-     AdjustUTF8ToUCS2(orig_context, context, &text);
 | 
| +-     ucs2 = true;
 | 
| +-   }
 | 
| +- 
 | 
| +    switch (type) {
 | 
| +      default:
 | 
| +        LOG(FATAL) << "Bad RunSearch type: " << (int)type;
 | 
| +re2/testing/tester.cc#12:557,577 - re2/testing/tester.cc#13:446,451
 | 
| +      }
 | 
| +    }
 | 
| +  
 | 
| +-   // If we did UCS-2 matching, rewrite the matches to refer
 | 
| +-   // to the original UTF-8 text.
 | 
| +-   if (ucs2) {
 | 
| +-     if (result->matched) {
 | 
| +-       if (result->have_submatch0) {
 | 
| +-         AdjustUCS2ToUTF8(context, orig_context, &result->submatch[0]);
 | 
| +-       } else if (result->have_submatch) {
 | 
| +-         for (int i = 0; i < nsubmatch; i++) {
 | 
| +-           AdjustUCS2ToUTF8(context, orig_context, &result->submatch[i]);
 | 
| +-         }
 | 
| +-       }
 | 
| +-     }
 | 
| +-     delete[] context.begin();
 | 
| +-   }
 | 
| +- 
 | 
| +    if (!result->matched)
 | 
| +      memset(result->submatch, 0, sizeof result->submatch);
 | 
| +  }
 | 
| +re2/testing/tester.cc#12:596,617 - re2/testing/tester.cc#13:470,475
 | 
| +    return true;
 | 
| +  }
 | 
| +  
 | 
| +- // Check whether text uses only Unicode points <= 0xFFFF
 | 
| +- // (in the BMP).
 | 
| +- static bool IsBMP(const StringPiece& text) {
 | 
| +-   const char* p = text.begin();
 | 
| +-   const char* ep = text.end();
 | 
| +-   while (p < ep) {
 | 
| +-     if (!fullrune(p, ep - p))
 | 
| +-       return false;
 | 
| +-     Rune r;
 | 
| +-     p += chartorune(&r, p);
 | 
| +-     if (r > 0xFFFF)
 | 
| +-       return false;
 | 
| +-   }
 | 
| +-   return true;
 | 
| +- }
 | 
| +- 
 | 
| +  // Runs a single test.
 | 
| +  bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
 | 
| +                             Prog::Anchor anchor) {
 | 
| +re2/testing/tester.cc#12:619,625 - re2/testing/tester.cc#13:477,483
 | 
| +    Result correct;
 | 
| +    RunSearch(kEngineBacktrack, text, context, anchor, &correct);
 | 
| +    if (correct.skipped) {
 | 
| +-     if (regexp_ == NULL || !IsBMP(context))  // okay to skip in UCS-2 mode
 | 
| ++     if (regexp_ == NULL)
 | 
| +        return true;
 | 
| +      LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
 | 
| +                 << " " << FormatMode(flags_);
 | 
| 
 |