OLD | NEW |
(Empty) | |
| 1 // Copyright 2003-2009 Google Inc. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style |
| 3 // license that can be found in the LICENSE file. |
| 4 |
| 5 // This is a variant of PCRE's pcrecpp.cc, originally written at Google. |
| 6 // The main changes are the addition of the HitLimit method and |
| 7 // compilation as PCRE in namespace re2. |
| 8 |
| 9 #include <errno.h> |
| 10 #include "util/util.h" |
| 11 #include "util/flags.h" |
| 12 #include "util/pcre.h" |
| 13 |
| 14 #ifdef WIN32 |
| 15 #define strtoll _strtoi64 |
| 16 #define strtoull _strtoui64 |
| 17 #endif |
| 18 |
| 19 #define PCREPORT(level) LOG(level) |
| 20 |
| 21 // Default PCRE limits. |
| 22 // Defaults chosen to allow a plausible amount of CPU and |
| 23 // not exceed main thread stacks. Note that other threads |
| 24 // often have smaller stacks, and therefore tightening |
| 25 // regexp_stack_limit may frequently be necessary. |
| 26 DEFINE_int32(regexp_stack_limit, 256<<10, "default PCRE stack limit (bytes)"); |
| 27 DEFINE_int32(regexp_match_limit, 1000000, |
| 28 "default PCRE match limit (function calls)"); |
| 29 |
| 30 namespace re2 { |
| 31 |
| 32 // Maximum number of args we can set |
| 33 static const int kMaxArgs = 16; |
| 34 static const int kVecSize = (1 + kMaxArgs) * 3; // results + PCRE workspace |
| 35 |
| 36 // Approximate size of a recursive invocation of PCRE's |
| 37 // internal "match()" frame. This varies depending on the |
| 38 // compiler and architecture, of course, so the constant is |
| 39 // just a conservative estimate. To find the exact number, |
| 40 // run regexp_unittest with --regexp_stack_limit=0 under |
| 41 // a debugger and look at the frames when it crashes. |
| 42 // The exact frame size was 656 in production on 2008/02/03. |
| 43 static const int kPCREFrameSize = 700; |
| 44 |
| 45 // Special name for missing C++ arguments. |
| 46 PCRE::Arg PCRE::no_more_args((void*)NULL); |
| 47 |
| 48 const PCRE::PartialMatchFunctor PCRE::PartialMatch = { }; |
| 49 const PCRE::FullMatchFunctor PCRE::FullMatch = { } ; |
| 50 const PCRE::ConsumeFunctor PCRE::Consume = { }; |
| 51 const PCRE::FindAndConsumeFunctor PCRE::FindAndConsume = { }; |
| 52 |
| 53 // If a regular expression has no error, its error_ field points here |
| 54 static const string empty_string; |
| 55 |
| 56 void PCRE::Init(const char* pattern, Option options, int match_limit, |
| 57 int stack_limit, bool report_errors) { |
| 58 pattern_ = pattern; |
| 59 options_ = options; |
| 60 match_limit_ = match_limit; |
| 61 stack_limit_ = stack_limit; |
| 62 hit_limit_ = false; |
| 63 error_ = &empty_string; |
| 64 report_errors_ = report_errors; |
| 65 re_full_ = NULL; |
| 66 re_partial_ = NULL; |
| 67 |
| 68 if (options & ~(EnabledCompileOptions | EnabledExecOptions)) { |
| 69 error_ = new string("illegal regexp option"); |
| 70 PCREPORT(ERROR) |
| 71 << "Error compiling '" << pattern << "': illegal regexp option"; |
| 72 } else { |
| 73 re_partial_ = Compile(UNANCHORED); |
| 74 if (re_partial_ != NULL) { |
| 75 re_full_ = Compile(ANCHOR_BOTH); |
| 76 } |
| 77 } |
| 78 } |
| 79 |
| 80 PCRE::PCRE(const char* pattern) { |
| 81 Init(pattern, None, 0, 0, true); |
| 82 } |
| 83 PCRE::PCRE(const char* pattern, Option option) { |
| 84 Init(pattern, option, 0, 0, true); |
| 85 } |
| 86 PCRE::PCRE(const string& pattern) { |
| 87 Init(pattern.c_str(), None, 0, 0, true); |
| 88 } |
| 89 PCRE::PCRE(const string& pattern, Option option) { |
| 90 Init(pattern.c_str(), option, 0, 0, true); |
| 91 } |
| 92 PCRE::PCRE(const string& pattern, const PCRE_Options& re_option) { |
| 93 Init(pattern.c_str(), re_option.option(), re_option.match_limit(), |
| 94 re_option.stack_limit(), re_option.report_errors()); |
| 95 } |
| 96 |
| 97 PCRE::PCRE(const char *pattern, const PCRE_Options& re_option) { |
| 98 Init(pattern, re_option.option(), re_option.match_limit(), |
| 99 re_option.stack_limit(), re_option.report_errors()); |
| 100 } |
| 101 |
| 102 PCRE::~PCRE() { |
| 103 if (re_full_ != NULL) pcre_free(re_full_); |
| 104 if (re_partial_ != NULL) pcre_free(re_partial_); |
| 105 if (error_ != &empty_string) delete error_; |
| 106 } |
| 107 |
| 108 pcre* PCRE::Compile(Anchor anchor) { |
| 109 // Special treatment for anchoring. This is needed because at |
| 110 // runtime pcre only provides an option for anchoring at the |
| 111 // beginning of a string. |
| 112 // |
| 113 // There are three types of anchoring we want: |
| 114 // UNANCHORED Compile the original pattern, and use |
| 115 // a pcre unanchored match. |
| 116 // ANCHOR_START Compile the original pattern, and use |
| 117 // a pcre anchored match. |
| 118 // ANCHOR_BOTH Tack a "\z" to the end of the original pattern |
| 119 // and use a pcre anchored match. |
| 120 |
| 121 const char* error; |
| 122 int eoffset; |
| 123 pcre* re; |
| 124 if (anchor != ANCHOR_BOTH) { |
| 125 re = pcre_compile(pattern_.c_str(), |
| 126 (options_ & EnabledCompileOptions), |
| 127 &error, &eoffset, NULL); |
| 128 } else { |
| 129 // Tack a '\z' at the end of PCRE. Parenthesize it first so that |
| 130 // the '\z' applies to all top-level alternatives in the regexp. |
| 131 string wrapped = "(?:"; // A non-counting grouping operator |
| 132 wrapped += pattern_; |
| 133 wrapped += ")\\z"; |
| 134 re = pcre_compile(wrapped.c_str(), |
| 135 (options_ & EnabledCompileOptions), |
| 136 &error, &eoffset, NULL); |
| 137 } |
| 138 if (re == NULL) { |
| 139 if (error_ == &empty_string) error_ = new string(error); |
| 140 PCREPORT(ERROR) << "Error compiling '" << pattern_ << "': " << error; |
| 141 } |
| 142 return re; |
| 143 } |
| 144 |
| 145 /***** Convenience interfaces *****/ |
| 146 |
| 147 bool PCRE::FullMatchFunctor::operator ()(const StringPiece& text, |
| 148 const PCRE& re, |
| 149 const Arg& a0, |
| 150 const Arg& a1, |
| 151 const Arg& a2, |
| 152 const Arg& a3, |
| 153 const Arg& a4, |
| 154 const Arg& a5, |
| 155 const Arg& a6, |
| 156 const Arg& a7, |
| 157 const Arg& a8, |
| 158 const Arg& a9, |
| 159 const Arg& a10, |
| 160 const Arg& a11, |
| 161 const Arg& a12, |
| 162 const Arg& a13, |
| 163 const Arg& a14, |
| 164 const Arg& a15) const { |
| 165 const Arg* args[kMaxArgs]; |
| 166 int n = 0; |
| 167 if (&a0 == &no_more_args) goto done; args[n++] = &a0; |
| 168 if (&a1 == &no_more_args) goto done; args[n++] = &a1; |
| 169 if (&a2 == &no_more_args) goto done; args[n++] = &a2; |
| 170 if (&a3 == &no_more_args) goto done; args[n++] = &a3; |
| 171 if (&a4 == &no_more_args) goto done; args[n++] = &a4; |
| 172 if (&a5 == &no_more_args) goto done; args[n++] = &a5; |
| 173 if (&a6 == &no_more_args) goto done; args[n++] = &a6; |
| 174 if (&a7 == &no_more_args) goto done; args[n++] = &a7; |
| 175 if (&a8 == &no_more_args) goto done; args[n++] = &a8; |
| 176 if (&a9 == &no_more_args) goto done; args[n++] = &a9; |
| 177 if (&a10 == &no_more_args) goto done; args[n++] = &a10; |
| 178 if (&a11 == &no_more_args) goto done; args[n++] = &a11; |
| 179 if (&a12 == &no_more_args) goto done; args[n++] = &a12; |
| 180 if (&a13 == &no_more_args) goto done; args[n++] = &a13; |
| 181 if (&a14 == &no_more_args) goto done; args[n++] = &a14; |
| 182 if (&a15 == &no_more_args) goto done; args[n++] = &a15; |
| 183 done: |
| 184 |
| 185 int consumed; |
| 186 int vec[kVecSize]; |
| 187 return re.DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize); |
| 188 } |
| 189 |
| 190 bool PCRE::PartialMatchFunctor::operator ()(const StringPiece& text, |
| 191 const PCRE& re, |
| 192 const Arg& a0, |
| 193 const Arg& a1, |
| 194 const Arg& a2, |
| 195 const Arg& a3, |
| 196 const Arg& a4, |
| 197 const Arg& a5, |
| 198 const Arg& a6, |
| 199 const Arg& a7, |
| 200 const Arg& a8, |
| 201 const Arg& a9, |
| 202 const Arg& a10, |
| 203 const Arg& a11, |
| 204 const Arg& a12, |
| 205 const Arg& a13, |
| 206 const Arg& a14, |
| 207 const Arg& a15) const { |
| 208 const Arg* args[kMaxArgs]; |
| 209 int n = 0; |
| 210 if (&a0 == &no_more_args) goto done; args[n++] = &a0; |
| 211 if (&a1 == &no_more_args) goto done; args[n++] = &a1; |
| 212 if (&a2 == &no_more_args) goto done; args[n++] = &a2; |
| 213 if (&a3 == &no_more_args) goto done; args[n++] = &a3; |
| 214 if (&a4 == &no_more_args) goto done; args[n++] = &a4; |
| 215 if (&a5 == &no_more_args) goto done; args[n++] = &a5; |
| 216 if (&a6 == &no_more_args) goto done; args[n++] = &a6; |
| 217 if (&a7 == &no_more_args) goto done; args[n++] = &a7; |
| 218 if (&a8 == &no_more_args) goto done; args[n++] = &a8; |
| 219 if (&a9 == &no_more_args) goto done; args[n++] = &a9; |
| 220 if (&a10 == &no_more_args) goto done; args[n++] = &a10; |
| 221 if (&a11 == &no_more_args) goto done; args[n++] = &a11; |
| 222 if (&a12 == &no_more_args) goto done; args[n++] = &a12; |
| 223 if (&a13 == &no_more_args) goto done; args[n++] = &a13; |
| 224 if (&a14 == &no_more_args) goto done; args[n++] = &a14; |
| 225 if (&a15 == &no_more_args) goto done; args[n++] = &a15; |
| 226 done: |
| 227 |
| 228 int consumed; |
| 229 int vec[kVecSize]; |
| 230 return re.DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize); |
| 231 } |
| 232 |
| 233 bool PCRE::ConsumeFunctor::operator ()(StringPiece* input, |
| 234 const PCRE& pattern, |
| 235 const Arg& a0, |
| 236 const Arg& a1, |
| 237 const Arg& a2, |
| 238 const Arg& a3, |
| 239 const Arg& a4, |
| 240 const Arg& a5, |
| 241 const Arg& a6, |
| 242 const Arg& a7, |
| 243 const Arg& a8, |
| 244 const Arg& a9, |
| 245 const Arg& a10, |
| 246 const Arg& a11, |
| 247 const Arg& a12, |
| 248 const Arg& a13, |
| 249 const Arg& a14, |
| 250 const Arg& a15) const { |
| 251 const Arg* args[kMaxArgs]; |
| 252 int n = 0; |
| 253 if (&a0 == &no_more_args) goto done; args[n++] = &a0; |
| 254 if (&a1 == &no_more_args) goto done; args[n++] = &a1; |
| 255 if (&a2 == &no_more_args) goto done; args[n++] = &a2; |
| 256 if (&a3 == &no_more_args) goto done; args[n++] = &a3; |
| 257 if (&a4 == &no_more_args) goto done; args[n++] = &a4; |
| 258 if (&a5 == &no_more_args) goto done; args[n++] = &a5; |
| 259 if (&a6 == &no_more_args) goto done; args[n++] = &a6; |
| 260 if (&a7 == &no_more_args) goto done; args[n++] = &a7; |
| 261 if (&a8 == &no_more_args) goto done; args[n++] = &a8; |
| 262 if (&a9 == &no_more_args) goto done; args[n++] = &a9; |
| 263 if (&a10 == &no_more_args) goto done; args[n++] = &a10; |
| 264 if (&a11 == &no_more_args) goto done; args[n++] = &a11; |
| 265 if (&a12 == &no_more_args) goto done; args[n++] = &a12; |
| 266 if (&a13 == &no_more_args) goto done; args[n++] = &a13; |
| 267 if (&a14 == &no_more_args) goto done; args[n++] = &a14; |
| 268 if (&a15 == &no_more_args) goto done; args[n++] = &a15; |
| 269 done: |
| 270 |
| 271 int consumed; |
| 272 int vec[kVecSize]; |
| 273 if (pattern.DoMatchImpl(*input, ANCHOR_START, &consumed, |
| 274 args, n, vec, kVecSize)) { |
| 275 input->remove_prefix(consumed); |
| 276 return true; |
| 277 } else { |
| 278 return false; |
| 279 } |
| 280 } |
| 281 |
| 282 bool PCRE::FindAndConsumeFunctor::operator ()(StringPiece* input, |
| 283 const PCRE& pattern, |
| 284 const Arg& a0, |
| 285 const Arg& a1, |
| 286 const Arg& a2, |
| 287 const Arg& a3, |
| 288 const Arg& a4, |
| 289 const Arg& a5, |
| 290 const Arg& a6, |
| 291 const Arg& a7, |
| 292 const Arg& a8, |
| 293 const Arg& a9, |
| 294 const Arg& a10, |
| 295 const Arg& a11, |
| 296 const Arg& a12, |
| 297 const Arg& a13, |
| 298 const Arg& a14, |
| 299 const Arg& a15) const { |
| 300 const Arg* args[kMaxArgs]; |
| 301 int n = 0; |
| 302 if (&a0 == &no_more_args) goto done; args[n++] = &a0; |
| 303 if (&a1 == &no_more_args) goto done; args[n++] = &a1; |
| 304 if (&a2 == &no_more_args) goto done; args[n++] = &a2; |
| 305 if (&a3 == &no_more_args) goto done; args[n++] = &a3; |
| 306 if (&a4 == &no_more_args) goto done; args[n++] = &a4; |
| 307 if (&a5 == &no_more_args) goto done; args[n++] = &a5; |
| 308 if (&a6 == &no_more_args) goto done; args[n++] = &a6; |
| 309 if (&a7 == &no_more_args) goto done; args[n++] = &a7; |
| 310 if (&a8 == &no_more_args) goto done; args[n++] = &a8; |
| 311 if (&a9 == &no_more_args) goto done; args[n++] = &a9; |
| 312 if (&a10 == &no_more_args) goto done; args[n++] = &a10; |
| 313 if (&a11 == &no_more_args) goto done; args[n++] = &a11; |
| 314 if (&a12 == &no_more_args) goto done; args[n++] = &a12; |
| 315 if (&a13 == &no_more_args) goto done; args[n++] = &a13; |
| 316 if (&a14 == &no_more_args) goto done; args[n++] = &a14; |
| 317 if (&a15 == &no_more_args) goto done; args[n++] = &a15; |
| 318 done: |
| 319 |
| 320 int consumed; |
| 321 int vec[kVecSize]; |
| 322 if (pattern.DoMatchImpl(*input, UNANCHORED, &consumed, |
| 323 args, n, vec, kVecSize)) { |
| 324 input->remove_prefix(consumed); |
| 325 return true; |
| 326 } else { |
| 327 return false; |
| 328 } |
| 329 } |
| 330 |
| 331 bool PCRE::Replace(string *str, |
| 332 const PCRE& pattern, |
| 333 const StringPiece& rewrite) { |
| 334 int vec[kVecSize]; |
| 335 int matches = pattern.TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize); |
| 336 if (matches == 0) |
| 337 return false; |
| 338 |
| 339 string s; |
| 340 if (!pattern.Rewrite(&s, rewrite, *str, vec, matches)) |
| 341 return false; |
| 342 |
| 343 assert(vec[0] >= 0); |
| 344 assert(vec[1] >= 0); |
| 345 str->replace(vec[0], vec[1] - vec[0], s); |
| 346 return true; |
| 347 } |
| 348 |
| 349 int PCRE::GlobalReplace(string *str, |
| 350 const PCRE& pattern, |
| 351 const StringPiece& rewrite) { |
| 352 int count = 0; |
| 353 int vec[kVecSize]; |
| 354 string out; |
| 355 int start = 0; |
| 356 bool last_match_was_empty_string = false; |
| 357 |
| 358 for (; start <= str->length();) { |
| 359 // If the previous match was for the empty string, we shouldn't |
| 360 // just match again: we'll match in the same way and get an |
| 361 // infinite loop. Instead, we do the match in a special way: |
| 362 // anchored -- to force another try at the same position -- |
| 363 // and with a flag saying that this time, ignore empty matches. |
| 364 // If this special match returns, that means there's a non-empty |
| 365 // match at this position as well, and we can continue. If not, |
| 366 // we do what perl does, and just advance by one. |
| 367 // Notice that perl prints '@@@' for this; |
| 368 // perl -le '$_ = "aa"; s/b*|aa/@/g; print' |
| 369 int matches; |
| 370 if (last_match_was_empty_string) { |
| 371 matches = pattern.TryMatch(*str, start, ANCHOR_START, false, |
| 372 vec, kVecSize); |
| 373 if (matches <= 0) { |
| 374 if (start < str->length()) |
| 375 out.push_back((*str)[start]); |
| 376 start++; |
| 377 last_match_was_empty_string = false; |
| 378 continue; |
| 379 } |
| 380 } else { |
| 381 matches = pattern.TryMatch(*str, start, UNANCHORED, true, vec, kVecSize); |
| 382 if (matches <= 0) |
| 383 break; |
| 384 } |
| 385 int matchstart = vec[0], matchend = vec[1]; |
| 386 assert(matchstart >= start); |
| 387 assert(matchend >= matchstart); |
| 388 |
| 389 out.append(*str, start, matchstart - start); |
| 390 pattern.Rewrite(&out, rewrite, *str, vec, matches); |
| 391 start = matchend; |
| 392 count++; |
| 393 last_match_was_empty_string = (matchstart == matchend); |
| 394 } |
| 395 |
| 396 if (count == 0) |
| 397 return 0; |
| 398 |
| 399 if (start < str->length()) |
| 400 out.append(*str, start, str->length() - start); |
| 401 swap(out, *str); |
| 402 return count; |
| 403 } |
| 404 |
| 405 bool PCRE::Extract(const StringPiece &text, |
| 406 const PCRE& pattern, |
| 407 const StringPiece &rewrite, |
| 408 string *out) { |
| 409 int vec[kVecSize]; |
| 410 int matches = pattern.TryMatch(text, 0, UNANCHORED, true, vec, kVecSize); |
| 411 if (matches == 0) |
| 412 return false; |
| 413 out->clear(); |
| 414 return pattern.Rewrite(out, rewrite, text, vec, matches); |
| 415 } |
| 416 |
| 417 string PCRE::QuoteMeta(const StringPiece& unquoted) { |
| 418 string result; |
| 419 result.reserve(unquoted.size() << 1); |
| 420 |
| 421 // Escape any ascii character not in [A-Za-z_0-9]. |
| 422 // |
| 423 // Note that it's legal to escape a character even if it has no |
| 424 // special meaning in a regular expression -- so this function does |
| 425 // that. (This also makes it identical to the perl function of the |
| 426 // same name except for the null-character special case; |
| 427 // see `perldoc -f quotemeta`.) |
| 428 for (int ii = 0; ii < unquoted.length(); ++ii) { |
| 429 // Note that using 'isalnum' here raises the benchmark time from |
| 430 // 32ns to 58ns: |
| 431 if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && |
| 432 (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && |
| 433 (unquoted[ii] < '0' || unquoted[ii] > '9') && |
| 434 unquoted[ii] != '_' && |
| 435 // If this is the part of a UTF8 or Latin1 character, we need |
| 436 // to copy this byte without escaping. Experimentally this is |
| 437 // what works correctly with the regexp library. |
| 438 !(unquoted[ii] & 128)) { |
| 439 if (unquoted[ii] == '\0') { // Special handling for null chars. |
| 440 // Can't use "\\0" since the next character might be a digit. |
| 441 result += "\\x00"; |
| 442 continue; |
| 443 } |
| 444 result += '\\'; |
| 445 } |
| 446 result += unquoted[ii]; |
| 447 } |
| 448 |
| 449 return result; |
| 450 } |
| 451 |
| 452 /***** Actual matching and rewriting code *****/ |
| 453 |
| 454 bool PCRE::HitLimit() { |
| 455 return hit_limit_; |
| 456 } |
| 457 |
| 458 void PCRE::ClearHitLimit() { |
| 459 hit_limit_ = 0; |
| 460 } |
| 461 |
| 462 int PCRE::TryMatch(const StringPiece& text, |
| 463 int startpos, |
| 464 Anchor anchor, |
| 465 bool empty_ok, |
| 466 int *vec, |
| 467 int vecsize) const { |
| 468 pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_; |
| 469 if (re == NULL) { |
| 470 PCREPORT(ERROR) << "Matching against invalid re: " << *error_; |
| 471 return 0; |
| 472 } |
| 473 |
| 474 int match_limit = match_limit_; |
| 475 if (match_limit <= 0) { |
| 476 match_limit = FLAGS_regexp_match_limit; |
| 477 } |
| 478 |
| 479 int stack_limit = stack_limit_; |
| 480 if (stack_limit <= 0) { |
| 481 stack_limit = FLAGS_regexp_stack_limit; |
| 482 } |
| 483 |
| 484 pcre_extra extra = { 0 }; |
| 485 if (match_limit > 0) { |
| 486 extra.flags |= PCRE_EXTRA_MATCH_LIMIT; |
| 487 extra.match_limit = match_limit; |
| 488 } |
| 489 if (stack_limit > 0) { |
| 490 extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION; |
| 491 extra.match_limit_recursion = stack_limit / kPCREFrameSize; |
| 492 } |
| 493 |
| 494 int options = 0; |
| 495 if (anchor != UNANCHORED) |
| 496 options |= PCRE_ANCHORED; |
| 497 if (!empty_ok) |
| 498 options |= PCRE_NOTEMPTY; |
| 499 |
| 500 int rc = pcre_exec(re, // The regular expression object |
| 501 &extra, |
| 502 (text.data() == NULL) ? "" : text.data(), |
| 503 text.size(), |
| 504 startpos, |
| 505 options, |
| 506 vec, |
| 507 vecsize); |
| 508 |
| 509 // Handle errors |
| 510 if (rc == 0) { |
| 511 // pcre_exec() returns 0 as a special case when the number of |
| 512 // capturing subpatterns exceeds the size of the vector. |
| 513 // When this happens, there is a match and the output vector |
| 514 // is filled, but we miss out on the positions of the extra subpatterns. |
| 515 rc = vecsize / 2; |
| 516 } else if (rc < 0) { |
| 517 switch (rc) { |
| 518 case PCRE_ERROR_NOMATCH: |
| 519 return 0; |
| 520 case PCRE_ERROR_MATCHLIMIT: |
| 521 // Writing to hit_limit is not safe if multiple threads |
| 522 // are using the PCRE, but the flag is only intended |
| 523 // for use by unit tests anyway, so we let it go. |
| 524 hit_limit_ = true; |
| 525 PCREPORT(WARNING) << "Exceeded match limit of " << match_limit |
| 526 << " when matching '" << pattern_ << "'" |
| 527 << " against text that is " << text.size() << " bytes."; |
| 528 return 0; |
| 529 case PCRE_ERROR_RECURSIONLIMIT: |
| 530 // See comment about hit_limit above. |
| 531 hit_limit_ = true; |
| 532 PCREPORT(WARNING) << "Exceeded stack limit of " << stack_limit |
| 533 << " when matching '" << pattern_ << "'" |
| 534 << " against text that is " << text.size() << " bytes."; |
| 535 return 0; |
| 536 default: |
| 537 // There are other return codes from pcre.h : |
| 538 // PCRE_ERROR_NULL (-2) |
| 539 // PCRE_ERROR_BADOPTION (-3) |
| 540 // PCRE_ERROR_BADMAGIC (-4) |
| 541 // PCRE_ERROR_UNKNOWN_NODE (-5) |
| 542 // PCRE_ERROR_NOMEMORY (-6) |
| 543 // PCRE_ERROR_NOSUBSTRING (-7) |
| 544 // ... |
| 545 PCREPORT(ERROR) << "Unexpected return code: " << rc |
| 546 << " when matching '" << pattern_ << "'" |
| 547 << ", re=" << re |
| 548 << ", text=" << text |
| 549 << ", vec=" << vec |
| 550 << ", vecsize=" << vecsize; |
| 551 return 0; |
| 552 } |
| 553 } |
| 554 |
| 555 return rc; |
| 556 } |
| 557 |
| 558 bool PCRE::DoMatchImpl(const StringPiece& text, |
| 559 Anchor anchor, |
| 560 int* consumed, |
| 561 const Arg* const* args, |
| 562 int n, |
| 563 int* vec, |
| 564 int vecsize) const { |
| 565 assert((1 + n) * 3 <= vecsize); // results + PCRE workspace |
| 566 int matches = TryMatch(text, 0, anchor, true, vec, vecsize); |
| 567 assert(matches >= 0); // TryMatch never returns negatives |
| 568 if (matches == 0) |
| 569 return false; |
| 570 |
| 571 *consumed = vec[1]; |
| 572 |
| 573 if (n == 0 || args == NULL) { |
| 574 // We are not interested in results |
| 575 return true; |
| 576 } |
| 577 if (NumberOfCapturingGroups() < n) { |
| 578 // PCRE has fewer capturing groups than number of arg pointers passed in |
| 579 return false; |
| 580 } |
| 581 |
| 582 // If we got here, we must have matched the whole pattern. |
| 583 // We do not need (can not do) any more checks on the value of 'matches' here |
| 584 // -- see the comment for TryMatch. |
| 585 for (int i = 0; i < n; i++) { |
| 586 const int start = vec[2*(i+1)]; |
| 587 const int limit = vec[2*(i+1)+1]; |
| 588 if (!args[i]->Parse(text.data() + start, limit-start)) { |
| 589 // TODO: Should we indicate what the error was? |
| 590 return false; |
| 591 } |
| 592 } |
| 593 |
| 594 return true; |
| 595 } |
| 596 |
| 597 bool PCRE::DoMatch(const StringPiece& text, |
| 598 Anchor anchor, |
| 599 int* consumed, |
| 600 const Arg* const args[], |
| 601 int n) const { |
| 602 assert(n >= 0); |
| 603 size_t const vecsize = (1 + n) * 3; // results + PCRE workspace |
| 604 // (as for kVecSize) |
| 605 int *vec = new int[vecsize]; |
| 606 bool b = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize); |
| 607 delete[] vec; |
| 608 return b; |
| 609 } |
| 610 |
| 611 bool PCRE::Rewrite(string *out, const StringPiece &rewrite, |
| 612 const StringPiece &text, int *vec, int veclen) const { |
| 613 int number_of_capturing_groups = NumberOfCapturingGroups(); |
| 614 for (const char *s = rewrite.data(), *end = s + rewrite.size(); |
| 615 s < end; s++) { |
| 616 int c = *s; |
| 617 if (c == '\\') { |
| 618 c = *++s; |
| 619 if (isdigit(c)) { |
| 620 int n = (c - '0'); |
| 621 if (n >= veclen) { |
| 622 if (n <= number_of_capturing_groups) { |
| 623 // unmatched optional capturing group. treat |
| 624 // its value as empty string; i.e., nothing to append. |
| 625 } else { |
| 626 PCREPORT(ERROR) << "requested group " << n |
| 627 << " in regexp " << rewrite.data(); |
| 628 return false; |
| 629 } |
| 630 } |
| 631 int start = vec[2 * n]; |
| 632 if (start >= 0) |
| 633 out->append(text.data() + start, vec[2 * n + 1] - start); |
| 634 } else if (c == '\\') { |
| 635 out->push_back('\\'); |
| 636 } else { |
| 637 PCREPORT(ERROR) << "invalid rewrite pattern: " << rewrite.data(); |
| 638 return false; |
| 639 } |
| 640 } else { |
| 641 out->push_back(c); |
| 642 } |
| 643 } |
| 644 return true; |
| 645 } |
| 646 |
| 647 bool PCRE::CheckRewriteString(const StringPiece& rewrite, string* error) const { |
| 648 int max_token = -1; |
| 649 for (const char *s = rewrite.data(), *end = s + rewrite.size(); |
| 650 s < end; s++) { |
| 651 int c = *s; |
| 652 if (c != '\\') { |
| 653 continue; |
| 654 } |
| 655 if (++s == end) { |
| 656 *error = "Rewrite schema error: '\\' not allowed at end."; |
| 657 return false; |
| 658 } |
| 659 c = *s; |
| 660 if (c == '\\') { |
| 661 continue; |
| 662 } |
| 663 if (!isdigit(c)) { |
| 664 *error = "Rewrite schema error: " |
| 665 "'\\' must be followed by a digit or '\\'."; |
| 666 return false; |
| 667 } |
| 668 int n = (c - '0'); |
| 669 if (max_token < n) { |
| 670 max_token = n; |
| 671 } |
| 672 } |
| 673 |
| 674 if (max_token > NumberOfCapturingGroups()) { |
| 675 SStringPrintf(error, "Rewrite schema requests %d matches, " |
| 676 "but the regexp only has %d parenthesized subexpressions.", |
| 677 max_token, NumberOfCapturingGroups()); |
| 678 return false; |
| 679 } |
| 680 return true; |
| 681 } |
| 682 |
| 683 |
| 684 // Return the number of capturing subpatterns, or -1 if the |
| 685 // regexp wasn't valid on construction. |
| 686 int PCRE::NumberOfCapturingGroups() const { |
| 687 if (re_partial_ == NULL) return -1; |
| 688 |
| 689 int result; |
| 690 CHECK(pcre_fullinfo(re_partial_, // The regular expression object |
| 691 NULL, // We did not study the pattern |
| 692 PCRE_INFO_CAPTURECOUNT, |
| 693 &result) == 0); |
| 694 return result; |
| 695 } |
| 696 |
| 697 |
| 698 /***** Parsers for various types *****/ |
| 699 |
| 700 bool PCRE::Arg::parse_null(const char* str, int n, void* dest) { |
| 701 // We fail if somebody asked us to store into a non-NULL void* pointer |
| 702 return (dest == NULL); |
| 703 } |
| 704 |
| 705 bool PCRE::Arg::parse_string(const char* str, int n, void* dest) { |
| 706 if (dest == NULL) return true; |
| 707 reinterpret_cast<string*>(dest)->assign(str, n); |
| 708 return true; |
| 709 } |
| 710 |
| 711 bool PCRE::Arg::parse_stringpiece(const char* str, int n, void* dest) { |
| 712 if (dest == NULL) return true; |
| 713 reinterpret_cast<StringPiece*>(dest)->set(str, n); |
| 714 return true; |
| 715 } |
| 716 |
| 717 bool PCRE::Arg::parse_char(const char* str, int n, void* dest) { |
| 718 if (n != 1) return false; |
| 719 if (dest == NULL) return true; |
| 720 *(reinterpret_cast<char*>(dest)) = str[0]; |
| 721 return true; |
| 722 } |
| 723 |
| 724 bool PCRE::Arg::parse_uchar(const char* str, int n, void* dest) { |
| 725 if (n != 1) return false; |
| 726 if (dest == NULL) return true; |
| 727 *(reinterpret_cast<unsigned char*>(dest)) = str[0]; |
| 728 return true; |
| 729 } |
| 730 |
| 731 // Largest number spec that we are willing to parse |
| 732 static const int kMaxNumberLength = 32; |
| 733 |
| 734 // PCREQUIPCRES "buf" must have length at least kMaxNumberLength+1 |
| 735 // PCREQUIPCRES "n > 0" |
| 736 // Copies "str" into "buf" and null-terminates if necessary. |
| 737 // Returns one of: |
| 738 // a. "str" if no termination is needed |
| 739 // b. "buf" if the string was copied and null-terminated |
| 740 // c. "" if the input was invalid and has no hope of being parsed |
| 741 static const char* TerminateNumber(char* buf, const char* str, int n) { |
| 742 if ((n > 0) && isspace(*str)) { |
| 743 // We are less forgiving than the strtoxxx() routines and do not |
| 744 // allow leading spaces. |
| 745 return ""; |
| 746 } |
| 747 |
| 748 // See if the character right after the input text may potentially |
| 749 // look like a digit. |
| 750 if (isdigit(str[n]) || |
| 751 ((str[n] >= 'a') && (str[n] <= 'f')) || |
| 752 ((str[n] >= 'A') && (str[n] <= 'F'))) { |
| 753 if (n > kMaxNumberLength) return ""; // Input too big to be a valid number |
| 754 memcpy(buf, str, n); |
| 755 buf[n] = '\0'; |
| 756 return buf; |
| 757 } else { |
| 758 // We can parse right out of the supplied string, so return it. |
| 759 return str; |
| 760 } |
| 761 } |
| 762 |
| 763 bool PCRE::Arg::parse_long_radix(const char* str, |
| 764 int n, |
| 765 void* dest, |
| 766 int radix) { |
| 767 if (n == 0) return false; |
| 768 char buf[kMaxNumberLength+1]; |
| 769 str = TerminateNumber(buf, str, n); |
| 770 char* end; |
| 771 errno = 0; |
| 772 long r = strtol(str, &end, radix); |
| 773 if (end != str + n) return false; // Leftover junk |
| 774 if (errno) return false; |
| 775 if (dest == NULL) return true; |
| 776 *(reinterpret_cast<long*>(dest)) = r; |
| 777 return true; |
| 778 } |
| 779 |
| 780 bool PCRE::Arg::parse_ulong_radix(const char* str, |
| 781 int n, |
| 782 void* dest, |
| 783 int radix) { |
| 784 if (n == 0) return false; |
| 785 char buf[kMaxNumberLength+1]; |
| 786 str = TerminateNumber(buf, str, n); |
| 787 if (str[0] == '-') { |
| 788 // strtoul() will silently accept negative numbers and parse |
| 789 // them. This module is more strict and treats them as errors. |
| 790 return false; |
| 791 } |
| 792 |
| 793 char* end; |
| 794 errno = 0; |
| 795 unsigned long r = strtoul(str, &end, radix); |
| 796 if (end != str + n) return false; // Leftover junk |
| 797 if (errno) return false; |
| 798 if (dest == NULL) return true; |
| 799 *(reinterpret_cast<unsigned long*>(dest)) = r; |
| 800 return true; |
| 801 } |
| 802 |
| 803 bool PCRE::Arg::parse_short_radix(const char* str, |
| 804 int n, |
| 805 void* dest, |
| 806 int radix) { |
| 807 long r; |
| 808 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse |
| 809 if ((short)r != r) return false; // Out of range |
| 810 if (dest == NULL) return true; |
| 811 *(reinterpret_cast<short*>(dest)) = r; |
| 812 return true; |
| 813 } |
| 814 |
| 815 bool PCRE::Arg::parse_ushort_radix(const char* str, |
| 816 int n, |
| 817 void* dest, |
| 818 int radix) { |
| 819 unsigned long r; |
| 820 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse |
| 821 if ((ushort)r != r) return false; // Out of range |
| 822 if (dest == NULL) return true; |
| 823 *(reinterpret_cast<unsigned short*>(dest)) = r; |
| 824 return true; |
| 825 } |
| 826 |
| 827 bool PCRE::Arg::parse_int_radix(const char* str, |
| 828 int n, |
| 829 void* dest, |
| 830 int radix) { |
| 831 long r; |
| 832 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse |
| 833 if ((int)r != r) return false; // Out of range |
| 834 if (dest == NULL) return true; |
| 835 *(reinterpret_cast<int*>(dest)) = r; |
| 836 return true; |
| 837 } |
| 838 |
| 839 bool PCRE::Arg::parse_uint_radix(const char* str, |
| 840 int n, |
| 841 void* dest, |
| 842 int radix) { |
| 843 unsigned long r; |
| 844 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse |
| 845 if ((uint)r != r) return false; // Out of range |
| 846 if (dest == NULL) return true; |
| 847 *(reinterpret_cast<unsigned int*>(dest)) = r; |
| 848 return true; |
| 849 } |
| 850 |
| 851 bool PCRE::Arg::parse_longlong_radix(const char* str, |
| 852 int n, |
| 853 void* dest, |
| 854 int radix) { |
| 855 if (n == 0) return false; |
| 856 char buf[kMaxNumberLength+1]; |
| 857 str = TerminateNumber(buf, str, n); |
| 858 char* end; |
| 859 errno = 0; |
| 860 int64 r = strtoll(str, &end, radix); |
| 861 if (end != str + n) return false; // Leftover junk |
| 862 if (errno) return false; |
| 863 if (dest == NULL) return true; |
| 864 *(reinterpret_cast<int64*>(dest)) = r; |
| 865 return true; |
| 866 } |
| 867 |
| 868 bool PCRE::Arg::parse_ulonglong_radix(const char* str, |
| 869 int n, |
| 870 void* dest, |
| 871 int radix) { |
| 872 if (n == 0) return false; |
| 873 char buf[kMaxNumberLength+1]; |
| 874 str = TerminateNumber(buf, str, n); |
| 875 if (str[0] == '-') { |
| 876 // strtoull() will silently accept negative numbers and parse |
| 877 // them. This module is more strict and treats them as errors. |
| 878 return false; |
| 879 } |
| 880 char* end; |
| 881 errno = 0; |
| 882 uint64 r = strtoull(str, &end, radix); |
| 883 if (end != str + n) return false; // Leftover junk |
| 884 if (errno) return false; |
| 885 if (dest == NULL) return true; |
| 886 *(reinterpret_cast<uint64*>(dest)) = r; |
| 887 return true; |
| 888 } |
| 889 |
| 890 bool PCRE::Arg::parse_double(const char* str, int n, void* dest) { |
| 891 if (n == 0) return false; |
| 892 static const int kMaxLength = 200; |
| 893 char buf[kMaxLength]; |
| 894 if (n >= kMaxLength) return false; |
| 895 memcpy(buf, str, n); |
| 896 buf[n] = '\0'; |
| 897 errno = 0; |
| 898 char* end; |
| 899 double r = strtod(buf, &end); |
| 900 if (end != buf + n) { |
| 901 #ifdef COMPILER_MSVC |
| 902 // Microsoft's strtod() doesn't handle inf and nan, so we have to |
| 903 // handle it explicitly. Speed is not important here because this |
| 904 // code is only called in unit tests. |
| 905 bool pos = true; |
| 906 const char* i = buf; |
| 907 if ('-' == *i) { |
| 908 pos = false; |
| 909 ++i; |
| 910 } else if ('+' == *i) { |
| 911 ++i; |
| 912 } |
| 913 if (0 == stricmp(i, "inf") || 0 == stricmp(i, "infinity")) { |
| 914 r = numeric_limits<double>::infinity(); |
| 915 if (!pos) |
| 916 r = -r; |
| 917 } else if (0 == stricmp(i, "nan")) { |
| 918 r = numeric_limits<double>::quiet_NaN(); |
| 919 } else { |
| 920 return false; |
| 921 } |
| 922 #else |
| 923 return false; // Leftover junk |
| 924 #endif |
| 925 } |
| 926 if (errno) return false; |
| 927 if (dest == NULL) return true; |
| 928 *(reinterpret_cast<double*>(dest)) = r; |
| 929 return true; |
| 930 } |
| 931 |
| 932 bool PCRE::Arg::parse_float(const char* str, int n, void* dest) { |
| 933 double r; |
| 934 if (!parse_double(str, n, &r)) return false; |
| 935 if (dest == NULL) return true; |
| 936 *(reinterpret_cast<float*>(dest)) = static_cast<float>(r); |
| 937 return true; |
| 938 } |
| 939 |
| 940 |
| 941 #define DEFINE_INTEGER_PARSERS(name) \ |
| 942 bool PCRE::Arg::parse_##name(const char* str, int n, void* dest) { \ |
| 943 return parse_##name##_radix(str, n, dest, 10); \ |
| 944 } \ |
| 945 bool PCRE::Arg::parse_##name##_hex(const char* str, int n, void* dest) { \ |
| 946 return parse_##name##_radix(str, n, dest, 16); \ |
| 947 } \ |
| 948 bool PCRE::Arg::parse_##name##_octal(const char* str, int n, void* dest) { \ |
| 949 return parse_##name##_radix(str, n, dest, 8); \ |
| 950 } \ |
| 951 bool PCRE::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \ |
| 952 return parse_##name##_radix(str, n, dest, 0); \ |
| 953 } |
| 954 |
| 955 DEFINE_INTEGER_PARSERS(short); |
| 956 DEFINE_INTEGER_PARSERS(ushort); |
| 957 DEFINE_INTEGER_PARSERS(int); |
| 958 DEFINE_INTEGER_PARSERS(uint); |
| 959 DEFINE_INTEGER_PARSERS(long); |
| 960 DEFINE_INTEGER_PARSERS(ulong); |
| 961 DEFINE_INTEGER_PARSERS(longlong); |
| 962 DEFINE_INTEGER_PARSERS(ulonglong); |
| 963 |
| 964 #undef DEFINE_INTEGER_PARSERS |
| 965 |
| 966 } // namespace re2 |
OLD | NEW |