Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(265)

Side by Side Diff: src/jsregexp.h

Issue 10105026: Version 3.10.3 (Closed) Base URL: http://v8.googlecode.com/svn/trunk/
Patch Set: Created 8 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « src/interface.cc ('k') | src/jsregexp.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 22 matching lines...) Expand all
33 #include "zone-inl.h" 33 #include "zone-inl.h"
34 34
35 namespace v8 { 35 namespace v8 {
36 namespace internal { 36 namespace internal {
37 37
38 class NodeVisitor; 38 class NodeVisitor;
39 class RegExpCompiler; 39 class RegExpCompiler;
40 class RegExpMacroAssembler; 40 class RegExpMacroAssembler;
41 class RegExpNode; 41 class RegExpNode;
42 class RegExpTree; 42 class RegExpTree;
43 class BoyerMooreLookahead;
43 44
44 class RegExpImpl { 45 class RegExpImpl {
45 public: 46 public:
46 // Whether V8 is compiled with native regexp support or not. 47 // Whether V8 is compiled with native regexp support or not.
47 static bool UsesNativeRegExp() { 48 static bool UsesNativeRegExp() {
48 #ifdef V8_INTERPRETED_REGEXP 49 #ifdef V8_INTERPRETED_REGEXP
49 return false; 50 return false;
50 #else 51 #else
51 return true; 52 return true;
52 #endif 53 #endif
(...skipping 164 matching lines...) Expand 10 before | Expand all | Expand 10 after
217 // Represents the location of one element relative to the intersection of 218 // Represents the location of one element relative to the intersection of
218 // two sets. Corresponds to the four areas of a Venn diagram. 219 // two sets. Corresponds to the four areas of a Venn diagram.
219 enum ElementInSetsRelation { 220 enum ElementInSetsRelation {
220 kInsideNone = 0, 221 kInsideNone = 0,
221 kInsideFirst = 1, 222 kInsideFirst = 1,
222 kInsideSecond = 2, 223 kInsideSecond = 2,
223 kInsideBoth = 3 224 kInsideBoth = 3
224 }; 225 };
225 226
226 227
227 // Represents the relation of two sets.
228 // Sets can be either disjoint, partially or fully overlapping, or equal.
229 class SetRelation BASE_EMBEDDED {
230 public:
231 // Relation is represented by a bit saying whether there are elements in
232 // one set that is not in the other, and a bit saying that there are elements
233 // that are in both sets.
234
235 // Location of an element. Corresponds to the internal areas of
236 // a Venn diagram.
237 enum {
238 kInFirst = 1 << kInsideFirst,
239 kInSecond = 1 << kInsideSecond,
240 kInBoth = 1 << kInsideBoth
241 };
242 SetRelation() : bits_(0) {}
243 ~SetRelation() {}
244 // Add the existence of objects in a particular
245 void SetElementsInFirstSet() { bits_ |= kInFirst; }
246 void SetElementsInSecondSet() { bits_ |= kInSecond; }
247 void SetElementsInBothSets() { bits_ |= kInBoth; }
248 // Check the currently known relation of the sets (common functions only,
249 // for other combinations, use value() to get the bits and check them
250 // manually).
251 // Sets are completely disjoint.
252 bool Disjoint() { return (bits_ & kInBoth) == 0; }
253 // Sets are equal.
254 bool Equals() { return (bits_ & (kInFirst | kInSecond)) == 0; }
255 // First set contains second.
256 bool Contains() { return (bits_ & kInSecond) == 0; }
257 // Second set contains first.
258 bool ContainedIn() { return (bits_ & kInFirst) == 0; }
259 bool NonTrivialIntersection() {
260 return (bits_ == (kInFirst | kInSecond | kInBoth));
261 }
262 int value() { return bits_; }
263
264 private:
265 int bits_;
266 };
267
268
269 class CharacterRange { 228 class CharacterRange {
270 public: 229 public:
271 CharacterRange() : from_(0), to_(0) { } 230 CharacterRange() : from_(0), to_(0) { }
272 // For compatibility with the CHECK_OK macro 231 // For compatibility with the CHECK_OK macro
273 CharacterRange(void* null) { ASSERT_EQ(NULL, null); } //NOLINT 232 CharacterRange(void* null) { ASSERT_EQ(NULL, null); } //NOLINT
274 CharacterRange(uc16 from, uc16 to) : from_(from), to_(to) { } 233 CharacterRange(uc16 from, uc16 to) : from_(from), to_(to) { }
275 static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges); 234 static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges);
276 static Vector<const uc16> GetWordBounds(); 235 static Vector<const int> GetWordBounds();
277 static inline CharacterRange Singleton(uc16 value) { 236 static inline CharacterRange Singleton(uc16 value) {
278 return CharacterRange(value, value); 237 return CharacterRange(value, value);
279 } 238 }
280 static inline CharacterRange Range(uc16 from, uc16 to) { 239 static inline CharacterRange Range(uc16 from, uc16 to) {
281 ASSERT(from <= to); 240 ASSERT(from <= to);
282 return CharacterRange(from, to); 241 return CharacterRange(from, to);
283 } 242 }
284 static inline CharacterRange Everything() { 243 static inline CharacterRange Everything() {
285 return CharacterRange(0, 0xFFFF); 244 return CharacterRange(0, 0xFFFF);
286 } 245 }
287 bool Contains(uc16 i) { return from_ <= i && i <= to_; } 246 bool Contains(uc16 i) { return from_ <= i && i <= to_; }
288 uc16 from() const { return from_; } 247 uc16 from() const { return from_; }
289 void set_from(uc16 value) { from_ = value; } 248 void set_from(uc16 value) { from_ = value; }
290 uc16 to() const { return to_; } 249 uc16 to() const { return to_; }
291 void set_to(uc16 value) { to_ = value; } 250 void set_to(uc16 value) { to_ = value; }
292 bool is_valid() { return from_ <= to_; } 251 bool is_valid() { return from_ <= to_; }
293 bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; } 252 bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; }
294 bool IsSingleton() { return (from_ == to_); } 253 bool IsSingleton() { return (from_ == to_); }
295 void AddCaseEquivalents(ZoneList<CharacterRange>* ranges, bool is_ascii); 254 void AddCaseEquivalents(ZoneList<CharacterRange>* ranges, bool is_ascii);
296 static void Split(ZoneList<CharacterRange>* base, 255 static void Split(ZoneList<CharacterRange>* base,
297 Vector<const uc16> overlay, 256 Vector<const int> overlay,
298 ZoneList<CharacterRange>** included, 257 ZoneList<CharacterRange>** included,
299 ZoneList<CharacterRange>** excluded); 258 ZoneList<CharacterRange>** excluded);
300 // Whether a range list is in canonical form: Ranges ordered by from value, 259 // Whether a range list is in canonical form: Ranges ordered by from value,
301 // and ranges non-overlapping and non-adjacent. 260 // and ranges non-overlapping and non-adjacent.
302 static bool IsCanonical(ZoneList<CharacterRange>* ranges); 261 static bool IsCanonical(ZoneList<CharacterRange>* ranges);
303 // Convert range list to canonical form. The characters covered by the ranges 262 // Convert range list to canonical form. The characters covered by the ranges
304 // will still be the same, but no character is in more than one range, and 263 // will still be the same, but no character is in more than one range, and
305 // adjacent ranges are merged. The resulting list may be shorter than the 264 // adjacent ranges are merged. The resulting list may be shorter than the
306 // original, but cannot be longer. 265 // original, but cannot be longer.
307 static void Canonicalize(ZoneList<CharacterRange>* ranges); 266 static void Canonicalize(ZoneList<CharacterRange>* ranges);
308 // Check how the set of characters defined by a CharacterRange list relates
309 // to the set of word characters. List must be in canonical form.
310 static SetRelation WordCharacterRelation(ZoneList<CharacterRange>* ranges);
311 // Takes two character range lists (representing character sets) in canonical
312 // form and merges them.
313 // The characters that are only covered by the first set are added to
314 // first_set_only_out. the characters that are only in the second set are
315 // added to second_set_only_out, and the characters that are in both are
316 // added to both_sets_out.
317 // The pointers to first_set_only_out, second_set_only_out and both_sets_out
318 // should be to empty lists, but they need not be distinct, and may be NULL.
319 // If NULL, the characters are dropped, and if two arguments are the same
320 // pointer, the result is the union of the two sets that would be created
321 // if the pointers had been distinct.
322 // This way, the Merge function can compute all the usual set operations:
323 // union (all three out-sets are equal), intersection (only both_sets_out is
324 // non-NULL), and set difference (only first_set is non-NULL).
325 static void Merge(ZoneList<CharacterRange>* first_set,
326 ZoneList<CharacterRange>* second_set,
327 ZoneList<CharacterRange>* first_set_only_out,
328 ZoneList<CharacterRange>* second_set_only_out,
329 ZoneList<CharacterRange>* both_sets_out);
330 // Negate the contents of a character range in canonical form. 267 // Negate the contents of a character range in canonical form.
331 static void Negate(ZoneList<CharacterRange>* src, 268 static void Negate(ZoneList<CharacterRange>* src,
332 ZoneList<CharacterRange>* dst); 269 ZoneList<CharacterRange>* dst);
333 static const int kStartMarker = (1 << 24); 270 static const int kStartMarker = (1 << 24);
334 static const int kPayloadMask = (1 << 24) - 1; 271 static const int kPayloadMask = (1 << 24) - 1;
335 272
336 private: 273 private:
337 uc16 from_; 274 uc16 from_;
338 uc16 to_; 275 uc16 to_;
339 }; 276 };
(...skipping 74 matching lines...) Expand 10 before | Expand all | Expand 10 after
414 private: 351 private:
415 // There can't be a static empty set since it allocates its 352 // There can't be a static empty set since it allocates its
416 // successors in a zone and caches them. 353 // successors in a zone and caches them.
417 OutSet* empty() { return &empty_; } 354 OutSet* empty() { return &empty_; }
418 OutSet empty_; 355 OutSet empty_;
419 ZoneSplayTree<Config>* tree() { return &tree_; } 356 ZoneSplayTree<Config>* tree() { return &tree_; }
420 ZoneSplayTree<Config> tree_; 357 ZoneSplayTree<Config> tree_;
421 }; 358 };
422 359
423 360
424 // Improve the speed that we scan for an initial point where a non-anchored
425 // regexp can match by using a Boyer-Moore-like table. This is done by
426 // identifying non-greedy non-capturing loops in the nodes that eat any
427 // character one at a time. For example in the middle of the regexp
428 // /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly
429 // inserted at the start of any non-anchored regexp.
430 //
431 // When we have found such a loop we look ahead in the nodes to find the set of
432 // characters that can come at given distances. For example for the regexp
433 // /.?foo/ we know that there are at least 3 characters ahead of us, and the
434 // sets of characters that can occur are [any, [f, o], [o]]. We find a range in
435 // the lookahead info where the set of characters is reasonably constrained. In
436 // our example this is from index 1 to 2 (0 is not constrained). We can now
437 // look 3 characters ahead and if we don't find one of [f, o] (the union of
438 // [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
439 //
440 // For Unicode input strings we do the same, but modulo 128.
441 //
442 // We also look at the first string fed to the regexp and use that to get a hint
443 // of the character frequencies in the inputs. This affects the assessment of
444 // whether the set of characters is 'reasonably constrained'.
445 //
446 // We also have another lookahead mechanism (called quick check in the code),
447 // which uses a wide load of multiple characters followed by a mask and compare
448 // to determine whether a match is possible at this point.
449 class BoyerMooreLookahead {
450 public:
451 BoyerMooreLookahead(int length, int map_length, RegExpCompiler* compiler);
452
453 int length() { return length_; }
454 int max_char() { return max_char_; }
455 RegExpCompiler* compiler() { return compiler_; }
456
457 static const int kTooManyCharacters = 32;
458
459 int Count(int map_number) {
460 ZoneList<bool>* map = bitmaps_->at(map_number);
461 if (map == NULL) return map_length_;
462 int count = 0;
463 for (int i = 0; i < map_length_; i++) {
464 if (map->at(i)) count++;
465 }
466 return count;
467 }
468
469 void Set(int map_number, int character) {
470 if (character > max_char_) return;
471 ZoneList<bool>* map = bitmaps_->at(map_number);
472 if (map == NULL) return;
473 map->at(character & (map_length_ - 1)) = true;
474 }
475
476 void SetAll(int map_number) {
477 bitmaps_->at(map_number) = NULL;
478 }
479
480 void SetRest(int from_map) {
481 for (int i = from_map; i < length_; i++) SetAll(i);
482 }
483 bool EmitSkipInstructions(RegExpMacroAssembler* masm);
484
485 private:
486 // This is the value obtained by EatsAtLeast. If we do not have at least this
487 // many characters left in the sample string then the match is bound to fail.
488 // Therefore it is OK to read a character this far ahead of the current match
489 // point.
490 int length_;
491 // We conservatively consider all character values modulo this length. For
492 // ASCII there is no loss of precision, since this has a value of 128.
493 int map_length_;
494 RegExpCompiler* compiler_;
495 // 0x7f for ASCII, 0xffff for UTF-16.
496 int max_char_;
497 ZoneList<ZoneList<bool>*>* bitmaps_;
498
499 int GetSkipTable(int min_lookahead,
500 int max_lookahead,
501 Handle<ByteArray> boolean_skip_table);
502 bool FindWorthwhileInterval(int* from, int* to);
503 int FindBestInterval(
504 int max_number_of_chars, int old_biggest_points, int* from, int* to);
505 };
506
507
508 #define FOR_EACH_NODE_TYPE(VISIT) \ 361 #define FOR_EACH_NODE_TYPE(VISIT) \
509 VISIT(End) \ 362 VISIT(End) \
510 VISIT(Action) \ 363 VISIT(Action) \
511 VISIT(Choice) \ 364 VISIT(Choice) \
512 VISIT(BackReference) \ 365 VISIT(BackReference) \
513 VISIT(Assertion) \ 366 VISIT(Assertion) \
514 VISIT(Text) 367 VISIT(Text)
515 368
516 369
517 #define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \ 370 #define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \
(...skipping 162 matching lines...) Expand 10 before | Expand all | Expand 10 after
680 uint32_t mask_; 533 uint32_t mask_;
681 uint32_t value_; 534 uint32_t value_;
682 // If set to true, there is no way this quick check can match at all. 535 // If set to true, there is no way this quick check can match at all.
683 // E.g., if it requires to be at the start of the input, and isn't. 536 // E.g., if it requires to be at the start of the input, and isn't.
684 bool cannot_match_; 537 bool cannot_match_;
685 }; 538 };
686 539
687 540
688 class RegExpNode: public ZoneObject { 541 class RegExpNode: public ZoneObject {
689 public: 542 public:
690 RegExpNode() : first_character_set_(NULL), trace_count_(0) { } 543 RegExpNode() : first_character_set_(NULL), trace_count_(0) {
544 bm_info_[0] = bm_info_[1] = NULL;
545 }
691 virtual ~RegExpNode(); 546 virtual ~RegExpNode();
692 virtual void Accept(NodeVisitor* visitor) = 0; 547 virtual void Accept(NodeVisitor* visitor) = 0;
693 // Generates a goto to this node or actually generates the code at this point. 548 // Generates a goto to this node or actually generates the code at this point.
694 virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0; 549 virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0;
695 // How many characters must this node consume at a minimum in order to 550 // How many characters must this node consume at a minimum in order to
696 // succeed. If we have found at least 'still_to_find' characters that 551 // succeed. If we have found at least 'still_to_find' characters that
697 // must be consumed there is no need to ask any following nodes whether 552 // must be consumed there is no need to ask any following nodes whether
698 // they are sure to eat any more characters. The not_at_start argument is 553 // they are sure to eat any more characters. The not_at_start argument is
699 // used to indicate that we know we are not at the start of the input. In 554 // used to indicate that we know we are not at the start of the input. In
700 // this case anchored branches will always fail and can be ignored when 555 // this case anchored branches will always fail and can be ignored when
(...skipping 23 matching lines...) Expand all
724 // Only returns the successor for a text node of length 1 that matches any 579 // Only returns the successor for a text node of length 1 that matches any
725 // character and that has no guards on it. 580 // character and that has no guards on it.
726 virtual RegExpNode* GetSuccessorOfOmnivorousTextNode( 581 virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
727 RegExpCompiler* compiler) { 582 RegExpCompiler* compiler) {
728 return NULL; 583 return NULL;
729 } 584 }
730 585
731 // Collects information on the possible code units (mod 128) that can match if 586 // Collects information on the possible code units (mod 128) that can match if
732 // we look forward. This is used for a Boyer-Moore-like string searching 587 // we look forward. This is used for a Boyer-Moore-like string searching
733 // implementation. TODO(erikcorry): This should share more code with 588 // implementation. TODO(erikcorry): This should share more code with
734 // EatsAtLeast, GetQuickCheckDetails and ComputeFirstCharacterSet. 589 // EatsAtLeast, GetQuickCheckDetails.
735 virtual void FillInBMInfo( 590 virtual void FillInBMInfo(
736 int offset, BoyerMooreLookahead* bm, bool not_at_start) { 591 int offset, BoyerMooreLookahead* bm, bool not_at_start) {
737 UNREACHABLE(); 592 UNREACHABLE();
738 } 593 }
594 // We want to avoid recalculating the lookahead info, so we store it on the
595 // node. Only info that is for this node is stored. We can tell that the
596 // info is for this node when offset == 0, so the information is calculated
597 // relative to this node.
598 void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) {
599 if (offset == 0) set_bm_info(not_at_start, bm);
600 }
739 601
740 Label* label() { return &label_; } 602 Label* label() { return &label_; }
741 // If non-generic code is generated for a node (i.e. the node is not at the 603 // If non-generic code is generated for a node (i.e. the node is not at the
742 // start of the trace) then it cannot be reused. This variable sets a limit 604 // start of the trace) then it cannot be reused. This variable sets a limit
743 // on how often we allow that to happen before we insist on starting a new 605 // on how often we allow that to happen before we insist on starting a new
744 // trace and generating generic code for a node that can be reused by flushing 606 // trace and generating generic code for a node that can be reused by flushing
745 // the deferred actions in the current trace and generating a goto. 607 // the deferred actions in the current trace and generating a goto.
746 static const int kMaxCopiesCodeGenerated = 10; 608 static const int kMaxCopiesCodeGenerated = 10;
747 609
748 NodeInfo* info() { return &info_; } 610 NodeInfo* info() { return &info_; }
749 611
750 void AddSibling(RegExpNode* node) { siblings_.Add(node); } 612 void AddSibling(RegExpNode* node) { siblings_.Add(node); }
751 613
752 // Static version of EnsureSibling that expresses the fact that the 614 // Static version of EnsureSibling that expresses the fact that the
753 // result has the same type as the input. 615 // result has the same type as the input.
754 template <class C> 616 template <class C>
755 static C* EnsureSibling(C* node, NodeInfo* info, bool* cloned) { 617 static C* EnsureSibling(C* node, NodeInfo* info, bool* cloned) {
756 return static_cast<C*>(node->EnsureSibling(info, cloned)); 618 return static_cast<C*>(node->EnsureSibling(info, cloned));
757 } 619 }
758 620
759 SiblingList* siblings() { return &siblings_; } 621 SiblingList* siblings() { return &siblings_; }
760 void set_siblings(SiblingList* other) { siblings_ = *other; } 622 void set_siblings(SiblingList* other) { siblings_ = *other; }
761 623
762 // Return the set of possible next characters recognized by the regexp
763 // (or a safe subset, potentially the set of all characters).
764 ZoneList<CharacterRange>* FirstCharacterSet();
765
766 // Compute (if possible within the budget of traversed nodes) the
767 // possible first characters of the input matched by this node and
768 // its continuation. Returns the remaining budget after the computation.
769 // If the budget is spent, the result is negative, and the cached
770 // first_character_set_ value isn't set.
771 virtual int ComputeFirstCharacterSet(int budget);
772
773 // Get and set the cached first character set value. 624 // Get and set the cached first character set value.
774 ZoneList<CharacterRange>* first_character_set() { 625 ZoneList<CharacterRange>* first_character_set() {
775 return first_character_set_; 626 return first_character_set_;
776 } 627 }
777 void set_first_character_set(ZoneList<CharacterRange>* character_set) { 628 void set_first_character_set(ZoneList<CharacterRange>* character_set) {
778 first_character_set_ = character_set; 629 first_character_set_ = character_set;
779 } 630 }
631 BoyerMooreLookahead* bm_info(bool not_at_start) {
632 return bm_info_[not_at_start ? 1 : 0];
633 }
780 634
781 protected: 635 protected:
782 enum LimitResult { DONE, CONTINUE }; 636 enum LimitResult { DONE, CONTINUE };
783 static const int kComputeFirstCharacterSetFail = -1; 637 static const int kComputeFirstCharacterSetFail = -1;
784 638
785 LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace); 639 LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
786 640
787 // Returns a sibling of this node whose interests and assumptions 641 // Returns a sibling of this node whose interests and assumptions
788 // match the ones in the given node info. If no sibling exists NULL 642 // match the ones in the given node info. If no sibling exists NULL
789 // is returned. 643 // is returned.
790 RegExpNode* TryGetSibling(NodeInfo* info); 644 RegExpNode* TryGetSibling(NodeInfo* info);
791 645
792 // Returns a sibling of this node whose interests match the ones in 646 // Returns a sibling of this node whose interests match the ones in
793 // the given node info. The info must not contain any assertions. 647 // the given node info. The info must not contain any assertions.
794 // If no node exists a new one will be created by cloning the current 648 // If no node exists a new one will be created by cloning the current
795 // node. The result will always be an instance of the same concrete 649 // node. The result will always be an instance of the same concrete
796 // class as this node. 650 // class as this node.
797 RegExpNode* EnsureSibling(NodeInfo* info, bool* cloned); 651 RegExpNode* EnsureSibling(NodeInfo* info, bool* cloned);
798 652
799 // Returns a clone of this node initialized using the copy constructor 653 // Returns a clone of this node initialized using the copy constructor
800 // of its concrete class. Note that the node may have to be pre- 654 // of its concrete class. Note that the node may have to be pre-
801 // processed before it is on a usable state. 655 // processed before it is on a usable state.
802 virtual RegExpNode* Clone() = 0; 656 virtual RegExpNode* Clone() = 0;
803 657
658 void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
659 bm_info_[not_at_start ? 1 : 0] = bm;
660 }
661
804 private: 662 private:
805 static const int kFirstCharBudget = 10; 663 static const int kFirstCharBudget = 10;
806 Label label_; 664 Label label_;
807 NodeInfo info_; 665 NodeInfo info_;
808 SiblingList siblings_; 666 SiblingList siblings_;
809 ZoneList<CharacterRange>* first_character_set_; 667 ZoneList<CharacterRange>* first_character_set_;
810 // This variable keeps track of how many times code has been generated for 668 // This variable keeps track of how many times code has been generated for
811 // this node (in different traces). We don't keep track of where the 669 // this node (in different traces). We don't keep track of where the
812 // generated code is located unless the code is generated at the start of 670 // generated code is located unless the code is generated at the start of
813 // a trace, in which case it is generic and can be reused by flushing the 671 // a trace, in which case it is generic and can be reused by flushing the
814 // deferred operations in the current trace and generating a goto. 672 // deferred operations in the current trace and generating a goto.
815 int trace_count_; 673 int trace_count_;
674 BoyerMooreLookahead* bm_info_[2];
816 }; 675 };
817 676
818 677
819 // A simple closed interval. 678 // A simple closed interval.
820 class Interval { 679 class Interval {
821 public: 680 public:
822 Interval() : from_(kNone), to_(kNone) { } 681 Interval() : from_(kNone), to_(kNone) { }
823 Interval(int from, int to) : from_(from), to_(to) { } 682 Interval(int from, int to) : from_(from), to_(to) { }
824 Interval Union(Interval that) { 683 Interval Union(Interval that) {
825 if (that.from_ == kNone) 684 if (that.from_ == kNone)
826 return *this; 685 return *this;
827 else if (from_ == kNone) 686 else if (from_ == kNone)
828 return that; 687 return that;
829 else 688 else
830 return Interval(Min(from_, that.from_), Max(to_, that.to_)); 689 return Interval(Min(from_, that.from_), Max(to_, that.to_));
831 } 690 }
832 bool Contains(int value) { 691 bool Contains(int value) {
833 return (from_ <= value) && (value <= to_); 692 return (from_ <= value) && (value <= to_);
834 } 693 }
835 bool is_empty() { return from_ == kNone; } 694 bool is_empty() { return from_ == kNone; }
836 int from() { return from_; } 695 int from() const { return from_; }
837 int to() { return to_; } 696 int to() const { return to_; }
838 static Interval Empty() { return Interval(); } 697 static Interval Empty() { return Interval(); }
839 static const int kNone = -1; 698 static const int kNone = -1;
840 private: 699 private:
841 int from_; 700 int from_;
842 int to_; 701 int to_;
843 }; 702 };
844 703
845 704
846 class SeqRegExpNode: public RegExpNode { 705 class SeqRegExpNode: public RegExpNode {
847 public: 706 public:
848 explicit SeqRegExpNode(RegExpNode* on_success) 707 explicit SeqRegExpNode(RegExpNode* on_success)
849 : on_success_(on_success) { } 708 : on_success_(on_success) { }
850 RegExpNode* on_success() { return on_success_; } 709 RegExpNode* on_success() { return on_success_; }
851 void set_on_success(RegExpNode* node) { on_success_ = node; } 710 void set_on_success(RegExpNode* node) { on_success_ = node; }
852 virtual void FillInBMInfo( 711 virtual void FillInBMInfo(
853 int offset, BoyerMooreLookahead* bm, bool not_at_start) { 712 int offset, BoyerMooreLookahead* bm, bool not_at_start) {
854 on_success_->FillInBMInfo(offset, bm, not_at_start); 713 on_success_->FillInBMInfo(offset, bm, not_at_start);
714 if (offset == 0) set_bm_info(not_at_start, bm);
855 } 715 }
856 private: 716 private:
857 RegExpNode* on_success_; 717 RegExpNode* on_success_;
858 }; 718 };
859 719
860 720
861 class ActionNode: public SeqRegExpNode { 721 class ActionNode: public SeqRegExpNode {
862 public: 722 public:
863 enum Type { 723 enum Type {
864 SET_REGISTER, 724 SET_REGISTER,
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after
898 bool not_at_start) { 758 bool not_at_start) {
899 return on_success()->GetQuickCheckDetails( 759 return on_success()->GetQuickCheckDetails(
900 details, compiler, filled_in, not_at_start); 760 details, compiler, filled_in, not_at_start);
901 } 761 }
902 virtual void FillInBMInfo( 762 virtual void FillInBMInfo(
903 int offset, BoyerMooreLookahead* bm, bool not_at_start); 763 int offset, BoyerMooreLookahead* bm, bool not_at_start);
904 Type type() { return type_; } 764 Type type() { return type_; }
905 // TODO(erikcorry): We should allow some action nodes in greedy loops. 765 // TODO(erikcorry): We should allow some action nodes in greedy loops.
906 virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; } 766 virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
907 virtual ActionNode* Clone() { return new ActionNode(*this); } 767 virtual ActionNode* Clone() { return new ActionNode(*this); }
908 virtual int ComputeFirstCharacterSet(int budget);
909 768
910 private: 769 private:
911 union { 770 union {
912 struct { 771 struct {
913 int reg; 772 int reg;
914 int value; 773 int value;
915 } u_store_register; 774 } u_store_register;
916 struct { 775 struct {
917 int reg; 776 int reg;
918 } u_increment_register; 777 } u_increment_register;
(...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after
971 virtual RegExpNode* GetSuccessorOfOmnivorousTextNode( 830 virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
972 RegExpCompiler* compiler); 831 RegExpCompiler* compiler);
973 virtual void FillInBMInfo( 832 virtual void FillInBMInfo(
974 int offset, BoyerMooreLookahead* bm, bool not_at_start); 833 int offset, BoyerMooreLookahead* bm, bool not_at_start);
975 virtual TextNode* Clone() { 834 virtual TextNode* Clone() {
976 TextNode* result = new TextNode(*this); 835 TextNode* result = new TextNode(*this);
977 result->CalculateOffsets(); 836 result->CalculateOffsets();
978 return result; 837 return result;
979 } 838 }
980 void CalculateOffsets(); 839 void CalculateOffsets();
981 virtual int ComputeFirstCharacterSet(int budget);
982 840
983 private: 841 private:
984 enum TextEmitPassType { 842 enum TextEmitPassType {
985 NON_ASCII_MATCH, // Check for characters that can't match. 843 NON_ASCII_MATCH, // Check for characters that can't match.
986 SIMPLE_CHARACTER_MATCH, // Case-dependent single character check. 844 SIMPLE_CHARACTER_MATCH, // Case-dependent single character check.
987 NON_LETTER_CHARACTER_MATCH, // Check characters that have no case equivs. 845 NON_LETTER_CHARACTER_MATCH, // Check characters that have no case equivs.
988 CASE_CHARACTER_MATCH, // Case-independent single character check. 846 CASE_CHARACTER_MATCH, // Case-independent single character check.
989 CHARACTER_CLASS_MATCH // Character class. 847 CHARACTER_CLASS_MATCH // Character class.
990 }; 848 };
991 static bool SkipPass(int pass, bool ignore_case); 849 static bool SkipPass(int pass, bool ignore_case);
(...skipping 10 matching lines...) Expand all
1002 }; 860 };
1003 861
1004 862
1005 class AssertionNode: public SeqRegExpNode { 863 class AssertionNode: public SeqRegExpNode {
1006 public: 864 public:
1007 enum AssertionNodeType { 865 enum AssertionNodeType {
1008 AT_END, 866 AT_END,
1009 AT_START, 867 AT_START,
1010 AT_BOUNDARY, 868 AT_BOUNDARY,
1011 AT_NON_BOUNDARY, 869 AT_NON_BOUNDARY,
1012 AFTER_NEWLINE, 870 AFTER_NEWLINE
1013 // Types not directly expressible in regexp syntax.
1014 // Used for modifying a boundary node if its following character is
1015 // known to be word and/or non-word.
1016 AFTER_NONWORD_CHARACTER,
1017 AFTER_WORD_CHARACTER
1018 }; 871 };
1019 static AssertionNode* AtEnd(RegExpNode* on_success) { 872 static AssertionNode* AtEnd(RegExpNode* on_success) {
1020 return new AssertionNode(AT_END, on_success); 873 return new AssertionNode(AT_END, on_success);
1021 } 874 }
1022 static AssertionNode* AtStart(RegExpNode* on_success) { 875 static AssertionNode* AtStart(RegExpNode* on_success) {
1023 return new AssertionNode(AT_START, on_success); 876 return new AssertionNode(AT_START, on_success);
1024 } 877 }
1025 static AssertionNode* AtBoundary(RegExpNode* on_success) { 878 static AssertionNode* AtBoundary(RegExpNode* on_success) {
1026 return new AssertionNode(AT_BOUNDARY, on_success); 879 return new AssertionNode(AT_BOUNDARY, on_success);
1027 } 880 }
1028 static AssertionNode* AtNonBoundary(RegExpNode* on_success) { 881 static AssertionNode* AtNonBoundary(RegExpNode* on_success) {
1029 return new AssertionNode(AT_NON_BOUNDARY, on_success); 882 return new AssertionNode(AT_NON_BOUNDARY, on_success);
1030 } 883 }
1031 static AssertionNode* AfterNewline(RegExpNode* on_success) { 884 static AssertionNode* AfterNewline(RegExpNode* on_success) {
1032 return new AssertionNode(AFTER_NEWLINE, on_success); 885 return new AssertionNode(AFTER_NEWLINE, on_success);
1033 } 886 }
1034 virtual void Accept(NodeVisitor* visitor); 887 virtual void Accept(NodeVisitor* visitor);
1035 virtual void Emit(RegExpCompiler* compiler, Trace* trace); 888 virtual void Emit(RegExpCompiler* compiler, Trace* trace);
1036 virtual int EatsAtLeast(int still_to_find, 889 virtual int EatsAtLeast(int still_to_find,
1037 int recursion_depth, 890 int recursion_depth,
1038 bool not_at_start); 891 bool not_at_start);
1039 virtual void GetQuickCheckDetails(QuickCheckDetails* details, 892 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
1040 RegExpCompiler* compiler, 893 RegExpCompiler* compiler,
1041 int filled_in, 894 int filled_in,
1042 bool not_at_start); 895 bool not_at_start);
1043 virtual void FillInBMInfo( 896 virtual void FillInBMInfo(
1044 int offset, BoyerMooreLookahead* bm, bool not_at_start); 897 int offset, BoyerMooreLookahead* bm, bool not_at_start);
1045 virtual int ComputeFirstCharacterSet(int budget);
1046 virtual AssertionNode* Clone() { return new AssertionNode(*this); } 898 virtual AssertionNode* Clone() { return new AssertionNode(*this); }
1047 AssertionNodeType type() { return type_; } 899 AssertionNodeType type() { return type_; }
1048 void set_type(AssertionNodeType type) { type_ = type; } 900 void set_type(AssertionNodeType type) { type_ = type; }
1049 901
1050 private: 902 private:
903 void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace);
904 enum IfPrevious { kIsNonWord, kIsWord };
905 void BacktrackIfPrevious(RegExpCompiler* compiler,
906 Trace* trace,
907 IfPrevious backtrack_if_previous);
1051 AssertionNode(AssertionNodeType t, RegExpNode* on_success) 908 AssertionNode(AssertionNodeType t, RegExpNode* on_success)
1052 : SeqRegExpNode(on_success), type_(t) { } 909 : SeqRegExpNode(on_success), type_(t) { }
1053 AssertionNodeType type_; 910 AssertionNodeType type_;
1054 }; 911 };
1055 912
1056 913
1057 class BackReferenceNode: public SeqRegExpNode { 914 class BackReferenceNode: public SeqRegExpNode {
1058 public: 915 public:
1059 BackReferenceNode(int start_reg, 916 BackReferenceNode(int start_reg,
1060 int end_reg, 917 int end_reg,
1061 RegExpNode* on_success) 918 RegExpNode* on_success)
1062 : SeqRegExpNode(on_success), 919 : SeqRegExpNode(on_success),
1063 start_reg_(start_reg), 920 start_reg_(start_reg),
1064 end_reg_(end_reg) { } 921 end_reg_(end_reg) { }
1065 virtual void Accept(NodeVisitor* visitor); 922 virtual void Accept(NodeVisitor* visitor);
1066 int start_register() { return start_reg_; } 923 int start_register() { return start_reg_; }
1067 int end_register() { return end_reg_; } 924 int end_register() { return end_reg_; }
1068 virtual void Emit(RegExpCompiler* compiler, Trace* trace); 925 virtual void Emit(RegExpCompiler* compiler, Trace* trace);
1069 virtual int EatsAtLeast(int still_to_find, 926 virtual int EatsAtLeast(int still_to_find,
1070 int recursion_depth, 927 int recursion_depth,
1071 bool not_at_start); 928 bool not_at_start);
1072 virtual void GetQuickCheckDetails(QuickCheckDetails* details, 929 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
1073 RegExpCompiler* compiler, 930 RegExpCompiler* compiler,
1074 int characters_filled_in, 931 int characters_filled_in,
1075 bool not_at_start) { 932 bool not_at_start) {
1076 return; 933 return;
1077 } 934 }
1078 virtual void FillInBMInfo( 935 virtual void FillInBMInfo(
1079 int offset, BoyerMooreLookahead* bm, bool not_at_start) { 936 int offset, BoyerMooreLookahead* bm, bool not_at_start);
1080 // Working out the set of characters that a backreference can match is too
1081 // hard, so we just say that any character can match.
1082 bm->SetRest(offset);
1083 }
1084 virtual BackReferenceNode* Clone() { return new BackReferenceNode(*this); } 937 virtual BackReferenceNode* Clone() { return new BackReferenceNode(*this); }
1085 virtual int ComputeFirstCharacterSet(int budget);
1086 938
1087 private: 939 private:
1088 int start_reg_; 940 int start_reg_;
1089 int end_reg_; 941 int end_reg_;
1090 }; 942 };
1091 943
1092 944
1093 class EndNode: public RegExpNode { 945 class EndNode: public RegExpNode {
1094 public: 946 public:
1095 enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS }; 947 enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS };
(...skipping 146 matching lines...) Expand 10 before | Expand all | Expand 10 after
1242 virtual int EatsAtLeast(int still_to_find, 1094 virtual int EatsAtLeast(int still_to_find,
1243 int recursion_depth, 1095 int recursion_depth,
1244 bool not_at_start); 1096 bool not_at_start);
1245 virtual void GetQuickCheckDetails(QuickCheckDetails* details, 1097 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
1246 RegExpCompiler* compiler, 1098 RegExpCompiler* compiler,
1247 int characters_filled_in, 1099 int characters_filled_in,
1248 bool not_at_start); 1100 bool not_at_start);
1249 virtual void FillInBMInfo( 1101 virtual void FillInBMInfo(
1250 int offset, BoyerMooreLookahead* bm, bool not_at_start) { 1102 int offset, BoyerMooreLookahead* bm, bool not_at_start) {
1251 alternatives_->at(1).node()->FillInBMInfo(offset, bm, not_at_start); 1103 alternatives_->at(1).node()->FillInBMInfo(offset, bm, not_at_start);
1104 if (offset == 0) set_bm_info(not_at_start, bm);
1252 } 1105 }
1253 // For a negative lookahead we don't emit the quick check for the 1106 // For a negative lookahead we don't emit the quick check for the
1254 // alternative that is expected to fail. This is because quick check code 1107 // alternative that is expected to fail. This is because quick check code
1255 // starts by loading enough characters for the alternative that takes fewest 1108 // starts by loading enough characters for the alternative that takes fewest
1256 // characters, but on a negative lookahead the negative branch did not take 1109 // characters, but on a negative lookahead the negative branch did not take
1257 // part in that calculation (EatsAtLeast) so the assumptions don't hold. 1110 // part in that calculation (EatsAtLeast) so the assumptions don't hold.
1258 virtual bool try_to_emit_quick_check_for_alternative(int i) { return i != 0; } 1111 virtual bool try_to_emit_quick_check_for_alternative(int i) { return i != 0; }
1259 virtual int ComputeFirstCharacterSet(int budget);
1260 }; 1112 };
1261 1113
1262 1114
1263 class LoopChoiceNode: public ChoiceNode { 1115 class LoopChoiceNode: public ChoiceNode {
1264 public: 1116 public:
1265 explicit LoopChoiceNode(bool body_can_be_zero_length) 1117 explicit LoopChoiceNode(bool body_can_be_zero_length)
1266 : ChoiceNode(2), 1118 : ChoiceNode(2),
1267 loop_node_(NULL), 1119 loop_node_(NULL),
1268 continue_node_(NULL), 1120 continue_node_(NULL),
1269 body_can_be_zero_length_(body_can_be_zero_length) { } 1121 body_can_be_zero_length_(body_can_be_zero_length) { }
1270 void AddLoopAlternative(GuardedAlternative alt); 1122 void AddLoopAlternative(GuardedAlternative alt);
1271 void AddContinueAlternative(GuardedAlternative alt); 1123 void AddContinueAlternative(GuardedAlternative alt);
1272 virtual void Emit(RegExpCompiler* compiler, Trace* trace); 1124 virtual void Emit(RegExpCompiler* compiler, Trace* trace);
1273 virtual int EatsAtLeast(int still_to_find, 1125 virtual int EatsAtLeast(int still_to_find,
1274 int recursion_depth, 1126 int recursion_depth,
1275 bool not_at_start); 1127 bool not_at_start);
1276 virtual void GetQuickCheckDetails(QuickCheckDetails* details, 1128 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
1277 RegExpCompiler* compiler, 1129 RegExpCompiler* compiler,
1278 int characters_filled_in, 1130 int characters_filled_in,
1279 bool not_at_start); 1131 bool not_at_start);
1280 virtual void FillInBMInfo( 1132 virtual void FillInBMInfo(
1281 int offset, BoyerMooreLookahead* bm, bool not_at_start); 1133 int offset, BoyerMooreLookahead* bm, bool not_at_start);
1282 virtual int ComputeFirstCharacterSet(int budget);
1283 virtual LoopChoiceNode* Clone() { return new LoopChoiceNode(*this); } 1134 virtual LoopChoiceNode* Clone() { return new LoopChoiceNode(*this); }
1284 RegExpNode* loop_node() { return loop_node_; } 1135 RegExpNode* loop_node() { return loop_node_; }
1285 RegExpNode* continue_node() { return continue_node_; } 1136 RegExpNode* continue_node() { return continue_node_; }
1286 bool body_can_be_zero_length() { return body_can_be_zero_length_; } 1137 bool body_can_be_zero_length() { return body_can_be_zero_length_; }
1287 virtual void Accept(NodeVisitor* visitor); 1138 virtual void Accept(NodeVisitor* visitor);
1288 1139
1289 private: 1140 private:
1290 // AddAlternative is made private for loop nodes because alternatives 1141 // AddAlternative is made private for loop nodes because alternatives
1291 // should not be added freely, we need to keep track of which node 1142 // should not be added freely, we need to keep track of which node
1292 // goes back to the node itself. 1143 // goes back to the node itself.
1293 void AddAlternative(GuardedAlternative node) { 1144 void AddAlternative(GuardedAlternative node) {
1294 ChoiceNode::AddAlternative(node); 1145 ChoiceNode::AddAlternative(node);
1295 } 1146 }
1296 1147
1297 RegExpNode* loop_node_; 1148 RegExpNode* loop_node_;
1298 RegExpNode* continue_node_; 1149 RegExpNode* continue_node_;
1299 bool body_can_be_zero_length_; 1150 bool body_can_be_zero_length_;
1300 }; 1151 };
1301 1152
1302 1153
1154 // Improve the speed that we scan for an initial point where a non-anchored
1155 // regexp can match by using a Boyer-Moore-like table. This is done by
1156 // identifying non-greedy non-capturing loops in the nodes that eat any
1157 // character one at a time. For example in the middle of the regexp
1158 // /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly
1159 // inserted at the start of any non-anchored regexp.
1160 //
1161 // When we have found such a loop we look ahead in the nodes to find the set of
1162 // characters that can come at given distances. For example for the regexp
1163 // /.?foo/ we know that there are at least 3 characters ahead of us, and the
1164 // sets of characters that can occur are [any, [f, o], [o]]. We find a range in
1165 // the lookahead info where the set of characters is reasonably constrained. In
1166 // our example this is from index 1 to 2 (0 is not constrained). We can now
1167 // look 3 characters ahead and if we don't find one of [f, o] (the union of
1168 // [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
1169 //
1170 // For Unicode input strings we do the same, but modulo 128.
1171 //
1172 // We also look at the first string fed to the regexp and use that to get a hint
1173 // of the character frequencies in the inputs. This affects the assessment of
1174 // whether the set of characters is 'reasonably constrained'.
1175 //
1176 // We also have another lookahead mechanism (called quick check in the code),
1177 // which uses a wide load of multiple characters followed by a mask and compare
1178 // to determine whether a match is possible at this point.
1179 enum ContainedInLattice {
1180 kNotYet = 0,
1181 kLatticeIn = 1,
1182 kLatticeOut = 2,
1183 kLatticeUnknown = 3 // Can also mean both in and out.
1184 };
1185
1186
1187 inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
1188 return static_cast<ContainedInLattice>(a | b);
1189 }
1190
1191
1192 ContainedInLattice AddRange(ContainedInLattice a,
1193 const int* ranges,
1194 int ranges_size,
1195 Interval new_range);
1196
1197
1198 class BoyerMoorePositionInfo : public ZoneObject {
1199 public:
1200 BoyerMoorePositionInfo()
1201 : map_(new ZoneList<bool>(kMapSize)),
1202 map_count_(0),
1203 w_(kNotYet),
1204 s_(kNotYet),
1205 d_(kNotYet),
1206 surrogate_(kNotYet) {
1207 for (int i = 0; i < kMapSize; i++) {
1208 map_->Add(false);
1209 }
1210 }
1211
1212 bool& at(int i) { return map_->at(i); }
1213
1214 static const int kMapSize = 128;
1215 static const int kMask = kMapSize - 1;
1216
1217 int map_count() const { return map_count_; }
1218
1219 void Set(int character);
1220 void SetInterval(const Interval& interval);
1221 void SetAll();
1222 bool is_non_word() { return w_ == kLatticeOut; }
1223 bool is_word() { return w_ == kLatticeIn; }
1224
1225 private:
1226 ZoneList<bool>* map_;
1227 int map_count_; // Number of set bits in the map.
1228 ContainedInLattice w_; // The \w character class.
1229 ContainedInLattice s_; // The \s character class.
1230 ContainedInLattice d_; // The \d character class.
1231 ContainedInLattice surrogate_; // Surrogate UTF-16 code units.
1232 };
1233
1234
1235 class BoyerMooreLookahead : public ZoneObject {
1236 public:
1237 BoyerMooreLookahead(int length, RegExpCompiler* compiler);
1238
1239 int length() { return length_; }
1240 int max_char() { return max_char_; }
1241 RegExpCompiler* compiler() { return compiler_; }
1242
1243 int Count(int map_number) {
1244 return bitmaps_->at(map_number)->map_count();
1245 }
1246
1247 BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }
1248
1249 void Set(int map_number, int character) {
1250 if (character > max_char_) return;
1251 BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
1252 info->Set(character);
1253 }
1254
1255 void SetInterval(int map_number, const Interval& interval) {
1256 if (interval.from() > max_char_) return;
1257 BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
1258 if (interval.to() > max_char_) {
1259 info->SetInterval(Interval(interval.from(), max_char_));
1260 } else {
1261 info->SetInterval(interval);
1262 }
1263 }
1264
1265 void SetAll(int map_number) {
1266 bitmaps_->at(map_number)->SetAll();
1267 }
1268
1269 void SetRest(int from_map) {
1270 for (int i = from_map; i < length_; i++) SetAll(i);
1271 }
1272 bool EmitSkipInstructions(RegExpMacroAssembler* masm);
1273
1274 private:
1275 // This is the value obtained by EatsAtLeast. If we do not have at least this
1276 // many characters left in the sample string then the match is bound to fail.
1277 // Therefore it is OK to read a character this far ahead of the current match
1278 // point.
1279 int length_;
1280 RegExpCompiler* compiler_;
1281 // 0x7f for ASCII, 0xffff for UTF-16.
1282 int max_char_;
1283 ZoneList<BoyerMoorePositionInfo*>* bitmaps_;
1284
1285 int GetSkipTable(int min_lookahead,
1286 int max_lookahead,
1287 Handle<ByteArray> boolean_skip_table);
1288 bool FindWorthwhileInterval(int* from, int* to);
1289 int FindBestInterval(
1290 int max_number_of_chars, int old_biggest_points, int* from, int* to);
1291 };
1292
1293
1303 // There are many ways to generate code for a node. This class encapsulates 1294 // There are many ways to generate code for a node. This class encapsulates
1304 // the current way we should be generating. In other words it encapsulates 1295 // the current way we should be generating. In other words it encapsulates
1305 // the current state of the code generator. The effect of this is that we 1296 // the current state of the code generator. The effect of this is that we
1306 // generate code for paths that the matcher can take through the regular 1297 // generate code for paths that the matcher can take through the regular
1307 // expression. A given node in the regexp can be code-generated several times 1298 // expression. A given node in the regexp can be code-generated several times
1308 // as it can be part of several traces. For example for the regexp: 1299 // as it can be part of several traces. For example for the regexp:
1309 // /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part 1300 // /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
1310 // of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code 1301 // of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code
1311 // to match foo is generated only once (the traces have a common prefix). The 1302 // to match foo is generated only once (the traces have a common prefix). The
1312 // code to store the capture is deferred and generated (twice) after the places 1303 // code to store the capture is deferred and generated (twice) after the places
(...skipping 314 matching lines...) Expand 10 before | Expand all | Expand 10 after
1627 int* vector_; 1618 int* vector_;
1628 int offsets_vector_length_; 1619 int offsets_vector_length_;
1629 1620
1630 friend class ExternalReference; 1621 friend class ExternalReference;
1631 }; 1622 };
1632 1623
1633 1624
1634 } } // namespace v8::internal 1625 } } // namespace v8::internal
1635 1626
1636 #endif // V8_JSREGEXP_H_ 1627 #endif // V8_JSREGEXP_H_
OLDNEW
« no previous file with comments | « src/interface.cc ('k') | src/jsregexp.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698