Chromium Code Reviews| Index: src/jsregexp.h |
| =================================================================== |
| --- src/jsregexp.h (revision 11224) |
| +++ src/jsregexp.h (working copy) |
| @@ -40,6 +40,7 @@ |
| class RegExpMacroAssembler; |
| class RegExpNode; |
| class RegExpTree; |
| +class BoyerMooreLookahead; |
| class RegExpImpl { |
| public: |
| @@ -224,48 +225,6 @@ |
| }; |
| -// Represents the relation of two sets. |
| -// Sets can be either disjoint, partially or fully overlapping, or equal. |
| -class SetRelation BASE_EMBEDDED { |
| - public: |
| - // Relation is represented by a bit saying whether there are elements in |
| - // one set that is not in the other, and a bit saying that there are elements |
| - // that are in both sets. |
| - |
| - // Location of an element. Corresponds to the internal areas of |
| - // a Venn diagram. |
| - enum { |
| - kInFirst = 1 << kInsideFirst, |
| - kInSecond = 1 << kInsideSecond, |
| - kInBoth = 1 << kInsideBoth |
| - }; |
| - SetRelation() : bits_(0) {} |
| - ~SetRelation() {} |
| - // Add the existence of objects in a particular |
| - void SetElementsInFirstSet() { bits_ |= kInFirst; } |
| - void SetElementsInSecondSet() { bits_ |= kInSecond; } |
| - void SetElementsInBothSets() { bits_ |= kInBoth; } |
| - // Check the currently known relation of the sets (common functions only, |
| - // for other combinations, use value() to get the bits and check them |
| - // manually). |
| - // Sets are completely disjoint. |
| - bool Disjoint() { return (bits_ & kInBoth) == 0; } |
| - // Sets are equal. |
| - bool Equals() { return (bits_ & (kInFirst | kInSecond)) == 0; } |
| - // First set contains second. |
| - bool Contains() { return (bits_ & kInSecond) == 0; } |
| - // Second set contains first. |
| - bool ContainedIn() { return (bits_ & kInFirst) == 0; } |
| - bool NonTrivialIntersection() { |
| - return (bits_ == (kInFirst | kInSecond | kInBoth)); |
| - } |
| - int value() { return bits_; } |
| - |
| - private: |
| - int bits_; |
| -}; |
| - |
| - |
| class CharacterRange { |
| public: |
| CharacterRange() : from_(0), to_(0) { } |
| @@ -273,7 +232,7 @@ |
| CharacterRange(void* null) { ASSERT_EQ(NULL, null); } //NOLINT |
| CharacterRange(uc16 from, uc16 to) : from_(from), to_(to) { } |
| static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges); |
| - static Vector<const uc16> GetWordBounds(); |
| + static Vector<const int> GetWordBounds(); |
| static inline CharacterRange Singleton(uc16 value) { |
| return CharacterRange(value, value); |
| } |
| @@ -294,7 +253,7 @@ |
| bool IsSingleton() { return (from_ == to_); } |
| void AddCaseEquivalents(ZoneList<CharacterRange>* ranges, bool is_ascii); |
| static void Split(ZoneList<CharacterRange>* base, |
| - Vector<const uc16> overlay, |
| + Vector<const int> overlay, |
| ZoneList<CharacterRange>** included, |
| ZoneList<CharacterRange>** excluded); |
| // Whether a range list is in canonical form: Ranges ordered by from value, |
| @@ -305,28 +264,6 @@ |
| // adjacent ranges are merged. The resulting list may be shorter than the |
| // original, but cannot be longer. |
| static void Canonicalize(ZoneList<CharacterRange>* ranges); |
| - // Check how the set of characters defined by a CharacterRange list relates |
| - // to the set of word characters. List must be in canonical form. |
| - static SetRelation WordCharacterRelation(ZoneList<CharacterRange>* ranges); |
| - // Takes two character range lists (representing character sets) in canonical |
| - // form and merges them. |
| - // The characters that are only covered by the first set are added to |
| - // first_set_only_out. the characters that are only in the second set are |
| - // added to second_set_only_out, and the characters that are in both are |
| - // added to both_sets_out. |
| - // The pointers to first_set_only_out, second_set_only_out and both_sets_out |
| - // should be to empty lists, but they need not be distinct, and may be NULL. |
| - // If NULL, the characters are dropped, and if two arguments are the same |
| - // pointer, the result is the union of the two sets that would be created |
| - // if the pointers had been distinct. |
| - // This way, the Merge function can compute all the usual set operations: |
| - // union (all three out-sets are equal), intersection (only both_sets_out is |
| - // non-NULL), and set difference (only first_set is non-NULL). |
| - static void Merge(ZoneList<CharacterRange>* first_set, |
| - ZoneList<CharacterRange>* second_set, |
| - ZoneList<CharacterRange>* first_set_only_out, |
| - ZoneList<CharacterRange>* second_set_only_out, |
| - ZoneList<CharacterRange>* both_sets_out); |
| // Negate the contents of a character range in canonical form. |
| static void Negate(ZoneList<CharacterRange>* src, |
| ZoneList<CharacterRange>* dst); |
| @@ -421,90 +358,6 @@ |
| }; |
| -// Improve the speed that we scan for an initial point where a non-anchored |
| -// regexp can match by using a Boyer-Moore-like table. This is done by |
| -// identifying non-greedy non-capturing loops in the nodes that eat any |
| -// character one at a time. For example in the middle of the regexp |
| -// /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly |
| -// inserted at the start of any non-anchored regexp. |
| -// |
| -// When we have found such a loop we look ahead in the nodes to find the set of |
| -// characters that can come at given distances. For example for the regexp |
| -// /.?foo/ we know that there are at least 3 characters ahead of us, and the |
| -// sets of characters that can occur are [any, [f, o], [o]]. We find a range in |
| -// the lookahead info where the set of characters is reasonably constrained. In |
| -// our example this is from index 1 to 2 (0 is not constrained). We can now |
| -// look 3 characters ahead and if we don't find one of [f, o] (the union of |
| -// [f, o] and [o]) then we can skip forwards by the range size (in this case 2). |
| -// |
| -// For Unicode input strings we do the same, but modulo 128. |
| -// |
| -// We also look at the first string fed to the regexp and use that to get a hint |
| -// of the character frequencies in the inputs. This affects the assessment of |
| -// whether the set of characters is 'reasonably constrained'. |
| -// |
| -// We also have another lookahead mechanism (called quick check in the code), |
| -// which uses a wide load of multiple characters followed by a mask and compare |
| -// to determine whether a match is possible at this point. |
| -class BoyerMooreLookahead { |
| - public: |
| - BoyerMooreLookahead(int length, int map_length, RegExpCompiler* compiler); |
| - |
| - int length() { return length_; } |
| - int max_char() { return max_char_; } |
| - RegExpCompiler* compiler() { return compiler_; } |
| - |
| - static const int kTooManyCharacters = 32; |
| - |
| - int Count(int map_number) { |
| - ZoneList<bool>* map = bitmaps_->at(map_number); |
| - if (map == NULL) return map_length_; |
| - int count = 0; |
| - for (int i = 0; i < map_length_; i++) { |
| - if (map->at(i)) count++; |
| - } |
| - return count; |
| - } |
| - |
| - void Set(int map_number, int character) { |
| - if (character > max_char_) return; |
| - ZoneList<bool>* map = bitmaps_->at(map_number); |
| - if (map == NULL) return; |
| - map->at(character & (map_length_ - 1)) = true; |
| - } |
| - |
| - void SetAll(int map_number) { |
| - bitmaps_->at(map_number) = NULL; |
| - } |
| - |
| - void SetRest(int from_map) { |
| - for (int i = from_map; i < length_; i++) SetAll(i); |
| - } |
| - bool EmitSkipInstructions(RegExpMacroAssembler* masm); |
| - |
| - private: |
| - // This is the value obtained by EatsAtLeast. If we do not have at least this |
| - // many characters left in the sample string then the match is bound to fail. |
| - // Therefore it is OK to read a character this far ahead of the current match |
| - // point. |
| - int length_; |
| - // We conservatively consider all character values modulo this length. For |
| - // ASCII there is no loss of precision, since this has a value of 128. |
| - int map_length_; |
| - RegExpCompiler* compiler_; |
| - // 0x7f for ASCII, 0xffff for UTF-16. |
| - int max_char_; |
| - ZoneList<ZoneList<bool>*>* bitmaps_; |
| - |
| - int GetSkipTable(int min_lookahead, |
| - int max_lookahead, |
| - Handle<ByteArray> boolean_skip_table); |
| - bool FindWorthwhileInterval(int* from, int* to); |
| - int FindBestInterval( |
| - int max_number_of_chars, int old_biggest_points, int* from, int* to); |
| -}; |
| - |
| - |
| #define FOR_EACH_NODE_TYPE(VISIT) \ |
| VISIT(End) \ |
| VISIT(Action) \ |
| @@ -687,7 +540,9 @@ |
| class RegExpNode: public ZoneObject { |
| public: |
| - RegExpNode() : first_character_set_(NULL), trace_count_(0) { } |
| + RegExpNode() : first_character_set_(NULL), trace_count_(0) { |
| + bm_info_[0] = bm_info_[1] = NULL; |
| + } |
| virtual ~RegExpNode(); |
| virtual void Accept(NodeVisitor* visitor) = 0; |
| // Generates a goto to this node or actually generates the code at this point. |
| @@ -731,7 +586,7 @@ |
| // Collects information on the possible code units (mod 128) that can match if |
| // we look forward. This is used for a Boyer-Moore-like string searching |
| // implementation. TODO(erikcorry): This should share more code with |
| - // EatsAtLeast, GetQuickCheckDetails and ComputeFirstCharacterSet. |
| + // EatsAtLeast, GetQuickCheckDetails. |
| virtual void FillInBMInfo( |
| int offset, BoyerMooreLookahead* bm, bool not_at_start) { |
| UNREACHABLE(); |
| @@ -759,17 +614,6 @@ |
| SiblingList* siblings() { return &siblings_; } |
| void set_siblings(SiblingList* other) { siblings_ = *other; } |
| - // Return the set of possible next characters recognized by the regexp |
| - // (or a safe subset, potentially the set of all characters). |
| - ZoneList<CharacterRange>* FirstCharacterSet(); |
| - |
| - // Compute (if possible within the budget of traversed nodes) the |
| - // possible first characters of the input matched by this node and |
| - // its continuation. Returns the remaining budget after the computation. |
| - // If the budget is spent, the result is negative, and the cached |
| - // first_character_set_ value isn't set. |
| - virtual int ComputeFirstCharacterSet(int budget); |
| - |
| // Get and set the cached first character set value. |
| ZoneList<CharacterRange>* first_character_set() { |
| return first_character_set_; |
| @@ -777,6 +621,9 @@ |
| void set_first_character_set(ZoneList<CharacterRange>* character_set) { |
| first_character_set_ = character_set; |
| } |
| + BoyerMooreLookahead* bm_info(bool not_at_start) { |
| + return bm_info_[not_at_start ? 1 : 0]; |
| + } |
| protected: |
| enum LimitResult { DONE, CONTINUE }; |
| @@ -801,6 +648,10 @@ |
| // processed before it is on a usable state. |
| virtual RegExpNode* Clone() = 0; |
| + void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) { |
| + bm_info_[not_at_start ? 1 : 0] = bm; |
| + } |
| + |
| private: |
| static const int kFirstCharBudget = 10; |
| Label label_; |
| @@ -813,6 +664,7 @@ |
| // a trace, in which case it is generic and can be reused by flushing the |
| // deferred operations in the current trace and generating a goto. |
| int trace_count_; |
| + BoyerMooreLookahead* bm_info_[2]; |
| }; |
| @@ -833,8 +685,8 @@ |
| return (from_ <= value) && (value <= to_); |
| } |
| bool is_empty() { return from_ == kNone; } |
| - int from() { return from_; } |
| - int to() { return to_; } |
| + int from() const { return from_; } |
| + int to() const { return to_; } |
| static Interval Empty() { return Interval(); } |
| static const int kNone = -1; |
| private: |
| @@ -852,6 +704,7 @@ |
| virtual void FillInBMInfo( |
| int offset, BoyerMooreLookahead* bm, bool not_at_start) { |
| on_success_->FillInBMInfo(offset, bm, not_at_start); |
| + if (offset == 0) set_bm_info(not_at_start, bm); |
| } |
| private: |
| RegExpNode* on_success_; |
| @@ -905,7 +758,6 @@ |
| // TODO(erikcorry): We should allow some action nodes in greedy loops. |
| virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; } |
| virtual ActionNode* Clone() { return new ActionNode(*this); } |
| - virtual int ComputeFirstCharacterSet(int budget); |
| private: |
| union { |
| @@ -978,7 +830,6 @@ |
| return result; |
| } |
| void CalculateOffsets(); |
| - virtual int ComputeFirstCharacterSet(int budget); |
| private: |
| enum TextEmitPassType { |
| @@ -1009,12 +860,7 @@ |
| AT_START, |
| AT_BOUNDARY, |
| AT_NON_BOUNDARY, |
| - AFTER_NEWLINE, |
| - // Types not directly expressible in regexp syntax. |
| - // Used for modifying a boundary node if its following character is |
| - // known to be word and/or non-word. |
| - AFTER_NONWORD_CHARACTER, |
| - AFTER_WORD_CHARACTER |
| + AFTER_NEWLINE |
| }; |
| static AssertionNode* AtEnd(RegExpNode* on_success) { |
| return new AssertionNode(AT_END, on_success); |
| @@ -1042,12 +888,16 @@ |
| bool not_at_start); |
| virtual void FillInBMInfo( |
| int offset, BoyerMooreLookahead* bm, bool not_at_start); |
| - virtual int ComputeFirstCharacterSet(int budget); |
| virtual AssertionNode* Clone() { return new AssertionNode(*this); } |
| AssertionNodeType type() { return type_; } |
| void set_type(AssertionNodeType type) { type_ = type; } |
| private: |
| + void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace); |
| + enum IfPrevious { kIsNonWord, kIsWord }; |
| + void BacktrackIfPrevious(RegExpCompiler* compiler, |
| + Trace* trace, |
| + IfPrevious backtrack_if_previous); |
| AssertionNode(AssertionNodeType t, RegExpNode* on_success) |
| : SeqRegExpNode(on_success), type_(t) { } |
| AssertionNodeType type_; |
| @@ -1076,13 +926,8 @@ |
| return; |
| } |
| virtual void FillInBMInfo( |
| - int offset, BoyerMooreLookahead* bm, bool not_at_start) { |
| - // Working out the set of characters that a backreference can match is too |
| - // hard, so we just say that any character can match. |
| - bm->SetRest(offset); |
| - } |
| + int offset, BoyerMooreLookahead* bm, bool not_at_start); |
| virtual BackReferenceNode* Clone() { return new BackReferenceNode(*this); } |
| - virtual int ComputeFirstCharacterSet(int budget); |
| private: |
| int start_reg_; |
| @@ -1249,6 +1094,7 @@ |
| virtual void FillInBMInfo( |
| int offset, BoyerMooreLookahead* bm, bool not_at_start) { |
| alternatives_->at(1).node()->FillInBMInfo(offset, bm, not_at_start); |
| + if (offset == 0) set_bm_info(not_at_start, bm); |
| } |
| // For a negative lookahead we don't emit the quick check for the |
| // alternative that is expected to fail. This is because quick check code |
| @@ -1256,7 +1102,6 @@ |
| // characters, but on a negative lookahead the negative branch did not take |
| // part in that calculation (EatsAtLeast) so the assumptions don't hold. |
| virtual bool try_to_emit_quick_check_for_alternative(int i) { return i != 0; } |
| - virtual int ComputeFirstCharacterSet(int budget); |
| }; |
| @@ -1279,7 +1124,6 @@ |
| bool not_at_start); |
| virtual void FillInBMInfo( |
| int offset, BoyerMooreLookahead* bm, bool not_at_start); |
| - virtual int ComputeFirstCharacterSet(int budget); |
| virtual LoopChoiceNode* Clone() { return new LoopChoiceNode(*this); } |
| RegExpNode* loop_node() { return loop_node_; } |
| RegExpNode* continue_node() { return continue_node_; } |
| @@ -1300,6 +1144,141 @@ |
| }; |
| +// Improve the speed that we scan for an initial point where a non-anchored |
| +// regexp can match by using a Boyer-Moore-like table. This is done by |
| +// identifying non-greedy non-capturing loops in the nodes that eat any |
| +// character one at a time. For example in the middle of the regexp |
| +// /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly |
| +// inserted at the start of any non-anchored regexp. |
| +// |
| +// When we have found such a loop we look ahead in the nodes to find the set of |
| +// characters that can come at given distances. For example for the regexp |
| +// /.?foo/ we know that there are at least 3 characters ahead of us, and the |
| +// sets of characters that can occur are [any, [f, o], [o]]. We find a range in |
| +// the lookahead info where the set of characters is reasonably constrained. In |
| +// our example this is from index 1 to 2 (0 is not constrained). We can now |
| +// look 3 characters ahead and if we don't find one of [f, o] (the union of |
| +// [f, o] and [o]) then we can skip forwards by the range size (in this case 2). |
| +// |
| +// For Unicode input strings we do the same, but modulo 128. |
| +// |
| +// We also look at the first string fed to the regexp and use that to get a hint |
| +// of the character frequencies in the inputs. This affects the assessment of |
| +// whether the set of characters is 'reasonably constrained'. |
| +// |
| +// We also have another lookahead mechanism (called quick check in the code), |
| +// which uses a wide load of multiple characters followed by a mask and compare |
| +// to determine whether a match is possible at this point. |
| +enum ContainedInLattice { |
| + kNotYet = 0, |
| + kLatticeIn = 1, |
| + kLatticeOut = 2, |
| + kLatticeUnknown = 3 // Can also mean both in and out. |
| +}; |
| + |
| + |
| +ContainedInLattice AddRange(ContainedInLattice a, |
| + const int* ranges, |
| + int ranges_size, |
| + Interval new_range); |
| + |
| + |
| +class BoyerMoorePositionInfo : public ZoneObject { |
| + public: |
| + BoyerMoorePositionInfo() |
| + : map_(new ZoneList<bool>(kMapSize)), |
| + map_count_(0), |
| + w_(kNotYet), |
| + s_(kNotYet), |
| + d_(kNotYet), |
| + surrogate_(kNotYet) { |
| + for (int i = 0; i < kMapSize; i++) { |
| + map_->Add(false); |
| + } |
| + } |
| + |
| + bool& at(int i) { return map_->at(i); } |
| + |
| + static const int kMapSize = 128; |
| + static const int kMask = kMapSize -1 ; |
|
ulan
2012/04/16 13:52:22
Should be kMapSize - 1;
Erik Corry
2012/04/17 07:51:45
Done.
|
| + |
| + int map_count() const { return map_count_; } |
| + |
| + void Set(int character); |
| + void SetInterval(const Interval& interval); |
| + void SetAll(); |
| + bool is_non_word() { return w_ == kLatticeOut; } |
| + bool is_word() { return w_ == kLatticeIn; } |
| + |
| + private: |
| + ZoneList<bool>* map_; |
| + int map_count_; // Number of set bits in the map. |
| + ContainedInLattice w_; // The \w character class. |
| + ContainedInLattice s_; // The \s character class. |
| + ContainedInLattice d_; // The \d character class. |
| + ContainedInLattice surrogate_; // Surrogate UTF-16 code units. |
| +}; |
| + |
| + |
| +class BoyerMooreLookahead : public ZoneObject { |
| + public: |
| + BoyerMooreLookahead(int length, RegExpCompiler* compiler); |
| + |
| + int length() { return length_; } |
| + int max_char() { return max_char_; } |
| + RegExpCompiler* compiler() { return compiler_; } |
| + |
| + int Count(int map_number) { |
| + return bitmaps_->at(map_number)->map_count(); |
| + } |
| + |
| + BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); } |
| + |
| + void Set(int map_number, int character) { |
| + if (character > max_char_) return; |
| + BoyerMoorePositionInfo* info = bitmaps_->at(map_number); |
| + info->Set(character); |
| + } |
| + |
| + void SetInterval(int map_number, const Interval& interval) { |
| + if (interval.from() > max_char_) return; |
| + BoyerMoorePositionInfo* info = bitmaps_->at(map_number); |
| + if (interval.to() > max_char_) { |
| + info->SetInterval(Interval(interval.from(), max_char_)); |
| + } else { |
| + info->SetInterval(interval); |
| + } |
| + } |
| + |
| + void SetAll(int map_number) { |
| + bitmaps_->at(map_number)->SetAll(); |
| + } |
| + |
| + void SetRest(int from_map) { |
| + for (int i = from_map; i < length_; i++) SetAll(i); |
| + } |
| + bool EmitSkipInstructions(RegExpMacroAssembler* masm); |
| + |
| + private: |
| + // This is the value obtained by EatsAtLeast. If we do not have at least this |
| + // many characters left in the sample string then the match is bound to fail. |
| + // Therefore it is OK to read a character this far ahead of the current match |
| + // point. |
| + int length_; |
| + RegExpCompiler* compiler_; |
| + // 0x7f for ASCII, 0xffff for UTF-16. |
| + int max_char_; |
| + ZoneList<BoyerMoorePositionInfo*>* bitmaps_; |
| + |
| + int GetSkipTable(int min_lookahead, |
| + int max_lookahead, |
| + Handle<ByteArray> boolean_skip_table); |
| + bool FindWorthwhileInterval(int* from, int* to); |
| + int FindBestInterval( |
| + int max_number_of_chars, int old_biggest_points, int* from, int* to); |
| +}; |
| + |
| + |
| // There are many ways to generate code for a node. This class encapsulates |
| // the current way we should be generating. In other words it encapsulates |
| // the current state of the code generator. The effect of this is that we |