OLD | NEW |
---|---|
1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. |
2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
4 // met: | 4 // met: |
5 // | 5 // |
6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
(...skipping 90 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
101 Factory* factory = isolate->factory(); | 101 Factory* factory = isolate->factory(); |
102 Handle<FixedArray> elements = factory->NewFixedArray(2); | 102 Handle<FixedArray> elements = factory->NewFixedArray(2); |
103 elements->set(0, *pattern); | 103 elements->set(0, *pattern); |
104 elements->set(1, *error_text); | 104 elements->set(1, *error_text); |
105 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); | 105 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); |
106 Handle<Object> regexp_err = factory->NewSyntaxError(message, array); | 106 Handle<Object> regexp_err = factory->NewSyntaxError(message, array); |
107 isolate->Throw(*regexp_err); | 107 isolate->Throw(*regexp_err); |
108 } | 108 } |
109 | 109 |
110 | 110 |
111 static ContainedInLattice Combine(ContainedInLattice a, bool inside) { | |
ulan
2012/04/16 13:52:22
Passing kLatticeIn/kLatticeOut instead of bool wou
Erik Corry
2012/04/17 07:51:45
Done.
| |
112 if (a == kLatticeUnknown) return a; | |
113 if (a == kNotYet) return inside ? kLatticeIn : kLatticeOut; | |
114 if (a == kLatticeIn) return inside ? kLatticeIn : kLatticeUnknown; | |
115 if (a == kLatticeOut) return inside ? kLatticeUnknown : kLatticeOut; | |
116 UNREACHABLE(); | |
117 return kLatticeUnknown; | |
118 } | |
119 | |
120 | |
121 ContainedInLattice AddRange(ContainedInLattice containment, | |
122 const int* ranges, | |
123 int ranges_length, | |
124 Interval new_range) { | |
125 if (containment == kLatticeUnknown) return containment; | |
126 bool inside = false; | |
127 int last = 0; | |
128 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) { | |
129 // Consider the range from last to ranges[i]. | |
130 // We haven't got to the new range yet. | |
131 if (ranges[i] <= new_range.from()) continue; | |
132 // New range is wholly inside last-ranges[i]. | |
133 if (last <= new_range.from() && ranges[i] > new_range.to()) { | |
ulan
2012/04/16 13:52:22
So ranges[i] is not inclusive, but new_range.to()
Erik Corry
2012/04/17 07:51:45
Done.
| |
134 return Combine(containment, inside); | |
135 } | |
136 // Otherwise it | |
ulan
2012/04/16 13:52:22
Trailing space.
Erik Corry
2012/04/17 07:51:45
Done.
| |
137 return kLatticeUnknown; | |
138 } | |
139 return containment; | |
140 } | |
141 | |
142 | |
111 // More makes code generation slower, less makes V8 benchmark score lower. | 143 // More makes code generation slower, less makes V8 benchmark score lower. |
112 const int kMaxLookaheadForBoyerMoore = 8; | 144 const int kMaxLookaheadForBoyerMoore = 8; |
113 // In a 3-character pattern you can maximally step forwards 3 characters | 145 // In a 3-character pattern you can maximally step forwards 3 characters |
114 // at a time, which is not always enough to pay for the extra logic. | 146 // at a time, which is not always enough to pay for the extra logic. |
115 const int kPatternTooShortForBoyerMoore = 2; | 147 const int kPatternTooShortForBoyerMoore = 2; |
116 | 148 |
117 | 149 |
118 // Identifies the sort of regexps where the regexp engine is faster | 150 // Identifies the sort of regexps where the regexp engine is faster |
119 // than the code used for atom matches. | 151 // than the code used for atom matches. |
120 static bool HasFewDifferentCharacters(Handle<String> pattern) { | 152 static bool HasFewDifferentCharacters(Handle<String> pattern) { |
(...skipping 2029 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
2150 | 2182 |
2151 | 2183 |
2152 void ActionNode::FillInBMInfo(int offset, | 2184 void ActionNode::FillInBMInfo(int offset, |
2153 BoyerMooreLookahead* bm, | 2185 BoyerMooreLookahead* bm, |
2154 bool not_at_start) { | 2186 bool not_at_start) { |
2155 if (type_ == BEGIN_SUBMATCH) { | 2187 if (type_ == BEGIN_SUBMATCH) { |
2156 bm->SetRest(offset); | 2188 bm->SetRest(offset); |
2157 } else if (type_ != POSITIVE_SUBMATCH_SUCCESS) { | 2189 } else if (type_ != POSITIVE_SUBMATCH_SUCCESS) { |
2158 on_success()->FillInBMInfo(offset, bm, not_at_start); | 2190 on_success()->FillInBMInfo(offset, bm, not_at_start); |
2159 } | 2191 } |
2192 if (offset == 0) set_bm_info(not_at_start, bm); | |
ulan
2012/04/16 13:52:22
As discussed offline, it probably better to move t
Erik Corry
2012/04/17 07:51:45
Done.
| |
2160 } | 2193 } |
2161 | 2194 |
2162 | 2195 |
2163 int AssertionNode::EatsAtLeast(int still_to_find, | 2196 int AssertionNode::EatsAtLeast(int still_to_find, |
2164 int recursion_depth, | 2197 int recursion_depth, |
2165 bool not_at_start) { | 2198 bool not_at_start) { |
2166 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; | 2199 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; |
2167 // If we know we are not at the start and we are asked "how many characters | 2200 // If we know we are not at the start and we are asked "how many characters |
2168 // will you match if you succeed?" then we can answer anything since false | 2201 // will you match if you succeed?" then we can answer anything since false |
2169 // implies false. So lets just return the max answer (still_to_find) since | 2202 // implies false. So lets just return the max answer (still_to_find) since |
2170 // that won't prevent us from preloading a lot of characters for the other | 2203 // that won't prevent us from preloading a lot of characters for the other |
2171 // branches in the node graph. | 2204 // branches in the node graph. |
2172 if (type() == AT_START && not_at_start) return still_to_find; | 2205 if (type() == AT_START && not_at_start) return still_to_find; |
2173 return on_success()->EatsAtLeast(still_to_find, | 2206 return on_success()->EatsAtLeast(still_to_find, |
2174 recursion_depth + 1, | 2207 recursion_depth + 1, |
2175 not_at_start); | 2208 not_at_start); |
2176 } | 2209 } |
2177 | 2210 |
2178 | 2211 |
2179 void AssertionNode::FillInBMInfo( | 2212 void AssertionNode::FillInBMInfo( |
2180 int offset, BoyerMooreLookahead* bm, bool not_at_start) { | 2213 int offset, BoyerMooreLookahead* bm, bool not_at_start) { |
2181 // Match the behaviour of EatsAtLeast on this node. | 2214 // Match the behaviour of EatsAtLeast on this node. |
2182 if (type() == AT_START && not_at_start) return; | 2215 if (type() == AT_START && not_at_start) return; |
2183 on_success()->FillInBMInfo(offset, bm, not_at_start); | 2216 on_success()->FillInBMInfo(offset, bm, not_at_start); |
2217 if (offset == 0) set_bm_info(not_at_start, bm); | |
2184 } | 2218 } |
2185 | 2219 |
2186 | 2220 |
2187 int BackReferenceNode::EatsAtLeast(int still_to_find, | 2221 int BackReferenceNode::EatsAtLeast(int still_to_find, |
2188 int recursion_depth, | 2222 int recursion_depth, |
2189 bool not_at_start) { | 2223 bool not_at_start) { |
2190 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; | 2224 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; |
2191 return on_success()->EatsAtLeast(still_to_find, | 2225 return on_success()->EatsAtLeast(still_to_find, |
2192 recursion_depth + 1, | 2226 recursion_depth + 1, |
2193 not_at_start); | 2227 not_at_start); |
(...skipping 416 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
2610 if (body_can_be_zero_length_ || info()->visited) return; | 2644 if (body_can_be_zero_length_ || info()->visited) return; |
2611 VisitMarker marker(info()); | 2645 VisitMarker marker(info()); |
2612 return ChoiceNode::GetQuickCheckDetails(details, | 2646 return ChoiceNode::GetQuickCheckDetails(details, |
2613 compiler, | 2647 compiler, |
2614 characters_filled_in, | 2648 characters_filled_in, |
2615 not_at_start); | 2649 not_at_start); |
2616 } | 2650 } |
2617 | 2651 |
2618 | 2652 |
2619 void LoopChoiceNode::FillInBMInfo( | 2653 void LoopChoiceNode::FillInBMInfo( |
2620 int offset, BoyerMooreLookahead* bm, bool nas) { | 2654 int offset, BoyerMooreLookahead* bm, bool not_at_start) { |
2621 if (body_can_be_zero_length_) { | 2655 if (body_can_be_zero_length_) { |
2622 bm->SetRest(offset); | 2656 bm->SetRest(offset); |
2657 if (offset == 0) set_bm_info(not_at_start, bm); | |
2623 return; | 2658 return; |
2624 } | 2659 } |
2625 ChoiceNode::FillInBMInfo(offset, bm, nas); | 2660 ChoiceNode::FillInBMInfo(offset, bm, not_at_start); |
2661 if (offset == 0) set_bm_info(not_at_start, bm); | |
2626 } | 2662 } |
2627 | 2663 |
2628 | 2664 |
2629 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, | 2665 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, |
2630 RegExpCompiler* compiler, | 2666 RegExpCompiler* compiler, |
2631 int characters_filled_in, | 2667 int characters_filled_in, |
2632 bool not_at_start) { | 2668 bool not_at_start) { |
2633 not_at_start = (not_at_start || not_at_start_); | 2669 not_at_start = (not_at_start || not_at_start_); |
2634 int choice_count = alternatives_->length(); | 2670 int choice_count = alternatives_->length(); |
2635 ASSERT(choice_count > 0); | 2671 ASSERT(choice_count > 0); |
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
2703 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok); | 2739 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok); |
2704 } | 2740 } |
2705 assembler->CheckCharacter('\n', &ok); | 2741 assembler->CheckCharacter('\n', &ok); |
2706 assembler->CheckNotCharacter('\r', new_trace.backtrack()); | 2742 assembler->CheckNotCharacter('\r', new_trace.backtrack()); |
2707 } | 2743 } |
2708 assembler->Bind(&ok); | 2744 assembler->Bind(&ok); |
2709 on_success->Emit(compiler, &new_trace); | 2745 on_success->Emit(compiler, &new_trace); |
2710 } | 2746 } |
2711 | 2747 |
2712 | 2748 |
2713 // Emit the code to handle \b and \B (word-boundary or non-word-boundary) | |
2714 // when we know whether the next character must be a word character or not. | |
2715 static void EmitHalfBoundaryCheck(AssertionNode::AssertionNodeType type, | |
2716 RegExpCompiler* compiler, | |
2717 RegExpNode* on_success, | |
2718 Trace* trace) { | |
2719 RegExpMacroAssembler* assembler = compiler->macro_assembler(); | |
2720 Label done; | |
2721 | |
2722 Trace new_trace(*trace); | |
2723 | |
2724 bool expect_word_character = (type == AssertionNode::AFTER_WORD_CHARACTER); | |
2725 Label* on_word = expect_word_character ? &done : new_trace.backtrack(); | |
2726 Label* on_non_word = expect_word_character ? new_trace.backtrack() : &done; | |
2727 | |
2728 // Check whether previous character was a word character. | |
2729 switch (trace->at_start()) { | |
2730 case Trace::TRUE: | |
2731 if (expect_word_character) { | |
2732 assembler->GoTo(on_non_word); | |
2733 } | |
2734 break; | |
2735 case Trace::UNKNOWN: | |
2736 ASSERT_EQ(0, trace->cp_offset()); | |
2737 assembler->CheckAtStart(on_non_word); | |
2738 // Fall through. | |
2739 case Trace::FALSE: | |
2740 int prev_char_offset = trace->cp_offset() - 1; | |
2741 assembler->LoadCurrentCharacter(prev_char_offset, NULL, false, 1); | |
2742 EmitWordCheck(assembler, on_word, on_non_word, expect_word_character); | |
2743 // We may or may not have loaded the previous character. | |
2744 new_trace.InvalidateCurrentCharacter(); | |
2745 } | |
2746 | |
2747 assembler->Bind(&done); | |
2748 | |
2749 on_success->Emit(compiler, &new_trace); | |
2750 } | |
2751 | |
2752 | |
2753 // Emit the code to handle \b and \B (word-boundary or non-word-boundary). | 2749 // Emit the code to handle \b and \B (word-boundary or non-word-boundary). |
2754 static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type, | 2750 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) { |
2755 RegExpCompiler* compiler, | |
2756 RegExpNode* on_success, | |
2757 Trace* trace) { | |
2758 RegExpMacroAssembler* assembler = compiler->macro_assembler(); | 2751 RegExpMacroAssembler* assembler = compiler->macro_assembler(); |
2759 Label before_non_word; | 2752 Trace::TriBool next_is_word_character = Trace::UNKNOWN; |
2760 Label before_word; | 2753 bool not_at_start = (trace->at_start() == Trace::FALSE); |
2761 if (trace->characters_preloaded() != 1) { | 2754 BoyerMooreLookahead* lookahead = bm_info(not_at_start); |
2762 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word); | 2755 if (lookahead == NULL) { |
2756 int eats_at_least = | |
2757 Min(kMaxLookaheadForBoyerMoore, | |
2758 EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start)); | |
2759 if (eats_at_least >= 1) { | |
2760 BoyerMooreLookahead* bm = | |
2761 new BoyerMooreLookahead(eats_at_least, compiler); | |
2762 FillInBMInfo(0, bm, not_at_start); | |
2763 if (bm->at(0)->is_non_word()) next_is_word_character = Trace::FALSE; | |
2764 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE; | |
ulan
2012/04/16 13:52:22
Consider writing the two lines above as:
next_is_w
Erik Corry
2012/04/17 07:51:45
Not the same, since it could be neither known to b
| |
2765 } | |
2766 } else { | |
2767 if (lookahead->at(0)->is_non_word()) next_is_word_character = Trace::FALSE; | |
2768 if (lookahead->at(0)->is_word()) next_is_word_character = Trace::TRUE; | |
2763 } | 2769 } |
2764 // Fall through on non-word. | 2770 bool at_boundary = (type_ == AssertionNode::AT_BOUNDARY); |
2765 EmitWordCheck(assembler, &before_word, &before_non_word, false); | 2771 if (next_is_word_character == Trace::UNKNOWN) { |
2772 Label before_non_word; | |
2773 Label before_word; | |
2774 if (trace->characters_preloaded() != 1) { | |
2775 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word); | |
2776 } | |
2777 // Fall through on non-word. | |
2778 EmitWordCheck(assembler, &before_word, &before_non_word, false); | |
2779 // Next character is not a word character. | |
2780 assembler->Bind(&before_non_word); | |
2781 Label ok; | |
2782 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); | |
2783 assembler->GoTo(&ok); | |
2766 | 2784 |
2767 // We will be loading the previous character into the current character | 2785 assembler->Bind(&before_word); |
2768 // register. | 2786 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); |
2787 assembler->Bind(&ok); | |
2788 } else if (next_is_word_character == Trace::TRUE) { | |
2789 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); | |
2790 } else { | |
2791 ASSERT(next_is_word_character == Trace::FALSE); | |
2792 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); | |
2793 } | |
2794 } | |
2795 | |
2796 | |
2797 void AssertionNode::BacktrackIfPrevious( | |
2798 RegExpCompiler* compiler, | |
2799 Trace* trace, | |
2800 AssertionNode::IfPrevious backtrack_if_previous) { | |
2801 RegExpMacroAssembler* assembler = compiler->macro_assembler(); | |
2769 Trace new_trace(*trace); | 2802 Trace new_trace(*trace); |
2770 new_trace.InvalidateCurrentCharacter(); | 2803 new_trace.InvalidateCurrentCharacter(); |
2771 | 2804 |
2772 Label ok; | 2805 Label fall_through, dummy; |
2773 Label* boundary; | |
2774 Label* not_boundary; | |
2775 if (type == AssertionNode::AT_BOUNDARY) { | |
2776 boundary = &ok; | |
2777 not_boundary = new_trace.backtrack(); | |
2778 } else { | |
2779 not_boundary = &ok; | |
2780 boundary = new_trace.backtrack(); | |
2781 } | |
2782 | 2806 |
2783 // Next character is not a word character. | 2807 Label* non_word = backtrack_if_previous == kIsNonWord ? |
2784 assembler->Bind(&before_non_word); | 2808 new_trace.backtrack() : |
2809 &fall_through; | |
2810 Label* word = backtrack_if_previous == kIsNonWord ? | |
2811 &fall_through : | |
2812 new_trace.backtrack(); | |
2813 | |
2785 if (new_trace.cp_offset() == 0) { | 2814 if (new_trace.cp_offset() == 0) { |
2786 // The start of input counts as a non-word character, so the question is | 2815 // The start of input counts as a non-word character, so the question is |
2787 // decided if we are at the start. | 2816 // decided if we are at the start. |
2788 assembler->CheckAtStart(not_boundary); | 2817 assembler->CheckAtStart(non_word); |
2789 } | 2818 } |
2790 // We already checked that we are not at the start of input so it must be | 2819 // We already checked that we are not at the start of input so it must be |
2791 // OK to load the previous character. | 2820 // OK to load the previous character. |
2792 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, | 2821 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false); |
2793 &ok, // Unused dummy label in this call. | 2822 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord); |
2794 false); | |
2795 // Fall through on non-word. | |
2796 EmitWordCheck(assembler, boundary, not_boundary, false); | |
2797 assembler->GoTo(not_boundary); | |
2798 | 2823 |
2799 // Next character is a word character. | 2824 assembler->Bind(&fall_through); |
2800 assembler->Bind(&before_word); | 2825 on_success()->Emit(compiler, &new_trace); |
2801 if (new_trace.cp_offset() == 0) { | |
2802 // The start of input counts as a non-word character, so the question is | |
2803 // decided if we are at the start. | |
2804 assembler->CheckAtStart(boundary); | |
2805 } | |
2806 // We already checked that we are not at the start of input so it must be | |
2807 // OK to load the previous character. | |
2808 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, | |
2809 &ok, // Unused dummy label in this call. | |
2810 false); | |
2811 bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY); | |
2812 EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word); | |
2813 | |
2814 assembler->Bind(&ok); | |
2815 | |
2816 on_success->Emit(compiler, &new_trace); | |
2817 } | 2826 } |
2818 | 2827 |
2819 | 2828 |
2820 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details, | 2829 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details, |
2821 RegExpCompiler* compiler, | 2830 RegExpCompiler* compiler, |
2822 int filled_in, | 2831 int filled_in, |
2823 bool not_at_start) { | 2832 bool not_at_start) { |
2824 if (type_ == AT_START && not_at_start) { | 2833 if (type_ == AT_START && not_at_start) { |
2825 details->set_cannot_match(); | 2834 details->set_cannot_match(); |
2826 return; | 2835 return; |
(...skipping 27 matching lines...) Expand all Loading... | |
2854 on_success()->Emit(compiler, &at_start_trace); | 2863 on_success()->Emit(compiler, &at_start_trace); |
2855 return; | 2864 return; |
2856 } | 2865 } |
2857 } | 2866 } |
2858 break; | 2867 break; |
2859 case AFTER_NEWLINE: | 2868 case AFTER_NEWLINE: |
2860 EmitHat(compiler, on_success(), trace); | 2869 EmitHat(compiler, on_success(), trace); |
2861 return; | 2870 return; |
2862 case AT_BOUNDARY: | 2871 case AT_BOUNDARY: |
2863 case AT_NON_BOUNDARY: { | 2872 case AT_NON_BOUNDARY: { |
2864 EmitBoundaryCheck(type_, compiler, on_success(), trace); | 2873 EmitBoundaryCheck(compiler, trace); |
2865 return; | 2874 return; |
2866 } | 2875 } |
2867 case AFTER_WORD_CHARACTER: | |
2868 case AFTER_NONWORD_CHARACTER: { | |
2869 EmitHalfBoundaryCheck(type_, compiler, on_success(), trace); | |
2870 } | |
2871 } | 2876 } |
2872 on_success()->Emit(compiler, trace); | 2877 on_success()->Emit(compiler, trace); |
2873 } | 2878 } |
2874 | 2879 |
2875 | 2880 |
2876 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { | 2881 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { |
2877 if (quick_check == NULL) return false; | 2882 if (quick_check == NULL) return false; |
2878 if (offset >= quick_check->characters()) return false; | 2883 if (offset >= quick_check->characters()) return false; |
2879 return quick_check->positions(offset)->determines_perfectly; | 2884 return quick_check->positions(offset)->determines_perfectly; |
2880 } | 2885 } |
(...skipping 389 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
3270 return alt_gens_[i]; | 3275 return alt_gens_[i]; |
3271 } | 3276 } |
3272 | 3277 |
3273 private: | 3278 private: |
3274 static const int kAFew = 10; | 3279 static const int kAFew = 10; |
3275 ZoneList<AlternativeGeneration*> alt_gens_; | 3280 ZoneList<AlternativeGeneration*> alt_gens_; |
3276 AlternativeGeneration a_few_alt_gens_[kAFew]; | 3281 AlternativeGeneration a_few_alt_gens_[kAFew]; |
3277 }; | 3282 }; |
3278 | 3283 |
3279 | 3284 |
3285 // The '2' variant is has inclusive from and exclusive to. | |
3286 static const int kSpaceRanges2[] = { '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, | |
ulan
2012/04/16 13:52:22
Consider adding a static assert that the range cou
Erik Corry
2012/04/17 07:51:45
AddRange jsregexp.cc:118 uses all the elements in
| |
3287 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A, | |
3288 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, 0x10000 }; | |
3289 static const int kSpaceRange2Count = ARRAY_SIZE(kSpaceRanges2); | |
3290 | |
3291 static const int kWordRanges2[] = { | |
3292 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 }; | |
3293 static const int kWordRange2Count = ARRAY_SIZE(kWordRanges2); | |
3294 static const int kDigitRanges2[] = { '0', '9' + 1, 0x10000 }; | |
3295 static const int kDigitRange2Count = ARRAY_SIZE(kDigitRanges2); | |
3296 static const int kSurrogateRanges2[] = { 0xd800, 0xe000, 0x10000 }; | |
3297 static const int kSurrogateRange2Count = ARRAY_SIZE(kSurrogateRanges2); | |
3298 static const int kLineTerminatorRanges2[] = { 0x000A, 0x000B, 0x000D, 0x000E, | |
3299 0x2028, 0x202A, 0x10000 }; | |
3300 static const int kLineTerminatorRange2Count = ARRAY_SIZE(kLineTerminatorRanges2) ; | |
ulan
2012/04/16 13:52:22
Long line above and too many new lines below.
Erik Corry
2012/04/17 07:51:45
Done.
| |
3301 | |
3302 | |
3303 | |
3304 void BoyerMoorePositionInfo::Set(int character) { | |
3305 SetInterval(Interval(character, character)); | |
3306 } | |
3307 | |
3308 | |
3309 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) { | |
3310 s_ = AddRange(s_, kSpaceRanges2, kSpaceRange2Count, interval); | |
3311 w_ = AddRange(w_, kWordRanges2, kWordRange2Count, interval); | |
3312 d_ = AddRange(d_, kDigitRanges2, kDigitRange2Count, interval); | |
3313 surrogate_ = | |
3314 AddRange(surrogate_, kSurrogateRanges2, kSurrogateRange2Count, interval); | |
3315 if (interval.to() - interval.from() >= kMapSize - 1) { | |
3316 if (map_count_ != kMapSize) { | |
3317 map_count_ = kMapSize; | |
3318 for (int i = 0; i < kMapSize; i++) map_->at(i) = true; | |
3319 } | |
3320 return; | |
3321 } | |
3322 for (int i = interval.from(); i <= interval.to(); i++) { | |
3323 int mod_character = (i & kMask); | |
3324 if (!map_->at(mod_character)) { | |
3325 map_count_++; | |
3326 map_->at(mod_character) = true; | |
3327 } | |
3328 if (map_count_ == kMapSize) return; | |
3329 } | |
3330 } | |
3331 | |
3332 | |
3333 void BoyerMoorePositionInfo::SetAll() { | |
3334 s_ = w_ = d_ = kLatticeUnknown; | |
3335 if (map_count_ != kMapSize) { | |
3336 map_count_ = kMapSize; | |
3337 for (int i = 0; i < kMapSize; i++) map_->at(i) = true; | |
3338 } | |
3339 } | |
3340 | |
3341 | |
3280 BoyerMooreLookahead::BoyerMooreLookahead( | 3342 BoyerMooreLookahead::BoyerMooreLookahead( |
3281 int length, int map_length, RegExpCompiler* compiler) | 3343 int length, RegExpCompiler* compiler) |
3282 : length_(length), | 3344 : length_(length), |
3283 map_length_(map_length), | |
3284 compiler_(compiler) { | 3345 compiler_(compiler) { |
3285 ASSERT(IsPowerOf2(map_length)); | |
3286 if (compiler->ascii()) { | 3346 if (compiler->ascii()) { |
3287 max_char_ = String::kMaxAsciiCharCode; | 3347 max_char_ = String::kMaxAsciiCharCode; |
3288 } else { | 3348 } else { |
3289 max_char_ = String::kMaxUtf16CodeUnit; | 3349 max_char_ = String::kMaxUtf16CodeUnit; |
3290 } | 3350 } |
3291 bitmaps_ = new ZoneList<ZoneList<bool>*>(length); | 3351 bitmaps_ = new ZoneList<BoyerMoorePositionInfo*>(length); |
3292 for (int i = 0; i < length; i++) { | 3352 for (int i = 0; i < length; i++) { |
3293 bitmaps_->Add(new ZoneList<bool>(map_length)); | 3353 bitmaps_->Add(new BoyerMoorePositionInfo()); |
3294 ZoneList<bool>* map = bitmaps_->at(i); | |
3295 for (int i = 0; i < map_length; i++) { | |
3296 map->Add(false); | |
3297 } | |
3298 } | 3354 } |
3299 } | 3355 } |
3300 | 3356 |
3301 | 3357 |
3302 // Find the longest range of lookahead that has the fewest number of different | 3358 // Find the longest range of lookahead that has the fewest number of different |
3303 // characters that can occur at a given position. Since we are optimizing two | 3359 // characters that can occur at a given position. Since we are optimizing two |
3304 // different parameters at once this is a tradeoff. | 3360 // different parameters at once this is a tradeoff. |
3305 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) { | 3361 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) { |
3306 int biggest_points = 0; | 3362 int biggest_points = 0; |
3307 for (int max_number_of_chars = 4; | 3363 for (int max_number_of_chars = 4; |
3308 max_number_of_chars < kTooManyCharacters; | 3364 max_number_of_chars < 32; |
ulan
2012/04/16 13:52:22
Now 32 is more magical :)
Erik Corry
2012/04/17 07:51:45
Done.
| |
3309 max_number_of_chars *= 2) { | 3365 max_number_of_chars *= 2) { |
3310 biggest_points = | 3366 biggest_points = |
3311 FindBestInterval(max_number_of_chars, biggest_points, from, to); | 3367 FindBestInterval(max_number_of_chars, biggest_points, from, to); |
3312 } | 3368 } |
3313 if (biggest_points == 0) return false; | 3369 if (biggest_points == 0) return false; |
3314 return true; | 3370 return true; |
3315 } | 3371 } |
3316 | 3372 |
3317 | 3373 |
3318 // Find the highest-points range between 0 and length_ where the character | 3374 // Find the highest-points range between 0 and length_ where the character |
3319 // information is not too vague. 'Too vague' means that there are more than | 3375 // information is not too vague. 'Too vague' means that there are more than |
3320 // max_number_of_chars that can occur at this position. Calculates the number | 3376 // max_number_of_chars that can occur at this position. Calculates the number |
3321 // of points as the product of width-of-the-range and | 3377 // of points as the product of width-of-the-range and |
3322 // probability-of-finding-one-of-the-characters, where the probability is | 3378 // probability-of-finding-one-of-the-characters, where the probability is |
3323 // calculated using the frequency distribution of the sample subject string. | 3379 // calculated using the frequency distribution of the sample subject string. |
3324 int BoyerMooreLookahead::FindBestInterval( | 3380 int BoyerMooreLookahead::FindBestInterval( |
3325 int max_number_of_chars, int old_biggest_points, int* from, int* to) { | 3381 int max_number_of_chars, int old_biggest_points, int* from, int* to) { |
3326 int biggest_points = old_biggest_points; | 3382 int biggest_points = old_biggest_points; |
3327 static const int kSize = RegExpMacroAssembler::kTableSize; | 3383 static const int kSize = RegExpMacroAssembler::kTableSize; |
3328 for (int i = 0; i < length_; ) { | 3384 for (int i = 0; i < length_; ) { |
3329 while (i < length_ && Count(i) > max_number_of_chars) i++; | 3385 while (i < length_ && Count(i) > max_number_of_chars) i++; |
3330 if (i == length_) break; | 3386 if (i == length_) break; |
3331 int remembered_from = i; | 3387 int remembered_from = i; |
3332 bool union_map[kSize]; | 3388 bool union_map[kSize]; |
3333 for (int j = 0; j < kSize; j++) union_map[j] = false; | 3389 for (int j = 0; j < kSize; j++) union_map[j] = false; |
3334 while (i < length_ && Count(i) <= max_number_of_chars) { | 3390 while (i < length_ && Count(i) <= max_number_of_chars) { |
3335 ZoneList<bool>* map = bitmaps_->at(i); | 3391 BoyerMoorePositionInfo* map = bitmaps_->at(i); |
3336 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j); | 3392 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j); |
3337 i++; | 3393 i++; |
3338 } | 3394 } |
3339 int frequency = 0; | 3395 int frequency = 0; |
3340 for (int j = 0; j < kSize; j++) { | 3396 for (int j = 0; j < kSize; j++) { |
3341 if (union_map[j]) { | 3397 if (union_map[j]) { |
3342 // Add 1 to the frequency to give a small per-character boost for | 3398 // Add 1 to the frequency to give a small per-character boost for |
3343 // the cases where our sampling is not good enough and many | 3399 // the cases where our sampling is not good enough and many |
3344 // characters have a frequency of zero. This means the frequency | 3400 // characters have a frequency of zero. This means the frequency |
3345 // can theoretically be up to 2*kSize though we treat it mostly as | 3401 // can theoretically be up to 2*kSize though we treat it mostly as |
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
3380 | 3436 |
3381 const int kSkipArrayEntry = 0; | 3437 const int kSkipArrayEntry = 0; |
3382 const int kDontSkipArrayEntry = 1; | 3438 const int kDontSkipArrayEntry = 1; |
3383 | 3439 |
3384 for (int i = 0; i < kSize; i++) { | 3440 for (int i = 0; i < kSize; i++) { |
3385 boolean_skip_table->set(i, kSkipArrayEntry); | 3441 boolean_skip_table->set(i, kSkipArrayEntry); |
3386 } | 3442 } |
3387 int skip = max_lookahead + 1 - min_lookahead; | 3443 int skip = max_lookahead + 1 - min_lookahead; |
3388 | 3444 |
3389 for (int i = max_lookahead; i >= min_lookahead; i--) { | 3445 for (int i = max_lookahead; i >= min_lookahead; i--) { |
3390 ZoneList<bool>* map = bitmaps_->at(i); | 3446 BoyerMoorePositionInfo* map = bitmaps_->at(i); |
3391 for (int j = 0; j < map_length_; j++) { | 3447 for (int j = 0; j < kSize; j++) { |
3392 if (map->at(j)) { | 3448 if (map->at(j)) { |
3393 boolean_skip_table->set(j, kDontSkipArrayEntry); | 3449 boolean_skip_table->set(j, kDontSkipArrayEntry); |
3394 } | 3450 } |
3395 } | 3451 } |
3396 } | 3452 } |
3397 | 3453 |
3398 return skip; | 3454 return skip; |
3399 } | 3455 } |
3400 | 3456 |
3401 | 3457 |
3402 // See comment above on the implementation of GetSkipTable. | 3458 // See comment above on the implementation of GetSkipTable. |
3403 bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { | 3459 bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { |
3460 const int kSize = RegExpMacroAssembler::kTableSize; | |
3461 | |
3404 int min_lookahead = 0; | 3462 int min_lookahead = 0; |
3405 int max_lookahead = 0; | 3463 int max_lookahead = 0; |
3406 | 3464 |
3407 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false; | 3465 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false; |
3408 | 3466 |
3409 bool found_single_character = false; | 3467 bool found_single_character = false; |
3410 bool abandoned_search_for_single_character = false; | |
3411 int single_character = 0; | 3468 int single_character = 0; |
3412 for (int i = max_lookahead; i >= min_lookahead; i--) { | 3469 for (int i = max_lookahead; i >= min_lookahead; i--) { |
3413 ZoneList<bool>* map = bitmaps_->at(i); | 3470 BoyerMoorePositionInfo* map = bitmaps_->at(i); |
3414 for (int j = 0; j < map_length_; j++) { | 3471 if (map->map_count() > 1 || |
3472 (found_single_character && map->map_count() != 0)) { | |
3473 found_single_character = false; | |
3474 break; | |
3475 } | |
3476 for (int j = 0; j < kSize; j++) { | |
3415 if (map->at(j)) { | 3477 if (map->at(j)) { |
3416 if (found_single_character) { | 3478 found_single_character = true; |
3417 found_single_character = false; // Found two. | 3479 single_character = j; |
3418 abandoned_search_for_single_character = true; | 3480 break; |
3419 break; | |
3420 } else { | |
3421 found_single_character = true; | |
3422 single_character = j; | |
3423 } | |
3424 } | 3481 } |
3425 } | 3482 } |
3426 if (abandoned_search_for_single_character) break; | |
3427 } | 3483 } |
3428 | 3484 |
3429 int lookahead_width = max_lookahead + 1 - min_lookahead; | 3485 int lookahead_width = max_lookahead + 1 - min_lookahead; |
3430 | 3486 |
3431 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) { | 3487 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) { |
3432 // The mask-compare can probably handle this better. | 3488 // The mask-compare can probably handle this better. |
3433 return false; | 3489 return false; |
3434 } | 3490 } |
3435 | 3491 |
3436 if (found_single_character) { | 3492 if (found_single_character) { |
3437 Label cont, again; | 3493 Label cont, again; |
3438 masm->Bind(&again); | 3494 masm->Bind(&again); |
3439 masm->LoadCurrentCharacter(max_lookahead, &cont, true); | 3495 masm->LoadCurrentCharacter(max_lookahead, &cont, true); |
3440 if (max_char_ > map_length_) { | 3496 if (max_char_ > kSize) { |
3441 ASSERT(map_length_ == RegExpMacroAssembler::kTableSize); | |
3442 masm->CheckCharacterAfterAnd(single_character, | 3497 masm->CheckCharacterAfterAnd(single_character, |
3443 RegExpMacroAssembler::kTableMask, | 3498 RegExpMacroAssembler::kTableMask, |
3444 &cont); | 3499 &cont); |
3445 } else { | 3500 } else { |
3446 masm->CheckCharacter(single_character, &cont); | 3501 masm->CheckCharacter(single_character, &cont); |
3447 } | 3502 } |
3448 masm->AdvanceCurrentPosition(lookahead_width); | 3503 masm->AdvanceCurrentPosition(lookahead_width); |
3449 masm->GoTo(&again); | 3504 masm->GoTo(&again); |
3450 masm->Bind(&cont); | 3505 masm->Bind(&cont); |
3451 return true; | 3506 return true; |
3452 } | 3507 } |
3453 | 3508 |
3454 Handle<ByteArray> boolean_skip_table = | 3509 Handle<ByteArray> boolean_skip_table = |
3455 FACTORY->NewByteArray(map_length_, TENURED); | 3510 FACTORY->NewByteArray(kSize, TENURED); |
3456 int skip_distance = GetSkipTable( | 3511 int skip_distance = GetSkipTable( |
3457 min_lookahead, max_lookahead, boolean_skip_table); | 3512 min_lookahead, max_lookahead, boolean_skip_table); |
3458 ASSERT(skip_distance != 0); | 3513 ASSERT(skip_distance != 0); |
3459 | 3514 |
3460 Label cont, again; | 3515 Label cont, again; |
3461 masm->Bind(&again); | 3516 masm->Bind(&again); |
3462 masm->LoadCurrentCharacter(max_lookahead, &cont, true); | 3517 masm->LoadCurrentCharacter(max_lookahead, &cont, true); |
3463 masm->CheckBitInTable(boolean_skip_table, &cont); | 3518 masm->CheckBitInTable(boolean_skip_table, &cont); |
3464 masm->AdvanceCurrentPosition(skip_distance); | 3519 masm->AdvanceCurrentPosition(skip_distance); |
3465 masm->GoTo(&again); | 3520 masm->GoTo(&again); |
(...skipping 158 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
3624 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) == | 3679 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) == |
3625 this) { | 3680 this) { |
3626 // At this point we know that we are at a non-greedy loop that will eat | 3681 // At this point we know that we are at a non-greedy loop that will eat |
3627 // any character one at a time. Any non-anchored regexp has such a | 3682 // any character one at a time. Any non-anchored regexp has such a |
3628 // loop prepended to it in order to find where it starts. We look for | 3683 // loop prepended to it in order to find where it starts. We look for |
3629 // a pattern of the form ...abc... where we can look 6 characters ahead | 3684 // a pattern of the form ...abc... where we can look 6 characters ahead |
3630 // and step forwards 3 if the character is not one of abc. Abc need | 3685 // and step forwards 3 if the character is not one of abc. Abc need |
3631 // not be atoms, they can be any reasonably limited character class or | 3686 // not be atoms, they can be any reasonably limited character class or |
3632 // small alternation. | 3687 // small alternation. |
3633 ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes. | 3688 ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes. |
3634 eats_at_least = | 3689 BoyerMooreLookahead* lookahead = bm_info(not_at_start); |
3635 Min(kMaxLookaheadForBoyerMoore, | 3690 if (lookahead == NULL) { |
3636 EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start)); | 3691 eats_at_least = |
3637 if (eats_at_least >= 1) { | 3692 Min(kMaxLookaheadForBoyerMoore, |
3638 BoyerMooreLookahead bm(eats_at_least, | 3693 EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start)); |
3639 RegExpMacroAssembler::kTableSize, | 3694 if (eats_at_least >= 1) { |
3640 compiler); | 3695 BoyerMooreLookahead* bm = |
3641 GuardedAlternative alt0 = alternatives_->at(0); | 3696 new BoyerMooreLookahead(eats_at_least, compiler); |
3642 alt0.node()->FillInBMInfo(0, &bm, not_at_start); | 3697 GuardedAlternative alt0 = alternatives_->at(0); |
3643 skip_was_emitted = bm.EmitSkipInstructions(macro_assembler); | 3698 alt0.node()->FillInBMInfo(0, bm, not_at_start); |
3699 skip_was_emitted = bm->EmitSkipInstructions(macro_assembler); | |
3700 } | |
3701 } else { | |
3702 skip_was_emitted = lookahead->EmitSkipInstructions(macro_assembler); | |
3644 } | 3703 } |
3645 } | 3704 } |
3646 } | 3705 } |
3647 } | 3706 } |
3648 | 3707 |
3649 if (eats_at_least == kEatsAtLeastNotYetInitialized) { | 3708 if (eats_at_least == kEatsAtLeastNotYetInitialized) { |
3650 // Save some time by looking at most one machine word ahead. | 3709 // Save some time by looking at most one machine word ahead. |
3651 eats_at_least = EatsAtLeast(compiler->ascii() ? 4 : 2, 0, not_at_start); | 3710 eats_at_least = EatsAtLeast(compiler->ascii() ? 4 : 2, 0, not_at_start); |
3652 } | 3711 } |
3653 int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least); | 3712 int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least); |
(...skipping 542 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
4196 break; | 4255 break; |
4197 case AssertionNode::AT_BOUNDARY: | 4256 case AssertionNode::AT_BOUNDARY: |
4198 stream()->Add("label=\"\\b\", shape=septagon"); | 4257 stream()->Add("label=\"\\b\", shape=septagon"); |
4199 break; | 4258 break; |
4200 case AssertionNode::AT_NON_BOUNDARY: | 4259 case AssertionNode::AT_NON_BOUNDARY: |
4201 stream()->Add("label=\"\\B\", shape=septagon"); | 4260 stream()->Add("label=\"\\B\", shape=septagon"); |
4202 break; | 4261 break; |
4203 case AssertionNode::AFTER_NEWLINE: | 4262 case AssertionNode::AFTER_NEWLINE: |
4204 stream()->Add("label=\"(?<=\\n)\", shape=septagon"); | 4263 stream()->Add("label=\"(?<=\\n)\", shape=septagon"); |
4205 break; | 4264 break; |
4206 case AssertionNode::AFTER_WORD_CHARACTER: | |
4207 stream()->Add("label=\"(?<=\\w)\", shape=septagon"); | |
4208 break; | |
4209 case AssertionNode::AFTER_NONWORD_CHARACTER: | |
4210 stream()->Add("label=\"(?<=\\W)\", shape=septagon"); | |
4211 break; | |
4212 } | 4265 } |
4213 stream()->Add("];\n"); | 4266 stream()->Add("];\n"); |
4214 PrintAttributes(that); | 4267 PrintAttributes(that); |
4215 RegExpNode* successor = that->on_success(); | 4268 RegExpNode* successor = that->on_success(); |
4216 stream()->Add(" n%p -> n%p;\n", that, successor); | 4269 stream()->Add(" n%p -> n%p;\n", that, successor); |
4217 Visit(successor); | 4270 Visit(successor); |
4218 } | 4271 } |
4219 | 4272 |
4220 | 4273 |
4221 void DotPrinter::VisitAction(ActionNode* that) { | 4274 void DotPrinter::VisitAction(ActionNode* that) { |
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
4306 printer.PrintNode(label, node); | 4359 printer.PrintNode(label, node); |
4307 } | 4360 } |
4308 | 4361 |
4309 | 4362 |
4310 #endif // DEBUG | 4363 #endif // DEBUG |
4311 | 4364 |
4312 | 4365 |
4313 // ------------------------------------------------------------------- | 4366 // ------------------------------------------------------------------- |
4314 // Tree to graph conversion | 4367 // Tree to graph conversion |
4315 | 4368 |
4316 static const uc16 kSpaceRanges[] = { 0x0009, 0x000D, 0x0020, 0x0020, 0x00A0, | |
4317 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A, 0x2028, 0x2029, | |
4318 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000, 0xFEFF, 0xFEFF }; | |
4319 static const int kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges); | |
4320 | |
4321 static const uc16 kWordRanges[] = { '0', '9', 'A', 'Z', '_', '_', 'a', 'z' }; | |
4322 static const int kWordRangeCount = ARRAY_SIZE(kWordRanges); | |
4323 | |
4324 static const uc16 kDigitRanges[] = { '0', '9' }; | |
4325 static const int kDigitRangeCount = ARRAY_SIZE(kDigitRanges); | |
4326 | |
4327 static const uc16 kLineTerminatorRanges[] = { 0x000A, 0x000A, 0x000D, 0x000D, | |
4328 0x2028, 0x2029 }; | |
4329 static const int kLineTerminatorRangeCount = ARRAY_SIZE(kLineTerminatorRanges); | |
4330 | |
4331 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, | 4369 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, |
4332 RegExpNode* on_success) { | 4370 RegExpNode* on_success) { |
4333 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1); | 4371 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1); |
4334 elms->Add(TextElement::Atom(this)); | 4372 elms->Add(TextElement::Atom(this)); |
4335 return new TextNode(elms, on_success); | 4373 return new TextNode(elms, on_success); |
4336 } | 4374 } |
4337 | 4375 |
4338 | 4376 |
4339 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, | 4377 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, |
4340 RegExpNode* on_success) { | 4378 RegExpNode* on_success) { |
4341 return new TextNode(elements(), on_success); | 4379 return new TextNode(elements(), on_success); |
4342 } | 4380 } |
4343 | 4381 |
4382 | |
4344 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges, | 4383 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges, |
4345 const uc16* special_class, | 4384 const int* special_class, |
4346 int length) { | 4385 int length) { |
4386 length--; // Remove final 0x10000. | |
4387 ASSERT(special_class[length] == 0x10000); | |
4347 ASSERT(ranges->length() != 0); | 4388 ASSERT(ranges->length() != 0); |
4348 ASSERT(length != 0); | 4389 ASSERT(length != 0); |
4349 ASSERT(special_class[0] != 0); | 4390 ASSERT(special_class[0] != 0); |
4350 if (ranges->length() != (length >> 1) + 1) { | 4391 if (ranges->length() != (length >> 1) + 1) { |
4351 return false; | 4392 return false; |
4352 } | 4393 } |
4353 CharacterRange range = ranges->at(0); | 4394 CharacterRange range = ranges->at(0); |
4354 if (range.from() != 0) { | 4395 if (range.from() != 0) { |
4355 return false; | 4396 return false; |
4356 } | 4397 } |
4357 for (int i = 0; i < length; i += 2) { | 4398 for (int i = 0; i < length; i += 2) { |
4358 if (special_class[i] != (range.to() + 1)) { | 4399 if (special_class[i] != (range.to() + 1)) { |
4359 return false; | 4400 return false; |
4360 } | 4401 } |
4361 range = ranges->at((i >> 1) + 1); | 4402 range = ranges->at((i >> 1) + 1); |
4362 if (special_class[i+1] != range.from() - 1) { | 4403 if (special_class[i+1] != range.from()) { |
4363 return false; | 4404 return false; |
4364 } | 4405 } |
4365 } | 4406 } |
4366 if (range.to() != 0xffff) { | 4407 if (range.to() != 0xffff) { |
4367 return false; | 4408 return false; |
4368 } | 4409 } |
4369 return true; | 4410 return true; |
4370 } | 4411 } |
4371 | 4412 |
4372 | 4413 |
4373 static bool CompareRanges(ZoneList<CharacterRange>* ranges, | 4414 static bool CompareRanges(ZoneList<CharacterRange>* ranges, |
4374 const uc16* special_class, | 4415 const int* special_class, |
4375 int length) { | 4416 int length) { |
4417 length--; // Remove final 0x10000. | |
4418 ASSERT(special_class[length] == 0x10000); | |
4376 if (ranges->length() * 2 != length) { | 4419 if (ranges->length() * 2 != length) { |
4377 return false; | 4420 return false; |
4378 } | 4421 } |
4379 for (int i = 0; i < length; i += 2) { | 4422 for (int i = 0; i < length; i += 2) { |
4380 CharacterRange range = ranges->at(i >> 1); | 4423 CharacterRange range = ranges->at(i >> 1); |
4381 if (range.from() != special_class[i] || range.to() != special_class[i+1]) { | 4424 if (range.from() != special_class[i] || |
4425 range.to() != special_class[i + 1] - 1) { | |
4382 return false; | 4426 return false; |
4383 } | 4427 } |
4384 } | 4428 } |
4385 return true; | 4429 return true; |
4386 } | 4430 } |
4387 | 4431 |
4388 | 4432 |
4389 bool RegExpCharacterClass::is_standard() { | 4433 bool RegExpCharacterClass::is_standard() { |
4390 // TODO(lrn): Remove need for this function, by not throwing away information | 4434 // TODO(lrn): Remove need for this function, by not throwing away information |
4391 // along the way. | 4435 // along the way. |
4392 if (is_negated_) { | 4436 if (is_negated_) { |
4393 return false; | 4437 return false; |
4394 } | 4438 } |
4395 if (set_.is_standard()) { | 4439 if (set_.is_standard()) { |
4396 return true; | 4440 return true; |
4397 } | 4441 } |
4398 if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) { | 4442 if (CompareRanges(set_.ranges(), kSpaceRanges2, kSpaceRange2Count)) { |
4399 set_.set_standard_set_type('s'); | 4443 set_.set_standard_set_type('s'); |
4400 return true; | 4444 return true; |
4401 } | 4445 } |
4402 if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) { | 4446 if (CompareInverseRanges(set_.ranges(), kSpaceRanges2, kSpaceRange2Count)) { |
4403 set_.set_standard_set_type('S'); | 4447 set_.set_standard_set_type('S'); |
4404 return true; | 4448 return true; |
4405 } | 4449 } |
4406 if (CompareInverseRanges(set_.ranges(), | 4450 if (CompareInverseRanges(set_.ranges(), |
4407 kLineTerminatorRanges, | 4451 kLineTerminatorRanges2, |
4408 kLineTerminatorRangeCount)) { | 4452 kLineTerminatorRange2Count)) { |
4409 set_.set_standard_set_type('.'); | 4453 set_.set_standard_set_type('.'); |
4410 return true; | 4454 return true; |
4411 } | 4455 } |
4412 if (CompareRanges(set_.ranges(), | 4456 if (CompareRanges(set_.ranges(), |
4413 kLineTerminatorRanges, | 4457 kLineTerminatorRanges2, |
4414 kLineTerminatorRangeCount)) { | 4458 kLineTerminatorRange2Count)) { |
4415 set_.set_standard_set_type('n'); | 4459 set_.set_standard_set_type('n'); |
4416 return true; | 4460 return true; |
4417 } | 4461 } |
4418 if (CompareRanges(set_.ranges(), kWordRanges, kWordRangeCount)) { | 4462 if (CompareRanges(set_.ranges(), kWordRanges2, kWordRange2Count)) { |
4419 set_.set_standard_set_type('w'); | 4463 set_.set_standard_set_type('w'); |
4420 return true; | 4464 return true; |
4421 } | 4465 } |
4422 if (CompareInverseRanges(set_.ranges(), kWordRanges, kWordRangeCount)) { | 4466 if (CompareInverseRanges(set_.ranges(), kWordRanges2, kWordRange2Count)) { |
4423 set_.set_standard_set_type('W'); | 4467 set_.set_standard_set_type('W'); |
4424 return true; | 4468 return true; |
4425 } | 4469 } |
4426 return false; | 4470 return false; |
4427 } | 4471 } |
4428 | 4472 |
4429 | 4473 |
4430 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, | 4474 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, |
4431 RegExpNode* on_success) { | 4475 RegExpNode* on_success) { |
4432 return new TextNode(this, on_success); | 4476 return new TextNode(this, on_success); |
(...skipping 339 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
4772 RegExpNode* on_success) { | 4816 RegExpNode* on_success) { |
4773 ZoneList<RegExpTree*>* children = nodes(); | 4817 ZoneList<RegExpTree*>* children = nodes(); |
4774 RegExpNode* current = on_success; | 4818 RegExpNode* current = on_success; |
4775 for (int i = children->length() - 1; i >= 0; i--) { | 4819 for (int i = children->length() - 1; i >= 0; i--) { |
4776 current = children->at(i)->ToNode(compiler, current); | 4820 current = children->at(i)->ToNode(compiler, current); |
4777 } | 4821 } |
4778 return current; | 4822 return current; |
4779 } | 4823 } |
4780 | 4824 |
4781 | 4825 |
4782 static void AddClass(const uc16* elmv, | 4826 static void AddClass(const int* elmv, |
4783 int elmc, | 4827 int elmc, |
4784 ZoneList<CharacterRange>* ranges) { | 4828 ZoneList<CharacterRange>* ranges) { |
4829 elmc--; | |
4830 ASSERT(elmv[elmc] == 0x10000); | |
4785 for (int i = 0; i < elmc; i += 2) { | 4831 for (int i = 0; i < elmc; i += 2) { |
4786 ASSERT(elmv[i] <= elmv[i + 1]); | 4832 ASSERT(elmv[i] < elmv[i + 1]); |
4787 ranges->Add(CharacterRange(elmv[i], elmv[i + 1])); | 4833 ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1)); |
4788 } | 4834 } |
4789 } | 4835 } |
4790 | 4836 |
4791 | 4837 |
4792 static void AddClassNegated(const uc16 *elmv, | 4838 static void AddClassNegated(const int *elmv, |
4793 int elmc, | 4839 int elmc, |
4794 ZoneList<CharacterRange>* ranges) { | 4840 ZoneList<CharacterRange>* ranges) { |
4841 elmc--; | |
4842 ASSERT(elmv[elmc] == 0x10000); | |
4795 ASSERT(elmv[0] != 0x0000); | 4843 ASSERT(elmv[0] != 0x0000); |
4796 ASSERT(elmv[elmc-1] != String::kMaxUtf16CodeUnit); | 4844 ASSERT(elmv[elmc-1] != String::kMaxUtf16CodeUnit); |
4797 uc16 last = 0x0000; | 4845 uc16 last = 0x0000; |
4798 for (int i = 0; i < elmc; i += 2) { | 4846 for (int i = 0; i < elmc; i += 2) { |
4799 ASSERT(last <= elmv[i] - 1); | 4847 ASSERT(last <= elmv[i] - 1); |
4800 ASSERT(elmv[i] <= elmv[i + 1]); | 4848 ASSERT(elmv[i] < elmv[i + 1]); |
4801 ranges->Add(CharacterRange(last, elmv[i] - 1)); | 4849 ranges->Add(CharacterRange(last, elmv[i] - 1)); |
4802 last = elmv[i + 1] + 1; | 4850 last = elmv[i + 1]; |
4803 } | 4851 } |
4804 ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit)); | 4852 ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit)); |
4805 } | 4853 } |
4806 | 4854 |
4807 | 4855 |
4808 void CharacterRange::AddClassEscape(uc16 type, | 4856 void CharacterRange::AddClassEscape(uc16 type, |
4809 ZoneList<CharacterRange>* ranges) { | 4857 ZoneList<CharacterRange>* ranges) { |
4810 switch (type) { | 4858 switch (type) { |
4811 case 's': | 4859 case 's': |
4812 AddClass(kSpaceRanges, kSpaceRangeCount, ranges); | 4860 AddClass(kSpaceRanges2, kSpaceRange2Count, ranges); |
4813 break; | 4861 break; |
4814 case 'S': | 4862 case 'S': |
4815 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges); | 4863 AddClassNegated(kSpaceRanges2, kSpaceRange2Count, ranges); |
4816 break; | 4864 break; |
4817 case 'w': | 4865 case 'w': |
4818 AddClass(kWordRanges, kWordRangeCount, ranges); | 4866 AddClass(kWordRanges2, kWordRange2Count, ranges); |
4819 break; | 4867 break; |
4820 case 'W': | 4868 case 'W': |
4821 AddClassNegated(kWordRanges, kWordRangeCount, ranges); | 4869 AddClassNegated(kWordRanges2, kWordRange2Count, ranges); |
4822 break; | 4870 break; |
4823 case 'd': | 4871 case 'd': |
4824 AddClass(kDigitRanges, kDigitRangeCount, ranges); | 4872 AddClass(kDigitRanges2, kDigitRange2Count, ranges); |
4825 break; | 4873 break; |
4826 case 'D': | 4874 case 'D': |
4827 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges); | 4875 AddClassNegated(kDigitRanges2, kDigitRange2Count, ranges); |
4828 break; | 4876 break; |
4829 case '.': | 4877 case '.': |
4830 AddClassNegated(kLineTerminatorRanges, | 4878 AddClassNegated(kLineTerminatorRanges2, |
4831 kLineTerminatorRangeCount, | 4879 kLineTerminatorRange2Count, |
4832 ranges); | 4880 ranges); |
4833 break; | 4881 break; |
4834 // This is not a character range as defined by the spec but a | 4882 // This is not a character range as defined by the spec but a |
4835 // convenient shorthand for a character class that matches any | 4883 // convenient shorthand for a character class that matches any |
4836 // character. | 4884 // character. |
4837 case '*': | 4885 case '*': |
4838 ranges->Add(CharacterRange::Everything()); | 4886 ranges->Add(CharacterRange::Everything()); |
4839 break; | 4887 break; |
4840 // This is the set of characters matched by the $ and ^ symbols | 4888 // This is the set of characters matched by the $ and ^ symbols |
4841 // in multiline mode. | 4889 // in multiline mode. |
4842 case 'n': | 4890 case 'n': |
4843 AddClass(kLineTerminatorRanges, | 4891 AddClass(kLineTerminatorRanges2, |
4844 kLineTerminatorRangeCount, | 4892 kLineTerminatorRange2Count, |
4845 ranges); | 4893 ranges); |
4846 break; | 4894 break; |
4847 default: | 4895 default: |
4848 UNREACHABLE(); | 4896 UNREACHABLE(); |
4849 } | 4897 } |
4850 } | 4898 } |
4851 | 4899 |
4852 | 4900 |
4853 Vector<const uc16> CharacterRange::GetWordBounds() { | 4901 Vector<const int> CharacterRange::GetWordBounds() { |
4854 return Vector<const uc16>(kWordRanges, kWordRangeCount); | 4902 return Vector<const int>(kWordRanges2, kWordRange2Count - 1); |
4855 } | 4903 } |
4856 | 4904 |
4857 | 4905 |
4858 class CharacterRangeSplitter { | 4906 class CharacterRangeSplitter { |
4859 public: | 4907 public: |
4860 CharacterRangeSplitter(ZoneList<CharacterRange>** included, | 4908 CharacterRangeSplitter(ZoneList<CharacterRange>** included, |
4861 ZoneList<CharacterRange>** excluded) | 4909 ZoneList<CharacterRange>** excluded) |
4862 : included_(included), | 4910 : included_(included), |
4863 excluded_(excluded) { } | 4911 excluded_(excluded) { } |
4864 void Call(uc16 from, DispatchTable::Entry entry); | 4912 void Call(uc16 from, DispatchTable::Entry entry); |
(...skipping 11 matching lines...) Expand all Loading... | |
4876 if (!entry.out_set()->Get(kInBase)) return; | 4924 if (!entry.out_set()->Get(kInBase)) return; |
4877 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay) | 4925 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay) |
4878 ? included_ | 4926 ? included_ |
4879 : excluded_; | 4927 : excluded_; |
4880 if (*target == NULL) *target = new ZoneList<CharacterRange>(2); | 4928 if (*target == NULL) *target = new ZoneList<CharacterRange>(2); |
4881 (*target)->Add(CharacterRange(entry.from(), entry.to())); | 4929 (*target)->Add(CharacterRange(entry.from(), entry.to())); |
4882 } | 4930 } |
4883 | 4931 |
4884 | 4932 |
4885 void CharacterRange::Split(ZoneList<CharacterRange>* base, | 4933 void CharacterRange::Split(ZoneList<CharacterRange>* base, |
4886 Vector<const uc16> overlay, | 4934 Vector<const int> overlay, |
4887 ZoneList<CharacterRange>** included, | 4935 ZoneList<CharacterRange>** included, |
4888 ZoneList<CharacterRange>** excluded) { | 4936 ZoneList<CharacterRange>** excluded) { |
4889 ASSERT_EQ(NULL, *included); | 4937 ASSERT_EQ(NULL, *included); |
4890 ASSERT_EQ(NULL, *excluded); | 4938 ASSERT_EQ(NULL, *excluded); |
4891 DispatchTable table; | 4939 DispatchTable table; |
4892 for (int i = 0; i < base->length(); i++) | 4940 for (int i = 0; i < base->length(); i++) |
4893 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase); | 4941 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase); |
4894 for (int i = 0; i < overlay.length(); i += 2) { | 4942 for (int i = 0; i < overlay.length(); i += 2) { |
4895 table.AddRange(CharacterRange(overlay[i], overlay[i+1]), | 4943 table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1), |
4896 CharacterRangeSplitter::kInOverlay); | 4944 CharacterRangeSplitter::kInOverlay); |
4897 } | 4945 } |
4898 CharacterRangeSplitter callback(included, excluded); | 4946 CharacterRangeSplitter callback(included, excluded); |
4899 table.ForEach(&callback); | 4947 table.ForEach(&callback); |
4900 } | 4948 } |
4901 | 4949 |
4902 | 4950 |
4903 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, | 4951 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, |
4904 bool is_ascii) { | 4952 bool is_ascii) { |
4905 Isolate* isolate = Isolate::Current(); | 4953 Isolate* isolate = Isolate::Current(); |
(...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
4971 if (n <= 1) return true; | 5019 if (n <= 1) return true; |
4972 int max = ranges->at(0).to(); | 5020 int max = ranges->at(0).to(); |
4973 for (int i = 1; i < n; i++) { | 5021 for (int i = 1; i < n; i++) { |
4974 CharacterRange next_range = ranges->at(i); | 5022 CharacterRange next_range = ranges->at(i); |
4975 if (next_range.from() <= max + 1) return false; | 5023 if (next_range.from() <= max + 1) return false; |
4976 max = next_range.to(); | 5024 max = next_range.to(); |
4977 } | 5025 } |
4978 return true; | 5026 return true; |
4979 } | 5027 } |
4980 | 5028 |
4981 SetRelation CharacterRange::WordCharacterRelation( | |
4982 ZoneList<CharacterRange>* range) { | |
4983 ASSERT(IsCanonical(range)); | |
4984 int i = 0; // Word character range index. | |
4985 int j = 0; // Argument range index. | |
4986 ASSERT_NE(0, kWordRangeCount); | |
4987 SetRelation result; | |
4988 if (range->length() == 0) { | |
4989 result.SetElementsInSecondSet(); | |
4990 return result; | |
4991 } | |
4992 CharacterRange argument_range = range->at(0); | |
4993 CharacterRange word_range = CharacterRange(kWordRanges[0], kWordRanges[1]); | |
4994 while (i < kWordRangeCount && j < range->length()) { | |
4995 // Check the two ranges for the five cases: | |
4996 // - no overlap. | |
4997 // - partial overlap (there are elements in both ranges that isn't | |
4998 // in the other, and there are also elements that are in both). | |
4999 // - argument range entirely inside word range. | |
5000 // - word range entirely inside argument range. | |
5001 // - ranges are completely equal. | |
5002 | |
5003 // First check for no overlap. The earlier range is not in the other set. | |
5004 if (argument_range.from() > word_range.to()) { | |
5005 // Ranges are disjoint. The earlier word range contains elements that | |
5006 // cannot be in the argument set. | |
5007 result.SetElementsInSecondSet(); | |
5008 } else if (word_range.from() > argument_range.to()) { | |
5009 // Ranges are disjoint. The earlier argument range contains elements that | |
5010 // cannot be in the word set. | |
5011 result.SetElementsInFirstSet(); | |
5012 } else if (word_range.from() <= argument_range.from() && | |
5013 word_range.to() >= argument_range.from()) { | |
5014 result.SetElementsInBothSets(); | |
5015 // argument range completely inside word range. | |
5016 if (word_range.from() < argument_range.from() || | |
5017 word_range.to() > argument_range.from()) { | |
5018 result.SetElementsInSecondSet(); | |
5019 } | |
5020 } else if (word_range.from() >= argument_range.from() && | |
5021 word_range.to() <= argument_range.from()) { | |
5022 result.SetElementsInBothSets(); | |
5023 result.SetElementsInFirstSet(); | |
5024 } else { | |
5025 // There is overlap, and neither is a subrange of the other | |
5026 result.SetElementsInFirstSet(); | |
5027 result.SetElementsInSecondSet(); | |
5028 result.SetElementsInBothSets(); | |
5029 } | |
5030 if (result.NonTrivialIntersection()) { | |
5031 // The result is as (im)precise as we can possibly make it. | |
5032 return result; | |
5033 } | |
5034 // Progress the range(s) with minimal to-character. | |
5035 uc16 word_to = word_range.to(); | |
5036 uc16 argument_to = argument_range.to(); | |
5037 if (argument_to <= word_to) { | |
5038 j++; | |
5039 if (j < range->length()) { | |
5040 argument_range = range->at(j); | |
5041 } | |
5042 } | |
5043 if (word_to <= argument_to) { | |
5044 i += 2; | |
5045 if (i < kWordRangeCount) { | |
5046 word_range = CharacterRange(kWordRanges[i], kWordRanges[i + 1]); | |
5047 } | |
5048 } | |
5049 } | |
5050 // Check if anything wasn't compared in the loop. | |
5051 if (i < kWordRangeCount) { | |
5052 // word range contains something not in argument range. | |
5053 result.SetElementsInSecondSet(); | |
5054 } else if (j < range->length()) { | |
5055 // Argument range contains something not in word range. | |
5056 result.SetElementsInFirstSet(); | |
5057 } | |
5058 | |
5059 return result; | |
5060 } | |
5061 | |
5062 | 5029 |
5063 ZoneList<CharacterRange>* CharacterSet::ranges() { | 5030 ZoneList<CharacterRange>* CharacterSet::ranges() { |
5064 if (ranges_ == NULL) { | 5031 if (ranges_ == NULL) { |
5065 ranges_ = new ZoneList<CharacterRange>(2); | 5032 ranges_ = new ZoneList<CharacterRange>(2); |
5066 CharacterRange::AddClassEscape(standard_set_type_, ranges_); | 5033 CharacterRange::AddClassEscape(standard_set_type_, ranges_); |
5067 } | 5034 } |
5068 return ranges_; | 5035 return ranges_; |
5069 } | 5036 } |
5070 | 5037 |
5071 | 5038 |
(...skipping 112 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
5184 num_canonical, | 5151 num_canonical, |
5185 character_ranges->at(read)); | 5152 character_ranges->at(read)); |
5186 read++; | 5153 read++; |
5187 } while (read < n); | 5154 } while (read < n); |
5188 character_ranges->Rewind(num_canonical); | 5155 character_ranges->Rewind(num_canonical); |
5189 | 5156 |
5190 ASSERT(CharacterRange::IsCanonical(character_ranges)); | 5157 ASSERT(CharacterRange::IsCanonical(character_ranges)); |
5191 } | 5158 } |
5192 | 5159 |
5193 | 5160 |
5194 // Utility function for CharacterRange::Merge. Adds a range at the end of | |
5195 // a canonicalized range list, if necessary merging the range with the last | |
5196 // range of the list. | |
5197 static void AddRangeToSet(ZoneList<CharacterRange>* set, CharacterRange range) { | |
5198 if (set == NULL) return; | |
5199 ASSERT(set->length() == 0 || set->at(set->length() - 1).to() < range.from()); | |
5200 int n = set->length(); | |
5201 if (n > 0) { | |
5202 CharacterRange lastRange = set->at(n - 1); | |
5203 if (lastRange.to() == range.from() - 1) { | |
5204 set->at(n - 1) = CharacterRange(lastRange.from(), range.to()); | |
5205 return; | |
5206 } | |
5207 } | |
5208 set->Add(range); | |
5209 } | |
5210 | |
5211 | |
5212 static void AddRangeToSelectedSet(int selector, | |
5213 ZoneList<CharacterRange>* first_set, | |
5214 ZoneList<CharacterRange>* second_set, | |
5215 ZoneList<CharacterRange>* intersection_set, | |
5216 CharacterRange range) { | |
5217 switch (selector) { | |
5218 case kInsideFirst: | |
5219 AddRangeToSet(first_set, range); | |
5220 break; | |
5221 case kInsideSecond: | |
5222 AddRangeToSet(second_set, range); | |
5223 break; | |
5224 case kInsideBoth: | |
5225 AddRangeToSet(intersection_set, range); | |
5226 break; | |
5227 } | |
5228 } | |
5229 | |
5230 | |
5231 | |
5232 void CharacterRange::Merge(ZoneList<CharacterRange>* first_set, | |
5233 ZoneList<CharacterRange>* second_set, | |
5234 ZoneList<CharacterRange>* first_set_only_out, | |
5235 ZoneList<CharacterRange>* second_set_only_out, | |
5236 ZoneList<CharacterRange>* both_sets_out) { | |
5237 // Inputs are canonicalized. | |
5238 ASSERT(CharacterRange::IsCanonical(first_set)); | |
5239 ASSERT(CharacterRange::IsCanonical(second_set)); | |
5240 // Outputs are empty, if applicable. | |
5241 ASSERT(first_set_only_out == NULL || first_set_only_out->length() == 0); | |
5242 ASSERT(second_set_only_out == NULL || second_set_only_out->length() == 0); | |
5243 ASSERT(both_sets_out == NULL || both_sets_out->length() == 0); | |
5244 | |
5245 // Merge sets by iterating through the lists in order of lowest "from" value, | |
5246 // and putting intervals into one of three sets. | |
5247 | |
5248 if (first_set->length() == 0) { | |
5249 second_set_only_out->AddAll(*second_set); | |
5250 return; | |
5251 } | |
5252 if (second_set->length() == 0) { | |
5253 first_set_only_out->AddAll(*first_set); | |
5254 return; | |
5255 } | |
5256 // Indices into input lists. | |
5257 int i1 = 0; | |
5258 int i2 = 0; | |
5259 // Cache length of input lists. | |
5260 int n1 = first_set->length(); | |
5261 int n2 = second_set->length(); | |
5262 // Current range. May be invalid if state is kInsideNone. | |
5263 int from = 0; | |
5264 int to = -1; | |
5265 // Where current range comes from. | |
5266 int state = kInsideNone; | |
5267 | |
5268 while (i1 < n1 || i2 < n2) { | |
5269 CharacterRange next_range; | |
5270 int range_source; | |
5271 if (i2 == n2 || | |
5272 (i1 < n1 && first_set->at(i1).from() < second_set->at(i2).from())) { | |
5273 // Next smallest element is in first set. | |
5274 next_range = first_set->at(i1++); | |
5275 range_source = kInsideFirst; | |
5276 } else { | |
5277 // Next smallest element is in second set. | |
5278 next_range = second_set->at(i2++); | |
5279 range_source = kInsideSecond; | |
5280 } | |
5281 if (to < next_range.from()) { | |
5282 // Ranges disjoint: |current| |next| | |
5283 AddRangeToSelectedSet(state, | |
5284 first_set_only_out, | |
5285 second_set_only_out, | |
5286 both_sets_out, | |
5287 CharacterRange(from, to)); | |
5288 from = next_range.from(); | |
5289 to = next_range.to(); | |
5290 state = range_source; | |
5291 } else { | |
5292 if (from < next_range.from()) { | |
5293 AddRangeToSelectedSet(state, | |
5294 first_set_only_out, | |
5295 second_set_only_out, | |
5296 both_sets_out, | |
5297 CharacterRange(from, next_range.from()-1)); | |
5298 } | |
5299 if (to < next_range.to()) { | |
5300 // Ranges overlap: |current| | |
5301 // |next| | |
5302 AddRangeToSelectedSet(state | range_source, | |
5303 first_set_only_out, | |
5304 second_set_only_out, | |
5305 both_sets_out, | |
5306 CharacterRange(next_range.from(), to)); | |
5307 from = to + 1; | |
5308 to = next_range.to(); | |
5309 state = range_source; | |
5310 } else { | |
5311 // Range included: |current| , possibly ending at same character. | |
5312 // |next| | |
5313 AddRangeToSelectedSet( | |
5314 state | range_source, | |
5315 first_set_only_out, | |
5316 second_set_only_out, | |
5317 both_sets_out, | |
5318 CharacterRange(next_range.from(), next_range.to())); | |
5319 from = next_range.to() + 1; | |
5320 // If ranges end at same character, both ranges are consumed completely. | |
5321 if (next_range.to() == to) state = kInsideNone; | |
5322 } | |
5323 } | |
5324 } | |
5325 AddRangeToSelectedSet(state, | |
5326 first_set_only_out, | |
5327 second_set_only_out, | |
5328 both_sets_out, | |
5329 CharacterRange(from, to)); | |
5330 } | |
5331 | |
5332 | |
5333 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges, | 5161 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges, |
5334 ZoneList<CharacterRange>* negated_ranges) { | 5162 ZoneList<CharacterRange>* negated_ranges) { |
5335 ASSERT(CharacterRange::IsCanonical(ranges)); | 5163 ASSERT(CharacterRange::IsCanonical(ranges)); |
5336 ASSERT_EQ(0, negated_ranges->length()); | 5164 ASSERT_EQ(0, negated_ranges->length()); |
5337 int range_count = ranges->length(); | 5165 int range_count = ranges->length(); |
5338 uc16 from = 0; | 5166 uc16 from = 0; |
5339 int i = 0; | 5167 int i = 0; |
5340 if (range_count > 0 && ranges->at(0).from() == 0) { | 5168 if (range_count > 0 && ranges->at(0).from() == 0) { |
5341 from = ranges->at(0).to(); | 5169 from = ranges->at(0).to(); |
5342 i = 1; | 5170 i = 1; |
(...skipping 292 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
5635 } | 5463 } |
5636 | 5464 |
5637 | 5465 |
5638 void Analysis::VisitBackReference(BackReferenceNode* that) { | 5466 void Analysis::VisitBackReference(BackReferenceNode* that) { |
5639 EnsureAnalyzed(that->on_success()); | 5467 EnsureAnalyzed(that->on_success()); |
5640 } | 5468 } |
5641 | 5469 |
5642 | 5470 |
5643 void Analysis::VisitAssertion(AssertionNode* that) { | 5471 void Analysis::VisitAssertion(AssertionNode* that) { |
5644 EnsureAnalyzed(that->on_success()); | 5472 EnsureAnalyzed(that->on_success()); |
5645 AssertionNode::AssertionNodeType type = that->type(); | |
5646 if (type == AssertionNode::AT_BOUNDARY || | |
5647 type == AssertionNode::AT_NON_BOUNDARY) { | |
5648 // Check if the following character is known to be a word character | |
5649 // or known to not be a word character. | |
5650 ZoneList<CharacterRange>* following_chars = that->FirstCharacterSet(); | |
5651 | |
5652 CharacterRange::Canonicalize(following_chars); | |
5653 | |
5654 SetRelation word_relation = | |
5655 CharacterRange::WordCharacterRelation(following_chars); | |
5656 if (word_relation.Disjoint()) { | |
5657 // Includes the case where following_chars is empty (e.g., end-of-input). | |
5658 // Following character is definitely *not* a word character. | |
5659 type = (type == AssertionNode::AT_BOUNDARY) ? | |
5660 AssertionNode::AFTER_WORD_CHARACTER : | |
5661 AssertionNode::AFTER_NONWORD_CHARACTER; | |
5662 that->set_type(type); | |
5663 } else if (word_relation.ContainedIn()) { | |
5664 // Following character is definitely a word character. | |
5665 type = (type == AssertionNode::AT_BOUNDARY) ? | |
5666 AssertionNode::AFTER_NONWORD_CHARACTER : | |
5667 AssertionNode::AFTER_WORD_CHARACTER; | |
5668 that->set_type(type); | |
5669 } | |
5670 } | |
5671 } | 5473 } |
5672 | 5474 |
5673 | 5475 |
5674 ZoneList<CharacterRange>* RegExpNode::FirstCharacterSet() { | 5476 void BackReferenceNode::FillInBMInfo( |
5675 if (first_character_set_ == NULL) { | 5477 int offset, BoyerMooreLookahead* bm, bool not_at_start) { |
5676 if (ComputeFirstCharacterSet(kFirstCharBudget) < 0) { | 5478 // Working out the set of characters that a backreference can match is too |
5677 // If we can't find an exact solution within the budget, we | 5479 // hard, so we just say that any character can match. |
5678 // set the value to the set of every character, i.e., all characters | 5480 bm->SetRest(offset); |
5679 // are possible. | 5481 if (offset == 0) set_bm_info(not_at_start, bm); |
5680 ZoneList<CharacterRange>* all_set = new ZoneList<CharacterRange>(1); | |
5681 all_set->Add(CharacterRange::Everything()); | |
5682 first_character_set_ = all_set; | |
5683 } | |
5684 } | |
5685 return first_character_set_; | |
5686 } | 5482 } |
5687 | 5483 |
5688 | 5484 |
5689 int RegExpNode::ComputeFirstCharacterSet(int budget) { | 5485 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize == |
5690 // Default behavior is to not be able to determine the first character. | 5486 RegExpMacroAssembler::kTableSize); |
5691 return kComputeFirstCharacterSetFail; | |
5692 } | |
5693 | |
5694 | |
5695 int LoopChoiceNode::ComputeFirstCharacterSet(int budget) { | |
5696 budget--; | |
5697 if (budget >= 0) { | |
5698 // Find loop min-iteration. It's the value of the guarded choice node | |
5699 // with a GEQ guard, if any. | |
5700 int min_repetition = 0; | |
5701 | |
5702 for (int i = 0; i <= 1; i++) { | |
5703 GuardedAlternative alternative = alternatives()->at(i); | |
5704 ZoneList<Guard*>* guards = alternative.guards(); | |
5705 if (guards != NULL && guards->length() > 0) { | |
5706 Guard* guard = guards->at(0); | |
5707 if (guard->op() == Guard::GEQ) { | |
5708 min_repetition = guard->value(); | |
5709 break; | |
5710 } | |
5711 } | |
5712 } | |
5713 | |
5714 budget = loop_node()->ComputeFirstCharacterSet(budget); | |
5715 if (budget >= 0) { | |
5716 ZoneList<CharacterRange>* character_set = | |
5717 loop_node()->first_character_set(); | |
5718 if (body_can_be_zero_length() || min_repetition == 0) { | |
5719 budget = continue_node()->ComputeFirstCharacterSet(budget); | |
5720 if (budget < 0) return budget; | |
5721 ZoneList<CharacterRange>* body_set = | |
5722 continue_node()->first_character_set(); | |
5723 ZoneList<CharacterRange>* union_set = | |
5724 new ZoneList<CharacterRange>(Max(character_set->length(), | |
5725 body_set->length())); | |
5726 CharacterRange::Merge(character_set, | |
5727 body_set, | |
5728 union_set, | |
5729 union_set, | |
5730 union_set); | |
5731 character_set = union_set; | |
5732 } | |
5733 set_first_character_set(character_set); | |
5734 } | |
5735 } | |
5736 return budget; | |
5737 } | |
5738 | |
5739 | |
5740 int NegativeLookaheadChoiceNode::ComputeFirstCharacterSet(int budget) { | |
5741 budget--; | |
5742 if (budget >= 0) { | |
5743 GuardedAlternative successor = this->alternatives()->at(1); | |
5744 RegExpNode* successor_node = successor.node(); | |
5745 budget = successor_node->ComputeFirstCharacterSet(budget); | |
5746 if (budget >= 0) { | |
5747 set_first_character_set(successor_node->first_character_set()); | |
5748 } | |
5749 } | |
5750 return budget; | |
5751 } | |
5752 | |
5753 | |
5754 // The first character set of an EndNode is unknowable. Just use the | |
5755 // default implementation that fails and returns all characters as possible. | |
5756 | |
5757 | |
5758 int AssertionNode::ComputeFirstCharacterSet(int budget) { | |
5759 budget -= 1; | |
5760 if (budget >= 0) { | |
5761 switch (type_) { | |
5762 case AT_END: { | |
5763 set_first_character_set(new ZoneList<CharacterRange>(0)); | |
5764 break; | |
5765 } | |
5766 case AT_START: | |
5767 case AT_BOUNDARY: | |
5768 case AT_NON_BOUNDARY: | |
5769 case AFTER_NEWLINE: | |
5770 case AFTER_NONWORD_CHARACTER: | |
5771 case AFTER_WORD_CHARACTER: { | |
5772 ASSERT_NOT_NULL(on_success()); | |
5773 budget = on_success()->ComputeFirstCharacterSet(budget); | |
5774 if (budget >= 0) { | |
5775 set_first_character_set(on_success()->first_character_set()); | |
5776 } | |
5777 break; | |
5778 } | |
5779 } | |
5780 } | |
5781 return budget; | |
5782 } | |
5783 | |
5784 | |
5785 int ActionNode::ComputeFirstCharacterSet(int budget) { | |
5786 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return kComputeFirstCharacterSetFail; | |
5787 budget--; | |
5788 if (budget >= 0) { | |
5789 ASSERT_NOT_NULL(on_success()); | |
5790 budget = on_success()->ComputeFirstCharacterSet(budget); | |
5791 if (budget >= 0) { | |
5792 set_first_character_set(on_success()->first_character_set()); | |
5793 } | |
5794 } | |
5795 return budget; | |
5796 } | |
5797 | |
5798 | |
5799 int BackReferenceNode::ComputeFirstCharacterSet(int budget) { | |
5800 // We don't know anything about the first character of a backreference | |
5801 // at this point. | |
5802 // The potential first characters are the first characters of the capture, | |
5803 // and the first characters of the on_success node, depending on whether the | |
5804 // capture can be empty and whether it is known to be participating or known | |
5805 // not to be. | |
5806 return kComputeFirstCharacterSetFail; | |
5807 } | |
5808 | 5487 |
5809 | 5488 |
5810 void ChoiceNode::FillInBMInfo( | 5489 void ChoiceNode::FillInBMInfo( |
5811 int offset, BoyerMooreLookahead* bm, bool not_at_start) { | 5490 int offset, BoyerMooreLookahead* bm, bool not_at_start) { |
5812 ZoneList<GuardedAlternative>* alts = alternatives(); | 5491 ZoneList<GuardedAlternative>* alts = alternatives(); |
5813 for (int i = 0; i < alts->length(); i++) { | 5492 for (int i = 0; i < alts->length(); i++) { |
5814 GuardedAlternative& alt = alts->at(i); | 5493 GuardedAlternative& alt = alts->at(i); |
5815 if (alt.guards() != NULL && alt.guards()->length() != 0) { | 5494 if (alt.guards() != NULL && alt.guards()->length() != 0) { |
5816 bm->SetRest(offset); // Give up trying to fill in info. | 5495 bm->SetRest(offset); // Give up trying to fill in info. |
5496 if (offset == 0) set_bm_info(not_at_start, bm); | |
5817 return; | 5497 return; |
5818 } | 5498 } |
5819 alt.node()->FillInBMInfo(offset, bm, not_at_start); | 5499 alt.node()->FillInBMInfo(offset, bm, not_at_start); |
5820 } | 5500 } |
5501 if (offset == 0) set_bm_info(not_at_start, bm); | |
5821 } | 5502 } |
5822 | 5503 |
5823 | 5504 |
5824 void TextNode::FillInBMInfo( | 5505 void TextNode::FillInBMInfo( |
5825 int offset, BoyerMooreLookahead* bm, bool not_at_start) { | 5506 int initial_offset, BoyerMooreLookahead* bm, bool not_at_start) { |
5826 if (offset >= bm->length()) return; | 5507 if (initial_offset >= bm->length()) return; |
5508 int offset = initial_offset; | |
5827 int max_char = bm->max_char(); | 5509 int max_char = bm->max_char(); |
5828 for (int i = 0; i < elements()->length(); i++) { | 5510 for (int i = 0; i < elements()->length(); i++) { |
5829 if (offset >= bm->length()) return; | 5511 if (offset >= bm->length()) { |
5512 if (initial_offset == 0) set_bm_info(not_at_start, bm); | |
5513 return; | |
5514 } | |
5830 TextElement text = elements()->at(i); | 5515 TextElement text = elements()->at(i); |
5831 if (text.type == TextElement::ATOM) { | 5516 if (text.type == TextElement::ATOM) { |
5832 RegExpAtom* atom = text.data.u_atom; | 5517 RegExpAtom* atom = text.data.u_atom; |
5833 for (int j = 0; j < atom->length(); j++, offset++) { | 5518 for (int j = 0; j < atom->length(); j++, offset++) { |
5834 if (offset >= bm->length()) return; | 5519 if (offset >= bm->length()) { |
5520 if (initial_offset == 0) set_bm_info(not_at_start, bm); | |
5521 return; | |
5522 } | |
5835 uc16 character = atom->data()[j]; | 5523 uc16 character = atom->data()[j]; |
5836 if (bm->compiler()->ignore_case()) { | 5524 if (bm->compiler()->ignore_case()) { |
5837 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 5525 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
5838 int length = GetCaseIndependentLetters( | 5526 int length = GetCaseIndependentLetters( |
5839 ISOLATE, | 5527 ISOLATE, |
5840 character, | 5528 character, |
5841 bm->max_char() == String::kMaxAsciiCharCode, | 5529 bm->max_char() == String::kMaxAsciiCharCode, |
5842 chars); | 5530 chars); |
5843 for (int j = 0; j < length; j++) { | 5531 for (int j = 0; j < length; j++) { |
5844 bm->Set(offset, chars[j]); | 5532 bm->Set(offset, chars[j]); |
5845 } | 5533 } |
5846 } else { | 5534 } else { |
5847 if (character <= max_char) bm->Set(offset, character); | 5535 if (character <= max_char) bm->Set(offset, character); |
5848 } | 5536 } |
5849 } | 5537 } |
5850 } else { | 5538 } else { |
5851 ASSERT(text.type == TextElement::CHAR_CLASS); | 5539 ASSERT(text.type == TextElement::CHAR_CLASS); |
5852 RegExpCharacterClass* char_class = text.data.u_char_class; | 5540 RegExpCharacterClass* char_class = text.data.u_char_class; |
5853 ZoneList<CharacterRange>* ranges = char_class->ranges(); | 5541 ZoneList<CharacterRange>* ranges = char_class->ranges(); |
5854 if (char_class->is_negated()) { | 5542 if (char_class->is_negated()) { |
5855 bm->SetAll(offset); | 5543 bm->SetAll(offset); |
5856 } else { | 5544 } else { |
5857 for (int k = 0; k < ranges->length(); k++) { | 5545 for (int k = 0; k < ranges->length(); k++) { |
5858 CharacterRange& range = ranges->at(k); | 5546 CharacterRange& range = ranges->at(k); |
5859 if (range.from() > max_char) continue; | 5547 if (range.from() > max_char) continue; |
5860 int to = Min(max_char, static_cast<int>(range.to())); | 5548 int to = Min(max_char, static_cast<int>(range.to())); |
5861 if (to - range.from() >= BoyerMooreLookahead::kTooManyCharacters) { | 5549 bm->SetInterval(offset, Interval(range.from(), to)); |
5862 bm->SetAll(offset); | |
5863 break; | |
5864 } | |
5865 for (int m = range.from(); m <= to; m++) { | |
5866 bm->Set(offset, m); | |
5867 } | |
5868 } | 5550 } |
5869 } | 5551 } |
5870 offset++; | 5552 offset++; |
5871 } | 5553 } |
5872 } | 5554 } |
5873 if (offset >= bm->length()) return; | 5555 if (offset >= bm->length()) { |
5556 if (initial_offset == 0) set_bm_info(not_at_start, bm); | |
5557 return; | |
5558 } | |
5874 on_success()->FillInBMInfo(offset, | 5559 on_success()->FillInBMInfo(offset, |
5875 bm, | 5560 bm, |
5876 true); // Not at start after a text node. | 5561 true); // Not at start after a text node. |
5562 if (initial_offset == 0) set_bm_info(not_at_start, bm); | |
5877 } | 5563 } |
5878 | 5564 |
5879 | 5565 |
5880 int TextNode::ComputeFirstCharacterSet(int budget) { | |
5881 budget--; | |
5882 if (budget >= 0) { | |
5883 ASSERT_NE(0, elements()->length()); | |
5884 TextElement text = elements()->at(0); | |
5885 if (text.type == TextElement::ATOM) { | |
5886 RegExpAtom* atom = text.data.u_atom; | |
5887 ASSERT_NE(0, atom->length()); | |
5888 uc16 first_char = atom->data()[0]; | |
5889 ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1); | |
5890 range->Add(CharacterRange(first_char, first_char)); | |
5891 set_first_character_set(range); | |
5892 } else { | |
5893 ASSERT(text.type == TextElement::CHAR_CLASS); | |
5894 RegExpCharacterClass* char_class = text.data.u_char_class; | |
5895 ZoneList<CharacterRange>* ranges = char_class->ranges(); | |
5896 // TODO(lrn): Canonicalize ranges when they are created | |
5897 // instead of waiting until now. | |
5898 CharacterRange::Canonicalize(ranges); | |
5899 if (char_class->is_negated()) { | |
5900 int length = ranges->length(); | |
5901 int new_length = length + 1; | |
5902 if (length > 0) { | |
5903 if (ranges->at(0).from() == 0) new_length--; | |
5904 if (ranges->at(length - 1).to() == String::kMaxUtf16CodeUnit) { | |
5905 new_length--; | |
5906 } | |
5907 } | |
5908 ZoneList<CharacterRange>* negated_ranges = | |
5909 new ZoneList<CharacterRange>(new_length); | |
5910 CharacterRange::Negate(ranges, negated_ranges); | |
5911 set_first_character_set(negated_ranges); | |
5912 } else { | |
5913 set_first_character_set(ranges); | |
5914 } | |
5915 } | |
5916 } | |
5917 return budget; | |
5918 } | |
5919 | |
5920 | |
5921 | |
5922 // ------------------------------------------------------------------- | 5566 // ------------------------------------------------------------------- |
5923 // Dispatch table construction | 5567 // Dispatch table construction |
5924 | 5568 |
5925 | 5569 |
5926 void DispatchTableConstructor::VisitEnd(EndNode* that) { | 5570 void DispatchTableConstructor::VisitEnd(EndNode* that) { |
5927 AddRange(CharacterRange::Everything()); | 5571 AddRange(CharacterRange::Everything()); |
5928 } | 5572 } |
5929 | 5573 |
5930 | 5574 |
5931 void DispatchTableConstructor::BuildTable(ChoiceNode* node) { | 5575 void DispatchTableConstructor::BuildTable(ChoiceNode* node) { |
(...skipping 201 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
6133 } | 5777 } |
6134 | 5778 |
6135 return compiler.Assemble(¯o_assembler, | 5779 return compiler.Assemble(¯o_assembler, |
6136 node, | 5780 node, |
6137 data->capture_count, | 5781 data->capture_count, |
6138 pattern); | 5782 pattern); |
6139 } | 5783 } |
6140 | 5784 |
6141 | 5785 |
6142 }} // namespace v8::internal | 5786 }} // namespace v8::internal |
OLD | NEW |