OLD | NEW |
1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. |
2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
4 // met: | 4 // met: |
5 // | 5 // |
6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
(...skipping 90 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
101 Factory* factory = isolate->factory(); | 101 Factory* factory = isolate->factory(); |
102 Handle<FixedArray> elements = factory->NewFixedArray(2); | 102 Handle<FixedArray> elements = factory->NewFixedArray(2); |
103 elements->set(0, *pattern); | 103 elements->set(0, *pattern); |
104 elements->set(1, *error_text); | 104 elements->set(1, *error_text); |
105 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); | 105 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); |
106 Handle<Object> regexp_err = factory->NewSyntaxError(message, array); | 106 Handle<Object> regexp_err = factory->NewSyntaxError(message, array); |
107 isolate->Throw(*regexp_err); | 107 isolate->Throw(*regexp_err); |
108 } | 108 } |
109 | 109 |
110 | 110 |
| 111 ContainedInLattice AddRange(ContainedInLattice containment, |
| 112 const int* ranges, |
| 113 int ranges_length, |
| 114 Interval new_range) { |
| 115 ASSERT((ranges_length & 1) == 1); |
| 116 ASSERT(ranges[ranges_length - 1] == String::kMaxUtf16CodeUnit + 1); |
| 117 if (containment == kLatticeUnknown) return containment; |
| 118 bool inside = false; |
| 119 int last = 0; |
| 120 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) { |
| 121 // Consider the range from last to ranges[i]. |
| 122 // We haven't got to the new range yet. |
| 123 if (ranges[i] <= new_range.from()) continue; |
| 124 // New range is wholly inside last-ranges[i]. Note that new_range.to() is |
| 125 // inclusive, but the values in ranges are not. |
| 126 if (last <= new_range.from() && new_range.to() < ranges[i]) { |
| 127 return Combine(containment, inside ? kLatticeIn : kLatticeOut); |
| 128 } |
| 129 return kLatticeUnknown; |
| 130 } |
| 131 return containment; |
| 132 } |
| 133 |
| 134 |
111 // More makes code generation slower, less makes V8 benchmark score lower. | 135 // More makes code generation slower, less makes V8 benchmark score lower. |
112 const int kMaxLookaheadForBoyerMoore = 8; | 136 const int kMaxLookaheadForBoyerMoore = 8; |
113 // In a 3-character pattern you can maximally step forwards 3 characters | 137 // In a 3-character pattern you can maximally step forwards 3 characters |
114 // at a time, which is not always enough to pay for the extra logic. | 138 // at a time, which is not always enough to pay for the extra logic. |
115 const int kPatternTooShortForBoyerMoore = 2; | 139 const int kPatternTooShortForBoyerMoore = 2; |
116 | 140 |
117 | 141 |
118 // Identifies the sort of regexps where the regexp engine is faster | 142 // Identifies the sort of regexps where the regexp engine is faster |
119 // than the code used for atom matches. | 143 // than the code used for atom matches. |
120 static bool HasFewDifferentCharacters(Handle<String> pattern) { | 144 static bool HasFewDifferentCharacters(Handle<String> pattern) { |
(...skipping 2029 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2150 | 2174 |
2151 | 2175 |
2152 void ActionNode::FillInBMInfo(int offset, | 2176 void ActionNode::FillInBMInfo(int offset, |
2153 BoyerMooreLookahead* bm, | 2177 BoyerMooreLookahead* bm, |
2154 bool not_at_start) { | 2178 bool not_at_start) { |
2155 if (type_ == BEGIN_SUBMATCH) { | 2179 if (type_ == BEGIN_SUBMATCH) { |
2156 bm->SetRest(offset); | 2180 bm->SetRest(offset); |
2157 } else if (type_ != POSITIVE_SUBMATCH_SUCCESS) { | 2181 } else if (type_ != POSITIVE_SUBMATCH_SUCCESS) { |
2158 on_success()->FillInBMInfo(offset, bm, not_at_start); | 2182 on_success()->FillInBMInfo(offset, bm, not_at_start); |
2159 } | 2183 } |
| 2184 SaveBMInfo(bm, not_at_start, offset); |
2160 } | 2185 } |
2161 | 2186 |
2162 | 2187 |
2163 int AssertionNode::EatsAtLeast(int still_to_find, | 2188 int AssertionNode::EatsAtLeast(int still_to_find, |
2164 int recursion_depth, | 2189 int recursion_depth, |
2165 bool not_at_start) { | 2190 bool not_at_start) { |
2166 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; | 2191 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; |
2167 // If we know we are not at the start and we are asked "how many characters | 2192 // If we know we are not at the start and we are asked "how many characters |
2168 // will you match if you succeed?" then we can answer anything since false | 2193 // will you match if you succeed?" then we can answer anything since false |
2169 // implies false. So lets just return the max answer (still_to_find) since | 2194 // implies false. So lets just return the max answer (still_to_find) since |
2170 // that won't prevent us from preloading a lot of characters for the other | 2195 // that won't prevent us from preloading a lot of characters for the other |
2171 // branches in the node graph. | 2196 // branches in the node graph. |
2172 if (type() == AT_START && not_at_start) return still_to_find; | 2197 if (type() == AT_START && not_at_start) return still_to_find; |
2173 return on_success()->EatsAtLeast(still_to_find, | 2198 return on_success()->EatsAtLeast(still_to_find, |
2174 recursion_depth + 1, | 2199 recursion_depth + 1, |
2175 not_at_start); | 2200 not_at_start); |
2176 } | 2201 } |
2177 | 2202 |
2178 | 2203 |
2179 void AssertionNode::FillInBMInfo( | 2204 void AssertionNode::FillInBMInfo( |
2180 int offset, BoyerMooreLookahead* bm, bool not_at_start) { | 2205 int offset, BoyerMooreLookahead* bm, bool not_at_start) { |
2181 // Match the behaviour of EatsAtLeast on this node. | 2206 // Match the behaviour of EatsAtLeast on this node. |
2182 if (type() == AT_START && not_at_start) return; | 2207 if (type() == AT_START && not_at_start) return; |
2183 on_success()->FillInBMInfo(offset, bm, not_at_start); | 2208 on_success()->FillInBMInfo(offset, bm, not_at_start); |
| 2209 SaveBMInfo(bm, not_at_start, offset); |
2184 } | 2210 } |
2185 | 2211 |
2186 | 2212 |
2187 int BackReferenceNode::EatsAtLeast(int still_to_find, | 2213 int BackReferenceNode::EatsAtLeast(int still_to_find, |
2188 int recursion_depth, | 2214 int recursion_depth, |
2189 bool not_at_start) { | 2215 bool not_at_start) { |
2190 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; | 2216 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; |
2191 return on_success()->EatsAtLeast(still_to_find, | 2217 return on_success()->EatsAtLeast(still_to_find, |
2192 recursion_depth + 1, | 2218 recursion_depth + 1, |
2193 not_at_start); | 2219 not_at_start); |
(...skipping 416 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2610 if (body_can_be_zero_length_ || info()->visited) return; | 2636 if (body_can_be_zero_length_ || info()->visited) return; |
2611 VisitMarker marker(info()); | 2637 VisitMarker marker(info()); |
2612 return ChoiceNode::GetQuickCheckDetails(details, | 2638 return ChoiceNode::GetQuickCheckDetails(details, |
2613 compiler, | 2639 compiler, |
2614 characters_filled_in, | 2640 characters_filled_in, |
2615 not_at_start); | 2641 not_at_start); |
2616 } | 2642 } |
2617 | 2643 |
2618 | 2644 |
2619 void LoopChoiceNode::FillInBMInfo( | 2645 void LoopChoiceNode::FillInBMInfo( |
2620 int offset, BoyerMooreLookahead* bm, bool nas) { | 2646 int offset, BoyerMooreLookahead* bm, bool not_at_start) { |
2621 if (body_can_be_zero_length_) { | 2647 if (body_can_be_zero_length_) { |
2622 bm->SetRest(offset); | 2648 bm->SetRest(offset); |
| 2649 SaveBMInfo(bm, not_at_start, offset); |
2623 return; | 2650 return; |
2624 } | 2651 } |
2625 ChoiceNode::FillInBMInfo(offset, bm, nas); | 2652 ChoiceNode::FillInBMInfo(offset, bm, not_at_start); |
| 2653 SaveBMInfo(bm, not_at_start, offset); |
2626 } | 2654 } |
2627 | 2655 |
2628 | 2656 |
2629 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, | 2657 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, |
2630 RegExpCompiler* compiler, | 2658 RegExpCompiler* compiler, |
2631 int characters_filled_in, | 2659 int characters_filled_in, |
2632 bool not_at_start) { | 2660 bool not_at_start) { |
2633 not_at_start = (not_at_start || not_at_start_); | 2661 not_at_start = (not_at_start || not_at_start_); |
2634 int choice_count = alternatives_->length(); | 2662 int choice_count = alternatives_->length(); |
2635 ASSERT(choice_count > 0); | 2663 ASSERT(choice_count > 0); |
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2703 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok); | 2731 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok); |
2704 } | 2732 } |
2705 assembler->CheckCharacter('\n', &ok); | 2733 assembler->CheckCharacter('\n', &ok); |
2706 assembler->CheckNotCharacter('\r', new_trace.backtrack()); | 2734 assembler->CheckNotCharacter('\r', new_trace.backtrack()); |
2707 } | 2735 } |
2708 assembler->Bind(&ok); | 2736 assembler->Bind(&ok); |
2709 on_success->Emit(compiler, &new_trace); | 2737 on_success->Emit(compiler, &new_trace); |
2710 } | 2738 } |
2711 | 2739 |
2712 | 2740 |
2713 // Emit the code to handle \b and \B (word-boundary or non-word-boundary) | |
2714 // when we know whether the next character must be a word character or not. | |
2715 static void EmitHalfBoundaryCheck(AssertionNode::AssertionNodeType type, | |
2716 RegExpCompiler* compiler, | |
2717 RegExpNode* on_success, | |
2718 Trace* trace) { | |
2719 RegExpMacroAssembler* assembler = compiler->macro_assembler(); | |
2720 Label done; | |
2721 | |
2722 Trace new_trace(*trace); | |
2723 | |
2724 bool expect_word_character = (type == AssertionNode::AFTER_WORD_CHARACTER); | |
2725 Label* on_word = expect_word_character ? &done : new_trace.backtrack(); | |
2726 Label* on_non_word = expect_word_character ? new_trace.backtrack() : &done; | |
2727 | |
2728 // Check whether previous character was a word character. | |
2729 switch (trace->at_start()) { | |
2730 case Trace::TRUE: | |
2731 if (expect_word_character) { | |
2732 assembler->GoTo(on_non_word); | |
2733 } | |
2734 break; | |
2735 case Trace::UNKNOWN: | |
2736 ASSERT_EQ(0, trace->cp_offset()); | |
2737 assembler->CheckAtStart(on_non_word); | |
2738 // Fall through. | |
2739 case Trace::FALSE: | |
2740 int prev_char_offset = trace->cp_offset() - 1; | |
2741 assembler->LoadCurrentCharacter(prev_char_offset, NULL, false, 1); | |
2742 EmitWordCheck(assembler, on_word, on_non_word, expect_word_character); | |
2743 // We may or may not have loaded the previous character. | |
2744 new_trace.InvalidateCurrentCharacter(); | |
2745 } | |
2746 | |
2747 assembler->Bind(&done); | |
2748 | |
2749 on_success->Emit(compiler, &new_trace); | |
2750 } | |
2751 | |
2752 | |
2753 // Emit the code to handle \b and \B (word-boundary or non-word-boundary). | 2741 // Emit the code to handle \b and \B (word-boundary or non-word-boundary). |
2754 static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type, | 2742 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) { |
2755 RegExpCompiler* compiler, | |
2756 RegExpNode* on_success, | |
2757 Trace* trace) { | |
2758 RegExpMacroAssembler* assembler = compiler->macro_assembler(); | 2743 RegExpMacroAssembler* assembler = compiler->macro_assembler(); |
2759 Label before_non_word; | 2744 Trace::TriBool next_is_word_character = Trace::UNKNOWN; |
2760 Label before_word; | 2745 bool not_at_start = (trace->at_start() == Trace::FALSE); |
2761 if (trace->characters_preloaded() != 1) { | 2746 BoyerMooreLookahead* lookahead = bm_info(not_at_start); |
2762 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word); | 2747 if (lookahead == NULL) { |
| 2748 int eats_at_least = |
| 2749 Min(kMaxLookaheadForBoyerMoore, |
| 2750 EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start)); |
| 2751 if (eats_at_least >= 1) { |
| 2752 BoyerMooreLookahead* bm = |
| 2753 new BoyerMooreLookahead(eats_at_least, compiler); |
| 2754 FillInBMInfo(0, bm, not_at_start); |
| 2755 if (bm->at(0)->is_non_word()) next_is_word_character = Trace::FALSE; |
| 2756 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE; |
| 2757 } |
| 2758 } else { |
| 2759 if (lookahead->at(0)->is_non_word()) next_is_word_character = Trace::FALSE; |
| 2760 if (lookahead->at(0)->is_word()) next_is_word_character = Trace::TRUE; |
2763 } | 2761 } |
2764 // Fall through on non-word. | 2762 bool at_boundary = (type_ == AssertionNode::AT_BOUNDARY); |
2765 EmitWordCheck(assembler, &before_word, &before_non_word, false); | 2763 if (next_is_word_character == Trace::UNKNOWN) { |
| 2764 Label before_non_word; |
| 2765 Label before_word; |
| 2766 if (trace->characters_preloaded() != 1) { |
| 2767 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word); |
| 2768 } |
| 2769 // Fall through on non-word. |
| 2770 EmitWordCheck(assembler, &before_word, &before_non_word, false); |
| 2771 // Next character is not a word character. |
| 2772 assembler->Bind(&before_non_word); |
| 2773 Label ok; |
| 2774 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); |
| 2775 assembler->GoTo(&ok); |
2766 | 2776 |
2767 // We will be loading the previous character into the current character | 2777 assembler->Bind(&before_word); |
2768 // register. | 2778 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); |
| 2779 assembler->Bind(&ok); |
| 2780 } else if (next_is_word_character == Trace::TRUE) { |
| 2781 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); |
| 2782 } else { |
| 2783 ASSERT(next_is_word_character == Trace::FALSE); |
| 2784 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); |
| 2785 } |
| 2786 } |
| 2787 |
| 2788 |
| 2789 void AssertionNode::BacktrackIfPrevious( |
| 2790 RegExpCompiler* compiler, |
| 2791 Trace* trace, |
| 2792 AssertionNode::IfPrevious backtrack_if_previous) { |
| 2793 RegExpMacroAssembler* assembler = compiler->macro_assembler(); |
2769 Trace new_trace(*trace); | 2794 Trace new_trace(*trace); |
2770 new_trace.InvalidateCurrentCharacter(); | 2795 new_trace.InvalidateCurrentCharacter(); |
2771 | 2796 |
2772 Label ok; | 2797 Label fall_through, dummy; |
2773 Label* boundary; | |
2774 Label* not_boundary; | |
2775 if (type == AssertionNode::AT_BOUNDARY) { | |
2776 boundary = &ok; | |
2777 not_boundary = new_trace.backtrack(); | |
2778 } else { | |
2779 not_boundary = &ok; | |
2780 boundary = new_trace.backtrack(); | |
2781 } | |
2782 | 2798 |
2783 // Next character is not a word character. | 2799 Label* non_word = backtrack_if_previous == kIsNonWord ? |
2784 assembler->Bind(&before_non_word); | 2800 new_trace.backtrack() : |
| 2801 &fall_through; |
| 2802 Label* word = backtrack_if_previous == kIsNonWord ? |
| 2803 &fall_through : |
| 2804 new_trace.backtrack(); |
| 2805 |
2785 if (new_trace.cp_offset() == 0) { | 2806 if (new_trace.cp_offset() == 0) { |
2786 // The start of input counts as a non-word character, so the question is | 2807 // The start of input counts as a non-word character, so the question is |
2787 // decided if we are at the start. | 2808 // decided if we are at the start. |
2788 assembler->CheckAtStart(not_boundary); | 2809 assembler->CheckAtStart(non_word); |
2789 } | 2810 } |
2790 // We already checked that we are not at the start of input so it must be | 2811 // We already checked that we are not at the start of input so it must be |
2791 // OK to load the previous character. | 2812 // OK to load the previous character. |
2792 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, | 2813 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false); |
2793 &ok, // Unused dummy label in this call. | 2814 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord); |
2794 false); | |
2795 // Fall through on non-word. | |
2796 EmitWordCheck(assembler, boundary, not_boundary, false); | |
2797 assembler->GoTo(not_boundary); | |
2798 | 2815 |
2799 // Next character is a word character. | 2816 assembler->Bind(&fall_through); |
2800 assembler->Bind(&before_word); | 2817 on_success()->Emit(compiler, &new_trace); |
2801 if (new_trace.cp_offset() == 0) { | |
2802 // The start of input counts as a non-word character, so the question is | |
2803 // decided if we are at the start. | |
2804 assembler->CheckAtStart(boundary); | |
2805 } | |
2806 // We already checked that we are not at the start of input so it must be | |
2807 // OK to load the previous character. | |
2808 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, | |
2809 &ok, // Unused dummy label in this call. | |
2810 false); | |
2811 bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY); | |
2812 EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word); | |
2813 | |
2814 assembler->Bind(&ok); | |
2815 | |
2816 on_success->Emit(compiler, &new_trace); | |
2817 } | 2818 } |
2818 | 2819 |
2819 | 2820 |
2820 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details, | 2821 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details, |
2821 RegExpCompiler* compiler, | 2822 RegExpCompiler* compiler, |
2822 int filled_in, | 2823 int filled_in, |
2823 bool not_at_start) { | 2824 bool not_at_start) { |
2824 if (type_ == AT_START && not_at_start) { | 2825 if (type_ == AT_START && not_at_start) { |
2825 details->set_cannot_match(); | 2826 details->set_cannot_match(); |
2826 return; | 2827 return; |
(...skipping 27 matching lines...) Expand all Loading... |
2854 on_success()->Emit(compiler, &at_start_trace); | 2855 on_success()->Emit(compiler, &at_start_trace); |
2855 return; | 2856 return; |
2856 } | 2857 } |
2857 } | 2858 } |
2858 break; | 2859 break; |
2859 case AFTER_NEWLINE: | 2860 case AFTER_NEWLINE: |
2860 EmitHat(compiler, on_success(), trace); | 2861 EmitHat(compiler, on_success(), trace); |
2861 return; | 2862 return; |
2862 case AT_BOUNDARY: | 2863 case AT_BOUNDARY: |
2863 case AT_NON_BOUNDARY: { | 2864 case AT_NON_BOUNDARY: { |
2864 EmitBoundaryCheck(type_, compiler, on_success(), trace); | 2865 EmitBoundaryCheck(compiler, trace); |
2865 return; | 2866 return; |
2866 } | 2867 } |
2867 case AFTER_WORD_CHARACTER: | |
2868 case AFTER_NONWORD_CHARACTER: { | |
2869 EmitHalfBoundaryCheck(type_, compiler, on_success(), trace); | |
2870 } | |
2871 } | 2868 } |
2872 on_success()->Emit(compiler, trace); | 2869 on_success()->Emit(compiler, trace); |
2873 } | 2870 } |
2874 | 2871 |
2875 | 2872 |
2876 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { | 2873 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { |
2877 if (quick_check == NULL) return false; | 2874 if (quick_check == NULL) return false; |
2878 if (offset >= quick_check->characters()) return false; | 2875 if (offset >= quick_check->characters()) return false; |
2879 return quick_check->positions(offset)->determines_perfectly; | 2876 return quick_check->positions(offset)->determines_perfectly; |
2880 } | 2877 } |
(...skipping 389 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3270 return alt_gens_[i]; | 3267 return alt_gens_[i]; |
3271 } | 3268 } |
3272 | 3269 |
3273 private: | 3270 private: |
3274 static const int kAFew = 10; | 3271 static const int kAFew = 10; |
3275 ZoneList<AlternativeGeneration*> alt_gens_; | 3272 ZoneList<AlternativeGeneration*> alt_gens_; |
3276 AlternativeGeneration a_few_alt_gens_[kAFew]; | 3273 AlternativeGeneration a_few_alt_gens_[kAFew]; |
3277 }; | 3274 }; |
3278 | 3275 |
3279 | 3276 |
| 3277 // The '2' variant is has inclusive from and exclusive to. |
| 3278 static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, |
| 3279 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A, |
| 3280 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, 0x10000 }; |
| 3281 static const int kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges); |
| 3282 |
| 3283 static const int kWordRanges[] = { |
| 3284 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 }; |
| 3285 static const int kWordRangeCount = ARRAY_SIZE(kWordRanges); |
| 3286 static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 }; |
| 3287 static const int kDigitRangeCount = ARRAY_SIZE(kDigitRanges); |
| 3288 static const int kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 }; |
| 3289 static const int kSurrogateRangeCount = ARRAY_SIZE(kSurrogateRanges); |
| 3290 static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E, |
| 3291 0x2028, 0x202A, 0x10000 }; |
| 3292 static const int kLineTerminatorRangeCount = ARRAY_SIZE(kLineTerminatorRanges); |
| 3293 |
| 3294 |
| 3295 void BoyerMoorePositionInfo::Set(int character) { |
| 3296 SetInterval(Interval(character, character)); |
| 3297 } |
| 3298 |
| 3299 |
| 3300 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) { |
| 3301 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval); |
| 3302 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval); |
| 3303 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval); |
| 3304 surrogate_ = |
| 3305 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval); |
| 3306 if (interval.to() - interval.from() >= kMapSize - 1) { |
| 3307 if (map_count_ != kMapSize) { |
| 3308 map_count_ = kMapSize; |
| 3309 for (int i = 0; i < kMapSize; i++) map_->at(i) = true; |
| 3310 } |
| 3311 return; |
| 3312 } |
| 3313 for (int i = interval.from(); i <= interval.to(); i++) { |
| 3314 int mod_character = (i & kMask); |
| 3315 if (!map_->at(mod_character)) { |
| 3316 map_count_++; |
| 3317 map_->at(mod_character) = true; |
| 3318 } |
| 3319 if (map_count_ == kMapSize) return; |
| 3320 } |
| 3321 } |
| 3322 |
| 3323 |
| 3324 void BoyerMoorePositionInfo::SetAll() { |
| 3325 s_ = w_ = d_ = kLatticeUnknown; |
| 3326 if (map_count_ != kMapSize) { |
| 3327 map_count_ = kMapSize; |
| 3328 for (int i = 0; i < kMapSize; i++) map_->at(i) = true; |
| 3329 } |
| 3330 } |
| 3331 |
| 3332 |
3280 BoyerMooreLookahead::BoyerMooreLookahead( | 3333 BoyerMooreLookahead::BoyerMooreLookahead( |
3281 int length, int map_length, RegExpCompiler* compiler) | 3334 int length, RegExpCompiler* compiler) |
3282 : length_(length), | 3335 : length_(length), |
3283 map_length_(map_length), | |
3284 compiler_(compiler) { | 3336 compiler_(compiler) { |
3285 ASSERT(IsPowerOf2(map_length)); | |
3286 if (compiler->ascii()) { | 3337 if (compiler->ascii()) { |
3287 max_char_ = String::kMaxAsciiCharCode; | 3338 max_char_ = String::kMaxAsciiCharCode; |
3288 } else { | 3339 } else { |
3289 max_char_ = String::kMaxUtf16CodeUnit; | 3340 max_char_ = String::kMaxUtf16CodeUnit; |
3290 } | 3341 } |
3291 bitmaps_ = new ZoneList<ZoneList<bool>*>(length); | 3342 bitmaps_ = new ZoneList<BoyerMoorePositionInfo*>(length); |
3292 for (int i = 0; i < length; i++) { | 3343 for (int i = 0; i < length; i++) { |
3293 bitmaps_->Add(new ZoneList<bool>(map_length)); | 3344 bitmaps_->Add(new BoyerMoorePositionInfo()); |
3294 ZoneList<bool>* map = bitmaps_->at(i); | |
3295 for (int i = 0; i < map_length; i++) { | |
3296 map->Add(false); | |
3297 } | |
3298 } | 3345 } |
3299 } | 3346 } |
3300 | 3347 |
3301 | 3348 |
3302 // Find the longest range of lookahead that has the fewest number of different | 3349 // Find the longest range of lookahead that has the fewest number of different |
3303 // characters that can occur at a given position. Since we are optimizing two | 3350 // characters that can occur at a given position. Since we are optimizing two |
3304 // different parameters at once this is a tradeoff. | 3351 // different parameters at once this is a tradeoff. |
3305 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) { | 3352 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) { |
3306 int biggest_points = 0; | 3353 int biggest_points = 0; |
| 3354 // If more than 32 characters out of 128 can occur it is unlikely that we can |
| 3355 // be lucky enough to step forwards much of the time. |
| 3356 const int kMaxMax = 32; |
3307 for (int max_number_of_chars = 4; | 3357 for (int max_number_of_chars = 4; |
3308 max_number_of_chars < kTooManyCharacters; | 3358 max_number_of_chars < kMaxMax; |
3309 max_number_of_chars *= 2) { | 3359 max_number_of_chars *= 2) { |
3310 biggest_points = | 3360 biggest_points = |
3311 FindBestInterval(max_number_of_chars, biggest_points, from, to); | 3361 FindBestInterval(max_number_of_chars, biggest_points, from, to); |
3312 } | 3362 } |
3313 if (biggest_points == 0) return false; | 3363 if (biggest_points == 0) return false; |
3314 return true; | 3364 return true; |
3315 } | 3365 } |
3316 | 3366 |
3317 | 3367 |
3318 // Find the highest-points range between 0 and length_ where the character | 3368 // Find the highest-points range between 0 and length_ where the character |
3319 // information is not too vague. 'Too vague' means that there are more than | 3369 // information is not too vague. 'Too vague' means that there are more than |
3320 // max_number_of_chars that can occur at this position. Calculates the number | 3370 // max_number_of_chars that can occur at this position. Calculates the number |
3321 // of points as the product of width-of-the-range and | 3371 // of points as the product of width-of-the-range and |
3322 // probability-of-finding-one-of-the-characters, where the probability is | 3372 // probability-of-finding-one-of-the-characters, where the probability is |
3323 // calculated using the frequency distribution of the sample subject string. | 3373 // calculated using the frequency distribution of the sample subject string. |
3324 int BoyerMooreLookahead::FindBestInterval( | 3374 int BoyerMooreLookahead::FindBestInterval( |
3325 int max_number_of_chars, int old_biggest_points, int* from, int* to) { | 3375 int max_number_of_chars, int old_biggest_points, int* from, int* to) { |
3326 int biggest_points = old_biggest_points; | 3376 int biggest_points = old_biggest_points; |
3327 static const int kSize = RegExpMacroAssembler::kTableSize; | 3377 static const int kSize = RegExpMacroAssembler::kTableSize; |
3328 for (int i = 0; i < length_; ) { | 3378 for (int i = 0; i < length_; ) { |
3329 while (i < length_ && Count(i) > max_number_of_chars) i++; | 3379 while (i < length_ && Count(i) > max_number_of_chars) i++; |
3330 if (i == length_) break; | 3380 if (i == length_) break; |
3331 int remembered_from = i; | 3381 int remembered_from = i; |
3332 bool union_map[kSize]; | 3382 bool union_map[kSize]; |
3333 for (int j = 0; j < kSize; j++) union_map[j] = false; | 3383 for (int j = 0; j < kSize; j++) union_map[j] = false; |
3334 while (i < length_ && Count(i) <= max_number_of_chars) { | 3384 while (i < length_ && Count(i) <= max_number_of_chars) { |
3335 ZoneList<bool>* map = bitmaps_->at(i); | 3385 BoyerMoorePositionInfo* map = bitmaps_->at(i); |
3336 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j); | 3386 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j); |
3337 i++; | 3387 i++; |
3338 } | 3388 } |
3339 int frequency = 0; | 3389 int frequency = 0; |
3340 for (int j = 0; j < kSize; j++) { | 3390 for (int j = 0; j < kSize; j++) { |
3341 if (union_map[j]) { | 3391 if (union_map[j]) { |
3342 // Add 1 to the frequency to give a small per-character boost for | 3392 // Add 1 to the frequency to give a small per-character boost for |
3343 // the cases where our sampling is not good enough and many | 3393 // the cases where our sampling is not good enough and many |
3344 // characters have a frequency of zero. This means the frequency | 3394 // characters have a frequency of zero. This means the frequency |
3345 // can theoretically be up to 2*kSize though we treat it mostly as | 3395 // can theoretically be up to 2*kSize though we treat it mostly as |
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3380 | 3430 |
3381 const int kSkipArrayEntry = 0; | 3431 const int kSkipArrayEntry = 0; |
3382 const int kDontSkipArrayEntry = 1; | 3432 const int kDontSkipArrayEntry = 1; |
3383 | 3433 |
3384 for (int i = 0; i < kSize; i++) { | 3434 for (int i = 0; i < kSize; i++) { |
3385 boolean_skip_table->set(i, kSkipArrayEntry); | 3435 boolean_skip_table->set(i, kSkipArrayEntry); |
3386 } | 3436 } |
3387 int skip = max_lookahead + 1 - min_lookahead; | 3437 int skip = max_lookahead + 1 - min_lookahead; |
3388 | 3438 |
3389 for (int i = max_lookahead; i >= min_lookahead; i--) { | 3439 for (int i = max_lookahead; i >= min_lookahead; i--) { |
3390 ZoneList<bool>* map = bitmaps_->at(i); | 3440 BoyerMoorePositionInfo* map = bitmaps_->at(i); |
3391 for (int j = 0; j < map_length_; j++) { | 3441 for (int j = 0; j < kSize; j++) { |
3392 if (map->at(j)) { | 3442 if (map->at(j)) { |
3393 boolean_skip_table->set(j, kDontSkipArrayEntry); | 3443 boolean_skip_table->set(j, kDontSkipArrayEntry); |
3394 } | 3444 } |
3395 } | 3445 } |
3396 } | 3446 } |
3397 | 3447 |
3398 return skip; | 3448 return skip; |
3399 } | 3449 } |
3400 | 3450 |
3401 | 3451 |
3402 // See comment above on the implementation of GetSkipTable. | 3452 // See comment above on the implementation of GetSkipTable. |
3403 bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { | 3453 bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { |
| 3454 const int kSize = RegExpMacroAssembler::kTableSize; |
| 3455 |
3404 int min_lookahead = 0; | 3456 int min_lookahead = 0; |
3405 int max_lookahead = 0; | 3457 int max_lookahead = 0; |
3406 | 3458 |
3407 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false; | 3459 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false; |
3408 | 3460 |
3409 bool found_single_character = false; | 3461 bool found_single_character = false; |
3410 bool abandoned_search_for_single_character = false; | |
3411 int single_character = 0; | 3462 int single_character = 0; |
3412 for (int i = max_lookahead; i >= min_lookahead; i--) { | 3463 for (int i = max_lookahead; i >= min_lookahead; i--) { |
3413 ZoneList<bool>* map = bitmaps_->at(i); | 3464 BoyerMoorePositionInfo* map = bitmaps_->at(i); |
3414 for (int j = 0; j < map_length_; j++) { | 3465 if (map->map_count() > 1 || |
| 3466 (found_single_character && map->map_count() != 0)) { |
| 3467 found_single_character = false; |
| 3468 break; |
| 3469 } |
| 3470 for (int j = 0; j < kSize; j++) { |
3415 if (map->at(j)) { | 3471 if (map->at(j)) { |
3416 if (found_single_character) { | 3472 found_single_character = true; |
3417 found_single_character = false; // Found two. | 3473 single_character = j; |
3418 abandoned_search_for_single_character = true; | 3474 break; |
3419 break; | |
3420 } else { | |
3421 found_single_character = true; | |
3422 single_character = j; | |
3423 } | |
3424 } | 3475 } |
3425 } | 3476 } |
3426 if (abandoned_search_for_single_character) break; | |
3427 } | 3477 } |
3428 | 3478 |
3429 int lookahead_width = max_lookahead + 1 - min_lookahead; | 3479 int lookahead_width = max_lookahead + 1 - min_lookahead; |
3430 | 3480 |
3431 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) { | 3481 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) { |
3432 // The mask-compare can probably handle this better. | 3482 // The mask-compare can probably handle this better. |
3433 return false; | 3483 return false; |
3434 } | 3484 } |
3435 | 3485 |
3436 if (found_single_character) { | 3486 if (found_single_character) { |
3437 Label cont, again; | 3487 Label cont, again; |
3438 masm->Bind(&again); | 3488 masm->Bind(&again); |
3439 masm->LoadCurrentCharacter(max_lookahead, &cont, true); | 3489 masm->LoadCurrentCharacter(max_lookahead, &cont, true); |
3440 if (max_char_ > map_length_) { | 3490 if (max_char_ > kSize) { |
3441 ASSERT(map_length_ == RegExpMacroAssembler::kTableSize); | |
3442 masm->CheckCharacterAfterAnd(single_character, | 3491 masm->CheckCharacterAfterAnd(single_character, |
3443 RegExpMacroAssembler::kTableMask, | 3492 RegExpMacroAssembler::kTableMask, |
3444 &cont); | 3493 &cont); |
3445 } else { | 3494 } else { |
3446 masm->CheckCharacter(single_character, &cont); | 3495 masm->CheckCharacter(single_character, &cont); |
3447 } | 3496 } |
3448 masm->AdvanceCurrentPosition(lookahead_width); | 3497 masm->AdvanceCurrentPosition(lookahead_width); |
3449 masm->GoTo(&again); | 3498 masm->GoTo(&again); |
3450 masm->Bind(&cont); | 3499 masm->Bind(&cont); |
3451 return true; | 3500 return true; |
3452 } | 3501 } |
3453 | 3502 |
3454 Handle<ByteArray> boolean_skip_table = | 3503 Handle<ByteArray> boolean_skip_table = |
3455 FACTORY->NewByteArray(map_length_, TENURED); | 3504 FACTORY->NewByteArray(kSize, TENURED); |
3456 int skip_distance = GetSkipTable( | 3505 int skip_distance = GetSkipTable( |
3457 min_lookahead, max_lookahead, boolean_skip_table); | 3506 min_lookahead, max_lookahead, boolean_skip_table); |
3458 ASSERT(skip_distance != 0); | 3507 ASSERT(skip_distance != 0); |
3459 | 3508 |
3460 Label cont, again; | 3509 Label cont, again; |
3461 masm->Bind(&again); | 3510 masm->Bind(&again); |
3462 masm->LoadCurrentCharacter(max_lookahead, &cont, true); | 3511 masm->LoadCurrentCharacter(max_lookahead, &cont, true); |
3463 masm->CheckBitInTable(boolean_skip_table, &cont); | 3512 masm->CheckBitInTable(boolean_skip_table, &cont); |
3464 masm->AdvanceCurrentPosition(skip_distance); | 3513 masm->AdvanceCurrentPosition(skip_distance); |
3465 masm->GoTo(&again); | 3514 masm->GoTo(&again); |
(...skipping 158 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3624 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) == | 3673 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) == |
3625 this) { | 3674 this) { |
3626 // At this point we know that we are at a non-greedy loop that will eat | 3675 // At this point we know that we are at a non-greedy loop that will eat |
3627 // any character one at a time. Any non-anchored regexp has such a | 3676 // any character one at a time. Any non-anchored regexp has such a |
3628 // loop prepended to it in order to find where it starts. We look for | 3677 // loop prepended to it in order to find where it starts. We look for |
3629 // a pattern of the form ...abc... where we can look 6 characters ahead | 3678 // a pattern of the form ...abc... where we can look 6 characters ahead |
3630 // and step forwards 3 if the character is not one of abc. Abc need | 3679 // and step forwards 3 if the character is not one of abc. Abc need |
3631 // not be atoms, they can be any reasonably limited character class or | 3680 // not be atoms, they can be any reasonably limited character class or |
3632 // small alternation. | 3681 // small alternation. |
3633 ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes. | 3682 ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes. |
3634 eats_at_least = | 3683 BoyerMooreLookahead* lookahead = bm_info(not_at_start); |
3635 Min(kMaxLookaheadForBoyerMoore, | 3684 if (lookahead == NULL) { |
3636 EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start)); | 3685 eats_at_least = |
3637 if (eats_at_least >= 1) { | 3686 Min(kMaxLookaheadForBoyerMoore, |
3638 BoyerMooreLookahead bm(eats_at_least, | 3687 EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start)); |
3639 RegExpMacroAssembler::kTableSize, | 3688 if (eats_at_least >= 1) { |
3640 compiler); | 3689 BoyerMooreLookahead* bm = |
3641 GuardedAlternative alt0 = alternatives_->at(0); | 3690 new BoyerMooreLookahead(eats_at_least, compiler); |
3642 alt0.node()->FillInBMInfo(0, &bm, not_at_start); | 3691 GuardedAlternative alt0 = alternatives_->at(0); |
3643 skip_was_emitted = bm.EmitSkipInstructions(macro_assembler); | 3692 alt0.node()->FillInBMInfo(0, bm, not_at_start); |
| 3693 skip_was_emitted = bm->EmitSkipInstructions(macro_assembler); |
| 3694 } |
| 3695 } else { |
| 3696 skip_was_emitted = lookahead->EmitSkipInstructions(macro_assembler); |
3644 } | 3697 } |
3645 } | 3698 } |
3646 } | 3699 } |
3647 } | 3700 } |
3648 | 3701 |
3649 if (eats_at_least == kEatsAtLeastNotYetInitialized) { | 3702 if (eats_at_least == kEatsAtLeastNotYetInitialized) { |
3650 // Save some time by looking at most one machine word ahead. | 3703 // Save some time by looking at most one machine word ahead. |
3651 eats_at_least = EatsAtLeast(compiler->ascii() ? 4 : 2, 0, not_at_start); | 3704 eats_at_least = EatsAtLeast(compiler->ascii() ? 4 : 2, 0, not_at_start); |
3652 } | 3705 } |
3653 int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least); | 3706 int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least); |
(...skipping 542 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4196 break; | 4249 break; |
4197 case AssertionNode::AT_BOUNDARY: | 4250 case AssertionNode::AT_BOUNDARY: |
4198 stream()->Add("label=\"\\b\", shape=septagon"); | 4251 stream()->Add("label=\"\\b\", shape=septagon"); |
4199 break; | 4252 break; |
4200 case AssertionNode::AT_NON_BOUNDARY: | 4253 case AssertionNode::AT_NON_BOUNDARY: |
4201 stream()->Add("label=\"\\B\", shape=septagon"); | 4254 stream()->Add("label=\"\\B\", shape=septagon"); |
4202 break; | 4255 break; |
4203 case AssertionNode::AFTER_NEWLINE: | 4256 case AssertionNode::AFTER_NEWLINE: |
4204 stream()->Add("label=\"(?<=\\n)\", shape=septagon"); | 4257 stream()->Add("label=\"(?<=\\n)\", shape=septagon"); |
4205 break; | 4258 break; |
4206 case AssertionNode::AFTER_WORD_CHARACTER: | |
4207 stream()->Add("label=\"(?<=\\w)\", shape=septagon"); | |
4208 break; | |
4209 case AssertionNode::AFTER_NONWORD_CHARACTER: | |
4210 stream()->Add("label=\"(?<=\\W)\", shape=septagon"); | |
4211 break; | |
4212 } | 4259 } |
4213 stream()->Add("];\n"); | 4260 stream()->Add("];\n"); |
4214 PrintAttributes(that); | 4261 PrintAttributes(that); |
4215 RegExpNode* successor = that->on_success(); | 4262 RegExpNode* successor = that->on_success(); |
4216 stream()->Add(" n%p -> n%p;\n", that, successor); | 4263 stream()->Add(" n%p -> n%p;\n", that, successor); |
4217 Visit(successor); | 4264 Visit(successor); |
4218 } | 4265 } |
4219 | 4266 |
4220 | 4267 |
4221 void DotPrinter::VisitAction(ActionNode* that) { | 4268 void DotPrinter::VisitAction(ActionNode* that) { |
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4306 printer.PrintNode(label, node); | 4353 printer.PrintNode(label, node); |
4307 } | 4354 } |
4308 | 4355 |
4309 | 4356 |
4310 #endif // DEBUG | 4357 #endif // DEBUG |
4311 | 4358 |
4312 | 4359 |
4313 // ------------------------------------------------------------------- | 4360 // ------------------------------------------------------------------- |
4314 // Tree to graph conversion | 4361 // Tree to graph conversion |
4315 | 4362 |
4316 static const uc16 kSpaceRanges[] = { 0x0009, 0x000D, 0x0020, 0x0020, 0x00A0, | |
4317 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A, 0x2028, 0x2029, | |
4318 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000, 0xFEFF, 0xFEFF }; | |
4319 static const int kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges); | |
4320 | |
4321 static const uc16 kWordRanges[] = { '0', '9', 'A', 'Z', '_', '_', 'a', 'z' }; | |
4322 static const int kWordRangeCount = ARRAY_SIZE(kWordRanges); | |
4323 | |
4324 static const uc16 kDigitRanges[] = { '0', '9' }; | |
4325 static const int kDigitRangeCount = ARRAY_SIZE(kDigitRanges); | |
4326 | |
4327 static const uc16 kLineTerminatorRanges[] = { 0x000A, 0x000A, 0x000D, 0x000D, | |
4328 0x2028, 0x2029 }; | |
4329 static const int kLineTerminatorRangeCount = ARRAY_SIZE(kLineTerminatorRanges); | |
4330 | |
4331 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, | 4363 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, |
4332 RegExpNode* on_success) { | 4364 RegExpNode* on_success) { |
4333 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1); | 4365 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1); |
4334 elms->Add(TextElement::Atom(this)); | 4366 elms->Add(TextElement::Atom(this)); |
4335 return new TextNode(elms, on_success); | 4367 return new TextNode(elms, on_success); |
4336 } | 4368 } |
4337 | 4369 |
4338 | 4370 |
4339 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, | 4371 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, |
4340 RegExpNode* on_success) { | 4372 RegExpNode* on_success) { |
4341 return new TextNode(elements(), on_success); | 4373 return new TextNode(elements(), on_success); |
4342 } | 4374 } |
4343 | 4375 |
| 4376 |
4344 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges, | 4377 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges, |
4345 const uc16* special_class, | 4378 const int* special_class, |
4346 int length) { | 4379 int length) { |
| 4380 length--; // Remove final 0x10000. |
| 4381 ASSERT(special_class[length] == 0x10000); |
4347 ASSERT(ranges->length() != 0); | 4382 ASSERT(ranges->length() != 0); |
4348 ASSERT(length != 0); | 4383 ASSERT(length != 0); |
4349 ASSERT(special_class[0] != 0); | 4384 ASSERT(special_class[0] != 0); |
4350 if (ranges->length() != (length >> 1) + 1) { | 4385 if (ranges->length() != (length >> 1) + 1) { |
4351 return false; | 4386 return false; |
4352 } | 4387 } |
4353 CharacterRange range = ranges->at(0); | 4388 CharacterRange range = ranges->at(0); |
4354 if (range.from() != 0) { | 4389 if (range.from() != 0) { |
4355 return false; | 4390 return false; |
4356 } | 4391 } |
4357 for (int i = 0; i < length; i += 2) { | 4392 for (int i = 0; i < length; i += 2) { |
4358 if (special_class[i] != (range.to() + 1)) { | 4393 if (special_class[i] != (range.to() + 1)) { |
4359 return false; | 4394 return false; |
4360 } | 4395 } |
4361 range = ranges->at((i >> 1) + 1); | 4396 range = ranges->at((i >> 1) + 1); |
4362 if (special_class[i+1] != range.from() - 1) { | 4397 if (special_class[i+1] != range.from()) { |
4363 return false; | 4398 return false; |
4364 } | 4399 } |
4365 } | 4400 } |
4366 if (range.to() != 0xffff) { | 4401 if (range.to() != 0xffff) { |
4367 return false; | 4402 return false; |
4368 } | 4403 } |
4369 return true; | 4404 return true; |
4370 } | 4405 } |
4371 | 4406 |
4372 | 4407 |
4373 static bool CompareRanges(ZoneList<CharacterRange>* ranges, | 4408 static bool CompareRanges(ZoneList<CharacterRange>* ranges, |
4374 const uc16* special_class, | 4409 const int* special_class, |
4375 int length) { | 4410 int length) { |
| 4411 length--; // Remove final 0x10000. |
| 4412 ASSERT(special_class[length] == 0x10000); |
4376 if (ranges->length() * 2 != length) { | 4413 if (ranges->length() * 2 != length) { |
4377 return false; | 4414 return false; |
4378 } | 4415 } |
4379 for (int i = 0; i < length; i += 2) { | 4416 for (int i = 0; i < length; i += 2) { |
4380 CharacterRange range = ranges->at(i >> 1); | 4417 CharacterRange range = ranges->at(i >> 1); |
4381 if (range.from() != special_class[i] || range.to() != special_class[i+1]) { | 4418 if (range.from() != special_class[i] || |
| 4419 range.to() != special_class[i + 1] - 1) { |
4382 return false; | 4420 return false; |
4383 } | 4421 } |
4384 } | 4422 } |
4385 return true; | 4423 return true; |
4386 } | 4424 } |
4387 | 4425 |
4388 | 4426 |
4389 bool RegExpCharacterClass::is_standard() { | 4427 bool RegExpCharacterClass::is_standard() { |
4390 // TODO(lrn): Remove need for this function, by not throwing away information | 4428 // TODO(lrn): Remove need for this function, by not throwing away information |
4391 // along the way. | 4429 // along the way. |
(...skipping 380 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4772 RegExpNode* on_success) { | 4810 RegExpNode* on_success) { |
4773 ZoneList<RegExpTree*>* children = nodes(); | 4811 ZoneList<RegExpTree*>* children = nodes(); |
4774 RegExpNode* current = on_success; | 4812 RegExpNode* current = on_success; |
4775 for (int i = children->length() - 1; i >= 0; i--) { | 4813 for (int i = children->length() - 1; i >= 0; i--) { |
4776 current = children->at(i)->ToNode(compiler, current); | 4814 current = children->at(i)->ToNode(compiler, current); |
4777 } | 4815 } |
4778 return current; | 4816 return current; |
4779 } | 4817 } |
4780 | 4818 |
4781 | 4819 |
4782 static void AddClass(const uc16* elmv, | 4820 static void AddClass(const int* elmv, |
4783 int elmc, | 4821 int elmc, |
4784 ZoneList<CharacterRange>* ranges) { | 4822 ZoneList<CharacterRange>* ranges) { |
| 4823 elmc--; |
| 4824 ASSERT(elmv[elmc] == 0x10000); |
4785 for (int i = 0; i < elmc; i += 2) { | 4825 for (int i = 0; i < elmc; i += 2) { |
4786 ASSERT(elmv[i] <= elmv[i + 1]); | 4826 ASSERT(elmv[i] < elmv[i + 1]); |
4787 ranges->Add(CharacterRange(elmv[i], elmv[i + 1])); | 4827 ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1)); |
4788 } | 4828 } |
4789 } | 4829 } |
4790 | 4830 |
4791 | 4831 |
4792 static void AddClassNegated(const uc16 *elmv, | 4832 static void AddClassNegated(const int *elmv, |
4793 int elmc, | 4833 int elmc, |
4794 ZoneList<CharacterRange>* ranges) { | 4834 ZoneList<CharacterRange>* ranges) { |
| 4835 elmc--; |
| 4836 ASSERT(elmv[elmc] == 0x10000); |
4795 ASSERT(elmv[0] != 0x0000); | 4837 ASSERT(elmv[0] != 0x0000); |
4796 ASSERT(elmv[elmc-1] != String::kMaxUtf16CodeUnit); | 4838 ASSERT(elmv[elmc-1] != String::kMaxUtf16CodeUnit); |
4797 uc16 last = 0x0000; | 4839 uc16 last = 0x0000; |
4798 for (int i = 0; i < elmc; i += 2) { | 4840 for (int i = 0; i < elmc; i += 2) { |
4799 ASSERT(last <= elmv[i] - 1); | 4841 ASSERT(last <= elmv[i] - 1); |
4800 ASSERT(elmv[i] <= elmv[i + 1]); | 4842 ASSERT(elmv[i] < elmv[i + 1]); |
4801 ranges->Add(CharacterRange(last, elmv[i] - 1)); | 4843 ranges->Add(CharacterRange(last, elmv[i] - 1)); |
4802 last = elmv[i + 1] + 1; | 4844 last = elmv[i + 1]; |
4803 } | 4845 } |
4804 ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit)); | 4846 ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit)); |
4805 } | 4847 } |
4806 | 4848 |
4807 | 4849 |
4808 void CharacterRange::AddClassEscape(uc16 type, | 4850 void CharacterRange::AddClassEscape(uc16 type, |
4809 ZoneList<CharacterRange>* ranges) { | 4851 ZoneList<CharacterRange>* ranges) { |
4810 switch (type) { | 4852 switch (type) { |
4811 case 's': | 4853 case 's': |
4812 AddClass(kSpaceRanges, kSpaceRangeCount, ranges); | 4854 AddClass(kSpaceRanges, kSpaceRangeCount, ranges); |
(...skipping 30 matching lines...) Expand all Loading... |
4843 AddClass(kLineTerminatorRanges, | 4885 AddClass(kLineTerminatorRanges, |
4844 kLineTerminatorRangeCount, | 4886 kLineTerminatorRangeCount, |
4845 ranges); | 4887 ranges); |
4846 break; | 4888 break; |
4847 default: | 4889 default: |
4848 UNREACHABLE(); | 4890 UNREACHABLE(); |
4849 } | 4891 } |
4850 } | 4892 } |
4851 | 4893 |
4852 | 4894 |
4853 Vector<const uc16> CharacterRange::GetWordBounds() { | 4895 Vector<const int> CharacterRange::GetWordBounds() { |
4854 return Vector<const uc16>(kWordRanges, kWordRangeCount); | 4896 return Vector<const int>(kWordRanges, kWordRangeCount - 1); |
4855 } | 4897 } |
4856 | 4898 |
4857 | 4899 |
4858 class CharacterRangeSplitter { | 4900 class CharacterRangeSplitter { |
4859 public: | 4901 public: |
4860 CharacterRangeSplitter(ZoneList<CharacterRange>** included, | 4902 CharacterRangeSplitter(ZoneList<CharacterRange>** included, |
4861 ZoneList<CharacterRange>** excluded) | 4903 ZoneList<CharacterRange>** excluded) |
4862 : included_(included), | 4904 : included_(included), |
4863 excluded_(excluded) { } | 4905 excluded_(excluded) { } |
4864 void Call(uc16 from, DispatchTable::Entry entry); | 4906 void Call(uc16 from, DispatchTable::Entry entry); |
(...skipping 11 matching lines...) Expand all Loading... |
4876 if (!entry.out_set()->Get(kInBase)) return; | 4918 if (!entry.out_set()->Get(kInBase)) return; |
4877 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay) | 4919 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay) |
4878 ? included_ | 4920 ? included_ |
4879 : excluded_; | 4921 : excluded_; |
4880 if (*target == NULL) *target = new ZoneList<CharacterRange>(2); | 4922 if (*target == NULL) *target = new ZoneList<CharacterRange>(2); |
4881 (*target)->Add(CharacterRange(entry.from(), entry.to())); | 4923 (*target)->Add(CharacterRange(entry.from(), entry.to())); |
4882 } | 4924 } |
4883 | 4925 |
4884 | 4926 |
4885 void CharacterRange::Split(ZoneList<CharacterRange>* base, | 4927 void CharacterRange::Split(ZoneList<CharacterRange>* base, |
4886 Vector<const uc16> overlay, | 4928 Vector<const int> overlay, |
4887 ZoneList<CharacterRange>** included, | 4929 ZoneList<CharacterRange>** included, |
4888 ZoneList<CharacterRange>** excluded) { | 4930 ZoneList<CharacterRange>** excluded) { |
4889 ASSERT_EQ(NULL, *included); | 4931 ASSERT_EQ(NULL, *included); |
4890 ASSERT_EQ(NULL, *excluded); | 4932 ASSERT_EQ(NULL, *excluded); |
4891 DispatchTable table; | 4933 DispatchTable table; |
4892 for (int i = 0; i < base->length(); i++) | 4934 for (int i = 0; i < base->length(); i++) |
4893 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase); | 4935 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase); |
4894 for (int i = 0; i < overlay.length(); i += 2) { | 4936 for (int i = 0; i < overlay.length(); i += 2) { |
4895 table.AddRange(CharacterRange(overlay[i], overlay[i+1]), | 4937 table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1), |
4896 CharacterRangeSplitter::kInOverlay); | 4938 CharacterRangeSplitter::kInOverlay); |
4897 } | 4939 } |
4898 CharacterRangeSplitter callback(included, excluded); | 4940 CharacterRangeSplitter callback(included, excluded); |
4899 table.ForEach(&callback); | 4941 table.ForEach(&callback); |
4900 } | 4942 } |
4901 | 4943 |
4902 | 4944 |
4903 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, | 4945 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, |
4904 bool is_ascii) { | 4946 bool is_ascii) { |
4905 Isolate* isolate = Isolate::Current(); | 4947 Isolate* isolate = Isolate::Current(); |
(...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4971 if (n <= 1) return true; | 5013 if (n <= 1) return true; |
4972 int max = ranges->at(0).to(); | 5014 int max = ranges->at(0).to(); |
4973 for (int i = 1; i < n; i++) { | 5015 for (int i = 1; i < n; i++) { |
4974 CharacterRange next_range = ranges->at(i); | 5016 CharacterRange next_range = ranges->at(i); |
4975 if (next_range.from() <= max + 1) return false; | 5017 if (next_range.from() <= max + 1) return false; |
4976 max = next_range.to(); | 5018 max = next_range.to(); |
4977 } | 5019 } |
4978 return true; | 5020 return true; |
4979 } | 5021 } |
4980 | 5022 |
4981 SetRelation CharacterRange::WordCharacterRelation( | |
4982 ZoneList<CharacterRange>* range) { | |
4983 ASSERT(IsCanonical(range)); | |
4984 int i = 0; // Word character range index. | |
4985 int j = 0; // Argument range index. | |
4986 ASSERT_NE(0, kWordRangeCount); | |
4987 SetRelation result; | |
4988 if (range->length() == 0) { | |
4989 result.SetElementsInSecondSet(); | |
4990 return result; | |
4991 } | |
4992 CharacterRange argument_range = range->at(0); | |
4993 CharacterRange word_range = CharacterRange(kWordRanges[0], kWordRanges[1]); | |
4994 while (i < kWordRangeCount && j < range->length()) { | |
4995 // Check the two ranges for the five cases: | |
4996 // - no overlap. | |
4997 // - partial overlap (there are elements in both ranges that isn't | |
4998 // in the other, and there are also elements that are in both). | |
4999 // - argument range entirely inside word range. | |
5000 // - word range entirely inside argument range. | |
5001 // - ranges are completely equal. | |
5002 | |
5003 // First check for no overlap. The earlier range is not in the other set. | |
5004 if (argument_range.from() > word_range.to()) { | |
5005 // Ranges are disjoint. The earlier word range contains elements that | |
5006 // cannot be in the argument set. | |
5007 result.SetElementsInSecondSet(); | |
5008 } else if (word_range.from() > argument_range.to()) { | |
5009 // Ranges are disjoint. The earlier argument range contains elements that | |
5010 // cannot be in the word set. | |
5011 result.SetElementsInFirstSet(); | |
5012 } else if (word_range.from() <= argument_range.from() && | |
5013 word_range.to() >= argument_range.from()) { | |
5014 result.SetElementsInBothSets(); | |
5015 // argument range completely inside word range. | |
5016 if (word_range.from() < argument_range.from() || | |
5017 word_range.to() > argument_range.from()) { | |
5018 result.SetElementsInSecondSet(); | |
5019 } | |
5020 } else if (word_range.from() >= argument_range.from() && | |
5021 word_range.to() <= argument_range.from()) { | |
5022 result.SetElementsInBothSets(); | |
5023 result.SetElementsInFirstSet(); | |
5024 } else { | |
5025 // There is overlap, and neither is a subrange of the other | |
5026 result.SetElementsInFirstSet(); | |
5027 result.SetElementsInSecondSet(); | |
5028 result.SetElementsInBothSets(); | |
5029 } | |
5030 if (result.NonTrivialIntersection()) { | |
5031 // The result is as (im)precise as we can possibly make it. | |
5032 return result; | |
5033 } | |
5034 // Progress the range(s) with minimal to-character. | |
5035 uc16 word_to = word_range.to(); | |
5036 uc16 argument_to = argument_range.to(); | |
5037 if (argument_to <= word_to) { | |
5038 j++; | |
5039 if (j < range->length()) { | |
5040 argument_range = range->at(j); | |
5041 } | |
5042 } | |
5043 if (word_to <= argument_to) { | |
5044 i += 2; | |
5045 if (i < kWordRangeCount) { | |
5046 word_range = CharacterRange(kWordRanges[i], kWordRanges[i + 1]); | |
5047 } | |
5048 } | |
5049 } | |
5050 // Check if anything wasn't compared in the loop. | |
5051 if (i < kWordRangeCount) { | |
5052 // word range contains something not in argument range. | |
5053 result.SetElementsInSecondSet(); | |
5054 } else if (j < range->length()) { | |
5055 // Argument range contains something not in word range. | |
5056 result.SetElementsInFirstSet(); | |
5057 } | |
5058 | |
5059 return result; | |
5060 } | |
5061 | |
5062 | 5023 |
5063 ZoneList<CharacterRange>* CharacterSet::ranges() { | 5024 ZoneList<CharacterRange>* CharacterSet::ranges() { |
5064 if (ranges_ == NULL) { | 5025 if (ranges_ == NULL) { |
5065 ranges_ = new ZoneList<CharacterRange>(2); | 5026 ranges_ = new ZoneList<CharacterRange>(2); |
5066 CharacterRange::AddClassEscape(standard_set_type_, ranges_); | 5027 CharacterRange::AddClassEscape(standard_set_type_, ranges_); |
5067 } | 5028 } |
5068 return ranges_; | 5029 return ranges_; |
5069 } | 5030 } |
5070 | 5031 |
5071 | 5032 |
(...skipping 112 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5184 num_canonical, | 5145 num_canonical, |
5185 character_ranges->at(read)); | 5146 character_ranges->at(read)); |
5186 read++; | 5147 read++; |
5187 } while (read < n); | 5148 } while (read < n); |
5188 character_ranges->Rewind(num_canonical); | 5149 character_ranges->Rewind(num_canonical); |
5189 | 5150 |
5190 ASSERT(CharacterRange::IsCanonical(character_ranges)); | 5151 ASSERT(CharacterRange::IsCanonical(character_ranges)); |
5191 } | 5152 } |
5192 | 5153 |
5193 | 5154 |
5194 // Utility function for CharacterRange::Merge. Adds a range at the end of | |
5195 // a canonicalized range list, if necessary merging the range with the last | |
5196 // range of the list. | |
5197 static void AddRangeToSet(ZoneList<CharacterRange>* set, CharacterRange range) { | |
5198 if (set == NULL) return; | |
5199 ASSERT(set->length() == 0 || set->at(set->length() - 1).to() < range.from()); | |
5200 int n = set->length(); | |
5201 if (n > 0) { | |
5202 CharacterRange lastRange = set->at(n - 1); | |
5203 if (lastRange.to() == range.from() - 1) { | |
5204 set->at(n - 1) = CharacterRange(lastRange.from(), range.to()); | |
5205 return; | |
5206 } | |
5207 } | |
5208 set->Add(range); | |
5209 } | |
5210 | |
5211 | |
5212 static void AddRangeToSelectedSet(int selector, | |
5213 ZoneList<CharacterRange>* first_set, | |
5214 ZoneList<CharacterRange>* second_set, | |
5215 ZoneList<CharacterRange>* intersection_set, | |
5216 CharacterRange range) { | |
5217 switch (selector) { | |
5218 case kInsideFirst: | |
5219 AddRangeToSet(first_set, range); | |
5220 break; | |
5221 case kInsideSecond: | |
5222 AddRangeToSet(second_set, range); | |
5223 break; | |
5224 case kInsideBoth: | |
5225 AddRangeToSet(intersection_set, range); | |
5226 break; | |
5227 } | |
5228 } | |
5229 | |
5230 | |
5231 | |
5232 void CharacterRange::Merge(ZoneList<CharacterRange>* first_set, | |
5233 ZoneList<CharacterRange>* second_set, | |
5234 ZoneList<CharacterRange>* first_set_only_out, | |
5235 ZoneList<CharacterRange>* second_set_only_out, | |
5236 ZoneList<CharacterRange>* both_sets_out) { | |
5237 // Inputs are canonicalized. | |
5238 ASSERT(CharacterRange::IsCanonical(first_set)); | |
5239 ASSERT(CharacterRange::IsCanonical(second_set)); | |
5240 // Outputs are empty, if applicable. | |
5241 ASSERT(first_set_only_out == NULL || first_set_only_out->length() == 0); | |
5242 ASSERT(second_set_only_out == NULL || second_set_only_out->length() == 0); | |
5243 ASSERT(both_sets_out == NULL || both_sets_out->length() == 0); | |
5244 | |
5245 // Merge sets by iterating through the lists in order of lowest "from" value, | |
5246 // and putting intervals into one of three sets. | |
5247 | |
5248 if (first_set->length() == 0) { | |
5249 second_set_only_out->AddAll(*second_set); | |
5250 return; | |
5251 } | |
5252 if (second_set->length() == 0) { | |
5253 first_set_only_out->AddAll(*first_set); | |
5254 return; | |
5255 } | |
5256 // Indices into input lists. | |
5257 int i1 = 0; | |
5258 int i2 = 0; | |
5259 // Cache length of input lists. | |
5260 int n1 = first_set->length(); | |
5261 int n2 = second_set->length(); | |
5262 // Current range. May be invalid if state is kInsideNone. | |
5263 int from = 0; | |
5264 int to = -1; | |
5265 // Where current range comes from. | |
5266 int state = kInsideNone; | |
5267 | |
5268 while (i1 < n1 || i2 < n2) { | |
5269 CharacterRange next_range; | |
5270 int range_source; | |
5271 if (i2 == n2 || | |
5272 (i1 < n1 && first_set->at(i1).from() < second_set->at(i2).from())) { | |
5273 // Next smallest element is in first set. | |
5274 next_range = first_set->at(i1++); | |
5275 range_source = kInsideFirst; | |
5276 } else { | |
5277 // Next smallest element is in second set. | |
5278 next_range = second_set->at(i2++); | |
5279 range_source = kInsideSecond; | |
5280 } | |
5281 if (to < next_range.from()) { | |
5282 // Ranges disjoint: |current| |next| | |
5283 AddRangeToSelectedSet(state, | |
5284 first_set_only_out, | |
5285 second_set_only_out, | |
5286 both_sets_out, | |
5287 CharacterRange(from, to)); | |
5288 from = next_range.from(); | |
5289 to = next_range.to(); | |
5290 state = range_source; | |
5291 } else { | |
5292 if (from < next_range.from()) { | |
5293 AddRangeToSelectedSet(state, | |
5294 first_set_only_out, | |
5295 second_set_only_out, | |
5296 both_sets_out, | |
5297 CharacterRange(from, next_range.from()-1)); | |
5298 } | |
5299 if (to < next_range.to()) { | |
5300 // Ranges overlap: |current| | |
5301 // |next| | |
5302 AddRangeToSelectedSet(state | range_source, | |
5303 first_set_only_out, | |
5304 second_set_only_out, | |
5305 both_sets_out, | |
5306 CharacterRange(next_range.from(), to)); | |
5307 from = to + 1; | |
5308 to = next_range.to(); | |
5309 state = range_source; | |
5310 } else { | |
5311 // Range included: |current| , possibly ending at same character. | |
5312 // |next| | |
5313 AddRangeToSelectedSet( | |
5314 state | range_source, | |
5315 first_set_only_out, | |
5316 second_set_only_out, | |
5317 both_sets_out, | |
5318 CharacterRange(next_range.from(), next_range.to())); | |
5319 from = next_range.to() + 1; | |
5320 // If ranges end at same character, both ranges are consumed completely. | |
5321 if (next_range.to() == to) state = kInsideNone; | |
5322 } | |
5323 } | |
5324 } | |
5325 AddRangeToSelectedSet(state, | |
5326 first_set_only_out, | |
5327 second_set_only_out, | |
5328 both_sets_out, | |
5329 CharacterRange(from, to)); | |
5330 } | |
5331 | |
5332 | |
5333 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges, | 5155 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges, |
5334 ZoneList<CharacterRange>* negated_ranges) { | 5156 ZoneList<CharacterRange>* negated_ranges) { |
5335 ASSERT(CharacterRange::IsCanonical(ranges)); | 5157 ASSERT(CharacterRange::IsCanonical(ranges)); |
5336 ASSERT_EQ(0, negated_ranges->length()); | 5158 ASSERT_EQ(0, negated_ranges->length()); |
5337 int range_count = ranges->length(); | 5159 int range_count = ranges->length(); |
5338 uc16 from = 0; | 5160 uc16 from = 0; |
5339 int i = 0; | 5161 int i = 0; |
5340 if (range_count > 0 && ranges->at(0).from() == 0) { | 5162 if (range_count > 0 && ranges->at(0).from() == 0) { |
5341 from = ranges->at(0).to(); | 5163 from = ranges->at(0).to(); |
5342 i = 1; | 5164 i = 1; |
(...skipping 292 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5635 } | 5457 } |
5636 | 5458 |
5637 | 5459 |
5638 void Analysis::VisitBackReference(BackReferenceNode* that) { | 5460 void Analysis::VisitBackReference(BackReferenceNode* that) { |
5639 EnsureAnalyzed(that->on_success()); | 5461 EnsureAnalyzed(that->on_success()); |
5640 } | 5462 } |
5641 | 5463 |
5642 | 5464 |
5643 void Analysis::VisitAssertion(AssertionNode* that) { | 5465 void Analysis::VisitAssertion(AssertionNode* that) { |
5644 EnsureAnalyzed(that->on_success()); | 5466 EnsureAnalyzed(that->on_success()); |
5645 AssertionNode::AssertionNodeType type = that->type(); | |
5646 if (type == AssertionNode::AT_BOUNDARY || | |
5647 type == AssertionNode::AT_NON_BOUNDARY) { | |
5648 // Check if the following character is known to be a word character | |
5649 // or known to not be a word character. | |
5650 ZoneList<CharacterRange>* following_chars = that->FirstCharacterSet(); | |
5651 | |
5652 CharacterRange::Canonicalize(following_chars); | |
5653 | |
5654 SetRelation word_relation = | |
5655 CharacterRange::WordCharacterRelation(following_chars); | |
5656 if (word_relation.Disjoint()) { | |
5657 // Includes the case where following_chars is empty (e.g., end-of-input). | |
5658 // Following character is definitely *not* a word character. | |
5659 type = (type == AssertionNode::AT_BOUNDARY) ? | |
5660 AssertionNode::AFTER_WORD_CHARACTER : | |
5661 AssertionNode::AFTER_NONWORD_CHARACTER; | |
5662 that->set_type(type); | |
5663 } else if (word_relation.ContainedIn()) { | |
5664 // Following character is definitely a word character. | |
5665 type = (type == AssertionNode::AT_BOUNDARY) ? | |
5666 AssertionNode::AFTER_NONWORD_CHARACTER : | |
5667 AssertionNode::AFTER_WORD_CHARACTER; | |
5668 that->set_type(type); | |
5669 } | |
5670 } | |
5671 } | 5467 } |
5672 | 5468 |
5673 | 5469 |
5674 ZoneList<CharacterRange>* RegExpNode::FirstCharacterSet() { | 5470 void BackReferenceNode::FillInBMInfo( |
5675 if (first_character_set_ == NULL) { | 5471 int offset, BoyerMooreLookahead* bm, bool not_at_start) { |
5676 if (ComputeFirstCharacterSet(kFirstCharBudget) < 0) { | 5472 // Working out the set of characters that a backreference can match is too |
5677 // If we can't find an exact solution within the budget, we | 5473 // hard, so we just say that any character can match. |
5678 // set the value to the set of every character, i.e., all characters | 5474 bm->SetRest(offset); |
5679 // are possible. | 5475 SaveBMInfo(bm, not_at_start, offset); |
5680 ZoneList<CharacterRange>* all_set = new ZoneList<CharacterRange>(1); | |
5681 all_set->Add(CharacterRange::Everything()); | |
5682 first_character_set_ = all_set; | |
5683 } | |
5684 } | |
5685 return first_character_set_; | |
5686 } | 5476 } |
5687 | 5477 |
5688 | 5478 |
5689 int RegExpNode::ComputeFirstCharacterSet(int budget) { | 5479 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize == |
5690 // Default behavior is to not be able to determine the first character. | 5480 RegExpMacroAssembler::kTableSize); |
5691 return kComputeFirstCharacterSetFail; | |
5692 } | |
5693 | |
5694 | |
5695 int LoopChoiceNode::ComputeFirstCharacterSet(int budget) { | |
5696 budget--; | |
5697 if (budget >= 0) { | |
5698 // Find loop min-iteration. It's the value of the guarded choice node | |
5699 // with a GEQ guard, if any. | |
5700 int min_repetition = 0; | |
5701 | |
5702 for (int i = 0; i <= 1; i++) { | |
5703 GuardedAlternative alternative = alternatives()->at(i); | |
5704 ZoneList<Guard*>* guards = alternative.guards(); | |
5705 if (guards != NULL && guards->length() > 0) { | |
5706 Guard* guard = guards->at(0); | |
5707 if (guard->op() == Guard::GEQ) { | |
5708 min_repetition = guard->value(); | |
5709 break; | |
5710 } | |
5711 } | |
5712 } | |
5713 | |
5714 budget = loop_node()->ComputeFirstCharacterSet(budget); | |
5715 if (budget >= 0) { | |
5716 ZoneList<CharacterRange>* character_set = | |
5717 loop_node()->first_character_set(); | |
5718 if (body_can_be_zero_length() || min_repetition == 0) { | |
5719 budget = continue_node()->ComputeFirstCharacterSet(budget); | |
5720 if (budget < 0) return budget; | |
5721 ZoneList<CharacterRange>* body_set = | |
5722 continue_node()->first_character_set(); | |
5723 ZoneList<CharacterRange>* union_set = | |
5724 new ZoneList<CharacterRange>(Max(character_set->length(), | |
5725 body_set->length())); | |
5726 CharacterRange::Merge(character_set, | |
5727 body_set, | |
5728 union_set, | |
5729 union_set, | |
5730 union_set); | |
5731 character_set = union_set; | |
5732 } | |
5733 set_first_character_set(character_set); | |
5734 } | |
5735 } | |
5736 return budget; | |
5737 } | |
5738 | |
5739 | |
5740 int NegativeLookaheadChoiceNode::ComputeFirstCharacterSet(int budget) { | |
5741 budget--; | |
5742 if (budget >= 0) { | |
5743 GuardedAlternative successor = this->alternatives()->at(1); | |
5744 RegExpNode* successor_node = successor.node(); | |
5745 budget = successor_node->ComputeFirstCharacterSet(budget); | |
5746 if (budget >= 0) { | |
5747 set_first_character_set(successor_node->first_character_set()); | |
5748 } | |
5749 } | |
5750 return budget; | |
5751 } | |
5752 | |
5753 | |
5754 // The first character set of an EndNode is unknowable. Just use the | |
5755 // default implementation that fails and returns all characters as possible. | |
5756 | |
5757 | |
5758 int AssertionNode::ComputeFirstCharacterSet(int budget) { | |
5759 budget -= 1; | |
5760 if (budget >= 0) { | |
5761 switch (type_) { | |
5762 case AT_END: { | |
5763 set_first_character_set(new ZoneList<CharacterRange>(0)); | |
5764 break; | |
5765 } | |
5766 case AT_START: | |
5767 case AT_BOUNDARY: | |
5768 case AT_NON_BOUNDARY: | |
5769 case AFTER_NEWLINE: | |
5770 case AFTER_NONWORD_CHARACTER: | |
5771 case AFTER_WORD_CHARACTER: { | |
5772 ASSERT_NOT_NULL(on_success()); | |
5773 budget = on_success()->ComputeFirstCharacterSet(budget); | |
5774 if (budget >= 0) { | |
5775 set_first_character_set(on_success()->first_character_set()); | |
5776 } | |
5777 break; | |
5778 } | |
5779 } | |
5780 } | |
5781 return budget; | |
5782 } | |
5783 | |
5784 | |
5785 int ActionNode::ComputeFirstCharacterSet(int budget) { | |
5786 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return kComputeFirstCharacterSetFail; | |
5787 budget--; | |
5788 if (budget >= 0) { | |
5789 ASSERT_NOT_NULL(on_success()); | |
5790 budget = on_success()->ComputeFirstCharacterSet(budget); | |
5791 if (budget >= 0) { | |
5792 set_first_character_set(on_success()->first_character_set()); | |
5793 } | |
5794 } | |
5795 return budget; | |
5796 } | |
5797 | |
5798 | |
5799 int BackReferenceNode::ComputeFirstCharacterSet(int budget) { | |
5800 // We don't know anything about the first character of a backreference | |
5801 // at this point. | |
5802 // The potential first characters are the first characters of the capture, | |
5803 // and the first characters of the on_success node, depending on whether the | |
5804 // capture can be empty and whether it is known to be participating or known | |
5805 // not to be. | |
5806 return kComputeFirstCharacterSetFail; | |
5807 } | |
5808 | 5481 |
5809 | 5482 |
5810 void ChoiceNode::FillInBMInfo( | 5483 void ChoiceNode::FillInBMInfo( |
5811 int offset, BoyerMooreLookahead* bm, bool not_at_start) { | 5484 int offset, BoyerMooreLookahead* bm, bool not_at_start) { |
5812 ZoneList<GuardedAlternative>* alts = alternatives(); | 5485 ZoneList<GuardedAlternative>* alts = alternatives(); |
5813 for (int i = 0; i < alts->length(); i++) { | 5486 for (int i = 0; i < alts->length(); i++) { |
5814 GuardedAlternative& alt = alts->at(i); | 5487 GuardedAlternative& alt = alts->at(i); |
5815 if (alt.guards() != NULL && alt.guards()->length() != 0) { | 5488 if (alt.guards() != NULL && alt.guards()->length() != 0) { |
5816 bm->SetRest(offset); // Give up trying to fill in info. | 5489 bm->SetRest(offset); // Give up trying to fill in info. |
| 5490 SaveBMInfo(bm, not_at_start, offset); |
5817 return; | 5491 return; |
5818 } | 5492 } |
5819 alt.node()->FillInBMInfo(offset, bm, not_at_start); | 5493 alt.node()->FillInBMInfo(offset, bm, not_at_start); |
5820 } | 5494 } |
| 5495 SaveBMInfo(bm, not_at_start, offset); |
5821 } | 5496 } |
5822 | 5497 |
5823 | 5498 |
5824 void TextNode::FillInBMInfo( | 5499 void TextNode::FillInBMInfo( |
5825 int offset, BoyerMooreLookahead* bm, bool not_at_start) { | 5500 int initial_offset, BoyerMooreLookahead* bm, bool not_at_start) { |
5826 if (offset >= bm->length()) return; | 5501 if (initial_offset >= bm->length()) return; |
| 5502 int offset = initial_offset; |
5827 int max_char = bm->max_char(); | 5503 int max_char = bm->max_char(); |
5828 for (int i = 0; i < elements()->length(); i++) { | 5504 for (int i = 0; i < elements()->length(); i++) { |
5829 if (offset >= bm->length()) return; | 5505 if (offset >= bm->length()) { |
| 5506 if (initial_offset == 0) set_bm_info(not_at_start, bm); |
| 5507 return; |
| 5508 } |
5830 TextElement text = elements()->at(i); | 5509 TextElement text = elements()->at(i); |
5831 if (text.type == TextElement::ATOM) { | 5510 if (text.type == TextElement::ATOM) { |
5832 RegExpAtom* atom = text.data.u_atom; | 5511 RegExpAtom* atom = text.data.u_atom; |
5833 for (int j = 0; j < atom->length(); j++, offset++) { | 5512 for (int j = 0; j < atom->length(); j++, offset++) { |
5834 if (offset >= bm->length()) return; | 5513 if (offset >= bm->length()) { |
| 5514 if (initial_offset == 0) set_bm_info(not_at_start, bm); |
| 5515 return; |
| 5516 } |
5835 uc16 character = atom->data()[j]; | 5517 uc16 character = atom->data()[j]; |
5836 if (bm->compiler()->ignore_case()) { | 5518 if (bm->compiler()->ignore_case()) { |
5837 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 5519 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
5838 int length = GetCaseIndependentLetters( | 5520 int length = GetCaseIndependentLetters( |
5839 ISOLATE, | 5521 ISOLATE, |
5840 character, | 5522 character, |
5841 bm->max_char() == String::kMaxAsciiCharCode, | 5523 bm->max_char() == String::kMaxAsciiCharCode, |
5842 chars); | 5524 chars); |
5843 for (int j = 0; j < length; j++) { | 5525 for (int j = 0; j < length; j++) { |
5844 bm->Set(offset, chars[j]); | 5526 bm->Set(offset, chars[j]); |
5845 } | 5527 } |
5846 } else { | 5528 } else { |
5847 if (character <= max_char) bm->Set(offset, character); | 5529 if (character <= max_char) bm->Set(offset, character); |
5848 } | 5530 } |
5849 } | 5531 } |
5850 } else { | 5532 } else { |
5851 ASSERT(text.type == TextElement::CHAR_CLASS); | 5533 ASSERT(text.type == TextElement::CHAR_CLASS); |
5852 RegExpCharacterClass* char_class = text.data.u_char_class; | 5534 RegExpCharacterClass* char_class = text.data.u_char_class; |
5853 ZoneList<CharacterRange>* ranges = char_class->ranges(); | 5535 ZoneList<CharacterRange>* ranges = char_class->ranges(); |
5854 if (char_class->is_negated()) { | 5536 if (char_class->is_negated()) { |
5855 bm->SetAll(offset); | 5537 bm->SetAll(offset); |
5856 } else { | 5538 } else { |
5857 for (int k = 0; k < ranges->length(); k++) { | 5539 for (int k = 0; k < ranges->length(); k++) { |
5858 CharacterRange& range = ranges->at(k); | 5540 CharacterRange& range = ranges->at(k); |
5859 if (range.from() > max_char) continue; | 5541 if (range.from() > max_char) continue; |
5860 int to = Min(max_char, static_cast<int>(range.to())); | 5542 int to = Min(max_char, static_cast<int>(range.to())); |
5861 if (to - range.from() >= BoyerMooreLookahead::kTooManyCharacters) { | 5543 bm->SetInterval(offset, Interval(range.from(), to)); |
5862 bm->SetAll(offset); | |
5863 break; | |
5864 } | |
5865 for (int m = range.from(); m <= to; m++) { | |
5866 bm->Set(offset, m); | |
5867 } | |
5868 } | 5544 } |
5869 } | 5545 } |
5870 offset++; | 5546 offset++; |
5871 } | 5547 } |
5872 } | 5548 } |
5873 if (offset >= bm->length()) return; | 5549 if (offset >= bm->length()) { |
| 5550 if (initial_offset == 0) set_bm_info(not_at_start, bm); |
| 5551 return; |
| 5552 } |
5874 on_success()->FillInBMInfo(offset, | 5553 on_success()->FillInBMInfo(offset, |
5875 bm, | 5554 bm, |
5876 true); // Not at start after a text node. | 5555 true); // Not at start after a text node. |
| 5556 if (initial_offset == 0) set_bm_info(not_at_start, bm); |
5877 } | 5557 } |
5878 | 5558 |
5879 | 5559 |
5880 int TextNode::ComputeFirstCharacterSet(int budget) { | |
5881 budget--; | |
5882 if (budget >= 0) { | |
5883 ASSERT_NE(0, elements()->length()); | |
5884 TextElement text = elements()->at(0); | |
5885 if (text.type == TextElement::ATOM) { | |
5886 RegExpAtom* atom = text.data.u_atom; | |
5887 ASSERT_NE(0, atom->length()); | |
5888 uc16 first_char = atom->data()[0]; | |
5889 ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1); | |
5890 range->Add(CharacterRange(first_char, first_char)); | |
5891 set_first_character_set(range); | |
5892 } else { | |
5893 ASSERT(text.type == TextElement::CHAR_CLASS); | |
5894 RegExpCharacterClass* char_class = text.data.u_char_class; | |
5895 ZoneList<CharacterRange>* ranges = char_class->ranges(); | |
5896 // TODO(lrn): Canonicalize ranges when they are created | |
5897 // instead of waiting until now. | |
5898 CharacterRange::Canonicalize(ranges); | |
5899 if (char_class->is_negated()) { | |
5900 int length = ranges->length(); | |
5901 int new_length = length + 1; | |
5902 if (length > 0) { | |
5903 if (ranges->at(0).from() == 0) new_length--; | |
5904 if (ranges->at(length - 1).to() == String::kMaxUtf16CodeUnit) { | |
5905 new_length--; | |
5906 } | |
5907 } | |
5908 ZoneList<CharacterRange>* negated_ranges = | |
5909 new ZoneList<CharacterRange>(new_length); | |
5910 CharacterRange::Negate(ranges, negated_ranges); | |
5911 set_first_character_set(negated_ranges); | |
5912 } else { | |
5913 set_first_character_set(ranges); | |
5914 } | |
5915 } | |
5916 } | |
5917 return budget; | |
5918 } | |
5919 | |
5920 | |
5921 | |
5922 // ------------------------------------------------------------------- | 5560 // ------------------------------------------------------------------- |
5923 // Dispatch table construction | 5561 // Dispatch table construction |
5924 | 5562 |
5925 | 5563 |
5926 void DispatchTableConstructor::VisitEnd(EndNode* that) { | 5564 void DispatchTableConstructor::VisitEnd(EndNode* that) { |
5927 AddRange(CharacterRange::Everything()); | 5565 AddRange(CharacterRange::Everything()); |
5928 } | 5566 } |
5929 | 5567 |
5930 | 5568 |
5931 void DispatchTableConstructor::BuildTable(ChoiceNode* node) { | 5569 void DispatchTableConstructor::BuildTable(ChoiceNode* node) { |
(...skipping 201 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
6133 } | 5771 } |
6134 | 5772 |
6135 return compiler.Assemble(¯o_assembler, | 5773 return compiler.Assemble(¯o_assembler, |
6136 node, | 5774 node, |
6137 data->capture_count, | 5775 data->capture_count, |
6138 pattern); | 5776 pattern); |
6139 } | 5777 } |
6140 | 5778 |
6141 | 5779 |
6142 }} // namespace v8::internal | 5780 }} // namespace v8::internal |
OLD | NEW |