| OLD | NEW |
| 1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. |
| 2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
| 3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
| 4 // met: | 4 // met: |
| 5 // | 5 // |
| 6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
| 7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
| 8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
| 9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
| 10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
| (...skipping 90 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 101 Factory* factory = isolate->factory(); | 101 Factory* factory = isolate->factory(); |
| 102 Handle<FixedArray> elements = factory->NewFixedArray(2); | 102 Handle<FixedArray> elements = factory->NewFixedArray(2); |
| 103 elements->set(0, *pattern); | 103 elements->set(0, *pattern); |
| 104 elements->set(1, *error_text); | 104 elements->set(1, *error_text); |
| 105 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); | 105 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); |
| 106 Handle<Object> regexp_err = factory->NewSyntaxError(message, array); | 106 Handle<Object> regexp_err = factory->NewSyntaxError(message, array); |
| 107 isolate->Throw(*regexp_err); | 107 isolate->Throw(*regexp_err); |
| 108 } | 108 } |
| 109 | 109 |
| 110 | 110 |
| 111 ContainedInLattice AddRange(ContainedInLattice containment, |
| 112 const int* ranges, |
| 113 int ranges_length, |
| 114 Interval new_range) { |
| 115 ASSERT((ranges_length & 1) == 1); |
| 116 ASSERT(ranges[ranges_length - 1] == String::kMaxUtf16CodeUnit + 1); |
| 117 if (containment == kLatticeUnknown) return containment; |
| 118 bool inside = false; |
| 119 int last = 0; |
| 120 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) { |
| 121 // Consider the range from last to ranges[i]. |
| 122 // We haven't got to the new range yet. |
| 123 if (ranges[i] <= new_range.from()) continue; |
| 124 // New range is wholly inside last-ranges[i]. Note that new_range.to() is |
| 125 // inclusive, but the values in ranges are not. |
| 126 if (last <= new_range.from() && new_range.to() < ranges[i]) { |
| 127 return Combine(containment, inside ? kLatticeIn : kLatticeOut); |
| 128 } |
| 129 return kLatticeUnknown; |
| 130 } |
| 131 return containment; |
| 132 } |
| 133 |
| 134 |
| 111 // More makes code generation slower, less makes V8 benchmark score lower. | 135 // More makes code generation slower, less makes V8 benchmark score lower. |
| 112 const int kMaxLookaheadForBoyerMoore = 8; | 136 const int kMaxLookaheadForBoyerMoore = 8; |
| 113 // In a 3-character pattern you can maximally step forwards 3 characters | 137 // In a 3-character pattern you can maximally step forwards 3 characters |
| 114 // at a time, which is not always enough to pay for the extra logic. | 138 // at a time, which is not always enough to pay for the extra logic. |
| 115 const int kPatternTooShortForBoyerMoore = 2; | 139 const int kPatternTooShortForBoyerMoore = 2; |
| 116 | 140 |
| 117 | 141 |
| 118 // Identifies the sort of regexps where the regexp engine is faster | 142 // Identifies the sort of regexps where the regexp engine is faster |
| 119 // than the code used for atom matches. | 143 // than the code used for atom matches. |
| 120 static bool HasFewDifferentCharacters(Handle<String> pattern) { | 144 static bool HasFewDifferentCharacters(Handle<String> pattern) { |
| (...skipping 2029 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2150 | 2174 |
| 2151 | 2175 |
| 2152 void ActionNode::FillInBMInfo(int offset, | 2176 void ActionNode::FillInBMInfo(int offset, |
| 2153 BoyerMooreLookahead* bm, | 2177 BoyerMooreLookahead* bm, |
| 2154 bool not_at_start) { | 2178 bool not_at_start) { |
| 2155 if (type_ == BEGIN_SUBMATCH) { | 2179 if (type_ == BEGIN_SUBMATCH) { |
| 2156 bm->SetRest(offset); | 2180 bm->SetRest(offset); |
| 2157 } else if (type_ != POSITIVE_SUBMATCH_SUCCESS) { | 2181 } else if (type_ != POSITIVE_SUBMATCH_SUCCESS) { |
| 2158 on_success()->FillInBMInfo(offset, bm, not_at_start); | 2182 on_success()->FillInBMInfo(offset, bm, not_at_start); |
| 2159 } | 2183 } |
| 2184 SaveBMInfo(bm, not_at_start, offset); |
| 2160 } | 2185 } |
| 2161 | 2186 |
| 2162 | 2187 |
| 2163 int AssertionNode::EatsAtLeast(int still_to_find, | 2188 int AssertionNode::EatsAtLeast(int still_to_find, |
| 2164 int recursion_depth, | 2189 int recursion_depth, |
| 2165 bool not_at_start) { | 2190 bool not_at_start) { |
| 2166 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; | 2191 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; |
| 2167 // If we know we are not at the start and we are asked "how many characters | 2192 // If we know we are not at the start and we are asked "how many characters |
| 2168 // will you match if you succeed?" then we can answer anything since false | 2193 // will you match if you succeed?" then we can answer anything since false |
| 2169 // implies false. So lets just return the max answer (still_to_find) since | 2194 // implies false. So lets just return the max answer (still_to_find) since |
| 2170 // that won't prevent us from preloading a lot of characters for the other | 2195 // that won't prevent us from preloading a lot of characters for the other |
| 2171 // branches in the node graph. | 2196 // branches in the node graph. |
| 2172 if (type() == AT_START && not_at_start) return still_to_find; | 2197 if (type() == AT_START && not_at_start) return still_to_find; |
| 2173 return on_success()->EatsAtLeast(still_to_find, | 2198 return on_success()->EatsAtLeast(still_to_find, |
| 2174 recursion_depth + 1, | 2199 recursion_depth + 1, |
| 2175 not_at_start); | 2200 not_at_start); |
| 2176 } | 2201 } |
| 2177 | 2202 |
| 2178 | 2203 |
| 2179 void AssertionNode::FillInBMInfo( | 2204 void AssertionNode::FillInBMInfo( |
| 2180 int offset, BoyerMooreLookahead* bm, bool not_at_start) { | 2205 int offset, BoyerMooreLookahead* bm, bool not_at_start) { |
| 2181 // Match the behaviour of EatsAtLeast on this node. | 2206 // Match the behaviour of EatsAtLeast on this node. |
| 2182 if (type() == AT_START && not_at_start) return; | 2207 if (type() == AT_START && not_at_start) return; |
| 2183 on_success()->FillInBMInfo(offset, bm, not_at_start); | 2208 on_success()->FillInBMInfo(offset, bm, not_at_start); |
| 2209 SaveBMInfo(bm, not_at_start, offset); |
| 2184 } | 2210 } |
| 2185 | 2211 |
| 2186 | 2212 |
| 2187 int BackReferenceNode::EatsAtLeast(int still_to_find, | 2213 int BackReferenceNode::EatsAtLeast(int still_to_find, |
| 2188 int recursion_depth, | 2214 int recursion_depth, |
| 2189 bool not_at_start) { | 2215 bool not_at_start) { |
| 2190 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; | 2216 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; |
| 2191 return on_success()->EatsAtLeast(still_to_find, | 2217 return on_success()->EatsAtLeast(still_to_find, |
| 2192 recursion_depth + 1, | 2218 recursion_depth + 1, |
| 2193 not_at_start); | 2219 not_at_start); |
| (...skipping 416 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2610 if (body_can_be_zero_length_ || info()->visited) return; | 2636 if (body_can_be_zero_length_ || info()->visited) return; |
| 2611 VisitMarker marker(info()); | 2637 VisitMarker marker(info()); |
| 2612 return ChoiceNode::GetQuickCheckDetails(details, | 2638 return ChoiceNode::GetQuickCheckDetails(details, |
| 2613 compiler, | 2639 compiler, |
| 2614 characters_filled_in, | 2640 characters_filled_in, |
| 2615 not_at_start); | 2641 not_at_start); |
| 2616 } | 2642 } |
| 2617 | 2643 |
| 2618 | 2644 |
| 2619 void LoopChoiceNode::FillInBMInfo( | 2645 void LoopChoiceNode::FillInBMInfo( |
| 2620 int offset, BoyerMooreLookahead* bm, bool nas) { | 2646 int offset, BoyerMooreLookahead* bm, bool not_at_start) { |
| 2621 if (body_can_be_zero_length_) { | 2647 if (body_can_be_zero_length_) { |
| 2622 bm->SetRest(offset); | 2648 bm->SetRest(offset); |
| 2649 SaveBMInfo(bm, not_at_start, offset); |
| 2623 return; | 2650 return; |
| 2624 } | 2651 } |
| 2625 ChoiceNode::FillInBMInfo(offset, bm, nas); | 2652 ChoiceNode::FillInBMInfo(offset, bm, not_at_start); |
| 2653 SaveBMInfo(bm, not_at_start, offset); |
| 2626 } | 2654 } |
| 2627 | 2655 |
| 2628 | 2656 |
| 2629 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, | 2657 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, |
| 2630 RegExpCompiler* compiler, | 2658 RegExpCompiler* compiler, |
| 2631 int characters_filled_in, | 2659 int characters_filled_in, |
| 2632 bool not_at_start) { | 2660 bool not_at_start) { |
| 2633 not_at_start = (not_at_start || not_at_start_); | 2661 not_at_start = (not_at_start || not_at_start_); |
| 2634 int choice_count = alternatives_->length(); | 2662 int choice_count = alternatives_->length(); |
| 2635 ASSERT(choice_count > 0); | 2663 ASSERT(choice_count > 0); |
| (...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2703 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok); | 2731 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok); |
| 2704 } | 2732 } |
| 2705 assembler->CheckCharacter('\n', &ok); | 2733 assembler->CheckCharacter('\n', &ok); |
| 2706 assembler->CheckNotCharacter('\r', new_trace.backtrack()); | 2734 assembler->CheckNotCharacter('\r', new_trace.backtrack()); |
| 2707 } | 2735 } |
| 2708 assembler->Bind(&ok); | 2736 assembler->Bind(&ok); |
| 2709 on_success->Emit(compiler, &new_trace); | 2737 on_success->Emit(compiler, &new_trace); |
| 2710 } | 2738 } |
| 2711 | 2739 |
| 2712 | 2740 |
| 2713 // Emit the code to handle \b and \B (word-boundary or non-word-boundary) | |
| 2714 // when we know whether the next character must be a word character or not. | |
| 2715 static void EmitHalfBoundaryCheck(AssertionNode::AssertionNodeType type, | |
| 2716 RegExpCompiler* compiler, | |
| 2717 RegExpNode* on_success, | |
| 2718 Trace* trace) { | |
| 2719 RegExpMacroAssembler* assembler = compiler->macro_assembler(); | |
| 2720 Label done; | |
| 2721 | |
| 2722 Trace new_trace(*trace); | |
| 2723 | |
| 2724 bool expect_word_character = (type == AssertionNode::AFTER_WORD_CHARACTER); | |
| 2725 Label* on_word = expect_word_character ? &done : new_trace.backtrack(); | |
| 2726 Label* on_non_word = expect_word_character ? new_trace.backtrack() : &done; | |
| 2727 | |
| 2728 // Check whether previous character was a word character. | |
| 2729 switch (trace->at_start()) { | |
| 2730 case Trace::TRUE: | |
| 2731 if (expect_word_character) { | |
| 2732 assembler->GoTo(on_non_word); | |
| 2733 } | |
| 2734 break; | |
| 2735 case Trace::UNKNOWN: | |
| 2736 ASSERT_EQ(0, trace->cp_offset()); | |
| 2737 assembler->CheckAtStart(on_non_word); | |
| 2738 // Fall through. | |
| 2739 case Trace::FALSE: | |
| 2740 int prev_char_offset = trace->cp_offset() - 1; | |
| 2741 assembler->LoadCurrentCharacter(prev_char_offset, NULL, false, 1); | |
| 2742 EmitWordCheck(assembler, on_word, on_non_word, expect_word_character); | |
| 2743 // We may or may not have loaded the previous character. | |
| 2744 new_trace.InvalidateCurrentCharacter(); | |
| 2745 } | |
| 2746 | |
| 2747 assembler->Bind(&done); | |
| 2748 | |
| 2749 on_success->Emit(compiler, &new_trace); | |
| 2750 } | |
| 2751 | |
| 2752 | |
| 2753 // Emit the code to handle \b and \B (word-boundary or non-word-boundary). | 2741 // Emit the code to handle \b and \B (word-boundary or non-word-boundary). |
| 2754 static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type, | 2742 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) { |
| 2755 RegExpCompiler* compiler, | |
| 2756 RegExpNode* on_success, | |
| 2757 Trace* trace) { | |
| 2758 RegExpMacroAssembler* assembler = compiler->macro_assembler(); | 2743 RegExpMacroAssembler* assembler = compiler->macro_assembler(); |
| 2759 Label before_non_word; | 2744 Trace::TriBool next_is_word_character = Trace::UNKNOWN; |
| 2760 Label before_word; | 2745 bool not_at_start = (trace->at_start() == Trace::FALSE); |
| 2761 if (trace->characters_preloaded() != 1) { | 2746 BoyerMooreLookahead* lookahead = bm_info(not_at_start); |
| 2762 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word); | 2747 if (lookahead == NULL) { |
| 2748 int eats_at_least = |
| 2749 Min(kMaxLookaheadForBoyerMoore, |
| 2750 EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start)); |
| 2751 if (eats_at_least >= 1) { |
| 2752 BoyerMooreLookahead* bm = |
| 2753 new BoyerMooreLookahead(eats_at_least, compiler); |
| 2754 FillInBMInfo(0, bm, not_at_start); |
| 2755 if (bm->at(0)->is_non_word()) next_is_word_character = Trace::FALSE; |
| 2756 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE; |
| 2757 } |
| 2758 } else { |
| 2759 if (lookahead->at(0)->is_non_word()) next_is_word_character = Trace::FALSE; |
| 2760 if (lookahead->at(0)->is_word()) next_is_word_character = Trace::TRUE; |
| 2763 } | 2761 } |
| 2764 // Fall through on non-word. | 2762 bool at_boundary = (type_ == AssertionNode::AT_BOUNDARY); |
| 2765 EmitWordCheck(assembler, &before_word, &before_non_word, false); | 2763 if (next_is_word_character == Trace::UNKNOWN) { |
| 2764 Label before_non_word; |
| 2765 Label before_word; |
| 2766 if (trace->characters_preloaded() != 1) { |
| 2767 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word); |
| 2768 } |
| 2769 // Fall through on non-word. |
| 2770 EmitWordCheck(assembler, &before_word, &before_non_word, false); |
| 2771 // Next character is not a word character. |
| 2772 assembler->Bind(&before_non_word); |
| 2773 Label ok; |
| 2774 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); |
| 2775 assembler->GoTo(&ok); |
| 2766 | 2776 |
| 2767 // We will be loading the previous character into the current character | 2777 assembler->Bind(&before_word); |
| 2768 // register. | 2778 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); |
| 2779 assembler->Bind(&ok); |
| 2780 } else if (next_is_word_character == Trace::TRUE) { |
| 2781 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); |
| 2782 } else { |
| 2783 ASSERT(next_is_word_character == Trace::FALSE); |
| 2784 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); |
| 2785 } |
| 2786 } |
| 2787 |
| 2788 |
| 2789 void AssertionNode::BacktrackIfPrevious( |
| 2790 RegExpCompiler* compiler, |
| 2791 Trace* trace, |
| 2792 AssertionNode::IfPrevious backtrack_if_previous) { |
| 2793 RegExpMacroAssembler* assembler = compiler->macro_assembler(); |
| 2769 Trace new_trace(*trace); | 2794 Trace new_trace(*trace); |
| 2770 new_trace.InvalidateCurrentCharacter(); | 2795 new_trace.InvalidateCurrentCharacter(); |
| 2771 | 2796 |
| 2772 Label ok; | 2797 Label fall_through, dummy; |
| 2773 Label* boundary; | |
| 2774 Label* not_boundary; | |
| 2775 if (type == AssertionNode::AT_BOUNDARY) { | |
| 2776 boundary = &ok; | |
| 2777 not_boundary = new_trace.backtrack(); | |
| 2778 } else { | |
| 2779 not_boundary = &ok; | |
| 2780 boundary = new_trace.backtrack(); | |
| 2781 } | |
| 2782 | 2798 |
| 2783 // Next character is not a word character. | 2799 Label* non_word = backtrack_if_previous == kIsNonWord ? |
| 2784 assembler->Bind(&before_non_word); | 2800 new_trace.backtrack() : |
| 2801 &fall_through; |
| 2802 Label* word = backtrack_if_previous == kIsNonWord ? |
| 2803 &fall_through : |
| 2804 new_trace.backtrack(); |
| 2805 |
| 2785 if (new_trace.cp_offset() == 0) { | 2806 if (new_trace.cp_offset() == 0) { |
| 2786 // The start of input counts as a non-word character, so the question is | 2807 // The start of input counts as a non-word character, so the question is |
| 2787 // decided if we are at the start. | 2808 // decided if we are at the start. |
| 2788 assembler->CheckAtStart(not_boundary); | 2809 assembler->CheckAtStart(non_word); |
| 2789 } | 2810 } |
| 2790 // We already checked that we are not at the start of input so it must be | 2811 // We already checked that we are not at the start of input so it must be |
| 2791 // OK to load the previous character. | 2812 // OK to load the previous character. |
| 2792 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, | 2813 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false); |
| 2793 &ok, // Unused dummy label in this call. | 2814 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord); |
| 2794 false); | |
| 2795 // Fall through on non-word. | |
| 2796 EmitWordCheck(assembler, boundary, not_boundary, false); | |
| 2797 assembler->GoTo(not_boundary); | |
| 2798 | 2815 |
| 2799 // Next character is a word character. | 2816 assembler->Bind(&fall_through); |
| 2800 assembler->Bind(&before_word); | 2817 on_success()->Emit(compiler, &new_trace); |
| 2801 if (new_trace.cp_offset() == 0) { | |
| 2802 // The start of input counts as a non-word character, so the question is | |
| 2803 // decided if we are at the start. | |
| 2804 assembler->CheckAtStart(boundary); | |
| 2805 } | |
| 2806 // We already checked that we are not at the start of input so it must be | |
| 2807 // OK to load the previous character. | |
| 2808 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, | |
| 2809 &ok, // Unused dummy label in this call. | |
| 2810 false); | |
| 2811 bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY); | |
| 2812 EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word); | |
| 2813 | |
| 2814 assembler->Bind(&ok); | |
| 2815 | |
| 2816 on_success->Emit(compiler, &new_trace); | |
| 2817 } | 2818 } |
| 2818 | 2819 |
| 2819 | 2820 |
| 2820 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details, | 2821 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details, |
| 2821 RegExpCompiler* compiler, | 2822 RegExpCompiler* compiler, |
| 2822 int filled_in, | 2823 int filled_in, |
| 2823 bool not_at_start) { | 2824 bool not_at_start) { |
| 2824 if (type_ == AT_START && not_at_start) { | 2825 if (type_ == AT_START && not_at_start) { |
| 2825 details->set_cannot_match(); | 2826 details->set_cannot_match(); |
| 2826 return; | 2827 return; |
| (...skipping 27 matching lines...) Expand all Loading... |
| 2854 on_success()->Emit(compiler, &at_start_trace); | 2855 on_success()->Emit(compiler, &at_start_trace); |
| 2855 return; | 2856 return; |
| 2856 } | 2857 } |
| 2857 } | 2858 } |
| 2858 break; | 2859 break; |
| 2859 case AFTER_NEWLINE: | 2860 case AFTER_NEWLINE: |
| 2860 EmitHat(compiler, on_success(), trace); | 2861 EmitHat(compiler, on_success(), trace); |
| 2861 return; | 2862 return; |
| 2862 case AT_BOUNDARY: | 2863 case AT_BOUNDARY: |
| 2863 case AT_NON_BOUNDARY: { | 2864 case AT_NON_BOUNDARY: { |
| 2864 EmitBoundaryCheck(type_, compiler, on_success(), trace); | 2865 EmitBoundaryCheck(compiler, trace); |
| 2865 return; | 2866 return; |
| 2866 } | 2867 } |
| 2867 case AFTER_WORD_CHARACTER: | |
| 2868 case AFTER_NONWORD_CHARACTER: { | |
| 2869 EmitHalfBoundaryCheck(type_, compiler, on_success(), trace); | |
| 2870 } | |
| 2871 } | 2868 } |
| 2872 on_success()->Emit(compiler, trace); | 2869 on_success()->Emit(compiler, trace); |
| 2873 } | 2870 } |
| 2874 | 2871 |
| 2875 | 2872 |
| 2876 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { | 2873 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { |
| 2877 if (quick_check == NULL) return false; | 2874 if (quick_check == NULL) return false; |
| 2878 if (offset >= quick_check->characters()) return false; | 2875 if (offset >= quick_check->characters()) return false; |
| 2879 return quick_check->positions(offset)->determines_perfectly; | 2876 return quick_check->positions(offset)->determines_perfectly; |
| 2880 } | 2877 } |
| (...skipping 389 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3270 return alt_gens_[i]; | 3267 return alt_gens_[i]; |
| 3271 } | 3268 } |
| 3272 | 3269 |
| 3273 private: | 3270 private: |
| 3274 static const int kAFew = 10; | 3271 static const int kAFew = 10; |
| 3275 ZoneList<AlternativeGeneration*> alt_gens_; | 3272 ZoneList<AlternativeGeneration*> alt_gens_; |
| 3276 AlternativeGeneration a_few_alt_gens_[kAFew]; | 3273 AlternativeGeneration a_few_alt_gens_[kAFew]; |
| 3277 }; | 3274 }; |
| 3278 | 3275 |
| 3279 | 3276 |
| 3277 // The '2' variant is has inclusive from and exclusive to. |
| 3278 static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, |
| 3279 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A, |
| 3280 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, 0x10000 }; |
| 3281 static const int kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges); |
| 3282 |
| 3283 static const int kWordRanges[] = { |
| 3284 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 }; |
| 3285 static const int kWordRangeCount = ARRAY_SIZE(kWordRanges); |
| 3286 static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 }; |
| 3287 static const int kDigitRangeCount = ARRAY_SIZE(kDigitRanges); |
| 3288 static const int kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 }; |
| 3289 static const int kSurrogateRangeCount = ARRAY_SIZE(kSurrogateRanges); |
| 3290 static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E, |
| 3291 0x2028, 0x202A, 0x10000 }; |
| 3292 static const int kLineTerminatorRangeCount = ARRAY_SIZE(kLineTerminatorRanges); |
| 3293 |
| 3294 |
| 3295 void BoyerMoorePositionInfo::Set(int character) { |
| 3296 SetInterval(Interval(character, character)); |
| 3297 } |
| 3298 |
| 3299 |
| 3300 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) { |
| 3301 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval); |
| 3302 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval); |
| 3303 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval); |
| 3304 surrogate_ = |
| 3305 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval); |
| 3306 if (interval.to() - interval.from() >= kMapSize - 1) { |
| 3307 if (map_count_ != kMapSize) { |
| 3308 map_count_ = kMapSize; |
| 3309 for (int i = 0; i < kMapSize; i++) map_->at(i) = true; |
| 3310 } |
| 3311 return; |
| 3312 } |
| 3313 for (int i = interval.from(); i <= interval.to(); i++) { |
| 3314 int mod_character = (i & kMask); |
| 3315 if (!map_->at(mod_character)) { |
| 3316 map_count_++; |
| 3317 map_->at(mod_character) = true; |
| 3318 } |
| 3319 if (map_count_ == kMapSize) return; |
| 3320 } |
| 3321 } |
| 3322 |
| 3323 |
| 3324 void BoyerMoorePositionInfo::SetAll() { |
| 3325 s_ = w_ = d_ = kLatticeUnknown; |
| 3326 if (map_count_ != kMapSize) { |
| 3327 map_count_ = kMapSize; |
| 3328 for (int i = 0; i < kMapSize; i++) map_->at(i) = true; |
| 3329 } |
| 3330 } |
| 3331 |
| 3332 |
| 3280 BoyerMooreLookahead::BoyerMooreLookahead( | 3333 BoyerMooreLookahead::BoyerMooreLookahead( |
| 3281 int length, int map_length, RegExpCompiler* compiler) | 3334 int length, RegExpCompiler* compiler) |
| 3282 : length_(length), | 3335 : length_(length), |
| 3283 map_length_(map_length), | |
| 3284 compiler_(compiler) { | 3336 compiler_(compiler) { |
| 3285 ASSERT(IsPowerOf2(map_length)); | |
| 3286 if (compiler->ascii()) { | 3337 if (compiler->ascii()) { |
| 3287 max_char_ = String::kMaxAsciiCharCode; | 3338 max_char_ = String::kMaxAsciiCharCode; |
| 3288 } else { | 3339 } else { |
| 3289 max_char_ = String::kMaxUtf16CodeUnit; | 3340 max_char_ = String::kMaxUtf16CodeUnit; |
| 3290 } | 3341 } |
| 3291 bitmaps_ = new ZoneList<ZoneList<bool>*>(length); | 3342 bitmaps_ = new ZoneList<BoyerMoorePositionInfo*>(length); |
| 3292 for (int i = 0; i < length; i++) { | 3343 for (int i = 0; i < length; i++) { |
| 3293 bitmaps_->Add(new ZoneList<bool>(map_length)); | 3344 bitmaps_->Add(new BoyerMoorePositionInfo()); |
| 3294 ZoneList<bool>* map = bitmaps_->at(i); | |
| 3295 for (int i = 0; i < map_length; i++) { | |
| 3296 map->Add(false); | |
| 3297 } | |
| 3298 } | 3345 } |
| 3299 } | 3346 } |
| 3300 | 3347 |
| 3301 | 3348 |
| 3302 // Find the longest range of lookahead that has the fewest number of different | 3349 // Find the longest range of lookahead that has the fewest number of different |
| 3303 // characters that can occur at a given position. Since we are optimizing two | 3350 // characters that can occur at a given position. Since we are optimizing two |
| 3304 // different parameters at once this is a tradeoff. | 3351 // different parameters at once this is a tradeoff. |
| 3305 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) { | 3352 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) { |
| 3306 int biggest_points = 0; | 3353 int biggest_points = 0; |
| 3354 // If more than 32 characters out of 128 can occur it is unlikely that we can |
| 3355 // be lucky enough to step forwards much of the time. |
| 3356 const int kMaxMax = 32; |
| 3307 for (int max_number_of_chars = 4; | 3357 for (int max_number_of_chars = 4; |
| 3308 max_number_of_chars < kTooManyCharacters; | 3358 max_number_of_chars < kMaxMax; |
| 3309 max_number_of_chars *= 2) { | 3359 max_number_of_chars *= 2) { |
| 3310 biggest_points = | 3360 biggest_points = |
| 3311 FindBestInterval(max_number_of_chars, biggest_points, from, to); | 3361 FindBestInterval(max_number_of_chars, biggest_points, from, to); |
| 3312 } | 3362 } |
| 3313 if (biggest_points == 0) return false; | 3363 if (biggest_points == 0) return false; |
| 3314 return true; | 3364 return true; |
| 3315 } | 3365 } |
| 3316 | 3366 |
| 3317 | 3367 |
| 3318 // Find the highest-points range between 0 and length_ where the character | 3368 // Find the highest-points range between 0 and length_ where the character |
| 3319 // information is not too vague. 'Too vague' means that there are more than | 3369 // information is not too vague. 'Too vague' means that there are more than |
| 3320 // max_number_of_chars that can occur at this position. Calculates the number | 3370 // max_number_of_chars that can occur at this position. Calculates the number |
| 3321 // of points as the product of width-of-the-range and | 3371 // of points as the product of width-of-the-range and |
| 3322 // probability-of-finding-one-of-the-characters, where the probability is | 3372 // probability-of-finding-one-of-the-characters, where the probability is |
| 3323 // calculated using the frequency distribution of the sample subject string. | 3373 // calculated using the frequency distribution of the sample subject string. |
| 3324 int BoyerMooreLookahead::FindBestInterval( | 3374 int BoyerMooreLookahead::FindBestInterval( |
| 3325 int max_number_of_chars, int old_biggest_points, int* from, int* to) { | 3375 int max_number_of_chars, int old_biggest_points, int* from, int* to) { |
| 3326 int biggest_points = old_biggest_points; | 3376 int biggest_points = old_biggest_points; |
| 3327 static const int kSize = RegExpMacroAssembler::kTableSize; | 3377 static const int kSize = RegExpMacroAssembler::kTableSize; |
| 3328 for (int i = 0; i < length_; ) { | 3378 for (int i = 0; i < length_; ) { |
| 3329 while (i < length_ && Count(i) > max_number_of_chars) i++; | 3379 while (i < length_ && Count(i) > max_number_of_chars) i++; |
| 3330 if (i == length_) break; | 3380 if (i == length_) break; |
| 3331 int remembered_from = i; | 3381 int remembered_from = i; |
| 3332 bool union_map[kSize]; | 3382 bool union_map[kSize]; |
| 3333 for (int j = 0; j < kSize; j++) union_map[j] = false; | 3383 for (int j = 0; j < kSize; j++) union_map[j] = false; |
| 3334 while (i < length_ && Count(i) <= max_number_of_chars) { | 3384 while (i < length_ && Count(i) <= max_number_of_chars) { |
| 3335 ZoneList<bool>* map = bitmaps_->at(i); | 3385 BoyerMoorePositionInfo* map = bitmaps_->at(i); |
| 3336 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j); | 3386 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j); |
| 3337 i++; | 3387 i++; |
| 3338 } | 3388 } |
| 3339 int frequency = 0; | 3389 int frequency = 0; |
| 3340 for (int j = 0; j < kSize; j++) { | 3390 for (int j = 0; j < kSize; j++) { |
| 3341 if (union_map[j]) { | 3391 if (union_map[j]) { |
| 3342 // Add 1 to the frequency to give a small per-character boost for | 3392 // Add 1 to the frequency to give a small per-character boost for |
| 3343 // the cases where our sampling is not good enough and many | 3393 // the cases where our sampling is not good enough and many |
| 3344 // characters have a frequency of zero. This means the frequency | 3394 // characters have a frequency of zero. This means the frequency |
| 3345 // can theoretically be up to 2*kSize though we treat it mostly as | 3395 // can theoretically be up to 2*kSize though we treat it mostly as |
| (...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3380 | 3430 |
| 3381 const int kSkipArrayEntry = 0; | 3431 const int kSkipArrayEntry = 0; |
| 3382 const int kDontSkipArrayEntry = 1; | 3432 const int kDontSkipArrayEntry = 1; |
| 3383 | 3433 |
| 3384 for (int i = 0; i < kSize; i++) { | 3434 for (int i = 0; i < kSize; i++) { |
| 3385 boolean_skip_table->set(i, kSkipArrayEntry); | 3435 boolean_skip_table->set(i, kSkipArrayEntry); |
| 3386 } | 3436 } |
| 3387 int skip = max_lookahead + 1 - min_lookahead; | 3437 int skip = max_lookahead + 1 - min_lookahead; |
| 3388 | 3438 |
| 3389 for (int i = max_lookahead; i >= min_lookahead; i--) { | 3439 for (int i = max_lookahead; i >= min_lookahead; i--) { |
| 3390 ZoneList<bool>* map = bitmaps_->at(i); | 3440 BoyerMoorePositionInfo* map = bitmaps_->at(i); |
| 3391 for (int j = 0; j < map_length_; j++) { | 3441 for (int j = 0; j < kSize; j++) { |
| 3392 if (map->at(j)) { | 3442 if (map->at(j)) { |
| 3393 boolean_skip_table->set(j, kDontSkipArrayEntry); | 3443 boolean_skip_table->set(j, kDontSkipArrayEntry); |
| 3394 } | 3444 } |
| 3395 } | 3445 } |
| 3396 } | 3446 } |
| 3397 | 3447 |
| 3398 return skip; | 3448 return skip; |
| 3399 } | 3449 } |
| 3400 | 3450 |
| 3401 | 3451 |
| 3402 // See comment above on the implementation of GetSkipTable. | 3452 // See comment above on the implementation of GetSkipTable. |
| 3403 bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { | 3453 bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { |
| 3454 const int kSize = RegExpMacroAssembler::kTableSize; |
| 3455 |
| 3404 int min_lookahead = 0; | 3456 int min_lookahead = 0; |
| 3405 int max_lookahead = 0; | 3457 int max_lookahead = 0; |
| 3406 | 3458 |
| 3407 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false; | 3459 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false; |
| 3408 | 3460 |
| 3409 bool found_single_character = false; | 3461 bool found_single_character = false; |
| 3410 bool abandoned_search_for_single_character = false; | |
| 3411 int single_character = 0; | 3462 int single_character = 0; |
| 3412 for (int i = max_lookahead; i >= min_lookahead; i--) { | 3463 for (int i = max_lookahead; i >= min_lookahead; i--) { |
| 3413 ZoneList<bool>* map = bitmaps_->at(i); | 3464 BoyerMoorePositionInfo* map = bitmaps_->at(i); |
| 3414 for (int j = 0; j < map_length_; j++) { | 3465 if (map->map_count() > 1 || |
| 3466 (found_single_character && map->map_count() != 0)) { |
| 3467 found_single_character = false; |
| 3468 break; |
| 3469 } |
| 3470 for (int j = 0; j < kSize; j++) { |
| 3415 if (map->at(j)) { | 3471 if (map->at(j)) { |
| 3416 if (found_single_character) { | 3472 found_single_character = true; |
| 3417 found_single_character = false; // Found two. | 3473 single_character = j; |
| 3418 abandoned_search_for_single_character = true; | 3474 break; |
| 3419 break; | |
| 3420 } else { | |
| 3421 found_single_character = true; | |
| 3422 single_character = j; | |
| 3423 } | |
| 3424 } | 3475 } |
| 3425 } | 3476 } |
| 3426 if (abandoned_search_for_single_character) break; | |
| 3427 } | 3477 } |
| 3428 | 3478 |
| 3429 int lookahead_width = max_lookahead + 1 - min_lookahead; | 3479 int lookahead_width = max_lookahead + 1 - min_lookahead; |
| 3430 | 3480 |
| 3431 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) { | 3481 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) { |
| 3432 // The mask-compare can probably handle this better. | 3482 // The mask-compare can probably handle this better. |
| 3433 return false; | 3483 return false; |
| 3434 } | 3484 } |
| 3435 | 3485 |
| 3436 if (found_single_character) { | 3486 if (found_single_character) { |
| 3437 Label cont, again; | 3487 Label cont, again; |
| 3438 masm->Bind(&again); | 3488 masm->Bind(&again); |
| 3439 masm->LoadCurrentCharacter(max_lookahead, &cont, true); | 3489 masm->LoadCurrentCharacter(max_lookahead, &cont, true); |
| 3440 if (max_char_ > map_length_) { | 3490 if (max_char_ > kSize) { |
| 3441 ASSERT(map_length_ == RegExpMacroAssembler::kTableSize); | |
| 3442 masm->CheckCharacterAfterAnd(single_character, | 3491 masm->CheckCharacterAfterAnd(single_character, |
| 3443 RegExpMacroAssembler::kTableMask, | 3492 RegExpMacroAssembler::kTableMask, |
| 3444 &cont); | 3493 &cont); |
| 3445 } else { | 3494 } else { |
| 3446 masm->CheckCharacter(single_character, &cont); | 3495 masm->CheckCharacter(single_character, &cont); |
| 3447 } | 3496 } |
| 3448 masm->AdvanceCurrentPosition(lookahead_width); | 3497 masm->AdvanceCurrentPosition(lookahead_width); |
| 3449 masm->GoTo(&again); | 3498 masm->GoTo(&again); |
| 3450 masm->Bind(&cont); | 3499 masm->Bind(&cont); |
| 3451 return true; | 3500 return true; |
| 3452 } | 3501 } |
| 3453 | 3502 |
| 3454 Handle<ByteArray> boolean_skip_table = | 3503 Handle<ByteArray> boolean_skip_table = |
| 3455 FACTORY->NewByteArray(map_length_, TENURED); | 3504 FACTORY->NewByteArray(kSize, TENURED); |
| 3456 int skip_distance = GetSkipTable( | 3505 int skip_distance = GetSkipTable( |
| 3457 min_lookahead, max_lookahead, boolean_skip_table); | 3506 min_lookahead, max_lookahead, boolean_skip_table); |
| 3458 ASSERT(skip_distance != 0); | 3507 ASSERT(skip_distance != 0); |
| 3459 | 3508 |
| 3460 Label cont, again; | 3509 Label cont, again; |
| 3461 masm->Bind(&again); | 3510 masm->Bind(&again); |
| 3462 masm->LoadCurrentCharacter(max_lookahead, &cont, true); | 3511 masm->LoadCurrentCharacter(max_lookahead, &cont, true); |
| 3463 masm->CheckBitInTable(boolean_skip_table, &cont); | 3512 masm->CheckBitInTable(boolean_skip_table, &cont); |
| 3464 masm->AdvanceCurrentPosition(skip_distance); | 3513 masm->AdvanceCurrentPosition(skip_distance); |
| 3465 masm->GoTo(&again); | 3514 masm->GoTo(&again); |
| (...skipping 158 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3624 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) == | 3673 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) == |
| 3625 this) { | 3674 this) { |
| 3626 // At this point we know that we are at a non-greedy loop that will eat | 3675 // At this point we know that we are at a non-greedy loop that will eat |
| 3627 // any character one at a time. Any non-anchored regexp has such a | 3676 // any character one at a time. Any non-anchored regexp has such a |
| 3628 // loop prepended to it in order to find where it starts. We look for | 3677 // loop prepended to it in order to find where it starts. We look for |
| 3629 // a pattern of the form ...abc... where we can look 6 characters ahead | 3678 // a pattern of the form ...abc... where we can look 6 characters ahead |
| 3630 // and step forwards 3 if the character is not one of abc. Abc need | 3679 // and step forwards 3 if the character is not one of abc. Abc need |
| 3631 // not be atoms, they can be any reasonably limited character class or | 3680 // not be atoms, they can be any reasonably limited character class or |
| 3632 // small alternation. | 3681 // small alternation. |
| 3633 ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes. | 3682 ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes. |
| 3634 eats_at_least = | 3683 BoyerMooreLookahead* lookahead = bm_info(not_at_start); |
| 3635 Min(kMaxLookaheadForBoyerMoore, | 3684 if (lookahead == NULL) { |
| 3636 EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start)); | 3685 eats_at_least = |
| 3637 if (eats_at_least >= 1) { | 3686 Min(kMaxLookaheadForBoyerMoore, |
| 3638 BoyerMooreLookahead bm(eats_at_least, | 3687 EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start)); |
| 3639 RegExpMacroAssembler::kTableSize, | 3688 if (eats_at_least >= 1) { |
| 3640 compiler); | 3689 BoyerMooreLookahead* bm = |
| 3641 GuardedAlternative alt0 = alternatives_->at(0); | 3690 new BoyerMooreLookahead(eats_at_least, compiler); |
| 3642 alt0.node()->FillInBMInfo(0, &bm, not_at_start); | 3691 GuardedAlternative alt0 = alternatives_->at(0); |
| 3643 skip_was_emitted = bm.EmitSkipInstructions(macro_assembler); | 3692 alt0.node()->FillInBMInfo(0, bm, not_at_start); |
| 3693 skip_was_emitted = bm->EmitSkipInstructions(macro_assembler); |
| 3694 } |
| 3695 } else { |
| 3696 skip_was_emitted = lookahead->EmitSkipInstructions(macro_assembler); |
| 3644 } | 3697 } |
| 3645 } | 3698 } |
| 3646 } | 3699 } |
| 3647 } | 3700 } |
| 3648 | 3701 |
| 3649 if (eats_at_least == kEatsAtLeastNotYetInitialized) { | 3702 if (eats_at_least == kEatsAtLeastNotYetInitialized) { |
| 3650 // Save some time by looking at most one machine word ahead. | 3703 // Save some time by looking at most one machine word ahead. |
| 3651 eats_at_least = EatsAtLeast(compiler->ascii() ? 4 : 2, 0, not_at_start); | 3704 eats_at_least = EatsAtLeast(compiler->ascii() ? 4 : 2, 0, not_at_start); |
| 3652 } | 3705 } |
| 3653 int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least); | 3706 int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least); |
| (...skipping 542 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4196 break; | 4249 break; |
| 4197 case AssertionNode::AT_BOUNDARY: | 4250 case AssertionNode::AT_BOUNDARY: |
| 4198 stream()->Add("label=\"\\b\", shape=septagon"); | 4251 stream()->Add("label=\"\\b\", shape=septagon"); |
| 4199 break; | 4252 break; |
| 4200 case AssertionNode::AT_NON_BOUNDARY: | 4253 case AssertionNode::AT_NON_BOUNDARY: |
| 4201 stream()->Add("label=\"\\B\", shape=septagon"); | 4254 stream()->Add("label=\"\\B\", shape=septagon"); |
| 4202 break; | 4255 break; |
| 4203 case AssertionNode::AFTER_NEWLINE: | 4256 case AssertionNode::AFTER_NEWLINE: |
| 4204 stream()->Add("label=\"(?<=\\n)\", shape=septagon"); | 4257 stream()->Add("label=\"(?<=\\n)\", shape=septagon"); |
| 4205 break; | 4258 break; |
| 4206 case AssertionNode::AFTER_WORD_CHARACTER: | |
| 4207 stream()->Add("label=\"(?<=\\w)\", shape=septagon"); | |
| 4208 break; | |
| 4209 case AssertionNode::AFTER_NONWORD_CHARACTER: | |
| 4210 stream()->Add("label=\"(?<=\\W)\", shape=septagon"); | |
| 4211 break; | |
| 4212 } | 4259 } |
| 4213 stream()->Add("];\n"); | 4260 stream()->Add("];\n"); |
| 4214 PrintAttributes(that); | 4261 PrintAttributes(that); |
| 4215 RegExpNode* successor = that->on_success(); | 4262 RegExpNode* successor = that->on_success(); |
| 4216 stream()->Add(" n%p -> n%p;\n", that, successor); | 4263 stream()->Add(" n%p -> n%p;\n", that, successor); |
| 4217 Visit(successor); | 4264 Visit(successor); |
| 4218 } | 4265 } |
| 4219 | 4266 |
| 4220 | 4267 |
| 4221 void DotPrinter::VisitAction(ActionNode* that) { | 4268 void DotPrinter::VisitAction(ActionNode* that) { |
| (...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4306 printer.PrintNode(label, node); | 4353 printer.PrintNode(label, node); |
| 4307 } | 4354 } |
| 4308 | 4355 |
| 4309 | 4356 |
| 4310 #endif // DEBUG | 4357 #endif // DEBUG |
| 4311 | 4358 |
| 4312 | 4359 |
| 4313 // ------------------------------------------------------------------- | 4360 // ------------------------------------------------------------------- |
| 4314 // Tree to graph conversion | 4361 // Tree to graph conversion |
| 4315 | 4362 |
| 4316 static const uc16 kSpaceRanges[] = { 0x0009, 0x000D, 0x0020, 0x0020, 0x00A0, | |
| 4317 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A, 0x2028, 0x2029, | |
| 4318 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000, 0xFEFF, 0xFEFF }; | |
| 4319 static const int kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges); | |
| 4320 | |
| 4321 static const uc16 kWordRanges[] = { '0', '9', 'A', 'Z', '_', '_', 'a', 'z' }; | |
| 4322 static const int kWordRangeCount = ARRAY_SIZE(kWordRanges); | |
| 4323 | |
| 4324 static const uc16 kDigitRanges[] = { '0', '9' }; | |
| 4325 static const int kDigitRangeCount = ARRAY_SIZE(kDigitRanges); | |
| 4326 | |
| 4327 static const uc16 kLineTerminatorRanges[] = { 0x000A, 0x000A, 0x000D, 0x000D, | |
| 4328 0x2028, 0x2029 }; | |
| 4329 static const int kLineTerminatorRangeCount = ARRAY_SIZE(kLineTerminatorRanges); | |
| 4330 | |
| 4331 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, | 4363 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, |
| 4332 RegExpNode* on_success) { | 4364 RegExpNode* on_success) { |
| 4333 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1); | 4365 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1); |
| 4334 elms->Add(TextElement::Atom(this)); | 4366 elms->Add(TextElement::Atom(this)); |
| 4335 return new TextNode(elms, on_success); | 4367 return new TextNode(elms, on_success); |
| 4336 } | 4368 } |
| 4337 | 4369 |
| 4338 | 4370 |
| 4339 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, | 4371 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, |
| 4340 RegExpNode* on_success) { | 4372 RegExpNode* on_success) { |
| 4341 return new TextNode(elements(), on_success); | 4373 return new TextNode(elements(), on_success); |
| 4342 } | 4374 } |
| 4343 | 4375 |
| 4376 |
| 4344 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges, | 4377 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges, |
| 4345 const uc16* special_class, | 4378 const int* special_class, |
| 4346 int length) { | 4379 int length) { |
| 4380 length--; // Remove final 0x10000. |
| 4381 ASSERT(special_class[length] == 0x10000); |
| 4347 ASSERT(ranges->length() != 0); | 4382 ASSERT(ranges->length() != 0); |
| 4348 ASSERT(length != 0); | 4383 ASSERT(length != 0); |
| 4349 ASSERT(special_class[0] != 0); | 4384 ASSERT(special_class[0] != 0); |
| 4350 if (ranges->length() != (length >> 1) + 1) { | 4385 if (ranges->length() != (length >> 1) + 1) { |
| 4351 return false; | 4386 return false; |
| 4352 } | 4387 } |
| 4353 CharacterRange range = ranges->at(0); | 4388 CharacterRange range = ranges->at(0); |
| 4354 if (range.from() != 0) { | 4389 if (range.from() != 0) { |
| 4355 return false; | 4390 return false; |
| 4356 } | 4391 } |
| 4357 for (int i = 0; i < length; i += 2) { | 4392 for (int i = 0; i < length; i += 2) { |
| 4358 if (special_class[i] != (range.to() + 1)) { | 4393 if (special_class[i] != (range.to() + 1)) { |
| 4359 return false; | 4394 return false; |
| 4360 } | 4395 } |
| 4361 range = ranges->at((i >> 1) + 1); | 4396 range = ranges->at((i >> 1) + 1); |
| 4362 if (special_class[i+1] != range.from() - 1) { | 4397 if (special_class[i+1] != range.from()) { |
| 4363 return false; | 4398 return false; |
| 4364 } | 4399 } |
| 4365 } | 4400 } |
| 4366 if (range.to() != 0xffff) { | 4401 if (range.to() != 0xffff) { |
| 4367 return false; | 4402 return false; |
| 4368 } | 4403 } |
| 4369 return true; | 4404 return true; |
| 4370 } | 4405 } |
| 4371 | 4406 |
| 4372 | 4407 |
| 4373 static bool CompareRanges(ZoneList<CharacterRange>* ranges, | 4408 static bool CompareRanges(ZoneList<CharacterRange>* ranges, |
| 4374 const uc16* special_class, | 4409 const int* special_class, |
| 4375 int length) { | 4410 int length) { |
| 4411 length--; // Remove final 0x10000. |
| 4412 ASSERT(special_class[length] == 0x10000); |
| 4376 if (ranges->length() * 2 != length) { | 4413 if (ranges->length() * 2 != length) { |
| 4377 return false; | 4414 return false; |
| 4378 } | 4415 } |
| 4379 for (int i = 0; i < length; i += 2) { | 4416 for (int i = 0; i < length; i += 2) { |
| 4380 CharacterRange range = ranges->at(i >> 1); | 4417 CharacterRange range = ranges->at(i >> 1); |
| 4381 if (range.from() != special_class[i] || range.to() != special_class[i+1]) { | 4418 if (range.from() != special_class[i] || |
| 4419 range.to() != special_class[i + 1] - 1) { |
| 4382 return false; | 4420 return false; |
| 4383 } | 4421 } |
| 4384 } | 4422 } |
| 4385 return true; | 4423 return true; |
| 4386 } | 4424 } |
| 4387 | 4425 |
| 4388 | 4426 |
| 4389 bool RegExpCharacterClass::is_standard() { | 4427 bool RegExpCharacterClass::is_standard() { |
| 4390 // TODO(lrn): Remove need for this function, by not throwing away information | 4428 // TODO(lrn): Remove need for this function, by not throwing away information |
| 4391 // along the way. | 4429 // along the way. |
| (...skipping 380 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4772 RegExpNode* on_success) { | 4810 RegExpNode* on_success) { |
| 4773 ZoneList<RegExpTree*>* children = nodes(); | 4811 ZoneList<RegExpTree*>* children = nodes(); |
| 4774 RegExpNode* current = on_success; | 4812 RegExpNode* current = on_success; |
| 4775 for (int i = children->length() - 1; i >= 0; i--) { | 4813 for (int i = children->length() - 1; i >= 0; i--) { |
| 4776 current = children->at(i)->ToNode(compiler, current); | 4814 current = children->at(i)->ToNode(compiler, current); |
| 4777 } | 4815 } |
| 4778 return current; | 4816 return current; |
| 4779 } | 4817 } |
| 4780 | 4818 |
| 4781 | 4819 |
| 4782 static void AddClass(const uc16* elmv, | 4820 static void AddClass(const int* elmv, |
| 4783 int elmc, | 4821 int elmc, |
| 4784 ZoneList<CharacterRange>* ranges) { | 4822 ZoneList<CharacterRange>* ranges) { |
| 4823 elmc--; |
| 4824 ASSERT(elmv[elmc] == 0x10000); |
| 4785 for (int i = 0; i < elmc; i += 2) { | 4825 for (int i = 0; i < elmc; i += 2) { |
| 4786 ASSERT(elmv[i] <= elmv[i + 1]); | 4826 ASSERT(elmv[i] < elmv[i + 1]); |
| 4787 ranges->Add(CharacterRange(elmv[i], elmv[i + 1])); | 4827 ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1)); |
| 4788 } | 4828 } |
| 4789 } | 4829 } |
| 4790 | 4830 |
| 4791 | 4831 |
| 4792 static void AddClassNegated(const uc16 *elmv, | 4832 static void AddClassNegated(const int *elmv, |
| 4793 int elmc, | 4833 int elmc, |
| 4794 ZoneList<CharacterRange>* ranges) { | 4834 ZoneList<CharacterRange>* ranges) { |
| 4835 elmc--; |
| 4836 ASSERT(elmv[elmc] == 0x10000); |
| 4795 ASSERT(elmv[0] != 0x0000); | 4837 ASSERT(elmv[0] != 0x0000); |
| 4796 ASSERT(elmv[elmc-1] != String::kMaxUtf16CodeUnit); | 4838 ASSERT(elmv[elmc-1] != String::kMaxUtf16CodeUnit); |
| 4797 uc16 last = 0x0000; | 4839 uc16 last = 0x0000; |
| 4798 for (int i = 0; i < elmc; i += 2) { | 4840 for (int i = 0; i < elmc; i += 2) { |
| 4799 ASSERT(last <= elmv[i] - 1); | 4841 ASSERT(last <= elmv[i] - 1); |
| 4800 ASSERT(elmv[i] <= elmv[i + 1]); | 4842 ASSERT(elmv[i] < elmv[i + 1]); |
| 4801 ranges->Add(CharacterRange(last, elmv[i] - 1)); | 4843 ranges->Add(CharacterRange(last, elmv[i] - 1)); |
| 4802 last = elmv[i + 1] + 1; | 4844 last = elmv[i + 1]; |
| 4803 } | 4845 } |
| 4804 ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit)); | 4846 ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit)); |
| 4805 } | 4847 } |
| 4806 | 4848 |
| 4807 | 4849 |
| 4808 void CharacterRange::AddClassEscape(uc16 type, | 4850 void CharacterRange::AddClassEscape(uc16 type, |
| 4809 ZoneList<CharacterRange>* ranges) { | 4851 ZoneList<CharacterRange>* ranges) { |
| 4810 switch (type) { | 4852 switch (type) { |
| 4811 case 's': | 4853 case 's': |
| 4812 AddClass(kSpaceRanges, kSpaceRangeCount, ranges); | 4854 AddClass(kSpaceRanges, kSpaceRangeCount, ranges); |
| (...skipping 30 matching lines...) Expand all Loading... |
| 4843 AddClass(kLineTerminatorRanges, | 4885 AddClass(kLineTerminatorRanges, |
| 4844 kLineTerminatorRangeCount, | 4886 kLineTerminatorRangeCount, |
| 4845 ranges); | 4887 ranges); |
| 4846 break; | 4888 break; |
| 4847 default: | 4889 default: |
| 4848 UNREACHABLE(); | 4890 UNREACHABLE(); |
| 4849 } | 4891 } |
| 4850 } | 4892 } |
| 4851 | 4893 |
| 4852 | 4894 |
| 4853 Vector<const uc16> CharacterRange::GetWordBounds() { | 4895 Vector<const int> CharacterRange::GetWordBounds() { |
| 4854 return Vector<const uc16>(kWordRanges, kWordRangeCount); | 4896 return Vector<const int>(kWordRanges, kWordRangeCount - 1); |
| 4855 } | 4897 } |
| 4856 | 4898 |
| 4857 | 4899 |
| 4858 class CharacterRangeSplitter { | 4900 class CharacterRangeSplitter { |
| 4859 public: | 4901 public: |
| 4860 CharacterRangeSplitter(ZoneList<CharacterRange>** included, | 4902 CharacterRangeSplitter(ZoneList<CharacterRange>** included, |
| 4861 ZoneList<CharacterRange>** excluded) | 4903 ZoneList<CharacterRange>** excluded) |
| 4862 : included_(included), | 4904 : included_(included), |
| 4863 excluded_(excluded) { } | 4905 excluded_(excluded) { } |
| 4864 void Call(uc16 from, DispatchTable::Entry entry); | 4906 void Call(uc16 from, DispatchTable::Entry entry); |
| (...skipping 11 matching lines...) Expand all Loading... |
| 4876 if (!entry.out_set()->Get(kInBase)) return; | 4918 if (!entry.out_set()->Get(kInBase)) return; |
| 4877 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay) | 4919 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay) |
| 4878 ? included_ | 4920 ? included_ |
| 4879 : excluded_; | 4921 : excluded_; |
| 4880 if (*target == NULL) *target = new ZoneList<CharacterRange>(2); | 4922 if (*target == NULL) *target = new ZoneList<CharacterRange>(2); |
| 4881 (*target)->Add(CharacterRange(entry.from(), entry.to())); | 4923 (*target)->Add(CharacterRange(entry.from(), entry.to())); |
| 4882 } | 4924 } |
| 4883 | 4925 |
| 4884 | 4926 |
| 4885 void CharacterRange::Split(ZoneList<CharacterRange>* base, | 4927 void CharacterRange::Split(ZoneList<CharacterRange>* base, |
| 4886 Vector<const uc16> overlay, | 4928 Vector<const int> overlay, |
| 4887 ZoneList<CharacterRange>** included, | 4929 ZoneList<CharacterRange>** included, |
| 4888 ZoneList<CharacterRange>** excluded) { | 4930 ZoneList<CharacterRange>** excluded) { |
| 4889 ASSERT_EQ(NULL, *included); | 4931 ASSERT_EQ(NULL, *included); |
| 4890 ASSERT_EQ(NULL, *excluded); | 4932 ASSERT_EQ(NULL, *excluded); |
| 4891 DispatchTable table; | 4933 DispatchTable table; |
| 4892 for (int i = 0; i < base->length(); i++) | 4934 for (int i = 0; i < base->length(); i++) |
| 4893 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase); | 4935 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase); |
| 4894 for (int i = 0; i < overlay.length(); i += 2) { | 4936 for (int i = 0; i < overlay.length(); i += 2) { |
| 4895 table.AddRange(CharacterRange(overlay[i], overlay[i+1]), | 4937 table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1), |
| 4896 CharacterRangeSplitter::kInOverlay); | 4938 CharacterRangeSplitter::kInOverlay); |
| 4897 } | 4939 } |
| 4898 CharacterRangeSplitter callback(included, excluded); | 4940 CharacterRangeSplitter callback(included, excluded); |
| 4899 table.ForEach(&callback); | 4941 table.ForEach(&callback); |
| 4900 } | 4942 } |
| 4901 | 4943 |
| 4902 | 4944 |
| 4903 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, | 4945 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, |
| 4904 bool is_ascii) { | 4946 bool is_ascii) { |
| 4905 Isolate* isolate = Isolate::Current(); | 4947 Isolate* isolate = Isolate::Current(); |
| (...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4971 if (n <= 1) return true; | 5013 if (n <= 1) return true; |
| 4972 int max = ranges->at(0).to(); | 5014 int max = ranges->at(0).to(); |
| 4973 for (int i = 1; i < n; i++) { | 5015 for (int i = 1; i < n; i++) { |
| 4974 CharacterRange next_range = ranges->at(i); | 5016 CharacterRange next_range = ranges->at(i); |
| 4975 if (next_range.from() <= max + 1) return false; | 5017 if (next_range.from() <= max + 1) return false; |
| 4976 max = next_range.to(); | 5018 max = next_range.to(); |
| 4977 } | 5019 } |
| 4978 return true; | 5020 return true; |
| 4979 } | 5021 } |
| 4980 | 5022 |
| 4981 SetRelation CharacterRange::WordCharacterRelation( | |
| 4982 ZoneList<CharacterRange>* range) { | |
| 4983 ASSERT(IsCanonical(range)); | |
| 4984 int i = 0; // Word character range index. | |
| 4985 int j = 0; // Argument range index. | |
| 4986 ASSERT_NE(0, kWordRangeCount); | |
| 4987 SetRelation result; | |
| 4988 if (range->length() == 0) { | |
| 4989 result.SetElementsInSecondSet(); | |
| 4990 return result; | |
| 4991 } | |
| 4992 CharacterRange argument_range = range->at(0); | |
| 4993 CharacterRange word_range = CharacterRange(kWordRanges[0], kWordRanges[1]); | |
| 4994 while (i < kWordRangeCount && j < range->length()) { | |
| 4995 // Check the two ranges for the five cases: | |
| 4996 // - no overlap. | |
| 4997 // - partial overlap (there are elements in both ranges that isn't | |
| 4998 // in the other, and there are also elements that are in both). | |
| 4999 // - argument range entirely inside word range. | |
| 5000 // - word range entirely inside argument range. | |
| 5001 // - ranges are completely equal. | |
| 5002 | |
| 5003 // First check for no overlap. The earlier range is not in the other set. | |
| 5004 if (argument_range.from() > word_range.to()) { | |
| 5005 // Ranges are disjoint. The earlier word range contains elements that | |
| 5006 // cannot be in the argument set. | |
| 5007 result.SetElementsInSecondSet(); | |
| 5008 } else if (word_range.from() > argument_range.to()) { | |
| 5009 // Ranges are disjoint. The earlier argument range contains elements that | |
| 5010 // cannot be in the word set. | |
| 5011 result.SetElementsInFirstSet(); | |
| 5012 } else if (word_range.from() <= argument_range.from() && | |
| 5013 word_range.to() >= argument_range.from()) { | |
| 5014 result.SetElementsInBothSets(); | |
| 5015 // argument range completely inside word range. | |
| 5016 if (word_range.from() < argument_range.from() || | |
| 5017 word_range.to() > argument_range.from()) { | |
| 5018 result.SetElementsInSecondSet(); | |
| 5019 } | |
| 5020 } else if (word_range.from() >= argument_range.from() && | |
| 5021 word_range.to() <= argument_range.from()) { | |
| 5022 result.SetElementsInBothSets(); | |
| 5023 result.SetElementsInFirstSet(); | |
| 5024 } else { | |
| 5025 // There is overlap, and neither is a subrange of the other | |
| 5026 result.SetElementsInFirstSet(); | |
| 5027 result.SetElementsInSecondSet(); | |
| 5028 result.SetElementsInBothSets(); | |
| 5029 } | |
| 5030 if (result.NonTrivialIntersection()) { | |
| 5031 // The result is as (im)precise as we can possibly make it. | |
| 5032 return result; | |
| 5033 } | |
| 5034 // Progress the range(s) with minimal to-character. | |
| 5035 uc16 word_to = word_range.to(); | |
| 5036 uc16 argument_to = argument_range.to(); | |
| 5037 if (argument_to <= word_to) { | |
| 5038 j++; | |
| 5039 if (j < range->length()) { | |
| 5040 argument_range = range->at(j); | |
| 5041 } | |
| 5042 } | |
| 5043 if (word_to <= argument_to) { | |
| 5044 i += 2; | |
| 5045 if (i < kWordRangeCount) { | |
| 5046 word_range = CharacterRange(kWordRanges[i], kWordRanges[i + 1]); | |
| 5047 } | |
| 5048 } | |
| 5049 } | |
| 5050 // Check if anything wasn't compared in the loop. | |
| 5051 if (i < kWordRangeCount) { | |
| 5052 // word range contains something not in argument range. | |
| 5053 result.SetElementsInSecondSet(); | |
| 5054 } else if (j < range->length()) { | |
| 5055 // Argument range contains something not in word range. | |
| 5056 result.SetElementsInFirstSet(); | |
| 5057 } | |
| 5058 | |
| 5059 return result; | |
| 5060 } | |
| 5061 | |
| 5062 | 5023 |
| 5063 ZoneList<CharacterRange>* CharacterSet::ranges() { | 5024 ZoneList<CharacterRange>* CharacterSet::ranges() { |
| 5064 if (ranges_ == NULL) { | 5025 if (ranges_ == NULL) { |
| 5065 ranges_ = new ZoneList<CharacterRange>(2); | 5026 ranges_ = new ZoneList<CharacterRange>(2); |
| 5066 CharacterRange::AddClassEscape(standard_set_type_, ranges_); | 5027 CharacterRange::AddClassEscape(standard_set_type_, ranges_); |
| 5067 } | 5028 } |
| 5068 return ranges_; | 5029 return ranges_; |
| 5069 } | 5030 } |
| 5070 | 5031 |
| 5071 | 5032 |
| (...skipping 112 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 5184 num_canonical, | 5145 num_canonical, |
| 5185 character_ranges->at(read)); | 5146 character_ranges->at(read)); |
| 5186 read++; | 5147 read++; |
| 5187 } while (read < n); | 5148 } while (read < n); |
| 5188 character_ranges->Rewind(num_canonical); | 5149 character_ranges->Rewind(num_canonical); |
| 5189 | 5150 |
| 5190 ASSERT(CharacterRange::IsCanonical(character_ranges)); | 5151 ASSERT(CharacterRange::IsCanonical(character_ranges)); |
| 5191 } | 5152 } |
| 5192 | 5153 |
| 5193 | 5154 |
| 5194 // Utility function for CharacterRange::Merge. Adds a range at the end of | |
| 5195 // a canonicalized range list, if necessary merging the range with the last | |
| 5196 // range of the list. | |
| 5197 static void AddRangeToSet(ZoneList<CharacterRange>* set, CharacterRange range) { | |
| 5198 if (set == NULL) return; | |
| 5199 ASSERT(set->length() == 0 || set->at(set->length() - 1).to() < range.from()); | |
| 5200 int n = set->length(); | |
| 5201 if (n > 0) { | |
| 5202 CharacterRange lastRange = set->at(n - 1); | |
| 5203 if (lastRange.to() == range.from() - 1) { | |
| 5204 set->at(n - 1) = CharacterRange(lastRange.from(), range.to()); | |
| 5205 return; | |
| 5206 } | |
| 5207 } | |
| 5208 set->Add(range); | |
| 5209 } | |
| 5210 | |
| 5211 | |
| 5212 static void AddRangeToSelectedSet(int selector, | |
| 5213 ZoneList<CharacterRange>* first_set, | |
| 5214 ZoneList<CharacterRange>* second_set, | |
| 5215 ZoneList<CharacterRange>* intersection_set, | |
| 5216 CharacterRange range) { | |
| 5217 switch (selector) { | |
| 5218 case kInsideFirst: | |
| 5219 AddRangeToSet(first_set, range); | |
| 5220 break; | |
| 5221 case kInsideSecond: | |
| 5222 AddRangeToSet(second_set, range); | |
| 5223 break; | |
| 5224 case kInsideBoth: | |
| 5225 AddRangeToSet(intersection_set, range); | |
| 5226 break; | |
| 5227 } | |
| 5228 } | |
| 5229 | |
| 5230 | |
| 5231 | |
| 5232 void CharacterRange::Merge(ZoneList<CharacterRange>* first_set, | |
| 5233 ZoneList<CharacterRange>* second_set, | |
| 5234 ZoneList<CharacterRange>* first_set_only_out, | |
| 5235 ZoneList<CharacterRange>* second_set_only_out, | |
| 5236 ZoneList<CharacterRange>* both_sets_out) { | |
| 5237 // Inputs are canonicalized. | |
| 5238 ASSERT(CharacterRange::IsCanonical(first_set)); | |
| 5239 ASSERT(CharacterRange::IsCanonical(second_set)); | |
| 5240 // Outputs are empty, if applicable. | |
| 5241 ASSERT(first_set_only_out == NULL || first_set_only_out->length() == 0); | |
| 5242 ASSERT(second_set_only_out == NULL || second_set_only_out->length() == 0); | |
| 5243 ASSERT(both_sets_out == NULL || both_sets_out->length() == 0); | |
| 5244 | |
| 5245 // Merge sets by iterating through the lists in order of lowest "from" value, | |
| 5246 // and putting intervals into one of three sets. | |
| 5247 | |
| 5248 if (first_set->length() == 0) { | |
| 5249 second_set_only_out->AddAll(*second_set); | |
| 5250 return; | |
| 5251 } | |
| 5252 if (second_set->length() == 0) { | |
| 5253 first_set_only_out->AddAll(*first_set); | |
| 5254 return; | |
| 5255 } | |
| 5256 // Indices into input lists. | |
| 5257 int i1 = 0; | |
| 5258 int i2 = 0; | |
| 5259 // Cache length of input lists. | |
| 5260 int n1 = first_set->length(); | |
| 5261 int n2 = second_set->length(); | |
| 5262 // Current range. May be invalid if state is kInsideNone. | |
| 5263 int from = 0; | |
| 5264 int to = -1; | |
| 5265 // Where current range comes from. | |
| 5266 int state = kInsideNone; | |
| 5267 | |
| 5268 while (i1 < n1 || i2 < n2) { | |
| 5269 CharacterRange next_range; | |
| 5270 int range_source; | |
| 5271 if (i2 == n2 || | |
| 5272 (i1 < n1 && first_set->at(i1).from() < second_set->at(i2).from())) { | |
| 5273 // Next smallest element is in first set. | |
| 5274 next_range = first_set->at(i1++); | |
| 5275 range_source = kInsideFirst; | |
| 5276 } else { | |
| 5277 // Next smallest element is in second set. | |
| 5278 next_range = second_set->at(i2++); | |
| 5279 range_source = kInsideSecond; | |
| 5280 } | |
| 5281 if (to < next_range.from()) { | |
| 5282 // Ranges disjoint: |current| |next| | |
| 5283 AddRangeToSelectedSet(state, | |
| 5284 first_set_only_out, | |
| 5285 second_set_only_out, | |
| 5286 both_sets_out, | |
| 5287 CharacterRange(from, to)); | |
| 5288 from = next_range.from(); | |
| 5289 to = next_range.to(); | |
| 5290 state = range_source; | |
| 5291 } else { | |
| 5292 if (from < next_range.from()) { | |
| 5293 AddRangeToSelectedSet(state, | |
| 5294 first_set_only_out, | |
| 5295 second_set_only_out, | |
| 5296 both_sets_out, | |
| 5297 CharacterRange(from, next_range.from()-1)); | |
| 5298 } | |
| 5299 if (to < next_range.to()) { | |
| 5300 // Ranges overlap: |current| | |
| 5301 // |next| | |
| 5302 AddRangeToSelectedSet(state | range_source, | |
| 5303 first_set_only_out, | |
| 5304 second_set_only_out, | |
| 5305 both_sets_out, | |
| 5306 CharacterRange(next_range.from(), to)); | |
| 5307 from = to + 1; | |
| 5308 to = next_range.to(); | |
| 5309 state = range_source; | |
| 5310 } else { | |
| 5311 // Range included: |current| , possibly ending at same character. | |
| 5312 // |next| | |
| 5313 AddRangeToSelectedSet( | |
| 5314 state | range_source, | |
| 5315 first_set_only_out, | |
| 5316 second_set_only_out, | |
| 5317 both_sets_out, | |
| 5318 CharacterRange(next_range.from(), next_range.to())); | |
| 5319 from = next_range.to() + 1; | |
| 5320 // If ranges end at same character, both ranges are consumed completely. | |
| 5321 if (next_range.to() == to) state = kInsideNone; | |
| 5322 } | |
| 5323 } | |
| 5324 } | |
| 5325 AddRangeToSelectedSet(state, | |
| 5326 first_set_only_out, | |
| 5327 second_set_only_out, | |
| 5328 both_sets_out, | |
| 5329 CharacterRange(from, to)); | |
| 5330 } | |
| 5331 | |
| 5332 | |
| 5333 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges, | 5155 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges, |
| 5334 ZoneList<CharacterRange>* negated_ranges) { | 5156 ZoneList<CharacterRange>* negated_ranges) { |
| 5335 ASSERT(CharacterRange::IsCanonical(ranges)); | 5157 ASSERT(CharacterRange::IsCanonical(ranges)); |
| 5336 ASSERT_EQ(0, negated_ranges->length()); | 5158 ASSERT_EQ(0, negated_ranges->length()); |
| 5337 int range_count = ranges->length(); | 5159 int range_count = ranges->length(); |
| 5338 uc16 from = 0; | 5160 uc16 from = 0; |
| 5339 int i = 0; | 5161 int i = 0; |
| 5340 if (range_count > 0 && ranges->at(0).from() == 0) { | 5162 if (range_count > 0 && ranges->at(0).from() == 0) { |
| 5341 from = ranges->at(0).to(); | 5163 from = ranges->at(0).to(); |
| 5342 i = 1; | 5164 i = 1; |
| (...skipping 292 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 5635 } | 5457 } |
| 5636 | 5458 |
| 5637 | 5459 |
| 5638 void Analysis::VisitBackReference(BackReferenceNode* that) { | 5460 void Analysis::VisitBackReference(BackReferenceNode* that) { |
| 5639 EnsureAnalyzed(that->on_success()); | 5461 EnsureAnalyzed(that->on_success()); |
| 5640 } | 5462 } |
| 5641 | 5463 |
| 5642 | 5464 |
| 5643 void Analysis::VisitAssertion(AssertionNode* that) { | 5465 void Analysis::VisitAssertion(AssertionNode* that) { |
| 5644 EnsureAnalyzed(that->on_success()); | 5466 EnsureAnalyzed(that->on_success()); |
| 5645 AssertionNode::AssertionNodeType type = that->type(); | |
| 5646 if (type == AssertionNode::AT_BOUNDARY || | |
| 5647 type == AssertionNode::AT_NON_BOUNDARY) { | |
| 5648 // Check if the following character is known to be a word character | |
| 5649 // or known to not be a word character. | |
| 5650 ZoneList<CharacterRange>* following_chars = that->FirstCharacterSet(); | |
| 5651 | |
| 5652 CharacterRange::Canonicalize(following_chars); | |
| 5653 | |
| 5654 SetRelation word_relation = | |
| 5655 CharacterRange::WordCharacterRelation(following_chars); | |
| 5656 if (word_relation.Disjoint()) { | |
| 5657 // Includes the case where following_chars is empty (e.g., end-of-input). | |
| 5658 // Following character is definitely *not* a word character. | |
| 5659 type = (type == AssertionNode::AT_BOUNDARY) ? | |
| 5660 AssertionNode::AFTER_WORD_CHARACTER : | |
| 5661 AssertionNode::AFTER_NONWORD_CHARACTER; | |
| 5662 that->set_type(type); | |
| 5663 } else if (word_relation.ContainedIn()) { | |
| 5664 // Following character is definitely a word character. | |
| 5665 type = (type == AssertionNode::AT_BOUNDARY) ? | |
| 5666 AssertionNode::AFTER_NONWORD_CHARACTER : | |
| 5667 AssertionNode::AFTER_WORD_CHARACTER; | |
| 5668 that->set_type(type); | |
| 5669 } | |
| 5670 } | |
| 5671 } | 5467 } |
| 5672 | 5468 |
| 5673 | 5469 |
| 5674 ZoneList<CharacterRange>* RegExpNode::FirstCharacterSet() { | 5470 void BackReferenceNode::FillInBMInfo( |
| 5675 if (first_character_set_ == NULL) { | 5471 int offset, BoyerMooreLookahead* bm, bool not_at_start) { |
| 5676 if (ComputeFirstCharacterSet(kFirstCharBudget) < 0) { | 5472 // Working out the set of characters that a backreference can match is too |
| 5677 // If we can't find an exact solution within the budget, we | 5473 // hard, so we just say that any character can match. |
| 5678 // set the value to the set of every character, i.e., all characters | 5474 bm->SetRest(offset); |
| 5679 // are possible. | 5475 SaveBMInfo(bm, not_at_start, offset); |
| 5680 ZoneList<CharacterRange>* all_set = new ZoneList<CharacterRange>(1); | |
| 5681 all_set->Add(CharacterRange::Everything()); | |
| 5682 first_character_set_ = all_set; | |
| 5683 } | |
| 5684 } | |
| 5685 return first_character_set_; | |
| 5686 } | 5476 } |
| 5687 | 5477 |
| 5688 | 5478 |
| 5689 int RegExpNode::ComputeFirstCharacterSet(int budget) { | 5479 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize == |
| 5690 // Default behavior is to not be able to determine the first character. | 5480 RegExpMacroAssembler::kTableSize); |
| 5691 return kComputeFirstCharacterSetFail; | |
| 5692 } | |
| 5693 | |
| 5694 | |
| 5695 int LoopChoiceNode::ComputeFirstCharacterSet(int budget) { | |
| 5696 budget--; | |
| 5697 if (budget >= 0) { | |
| 5698 // Find loop min-iteration. It's the value of the guarded choice node | |
| 5699 // with a GEQ guard, if any. | |
| 5700 int min_repetition = 0; | |
| 5701 | |
| 5702 for (int i = 0; i <= 1; i++) { | |
| 5703 GuardedAlternative alternative = alternatives()->at(i); | |
| 5704 ZoneList<Guard*>* guards = alternative.guards(); | |
| 5705 if (guards != NULL && guards->length() > 0) { | |
| 5706 Guard* guard = guards->at(0); | |
| 5707 if (guard->op() == Guard::GEQ) { | |
| 5708 min_repetition = guard->value(); | |
| 5709 break; | |
| 5710 } | |
| 5711 } | |
| 5712 } | |
| 5713 | |
| 5714 budget = loop_node()->ComputeFirstCharacterSet(budget); | |
| 5715 if (budget >= 0) { | |
| 5716 ZoneList<CharacterRange>* character_set = | |
| 5717 loop_node()->first_character_set(); | |
| 5718 if (body_can_be_zero_length() || min_repetition == 0) { | |
| 5719 budget = continue_node()->ComputeFirstCharacterSet(budget); | |
| 5720 if (budget < 0) return budget; | |
| 5721 ZoneList<CharacterRange>* body_set = | |
| 5722 continue_node()->first_character_set(); | |
| 5723 ZoneList<CharacterRange>* union_set = | |
| 5724 new ZoneList<CharacterRange>(Max(character_set->length(), | |
| 5725 body_set->length())); | |
| 5726 CharacterRange::Merge(character_set, | |
| 5727 body_set, | |
| 5728 union_set, | |
| 5729 union_set, | |
| 5730 union_set); | |
| 5731 character_set = union_set; | |
| 5732 } | |
| 5733 set_first_character_set(character_set); | |
| 5734 } | |
| 5735 } | |
| 5736 return budget; | |
| 5737 } | |
| 5738 | |
| 5739 | |
| 5740 int NegativeLookaheadChoiceNode::ComputeFirstCharacterSet(int budget) { | |
| 5741 budget--; | |
| 5742 if (budget >= 0) { | |
| 5743 GuardedAlternative successor = this->alternatives()->at(1); | |
| 5744 RegExpNode* successor_node = successor.node(); | |
| 5745 budget = successor_node->ComputeFirstCharacterSet(budget); | |
| 5746 if (budget >= 0) { | |
| 5747 set_first_character_set(successor_node->first_character_set()); | |
| 5748 } | |
| 5749 } | |
| 5750 return budget; | |
| 5751 } | |
| 5752 | |
| 5753 | |
| 5754 // The first character set of an EndNode is unknowable. Just use the | |
| 5755 // default implementation that fails and returns all characters as possible. | |
| 5756 | |
| 5757 | |
| 5758 int AssertionNode::ComputeFirstCharacterSet(int budget) { | |
| 5759 budget -= 1; | |
| 5760 if (budget >= 0) { | |
| 5761 switch (type_) { | |
| 5762 case AT_END: { | |
| 5763 set_first_character_set(new ZoneList<CharacterRange>(0)); | |
| 5764 break; | |
| 5765 } | |
| 5766 case AT_START: | |
| 5767 case AT_BOUNDARY: | |
| 5768 case AT_NON_BOUNDARY: | |
| 5769 case AFTER_NEWLINE: | |
| 5770 case AFTER_NONWORD_CHARACTER: | |
| 5771 case AFTER_WORD_CHARACTER: { | |
| 5772 ASSERT_NOT_NULL(on_success()); | |
| 5773 budget = on_success()->ComputeFirstCharacterSet(budget); | |
| 5774 if (budget >= 0) { | |
| 5775 set_first_character_set(on_success()->first_character_set()); | |
| 5776 } | |
| 5777 break; | |
| 5778 } | |
| 5779 } | |
| 5780 } | |
| 5781 return budget; | |
| 5782 } | |
| 5783 | |
| 5784 | |
| 5785 int ActionNode::ComputeFirstCharacterSet(int budget) { | |
| 5786 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return kComputeFirstCharacterSetFail; | |
| 5787 budget--; | |
| 5788 if (budget >= 0) { | |
| 5789 ASSERT_NOT_NULL(on_success()); | |
| 5790 budget = on_success()->ComputeFirstCharacterSet(budget); | |
| 5791 if (budget >= 0) { | |
| 5792 set_first_character_set(on_success()->first_character_set()); | |
| 5793 } | |
| 5794 } | |
| 5795 return budget; | |
| 5796 } | |
| 5797 | |
| 5798 | |
| 5799 int BackReferenceNode::ComputeFirstCharacterSet(int budget) { | |
| 5800 // We don't know anything about the first character of a backreference | |
| 5801 // at this point. | |
| 5802 // The potential first characters are the first characters of the capture, | |
| 5803 // and the first characters of the on_success node, depending on whether the | |
| 5804 // capture can be empty and whether it is known to be participating or known | |
| 5805 // not to be. | |
| 5806 return kComputeFirstCharacterSetFail; | |
| 5807 } | |
| 5808 | 5481 |
| 5809 | 5482 |
| 5810 void ChoiceNode::FillInBMInfo( | 5483 void ChoiceNode::FillInBMInfo( |
| 5811 int offset, BoyerMooreLookahead* bm, bool not_at_start) { | 5484 int offset, BoyerMooreLookahead* bm, bool not_at_start) { |
| 5812 ZoneList<GuardedAlternative>* alts = alternatives(); | 5485 ZoneList<GuardedAlternative>* alts = alternatives(); |
| 5813 for (int i = 0; i < alts->length(); i++) { | 5486 for (int i = 0; i < alts->length(); i++) { |
| 5814 GuardedAlternative& alt = alts->at(i); | 5487 GuardedAlternative& alt = alts->at(i); |
| 5815 if (alt.guards() != NULL && alt.guards()->length() != 0) { | 5488 if (alt.guards() != NULL && alt.guards()->length() != 0) { |
| 5816 bm->SetRest(offset); // Give up trying to fill in info. | 5489 bm->SetRest(offset); // Give up trying to fill in info. |
| 5490 SaveBMInfo(bm, not_at_start, offset); |
| 5817 return; | 5491 return; |
| 5818 } | 5492 } |
| 5819 alt.node()->FillInBMInfo(offset, bm, not_at_start); | 5493 alt.node()->FillInBMInfo(offset, bm, not_at_start); |
| 5820 } | 5494 } |
| 5495 SaveBMInfo(bm, not_at_start, offset); |
| 5821 } | 5496 } |
| 5822 | 5497 |
| 5823 | 5498 |
| 5824 void TextNode::FillInBMInfo( | 5499 void TextNode::FillInBMInfo( |
| 5825 int offset, BoyerMooreLookahead* bm, bool not_at_start) { | 5500 int initial_offset, BoyerMooreLookahead* bm, bool not_at_start) { |
| 5826 if (offset >= bm->length()) return; | 5501 if (initial_offset >= bm->length()) return; |
| 5502 int offset = initial_offset; |
| 5827 int max_char = bm->max_char(); | 5503 int max_char = bm->max_char(); |
| 5828 for (int i = 0; i < elements()->length(); i++) { | 5504 for (int i = 0; i < elements()->length(); i++) { |
| 5829 if (offset >= bm->length()) return; | 5505 if (offset >= bm->length()) { |
| 5506 if (initial_offset == 0) set_bm_info(not_at_start, bm); |
| 5507 return; |
| 5508 } |
| 5830 TextElement text = elements()->at(i); | 5509 TextElement text = elements()->at(i); |
| 5831 if (text.type == TextElement::ATOM) { | 5510 if (text.type == TextElement::ATOM) { |
| 5832 RegExpAtom* atom = text.data.u_atom; | 5511 RegExpAtom* atom = text.data.u_atom; |
| 5833 for (int j = 0; j < atom->length(); j++, offset++) { | 5512 for (int j = 0; j < atom->length(); j++, offset++) { |
| 5834 if (offset >= bm->length()) return; | 5513 if (offset >= bm->length()) { |
| 5514 if (initial_offset == 0) set_bm_info(not_at_start, bm); |
| 5515 return; |
| 5516 } |
| 5835 uc16 character = atom->data()[j]; | 5517 uc16 character = atom->data()[j]; |
| 5836 if (bm->compiler()->ignore_case()) { | 5518 if (bm->compiler()->ignore_case()) { |
| 5837 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 5519 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| 5838 int length = GetCaseIndependentLetters( | 5520 int length = GetCaseIndependentLetters( |
| 5839 ISOLATE, | 5521 ISOLATE, |
| 5840 character, | 5522 character, |
| 5841 bm->max_char() == String::kMaxAsciiCharCode, | 5523 bm->max_char() == String::kMaxAsciiCharCode, |
| 5842 chars); | 5524 chars); |
| 5843 for (int j = 0; j < length; j++) { | 5525 for (int j = 0; j < length; j++) { |
| 5844 bm->Set(offset, chars[j]); | 5526 bm->Set(offset, chars[j]); |
| 5845 } | 5527 } |
| 5846 } else { | 5528 } else { |
| 5847 if (character <= max_char) bm->Set(offset, character); | 5529 if (character <= max_char) bm->Set(offset, character); |
| 5848 } | 5530 } |
| 5849 } | 5531 } |
| 5850 } else { | 5532 } else { |
| 5851 ASSERT(text.type == TextElement::CHAR_CLASS); | 5533 ASSERT(text.type == TextElement::CHAR_CLASS); |
| 5852 RegExpCharacterClass* char_class = text.data.u_char_class; | 5534 RegExpCharacterClass* char_class = text.data.u_char_class; |
| 5853 ZoneList<CharacterRange>* ranges = char_class->ranges(); | 5535 ZoneList<CharacterRange>* ranges = char_class->ranges(); |
| 5854 if (char_class->is_negated()) { | 5536 if (char_class->is_negated()) { |
| 5855 bm->SetAll(offset); | 5537 bm->SetAll(offset); |
| 5856 } else { | 5538 } else { |
| 5857 for (int k = 0; k < ranges->length(); k++) { | 5539 for (int k = 0; k < ranges->length(); k++) { |
| 5858 CharacterRange& range = ranges->at(k); | 5540 CharacterRange& range = ranges->at(k); |
| 5859 if (range.from() > max_char) continue; | 5541 if (range.from() > max_char) continue; |
| 5860 int to = Min(max_char, static_cast<int>(range.to())); | 5542 int to = Min(max_char, static_cast<int>(range.to())); |
| 5861 if (to - range.from() >= BoyerMooreLookahead::kTooManyCharacters) { | 5543 bm->SetInterval(offset, Interval(range.from(), to)); |
| 5862 bm->SetAll(offset); | |
| 5863 break; | |
| 5864 } | |
| 5865 for (int m = range.from(); m <= to; m++) { | |
| 5866 bm->Set(offset, m); | |
| 5867 } | |
| 5868 } | 5544 } |
| 5869 } | 5545 } |
| 5870 offset++; | 5546 offset++; |
| 5871 } | 5547 } |
| 5872 } | 5548 } |
| 5873 if (offset >= bm->length()) return; | 5549 if (offset >= bm->length()) { |
| 5550 if (initial_offset == 0) set_bm_info(not_at_start, bm); |
| 5551 return; |
| 5552 } |
| 5874 on_success()->FillInBMInfo(offset, | 5553 on_success()->FillInBMInfo(offset, |
| 5875 bm, | 5554 bm, |
| 5876 true); // Not at start after a text node. | 5555 true); // Not at start after a text node. |
| 5556 if (initial_offset == 0) set_bm_info(not_at_start, bm); |
| 5877 } | 5557 } |
| 5878 | 5558 |
| 5879 | 5559 |
| 5880 int TextNode::ComputeFirstCharacterSet(int budget) { | |
| 5881 budget--; | |
| 5882 if (budget >= 0) { | |
| 5883 ASSERT_NE(0, elements()->length()); | |
| 5884 TextElement text = elements()->at(0); | |
| 5885 if (text.type == TextElement::ATOM) { | |
| 5886 RegExpAtom* atom = text.data.u_atom; | |
| 5887 ASSERT_NE(0, atom->length()); | |
| 5888 uc16 first_char = atom->data()[0]; | |
| 5889 ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1); | |
| 5890 range->Add(CharacterRange(first_char, first_char)); | |
| 5891 set_first_character_set(range); | |
| 5892 } else { | |
| 5893 ASSERT(text.type == TextElement::CHAR_CLASS); | |
| 5894 RegExpCharacterClass* char_class = text.data.u_char_class; | |
| 5895 ZoneList<CharacterRange>* ranges = char_class->ranges(); | |
| 5896 // TODO(lrn): Canonicalize ranges when they are created | |
| 5897 // instead of waiting until now. | |
| 5898 CharacterRange::Canonicalize(ranges); | |
| 5899 if (char_class->is_negated()) { | |
| 5900 int length = ranges->length(); | |
| 5901 int new_length = length + 1; | |
| 5902 if (length > 0) { | |
| 5903 if (ranges->at(0).from() == 0) new_length--; | |
| 5904 if (ranges->at(length - 1).to() == String::kMaxUtf16CodeUnit) { | |
| 5905 new_length--; | |
| 5906 } | |
| 5907 } | |
| 5908 ZoneList<CharacterRange>* negated_ranges = | |
| 5909 new ZoneList<CharacterRange>(new_length); | |
| 5910 CharacterRange::Negate(ranges, negated_ranges); | |
| 5911 set_first_character_set(negated_ranges); | |
| 5912 } else { | |
| 5913 set_first_character_set(ranges); | |
| 5914 } | |
| 5915 } | |
| 5916 } | |
| 5917 return budget; | |
| 5918 } | |
| 5919 | |
| 5920 | |
| 5921 | |
| 5922 // ------------------------------------------------------------------- | 5560 // ------------------------------------------------------------------- |
| 5923 // Dispatch table construction | 5561 // Dispatch table construction |
| 5924 | 5562 |
| 5925 | 5563 |
| 5926 void DispatchTableConstructor::VisitEnd(EndNode* that) { | 5564 void DispatchTableConstructor::VisitEnd(EndNode* that) { |
| 5927 AddRange(CharacterRange::Everything()); | 5565 AddRange(CharacterRange::Everything()); |
| 5928 } | 5566 } |
| 5929 | 5567 |
| 5930 | 5568 |
| 5931 void DispatchTableConstructor::BuildTable(ChoiceNode* node) { | 5569 void DispatchTableConstructor::BuildTable(ChoiceNode* node) { |
| (...skipping 201 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 6133 } | 5771 } |
| 6134 | 5772 |
| 6135 return compiler.Assemble(¯o_assembler, | 5773 return compiler.Assemble(¯o_assembler, |
| 6136 node, | 5774 node, |
| 6137 data->capture_count, | 5775 data->capture_count, |
| 6138 pattern); | 5776 pattern); |
| 6139 } | 5777 } |
| 6140 | 5778 |
| 6141 | 5779 |
| 6142 }} // namespace v8::internal | 5780 }} // namespace v8::internal |
| OLD | NEW |