Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(103)

Side by Side Diff: src/jsregexp.cc

Issue 9961009: Regexp: Unify the lookahead code for the BoyerMoore-like skipping (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 8 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright 2011 the V8 project authors. All rights reserved. 1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 90 matching lines...) Expand 10 before | Expand all | Expand 10 after
101 Factory* factory = isolate->factory(); 101 Factory* factory = isolate->factory();
102 Handle<FixedArray> elements = factory->NewFixedArray(2); 102 Handle<FixedArray> elements = factory->NewFixedArray(2);
103 elements->set(0, *pattern); 103 elements->set(0, *pattern);
104 elements->set(1, *error_text); 104 elements->set(1, *error_text);
105 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); 105 Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
106 Handle<Object> regexp_err = factory->NewSyntaxError(message, array); 106 Handle<Object> regexp_err = factory->NewSyntaxError(message, array);
107 isolate->Throw(*regexp_err); 107 isolate->Throw(*regexp_err);
108 } 108 }
109 109
110 110
111 static ContainedInLattice Combine(ContainedInLattice a, bool inside) {
ulan 2012/04/16 13:52:22 Passing kLatticeIn/kLatticeOut instead of bool wou
Erik Corry 2012/04/17 07:51:45 Done.
112 if (a == kLatticeUnknown) return a;
113 if (a == kNotYet) return inside ? kLatticeIn : kLatticeOut;
114 if (a == kLatticeIn) return inside ? kLatticeIn : kLatticeUnknown;
115 if (a == kLatticeOut) return inside ? kLatticeUnknown : kLatticeOut;
116 UNREACHABLE();
117 return kLatticeUnknown;
118 }
119
120
121 ContainedInLattice AddRange(ContainedInLattice containment,
122 const int* ranges,
123 int ranges_length,
124 Interval new_range) {
125 if (containment == kLatticeUnknown) return containment;
126 bool inside = false;
127 int last = 0;
128 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
129 // Consider the range from last to ranges[i].
130 // We haven't got to the new range yet.
131 if (ranges[i] <= new_range.from()) continue;
132 // New range is wholly inside last-ranges[i].
133 if (last <= new_range.from() && ranges[i] > new_range.to()) {
ulan 2012/04/16 13:52:22 So ranges[i] is not inclusive, but new_range.to()
Erik Corry 2012/04/17 07:51:45 Done.
134 return Combine(containment, inside);
135 }
136 // Otherwise it
ulan 2012/04/16 13:52:22 Trailing space.
Erik Corry 2012/04/17 07:51:45 Done.
137 return kLatticeUnknown;
138 }
139 return containment;
140 }
141
142
111 // More makes code generation slower, less makes V8 benchmark score lower. 143 // More makes code generation slower, less makes V8 benchmark score lower.
112 const int kMaxLookaheadForBoyerMoore = 8; 144 const int kMaxLookaheadForBoyerMoore = 8;
113 // In a 3-character pattern you can maximally step forwards 3 characters 145 // In a 3-character pattern you can maximally step forwards 3 characters
114 // at a time, which is not always enough to pay for the extra logic. 146 // at a time, which is not always enough to pay for the extra logic.
115 const int kPatternTooShortForBoyerMoore = 2; 147 const int kPatternTooShortForBoyerMoore = 2;
116 148
117 149
118 // Identifies the sort of regexps where the regexp engine is faster 150 // Identifies the sort of regexps where the regexp engine is faster
119 // than the code used for atom matches. 151 // than the code used for atom matches.
120 static bool HasFewDifferentCharacters(Handle<String> pattern) { 152 static bool HasFewDifferentCharacters(Handle<String> pattern) {
(...skipping 2029 matching lines...) Expand 10 before | Expand all | Expand 10 after
2150 2182
2151 2183
2152 void ActionNode::FillInBMInfo(int offset, 2184 void ActionNode::FillInBMInfo(int offset,
2153 BoyerMooreLookahead* bm, 2185 BoyerMooreLookahead* bm,
2154 bool not_at_start) { 2186 bool not_at_start) {
2155 if (type_ == BEGIN_SUBMATCH) { 2187 if (type_ == BEGIN_SUBMATCH) {
2156 bm->SetRest(offset); 2188 bm->SetRest(offset);
2157 } else if (type_ != POSITIVE_SUBMATCH_SUCCESS) { 2189 } else if (type_ != POSITIVE_SUBMATCH_SUCCESS) {
2158 on_success()->FillInBMInfo(offset, bm, not_at_start); 2190 on_success()->FillInBMInfo(offset, bm, not_at_start);
2159 } 2191 }
2192 if (offset == 0) set_bm_info(not_at_start, bm);
ulan 2012/04/16 13:52:22 As discussed offline, it probably better to move t
Erik Corry 2012/04/17 07:51:45 Done.
2160 } 2193 }
2161 2194
2162 2195
2163 int AssertionNode::EatsAtLeast(int still_to_find, 2196 int AssertionNode::EatsAtLeast(int still_to_find,
2164 int recursion_depth, 2197 int recursion_depth,
2165 bool not_at_start) { 2198 bool not_at_start) {
2166 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; 2199 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
2167 // If we know we are not at the start and we are asked "how many characters 2200 // If we know we are not at the start and we are asked "how many characters
2168 // will you match if you succeed?" then we can answer anything since false 2201 // will you match if you succeed?" then we can answer anything since false
2169 // implies false. So lets just return the max answer (still_to_find) since 2202 // implies false. So lets just return the max answer (still_to_find) since
2170 // that won't prevent us from preloading a lot of characters for the other 2203 // that won't prevent us from preloading a lot of characters for the other
2171 // branches in the node graph. 2204 // branches in the node graph.
2172 if (type() == AT_START && not_at_start) return still_to_find; 2205 if (type() == AT_START && not_at_start) return still_to_find;
2173 return on_success()->EatsAtLeast(still_to_find, 2206 return on_success()->EatsAtLeast(still_to_find,
2174 recursion_depth + 1, 2207 recursion_depth + 1,
2175 not_at_start); 2208 not_at_start);
2176 } 2209 }
2177 2210
2178 2211
2179 void AssertionNode::FillInBMInfo( 2212 void AssertionNode::FillInBMInfo(
2180 int offset, BoyerMooreLookahead* bm, bool not_at_start) { 2213 int offset, BoyerMooreLookahead* bm, bool not_at_start) {
2181 // Match the behaviour of EatsAtLeast on this node. 2214 // Match the behaviour of EatsAtLeast on this node.
2182 if (type() == AT_START && not_at_start) return; 2215 if (type() == AT_START && not_at_start) return;
2183 on_success()->FillInBMInfo(offset, bm, not_at_start); 2216 on_success()->FillInBMInfo(offset, bm, not_at_start);
2217 if (offset == 0) set_bm_info(not_at_start, bm);
2184 } 2218 }
2185 2219
2186 2220
2187 int BackReferenceNode::EatsAtLeast(int still_to_find, 2221 int BackReferenceNode::EatsAtLeast(int still_to_find,
2188 int recursion_depth, 2222 int recursion_depth,
2189 bool not_at_start) { 2223 bool not_at_start) {
2190 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; 2224 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
2191 return on_success()->EatsAtLeast(still_to_find, 2225 return on_success()->EatsAtLeast(still_to_find,
2192 recursion_depth + 1, 2226 recursion_depth + 1,
2193 not_at_start); 2227 not_at_start);
(...skipping 416 matching lines...) Expand 10 before | Expand all | Expand 10 after
2610 if (body_can_be_zero_length_ || info()->visited) return; 2644 if (body_can_be_zero_length_ || info()->visited) return;
2611 VisitMarker marker(info()); 2645 VisitMarker marker(info());
2612 return ChoiceNode::GetQuickCheckDetails(details, 2646 return ChoiceNode::GetQuickCheckDetails(details,
2613 compiler, 2647 compiler,
2614 characters_filled_in, 2648 characters_filled_in,
2615 not_at_start); 2649 not_at_start);
2616 } 2650 }
2617 2651
2618 2652
2619 void LoopChoiceNode::FillInBMInfo( 2653 void LoopChoiceNode::FillInBMInfo(
2620 int offset, BoyerMooreLookahead* bm, bool nas) { 2654 int offset, BoyerMooreLookahead* bm, bool not_at_start) {
2621 if (body_can_be_zero_length_) { 2655 if (body_can_be_zero_length_) {
2622 bm->SetRest(offset); 2656 bm->SetRest(offset);
2657 if (offset == 0) set_bm_info(not_at_start, bm);
2623 return; 2658 return;
2624 } 2659 }
2625 ChoiceNode::FillInBMInfo(offset, bm, nas); 2660 ChoiceNode::FillInBMInfo(offset, bm, not_at_start);
2661 if (offset == 0) set_bm_info(not_at_start, bm);
2626 } 2662 }
2627 2663
2628 2664
2629 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2665 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2630 RegExpCompiler* compiler, 2666 RegExpCompiler* compiler,
2631 int characters_filled_in, 2667 int characters_filled_in,
2632 bool not_at_start) { 2668 bool not_at_start) {
2633 not_at_start = (not_at_start || not_at_start_); 2669 not_at_start = (not_at_start || not_at_start_);
2634 int choice_count = alternatives_->length(); 2670 int choice_count = alternatives_->length();
2635 ASSERT(choice_count > 0); 2671 ASSERT(choice_count > 0);
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after
2703 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok); 2739 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
2704 } 2740 }
2705 assembler->CheckCharacter('\n', &ok); 2741 assembler->CheckCharacter('\n', &ok);
2706 assembler->CheckNotCharacter('\r', new_trace.backtrack()); 2742 assembler->CheckNotCharacter('\r', new_trace.backtrack());
2707 } 2743 }
2708 assembler->Bind(&ok); 2744 assembler->Bind(&ok);
2709 on_success->Emit(compiler, &new_trace); 2745 on_success->Emit(compiler, &new_trace);
2710 } 2746 }
2711 2747
2712 2748
2713 // Emit the code to handle \b and \B (word-boundary or non-word-boundary)
2714 // when we know whether the next character must be a word character or not.
2715 static void EmitHalfBoundaryCheck(AssertionNode::AssertionNodeType type,
2716 RegExpCompiler* compiler,
2717 RegExpNode* on_success,
2718 Trace* trace) {
2719 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2720 Label done;
2721
2722 Trace new_trace(*trace);
2723
2724 bool expect_word_character = (type == AssertionNode::AFTER_WORD_CHARACTER);
2725 Label* on_word = expect_word_character ? &done : new_trace.backtrack();
2726 Label* on_non_word = expect_word_character ? new_trace.backtrack() : &done;
2727
2728 // Check whether previous character was a word character.
2729 switch (trace->at_start()) {
2730 case Trace::TRUE:
2731 if (expect_word_character) {
2732 assembler->GoTo(on_non_word);
2733 }
2734 break;
2735 case Trace::UNKNOWN:
2736 ASSERT_EQ(0, trace->cp_offset());
2737 assembler->CheckAtStart(on_non_word);
2738 // Fall through.
2739 case Trace::FALSE:
2740 int prev_char_offset = trace->cp_offset() - 1;
2741 assembler->LoadCurrentCharacter(prev_char_offset, NULL, false, 1);
2742 EmitWordCheck(assembler, on_word, on_non_word, expect_word_character);
2743 // We may or may not have loaded the previous character.
2744 new_trace.InvalidateCurrentCharacter();
2745 }
2746
2747 assembler->Bind(&done);
2748
2749 on_success->Emit(compiler, &new_trace);
2750 }
2751
2752
2753 // Emit the code to handle \b and \B (word-boundary or non-word-boundary). 2749 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
2754 static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type, 2750 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
2755 RegExpCompiler* compiler,
2756 RegExpNode* on_success,
2757 Trace* trace) {
2758 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2751 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2759 Label before_non_word; 2752 Trace::TriBool next_is_word_character = Trace::UNKNOWN;
2760 Label before_word; 2753 bool not_at_start = (trace->at_start() == Trace::FALSE);
2761 if (trace->characters_preloaded() != 1) { 2754 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
2762 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word); 2755 if (lookahead == NULL) {
2756 int eats_at_least =
2757 Min(kMaxLookaheadForBoyerMoore,
2758 EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start));
2759 if (eats_at_least >= 1) {
2760 BoyerMooreLookahead* bm =
2761 new BoyerMooreLookahead(eats_at_least, compiler);
2762 FillInBMInfo(0, bm, not_at_start);
2763 if (bm->at(0)->is_non_word()) next_is_word_character = Trace::FALSE;
2764 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE;
ulan 2012/04/16 13:52:22 Consider writing the two lines above as: next_is_w
Erik Corry 2012/04/17 07:51:45 Not the same, since it could be neither known to b
2765 }
2766 } else {
2767 if (lookahead->at(0)->is_non_word()) next_is_word_character = Trace::FALSE;
2768 if (lookahead->at(0)->is_word()) next_is_word_character = Trace::TRUE;
2763 } 2769 }
2764 // Fall through on non-word. 2770 bool at_boundary = (type_ == AssertionNode::AT_BOUNDARY);
2765 EmitWordCheck(assembler, &before_word, &before_non_word, false); 2771 if (next_is_word_character == Trace::UNKNOWN) {
2772 Label before_non_word;
2773 Label before_word;
2774 if (trace->characters_preloaded() != 1) {
2775 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
2776 }
2777 // Fall through on non-word.
2778 EmitWordCheck(assembler, &before_word, &before_non_word, false);
2779 // Next character is not a word character.
2780 assembler->Bind(&before_non_word);
2781 Label ok;
2782 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2783 assembler->GoTo(&ok);
2766 2784
2767 // We will be loading the previous character into the current character 2785 assembler->Bind(&before_word);
2768 // register. 2786 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2787 assembler->Bind(&ok);
2788 } else if (next_is_word_character == Trace::TRUE) {
2789 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2790 } else {
2791 ASSERT(next_is_word_character == Trace::FALSE);
2792 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2793 }
2794 }
2795
2796
2797 void AssertionNode::BacktrackIfPrevious(
2798 RegExpCompiler* compiler,
2799 Trace* trace,
2800 AssertionNode::IfPrevious backtrack_if_previous) {
2801 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2769 Trace new_trace(*trace); 2802 Trace new_trace(*trace);
2770 new_trace.InvalidateCurrentCharacter(); 2803 new_trace.InvalidateCurrentCharacter();
2771 2804
2772 Label ok; 2805 Label fall_through, dummy;
2773 Label* boundary;
2774 Label* not_boundary;
2775 if (type == AssertionNode::AT_BOUNDARY) {
2776 boundary = &ok;
2777 not_boundary = new_trace.backtrack();
2778 } else {
2779 not_boundary = &ok;
2780 boundary = new_trace.backtrack();
2781 }
2782 2806
2783 // Next character is not a word character. 2807 Label* non_word = backtrack_if_previous == kIsNonWord ?
2784 assembler->Bind(&before_non_word); 2808 new_trace.backtrack() :
2809 &fall_through;
2810 Label* word = backtrack_if_previous == kIsNonWord ?
2811 &fall_through :
2812 new_trace.backtrack();
2813
2785 if (new_trace.cp_offset() == 0) { 2814 if (new_trace.cp_offset() == 0) {
2786 // The start of input counts as a non-word character, so the question is 2815 // The start of input counts as a non-word character, so the question is
2787 // decided if we are at the start. 2816 // decided if we are at the start.
2788 assembler->CheckAtStart(not_boundary); 2817 assembler->CheckAtStart(non_word);
2789 } 2818 }
2790 // We already checked that we are not at the start of input so it must be 2819 // We already checked that we are not at the start of input so it must be
2791 // OK to load the previous character. 2820 // OK to load the previous character.
2792 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, 2821 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
2793 &ok, // Unused dummy label in this call. 2822 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
2794 false);
2795 // Fall through on non-word.
2796 EmitWordCheck(assembler, boundary, not_boundary, false);
2797 assembler->GoTo(not_boundary);
2798 2823
2799 // Next character is a word character. 2824 assembler->Bind(&fall_through);
2800 assembler->Bind(&before_word); 2825 on_success()->Emit(compiler, &new_trace);
2801 if (new_trace.cp_offset() == 0) {
2802 // The start of input counts as a non-word character, so the question is
2803 // decided if we are at the start.
2804 assembler->CheckAtStart(boundary);
2805 }
2806 // We already checked that we are not at the start of input so it must be
2807 // OK to load the previous character.
2808 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2809 &ok, // Unused dummy label in this call.
2810 false);
2811 bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY);
2812 EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word);
2813
2814 assembler->Bind(&ok);
2815
2816 on_success->Emit(compiler, &new_trace);
2817 } 2826 }
2818 2827
2819 2828
2820 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details, 2829 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
2821 RegExpCompiler* compiler, 2830 RegExpCompiler* compiler,
2822 int filled_in, 2831 int filled_in,
2823 bool not_at_start) { 2832 bool not_at_start) {
2824 if (type_ == AT_START && not_at_start) { 2833 if (type_ == AT_START && not_at_start) {
2825 details->set_cannot_match(); 2834 details->set_cannot_match();
2826 return; 2835 return;
(...skipping 27 matching lines...) Expand all
2854 on_success()->Emit(compiler, &at_start_trace); 2863 on_success()->Emit(compiler, &at_start_trace);
2855 return; 2864 return;
2856 } 2865 }
2857 } 2866 }
2858 break; 2867 break;
2859 case AFTER_NEWLINE: 2868 case AFTER_NEWLINE:
2860 EmitHat(compiler, on_success(), trace); 2869 EmitHat(compiler, on_success(), trace);
2861 return; 2870 return;
2862 case AT_BOUNDARY: 2871 case AT_BOUNDARY:
2863 case AT_NON_BOUNDARY: { 2872 case AT_NON_BOUNDARY: {
2864 EmitBoundaryCheck(type_, compiler, on_success(), trace); 2873 EmitBoundaryCheck(compiler, trace);
2865 return; 2874 return;
2866 } 2875 }
2867 case AFTER_WORD_CHARACTER:
2868 case AFTER_NONWORD_CHARACTER: {
2869 EmitHalfBoundaryCheck(type_, compiler, on_success(), trace);
2870 }
2871 } 2876 }
2872 on_success()->Emit(compiler, trace); 2877 on_success()->Emit(compiler, trace);
2873 } 2878 }
2874 2879
2875 2880
2876 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { 2881 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
2877 if (quick_check == NULL) return false; 2882 if (quick_check == NULL) return false;
2878 if (offset >= quick_check->characters()) return false; 2883 if (offset >= quick_check->characters()) return false;
2879 return quick_check->positions(offset)->determines_perfectly; 2884 return quick_check->positions(offset)->determines_perfectly;
2880 } 2885 }
(...skipping 389 matching lines...) Expand 10 before | Expand all | Expand 10 after
3270 return alt_gens_[i]; 3275 return alt_gens_[i];
3271 } 3276 }
3272 3277
3273 private: 3278 private:
3274 static const int kAFew = 10; 3279 static const int kAFew = 10;
3275 ZoneList<AlternativeGeneration*> alt_gens_; 3280 ZoneList<AlternativeGeneration*> alt_gens_;
3276 AlternativeGeneration a_few_alt_gens_[kAFew]; 3281 AlternativeGeneration a_few_alt_gens_[kAFew];
3277 }; 3282 };
3278 3283
3279 3284
3285 // The '2' variant is has inclusive from and exclusive to.
3286 static const int kSpaceRanges2[] = { '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0,
ulan 2012/04/16 13:52:22 Consider adding a static assert that the range cou
Erik Corry 2012/04/17 07:51:45 AddRange jsregexp.cc:118 uses all the elements in
3287 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A,
3288 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, 0x10000 };
3289 static const int kSpaceRange2Count = ARRAY_SIZE(kSpaceRanges2);
3290
3291 static const int kWordRanges2[] = {
3292 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 };
3293 static const int kWordRange2Count = ARRAY_SIZE(kWordRanges2);
3294 static const int kDigitRanges2[] = { '0', '9' + 1, 0x10000 };
3295 static const int kDigitRange2Count = ARRAY_SIZE(kDigitRanges2);
3296 static const int kSurrogateRanges2[] = { 0xd800, 0xe000, 0x10000 };
3297 static const int kSurrogateRange2Count = ARRAY_SIZE(kSurrogateRanges2);
3298 static const int kLineTerminatorRanges2[] = { 0x000A, 0x000B, 0x000D, 0x000E,
3299 0x2028, 0x202A, 0x10000 };
3300 static const int kLineTerminatorRange2Count = ARRAY_SIZE(kLineTerminatorRanges2) ;
ulan 2012/04/16 13:52:22 Long line above and too many new lines below.
Erik Corry 2012/04/17 07:51:45 Done.
3301
3302
3303
3304 void BoyerMoorePositionInfo::Set(int character) {
3305 SetInterval(Interval(character, character));
3306 }
3307
3308
3309 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
3310 s_ = AddRange(s_, kSpaceRanges2, kSpaceRange2Count, interval);
3311 w_ = AddRange(w_, kWordRanges2, kWordRange2Count, interval);
3312 d_ = AddRange(d_, kDigitRanges2, kDigitRange2Count, interval);
3313 surrogate_ =
3314 AddRange(surrogate_, kSurrogateRanges2, kSurrogateRange2Count, interval);
3315 if (interval.to() - interval.from() >= kMapSize - 1) {
3316 if (map_count_ != kMapSize) {
3317 map_count_ = kMapSize;
3318 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3319 }
3320 return;
3321 }
3322 for (int i = interval.from(); i <= interval.to(); i++) {
3323 int mod_character = (i & kMask);
3324 if (!map_->at(mod_character)) {
3325 map_count_++;
3326 map_->at(mod_character) = true;
3327 }
3328 if (map_count_ == kMapSize) return;
3329 }
3330 }
3331
3332
3333 void BoyerMoorePositionInfo::SetAll() {
3334 s_ = w_ = d_ = kLatticeUnknown;
3335 if (map_count_ != kMapSize) {
3336 map_count_ = kMapSize;
3337 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3338 }
3339 }
3340
3341
3280 BoyerMooreLookahead::BoyerMooreLookahead( 3342 BoyerMooreLookahead::BoyerMooreLookahead(
3281 int length, int map_length, RegExpCompiler* compiler) 3343 int length, RegExpCompiler* compiler)
3282 : length_(length), 3344 : length_(length),
3283 map_length_(map_length),
3284 compiler_(compiler) { 3345 compiler_(compiler) {
3285 ASSERT(IsPowerOf2(map_length));
3286 if (compiler->ascii()) { 3346 if (compiler->ascii()) {
3287 max_char_ = String::kMaxAsciiCharCode; 3347 max_char_ = String::kMaxAsciiCharCode;
3288 } else { 3348 } else {
3289 max_char_ = String::kMaxUtf16CodeUnit; 3349 max_char_ = String::kMaxUtf16CodeUnit;
3290 } 3350 }
3291 bitmaps_ = new ZoneList<ZoneList<bool>*>(length); 3351 bitmaps_ = new ZoneList<BoyerMoorePositionInfo*>(length);
3292 for (int i = 0; i < length; i++) { 3352 for (int i = 0; i < length; i++) {
3293 bitmaps_->Add(new ZoneList<bool>(map_length)); 3353 bitmaps_->Add(new BoyerMoorePositionInfo());
3294 ZoneList<bool>* map = bitmaps_->at(i);
3295 for (int i = 0; i < map_length; i++) {
3296 map->Add(false);
3297 }
3298 } 3354 }
3299 } 3355 }
3300 3356
3301 3357
3302 // Find the longest range of lookahead that has the fewest number of different 3358 // Find the longest range of lookahead that has the fewest number of different
3303 // characters that can occur at a given position. Since we are optimizing two 3359 // characters that can occur at a given position. Since we are optimizing two
3304 // different parameters at once this is a tradeoff. 3360 // different parameters at once this is a tradeoff.
3305 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) { 3361 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
3306 int biggest_points = 0; 3362 int biggest_points = 0;
3307 for (int max_number_of_chars = 4; 3363 for (int max_number_of_chars = 4;
3308 max_number_of_chars < kTooManyCharacters; 3364 max_number_of_chars < 32;
ulan 2012/04/16 13:52:22 Now 32 is more magical :)
Erik Corry 2012/04/17 07:51:45 Done.
3309 max_number_of_chars *= 2) { 3365 max_number_of_chars *= 2) {
3310 biggest_points = 3366 biggest_points =
3311 FindBestInterval(max_number_of_chars, biggest_points, from, to); 3367 FindBestInterval(max_number_of_chars, biggest_points, from, to);
3312 } 3368 }
3313 if (biggest_points == 0) return false; 3369 if (biggest_points == 0) return false;
3314 return true; 3370 return true;
3315 } 3371 }
3316 3372
3317 3373
3318 // Find the highest-points range between 0 and length_ where the character 3374 // Find the highest-points range between 0 and length_ where the character
3319 // information is not too vague. 'Too vague' means that there are more than 3375 // information is not too vague. 'Too vague' means that there are more than
3320 // max_number_of_chars that can occur at this position. Calculates the number 3376 // max_number_of_chars that can occur at this position. Calculates the number
3321 // of points as the product of width-of-the-range and 3377 // of points as the product of width-of-the-range and
3322 // probability-of-finding-one-of-the-characters, where the probability is 3378 // probability-of-finding-one-of-the-characters, where the probability is
3323 // calculated using the frequency distribution of the sample subject string. 3379 // calculated using the frequency distribution of the sample subject string.
3324 int BoyerMooreLookahead::FindBestInterval( 3380 int BoyerMooreLookahead::FindBestInterval(
3325 int max_number_of_chars, int old_biggest_points, int* from, int* to) { 3381 int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3326 int biggest_points = old_biggest_points; 3382 int biggest_points = old_biggest_points;
3327 static const int kSize = RegExpMacroAssembler::kTableSize; 3383 static const int kSize = RegExpMacroAssembler::kTableSize;
3328 for (int i = 0; i < length_; ) { 3384 for (int i = 0; i < length_; ) {
3329 while (i < length_ && Count(i) > max_number_of_chars) i++; 3385 while (i < length_ && Count(i) > max_number_of_chars) i++;
3330 if (i == length_) break; 3386 if (i == length_) break;
3331 int remembered_from = i; 3387 int remembered_from = i;
3332 bool union_map[kSize]; 3388 bool union_map[kSize];
3333 for (int j = 0; j < kSize; j++) union_map[j] = false; 3389 for (int j = 0; j < kSize; j++) union_map[j] = false;
3334 while (i < length_ && Count(i) <= max_number_of_chars) { 3390 while (i < length_ && Count(i) <= max_number_of_chars) {
3335 ZoneList<bool>* map = bitmaps_->at(i); 3391 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3336 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j); 3392 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3337 i++; 3393 i++;
3338 } 3394 }
3339 int frequency = 0; 3395 int frequency = 0;
3340 for (int j = 0; j < kSize; j++) { 3396 for (int j = 0; j < kSize; j++) {
3341 if (union_map[j]) { 3397 if (union_map[j]) {
3342 // Add 1 to the frequency to give a small per-character boost for 3398 // Add 1 to the frequency to give a small per-character boost for
3343 // the cases where our sampling is not good enough and many 3399 // the cases where our sampling is not good enough and many
3344 // characters have a frequency of zero. This means the frequency 3400 // characters have a frequency of zero. This means the frequency
3345 // can theoretically be up to 2*kSize though we treat it mostly as 3401 // can theoretically be up to 2*kSize though we treat it mostly as
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after
3380 3436
3381 const int kSkipArrayEntry = 0; 3437 const int kSkipArrayEntry = 0;
3382 const int kDontSkipArrayEntry = 1; 3438 const int kDontSkipArrayEntry = 1;
3383 3439
3384 for (int i = 0; i < kSize; i++) { 3440 for (int i = 0; i < kSize; i++) {
3385 boolean_skip_table->set(i, kSkipArrayEntry); 3441 boolean_skip_table->set(i, kSkipArrayEntry);
3386 } 3442 }
3387 int skip = max_lookahead + 1 - min_lookahead; 3443 int skip = max_lookahead + 1 - min_lookahead;
3388 3444
3389 for (int i = max_lookahead; i >= min_lookahead; i--) { 3445 for (int i = max_lookahead; i >= min_lookahead; i--) {
3390 ZoneList<bool>* map = bitmaps_->at(i); 3446 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3391 for (int j = 0; j < map_length_; j++) { 3447 for (int j = 0; j < kSize; j++) {
3392 if (map->at(j)) { 3448 if (map->at(j)) {
3393 boolean_skip_table->set(j, kDontSkipArrayEntry); 3449 boolean_skip_table->set(j, kDontSkipArrayEntry);
3394 } 3450 }
3395 } 3451 }
3396 } 3452 }
3397 3453
3398 return skip; 3454 return skip;
3399 } 3455 }
3400 3456
3401 3457
3402 // See comment above on the implementation of GetSkipTable. 3458 // See comment above on the implementation of GetSkipTable.
3403 bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { 3459 bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
3460 const int kSize = RegExpMacroAssembler::kTableSize;
3461
3404 int min_lookahead = 0; 3462 int min_lookahead = 0;
3405 int max_lookahead = 0; 3463 int max_lookahead = 0;
3406 3464
3407 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false; 3465 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false;
3408 3466
3409 bool found_single_character = false; 3467 bool found_single_character = false;
3410 bool abandoned_search_for_single_character = false;
3411 int single_character = 0; 3468 int single_character = 0;
3412 for (int i = max_lookahead; i >= min_lookahead; i--) { 3469 for (int i = max_lookahead; i >= min_lookahead; i--) {
3413 ZoneList<bool>* map = bitmaps_->at(i); 3470 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3414 for (int j = 0; j < map_length_; j++) { 3471 if (map->map_count() > 1 ||
3472 (found_single_character && map->map_count() != 0)) {
3473 found_single_character = false;
3474 break;
3475 }
3476 for (int j = 0; j < kSize; j++) {
3415 if (map->at(j)) { 3477 if (map->at(j)) {
3416 if (found_single_character) { 3478 found_single_character = true;
3417 found_single_character = false; // Found two. 3479 single_character = j;
3418 abandoned_search_for_single_character = true; 3480 break;
3419 break;
3420 } else {
3421 found_single_character = true;
3422 single_character = j;
3423 }
3424 } 3481 }
3425 } 3482 }
3426 if (abandoned_search_for_single_character) break;
3427 } 3483 }
3428 3484
3429 int lookahead_width = max_lookahead + 1 - min_lookahead; 3485 int lookahead_width = max_lookahead + 1 - min_lookahead;
3430 3486
3431 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) { 3487 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3432 // The mask-compare can probably handle this better. 3488 // The mask-compare can probably handle this better.
3433 return false; 3489 return false;
3434 } 3490 }
3435 3491
3436 if (found_single_character) { 3492 if (found_single_character) {
3437 Label cont, again; 3493 Label cont, again;
3438 masm->Bind(&again); 3494 masm->Bind(&again);
3439 masm->LoadCurrentCharacter(max_lookahead, &cont, true); 3495 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3440 if (max_char_ > map_length_) { 3496 if (max_char_ > kSize) {
3441 ASSERT(map_length_ == RegExpMacroAssembler::kTableSize);
3442 masm->CheckCharacterAfterAnd(single_character, 3497 masm->CheckCharacterAfterAnd(single_character,
3443 RegExpMacroAssembler::kTableMask, 3498 RegExpMacroAssembler::kTableMask,
3444 &cont); 3499 &cont);
3445 } else { 3500 } else {
3446 masm->CheckCharacter(single_character, &cont); 3501 masm->CheckCharacter(single_character, &cont);
3447 } 3502 }
3448 masm->AdvanceCurrentPosition(lookahead_width); 3503 masm->AdvanceCurrentPosition(lookahead_width);
3449 masm->GoTo(&again); 3504 masm->GoTo(&again);
3450 masm->Bind(&cont); 3505 masm->Bind(&cont);
3451 return true; 3506 return true;
3452 } 3507 }
3453 3508
3454 Handle<ByteArray> boolean_skip_table = 3509 Handle<ByteArray> boolean_skip_table =
3455 FACTORY->NewByteArray(map_length_, TENURED); 3510 FACTORY->NewByteArray(kSize, TENURED);
3456 int skip_distance = GetSkipTable( 3511 int skip_distance = GetSkipTable(
3457 min_lookahead, max_lookahead, boolean_skip_table); 3512 min_lookahead, max_lookahead, boolean_skip_table);
3458 ASSERT(skip_distance != 0); 3513 ASSERT(skip_distance != 0);
3459 3514
3460 Label cont, again; 3515 Label cont, again;
3461 masm->Bind(&again); 3516 masm->Bind(&again);
3462 masm->LoadCurrentCharacter(max_lookahead, &cont, true); 3517 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3463 masm->CheckBitInTable(boolean_skip_table, &cont); 3518 masm->CheckBitInTable(boolean_skip_table, &cont);
3464 masm->AdvanceCurrentPosition(skip_distance); 3519 masm->AdvanceCurrentPosition(skip_distance);
3465 masm->GoTo(&again); 3520 masm->GoTo(&again);
(...skipping 158 matching lines...) Expand 10 before | Expand all | Expand 10 after
3624 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) == 3679 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) ==
3625 this) { 3680 this) {
3626 // At this point we know that we are at a non-greedy loop that will eat 3681 // At this point we know that we are at a non-greedy loop that will eat
3627 // any character one at a time. Any non-anchored regexp has such a 3682 // any character one at a time. Any non-anchored regexp has such a
3628 // loop prepended to it in order to find where it starts. We look for 3683 // loop prepended to it in order to find where it starts. We look for
3629 // a pattern of the form ...abc... where we can look 6 characters ahead 3684 // a pattern of the form ...abc... where we can look 6 characters ahead
3630 // and step forwards 3 if the character is not one of abc. Abc need 3685 // and step forwards 3 if the character is not one of abc. Abc need
3631 // not be atoms, they can be any reasonably limited character class or 3686 // not be atoms, they can be any reasonably limited character class or
3632 // small alternation. 3687 // small alternation.
3633 ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes. 3688 ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes.
3634 eats_at_least = 3689 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
3635 Min(kMaxLookaheadForBoyerMoore, 3690 if (lookahead == NULL) {
3636 EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start)); 3691 eats_at_least =
3637 if (eats_at_least >= 1) { 3692 Min(kMaxLookaheadForBoyerMoore,
3638 BoyerMooreLookahead bm(eats_at_least, 3693 EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start));
3639 RegExpMacroAssembler::kTableSize, 3694 if (eats_at_least >= 1) {
3640 compiler); 3695 BoyerMooreLookahead* bm =
3641 GuardedAlternative alt0 = alternatives_->at(0); 3696 new BoyerMooreLookahead(eats_at_least, compiler);
3642 alt0.node()->FillInBMInfo(0, &bm, not_at_start); 3697 GuardedAlternative alt0 = alternatives_->at(0);
3643 skip_was_emitted = bm.EmitSkipInstructions(macro_assembler); 3698 alt0.node()->FillInBMInfo(0, bm, not_at_start);
3699 skip_was_emitted = bm->EmitSkipInstructions(macro_assembler);
3700 }
3701 } else {
3702 skip_was_emitted = lookahead->EmitSkipInstructions(macro_assembler);
3644 } 3703 }
3645 } 3704 }
3646 } 3705 }
3647 } 3706 }
3648 3707
3649 if (eats_at_least == kEatsAtLeastNotYetInitialized) { 3708 if (eats_at_least == kEatsAtLeastNotYetInitialized) {
3650 // Save some time by looking at most one machine word ahead. 3709 // Save some time by looking at most one machine word ahead.
3651 eats_at_least = EatsAtLeast(compiler->ascii() ? 4 : 2, 0, not_at_start); 3710 eats_at_least = EatsAtLeast(compiler->ascii() ? 4 : 2, 0, not_at_start);
3652 } 3711 }
3653 int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least); 3712 int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least);
(...skipping 542 matching lines...) Expand 10 before | Expand all | Expand 10 after
4196 break; 4255 break;
4197 case AssertionNode::AT_BOUNDARY: 4256 case AssertionNode::AT_BOUNDARY:
4198 stream()->Add("label=\"\\b\", shape=septagon"); 4257 stream()->Add("label=\"\\b\", shape=septagon");
4199 break; 4258 break;
4200 case AssertionNode::AT_NON_BOUNDARY: 4259 case AssertionNode::AT_NON_BOUNDARY:
4201 stream()->Add("label=\"\\B\", shape=septagon"); 4260 stream()->Add("label=\"\\B\", shape=septagon");
4202 break; 4261 break;
4203 case AssertionNode::AFTER_NEWLINE: 4262 case AssertionNode::AFTER_NEWLINE:
4204 stream()->Add("label=\"(?<=\\n)\", shape=septagon"); 4263 stream()->Add("label=\"(?<=\\n)\", shape=septagon");
4205 break; 4264 break;
4206 case AssertionNode::AFTER_WORD_CHARACTER:
4207 stream()->Add("label=\"(?<=\\w)\", shape=septagon");
4208 break;
4209 case AssertionNode::AFTER_NONWORD_CHARACTER:
4210 stream()->Add("label=\"(?<=\\W)\", shape=septagon");
4211 break;
4212 } 4265 }
4213 stream()->Add("];\n"); 4266 stream()->Add("];\n");
4214 PrintAttributes(that); 4267 PrintAttributes(that);
4215 RegExpNode* successor = that->on_success(); 4268 RegExpNode* successor = that->on_success();
4216 stream()->Add(" n%p -> n%p;\n", that, successor); 4269 stream()->Add(" n%p -> n%p;\n", that, successor);
4217 Visit(successor); 4270 Visit(successor);
4218 } 4271 }
4219 4272
4220 4273
4221 void DotPrinter::VisitAction(ActionNode* that) { 4274 void DotPrinter::VisitAction(ActionNode* that) {
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after
4306 printer.PrintNode(label, node); 4359 printer.PrintNode(label, node);
4307 } 4360 }
4308 4361
4309 4362
4310 #endif // DEBUG 4363 #endif // DEBUG
4311 4364
4312 4365
4313 // ------------------------------------------------------------------- 4366 // -------------------------------------------------------------------
4314 // Tree to graph conversion 4367 // Tree to graph conversion
4315 4368
4316 static const uc16 kSpaceRanges[] = { 0x0009, 0x000D, 0x0020, 0x0020, 0x00A0,
4317 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A, 0x2028, 0x2029,
4318 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000, 0xFEFF, 0xFEFF };
4319 static const int kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges);
4320
4321 static const uc16 kWordRanges[] = { '0', '9', 'A', 'Z', '_', '_', 'a', 'z' };
4322 static const int kWordRangeCount = ARRAY_SIZE(kWordRanges);
4323
4324 static const uc16 kDigitRanges[] = { '0', '9' };
4325 static const int kDigitRangeCount = ARRAY_SIZE(kDigitRanges);
4326
4327 static const uc16 kLineTerminatorRanges[] = { 0x000A, 0x000A, 0x000D, 0x000D,
4328 0x2028, 0x2029 };
4329 static const int kLineTerminatorRangeCount = ARRAY_SIZE(kLineTerminatorRanges);
4330
4331 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, 4369 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
4332 RegExpNode* on_success) { 4370 RegExpNode* on_success) {
4333 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1); 4371 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
4334 elms->Add(TextElement::Atom(this)); 4372 elms->Add(TextElement::Atom(this));
4335 return new TextNode(elms, on_success); 4373 return new TextNode(elms, on_success);
4336 } 4374 }
4337 4375
4338 4376
4339 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, 4377 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
4340 RegExpNode* on_success) { 4378 RegExpNode* on_success) {
4341 return new TextNode(elements(), on_success); 4379 return new TextNode(elements(), on_success);
4342 } 4380 }
4343 4381
4382
4344 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges, 4383 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
4345 const uc16* special_class, 4384 const int* special_class,
4346 int length) { 4385 int length) {
4386 length--; // Remove final 0x10000.
4387 ASSERT(special_class[length] == 0x10000);
4347 ASSERT(ranges->length() != 0); 4388 ASSERT(ranges->length() != 0);
4348 ASSERT(length != 0); 4389 ASSERT(length != 0);
4349 ASSERT(special_class[0] != 0); 4390 ASSERT(special_class[0] != 0);
4350 if (ranges->length() != (length >> 1) + 1) { 4391 if (ranges->length() != (length >> 1) + 1) {
4351 return false; 4392 return false;
4352 } 4393 }
4353 CharacterRange range = ranges->at(0); 4394 CharacterRange range = ranges->at(0);
4354 if (range.from() != 0) { 4395 if (range.from() != 0) {
4355 return false; 4396 return false;
4356 } 4397 }
4357 for (int i = 0; i < length; i += 2) { 4398 for (int i = 0; i < length; i += 2) {
4358 if (special_class[i] != (range.to() + 1)) { 4399 if (special_class[i] != (range.to() + 1)) {
4359 return false; 4400 return false;
4360 } 4401 }
4361 range = ranges->at((i >> 1) + 1); 4402 range = ranges->at((i >> 1) + 1);
4362 if (special_class[i+1] != range.from() - 1) { 4403 if (special_class[i+1] != range.from()) {
4363 return false; 4404 return false;
4364 } 4405 }
4365 } 4406 }
4366 if (range.to() != 0xffff) { 4407 if (range.to() != 0xffff) {
4367 return false; 4408 return false;
4368 } 4409 }
4369 return true; 4410 return true;
4370 } 4411 }
4371 4412
4372 4413
4373 static bool CompareRanges(ZoneList<CharacterRange>* ranges, 4414 static bool CompareRanges(ZoneList<CharacterRange>* ranges,
4374 const uc16* special_class, 4415 const int* special_class,
4375 int length) { 4416 int length) {
4417 length--; // Remove final 0x10000.
4418 ASSERT(special_class[length] == 0x10000);
4376 if (ranges->length() * 2 != length) { 4419 if (ranges->length() * 2 != length) {
4377 return false; 4420 return false;
4378 } 4421 }
4379 for (int i = 0; i < length; i += 2) { 4422 for (int i = 0; i < length; i += 2) {
4380 CharacterRange range = ranges->at(i >> 1); 4423 CharacterRange range = ranges->at(i >> 1);
4381 if (range.from() != special_class[i] || range.to() != special_class[i+1]) { 4424 if (range.from() != special_class[i] ||
4425 range.to() != special_class[i + 1] - 1) {
4382 return false; 4426 return false;
4383 } 4427 }
4384 } 4428 }
4385 return true; 4429 return true;
4386 } 4430 }
4387 4431
4388 4432
4389 bool RegExpCharacterClass::is_standard() { 4433 bool RegExpCharacterClass::is_standard() {
4390 // TODO(lrn): Remove need for this function, by not throwing away information 4434 // TODO(lrn): Remove need for this function, by not throwing away information
4391 // along the way. 4435 // along the way.
4392 if (is_negated_) { 4436 if (is_negated_) {
4393 return false; 4437 return false;
4394 } 4438 }
4395 if (set_.is_standard()) { 4439 if (set_.is_standard()) {
4396 return true; 4440 return true;
4397 } 4441 }
4398 if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) { 4442 if (CompareRanges(set_.ranges(), kSpaceRanges2, kSpaceRange2Count)) {
4399 set_.set_standard_set_type('s'); 4443 set_.set_standard_set_type('s');
4400 return true; 4444 return true;
4401 } 4445 }
4402 if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) { 4446 if (CompareInverseRanges(set_.ranges(), kSpaceRanges2, kSpaceRange2Count)) {
4403 set_.set_standard_set_type('S'); 4447 set_.set_standard_set_type('S');
4404 return true; 4448 return true;
4405 } 4449 }
4406 if (CompareInverseRanges(set_.ranges(), 4450 if (CompareInverseRanges(set_.ranges(),
4407 kLineTerminatorRanges, 4451 kLineTerminatorRanges2,
4408 kLineTerminatorRangeCount)) { 4452 kLineTerminatorRange2Count)) {
4409 set_.set_standard_set_type('.'); 4453 set_.set_standard_set_type('.');
4410 return true; 4454 return true;
4411 } 4455 }
4412 if (CompareRanges(set_.ranges(), 4456 if (CompareRanges(set_.ranges(),
4413 kLineTerminatorRanges, 4457 kLineTerminatorRanges2,
4414 kLineTerminatorRangeCount)) { 4458 kLineTerminatorRange2Count)) {
4415 set_.set_standard_set_type('n'); 4459 set_.set_standard_set_type('n');
4416 return true; 4460 return true;
4417 } 4461 }
4418 if (CompareRanges(set_.ranges(), kWordRanges, kWordRangeCount)) { 4462 if (CompareRanges(set_.ranges(), kWordRanges2, kWordRange2Count)) {
4419 set_.set_standard_set_type('w'); 4463 set_.set_standard_set_type('w');
4420 return true; 4464 return true;
4421 } 4465 }
4422 if (CompareInverseRanges(set_.ranges(), kWordRanges, kWordRangeCount)) { 4466 if (CompareInverseRanges(set_.ranges(), kWordRanges2, kWordRange2Count)) {
4423 set_.set_standard_set_type('W'); 4467 set_.set_standard_set_type('W');
4424 return true; 4468 return true;
4425 } 4469 }
4426 return false; 4470 return false;
4427 } 4471 }
4428 4472
4429 4473
4430 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, 4474 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
4431 RegExpNode* on_success) { 4475 RegExpNode* on_success) {
4432 return new TextNode(this, on_success); 4476 return new TextNode(this, on_success);
(...skipping 339 matching lines...) Expand 10 before | Expand all | Expand 10 after
4772 RegExpNode* on_success) { 4816 RegExpNode* on_success) {
4773 ZoneList<RegExpTree*>* children = nodes(); 4817 ZoneList<RegExpTree*>* children = nodes();
4774 RegExpNode* current = on_success; 4818 RegExpNode* current = on_success;
4775 for (int i = children->length() - 1; i >= 0; i--) { 4819 for (int i = children->length() - 1; i >= 0; i--) {
4776 current = children->at(i)->ToNode(compiler, current); 4820 current = children->at(i)->ToNode(compiler, current);
4777 } 4821 }
4778 return current; 4822 return current;
4779 } 4823 }
4780 4824
4781 4825
4782 static void AddClass(const uc16* elmv, 4826 static void AddClass(const int* elmv,
4783 int elmc, 4827 int elmc,
4784 ZoneList<CharacterRange>* ranges) { 4828 ZoneList<CharacterRange>* ranges) {
4829 elmc--;
4830 ASSERT(elmv[elmc] == 0x10000);
4785 for (int i = 0; i < elmc; i += 2) { 4831 for (int i = 0; i < elmc; i += 2) {
4786 ASSERT(elmv[i] <= elmv[i + 1]); 4832 ASSERT(elmv[i] < elmv[i + 1]);
4787 ranges->Add(CharacterRange(elmv[i], elmv[i + 1])); 4833 ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1));
4788 } 4834 }
4789 } 4835 }
4790 4836
4791 4837
4792 static void AddClassNegated(const uc16 *elmv, 4838 static void AddClassNegated(const int *elmv,
4793 int elmc, 4839 int elmc,
4794 ZoneList<CharacterRange>* ranges) { 4840 ZoneList<CharacterRange>* ranges) {
4841 elmc--;
4842 ASSERT(elmv[elmc] == 0x10000);
4795 ASSERT(elmv[0] != 0x0000); 4843 ASSERT(elmv[0] != 0x0000);
4796 ASSERT(elmv[elmc-1] != String::kMaxUtf16CodeUnit); 4844 ASSERT(elmv[elmc-1] != String::kMaxUtf16CodeUnit);
4797 uc16 last = 0x0000; 4845 uc16 last = 0x0000;
4798 for (int i = 0; i < elmc; i += 2) { 4846 for (int i = 0; i < elmc; i += 2) {
4799 ASSERT(last <= elmv[i] - 1); 4847 ASSERT(last <= elmv[i] - 1);
4800 ASSERT(elmv[i] <= elmv[i + 1]); 4848 ASSERT(elmv[i] < elmv[i + 1]);
4801 ranges->Add(CharacterRange(last, elmv[i] - 1)); 4849 ranges->Add(CharacterRange(last, elmv[i] - 1));
4802 last = elmv[i + 1] + 1; 4850 last = elmv[i + 1];
4803 } 4851 }
4804 ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit)); 4852 ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit));
4805 } 4853 }
4806 4854
4807 4855
4808 void CharacterRange::AddClassEscape(uc16 type, 4856 void CharacterRange::AddClassEscape(uc16 type,
4809 ZoneList<CharacterRange>* ranges) { 4857 ZoneList<CharacterRange>* ranges) {
4810 switch (type) { 4858 switch (type) {
4811 case 's': 4859 case 's':
4812 AddClass(kSpaceRanges, kSpaceRangeCount, ranges); 4860 AddClass(kSpaceRanges2, kSpaceRange2Count, ranges);
4813 break; 4861 break;
4814 case 'S': 4862 case 'S':
4815 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges); 4863 AddClassNegated(kSpaceRanges2, kSpaceRange2Count, ranges);
4816 break; 4864 break;
4817 case 'w': 4865 case 'w':
4818 AddClass(kWordRanges, kWordRangeCount, ranges); 4866 AddClass(kWordRanges2, kWordRange2Count, ranges);
4819 break; 4867 break;
4820 case 'W': 4868 case 'W':
4821 AddClassNegated(kWordRanges, kWordRangeCount, ranges); 4869 AddClassNegated(kWordRanges2, kWordRange2Count, ranges);
4822 break; 4870 break;
4823 case 'd': 4871 case 'd':
4824 AddClass(kDigitRanges, kDigitRangeCount, ranges); 4872 AddClass(kDigitRanges2, kDigitRange2Count, ranges);
4825 break; 4873 break;
4826 case 'D': 4874 case 'D':
4827 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges); 4875 AddClassNegated(kDigitRanges2, kDigitRange2Count, ranges);
4828 break; 4876 break;
4829 case '.': 4877 case '.':
4830 AddClassNegated(kLineTerminatorRanges, 4878 AddClassNegated(kLineTerminatorRanges2,
4831 kLineTerminatorRangeCount, 4879 kLineTerminatorRange2Count,
4832 ranges); 4880 ranges);
4833 break; 4881 break;
4834 // This is not a character range as defined by the spec but a 4882 // This is not a character range as defined by the spec but a
4835 // convenient shorthand for a character class that matches any 4883 // convenient shorthand for a character class that matches any
4836 // character. 4884 // character.
4837 case '*': 4885 case '*':
4838 ranges->Add(CharacterRange::Everything()); 4886 ranges->Add(CharacterRange::Everything());
4839 break; 4887 break;
4840 // This is the set of characters matched by the $ and ^ symbols 4888 // This is the set of characters matched by the $ and ^ symbols
4841 // in multiline mode. 4889 // in multiline mode.
4842 case 'n': 4890 case 'n':
4843 AddClass(kLineTerminatorRanges, 4891 AddClass(kLineTerminatorRanges2,
4844 kLineTerminatorRangeCount, 4892 kLineTerminatorRange2Count,
4845 ranges); 4893 ranges);
4846 break; 4894 break;
4847 default: 4895 default:
4848 UNREACHABLE(); 4896 UNREACHABLE();
4849 } 4897 }
4850 } 4898 }
4851 4899
4852 4900
4853 Vector<const uc16> CharacterRange::GetWordBounds() { 4901 Vector<const int> CharacterRange::GetWordBounds() {
4854 return Vector<const uc16>(kWordRanges, kWordRangeCount); 4902 return Vector<const int>(kWordRanges2, kWordRange2Count - 1);
4855 } 4903 }
4856 4904
4857 4905
4858 class CharacterRangeSplitter { 4906 class CharacterRangeSplitter {
4859 public: 4907 public:
4860 CharacterRangeSplitter(ZoneList<CharacterRange>** included, 4908 CharacterRangeSplitter(ZoneList<CharacterRange>** included,
4861 ZoneList<CharacterRange>** excluded) 4909 ZoneList<CharacterRange>** excluded)
4862 : included_(included), 4910 : included_(included),
4863 excluded_(excluded) { } 4911 excluded_(excluded) { }
4864 void Call(uc16 from, DispatchTable::Entry entry); 4912 void Call(uc16 from, DispatchTable::Entry entry);
(...skipping 11 matching lines...) Expand all
4876 if (!entry.out_set()->Get(kInBase)) return; 4924 if (!entry.out_set()->Get(kInBase)) return;
4877 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay) 4925 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
4878 ? included_ 4926 ? included_
4879 : excluded_; 4927 : excluded_;
4880 if (*target == NULL) *target = new ZoneList<CharacterRange>(2); 4928 if (*target == NULL) *target = new ZoneList<CharacterRange>(2);
4881 (*target)->Add(CharacterRange(entry.from(), entry.to())); 4929 (*target)->Add(CharacterRange(entry.from(), entry.to()));
4882 } 4930 }
4883 4931
4884 4932
4885 void CharacterRange::Split(ZoneList<CharacterRange>* base, 4933 void CharacterRange::Split(ZoneList<CharacterRange>* base,
4886 Vector<const uc16> overlay, 4934 Vector<const int> overlay,
4887 ZoneList<CharacterRange>** included, 4935 ZoneList<CharacterRange>** included,
4888 ZoneList<CharacterRange>** excluded) { 4936 ZoneList<CharacterRange>** excluded) {
4889 ASSERT_EQ(NULL, *included); 4937 ASSERT_EQ(NULL, *included);
4890 ASSERT_EQ(NULL, *excluded); 4938 ASSERT_EQ(NULL, *excluded);
4891 DispatchTable table; 4939 DispatchTable table;
4892 for (int i = 0; i < base->length(); i++) 4940 for (int i = 0; i < base->length(); i++)
4893 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase); 4941 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);
4894 for (int i = 0; i < overlay.length(); i += 2) { 4942 for (int i = 0; i < overlay.length(); i += 2) {
4895 table.AddRange(CharacterRange(overlay[i], overlay[i+1]), 4943 table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1),
4896 CharacterRangeSplitter::kInOverlay); 4944 CharacterRangeSplitter::kInOverlay);
4897 } 4945 }
4898 CharacterRangeSplitter callback(included, excluded); 4946 CharacterRangeSplitter callback(included, excluded);
4899 table.ForEach(&callback); 4947 table.ForEach(&callback);
4900 } 4948 }
4901 4949
4902 4950
4903 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, 4951 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
4904 bool is_ascii) { 4952 bool is_ascii) {
4905 Isolate* isolate = Isolate::Current(); 4953 Isolate* isolate = Isolate::Current();
(...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after
4971 if (n <= 1) return true; 5019 if (n <= 1) return true;
4972 int max = ranges->at(0).to(); 5020 int max = ranges->at(0).to();
4973 for (int i = 1; i < n; i++) { 5021 for (int i = 1; i < n; i++) {
4974 CharacterRange next_range = ranges->at(i); 5022 CharacterRange next_range = ranges->at(i);
4975 if (next_range.from() <= max + 1) return false; 5023 if (next_range.from() <= max + 1) return false;
4976 max = next_range.to(); 5024 max = next_range.to();
4977 } 5025 }
4978 return true; 5026 return true;
4979 } 5027 }
4980 5028
4981 SetRelation CharacterRange::WordCharacterRelation(
4982 ZoneList<CharacterRange>* range) {
4983 ASSERT(IsCanonical(range));
4984 int i = 0; // Word character range index.
4985 int j = 0; // Argument range index.
4986 ASSERT_NE(0, kWordRangeCount);
4987 SetRelation result;
4988 if (range->length() == 0) {
4989 result.SetElementsInSecondSet();
4990 return result;
4991 }
4992 CharacterRange argument_range = range->at(0);
4993 CharacterRange word_range = CharacterRange(kWordRanges[0], kWordRanges[1]);
4994 while (i < kWordRangeCount && j < range->length()) {
4995 // Check the two ranges for the five cases:
4996 // - no overlap.
4997 // - partial overlap (there are elements in both ranges that isn't
4998 // in the other, and there are also elements that are in both).
4999 // - argument range entirely inside word range.
5000 // - word range entirely inside argument range.
5001 // - ranges are completely equal.
5002
5003 // First check for no overlap. The earlier range is not in the other set.
5004 if (argument_range.from() > word_range.to()) {
5005 // Ranges are disjoint. The earlier word range contains elements that
5006 // cannot be in the argument set.
5007 result.SetElementsInSecondSet();
5008 } else if (word_range.from() > argument_range.to()) {
5009 // Ranges are disjoint. The earlier argument range contains elements that
5010 // cannot be in the word set.
5011 result.SetElementsInFirstSet();
5012 } else if (word_range.from() <= argument_range.from() &&
5013 word_range.to() >= argument_range.from()) {
5014 result.SetElementsInBothSets();
5015 // argument range completely inside word range.
5016 if (word_range.from() < argument_range.from() ||
5017 word_range.to() > argument_range.from()) {
5018 result.SetElementsInSecondSet();
5019 }
5020 } else if (word_range.from() >= argument_range.from() &&
5021 word_range.to() <= argument_range.from()) {
5022 result.SetElementsInBothSets();
5023 result.SetElementsInFirstSet();
5024 } else {
5025 // There is overlap, and neither is a subrange of the other
5026 result.SetElementsInFirstSet();
5027 result.SetElementsInSecondSet();
5028 result.SetElementsInBothSets();
5029 }
5030 if (result.NonTrivialIntersection()) {
5031 // The result is as (im)precise as we can possibly make it.
5032 return result;
5033 }
5034 // Progress the range(s) with minimal to-character.
5035 uc16 word_to = word_range.to();
5036 uc16 argument_to = argument_range.to();
5037 if (argument_to <= word_to) {
5038 j++;
5039 if (j < range->length()) {
5040 argument_range = range->at(j);
5041 }
5042 }
5043 if (word_to <= argument_to) {
5044 i += 2;
5045 if (i < kWordRangeCount) {
5046 word_range = CharacterRange(kWordRanges[i], kWordRanges[i + 1]);
5047 }
5048 }
5049 }
5050 // Check if anything wasn't compared in the loop.
5051 if (i < kWordRangeCount) {
5052 // word range contains something not in argument range.
5053 result.SetElementsInSecondSet();
5054 } else if (j < range->length()) {
5055 // Argument range contains something not in word range.
5056 result.SetElementsInFirstSet();
5057 }
5058
5059 return result;
5060 }
5061
5062 5029
5063 ZoneList<CharacterRange>* CharacterSet::ranges() { 5030 ZoneList<CharacterRange>* CharacterSet::ranges() {
5064 if (ranges_ == NULL) { 5031 if (ranges_ == NULL) {
5065 ranges_ = new ZoneList<CharacterRange>(2); 5032 ranges_ = new ZoneList<CharacterRange>(2);
5066 CharacterRange::AddClassEscape(standard_set_type_, ranges_); 5033 CharacterRange::AddClassEscape(standard_set_type_, ranges_);
5067 } 5034 }
5068 return ranges_; 5035 return ranges_;
5069 } 5036 }
5070 5037
5071 5038
(...skipping 112 matching lines...) Expand 10 before | Expand all | Expand 10 after
5184 num_canonical, 5151 num_canonical,
5185 character_ranges->at(read)); 5152 character_ranges->at(read));
5186 read++; 5153 read++;
5187 } while (read < n); 5154 } while (read < n);
5188 character_ranges->Rewind(num_canonical); 5155 character_ranges->Rewind(num_canonical);
5189 5156
5190 ASSERT(CharacterRange::IsCanonical(character_ranges)); 5157 ASSERT(CharacterRange::IsCanonical(character_ranges));
5191 } 5158 }
5192 5159
5193 5160
5194 // Utility function for CharacterRange::Merge. Adds a range at the end of
5195 // a canonicalized range list, if necessary merging the range with the last
5196 // range of the list.
5197 static void AddRangeToSet(ZoneList<CharacterRange>* set, CharacterRange range) {
5198 if (set == NULL) return;
5199 ASSERT(set->length() == 0 || set->at(set->length() - 1).to() < range.from());
5200 int n = set->length();
5201 if (n > 0) {
5202 CharacterRange lastRange = set->at(n - 1);
5203 if (lastRange.to() == range.from() - 1) {
5204 set->at(n - 1) = CharacterRange(lastRange.from(), range.to());
5205 return;
5206 }
5207 }
5208 set->Add(range);
5209 }
5210
5211
5212 static void AddRangeToSelectedSet(int selector,
5213 ZoneList<CharacterRange>* first_set,
5214 ZoneList<CharacterRange>* second_set,
5215 ZoneList<CharacterRange>* intersection_set,
5216 CharacterRange range) {
5217 switch (selector) {
5218 case kInsideFirst:
5219 AddRangeToSet(first_set, range);
5220 break;
5221 case kInsideSecond:
5222 AddRangeToSet(second_set, range);
5223 break;
5224 case kInsideBoth:
5225 AddRangeToSet(intersection_set, range);
5226 break;
5227 }
5228 }
5229
5230
5231
5232 void CharacterRange::Merge(ZoneList<CharacterRange>* first_set,
5233 ZoneList<CharacterRange>* second_set,
5234 ZoneList<CharacterRange>* first_set_only_out,
5235 ZoneList<CharacterRange>* second_set_only_out,
5236 ZoneList<CharacterRange>* both_sets_out) {
5237 // Inputs are canonicalized.
5238 ASSERT(CharacterRange::IsCanonical(first_set));
5239 ASSERT(CharacterRange::IsCanonical(second_set));
5240 // Outputs are empty, if applicable.
5241 ASSERT(first_set_only_out == NULL || first_set_only_out->length() == 0);
5242 ASSERT(second_set_only_out == NULL || second_set_only_out->length() == 0);
5243 ASSERT(both_sets_out == NULL || both_sets_out->length() == 0);
5244
5245 // Merge sets by iterating through the lists in order of lowest "from" value,
5246 // and putting intervals into one of three sets.
5247
5248 if (first_set->length() == 0) {
5249 second_set_only_out->AddAll(*second_set);
5250 return;
5251 }
5252 if (second_set->length() == 0) {
5253 first_set_only_out->AddAll(*first_set);
5254 return;
5255 }
5256 // Indices into input lists.
5257 int i1 = 0;
5258 int i2 = 0;
5259 // Cache length of input lists.
5260 int n1 = first_set->length();
5261 int n2 = second_set->length();
5262 // Current range. May be invalid if state is kInsideNone.
5263 int from = 0;
5264 int to = -1;
5265 // Where current range comes from.
5266 int state = kInsideNone;
5267
5268 while (i1 < n1 || i2 < n2) {
5269 CharacterRange next_range;
5270 int range_source;
5271 if (i2 == n2 ||
5272 (i1 < n1 && first_set->at(i1).from() < second_set->at(i2).from())) {
5273 // Next smallest element is in first set.
5274 next_range = first_set->at(i1++);
5275 range_source = kInsideFirst;
5276 } else {
5277 // Next smallest element is in second set.
5278 next_range = second_set->at(i2++);
5279 range_source = kInsideSecond;
5280 }
5281 if (to < next_range.from()) {
5282 // Ranges disjoint: |current| |next|
5283 AddRangeToSelectedSet(state,
5284 first_set_only_out,
5285 second_set_only_out,
5286 both_sets_out,
5287 CharacterRange(from, to));
5288 from = next_range.from();
5289 to = next_range.to();
5290 state = range_source;
5291 } else {
5292 if (from < next_range.from()) {
5293 AddRangeToSelectedSet(state,
5294 first_set_only_out,
5295 second_set_only_out,
5296 both_sets_out,
5297 CharacterRange(from, next_range.from()-1));
5298 }
5299 if (to < next_range.to()) {
5300 // Ranges overlap: |current|
5301 // |next|
5302 AddRangeToSelectedSet(state | range_source,
5303 first_set_only_out,
5304 second_set_only_out,
5305 both_sets_out,
5306 CharacterRange(next_range.from(), to));
5307 from = to + 1;
5308 to = next_range.to();
5309 state = range_source;
5310 } else {
5311 // Range included: |current| , possibly ending at same character.
5312 // |next|
5313 AddRangeToSelectedSet(
5314 state | range_source,
5315 first_set_only_out,
5316 second_set_only_out,
5317 both_sets_out,
5318 CharacterRange(next_range.from(), next_range.to()));
5319 from = next_range.to() + 1;
5320 // If ranges end at same character, both ranges are consumed completely.
5321 if (next_range.to() == to) state = kInsideNone;
5322 }
5323 }
5324 }
5325 AddRangeToSelectedSet(state,
5326 first_set_only_out,
5327 second_set_only_out,
5328 both_sets_out,
5329 CharacterRange(from, to));
5330 }
5331
5332
5333 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges, 5161 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
5334 ZoneList<CharacterRange>* negated_ranges) { 5162 ZoneList<CharacterRange>* negated_ranges) {
5335 ASSERT(CharacterRange::IsCanonical(ranges)); 5163 ASSERT(CharacterRange::IsCanonical(ranges));
5336 ASSERT_EQ(0, negated_ranges->length()); 5164 ASSERT_EQ(0, negated_ranges->length());
5337 int range_count = ranges->length(); 5165 int range_count = ranges->length();
5338 uc16 from = 0; 5166 uc16 from = 0;
5339 int i = 0; 5167 int i = 0;
5340 if (range_count > 0 && ranges->at(0).from() == 0) { 5168 if (range_count > 0 && ranges->at(0).from() == 0) {
5341 from = ranges->at(0).to(); 5169 from = ranges->at(0).to();
5342 i = 1; 5170 i = 1;
(...skipping 292 matching lines...) Expand 10 before | Expand all | Expand 10 after
5635 } 5463 }
5636 5464
5637 5465
5638 void Analysis::VisitBackReference(BackReferenceNode* that) { 5466 void Analysis::VisitBackReference(BackReferenceNode* that) {
5639 EnsureAnalyzed(that->on_success()); 5467 EnsureAnalyzed(that->on_success());
5640 } 5468 }
5641 5469
5642 5470
5643 void Analysis::VisitAssertion(AssertionNode* that) { 5471 void Analysis::VisitAssertion(AssertionNode* that) {
5644 EnsureAnalyzed(that->on_success()); 5472 EnsureAnalyzed(that->on_success());
5645 AssertionNode::AssertionNodeType type = that->type();
5646 if (type == AssertionNode::AT_BOUNDARY ||
5647 type == AssertionNode::AT_NON_BOUNDARY) {
5648 // Check if the following character is known to be a word character
5649 // or known to not be a word character.
5650 ZoneList<CharacterRange>* following_chars = that->FirstCharacterSet();
5651
5652 CharacterRange::Canonicalize(following_chars);
5653
5654 SetRelation word_relation =
5655 CharacterRange::WordCharacterRelation(following_chars);
5656 if (word_relation.Disjoint()) {
5657 // Includes the case where following_chars is empty (e.g., end-of-input).
5658 // Following character is definitely *not* a word character.
5659 type = (type == AssertionNode::AT_BOUNDARY) ?
5660 AssertionNode::AFTER_WORD_CHARACTER :
5661 AssertionNode::AFTER_NONWORD_CHARACTER;
5662 that->set_type(type);
5663 } else if (word_relation.ContainedIn()) {
5664 // Following character is definitely a word character.
5665 type = (type == AssertionNode::AT_BOUNDARY) ?
5666 AssertionNode::AFTER_NONWORD_CHARACTER :
5667 AssertionNode::AFTER_WORD_CHARACTER;
5668 that->set_type(type);
5669 }
5670 }
5671 } 5473 }
5672 5474
5673 5475
5674 ZoneList<CharacterRange>* RegExpNode::FirstCharacterSet() { 5476 void BackReferenceNode::FillInBMInfo(
5675 if (first_character_set_ == NULL) { 5477 int offset, BoyerMooreLookahead* bm, bool not_at_start) {
5676 if (ComputeFirstCharacterSet(kFirstCharBudget) < 0) { 5478 // Working out the set of characters that a backreference can match is too
5677 // If we can't find an exact solution within the budget, we 5479 // hard, so we just say that any character can match.
5678 // set the value to the set of every character, i.e., all characters 5480 bm->SetRest(offset);
5679 // are possible. 5481 if (offset == 0) set_bm_info(not_at_start, bm);
5680 ZoneList<CharacterRange>* all_set = new ZoneList<CharacterRange>(1);
5681 all_set->Add(CharacterRange::Everything());
5682 first_character_set_ = all_set;
5683 }
5684 }
5685 return first_character_set_;
5686 } 5482 }
5687 5483
5688 5484
5689 int RegExpNode::ComputeFirstCharacterSet(int budget) { 5485 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
5690 // Default behavior is to not be able to determine the first character. 5486 RegExpMacroAssembler::kTableSize);
5691 return kComputeFirstCharacterSetFail;
5692 }
5693
5694
5695 int LoopChoiceNode::ComputeFirstCharacterSet(int budget) {
5696 budget--;
5697 if (budget >= 0) {
5698 // Find loop min-iteration. It's the value of the guarded choice node
5699 // with a GEQ guard, if any.
5700 int min_repetition = 0;
5701
5702 for (int i = 0; i <= 1; i++) {
5703 GuardedAlternative alternative = alternatives()->at(i);
5704 ZoneList<Guard*>* guards = alternative.guards();
5705 if (guards != NULL && guards->length() > 0) {
5706 Guard* guard = guards->at(0);
5707 if (guard->op() == Guard::GEQ) {
5708 min_repetition = guard->value();
5709 break;
5710 }
5711 }
5712 }
5713
5714 budget = loop_node()->ComputeFirstCharacterSet(budget);
5715 if (budget >= 0) {
5716 ZoneList<CharacterRange>* character_set =
5717 loop_node()->first_character_set();
5718 if (body_can_be_zero_length() || min_repetition == 0) {
5719 budget = continue_node()->ComputeFirstCharacterSet(budget);
5720 if (budget < 0) return budget;
5721 ZoneList<CharacterRange>* body_set =
5722 continue_node()->first_character_set();
5723 ZoneList<CharacterRange>* union_set =
5724 new ZoneList<CharacterRange>(Max(character_set->length(),
5725 body_set->length()));
5726 CharacterRange::Merge(character_set,
5727 body_set,
5728 union_set,
5729 union_set,
5730 union_set);
5731 character_set = union_set;
5732 }
5733 set_first_character_set(character_set);
5734 }
5735 }
5736 return budget;
5737 }
5738
5739
5740 int NegativeLookaheadChoiceNode::ComputeFirstCharacterSet(int budget) {
5741 budget--;
5742 if (budget >= 0) {
5743 GuardedAlternative successor = this->alternatives()->at(1);
5744 RegExpNode* successor_node = successor.node();
5745 budget = successor_node->ComputeFirstCharacterSet(budget);
5746 if (budget >= 0) {
5747 set_first_character_set(successor_node->first_character_set());
5748 }
5749 }
5750 return budget;
5751 }
5752
5753
5754 // The first character set of an EndNode is unknowable. Just use the
5755 // default implementation that fails and returns all characters as possible.
5756
5757
5758 int AssertionNode::ComputeFirstCharacterSet(int budget) {
5759 budget -= 1;
5760 if (budget >= 0) {
5761 switch (type_) {
5762 case AT_END: {
5763 set_first_character_set(new ZoneList<CharacterRange>(0));
5764 break;
5765 }
5766 case AT_START:
5767 case AT_BOUNDARY:
5768 case AT_NON_BOUNDARY:
5769 case AFTER_NEWLINE:
5770 case AFTER_NONWORD_CHARACTER:
5771 case AFTER_WORD_CHARACTER: {
5772 ASSERT_NOT_NULL(on_success());
5773 budget = on_success()->ComputeFirstCharacterSet(budget);
5774 if (budget >= 0) {
5775 set_first_character_set(on_success()->first_character_set());
5776 }
5777 break;
5778 }
5779 }
5780 }
5781 return budget;
5782 }
5783
5784
5785 int ActionNode::ComputeFirstCharacterSet(int budget) {
5786 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return kComputeFirstCharacterSetFail;
5787 budget--;
5788 if (budget >= 0) {
5789 ASSERT_NOT_NULL(on_success());
5790 budget = on_success()->ComputeFirstCharacterSet(budget);
5791 if (budget >= 0) {
5792 set_first_character_set(on_success()->first_character_set());
5793 }
5794 }
5795 return budget;
5796 }
5797
5798
5799 int BackReferenceNode::ComputeFirstCharacterSet(int budget) {
5800 // We don't know anything about the first character of a backreference
5801 // at this point.
5802 // The potential first characters are the first characters of the capture,
5803 // and the first characters of the on_success node, depending on whether the
5804 // capture can be empty and whether it is known to be participating or known
5805 // not to be.
5806 return kComputeFirstCharacterSetFail;
5807 }
5808 5487
5809 5488
5810 void ChoiceNode::FillInBMInfo( 5489 void ChoiceNode::FillInBMInfo(
5811 int offset, BoyerMooreLookahead* bm, bool not_at_start) { 5490 int offset, BoyerMooreLookahead* bm, bool not_at_start) {
5812 ZoneList<GuardedAlternative>* alts = alternatives(); 5491 ZoneList<GuardedAlternative>* alts = alternatives();
5813 for (int i = 0; i < alts->length(); i++) { 5492 for (int i = 0; i < alts->length(); i++) {
5814 GuardedAlternative& alt = alts->at(i); 5493 GuardedAlternative& alt = alts->at(i);
5815 if (alt.guards() != NULL && alt.guards()->length() != 0) { 5494 if (alt.guards() != NULL && alt.guards()->length() != 0) {
5816 bm->SetRest(offset); // Give up trying to fill in info. 5495 bm->SetRest(offset); // Give up trying to fill in info.
5496 if (offset == 0) set_bm_info(not_at_start, bm);
5817 return; 5497 return;
5818 } 5498 }
5819 alt.node()->FillInBMInfo(offset, bm, not_at_start); 5499 alt.node()->FillInBMInfo(offset, bm, not_at_start);
5820 } 5500 }
5501 if (offset == 0) set_bm_info(not_at_start, bm);
5821 } 5502 }
5822 5503
5823 5504
5824 void TextNode::FillInBMInfo( 5505 void TextNode::FillInBMInfo(
5825 int offset, BoyerMooreLookahead* bm, bool not_at_start) { 5506 int initial_offset, BoyerMooreLookahead* bm, bool not_at_start) {
5826 if (offset >= bm->length()) return; 5507 if (initial_offset >= bm->length()) return;
5508 int offset = initial_offset;
5827 int max_char = bm->max_char(); 5509 int max_char = bm->max_char();
5828 for (int i = 0; i < elements()->length(); i++) { 5510 for (int i = 0; i < elements()->length(); i++) {
5829 if (offset >= bm->length()) return; 5511 if (offset >= bm->length()) {
5512 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5513 return;
5514 }
5830 TextElement text = elements()->at(i); 5515 TextElement text = elements()->at(i);
5831 if (text.type == TextElement::ATOM) { 5516 if (text.type == TextElement::ATOM) {
5832 RegExpAtom* atom = text.data.u_atom; 5517 RegExpAtom* atom = text.data.u_atom;
5833 for (int j = 0; j < atom->length(); j++, offset++) { 5518 for (int j = 0; j < atom->length(); j++, offset++) {
5834 if (offset >= bm->length()) return; 5519 if (offset >= bm->length()) {
5520 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5521 return;
5522 }
5835 uc16 character = atom->data()[j]; 5523 uc16 character = atom->data()[j];
5836 if (bm->compiler()->ignore_case()) { 5524 if (bm->compiler()->ignore_case()) {
5837 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 5525 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5838 int length = GetCaseIndependentLetters( 5526 int length = GetCaseIndependentLetters(
5839 ISOLATE, 5527 ISOLATE,
5840 character, 5528 character,
5841 bm->max_char() == String::kMaxAsciiCharCode, 5529 bm->max_char() == String::kMaxAsciiCharCode,
5842 chars); 5530 chars);
5843 for (int j = 0; j < length; j++) { 5531 for (int j = 0; j < length; j++) {
5844 bm->Set(offset, chars[j]); 5532 bm->Set(offset, chars[j]);
5845 } 5533 }
5846 } else { 5534 } else {
5847 if (character <= max_char) bm->Set(offset, character); 5535 if (character <= max_char) bm->Set(offset, character);
5848 } 5536 }
5849 } 5537 }
5850 } else { 5538 } else {
5851 ASSERT(text.type == TextElement::CHAR_CLASS); 5539 ASSERT(text.type == TextElement::CHAR_CLASS);
5852 RegExpCharacterClass* char_class = text.data.u_char_class; 5540 RegExpCharacterClass* char_class = text.data.u_char_class;
5853 ZoneList<CharacterRange>* ranges = char_class->ranges(); 5541 ZoneList<CharacterRange>* ranges = char_class->ranges();
5854 if (char_class->is_negated()) { 5542 if (char_class->is_negated()) {
5855 bm->SetAll(offset); 5543 bm->SetAll(offset);
5856 } else { 5544 } else {
5857 for (int k = 0; k < ranges->length(); k++) { 5545 for (int k = 0; k < ranges->length(); k++) {
5858 CharacterRange& range = ranges->at(k); 5546 CharacterRange& range = ranges->at(k);
5859 if (range.from() > max_char) continue; 5547 if (range.from() > max_char) continue;
5860 int to = Min(max_char, static_cast<int>(range.to())); 5548 int to = Min(max_char, static_cast<int>(range.to()));
5861 if (to - range.from() >= BoyerMooreLookahead::kTooManyCharacters) { 5549 bm->SetInterval(offset, Interval(range.from(), to));
5862 bm->SetAll(offset);
5863 break;
5864 }
5865 for (int m = range.from(); m <= to; m++) {
5866 bm->Set(offset, m);
5867 }
5868 } 5550 }
5869 } 5551 }
5870 offset++; 5552 offset++;
5871 } 5553 }
5872 } 5554 }
5873 if (offset >= bm->length()) return; 5555 if (offset >= bm->length()) {
5556 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5557 return;
5558 }
5874 on_success()->FillInBMInfo(offset, 5559 on_success()->FillInBMInfo(offset,
5875 bm, 5560 bm,
5876 true); // Not at start after a text node. 5561 true); // Not at start after a text node.
5562 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5877 } 5563 }
5878 5564
5879 5565
5880 int TextNode::ComputeFirstCharacterSet(int budget) {
5881 budget--;
5882 if (budget >= 0) {
5883 ASSERT_NE(0, elements()->length());
5884 TextElement text = elements()->at(0);
5885 if (text.type == TextElement::ATOM) {
5886 RegExpAtom* atom = text.data.u_atom;
5887 ASSERT_NE(0, atom->length());
5888 uc16 first_char = atom->data()[0];
5889 ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1);
5890 range->Add(CharacterRange(first_char, first_char));
5891 set_first_character_set(range);
5892 } else {
5893 ASSERT(text.type == TextElement::CHAR_CLASS);
5894 RegExpCharacterClass* char_class = text.data.u_char_class;
5895 ZoneList<CharacterRange>* ranges = char_class->ranges();
5896 // TODO(lrn): Canonicalize ranges when they are created
5897 // instead of waiting until now.
5898 CharacterRange::Canonicalize(ranges);
5899 if (char_class->is_negated()) {
5900 int length = ranges->length();
5901 int new_length = length + 1;
5902 if (length > 0) {
5903 if (ranges->at(0).from() == 0) new_length--;
5904 if (ranges->at(length - 1).to() == String::kMaxUtf16CodeUnit) {
5905 new_length--;
5906 }
5907 }
5908 ZoneList<CharacterRange>* negated_ranges =
5909 new ZoneList<CharacterRange>(new_length);
5910 CharacterRange::Negate(ranges, negated_ranges);
5911 set_first_character_set(negated_ranges);
5912 } else {
5913 set_first_character_set(ranges);
5914 }
5915 }
5916 }
5917 return budget;
5918 }
5919
5920
5921
5922 // ------------------------------------------------------------------- 5566 // -------------------------------------------------------------------
5923 // Dispatch table construction 5567 // Dispatch table construction
5924 5568
5925 5569
5926 void DispatchTableConstructor::VisitEnd(EndNode* that) { 5570 void DispatchTableConstructor::VisitEnd(EndNode* that) {
5927 AddRange(CharacterRange::Everything()); 5571 AddRange(CharacterRange::Everything());
5928 } 5572 }
5929 5573
5930 5574
5931 void DispatchTableConstructor::BuildTable(ChoiceNode* node) { 5575 void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
(...skipping 201 matching lines...) Expand 10 before | Expand all | Expand 10 after
6133 } 5777 }
6134 5778
6135 return compiler.Assemble(&macro_assembler, 5779 return compiler.Assemble(&macro_assembler,
6136 node, 5780 node,
6137 data->capture_count, 5781 data->capture_count,
6138 pattern); 5782 pattern);
6139 } 5783 }
6140 5784
6141 5785
6142 }} // namespace v8::internal 5786 }} // namespace v8::internal
OLDNEW
« src/jsregexp.h ('K') | « src/jsregexp.h ('k') | test/cctest/test-regexp.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698