Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(206)

Side by Side Diff: src/jsregexp.cc

Issue 10105026: Version 3.10.3 (Closed) Base URL: http://v8.googlecode.com/svn/trunk/
Patch Set: Created 8 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « src/jsregexp.h ('k') | src/macros.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2011 the V8 project authors. All rights reserved. 1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 90 matching lines...) Expand 10 before | Expand all | Expand 10 after
101 Factory* factory = isolate->factory(); 101 Factory* factory = isolate->factory();
102 Handle<FixedArray> elements = factory->NewFixedArray(2); 102 Handle<FixedArray> elements = factory->NewFixedArray(2);
103 elements->set(0, *pattern); 103 elements->set(0, *pattern);
104 elements->set(1, *error_text); 104 elements->set(1, *error_text);
105 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); 105 Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
106 Handle<Object> regexp_err = factory->NewSyntaxError(message, array); 106 Handle<Object> regexp_err = factory->NewSyntaxError(message, array);
107 isolate->Throw(*regexp_err); 107 isolate->Throw(*regexp_err);
108 } 108 }
109 109
110 110
111 ContainedInLattice AddRange(ContainedInLattice containment,
112 const int* ranges,
113 int ranges_length,
114 Interval new_range) {
115 ASSERT((ranges_length & 1) == 1);
116 ASSERT(ranges[ranges_length - 1] == String::kMaxUtf16CodeUnit + 1);
117 if (containment == kLatticeUnknown) return containment;
118 bool inside = false;
119 int last = 0;
120 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
121 // Consider the range from last to ranges[i].
122 // We haven't got to the new range yet.
123 if (ranges[i] <= new_range.from()) continue;
124 // New range is wholly inside last-ranges[i]. Note that new_range.to() is
125 // inclusive, but the values in ranges are not.
126 if (last <= new_range.from() && new_range.to() < ranges[i]) {
127 return Combine(containment, inside ? kLatticeIn : kLatticeOut);
128 }
129 return kLatticeUnknown;
130 }
131 return containment;
132 }
133
134
111 // More makes code generation slower, less makes V8 benchmark score lower. 135 // More makes code generation slower, less makes V8 benchmark score lower.
112 const int kMaxLookaheadForBoyerMoore = 8; 136 const int kMaxLookaheadForBoyerMoore = 8;
113 // In a 3-character pattern you can maximally step forwards 3 characters 137 // In a 3-character pattern you can maximally step forwards 3 characters
114 // at a time, which is not always enough to pay for the extra logic. 138 // at a time, which is not always enough to pay for the extra logic.
115 const int kPatternTooShortForBoyerMoore = 2; 139 const int kPatternTooShortForBoyerMoore = 2;
116 140
117 141
118 // Identifies the sort of regexps where the regexp engine is faster 142 // Identifies the sort of regexps where the regexp engine is faster
119 // than the code used for atom matches. 143 // than the code used for atom matches.
120 static bool HasFewDifferentCharacters(Handle<String> pattern) { 144 static bool HasFewDifferentCharacters(Handle<String> pattern) {
(...skipping 2029 matching lines...) Expand 10 before | Expand all | Expand 10 after
2150 2174
2151 2175
2152 void ActionNode::FillInBMInfo(int offset, 2176 void ActionNode::FillInBMInfo(int offset,
2153 BoyerMooreLookahead* bm, 2177 BoyerMooreLookahead* bm,
2154 bool not_at_start) { 2178 bool not_at_start) {
2155 if (type_ == BEGIN_SUBMATCH) { 2179 if (type_ == BEGIN_SUBMATCH) {
2156 bm->SetRest(offset); 2180 bm->SetRest(offset);
2157 } else if (type_ != POSITIVE_SUBMATCH_SUCCESS) { 2181 } else if (type_ != POSITIVE_SUBMATCH_SUCCESS) {
2158 on_success()->FillInBMInfo(offset, bm, not_at_start); 2182 on_success()->FillInBMInfo(offset, bm, not_at_start);
2159 } 2183 }
2184 SaveBMInfo(bm, not_at_start, offset);
2160 } 2185 }
2161 2186
2162 2187
2163 int AssertionNode::EatsAtLeast(int still_to_find, 2188 int AssertionNode::EatsAtLeast(int still_to_find,
2164 int recursion_depth, 2189 int recursion_depth,
2165 bool not_at_start) { 2190 bool not_at_start) {
2166 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; 2191 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
2167 // If we know we are not at the start and we are asked "how many characters 2192 // If we know we are not at the start and we are asked "how many characters
2168 // will you match if you succeed?" then we can answer anything since false 2193 // will you match if you succeed?" then we can answer anything since false
2169 // implies false. So lets just return the max answer (still_to_find) since 2194 // implies false. So lets just return the max answer (still_to_find) since
2170 // that won't prevent us from preloading a lot of characters for the other 2195 // that won't prevent us from preloading a lot of characters for the other
2171 // branches in the node graph. 2196 // branches in the node graph.
2172 if (type() == AT_START && not_at_start) return still_to_find; 2197 if (type() == AT_START && not_at_start) return still_to_find;
2173 return on_success()->EatsAtLeast(still_to_find, 2198 return on_success()->EatsAtLeast(still_to_find,
2174 recursion_depth + 1, 2199 recursion_depth + 1,
2175 not_at_start); 2200 not_at_start);
2176 } 2201 }
2177 2202
2178 2203
2179 void AssertionNode::FillInBMInfo( 2204 void AssertionNode::FillInBMInfo(
2180 int offset, BoyerMooreLookahead* bm, bool not_at_start) { 2205 int offset, BoyerMooreLookahead* bm, bool not_at_start) {
2181 // Match the behaviour of EatsAtLeast on this node. 2206 // Match the behaviour of EatsAtLeast on this node.
2182 if (type() == AT_START && not_at_start) return; 2207 if (type() == AT_START && not_at_start) return;
2183 on_success()->FillInBMInfo(offset, bm, not_at_start); 2208 on_success()->FillInBMInfo(offset, bm, not_at_start);
2209 SaveBMInfo(bm, not_at_start, offset);
2184 } 2210 }
2185 2211
2186 2212
2187 int BackReferenceNode::EatsAtLeast(int still_to_find, 2213 int BackReferenceNode::EatsAtLeast(int still_to_find,
2188 int recursion_depth, 2214 int recursion_depth,
2189 bool not_at_start) { 2215 bool not_at_start) {
2190 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; 2216 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
2191 return on_success()->EatsAtLeast(still_to_find, 2217 return on_success()->EatsAtLeast(still_to_find,
2192 recursion_depth + 1, 2218 recursion_depth + 1,
2193 not_at_start); 2219 not_at_start);
(...skipping 416 matching lines...) Expand 10 before | Expand all | Expand 10 after
2610 if (body_can_be_zero_length_ || info()->visited) return; 2636 if (body_can_be_zero_length_ || info()->visited) return;
2611 VisitMarker marker(info()); 2637 VisitMarker marker(info());
2612 return ChoiceNode::GetQuickCheckDetails(details, 2638 return ChoiceNode::GetQuickCheckDetails(details,
2613 compiler, 2639 compiler,
2614 characters_filled_in, 2640 characters_filled_in,
2615 not_at_start); 2641 not_at_start);
2616 } 2642 }
2617 2643
2618 2644
2619 void LoopChoiceNode::FillInBMInfo( 2645 void LoopChoiceNode::FillInBMInfo(
2620 int offset, BoyerMooreLookahead* bm, bool nas) { 2646 int offset, BoyerMooreLookahead* bm, bool not_at_start) {
2621 if (body_can_be_zero_length_) { 2647 if (body_can_be_zero_length_) {
2622 bm->SetRest(offset); 2648 bm->SetRest(offset);
2649 SaveBMInfo(bm, not_at_start, offset);
2623 return; 2650 return;
2624 } 2651 }
2625 ChoiceNode::FillInBMInfo(offset, bm, nas); 2652 ChoiceNode::FillInBMInfo(offset, bm, not_at_start);
2653 SaveBMInfo(bm, not_at_start, offset);
2626 } 2654 }
2627 2655
2628 2656
2629 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2657 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2630 RegExpCompiler* compiler, 2658 RegExpCompiler* compiler,
2631 int characters_filled_in, 2659 int characters_filled_in,
2632 bool not_at_start) { 2660 bool not_at_start) {
2633 not_at_start = (not_at_start || not_at_start_); 2661 not_at_start = (not_at_start || not_at_start_);
2634 int choice_count = alternatives_->length(); 2662 int choice_count = alternatives_->length();
2635 ASSERT(choice_count > 0); 2663 ASSERT(choice_count > 0);
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after
2703 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok); 2731 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
2704 } 2732 }
2705 assembler->CheckCharacter('\n', &ok); 2733 assembler->CheckCharacter('\n', &ok);
2706 assembler->CheckNotCharacter('\r', new_trace.backtrack()); 2734 assembler->CheckNotCharacter('\r', new_trace.backtrack());
2707 } 2735 }
2708 assembler->Bind(&ok); 2736 assembler->Bind(&ok);
2709 on_success->Emit(compiler, &new_trace); 2737 on_success->Emit(compiler, &new_trace);
2710 } 2738 }
2711 2739
2712 2740
2713 // Emit the code to handle \b and \B (word-boundary or non-word-boundary)
2714 // when we know whether the next character must be a word character or not.
2715 static void EmitHalfBoundaryCheck(AssertionNode::AssertionNodeType type,
2716 RegExpCompiler* compiler,
2717 RegExpNode* on_success,
2718 Trace* trace) {
2719 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2720 Label done;
2721
2722 Trace new_trace(*trace);
2723
2724 bool expect_word_character = (type == AssertionNode::AFTER_WORD_CHARACTER);
2725 Label* on_word = expect_word_character ? &done : new_trace.backtrack();
2726 Label* on_non_word = expect_word_character ? new_trace.backtrack() : &done;
2727
2728 // Check whether previous character was a word character.
2729 switch (trace->at_start()) {
2730 case Trace::TRUE:
2731 if (expect_word_character) {
2732 assembler->GoTo(on_non_word);
2733 }
2734 break;
2735 case Trace::UNKNOWN:
2736 ASSERT_EQ(0, trace->cp_offset());
2737 assembler->CheckAtStart(on_non_word);
2738 // Fall through.
2739 case Trace::FALSE:
2740 int prev_char_offset = trace->cp_offset() - 1;
2741 assembler->LoadCurrentCharacter(prev_char_offset, NULL, false, 1);
2742 EmitWordCheck(assembler, on_word, on_non_word, expect_word_character);
2743 // We may or may not have loaded the previous character.
2744 new_trace.InvalidateCurrentCharacter();
2745 }
2746
2747 assembler->Bind(&done);
2748
2749 on_success->Emit(compiler, &new_trace);
2750 }
2751
2752
2753 // Emit the code to handle \b and \B (word-boundary or non-word-boundary). 2741 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
2754 static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type, 2742 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
2755 RegExpCompiler* compiler,
2756 RegExpNode* on_success,
2757 Trace* trace) {
2758 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2743 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2759 Label before_non_word; 2744 Trace::TriBool next_is_word_character = Trace::UNKNOWN;
2760 Label before_word; 2745 bool not_at_start = (trace->at_start() == Trace::FALSE);
2761 if (trace->characters_preloaded() != 1) { 2746 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
2762 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word); 2747 if (lookahead == NULL) {
2748 int eats_at_least =
2749 Min(kMaxLookaheadForBoyerMoore,
2750 EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start));
2751 if (eats_at_least >= 1) {
2752 BoyerMooreLookahead* bm =
2753 new BoyerMooreLookahead(eats_at_least, compiler);
2754 FillInBMInfo(0, bm, not_at_start);
2755 if (bm->at(0)->is_non_word()) next_is_word_character = Trace::FALSE;
2756 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE;
2757 }
2758 } else {
2759 if (lookahead->at(0)->is_non_word()) next_is_word_character = Trace::FALSE;
2760 if (lookahead->at(0)->is_word()) next_is_word_character = Trace::TRUE;
2763 } 2761 }
2764 // Fall through on non-word. 2762 bool at_boundary = (type_ == AssertionNode::AT_BOUNDARY);
2765 EmitWordCheck(assembler, &before_word, &before_non_word, false); 2763 if (next_is_word_character == Trace::UNKNOWN) {
2764 Label before_non_word;
2765 Label before_word;
2766 if (trace->characters_preloaded() != 1) {
2767 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
2768 }
2769 // Fall through on non-word.
2770 EmitWordCheck(assembler, &before_word, &before_non_word, false);
2771 // Next character is not a word character.
2772 assembler->Bind(&before_non_word);
2773 Label ok;
2774 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2775 assembler->GoTo(&ok);
2766 2776
2767 // We will be loading the previous character into the current character 2777 assembler->Bind(&before_word);
2768 // register. 2778 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2779 assembler->Bind(&ok);
2780 } else if (next_is_word_character == Trace::TRUE) {
2781 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2782 } else {
2783 ASSERT(next_is_word_character == Trace::FALSE);
2784 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2785 }
2786 }
2787
2788
2789 void AssertionNode::BacktrackIfPrevious(
2790 RegExpCompiler* compiler,
2791 Trace* trace,
2792 AssertionNode::IfPrevious backtrack_if_previous) {
2793 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2769 Trace new_trace(*trace); 2794 Trace new_trace(*trace);
2770 new_trace.InvalidateCurrentCharacter(); 2795 new_trace.InvalidateCurrentCharacter();
2771 2796
2772 Label ok; 2797 Label fall_through, dummy;
2773 Label* boundary;
2774 Label* not_boundary;
2775 if (type == AssertionNode::AT_BOUNDARY) {
2776 boundary = &ok;
2777 not_boundary = new_trace.backtrack();
2778 } else {
2779 not_boundary = &ok;
2780 boundary = new_trace.backtrack();
2781 }
2782 2798
2783 // Next character is not a word character. 2799 Label* non_word = backtrack_if_previous == kIsNonWord ?
2784 assembler->Bind(&before_non_word); 2800 new_trace.backtrack() :
2801 &fall_through;
2802 Label* word = backtrack_if_previous == kIsNonWord ?
2803 &fall_through :
2804 new_trace.backtrack();
2805
2785 if (new_trace.cp_offset() == 0) { 2806 if (new_trace.cp_offset() == 0) {
2786 // The start of input counts as a non-word character, so the question is 2807 // The start of input counts as a non-word character, so the question is
2787 // decided if we are at the start. 2808 // decided if we are at the start.
2788 assembler->CheckAtStart(not_boundary); 2809 assembler->CheckAtStart(non_word);
2789 } 2810 }
2790 // We already checked that we are not at the start of input so it must be 2811 // We already checked that we are not at the start of input so it must be
2791 // OK to load the previous character. 2812 // OK to load the previous character.
2792 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, 2813 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
2793 &ok, // Unused dummy label in this call. 2814 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
2794 false);
2795 // Fall through on non-word.
2796 EmitWordCheck(assembler, boundary, not_boundary, false);
2797 assembler->GoTo(not_boundary);
2798 2815
2799 // Next character is a word character. 2816 assembler->Bind(&fall_through);
2800 assembler->Bind(&before_word); 2817 on_success()->Emit(compiler, &new_trace);
2801 if (new_trace.cp_offset() == 0) {
2802 // The start of input counts as a non-word character, so the question is
2803 // decided if we are at the start.
2804 assembler->CheckAtStart(boundary);
2805 }
2806 // We already checked that we are not at the start of input so it must be
2807 // OK to load the previous character.
2808 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2809 &ok, // Unused dummy label in this call.
2810 false);
2811 bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY);
2812 EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word);
2813
2814 assembler->Bind(&ok);
2815
2816 on_success->Emit(compiler, &new_trace);
2817 } 2818 }
2818 2819
2819 2820
2820 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details, 2821 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
2821 RegExpCompiler* compiler, 2822 RegExpCompiler* compiler,
2822 int filled_in, 2823 int filled_in,
2823 bool not_at_start) { 2824 bool not_at_start) {
2824 if (type_ == AT_START && not_at_start) { 2825 if (type_ == AT_START && not_at_start) {
2825 details->set_cannot_match(); 2826 details->set_cannot_match();
2826 return; 2827 return;
(...skipping 27 matching lines...) Expand all
2854 on_success()->Emit(compiler, &at_start_trace); 2855 on_success()->Emit(compiler, &at_start_trace);
2855 return; 2856 return;
2856 } 2857 }
2857 } 2858 }
2858 break; 2859 break;
2859 case AFTER_NEWLINE: 2860 case AFTER_NEWLINE:
2860 EmitHat(compiler, on_success(), trace); 2861 EmitHat(compiler, on_success(), trace);
2861 return; 2862 return;
2862 case AT_BOUNDARY: 2863 case AT_BOUNDARY:
2863 case AT_NON_BOUNDARY: { 2864 case AT_NON_BOUNDARY: {
2864 EmitBoundaryCheck(type_, compiler, on_success(), trace); 2865 EmitBoundaryCheck(compiler, trace);
2865 return; 2866 return;
2866 } 2867 }
2867 case AFTER_WORD_CHARACTER:
2868 case AFTER_NONWORD_CHARACTER: {
2869 EmitHalfBoundaryCheck(type_, compiler, on_success(), trace);
2870 }
2871 } 2868 }
2872 on_success()->Emit(compiler, trace); 2869 on_success()->Emit(compiler, trace);
2873 } 2870 }
2874 2871
2875 2872
2876 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { 2873 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
2877 if (quick_check == NULL) return false; 2874 if (quick_check == NULL) return false;
2878 if (offset >= quick_check->characters()) return false; 2875 if (offset >= quick_check->characters()) return false;
2879 return quick_check->positions(offset)->determines_perfectly; 2876 return quick_check->positions(offset)->determines_perfectly;
2880 } 2877 }
(...skipping 389 matching lines...) Expand 10 before | Expand all | Expand 10 after
3270 return alt_gens_[i]; 3267 return alt_gens_[i];
3271 } 3268 }
3272 3269
3273 private: 3270 private:
3274 static const int kAFew = 10; 3271 static const int kAFew = 10;
3275 ZoneList<AlternativeGeneration*> alt_gens_; 3272 ZoneList<AlternativeGeneration*> alt_gens_;
3276 AlternativeGeneration a_few_alt_gens_[kAFew]; 3273 AlternativeGeneration a_few_alt_gens_[kAFew];
3277 }; 3274 };
3278 3275
3279 3276
3277 // The '2' variant is has inclusive from and exclusive to.
3278 static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0,
3279 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A,
3280 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, 0x10000 };
3281 static const int kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges);
3282
3283 static const int kWordRanges[] = {
3284 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 };
3285 static const int kWordRangeCount = ARRAY_SIZE(kWordRanges);
3286 static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 };
3287 static const int kDigitRangeCount = ARRAY_SIZE(kDigitRanges);
3288 static const int kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 };
3289 static const int kSurrogateRangeCount = ARRAY_SIZE(kSurrogateRanges);
3290 static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E,
3291 0x2028, 0x202A, 0x10000 };
3292 static const int kLineTerminatorRangeCount = ARRAY_SIZE(kLineTerminatorRanges);
3293
3294
3295 void BoyerMoorePositionInfo::Set(int character) {
3296 SetInterval(Interval(character, character));
3297 }
3298
3299
3300 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
3301 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
3302 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
3303 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
3304 surrogate_ =
3305 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
3306 if (interval.to() - interval.from() >= kMapSize - 1) {
3307 if (map_count_ != kMapSize) {
3308 map_count_ = kMapSize;
3309 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3310 }
3311 return;
3312 }
3313 for (int i = interval.from(); i <= interval.to(); i++) {
3314 int mod_character = (i & kMask);
3315 if (!map_->at(mod_character)) {
3316 map_count_++;
3317 map_->at(mod_character) = true;
3318 }
3319 if (map_count_ == kMapSize) return;
3320 }
3321 }
3322
3323
3324 void BoyerMoorePositionInfo::SetAll() {
3325 s_ = w_ = d_ = kLatticeUnknown;
3326 if (map_count_ != kMapSize) {
3327 map_count_ = kMapSize;
3328 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3329 }
3330 }
3331
3332
3280 BoyerMooreLookahead::BoyerMooreLookahead( 3333 BoyerMooreLookahead::BoyerMooreLookahead(
3281 int length, int map_length, RegExpCompiler* compiler) 3334 int length, RegExpCompiler* compiler)
3282 : length_(length), 3335 : length_(length),
3283 map_length_(map_length),
3284 compiler_(compiler) { 3336 compiler_(compiler) {
3285 ASSERT(IsPowerOf2(map_length));
3286 if (compiler->ascii()) { 3337 if (compiler->ascii()) {
3287 max_char_ = String::kMaxAsciiCharCode; 3338 max_char_ = String::kMaxAsciiCharCode;
3288 } else { 3339 } else {
3289 max_char_ = String::kMaxUtf16CodeUnit; 3340 max_char_ = String::kMaxUtf16CodeUnit;
3290 } 3341 }
3291 bitmaps_ = new ZoneList<ZoneList<bool>*>(length); 3342 bitmaps_ = new ZoneList<BoyerMoorePositionInfo*>(length);
3292 for (int i = 0; i < length; i++) { 3343 for (int i = 0; i < length; i++) {
3293 bitmaps_->Add(new ZoneList<bool>(map_length)); 3344 bitmaps_->Add(new BoyerMoorePositionInfo());
3294 ZoneList<bool>* map = bitmaps_->at(i);
3295 for (int i = 0; i < map_length; i++) {
3296 map->Add(false);
3297 }
3298 } 3345 }
3299 } 3346 }
3300 3347
3301 3348
3302 // Find the longest range of lookahead that has the fewest number of different 3349 // Find the longest range of lookahead that has the fewest number of different
3303 // characters that can occur at a given position. Since we are optimizing two 3350 // characters that can occur at a given position. Since we are optimizing two
3304 // different parameters at once this is a tradeoff. 3351 // different parameters at once this is a tradeoff.
3305 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) { 3352 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
3306 int biggest_points = 0; 3353 int biggest_points = 0;
3354 // If more than 32 characters out of 128 can occur it is unlikely that we can
3355 // be lucky enough to step forwards much of the time.
3356 const int kMaxMax = 32;
3307 for (int max_number_of_chars = 4; 3357 for (int max_number_of_chars = 4;
3308 max_number_of_chars < kTooManyCharacters; 3358 max_number_of_chars < kMaxMax;
3309 max_number_of_chars *= 2) { 3359 max_number_of_chars *= 2) {
3310 biggest_points = 3360 biggest_points =
3311 FindBestInterval(max_number_of_chars, biggest_points, from, to); 3361 FindBestInterval(max_number_of_chars, biggest_points, from, to);
3312 } 3362 }
3313 if (biggest_points == 0) return false; 3363 if (biggest_points == 0) return false;
3314 return true; 3364 return true;
3315 } 3365 }
3316 3366
3317 3367
3318 // Find the highest-points range between 0 and length_ where the character 3368 // Find the highest-points range between 0 and length_ where the character
3319 // information is not too vague. 'Too vague' means that there are more than 3369 // information is not too vague. 'Too vague' means that there are more than
3320 // max_number_of_chars that can occur at this position. Calculates the number 3370 // max_number_of_chars that can occur at this position. Calculates the number
3321 // of points as the product of width-of-the-range and 3371 // of points as the product of width-of-the-range and
3322 // probability-of-finding-one-of-the-characters, where the probability is 3372 // probability-of-finding-one-of-the-characters, where the probability is
3323 // calculated using the frequency distribution of the sample subject string. 3373 // calculated using the frequency distribution of the sample subject string.
3324 int BoyerMooreLookahead::FindBestInterval( 3374 int BoyerMooreLookahead::FindBestInterval(
3325 int max_number_of_chars, int old_biggest_points, int* from, int* to) { 3375 int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3326 int biggest_points = old_biggest_points; 3376 int biggest_points = old_biggest_points;
3327 static const int kSize = RegExpMacroAssembler::kTableSize; 3377 static const int kSize = RegExpMacroAssembler::kTableSize;
3328 for (int i = 0; i < length_; ) { 3378 for (int i = 0; i < length_; ) {
3329 while (i < length_ && Count(i) > max_number_of_chars) i++; 3379 while (i < length_ && Count(i) > max_number_of_chars) i++;
3330 if (i == length_) break; 3380 if (i == length_) break;
3331 int remembered_from = i; 3381 int remembered_from = i;
3332 bool union_map[kSize]; 3382 bool union_map[kSize];
3333 for (int j = 0; j < kSize; j++) union_map[j] = false; 3383 for (int j = 0; j < kSize; j++) union_map[j] = false;
3334 while (i < length_ && Count(i) <= max_number_of_chars) { 3384 while (i < length_ && Count(i) <= max_number_of_chars) {
3335 ZoneList<bool>* map = bitmaps_->at(i); 3385 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3336 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j); 3386 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3337 i++; 3387 i++;
3338 } 3388 }
3339 int frequency = 0; 3389 int frequency = 0;
3340 for (int j = 0; j < kSize; j++) { 3390 for (int j = 0; j < kSize; j++) {
3341 if (union_map[j]) { 3391 if (union_map[j]) {
3342 // Add 1 to the frequency to give a small per-character boost for 3392 // Add 1 to the frequency to give a small per-character boost for
3343 // the cases where our sampling is not good enough and many 3393 // the cases where our sampling is not good enough and many
3344 // characters have a frequency of zero. This means the frequency 3394 // characters have a frequency of zero. This means the frequency
3345 // can theoretically be up to 2*kSize though we treat it mostly as 3395 // can theoretically be up to 2*kSize though we treat it mostly as
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after
3380 3430
3381 const int kSkipArrayEntry = 0; 3431 const int kSkipArrayEntry = 0;
3382 const int kDontSkipArrayEntry = 1; 3432 const int kDontSkipArrayEntry = 1;
3383 3433
3384 for (int i = 0; i < kSize; i++) { 3434 for (int i = 0; i < kSize; i++) {
3385 boolean_skip_table->set(i, kSkipArrayEntry); 3435 boolean_skip_table->set(i, kSkipArrayEntry);
3386 } 3436 }
3387 int skip = max_lookahead + 1 - min_lookahead; 3437 int skip = max_lookahead + 1 - min_lookahead;
3388 3438
3389 for (int i = max_lookahead; i >= min_lookahead; i--) { 3439 for (int i = max_lookahead; i >= min_lookahead; i--) {
3390 ZoneList<bool>* map = bitmaps_->at(i); 3440 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3391 for (int j = 0; j < map_length_; j++) { 3441 for (int j = 0; j < kSize; j++) {
3392 if (map->at(j)) { 3442 if (map->at(j)) {
3393 boolean_skip_table->set(j, kDontSkipArrayEntry); 3443 boolean_skip_table->set(j, kDontSkipArrayEntry);
3394 } 3444 }
3395 } 3445 }
3396 } 3446 }
3397 3447
3398 return skip; 3448 return skip;
3399 } 3449 }
3400 3450
3401 3451
3402 // See comment above on the implementation of GetSkipTable. 3452 // See comment above on the implementation of GetSkipTable.
3403 bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { 3453 bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
3454 const int kSize = RegExpMacroAssembler::kTableSize;
3455
3404 int min_lookahead = 0; 3456 int min_lookahead = 0;
3405 int max_lookahead = 0; 3457 int max_lookahead = 0;
3406 3458
3407 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false; 3459 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false;
3408 3460
3409 bool found_single_character = false; 3461 bool found_single_character = false;
3410 bool abandoned_search_for_single_character = false;
3411 int single_character = 0; 3462 int single_character = 0;
3412 for (int i = max_lookahead; i >= min_lookahead; i--) { 3463 for (int i = max_lookahead; i >= min_lookahead; i--) {
3413 ZoneList<bool>* map = bitmaps_->at(i); 3464 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3414 for (int j = 0; j < map_length_; j++) { 3465 if (map->map_count() > 1 ||
3466 (found_single_character && map->map_count() != 0)) {
3467 found_single_character = false;
3468 break;
3469 }
3470 for (int j = 0; j < kSize; j++) {
3415 if (map->at(j)) { 3471 if (map->at(j)) {
3416 if (found_single_character) { 3472 found_single_character = true;
3417 found_single_character = false; // Found two. 3473 single_character = j;
3418 abandoned_search_for_single_character = true; 3474 break;
3419 break;
3420 } else {
3421 found_single_character = true;
3422 single_character = j;
3423 }
3424 } 3475 }
3425 } 3476 }
3426 if (abandoned_search_for_single_character) break;
3427 } 3477 }
3428 3478
3429 int lookahead_width = max_lookahead + 1 - min_lookahead; 3479 int lookahead_width = max_lookahead + 1 - min_lookahead;
3430 3480
3431 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) { 3481 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3432 // The mask-compare can probably handle this better. 3482 // The mask-compare can probably handle this better.
3433 return false; 3483 return false;
3434 } 3484 }
3435 3485
3436 if (found_single_character) { 3486 if (found_single_character) {
3437 Label cont, again; 3487 Label cont, again;
3438 masm->Bind(&again); 3488 masm->Bind(&again);
3439 masm->LoadCurrentCharacter(max_lookahead, &cont, true); 3489 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3440 if (max_char_ > map_length_) { 3490 if (max_char_ > kSize) {
3441 ASSERT(map_length_ == RegExpMacroAssembler::kTableSize);
3442 masm->CheckCharacterAfterAnd(single_character, 3491 masm->CheckCharacterAfterAnd(single_character,
3443 RegExpMacroAssembler::kTableMask, 3492 RegExpMacroAssembler::kTableMask,
3444 &cont); 3493 &cont);
3445 } else { 3494 } else {
3446 masm->CheckCharacter(single_character, &cont); 3495 masm->CheckCharacter(single_character, &cont);
3447 } 3496 }
3448 masm->AdvanceCurrentPosition(lookahead_width); 3497 masm->AdvanceCurrentPosition(lookahead_width);
3449 masm->GoTo(&again); 3498 masm->GoTo(&again);
3450 masm->Bind(&cont); 3499 masm->Bind(&cont);
3451 return true; 3500 return true;
3452 } 3501 }
3453 3502
3454 Handle<ByteArray> boolean_skip_table = 3503 Handle<ByteArray> boolean_skip_table =
3455 FACTORY->NewByteArray(map_length_, TENURED); 3504 FACTORY->NewByteArray(kSize, TENURED);
3456 int skip_distance = GetSkipTable( 3505 int skip_distance = GetSkipTable(
3457 min_lookahead, max_lookahead, boolean_skip_table); 3506 min_lookahead, max_lookahead, boolean_skip_table);
3458 ASSERT(skip_distance != 0); 3507 ASSERT(skip_distance != 0);
3459 3508
3460 Label cont, again; 3509 Label cont, again;
3461 masm->Bind(&again); 3510 masm->Bind(&again);
3462 masm->LoadCurrentCharacter(max_lookahead, &cont, true); 3511 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3463 masm->CheckBitInTable(boolean_skip_table, &cont); 3512 masm->CheckBitInTable(boolean_skip_table, &cont);
3464 masm->AdvanceCurrentPosition(skip_distance); 3513 masm->AdvanceCurrentPosition(skip_distance);
3465 masm->GoTo(&again); 3514 masm->GoTo(&again);
(...skipping 158 matching lines...) Expand 10 before | Expand all | Expand 10 after
3624 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) == 3673 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) ==
3625 this) { 3674 this) {
3626 // At this point we know that we are at a non-greedy loop that will eat 3675 // At this point we know that we are at a non-greedy loop that will eat
3627 // any character one at a time. Any non-anchored regexp has such a 3676 // any character one at a time. Any non-anchored regexp has such a
3628 // loop prepended to it in order to find where it starts. We look for 3677 // loop prepended to it in order to find where it starts. We look for
3629 // a pattern of the form ...abc... where we can look 6 characters ahead 3678 // a pattern of the form ...abc... where we can look 6 characters ahead
3630 // and step forwards 3 if the character is not one of abc. Abc need 3679 // and step forwards 3 if the character is not one of abc. Abc need
3631 // not be atoms, they can be any reasonably limited character class or 3680 // not be atoms, they can be any reasonably limited character class or
3632 // small alternation. 3681 // small alternation.
3633 ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes. 3682 ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes.
3634 eats_at_least = 3683 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
3635 Min(kMaxLookaheadForBoyerMoore, 3684 if (lookahead == NULL) {
3636 EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start)); 3685 eats_at_least =
3637 if (eats_at_least >= 1) { 3686 Min(kMaxLookaheadForBoyerMoore,
3638 BoyerMooreLookahead bm(eats_at_least, 3687 EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start));
3639 RegExpMacroAssembler::kTableSize, 3688 if (eats_at_least >= 1) {
3640 compiler); 3689 BoyerMooreLookahead* bm =
3641 GuardedAlternative alt0 = alternatives_->at(0); 3690 new BoyerMooreLookahead(eats_at_least, compiler);
3642 alt0.node()->FillInBMInfo(0, &bm, not_at_start); 3691 GuardedAlternative alt0 = alternatives_->at(0);
3643 skip_was_emitted = bm.EmitSkipInstructions(macro_assembler); 3692 alt0.node()->FillInBMInfo(0, bm, not_at_start);
3693 skip_was_emitted = bm->EmitSkipInstructions(macro_assembler);
3694 }
3695 } else {
3696 skip_was_emitted = lookahead->EmitSkipInstructions(macro_assembler);
3644 } 3697 }
3645 } 3698 }
3646 } 3699 }
3647 } 3700 }
3648 3701
3649 if (eats_at_least == kEatsAtLeastNotYetInitialized) { 3702 if (eats_at_least == kEatsAtLeastNotYetInitialized) {
3650 // Save some time by looking at most one machine word ahead. 3703 // Save some time by looking at most one machine word ahead.
3651 eats_at_least = EatsAtLeast(compiler->ascii() ? 4 : 2, 0, not_at_start); 3704 eats_at_least = EatsAtLeast(compiler->ascii() ? 4 : 2, 0, not_at_start);
3652 } 3705 }
3653 int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least); 3706 int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least);
(...skipping 542 matching lines...) Expand 10 before | Expand all | Expand 10 after
4196 break; 4249 break;
4197 case AssertionNode::AT_BOUNDARY: 4250 case AssertionNode::AT_BOUNDARY:
4198 stream()->Add("label=\"\\b\", shape=septagon"); 4251 stream()->Add("label=\"\\b\", shape=septagon");
4199 break; 4252 break;
4200 case AssertionNode::AT_NON_BOUNDARY: 4253 case AssertionNode::AT_NON_BOUNDARY:
4201 stream()->Add("label=\"\\B\", shape=septagon"); 4254 stream()->Add("label=\"\\B\", shape=septagon");
4202 break; 4255 break;
4203 case AssertionNode::AFTER_NEWLINE: 4256 case AssertionNode::AFTER_NEWLINE:
4204 stream()->Add("label=\"(?<=\\n)\", shape=septagon"); 4257 stream()->Add("label=\"(?<=\\n)\", shape=septagon");
4205 break; 4258 break;
4206 case AssertionNode::AFTER_WORD_CHARACTER:
4207 stream()->Add("label=\"(?<=\\w)\", shape=septagon");
4208 break;
4209 case AssertionNode::AFTER_NONWORD_CHARACTER:
4210 stream()->Add("label=\"(?<=\\W)\", shape=septagon");
4211 break;
4212 } 4259 }
4213 stream()->Add("];\n"); 4260 stream()->Add("];\n");
4214 PrintAttributes(that); 4261 PrintAttributes(that);
4215 RegExpNode* successor = that->on_success(); 4262 RegExpNode* successor = that->on_success();
4216 stream()->Add(" n%p -> n%p;\n", that, successor); 4263 stream()->Add(" n%p -> n%p;\n", that, successor);
4217 Visit(successor); 4264 Visit(successor);
4218 } 4265 }
4219 4266
4220 4267
4221 void DotPrinter::VisitAction(ActionNode* that) { 4268 void DotPrinter::VisitAction(ActionNode* that) {
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after
4306 printer.PrintNode(label, node); 4353 printer.PrintNode(label, node);
4307 } 4354 }
4308 4355
4309 4356
4310 #endif // DEBUG 4357 #endif // DEBUG
4311 4358
4312 4359
4313 // ------------------------------------------------------------------- 4360 // -------------------------------------------------------------------
4314 // Tree to graph conversion 4361 // Tree to graph conversion
4315 4362
4316 static const uc16 kSpaceRanges[] = { 0x0009, 0x000D, 0x0020, 0x0020, 0x00A0,
4317 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A, 0x2028, 0x2029,
4318 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000, 0xFEFF, 0xFEFF };
4319 static const int kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges);
4320
4321 static const uc16 kWordRanges[] = { '0', '9', 'A', 'Z', '_', '_', 'a', 'z' };
4322 static const int kWordRangeCount = ARRAY_SIZE(kWordRanges);
4323
4324 static const uc16 kDigitRanges[] = { '0', '9' };
4325 static const int kDigitRangeCount = ARRAY_SIZE(kDigitRanges);
4326
4327 static const uc16 kLineTerminatorRanges[] = { 0x000A, 0x000A, 0x000D, 0x000D,
4328 0x2028, 0x2029 };
4329 static const int kLineTerminatorRangeCount = ARRAY_SIZE(kLineTerminatorRanges);
4330
4331 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, 4363 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
4332 RegExpNode* on_success) { 4364 RegExpNode* on_success) {
4333 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1); 4365 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
4334 elms->Add(TextElement::Atom(this)); 4366 elms->Add(TextElement::Atom(this));
4335 return new TextNode(elms, on_success); 4367 return new TextNode(elms, on_success);
4336 } 4368 }
4337 4369
4338 4370
4339 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, 4371 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
4340 RegExpNode* on_success) { 4372 RegExpNode* on_success) {
4341 return new TextNode(elements(), on_success); 4373 return new TextNode(elements(), on_success);
4342 } 4374 }
4343 4375
4376
4344 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges, 4377 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
4345 const uc16* special_class, 4378 const int* special_class,
4346 int length) { 4379 int length) {
4380 length--; // Remove final 0x10000.
4381 ASSERT(special_class[length] == 0x10000);
4347 ASSERT(ranges->length() != 0); 4382 ASSERT(ranges->length() != 0);
4348 ASSERT(length != 0); 4383 ASSERT(length != 0);
4349 ASSERT(special_class[0] != 0); 4384 ASSERT(special_class[0] != 0);
4350 if (ranges->length() != (length >> 1) + 1) { 4385 if (ranges->length() != (length >> 1) + 1) {
4351 return false; 4386 return false;
4352 } 4387 }
4353 CharacterRange range = ranges->at(0); 4388 CharacterRange range = ranges->at(0);
4354 if (range.from() != 0) { 4389 if (range.from() != 0) {
4355 return false; 4390 return false;
4356 } 4391 }
4357 for (int i = 0; i < length; i += 2) { 4392 for (int i = 0; i < length; i += 2) {
4358 if (special_class[i] != (range.to() + 1)) { 4393 if (special_class[i] != (range.to() + 1)) {
4359 return false; 4394 return false;
4360 } 4395 }
4361 range = ranges->at((i >> 1) + 1); 4396 range = ranges->at((i >> 1) + 1);
4362 if (special_class[i+1] != range.from() - 1) { 4397 if (special_class[i+1] != range.from()) {
4363 return false; 4398 return false;
4364 } 4399 }
4365 } 4400 }
4366 if (range.to() != 0xffff) { 4401 if (range.to() != 0xffff) {
4367 return false; 4402 return false;
4368 } 4403 }
4369 return true; 4404 return true;
4370 } 4405 }
4371 4406
4372 4407
4373 static bool CompareRanges(ZoneList<CharacterRange>* ranges, 4408 static bool CompareRanges(ZoneList<CharacterRange>* ranges,
4374 const uc16* special_class, 4409 const int* special_class,
4375 int length) { 4410 int length) {
4411 length--; // Remove final 0x10000.
4412 ASSERT(special_class[length] == 0x10000);
4376 if (ranges->length() * 2 != length) { 4413 if (ranges->length() * 2 != length) {
4377 return false; 4414 return false;
4378 } 4415 }
4379 for (int i = 0; i < length; i += 2) { 4416 for (int i = 0; i < length; i += 2) {
4380 CharacterRange range = ranges->at(i >> 1); 4417 CharacterRange range = ranges->at(i >> 1);
4381 if (range.from() != special_class[i] || range.to() != special_class[i+1]) { 4418 if (range.from() != special_class[i] ||
4419 range.to() != special_class[i + 1] - 1) {
4382 return false; 4420 return false;
4383 } 4421 }
4384 } 4422 }
4385 return true; 4423 return true;
4386 } 4424 }
4387 4425
4388 4426
4389 bool RegExpCharacterClass::is_standard() { 4427 bool RegExpCharacterClass::is_standard() {
4390 // TODO(lrn): Remove need for this function, by not throwing away information 4428 // TODO(lrn): Remove need for this function, by not throwing away information
4391 // along the way. 4429 // along the way.
(...skipping 380 matching lines...) Expand 10 before | Expand all | Expand 10 after
4772 RegExpNode* on_success) { 4810 RegExpNode* on_success) {
4773 ZoneList<RegExpTree*>* children = nodes(); 4811 ZoneList<RegExpTree*>* children = nodes();
4774 RegExpNode* current = on_success; 4812 RegExpNode* current = on_success;
4775 for (int i = children->length() - 1; i >= 0; i--) { 4813 for (int i = children->length() - 1; i >= 0; i--) {
4776 current = children->at(i)->ToNode(compiler, current); 4814 current = children->at(i)->ToNode(compiler, current);
4777 } 4815 }
4778 return current; 4816 return current;
4779 } 4817 }
4780 4818
4781 4819
4782 static void AddClass(const uc16* elmv, 4820 static void AddClass(const int* elmv,
4783 int elmc, 4821 int elmc,
4784 ZoneList<CharacterRange>* ranges) { 4822 ZoneList<CharacterRange>* ranges) {
4823 elmc--;
4824 ASSERT(elmv[elmc] == 0x10000);
4785 for (int i = 0; i < elmc; i += 2) { 4825 for (int i = 0; i < elmc; i += 2) {
4786 ASSERT(elmv[i] <= elmv[i + 1]); 4826 ASSERT(elmv[i] < elmv[i + 1]);
4787 ranges->Add(CharacterRange(elmv[i], elmv[i + 1])); 4827 ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1));
4788 } 4828 }
4789 } 4829 }
4790 4830
4791 4831
4792 static void AddClassNegated(const uc16 *elmv, 4832 static void AddClassNegated(const int *elmv,
4793 int elmc, 4833 int elmc,
4794 ZoneList<CharacterRange>* ranges) { 4834 ZoneList<CharacterRange>* ranges) {
4835 elmc--;
4836 ASSERT(elmv[elmc] == 0x10000);
4795 ASSERT(elmv[0] != 0x0000); 4837 ASSERT(elmv[0] != 0x0000);
4796 ASSERT(elmv[elmc-1] != String::kMaxUtf16CodeUnit); 4838 ASSERT(elmv[elmc-1] != String::kMaxUtf16CodeUnit);
4797 uc16 last = 0x0000; 4839 uc16 last = 0x0000;
4798 for (int i = 0; i < elmc; i += 2) { 4840 for (int i = 0; i < elmc; i += 2) {
4799 ASSERT(last <= elmv[i] - 1); 4841 ASSERT(last <= elmv[i] - 1);
4800 ASSERT(elmv[i] <= elmv[i + 1]); 4842 ASSERT(elmv[i] < elmv[i + 1]);
4801 ranges->Add(CharacterRange(last, elmv[i] - 1)); 4843 ranges->Add(CharacterRange(last, elmv[i] - 1));
4802 last = elmv[i + 1] + 1; 4844 last = elmv[i + 1];
4803 } 4845 }
4804 ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit)); 4846 ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit));
4805 } 4847 }
4806 4848
4807 4849
4808 void CharacterRange::AddClassEscape(uc16 type, 4850 void CharacterRange::AddClassEscape(uc16 type,
4809 ZoneList<CharacterRange>* ranges) { 4851 ZoneList<CharacterRange>* ranges) {
4810 switch (type) { 4852 switch (type) {
4811 case 's': 4853 case 's':
4812 AddClass(kSpaceRanges, kSpaceRangeCount, ranges); 4854 AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
(...skipping 30 matching lines...) Expand all
4843 AddClass(kLineTerminatorRanges, 4885 AddClass(kLineTerminatorRanges,
4844 kLineTerminatorRangeCount, 4886 kLineTerminatorRangeCount,
4845 ranges); 4887 ranges);
4846 break; 4888 break;
4847 default: 4889 default:
4848 UNREACHABLE(); 4890 UNREACHABLE();
4849 } 4891 }
4850 } 4892 }
4851 4893
4852 4894
4853 Vector<const uc16> CharacterRange::GetWordBounds() { 4895 Vector<const int> CharacterRange::GetWordBounds() {
4854 return Vector<const uc16>(kWordRanges, kWordRangeCount); 4896 return Vector<const int>(kWordRanges, kWordRangeCount - 1);
4855 } 4897 }
4856 4898
4857 4899
4858 class CharacterRangeSplitter { 4900 class CharacterRangeSplitter {
4859 public: 4901 public:
4860 CharacterRangeSplitter(ZoneList<CharacterRange>** included, 4902 CharacterRangeSplitter(ZoneList<CharacterRange>** included,
4861 ZoneList<CharacterRange>** excluded) 4903 ZoneList<CharacterRange>** excluded)
4862 : included_(included), 4904 : included_(included),
4863 excluded_(excluded) { } 4905 excluded_(excluded) { }
4864 void Call(uc16 from, DispatchTable::Entry entry); 4906 void Call(uc16 from, DispatchTable::Entry entry);
(...skipping 11 matching lines...) Expand all
4876 if (!entry.out_set()->Get(kInBase)) return; 4918 if (!entry.out_set()->Get(kInBase)) return;
4877 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay) 4919 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
4878 ? included_ 4920 ? included_
4879 : excluded_; 4921 : excluded_;
4880 if (*target == NULL) *target = new ZoneList<CharacterRange>(2); 4922 if (*target == NULL) *target = new ZoneList<CharacterRange>(2);
4881 (*target)->Add(CharacterRange(entry.from(), entry.to())); 4923 (*target)->Add(CharacterRange(entry.from(), entry.to()));
4882 } 4924 }
4883 4925
4884 4926
4885 void CharacterRange::Split(ZoneList<CharacterRange>* base, 4927 void CharacterRange::Split(ZoneList<CharacterRange>* base,
4886 Vector<const uc16> overlay, 4928 Vector<const int> overlay,
4887 ZoneList<CharacterRange>** included, 4929 ZoneList<CharacterRange>** included,
4888 ZoneList<CharacterRange>** excluded) { 4930 ZoneList<CharacterRange>** excluded) {
4889 ASSERT_EQ(NULL, *included); 4931 ASSERT_EQ(NULL, *included);
4890 ASSERT_EQ(NULL, *excluded); 4932 ASSERT_EQ(NULL, *excluded);
4891 DispatchTable table; 4933 DispatchTable table;
4892 for (int i = 0; i < base->length(); i++) 4934 for (int i = 0; i < base->length(); i++)
4893 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase); 4935 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);
4894 for (int i = 0; i < overlay.length(); i += 2) { 4936 for (int i = 0; i < overlay.length(); i += 2) {
4895 table.AddRange(CharacterRange(overlay[i], overlay[i+1]), 4937 table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1),
4896 CharacterRangeSplitter::kInOverlay); 4938 CharacterRangeSplitter::kInOverlay);
4897 } 4939 }
4898 CharacterRangeSplitter callback(included, excluded); 4940 CharacterRangeSplitter callback(included, excluded);
4899 table.ForEach(&callback); 4941 table.ForEach(&callback);
4900 } 4942 }
4901 4943
4902 4944
4903 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, 4945 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
4904 bool is_ascii) { 4946 bool is_ascii) {
4905 Isolate* isolate = Isolate::Current(); 4947 Isolate* isolate = Isolate::Current();
(...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after
4971 if (n <= 1) return true; 5013 if (n <= 1) return true;
4972 int max = ranges->at(0).to(); 5014 int max = ranges->at(0).to();
4973 for (int i = 1; i < n; i++) { 5015 for (int i = 1; i < n; i++) {
4974 CharacterRange next_range = ranges->at(i); 5016 CharacterRange next_range = ranges->at(i);
4975 if (next_range.from() <= max + 1) return false; 5017 if (next_range.from() <= max + 1) return false;
4976 max = next_range.to(); 5018 max = next_range.to();
4977 } 5019 }
4978 return true; 5020 return true;
4979 } 5021 }
4980 5022
4981 SetRelation CharacterRange::WordCharacterRelation(
4982 ZoneList<CharacterRange>* range) {
4983 ASSERT(IsCanonical(range));
4984 int i = 0; // Word character range index.
4985 int j = 0; // Argument range index.
4986 ASSERT_NE(0, kWordRangeCount);
4987 SetRelation result;
4988 if (range->length() == 0) {
4989 result.SetElementsInSecondSet();
4990 return result;
4991 }
4992 CharacterRange argument_range = range->at(0);
4993 CharacterRange word_range = CharacterRange(kWordRanges[0], kWordRanges[1]);
4994 while (i < kWordRangeCount && j < range->length()) {
4995 // Check the two ranges for the five cases:
4996 // - no overlap.
4997 // - partial overlap (there are elements in both ranges that isn't
4998 // in the other, and there are also elements that are in both).
4999 // - argument range entirely inside word range.
5000 // - word range entirely inside argument range.
5001 // - ranges are completely equal.
5002
5003 // First check for no overlap. The earlier range is not in the other set.
5004 if (argument_range.from() > word_range.to()) {
5005 // Ranges are disjoint. The earlier word range contains elements that
5006 // cannot be in the argument set.
5007 result.SetElementsInSecondSet();
5008 } else if (word_range.from() > argument_range.to()) {
5009 // Ranges are disjoint. The earlier argument range contains elements that
5010 // cannot be in the word set.
5011 result.SetElementsInFirstSet();
5012 } else if (word_range.from() <= argument_range.from() &&
5013 word_range.to() >= argument_range.from()) {
5014 result.SetElementsInBothSets();
5015 // argument range completely inside word range.
5016 if (word_range.from() < argument_range.from() ||
5017 word_range.to() > argument_range.from()) {
5018 result.SetElementsInSecondSet();
5019 }
5020 } else if (word_range.from() >= argument_range.from() &&
5021 word_range.to() <= argument_range.from()) {
5022 result.SetElementsInBothSets();
5023 result.SetElementsInFirstSet();
5024 } else {
5025 // There is overlap, and neither is a subrange of the other
5026 result.SetElementsInFirstSet();
5027 result.SetElementsInSecondSet();
5028 result.SetElementsInBothSets();
5029 }
5030 if (result.NonTrivialIntersection()) {
5031 // The result is as (im)precise as we can possibly make it.
5032 return result;
5033 }
5034 // Progress the range(s) with minimal to-character.
5035 uc16 word_to = word_range.to();
5036 uc16 argument_to = argument_range.to();
5037 if (argument_to <= word_to) {
5038 j++;
5039 if (j < range->length()) {
5040 argument_range = range->at(j);
5041 }
5042 }
5043 if (word_to <= argument_to) {
5044 i += 2;
5045 if (i < kWordRangeCount) {
5046 word_range = CharacterRange(kWordRanges[i], kWordRanges[i + 1]);
5047 }
5048 }
5049 }
5050 // Check if anything wasn't compared in the loop.
5051 if (i < kWordRangeCount) {
5052 // word range contains something not in argument range.
5053 result.SetElementsInSecondSet();
5054 } else if (j < range->length()) {
5055 // Argument range contains something not in word range.
5056 result.SetElementsInFirstSet();
5057 }
5058
5059 return result;
5060 }
5061
5062 5023
5063 ZoneList<CharacterRange>* CharacterSet::ranges() { 5024 ZoneList<CharacterRange>* CharacterSet::ranges() {
5064 if (ranges_ == NULL) { 5025 if (ranges_ == NULL) {
5065 ranges_ = new ZoneList<CharacterRange>(2); 5026 ranges_ = new ZoneList<CharacterRange>(2);
5066 CharacterRange::AddClassEscape(standard_set_type_, ranges_); 5027 CharacterRange::AddClassEscape(standard_set_type_, ranges_);
5067 } 5028 }
5068 return ranges_; 5029 return ranges_;
5069 } 5030 }
5070 5031
5071 5032
(...skipping 112 matching lines...) Expand 10 before | Expand all | Expand 10 after
5184 num_canonical, 5145 num_canonical,
5185 character_ranges->at(read)); 5146 character_ranges->at(read));
5186 read++; 5147 read++;
5187 } while (read < n); 5148 } while (read < n);
5188 character_ranges->Rewind(num_canonical); 5149 character_ranges->Rewind(num_canonical);
5189 5150
5190 ASSERT(CharacterRange::IsCanonical(character_ranges)); 5151 ASSERT(CharacterRange::IsCanonical(character_ranges));
5191 } 5152 }
5192 5153
5193 5154
5194 // Utility function for CharacterRange::Merge. Adds a range at the end of
5195 // a canonicalized range list, if necessary merging the range with the last
5196 // range of the list.
5197 static void AddRangeToSet(ZoneList<CharacterRange>* set, CharacterRange range) {
5198 if (set == NULL) return;
5199 ASSERT(set->length() == 0 || set->at(set->length() - 1).to() < range.from());
5200 int n = set->length();
5201 if (n > 0) {
5202 CharacterRange lastRange = set->at(n - 1);
5203 if (lastRange.to() == range.from() - 1) {
5204 set->at(n - 1) = CharacterRange(lastRange.from(), range.to());
5205 return;
5206 }
5207 }
5208 set->Add(range);
5209 }
5210
5211
5212 static void AddRangeToSelectedSet(int selector,
5213 ZoneList<CharacterRange>* first_set,
5214 ZoneList<CharacterRange>* second_set,
5215 ZoneList<CharacterRange>* intersection_set,
5216 CharacterRange range) {
5217 switch (selector) {
5218 case kInsideFirst:
5219 AddRangeToSet(first_set, range);
5220 break;
5221 case kInsideSecond:
5222 AddRangeToSet(second_set, range);
5223 break;
5224 case kInsideBoth:
5225 AddRangeToSet(intersection_set, range);
5226 break;
5227 }
5228 }
5229
5230
5231
5232 void CharacterRange::Merge(ZoneList<CharacterRange>* first_set,
5233 ZoneList<CharacterRange>* second_set,
5234 ZoneList<CharacterRange>* first_set_only_out,
5235 ZoneList<CharacterRange>* second_set_only_out,
5236 ZoneList<CharacterRange>* both_sets_out) {
5237 // Inputs are canonicalized.
5238 ASSERT(CharacterRange::IsCanonical(first_set));
5239 ASSERT(CharacterRange::IsCanonical(second_set));
5240 // Outputs are empty, if applicable.
5241 ASSERT(first_set_only_out == NULL || first_set_only_out->length() == 0);
5242 ASSERT(second_set_only_out == NULL || second_set_only_out->length() == 0);
5243 ASSERT(both_sets_out == NULL || both_sets_out->length() == 0);
5244
5245 // Merge sets by iterating through the lists in order of lowest "from" value,
5246 // and putting intervals into one of three sets.
5247
5248 if (first_set->length() == 0) {
5249 second_set_only_out->AddAll(*second_set);
5250 return;
5251 }
5252 if (second_set->length() == 0) {
5253 first_set_only_out->AddAll(*first_set);
5254 return;
5255 }
5256 // Indices into input lists.
5257 int i1 = 0;
5258 int i2 = 0;
5259 // Cache length of input lists.
5260 int n1 = first_set->length();
5261 int n2 = second_set->length();
5262 // Current range. May be invalid if state is kInsideNone.
5263 int from = 0;
5264 int to = -1;
5265 // Where current range comes from.
5266 int state = kInsideNone;
5267
5268 while (i1 < n1 || i2 < n2) {
5269 CharacterRange next_range;
5270 int range_source;
5271 if (i2 == n2 ||
5272 (i1 < n1 && first_set->at(i1).from() < second_set->at(i2).from())) {
5273 // Next smallest element is in first set.
5274 next_range = first_set->at(i1++);
5275 range_source = kInsideFirst;
5276 } else {
5277 // Next smallest element is in second set.
5278 next_range = second_set->at(i2++);
5279 range_source = kInsideSecond;
5280 }
5281 if (to < next_range.from()) {
5282 // Ranges disjoint: |current| |next|
5283 AddRangeToSelectedSet(state,
5284 first_set_only_out,
5285 second_set_only_out,
5286 both_sets_out,
5287 CharacterRange(from, to));
5288 from = next_range.from();
5289 to = next_range.to();
5290 state = range_source;
5291 } else {
5292 if (from < next_range.from()) {
5293 AddRangeToSelectedSet(state,
5294 first_set_only_out,
5295 second_set_only_out,
5296 both_sets_out,
5297 CharacterRange(from, next_range.from()-1));
5298 }
5299 if (to < next_range.to()) {
5300 // Ranges overlap: |current|
5301 // |next|
5302 AddRangeToSelectedSet(state | range_source,
5303 first_set_only_out,
5304 second_set_only_out,
5305 both_sets_out,
5306 CharacterRange(next_range.from(), to));
5307 from = to + 1;
5308 to = next_range.to();
5309 state = range_source;
5310 } else {
5311 // Range included: |current| , possibly ending at same character.
5312 // |next|
5313 AddRangeToSelectedSet(
5314 state | range_source,
5315 first_set_only_out,
5316 second_set_only_out,
5317 both_sets_out,
5318 CharacterRange(next_range.from(), next_range.to()));
5319 from = next_range.to() + 1;
5320 // If ranges end at same character, both ranges are consumed completely.
5321 if (next_range.to() == to) state = kInsideNone;
5322 }
5323 }
5324 }
5325 AddRangeToSelectedSet(state,
5326 first_set_only_out,
5327 second_set_only_out,
5328 both_sets_out,
5329 CharacterRange(from, to));
5330 }
5331
5332
5333 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges, 5155 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
5334 ZoneList<CharacterRange>* negated_ranges) { 5156 ZoneList<CharacterRange>* negated_ranges) {
5335 ASSERT(CharacterRange::IsCanonical(ranges)); 5157 ASSERT(CharacterRange::IsCanonical(ranges));
5336 ASSERT_EQ(0, negated_ranges->length()); 5158 ASSERT_EQ(0, negated_ranges->length());
5337 int range_count = ranges->length(); 5159 int range_count = ranges->length();
5338 uc16 from = 0; 5160 uc16 from = 0;
5339 int i = 0; 5161 int i = 0;
5340 if (range_count > 0 && ranges->at(0).from() == 0) { 5162 if (range_count > 0 && ranges->at(0).from() == 0) {
5341 from = ranges->at(0).to(); 5163 from = ranges->at(0).to();
5342 i = 1; 5164 i = 1;
(...skipping 292 matching lines...) Expand 10 before | Expand all | Expand 10 after
5635 } 5457 }
5636 5458
5637 5459
5638 void Analysis::VisitBackReference(BackReferenceNode* that) { 5460 void Analysis::VisitBackReference(BackReferenceNode* that) {
5639 EnsureAnalyzed(that->on_success()); 5461 EnsureAnalyzed(that->on_success());
5640 } 5462 }
5641 5463
5642 5464
5643 void Analysis::VisitAssertion(AssertionNode* that) { 5465 void Analysis::VisitAssertion(AssertionNode* that) {
5644 EnsureAnalyzed(that->on_success()); 5466 EnsureAnalyzed(that->on_success());
5645 AssertionNode::AssertionNodeType type = that->type();
5646 if (type == AssertionNode::AT_BOUNDARY ||
5647 type == AssertionNode::AT_NON_BOUNDARY) {
5648 // Check if the following character is known to be a word character
5649 // or known to not be a word character.
5650 ZoneList<CharacterRange>* following_chars = that->FirstCharacterSet();
5651
5652 CharacterRange::Canonicalize(following_chars);
5653
5654 SetRelation word_relation =
5655 CharacterRange::WordCharacterRelation(following_chars);
5656 if (word_relation.Disjoint()) {
5657 // Includes the case where following_chars is empty (e.g., end-of-input).
5658 // Following character is definitely *not* a word character.
5659 type = (type == AssertionNode::AT_BOUNDARY) ?
5660 AssertionNode::AFTER_WORD_CHARACTER :
5661 AssertionNode::AFTER_NONWORD_CHARACTER;
5662 that->set_type(type);
5663 } else if (word_relation.ContainedIn()) {
5664 // Following character is definitely a word character.
5665 type = (type == AssertionNode::AT_BOUNDARY) ?
5666 AssertionNode::AFTER_NONWORD_CHARACTER :
5667 AssertionNode::AFTER_WORD_CHARACTER;
5668 that->set_type(type);
5669 }
5670 }
5671 } 5467 }
5672 5468
5673 5469
5674 ZoneList<CharacterRange>* RegExpNode::FirstCharacterSet() { 5470 void BackReferenceNode::FillInBMInfo(
5675 if (first_character_set_ == NULL) { 5471 int offset, BoyerMooreLookahead* bm, bool not_at_start) {
5676 if (ComputeFirstCharacterSet(kFirstCharBudget) < 0) { 5472 // Working out the set of characters that a backreference can match is too
5677 // If we can't find an exact solution within the budget, we 5473 // hard, so we just say that any character can match.
5678 // set the value to the set of every character, i.e., all characters 5474 bm->SetRest(offset);
5679 // are possible. 5475 SaveBMInfo(bm, not_at_start, offset);
5680 ZoneList<CharacterRange>* all_set = new ZoneList<CharacterRange>(1);
5681 all_set->Add(CharacterRange::Everything());
5682 first_character_set_ = all_set;
5683 }
5684 }
5685 return first_character_set_;
5686 } 5476 }
5687 5477
5688 5478
5689 int RegExpNode::ComputeFirstCharacterSet(int budget) { 5479 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
5690 // Default behavior is to not be able to determine the first character. 5480 RegExpMacroAssembler::kTableSize);
5691 return kComputeFirstCharacterSetFail;
5692 }
5693
5694
5695 int LoopChoiceNode::ComputeFirstCharacterSet(int budget) {
5696 budget--;
5697 if (budget >= 0) {
5698 // Find loop min-iteration. It's the value of the guarded choice node
5699 // with a GEQ guard, if any.
5700 int min_repetition = 0;
5701
5702 for (int i = 0; i <= 1; i++) {
5703 GuardedAlternative alternative = alternatives()->at(i);
5704 ZoneList<Guard*>* guards = alternative.guards();
5705 if (guards != NULL && guards->length() > 0) {
5706 Guard* guard = guards->at(0);
5707 if (guard->op() == Guard::GEQ) {
5708 min_repetition = guard->value();
5709 break;
5710 }
5711 }
5712 }
5713
5714 budget = loop_node()->ComputeFirstCharacterSet(budget);
5715 if (budget >= 0) {
5716 ZoneList<CharacterRange>* character_set =
5717 loop_node()->first_character_set();
5718 if (body_can_be_zero_length() || min_repetition == 0) {
5719 budget = continue_node()->ComputeFirstCharacterSet(budget);
5720 if (budget < 0) return budget;
5721 ZoneList<CharacterRange>* body_set =
5722 continue_node()->first_character_set();
5723 ZoneList<CharacterRange>* union_set =
5724 new ZoneList<CharacterRange>(Max(character_set->length(),
5725 body_set->length()));
5726 CharacterRange::Merge(character_set,
5727 body_set,
5728 union_set,
5729 union_set,
5730 union_set);
5731 character_set = union_set;
5732 }
5733 set_first_character_set(character_set);
5734 }
5735 }
5736 return budget;
5737 }
5738
5739
5740 int NegativeLookaheadChoiceNode::ComputeFirstCharacterSet(int budget) {
5741 budget--;
5742 if (budget >= 0) {
5743 GuardedAlternative successor = this->alternatives()->at(1);
5744 RegExpNode* successor_node = successor.node();
5745 budget = successor_node->ComputeFirstCharacterSet(budget);
5746 if (budget >= 0) {
5747 set_first_character_set(successor_node->first_character_set());
5748 }
5749 }
5750 return budget;
5751 }
5752
5753
5754 // The first character set of an EndNode is unknowable. Just use the
5755 // default implementation that fails and returns all characters as possible.
5756
5757
5758 int AssertionNode::ComputeFirstCharacterSet(int budget) {
5759 budget -= 1;
5760 if (budget >= 0) {
5761 switch (type_) {
5762 case AT_END: {
5763 set_first_character_set(new ZoneList<CharacterRange>(0));
5764 break;
5765 }
5766 case AT_START:
5767 case AT_BOUNDARY:
5768 case AT_NON_BOUNDARY:
5769 case AFTER_NEWLINE:
5770 case AFTER_NONWORD_CHARACTER:
5771 case AFTER_WORD_CHARACTER: {
5772 ASSERT_NOT_NULL(on_success());
5773 budget = on_success()->ComputeFirstCharacterSet(budget);
5774 if (budget >= 0) {
5775 set_first_character_set(on_success()->first_character_set());
5776 }
5777 break;
5778 }
5779 }
5780 }
5781 return budget;
5782 }
5783
5784
5785 int ActionNode::ComputeFirstCharacterSet(int budget) {
5786 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return kComputeFirstCharacterSetFail;
5787 budget--;
5788 if (budget >= 0) {
5789 ASSERT_NOT_NULL(on_success());
5790 budget = on_success()->ComputeFirstCharacterSet(budget);
5791 if (budget >= 0) {
5792 set_first_character_set(on_success()->first_character_set());
5793 }
5794 }
5795 return budget;
5796 }
5797
5798
5799 int BackReferenceNode::ComputeFirstCharacterSet(int budget) {
5800 // We don't know anything about the first character of a backreference
5801 // at this point.
5802 // The potential first characters are the first characters of the capture,
5803 // and the first characters of the on_success node, depending on whether the
5804 // capture can be empty and whether it is known to be participating or known
5805 // not to be.
5806 return kComputeFirstCharacterSetFail;
5807 }
5808 5481
5809 5482
5810 void ChoiceNode::FillInBMInfo( 5483 void ChoiceNode::FillInBMInfo(
5811 int offset, BoyerMooreLookahead* bm, bool not_at_start) { 5484 int offset, BoyerMooreLookahead* bm, bool not_at_start) {
5812 ZoneList<GuardedAlternative>* alts = alternatives(); 5485 ZoneList<GuardedAlternative>* alts = alternatives();
5813 for (int i = 0; i < alts->length(); i++) { 5486 for (int i = 0; i < alts->length(); i++) {
5814 GuardedAlternative& alt = alts->at(i); 5487 GuardedAlternative& alt = alts->at(i);
5815 if (alt.guards() != NULL && alt.guards()->length() != 0) { 5488 if (alt.guards() != NULL && alt.guards()->length() != 0) {
5816 bm->SetRest(offset); // Give up trying to fill in info. 5489 bm->SetRest(offset); // Give up trying to fill in info.
5490 SaveBMInfo(bm, not_at_start, offset);
5817 return; 5491 return;
5818 } 5492 }
5819 alt.node()->FillInBMInfo(offset, bm, not_at_start); 5493 alt.node()->FillInBMInfo(offset, bm, not_at_start);
5820 } 5494 }
5495 SaveBMInfo(bm, not_at_start, offset);
5821 } 5496 }
5822 5497
5823 5498
5824 void TextNode::FillInBMInfo( 5499 void TextNode::FillInBMInfo(
5825 int offset, BoyerMooreLookahead* bm, bool not_at_start) { 5500 int initial_offset, BoyerMooreLookahead* bm, bool not_at_start) {
5826 if (offset >= bm->length()) return; 5501 if (initial_offset >= bm->length()) return;
5502 int offset = initial_offset;
5827 int max_char = bm->max_char(); 5503 int max_char = bm->max_char();
5828 for (int i = 0; i < elements()->length(); i++) { 5504 for (int i = 0; i < elements()->length(); i++) {
5829 if (offset >= bm->length()) return; 5505 if (offset >= bm->length()) {
5506 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5507 return;
5508 }
5830 TextElement text = elements()->at(i); 5509 TextElement text = elements()->at(i);
5831 if (text.type == TextElement::ATOM) { 5510 if (text.type == TextElement::ATOM) {
5832 RegExpAtom* atom = text.data.u_atom; 5511 RegExpAtom* atom = text.data.u_atom;
5833 for (int j = 0; j < atom->length(); j++, offset++) { 5512 for (int j = 0; j < atom->length(); j++, offset++) {
5834 if (offset >= bm->length()) return; 5513 if (offset >= bm->length()) {
5514 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5515 return;
5516 }
5835 uc16 character = atom->data()[j]; 5517 uc16 character = atom->data()[j];
5836 if (bm->compiler()->ignore_case()) { 5518 if (bm->compiler()->ignore_case()) {
5837 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 5519 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5838 int length = GetCaseIndependentLetters( 5520 int length = GetCaseIndependentLetters(
5839 ISOLATE, 5521 ISOLATE,
5840 character, 5522 character,
5841 bm->max_char() == String::kMaxAsciiCharCode, 5523 bm->max_char() == String::kMaxAsciiCharCode,
5842 chars); 5524 chars);
5843 for (int j = 0; j < length; j++) { 5525 for (int j = 0; j < length; j++) {
5844 bm->Set(offset, chars[j]); 5526 bm->Set(offset, chars[j]);
5845 } 5527 }
5846 } else { 5528 } else {
5847 if (character <= max_char) bm->Set(offset, character); 5529 if (character <= max_char) bm->Set(offset, character);
5848 } 5530 }
5849 } 5531 }
5850 } else { 5532 } else {
5851 ASSERT(text.type == TextElement::CHAR_CLASS); 5533 ASSERT(text.type == TextElement::CHAR_CLASS);
5852 RegExpCharacterClass* char_class = text.data.u_char_class; 5534 RegExpCharacterClass* char_class = text.data.u_char_class;
5853 ZoneList<CharacterRange>* ranges = char_class->ranges(); 5535 ZoneList<CharacterRange>* ranges = char_class->ranges();
5854 if (char_class->is_negated()) { 5536 if (char_class->is_negated()) {
5855 bm->SetAll(offset); 5537 bm->SetAll(offset);
5856 } else { 5538 } else {
5857 for (int k = 0; k < ranges->length(); k++) { 5539 for (int k = 0; k < ranges->length(); k++) {
5858 CharacterRange& range = ranges->at(k); 5540 CharacterRange& range = ranges->at(k);
5859 if (range.from() > max_char) continue; 5541 if (range.from() > max_char) continue;
5860 int to = Min(max_char, static_cast<int>(range.to())); 5542 int to = Min(max_char, static_cast<int>(range.to()));
5861 if (to - range.from() >= BoyerMooreLookahead::kTooManyCharacters) { 5543 bm->SetInterval(offset, Interval(range.from(), to));
5862 bm->SetAll(offset);
5863 break;
5864 }
5865 for (int m = range.from(); m <= to; m++) {
5866 bm->Set(offset, m);
5867 }
5868 } 5544 }
5869 } 5545 }
5870 offset++; 5546 offset++;
5871 } 5547 }
5872 } 5548 }
5873 if (offset >= bm->length()) return; 5549 if (offset >= bm->length()) {
5550 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5551 return;
5552 }
5874 on_success()->FillInBMInfo(offset, 5553 on_success()->FillInBMInfo(offset,
5875 bm, 5554 bm,
5876 true); // Not at start after a text node. 5555 true); // Not at start after a text node.
5556 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5877 } 5557 }
5878 5558
5879 5559
5880 int TextNode::ComputeFirstCharacterSet(int budget) {
5881 budget--;
5882 if (budget >= 0) {
5883 ASSERT_NE(0, elements()->length());
5884 TextElement text = elements()->at(0);
5885 if (text.type == TextElement::ATOM) {
5886 RegExpAtom* atom = text.data.u_atom;
5887 ASSERT_NE(0, atom->length());
5888 uc16 first_char = atom->data()[0];
5889 ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1);
5890 range->Add(CharacterRange(first_char, first_char));
5891 set_first_character_set(range);
5892 } else {
5893 ASSERT(text.type == TextElement::CHAR_CLASS);
5894 RegExpCharacterClass* char_class = text.data.u_char_class;
5895 ZoneList<CharacterRange>* ranges = char_class->ranges();
5896 // TODO(lrn): Canonicalize ranges when they are created
5897 // instead of waiting until now.
5898 CharacterRange::Canonicalize(ranges);
5899 if (char_class->is_negated()) {
5900 int length = ranges->length();
5901 int new_length = length + 1;
5902 if (length > 0) {
5903 if (ranges->at(0).from() == 0) new_length--;
5904 if (ranges->at(length - 1).to() == String::kMaxUtf16CodeUnit) {
5905 new_length--;
5906 }
5907 }
5908 ZoneList<CharacterRange>* negated_ranges =
5909 new ZoneList<CharacterRange>(new_length);
5910 CharacterRange::Negate(ranges, negated_ranges);
5911 set_first_character_set(negated_ranges);
5912 } else {
5913 set_first_character_set(ranges);
5914 }
5915 }
5916 }
5917 return budget;
5918 }
5919
5920
5921
5922 // ------------------------------------------------------------------- 5560 // -------------------------------------------------------------------
5923 // Dispatch table construction 5561 // Dispatch table construction
5924 5562
5925 5563
5926 void DispatchTableConstructor::VisitEnd(EndNode* that) { 5564 void DispatchTableConstructor::VisitEnd(EndNode* that) {
5927 AddRange(CharacterRange::Everything()); 5565 AddRange(CharacterRange::Everything());
5928 } 5566 }
5929 5567
5930 5568
5931 void DispatchTableConstructor::BuildTable(ChoiceNode* node) { 5569 void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
(...skipping 201 matching lines...) Expand 10 before | Expand all | Expand 10 after
6133 } 5771 }
6134 5772
6135 return compiler.Assemble(&macro_assembler, 5773 return compiler.Assemble(&macro_assembler,
6136 node, 5774 node,
6137 data->capture_count, 5775 data->capture_count,
6138 pattern); 5776 pattern);
6139 } 5777 }
6140 5778
6141 5779
6142 }} // namespace v8::internal 5780 }} // namespace v8::internal
OLDNEW
« no previous file with comments | « src/jsregexp.h ('k') | src/macros.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698