src/jsregexp.cc - Issue 9961009: Regexp: Unify the lookahead code for the BoyerMoore-like skipping

Side by Side Diff: src/jsregexp.cc

Issue 9961009: Regexp: Unify the lookahead code for the BoyerMoore-like skipping (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: Created 8 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2011 the V8 project authors. All rights reserved.	1 // Copyright 2011 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 90 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
101 Factory* factory = isolate->factory();	101 Factory* factory = isolate->factory();

102 Handle<FixedArray> elements = factory->NewFixedArray(2);	102 Handle<FixedArray> elements = factory->NewFixedArray(2);

103 elements->set(0, *pattern);	103 elements->set(0, *pattern);

104 elements->set(1, *error_text);	104 elements->set(1, *error_text);

105 Handle<JSArray> array = factory->NewJSArrayWithElements(elements);	105 Handle<JSArray> array = factory->NewJSArrayWithElements(elements);

106 Handle<Object> regexp_err = factory->NewSyntaxError(message, array);	106 Handle<Object> regexp_err = factory->NewSyntaxError(message, array);

107 isolate->Throw(*regexp_err);	107 isolate->Throw(*regexp_err);

108 }	108 }

109	109

110	110

	111 static ContainedInLattice Combine(ContainedInLattice a, bool inside) {
	ulan 2012/04/16 13:52:22 Passing kLatticeIn/kLatticeOut instead of bool wou Passing kLatticeIn/kLatticeOut instead of bool would make it possible to write ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) { return static_cast<ContainedInLattice>(a \| b); } Erik Corry 2012/04/17 07:51:45 Done. Show quoted text On 2012/04/16 13:52:22, ulan wrote: > Passing kLatticeIn/kLatticeOut instead of bool would make it possible to write > ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) { > return static_cast<ContainedInLattice>(a \| b); > } Done.
	112 if (a == kLatticeUnknown) return a;

	113 if (a == kNotYet) return inside ? kLatticeIn : kLatticeOut;

	114 if (a == kLatticeIn) return inside ? kLatticeIn : kLatticeUnknown;

	115 if (a == kLatticeOut) return inside ? kLatticeUnknown : kLatticeOut;

	116 UNREACHABLE();

	117 return kLatticeUnknown;

	118 }

	119

	120

	121 ContainedInLattice AddRange(ContainedInLattice containment,

	122 const int* ranges,

	123 int ranges_length,

	124 Interval new_range) {

	125 if (containment == kLatticeUnknown) return containment;

	126 bool inside = false;

	127 int last = 0;

	128 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {

	129 // Consider the range from last to ranges[i].

	130 // We haven't got to the new range yet.

	131 if (ranges[i] <= new_range.from()) continue;

	132 // New range is wholly inside last-ranges[i].

	133 if (last <= new_range.from() && ranges[i] > new_range.to()) {
	ulan 2012/04/16 13:52:22 So ranges[i] is not inclusive, but new_range.to() So ranges[i] is not inclusive, but new_range.to() is inclusive. It's a bit confusing, maybe worthwhile adding a comment. Nit, I would reverse comparison: last <= new_range.from() && new_range.to() < ranges[i] Erik Corry 2012/04/17 07:51:45 Done. Show quoted text On 2012/04/16 13:52:22, ulan wrote: > So ranges[i] is not inclusive, but new_range.to() is inclusive. It's a bit > confusing, maybe worthwhile adding a comment. > > Nit, I would reverse comparison: > last <= new_range.from() && new_range.to() < ranges[i] Done.
	134 return Combine(containment, inside);

	135 }

	136 // Otherwise it
	ulan 2012/04/16 13:52:22 Trailing space. Trailing space. Erik Corry 2012/04/17 07:51:45 Done. Show quoted text On 2012/04/16 13:52:22, ulan wrote: > Trailing space. Done.
	137 return kLatticeUnknown;

	138 }

	139 return containment;

	140 }

	141

	142

111 // More makes code generation slower, less makes V8 benchmark score lower.	143 // More makes code generation slower, less makes V8 benchmark score lower.

112 const int kMaxLookaheadForBoyerMoore = 8;	144 const int kMaxLookaheadForBoyerMoore = 8;

113 // In a 3-character pattern you can maximally step forwards 3 characters	145 // In a 3-character pattern you can maximally step forwards 3 characters

114 // at a time, which is not always enough to pay for the extra logic.	146 // at a time, which is not always enough to pay for the extra logic.

115 const int kPatternTooShortForBoyerMoore = 2;	147 const int kPatternTooShortForBoyerMoore = 2;

116	148

117	149

118 // Identifies the sort of regexps where the regexp engine is faster	150 // Identifies the sort of regexps where the regexp engine is faster

119 // than the code used for atom matches.	151 // than the code used for atom matches.

120 static bool HasFewDifferentCharacters(Handle<String> pattern) {	152 static bool HasFewDifferentCharacters(Handle<String> pattern) {

(...skipping 2029 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2150	2182

2151	2183

2152 void ActionNode::FillInBMInfo(int offset,	2184 void ActionNode::FillInBMInfo(int offset,

2153 BoyerMooreLookahead* bm,	2185 BoyerMooreLookahead* bm,

2154 bool not_at_start) {	2186 bool not_at_start) {

2155 if (type_ == BEGIN_SUBMATCH) {	2187 if (type_ == BEGIN_SUBMATCH) {

2156 bm->SetRest(offset);	2188 bm->SetRest(offset);

2157 } else if (type_ != POSITIVE_SUBMATCH_SUCCESS) {	2189 } else if (type_ != POSITIVE_SUBMATCH_SUCCESS) {

2158 on_success()->FillInBMInfo(offset, bm, not_at_start);	2190 on_success()->FillInBMInfo(offset, bm, not_at_start);

2159 }	2191 }

	2192 if (offset == 0) set_bm_info(not_at_start, bm);
	ulan 2012/04/16 13:52:22 As discussed offline, it probably better to move t As discussed offline, it probably better to move this into a separate function with comments. Erik Corry 2012/04/17 07:51:45 Done. Show quoted text On 2012/04/16 13:52:22, ulan wrote: > As discussed offline, it probably better to move this into a separate function > with comments. Done.
2160 }	2193 }

2161	2194

2162	2195

2163 int AssertionNode::EatsAtLeast(int still_to_find,	2196 int AssertionNode::EatsAtLeast(int still_to_find,

2164 int recursion_depth,	2197 int recursion_depth,

2165 bool not_at_start) {	2198 bool not_at_start) {

2166 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;	2199 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;

2167 // If we know we are not at the start and we are asked "how many characters	2200 // If we know we are not at the start and we are asked "how many characters

2168 // will you match if you succeed?" then we can answer anything since false	2201 // will you match if you succeed?" then we can answer anything since false

2169 // implies false. So lets just return the max answer (still_to_find) since	2202 // implies false. So lets just return the max answer (still_to_find) since

2170 // that won't prevent us from preloading a lot of characters for the other	2203 // that won't prevent us from preloading a lot of characters for the other

2171 // branches in the node graph.	2204 // branches in the node graph.

2172 if (type() == AT_START && not_at_start) return still_to_find;	2205 if (type() == AT_START && not_at_start) return still_to_find;

2173 return on_success()->EatsAtLeast(still_to_find,	2206 return on_success()->EatsAtLeast(still_to_find,

2174 recursion_depth + 1,	2207 recursion_depth + 1,

2175 not_at_start);	2208 not_at_start);

2176 }	2209 }

2177	2210

2178	2211

2179 void AssertionNode::FillInBMInfo(	2212 void AssertionNode::FillInBMInfo(

2180 int offset, BoyerMooreLookahead* bm, bool not_at_start) {	2213 int offset, BoyerMooreLookahead* bm, bool not_at_start) {

2181 // Match the behaviour of EatsAtLeast on this node.	2214 // Match the behaviour of EatsAtLeast on this node.

2182 if (type() == AT_START && not_at_start) return;	2215 if (type() == AT_START && not_at_start) return;

2183 on_success()->FillInBMInfo(offset, bm, not_at_start);	2216 on_success()->FillInBMInfo(offset, bm, not_at_start);

	2217 if (offset == 0) set_bm_info(not_at_start, bm);

2184 }	2218 }

2185	2219

2186	2220

2187 int BackReferenceNode::EatsAtLeast(int still_to_find,	2221 int BackReferenceNode::EatsAtLeast(int still_to_find,

2188 int recursion_depth,	2222 int recursion_depth,

2189 bool not_at_start) {	2223 bool not_at_start) {

2190 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;	2224 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;

2191 return on_success()->EatsAtLeast(still_to_find,	2225 return on_success()->EatsAtLeast(still_to_find,

2192 recursion_depth + 1,	2226 recursion_depth + 1,

2193 not_at_start);	2227 not_at_start);

(...skipping 416 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2610 if (body_can_be_zero_length_ \|\| info()->visited) return;	2644 if (body_can_be_zero_length_ \|\| info()->visited) return;

2611 VisitMarker marker(info());	2645 VisitMarker marker(info());

2612 return ChoiceNode::GetQuickCheckDetails(details,	2646 return ChoiceNode::GetQuickCheckDetails(details,

2613 compiler,	2647 compiler,

2614 characters_filled_in,	2648 characters_filled_in,

2615 not_at_start);	2649 not_at_start);

2616 }	2650 }

2617	2651

2618	2652

2619 void LoopChoiceNode::FillInBMInfo(	2653 void LoopChoiceNode::FillInBMInfo(

2620 int offset, BoyerMooreLookahead* bm, bool nas) {	2654 int offset, BoyerMooreLookahead* bm, bool not_at_start) {

2621 if (body_can_be_zero_length_) {	2655 if (body_can_be_zero_length_) {

2622 bm->SetRest(offset);	2656 bm->SetRest(offset);

	2657 if (offset == 0) set_bm_info(not_at_start, bm);

2623 return;	2658 return;

2624 }	2659 }

2625 ChoiceNode::FillInBMInfo(offset, bm, nas);	2660 ChoiceNode::FillInBMInfo(offset, bm, not_at_start);

	2661 if (offset == 0) set_bm_info(not_at_start, bm);

2626 }	2662 }

2627	2663

2628	2664

2629 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,	2665 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,

2630 RegExpCompiler* compiler,	2666 RegExpCompiler* compiler,

2631 int characters_filled_in,	2667 int characters_filled_in,

2632 bool not_at_start) {	2668 bool not_at_start) {

2633 not_at_start = (not_at_start \|\| not_at_start_);	2669 not_at_start = (not_at_start \|\| not_at_start_);

2634 int choice_count = alternatives_->length();	2670 int choice_count = alternatives_->length();

2635 ASSERT(choice_count > 0);	2671 ASSERT(choice_count > 0);

(...skipping 67 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2703 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);	2739 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);

2704 }	2740 }

2705 assembler->CheckCharacter('\n', &ok);	2741 assembler->CheckCharacter('\n', &ok);

2706 assembler->CheckNotCharacter('\r', new_trace.backtrack());	2742 assembler->CheckNotCharacter('\r', new_trace.backtrack());

2707 }	2743 }

2708 assembler->Bind(&ok);	2744 assembler->Bind(&ok);

2709 on_success->Emit(compiler, &new_trace);	2745 on_success->Emit(compiler, &new_trace);

2710 }	2746 }

2711	2747

2712	2748

2713 // Emit the code to handle \b and \B (word-boundary or non-word-boundary)

2714 // when we know whether the next character must be a word character or not.

2715 static void EmitHalfBoundaryCheck(AssertionNode::AssertionNodeType type,

2716 RegExpCompiler* compiler,

2717 RegExpNode* on_success,

2718 Trace* trace) {

2719 RegExpMacroAssembler* assembler = compiler->macro_assembler();

2720 Label done;

2721

2722 Trace new_trace(*trace);

2723

2724 bool expect_word_character = (type == AssertionNode::AFTER_WORD_CHARACTER);

2725 Label* on_word = expect_word_character ? &done : new_trace.backtrack();

2726 Label* on_non_word = expect_word_character ? new_trace.backtrack() : &done;

2727

2728 // Check whether previous character was a word character.

2729 switch (trace->at_start()) {

2730 case Trace::TRUE:

2731 if (expect_word_character) {

2732 assembler->GoTo(on_non_word);

2733 }

2734 break;

2735 case Trace::UNKNOWN:

2736 ASSERT_EQ(0, trace->cp_offset());

2737 assembler->CheckAtStart(on_non_word);

2738 // Fall through.

2739 case Trace::FALSE:

2740 int prev_char_offset = trace->cp_offset() - 1;

2741 assembler->LoadCurrentCharacter(prev_char_offset, NULL, false, 1);

2742 EmitWordCheck(assembler, on_word, on_non_word, expect_word_character);

2743 // We may or may not have loaded the previous character.

2744 new_trace.InvalidateCurrentCharacter();

2745 }

2746

2747 assembler->Bind(&done);

2748

2749 on_success->Emit(compiler, &new_trace);

2750 }

2751

2752

2753 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).	2749 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).

2754 static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type,	2750 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {

2755 RegExpCompiler* compiler,

2756 RegExpNode* on_success,

2757 Trace* trace) {

2758 RegExpMacroAssembler* assembler = compiler->macro_assembler();	2751 RegExpMacroAssembler* assembler = compiler->macro_assembler();

2759 Label before_non_word;	2752 Trace::TriBool next_is_word_character = Trace::UNKNOWN;

2760 Label before_word;	2753 bool not_at_start = (trace->at_start() == Trace::FALSE);

2761 if (trace->characters_preloaded() != 1) {	2754 BoyerMooreLookahead* lookahead = bm_info(not_at_start);

2762 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);	2755 if (lookahead == NULL) {

	2756 int eats_at_least =

	2757 Min(kMaxLookaheadForBoyerMoore,

	2758 EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start));

	2759 if (eats_at_least >= 1) {

	2760 BoyerMooreLookahead* bm =

	2761 new BoyerMooreLookahead(eats_at_least, compiler);

	2762 FillInBMInfo(0, bm, not_at_start);

	2763 if (bm->at(0)->is_non_word()) next_is_word_character = Trace::FALSE;

	2764 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE;
	ulan 2012/04/16 13:52:22 Consider writing the two lines above as: next_is_w Consider writing the two lines above as: next_is_word_character = bm->at(0)->is_word() ? Trace::TRUE : Trace::FALSE; Erik Corry 2012/04/17 07:51:45 Not the same, since it could be neither known to b Show quoted text On 2012/04/16 13:52:22, ulan wrote: > Consider writing the two lines above as: > next_is_word_character = bm->at(0)->is_word() ? Trace::TRUE : Trace::FALSE; Not the same, since it could be neither known to be a word, nor known to be a non-word.
	2765 }

	2766 } else {

	2767 if (lookahead->at(0)->is_non_word()) next_is_word_character = Trace::FALSE;

	2768 if (lookahead->at(0)->is_word()) next_is_word_character = Trace::TRUE;

2763 }	2769 }

2764 // Fall through on non-word.	2770 bool at_boundary = (type_ == AssertionNode::AT_BOUNDARY);

2765 EmitWordCheck(assembler, &before_word, &before_non_word, false);	2771 if (next_is_word_character == Trace::UNKNOWN) {

	2772 Label before_non_word;

	2773 Label before_word;

	2774 if (trace->characters_preloaded() != 1) {

	2775 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);

	2776 }

	2777 // Fall through on non-word.

	2778 EmitWordCheck(assembler, &before_word, &before_non_word, false);

	2779 // Next character is not a word character.

	2780 assembler->Bind(&before_non_word);

	2781 Label ok;

	2782 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);

	2783 assembler->GoTo(&ok);

2766	2784

2767 // We will be loading the previous character into the current character	2785 assembler->Bind(&before_word);

2768 // register.	2786 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);

	2787 assembler->Bind(&ok);

	2788 } else if (next_is_word_character == Trace::TRUE) {

	2789 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);

	2790 } else {

	2791 ASSERT(next_is_word_character == Trace::FALSE);

	2792 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);

	2793 }

	2794 }

	2795

	2796

	2797 void AssertionNode::BacktrackIfPrevious(

	2798 RegExpCompiler* compiler,

	2799 Trace* trace,

	2800 AssertionNode::IfPrevious backtrack_if_previous) {

	2801 RegExpMacroAssembler* assembler = compiler->macro_assembler();

2769 Trace new_trace(*trace);	2802 Trace new_trace(*trace);

2770 new_trace.InvalidateCurrentCharacter();	2803 new_trace.InvalidateCurrentCharacter();

2771	2804

2772 Label ok;	2805 Label fall_through, dummy;

2773 Label* boundary;

2774 Label* not_boundary;

2775 if (type == AssertionNode::AT_BOUNDARY) {

2776 boundary = &ok;

2777 not_boundary = new_trace.backtrack();

2778 } else {

2779 not_boundary = &ok;

2780 boundary = new_trace.backtrack();

2781 }

2782	2806

2783 // Next character is not a word character.	2807 Label* non_word = backtrack_if_previous == kIsNonWord ?

2784 assembler->Bind(&before_non_word);	2808 new_trace.backtrack() :

	2809 &fall_through;

	2810 Label* word = backtrack_if_previous == kIsNonWord ?

	2811 &fall_through :

	2812 new_trace.backtrack();

	2813

2785 if (new_trace.cp_offset() == 0) {	2814 if (new_trace.cp_offset() == 0) {

2786 // The start of input counts as a non-word character, so the question is	2815 // The start of input counts as a non-word character, so the question is

2787 // decided if we are at the start.	2816 // decided if we are at the start.

2788 assembler->CheckAtStart(not_boundary);	2817 assembler->CheckAtStart(non_word);

2789 }	2818 }

2790 // We already checked that we are not at the start of input so it must be	2819 // We already checked that we are not at the start of input so it must be

2791 // OK to load the previous character.	2820 // OK to load the previous character.

2792 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,	2821 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);

2793 &ok, // Unused dummy label in this call.	2822 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);

2794 false);

2795 // Fall through on non-word.

2796 EmitWordCheck(assembler, boundary, not_boundary, false);

2797 assembler->GoTo(not_boundary);

2798	2823

2799 // Next character is a word character.	2824 assembler->Bind(&fall_through);

2800 assembler->Bind(&before_word);	2825 on_success()->Emit(compiler, &new_trace);

2801 if (new_trace.cp_offset() == 0) {

2802 // The start of input counts as a non-word character, so the question is

2803 // decided if we are at the start.

2804 assembler->CheckAtStart(boundary);

2805 }

2806 // We already checked that we are not at the start of input so it must be

2807 // OK to load the previous character.

2808 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,

2809 &ok, // Unused dummy label in this call.

2810 false);

2811 bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY);

2812 EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word);

2813

2814 assembler->Bind(&ok);

2815

2816 on_success->Emit(compiler, &new_trace);

2817 }	2826 }

2818	2827

2819	2828

2820 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,	2829 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,

2821 RegExpCompiler* compiler,	2830 RegExpCompiler* compiler,

2822 int filled_in,	2831 int filled_in,

2823 bool not_at_start) {	2832 bool not_at_start) {

2824 if (type_ == AT_START && not_at_start) {	2833 if (type_ == AT_START && not_at_start) {

2825 details->set_cannot_match();	2834 details->set_cannot_match();

2826 return;	2835 return;

(...skipping 27 matching lines...) Expand all Loading...
2854 on_success()->Emit(compiler, &at_start_trace);	2863 on_success()->Emit(compiler, &at_start_trace);

2855 return;	2864 return;

2856 }	2865 }

2857 }	2866 }

2858 break;	2867 break;

2859 case AFTER_NEWLINE:	2868 case AFTER_NEWLINE:

2860 EmitHat(compiler, on_success(), trace);	2869 EmitHat(compiler, on_success(), trace);

2861 return;	2870 return;

2862 case AT_BOUNDARY:	2871 case AT_BOUNDARY:

2863 case AT_NON_BOUNDARY: {	2872 case AT_NON_BOUNDARY: {

2864 EmitBoundaryCheck(type_, compiler, on_success(), trace);	2873 EmitBoundaryCheck(compiler, trace);

2865 return;	2874 return;

2866 }	2875 }

2867 case AFTER_WORD_CHARACTER:

2868 case AFTER_NONWORD_CHARACTER: {

2869 EmitHalfBoundaryCheck(type_, compiler, on_success(), trace);

2870 }

2871 }	2876 }

2872 on_success()->Emit(compiler, trace);	2877 on_success()->Emit(compiler, trace);

2873 }	2878 }

2874	2879

2875	2880

2876 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {	2881 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {

2877 if (quick_check == NULL) return false;	2882 if (quick_check == NULL) return false;

2878 if (offset >= quick_check->characters()) return false;	2883 if (offset >= quick_check->characters()) return false;

2879 return quick_check->positions(offset)->determines_perfectly;	2884 return quick_check->positions(offset)->determines_perfectly;

2880 }	2885 }

(...skipping 389 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3270 return alt_gens_[i];	3275 return alt_gens_[i];

3271 }	3276 }

3272	3277

3273 private:	3278 private:

3274 static const int kAFew = 10;	3279 static const int kAFew = 10;

3275 ZoneList<AlternativeGeneration*> alt_gens_;	3280 ZoneList<AlternativeGeneration*> alt_gens_;

3276 AlternativeGeneration a_few_alt_gens_[kAFew];	3281 AlternativeGeneration a_few_alt_gens_[kAFew];

3277 };	3282 };

3278	3283

3279	3284

	3285 // The '2' variant is has inclusive from and exclusive to.

	3286 static const int kSpaceRanges2[] = { '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0,
	ulan 2012/04/16 13:52:22 Consider adding a static assert that the range cou Consider adding a static assert that the range counts are odd or a comment that 0x10000 is dummy. Maybe I missed something, but it looks like 0x10000 is never used in the code below, because all loops do (i+=2) and go while i < count-1. I didn't verify that the ranges have correct values. Erik Corry 2012/04/17 07:51:45 AddRange jsregexp.cc:118 uses all the elements in Show quoted text On 2012/04/16 13:52:22, ulan wrote: > Consider adding a static assert that the range counts are odd or a comment that > 0x10000 is dummy. Maybe I missed something, but it looks like 0x10000 is never > used in the code below, because all loops do (i+=2) and go while i < count-1. > > I didn't verify that the ranges have correct values. AddRange jsregexp.cc:118 uses all the elements in the table. ASSERTS added
	3287 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A,

	3288 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, 0x10000 };

	3289 static const int kSpaceRange2Count = ARRAY_SIZE(kSpaceRanges2);

	3290

	3291 static const int kWordRanges2[] = {

	3292 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 };

	3293 static const int kWordRange2Count = ARRAY_SIZE(kWordRanges2);

	3294 static const int kDigitRanges2[] = { '0', '9' + 1, 0x10000 };

	3295 static const int kDigitRange2Count = ARRAY_SIZE(kDigitRanges2);

	3296 static const int kSurrogateRanges2[] = { 0xd800, 0xe000, 0x10000 };

	3297 static const int kSurrogateRange2Count = ARRAY_SIZE(kSurrogateRanges2);

	3298 static const int kLineTerminatorRanges2[] = { 0x000A, 0x000B, 0x000D, 0x000E,

	3299 0x2028, 0x202A, 0x10000 };

	3300 static const int kLineTerminatorRange2Count = ARRAY_SIZE(kLineTerminatorRanges2) ;
	ulan 2012/04/16 13:52:22 Long line above and too many new lines below. Long line above and too many new lines below. Erik Corry 2012/04/17 07:51:45 Done. Show quoted text On 2012/04/16 13:52:22, ulan wrote: > Long line above and too many new lines below. Done.
	3301

	3302

	3303

	3304 void BoyerMoorePositionInfo::Set(int character) {

	3305 SetInterval(Interval(character, character));

	3306 }

	3307

	3308

	3309 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {

	3310 s_ = AddRange(s_, kSpaceRanges2, kSpaceRange2Count, interval);

	3311 w_ = AddRange(w_, kWordRanges2, kWordRange2Count, interval);

	3312 d_ = AddRange(d_, kDigitRanges2, kDigitRange2Count, interval);

	3313 surrogate_ =

	3314 AddRange(surrogate_, kSurrogateRanges2, kSurrogateRange2Count, interval);

	3315 if (interval.to() - interval.from() >= kMapSize - 1) {

	3316 if (map_count_ != kMapSize) {

	3317 map_count_ = kMapSize;

	3318 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;

	3319 }

	3320 return;

	3321 }

	3322 for (int i = interval.from(); i <= interval.to(); i++) {

	3323 int mod_character = (i & kMask);

	3324 if (!map_->at(mod_character)) {

	3325 map_count_++;

	3326 map_->at(mod_character) = true;

	3327 }

	3328 if (map_count_ == kMapSize) return;

	3329 }

	3330 }

	3331

	3332

	3333 void BoyerMoorePositionInfo::SetAll() {

	3334 s_ = w_ = d_ = kLatticeUnknown;

	3335 if (map_count_ != kMapSize) {

	3336 map_count_ = kMapSize;

	3337 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;

	3338 }

	3339 }

	3340

	3341

3280 BoyerMooreLookahead::BoyerMooreLookahead(	3342 BoyerMooreLookahead::BoyerMooreLookahead(

3281 int length, int map_length, RegExpCompiler* compiler)	3343 int length, RegExpCompiler* compiler)

3282 : length_(length),	3344 : length_(length),

3283 map_length_(map_length),

3284 compiler_(compiler) {	3345 compiler_(compiler) {

3285 ASSERT(IsPowerOf2(map_length));

3286 if (compiler->ascii()) {	3346 if (compiler->ascii()) {

3287 max_char_ = String::kMaxAsciiCharCode;	3347 max_char_ = String::kMaxAsciiCharCode;

3288 } else {	3348 } else {

3289 max_char_ = String::kMaxUtf16CodeUnit;	3349 max_char_ = String::kMaxUtf16CodeUnit;

3290 }	3350 }

3291 bitmaps_ = new ZoneList<ZoneList<bool>*>(length);	3351 bitmaps_ = new ZoneList<BoyerMoorePositionInfo*>(length);

3292 for (int i = 0; i < length; i++) {	3352 for (int i = 0; i < length; i++) {

3293 bitmaps_->Add(new ZoneList<bool>(map_length));	3353 bitmaps_->Add(new BoyerMoorePositionInfo());

3294 ZoneList<bool>* map = bitmaps_->at(i);

3295 for (int i = 0; i < map_length; i++) {

3296 map->Add(false);

3297 }

3298 }	3354 }

3299 }	3355 }

3300	3356

3301	3357

3302 // Find the longest range of lookahead that has the fewest number of different	3358 // Find the longest range of lookahead that has the fewest number of different

3303 // characters that can occur at a given position. Since we are optimizing two	3359 // characters that can occur at a given position. Since we are optimizing two

3304 // different parameters at once this is a tradeoff.	3360 // different parameters at once this is a tradeoff.

3305 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {	3361 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {

3306 int biggest_points = 0;	3362 int biggest_points = 0;

3307 for (int max_number_of_chars = 4;	3363 for (int max_number_of_chars = 4;

3308 max_number_of_chars < kTooManyCharacters;	3364 max_number_of_chars < 32;
	ulan 2012/04/16 13:52:22 Now 32 is more magical :) Now 32 is more magical :) Erik Corry 2012/04/17 07:51:45 Done. Show quoted text On 2012/04/16 13:52:22, ulan wrote: > Now 32 is more magical :) Done.
3309 max_number_of_chars *= 2) {	3365 max_number_of_chars *= 2) {

3310 biggest_points =	3366 biggest_points =

3311 FindBestInterval(max_number_of_chars, biggest_points, from, to);	3367 FindBestInterval(max_number_of_chars, biggest_points, from, to);

3312 }	3368 }

3313 if (biggest_points == 0) return false;	3369 if (biggest_points == 0) return false;

3314 return true;	3370 return true;

3315 }	3371 }

3316	3372

3317	3373

3318 // Find the highest-points range between 0 and length_ where the character	3374 // Find the highest-points range between 0 and length_ where the character

3319 // information is not too vague. 'Too vague' means that there are more than	3375 // information is not too vague. 'Too vague' means that there are more than

3320 // max_number_of_chars that can occur at this position. Calculates the number	3376 // max_number_of_chars that can occur at this position. Calculates the number

3321 // of points as the product of width-of-the-range and	3377 // of points as the product of width-of-the-range and

3322 // probability-of-finding-one-of-the-characters, where the probability is	3378 // probability-of-finding-one-of-the-characters, where the probability is

3323 // calculated using the frequency distribution of the sample subject string.	3379 // calculated using the frequency distribution of the sample subject string.

3324 int BoyerMooreLookahead::FindBestInterval(	3380 int BoyerMooreLookahead::FindBestInterval(

3325 int max_number_of_chars, int old_biggest_points, int* from, int* to) {	3381 int max_number_of_chars, int old_biggest_points, int* from, int* to) {

3326 int biggest_points = old_biggest_points;	3382 int biggest_points = old_biggest_points;

3327 static const int kSize = RegExpMacroAssembler::kTableSize;	3383 static const int kSize = RegExpMacroAssembler::kTableSize;

3328 for (int i = 0; i < length_; ) {	3384 for (int i = 0; i < length_; ) {

3329 while (i < length_ && Count(i) > max_number_of_chars) i++;	3385 while (i < length_ && Count(i) > max_number_of_chars) i++;

3330 if (i == length_) break;	3386 if (i == length_) break;

3331 int remembered_from = i;	3387 int remembered_from = i;

3332 bool union_map[kSize];	3388 bool union_map[kSize];

3333 for (int j = 0; j < kSize; j++) union_map[j] = false;	3389 for (int j = 0; j < kSize; j++) union_map[j] = false;

3334 while (i < length_ && Count(i) <= max_number_of_chars) {	3390 while (i < length_ && Count(i) <= max_number_of_chars) {

3335 ZoneList<bool>* map = bitmaps_->at(i);	3391 BoyerMoorePositionInfo* map = bitmaps_->at(i);

3336 for (int j = 0; j < kSize; j++) union_map[j] \|= map->at(j);	3392 for (int j = 0; j < kSize; j++) union_map[j] \|= map->at(j);

3337 i++;	3393 i++;

3338 }	3394 }

3339 int frequency = 0;	3395 int frequency = 0;

3340 for (int j = 0; j < kSize; j++) {	3396 for (int j = 0; j < kSize; j++) {

3341 if (union_map[j]) {	3397 if (union_map[j]) {

3342 // Add 1 to the frequency to give a small per-character boost for	3398 // Add 1 to the frequency to give a small per-character boost for

3343 // the cases where our sampling is not good enough and many	3399 // the cases where our sampling is not good enough and many

3344 // characters have a frequency of zero. This means the frequency	3400 // characters have a frequency of zero. This means the frequency

3345 // can theoretically be up to 2*kSize though we treat it mostly as	3401 // can theoretically be up to 2*kSize though we treat it mostly as

(...skipping 34 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3380	3436

3381 const int kSkipArrayEntry = 0;	3437 const int kSkipArrayEntry = 0;

3382 const int kDontSkipArrayEntry = 1;	3438 const int kDontSkipArrayEntry = 1;

3383	3439

3384 for (int i = 0; i < kSize; i++) {	3440 for (int i = 0; i < kSize; i++) {

3385 boolean_skip_table->set(i, kSkipArrayEntry);	3441 boolean_skip_table->set(i, kSkipArrayEntry);

3386 }	3442 }

3387 int skip = max_lookahead + 1 - min_lookahead;	3443 int skip = max_lookahead + 1 - min_lookahead;

3388	3444

3389 for (int i = max_lookahead; i >= min_lookahead; i--) {	3445 for (int i = max_lookahead; i >= min_lookahead; i--) {

3390 ZoneList<bool>* map = bitmaps_->at(i);	3446 BoyerMoorePositionInfo* map = bitmaps_->at(i);

3391 for (int j = 0; j < map_length_; j++) {	3447 for (int j = 0; j < kSize; j++) {

3392 if (map->at(j)) {	3448 if (map->at(j)) {

3393 boolean_skip_table->set(j, kDontSkipArrayEntry);	3449 boolean_skip_table->set(j, kDontSkipArrayEntry);

3394 }	3450 }

3395 }	3451 }

3396 }	3452 }

3397	3453

3398 return skip;	3454 return skip;

3399 }	3455 }

3400	3456

3401	3457

3402 // See comment above on the implementation of GetSkipTable.	3458 // See comment above on the implementation of GetSkipTable.

3403 bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {	3459 bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {

	3460 const int kSize = RegExpMacroAssembler::kTableSize;

	3461

3404 int min_lookahead = 0;	3462 int min_lookahead = 0;

3405 int max_lookahead = 0;	3463 int max_lookahead = 0;

3406	3464

3407 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false;	3465 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false;

3408	3466

3409 bool found_single_character = false;	3467 bool found_single_character = false;

3410 bool abandoned_search_for_single_character = false;

3411 int single_character = 0;	3468 int single_character = 0;

3412 for (int i = max_lookahead; i >= min_lookahead; i--) {	3469 for (int i = max_lookahead; i >= min_lookahead; i--) {

3413 ZoneList<bool>* map = bitmaps_->at(i);	3470 BoyerMoorePositionInfo* map = bitmaps_->at(i);

3414 for (int j = 0; j < map_length_; j++) {	3471 if (map->map_count() > 1 \|\|

	3472 (found_single_character && map->map_count() != 0)) {

	3473 found_single_character = false;

	3474 break;

	3475 }

	3476 for (int j = 0; j < kSize; j++) {

3415 if (map->at(j)) {	3477 if (map->at(j)) {

3416 if (found_single_character) {	3478 found_single_character = true;

3417 found_single_character = false; // Found two.	3479 single_character = j;

3418 abandoned_search_for_single_character = true;	3480 break;

3419 break;

3420 } else {

3421 found_single_character = true;

3422 single_character = j;

3423 }

3424 }	3481 }

3425 }	3482 }

3426 if (abandoned_search_for_single_character) break;

3427 }	3483 }

3428	3484

3429 int lookahead_width = max_lookahead + 1 - min_lookahead;	3485 int lookahead_width = max_lookahead + 1 - min_lookahead;

3430	3486

3431 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {	3487 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {

3432 // The mask-compare can probably handle this better.	3488 // The mask-compare can probably handle this better.

3433 return false;	3489 return false;

3434 }	3490 }

3435	3491

3436 if (found_single_character) {	3492 if (found_single_character) {

3437 Label cont, again;	3493 Label cont, again;

3438 masm->Bind(&again);	3494 masm->Bind(&again);

3439 masm->LoadCurrentCharacter(max_lookahead, &cont, true);	3495 masm->LoadCurrentCharacter(max_lookahead, &cont, true);

3440 if (max_char_ > map_length_) {	3496 if (max_char_ > kSize) {

3441 ASSERT(map_length_ == RegExpMacroAssembler::kTableSize);

3442 masm->CheckCharacterAfterAnd(single_character,	3497 masm->CheckCharacterAfterAnd(single_character,

3443 RegExpMacroAssembler::kTableMask,	3498 RegExpMacroAssembler::kTableMask,

3444 &cont);	3499 &cont);

3445 } else {	3500 } else {

3446 masm->CheckCharacter(single_character, &cont);	3501 masm->CheckCharacter(single_character, &cont);

3447 }	3502 }

3448 masm->AdvanceCurrentPosition(lookahead_width);	3503 masm->AdvanceCurrentPosition(lookahead_width);

3449 masm->GoTo(&again);	3504 masm->GoTo(&again);

3450 masm->Bind(&cont);	3505 masm->Bind(&cont);

3451 return true;	3506 return true;

3452 }	3507 }

3453	3508

3454 Handle<ByteArray> boolean_skip_table =	3509 Handle<ByteArray> boolean_skip_table =

3455 FACTORY->NewByteArray(map_length_, TENURED);	3510 FACTORY->NewByteArray(kSize, TENURED);

3456 int skip_distance = GetSkipTable(	3511 int skip_distance = GetSkipTable(

3457 min_lookahead, max_lookahead, boolean_skip_table);	3512 min_lookahead, max_lookahead, boolean_skip_table);

3458 ASSERT(skip_distance != 0);	3513 ASSERT(skip_distance != 0);

3459	3514

3460 Label cont, again;	3515 Label cont, again;

3461 masm->Bind(&again);	3516 masm->Bind(&again);

3462 masm->LoadCurrentCharacter(max_lookahead, &cont, true);	3517 masm->LoadCurrentCharacter(max_lookahead, &cont, true);

3463 masm->CheckBitInTable(boolean_skip_table, &cont);	3518 masm->CheckBitInTable(boolean_skip_table, &cont);

3464 masm->AdvanceCurrentPosition(skip_distance);	3519 masm->AdvanceCurrentPosition(skip_distance);

3465 masm->GoTo(&again);	3520 masm->GoTo(&again);

(...skipping 158 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3624 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) ==	3679 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) ==

3625 this) {	3680 this) {

3626 // At this point we know that we are at a non-greedy loop that will eat	3681 // At this point we know that we are at a non-greedy loop that will eat

3627 // any character one at a time. Any non-anchored regexp has such a	3682 // any character one at a time. Any non-anchored regexp has such a

3628 // loop prepended to it in order to find where it starts. We look for	3683 // loop prepended to it in order to find where it starts. We look for

3629 // a pattern of the form ...abc... where we can look 6 characters ahead	3684 // a pattern of the form ...abc... where we can look 6 characters ahead

3630 // and step forwards 3 if the character is not one of abc. Abc need	3685 // and step forwards 3 if the character is not one of abc. Abc need

3631 // not be atoms, they can be any reasonably limited character class or	3686 // not be atoms, they can be any reasonably limited character class or

3632 // small alternation.	3687 // small alternation.

3633 ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes.	3688 ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes.

3634 eats_at_least =	3689 BoyerMooreLookahead* lookahead = bm_info(not_at_start);

3635 Min(kMaxLookaheadForBoyerMoore,	3690 if (lookahead == NULL) {

3636 EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start));	3691 eats_at_least =

3637 if (eats_at_least >= 1) {	3692 Min(kMaxLookaheadForBoyerMoore,

3638 BoyerMooreLookahead bm(eats_at_least,	3693 EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start));

3639 RegExpMacroAssembler::kTableSize,	3694 if (eats_at_least >= 1) {

3640 compiler);	3695 BoyerMooreLookahead* bm =

3641 GuardedAlternative alt0 = alternatives_->at(0);	3696 new BoyerMooreLookahead(eats_at_least, compiler);

3642 alt0.node()->FillInBMInfo(0, &bm, not_at_start);	3697 GuardedAlternative alt0 = alternatives_->at(0);

3643 skip_was_emitted = bm.EmitSkipInstructions(macro_assembler);	3698 alt0.node()->FillInBMInfo(0, bm, not_at_start);

	3699 skip_was_emitted = bm->EmitSkipInstructions(macro_assembler);

	3700 }

	3701 } else {

	3702 skip_was_emitted = lookahead->EmitSkipInstructions(macro_assembler);

3644 }	3703 }

3645 }	3704 }

3646 }	3705 }

3647 }	3706 }

3648	3707

3649 if (eats_at_least == kEatsAtLeastNotYetInitialized) {	3708 if (eats_at_least == kEatsAtLeastNotYetInitialized) {

3650 // Save some time by looking at most one machine word ahead.	3709 // Save some time by looking at most one machine word ahead.

3651 eats_at_least = EatsAtLeast(compiler->ascii() ? 4 : 2, 0, not_at_start);	3710 eats_at_least = EatsAtLeast(compiler->ascii() ? 4 : 2, 0, not_at_start);

3652 }	3711 }

3653 int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least);	3712 int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least);

(...skipping 542 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4196 break;	4255 break;

4197 case AssertionNode::AT_BOUNDARY:	4256 case AssertionNode::AT_BOUNDARY:

4198 stream()->Add("label=\"\\b\", shape=septagon");	4257 stream()->Add("label=\"\\b\", shape=septagon");

4199 break;	4258 break;

4200 case AssertionNode::AT_NON_BOUNDARY:	4259 case AssertionNode::AT_NON_BOUNDARY:

4201 stream()->Add("label=\"\\B\", shape=septagon");	4260 stream()->Add("label=\"\\B\", shape=septagon");

4202 break;	4261 break;

4203 case AssertionNode::AFTER_NEWLINE:	4262 case AssertionNode::AFTER_NEWLINE:

4204 stream()->Add("label=\"(?<=\\n)\", shape=septagon");	4263 stream()->Add("label=\"(?<=\\n)\", shape=septagon");

4205 break;	4264 break;

4206 case AssertionNode::AFTER_WORD_CHARACTER:

4207 stream()->Add("label=\"(?<=\\w)\", shape=septagon");

4208 break;

4209 case AssertionNode::AFTER_NONWORD_CHARACTER:

4210 stream()->Add("label=\"(?<=\\W)\", shape=septagon");

4211 break;

4212 }	4265 }

4213 stream()->Add("];\n");	4266 stream()->Add("];\n");

4214 PrintAttributes(that);	4267 PrintAttributes(that);

4215 RegExpNode* successor = that->on_success();	4268 RegExpNode* successor = that->on_success();

4216 stream()->Add(" n%p -> n%p;\n", that, successor);	4269 stream()->Add(" n%p -> n%p;\n", that, successor);

4217 Visit(successor);	4270 Visit(successor);

4218 }	4271 }

4219	4272

4220	4273

4221 void DotPrinter::VisitAction(ActionNode* that) {	4274 void DotPrinter::VisitAction(ActionNode* that) {

(...skipping 84 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4306 printer.PrintNode(label, node);	4359 printer.PrintNode(label, node);

4307 }	4360 }

4308	4361

4309	4362

4310 #endif // DEBUG	4363 #endif // DEBUG

4311	4364

4312	4365

4313 // -------------------------------------------------------------------	4366 // -------------------------------------------------------------------

4314 // Tree to graph conversion	4367 // Tree to graph conversion

4315	4368

4316 static const uc16 kSpaceRanges[] = { 0x0009, 0x000D, 0x0020, 0x0020, 0x00A0,

4317 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A, 0x2028, 0x2029,

4318 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000, 0xFEFF, 0xFEFF };

4319 static const int kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges);

4320

4321 static const uc16 kWordRanges[] = { '0', '9', 'A', 'Z', '_', '_', 'a', 'z' };

4322 static const int kWordRangeCount = ARRAY_SIZE(kWordRanges);

4323

4324 static const uc16 kDigitRanges[] = { '0', '9' };

4325 static const int kDigitRangeCount = ARRAY_SIZE(kDigitRanges);

4326

4327 static const uc16 kLineTerminatorRanges[] = { 0x000A, 0x000A, 0x000D, 0x000D,

4328 0x2028, 0x2029 };

4329 static const int kLineTerminatorRangeCount = ARRAY_SIZE(kLineTerminatorRanges);

4330

4331 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,	4369 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,

4332 RegExpNode* on_success) {	4370 RegExpNode* on_success) {

4333 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);	4371 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);

4334 elms->Add(TextElement::Atom(this));	4372 elms->Add(TextElement::Atom(this));

4335 return new TextNode(elms, on_success);	4373 return new TextNode(elms, on_success);

4336 }	4374 }

4337	4375

4338	4376

4339 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,	4377 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,

4340 RegExpNode* on_success) {	4378 RegExpNode* on_success) {

4341 return new TextNode(elements(), on_success);	4379 return new TextNode(elements(), on_success);

4342 }	4380 }

4343	4381

	4382

4344 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,	4383 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,

4345 const uc16* special_class,	4384 const int* special_class,

4346 int length) {	4385 int length) {

	4386 length--; // Remove final 0x10000.

	4387 ASSERT(special_class[length] == 0x10000);

4347 ASSERT(ranges->length() != 0);	4388 ASSERT(ranges->length() != 0);

4348 ASSERT(length != 0);	4389 ASSERT(length != 0);

4349 ASSERT(special_class[0] != 0);	4390 ASSERT(special_class[0] != 0);

4350 if (ranges->length() != (length >> 1) + 1) {	4391 if (ranges->length() != (length >> 1) + 1) {

4351 return false;	4392 return false;

4352 }	4393 }

4353 CharacterRange range = ranges->at(0);	4394 CharacterRange range = ranges->at(0);

4354 if (range.from() != 0) {	4395 if (range.from() != 0) {

4355 return false;	4396 return false;

4356 }	4397 }

4357 for (int i = 0; i < length; i += 2) {	4398 for (int i = 0; i < length; i += 2) {

4358 if (special_class[i] != (range.to() + 1)) {	4399 if (special_class[i] != (range.to() + 1)) {

4359 return false;	4400 return false;

4360 }	4401 }

4361 range = ranges->at((i >> 1) + 1);	4402 range = ranges->at((i >> 1) + 1);

4362 if (special_class[i+1] != range.from() - 1) {	4403 if (special_class[i+1] != range.from()) {

4363 return false;	4404 return false;

4364 }	4405 }

4365 }	4406 }

4366 if (range.to() != 0xffff) {	4407 if (range.to() != 0xffff) {

4367 return false;	4408 return false;

4368 }	4409 }

4369 return true;	4410 return true;

4370 }	4411 }

4371	4412

4372	4413

4373 static bool CompareRanges(ZoneList<CharacterRange>* ranges,	4414 static bool CompareRanges(ZoneList<CharacterRange>* ranges,

4374 const uc16* special_class,	4415 const int* special_class,

4375 int length) {	4416 int length) {

	4417 length--; // Remove final 0x10000.

	4418 ASSERT(special_class[length] == 0x10000);

4376 if (ranges->length() * 2 != length) {	4419 if (ranges->length() * 2 != length) {

4377 return false;	4420 return false;

4378 }	4421 }

4379 for (int i = 0; i < length; i += 2) {	4422 for (int i = 0; i < length; i += 2) {

4380 CharacterRange range = ranges->at(i >> 1);	4423 CharacterRange range = ranges->at(i >> 1);

4381 if (range.from() != special_class[i] \|\| range.to() != special_class[i+1]) {	4424 if (range.from() != special_class[i] \|\|

	4425 range.to() != special_class[i + 1] - 1) {

4382 return false;	4426 return false;

4383 }	4427 }

4384 }	4428 }

4385 return true;	4429 return true;

4386 }	4430 }

4387	4431

4388	4432

4389 bool RegExpCharacterClass::is_standard() {	4433 bool RegExpCharacterClass::is_standard() {

4390 // TODO(lrn): Remove need for this function, by not throwing away information	4434 // TODO(lrn): Remove need for this function, by not throwing away information

4391 // along the way.	4435 // along the way.

4392 if (is_negated_) {	4436 if (is_negated_) {

4393 return false;	4437 return false;

4394 }	4438 }

4395 if (set_.is_standard()) {	4439 if (set_.is_standard()) {

4396 return true;	4440 return true;

4397 }	4441 }

4398 if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {	4442 if (CompareRanges(set_.ranges(), kSpaceRanges2, kSpaceRange2Count)) {

4399 set_.set_standard_set_type('s');	4443 set_.set_standard_set_type('s');

4400 return true;	4444 return true;

4401 }	4445 }

4402 if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {	4446 if (CompareInverseRanges(set_.ranges(), kSpaceRanges2, kSpaceRange2Count)) {

4403 set_.set_standard_set_type('S');	4447 set_.set_standard_set_type('S');

4404 return true;	4448 return true;

4405 }	4449 }

4406 if (CompareInverseRanges(set_.ranges(),	4450 if (CompareInverseRanges(set_.ranges(),

4407 kLineTerminatorRanges,	4451 kLineTerminatorRanges2,

4408 kLineTerminatorRangeCount)) {	4452 kLineTerminatorRange2Count)) {

4409 set_.set_standard_set_type('.');	4453 set_.set_standard_set_type('.');

4410 return true;	4454 return true;

4411 }	4455 }

4412 if (CompareRanges(set_.ranges(),	4456 if (CompareRanges(set_.ranges(),

4413 kLineTerminatorRanges,	4457 kLineTerminatorRanges2,

4414 kLineTerminatorRangeCount)) {	4458 kLineTerminatorRange2Count)) {

4415 set_.set_standard_set_type('n');	4459 set_.set_standard_set_type('n');

4416 return true;	4460 return true;

4417 }	4461 }

4418 if (CompareRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {	4462 if (CompareRanges(set_.ranges(), kWordRanges2, kWordRange2Count)) {

4419 set_.set_standard_set_type('w');	4463 set_.set_standard_set_type('w');

4420 return true;	4464 return true;

4421 }	4465 }

4422 if (CompareInverseRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {	4466 if (CompareInverseRanges(set_.ranges(), kWordRanges2, kWordRange2Count)) {

4423 set_.set_standard_set_type('W');	4467 set_.set_standard_set_type('W');

4424 return true;	4468 return true;

4425 }	4469 }

4426 return false;	4470 return false;

4427 }	4471 }

4428	4472

4429	4473

4430 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,	4474 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,

4431 RegExpNode* on_success) {	4475 RegExpNode* on_success) {

4432 return new TextNode(this, on_success);	4476 return new TextNode(this, on_success);

(...skipping 339 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4772 RegExpNode* on_success) {	4816 RegExpNode* on_success) {

4773 ZoneList<RegExpTree> children = nodes();	4817 ZoneList<RegExpTree> children = nodes();

4774 RegExpNode* current = on_success;	4818 RegExpNode* current = on_success;

4775 for (int i = children->length() - 1; i >= 0; i--) {	4819 for (int i = children->length() - 1; i >= 0; i--) {

4776 current = children->at(i)->ToNode(compiler, current);	4820 current = children->at(i)->ToNode(compiler, current);

4777 }	4821 }

4778 return current;	4822 return current;

4779 }	4823 }

4780	4824

4781	4825

4782 static void AddClass(const uc16* elmv,	4826 static void AddClass(const int* elmv,

4783 int elmc,	4827 int elmc,

4784 ZoneList<CharacterRange>* ranges) {	4828 ZoneList<CharacterRange>* ranges) {

	4829 elmc--;

	4830 ASSERT(elmv[elmc] == 0x10000);

4785 for (int i = 0; i < elmc; i += 2) {	4831 for (int i = 0; i < elmc; i += 2) {

4786 ASSERT(elmv[i] <= elmv[i + 1]);	4832 ASSERT(elmv[i] < elmv[i + 1]);

4787 ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));	4833 ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1));

4788 }	4834 }

4789 }	4835 }

4790	4836

4791	4837

4792 static void AddClassNegated(const uc16 *elmv,	4838 static void AddClassNegated(const int *elmv,

4793 int elmc,	4839 int elmc,

4794 ZoneList<CharacterRange>* ranges) {	4840 ZoneList<CharacterRange>* ranges) {

	4841 elmc--;

	4842 ASSERT(elmv[elmc] == 0x10000);

4795 ASSERT(elmv[0] != 0x0000);	4843 ASSERT(elmv[0] != 0x0000);

4796 ASSERT(elmv[elmc-1] != String::kMaxUtf16CodeUnit);	4844 ASSERT(elmv[elmc-1] != String::kMaxUtf16CodeUnit);

4797 uc16 last = 0x0000;	4845 uc16 last = 0x0000;

4798 for (int i = 0; i < elmc; i += 2) {	4846 for (int i = 0; i < elmc; i += 2) {

4799 ASSERT(last <= elmv[i] - 1);	4847 ASSERT(last <= elmv[i] - 1);

4800 ASSERT(elmv[i] <= elmv[i + 1]);	4848 ASSERT(elmv[i] < elmv[i + 1]);

4801 ranges->Add(CharacterRange(last, elmv[i] - 1));	4849 ranges->Add(CharacterRange(last, elmv[i] - 1));

4802 last = elmv[i + 1] + 1;	4850 last = elmv[i + 1];

4803 }	4851 }

4804 ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit));	4852 ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit));

4805 }	4853 }

4806	4854

4807	4855

4808 void CharacterRange::AddClassEscape(uc16 type,	4856 void CharacterRange::AddClassEscape(uc16 type,

4809 ZoneList<CharacterRange>* ranges) {	4857 ZoneList<CharacterRange>* ranges) {

4810 switch (type) {	4858 switch (type) {

4811 case 's':	4859 case 's':

4812 AddClass(kSpaceRanges, kSpaceRangeCount, ranges);	4860 AddClass(kSpaceRanges2, kSpaceRange2Count, ranges);

4813 break;	4861 break;

4814 case 'S':	4862 case 'S':

4815 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);	4863 AddClassNegated(kSpaceRanges2, kSpaceRange2Count, ranges);

4816 break;	4864 break;

4817 case 'w':	4865 case 'w':

4818 AddClass(kWordRanges, kWordRangeCount, ranges);	4866 AddClass(kWordRanges2, kWordRange2Count, ranges);

4819 break;	4867 break;

4820 case 'W':	4868 case 'W':

4821 AddClassNegated(kWordRanges, kWordRangeCount, ranges);	4869 AddClassNegated(kWordRanges2, kWordRange2Count, ranges);

4822 break;	4870 break;

4823 case 'd':	4871 case 'd':

4824 AddClass(kDigitRanges, kDigitRangeCount, ranges);	4872 AddClass(kDigitRanges2, kDigitRange2Count, ranges);

4825 break;	4873 break;

4826 case 'D':	4874 case 'D':

4827 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);	4875 AddClassNegated(kDigitRanges2, kDigitRange2Count, ranges);

4828 break;	4876 break;

4829 case '.':	4877 case '.':

4830 AddClassNegated(kLineTerminatorRanges,	4878 AddClassNegated(kLineTerminatorRanges2,

4831 kLineTerminatorRangeCount,	4879 kLineTerminatorRange2Count,

4832 ranges);	4880 ranges);

4833 break;	4881 break;

4834 // This is not a character range as defined by the spec but a	4882 // This is not a character range as defined by the spec but a

4835 // convenient shorthand for a character class that matches any	4883 // convenient shorthand for a character class that matches any

4836 // character.	4884 // character.

4837 case '*':	4885 case '*':

4838 ranges->Add(CharacterRange::Everything());	4886 ranges->Add(CharacterRange::Everything());

4839 break;	4887 break;

4840 // This is the set of characters matched by the $ and ^ symbols	4888 // This is the set of characters matched by the $ and ^ symbols

4841 // in multiline mode.	4889 // in multiline mode.

4842 case 'n':	4890 case 'n':

4843 AddClass(kLineTerminatorRanges,	4891 AddClass(kLineTerminatorRanges2,

4844 kLineTerminatorRangeCount,	4892 kLineTerminatorRange2Count,

4845 ranges);	4893 ranges);

4846 break;	4894 break;

4847 default:	4895 default:

4848 UNREACHABLE();	4896 UNREACHABLE();

4849 }	4897 }

4850 }	4898 }

4851	4899

4852	4900

4853 Vector<const uc16> CharacterRange::GetWordBounds() {	4901 Vector<const int> CharacterRange::GetWordBounds() {

4854 return Vector<const uc16>(kWordRanges, kWordRangeCount);	4902 return Vector<const int>(kWordRanges2, kWordRange2Count - 1);

4855 }	4903 }

4856	4904

4857	4905

4858 class CharacterRangeSplitter {	4906 class CharacterRangeSplitter {

4859 public:	4907 public:

4860 CharacterRangeSplitter(ZoneList<CharacterRange>** included,	4908 CharacterRangeSplitter(ZoneList<CharacterRange>** included,

4861 ZoneList<CharacterRange>** excluded)	4909 ZoneList<CharacterRange>** excluded)

4862 : included_(included),	4910 : included_(included),

4863 excluded_(excluded) { }	4911 excluded_(excluded) { }

4864 void Call(uc16 from, DispatchTable::Entry entry);	4912 void Call(uc16 from, DispatchTable::Entry entry);

(...skipping 11 matching lines...) Expand all Loading...
4876 if (!entry.out_set()->Get(kInBase)) return;	4924 if (!entry.out_set()->Get(kInBase)) return;

4877 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)	4925 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)

4878 ? included_	4926 ? included_

4879 : excluded_;	4927 : excluded_;

4880 if (target == NULL) target = new ZoneList<CharacterRange>(2);	4928 if (target == NULL) target = new ZoneList<CharacterRange>(2);

4881 (*target)->Add(CharacterRange(entry.from(), entry.to()));	4929 (*target)->Add(CharacterRange(entry.from(), entry.to()));

4882 }	4930 }

4883	4931

4884	4932

4885 void CharacterRange::Split(ZoneList<CharacterRange>* base,	4933 void CharacterRange::Split(ZoneList<CharacterRange>* base,

4886 Vector<const uc16> overlay,	4934 Vector<const int> overlay,

4887 ZoneList<CharacterRange>** included,	4935 ZoneList<CharacterRange>** included,

4888 ZoneList<CharacterRange>** excluded) {	4936 ZoneList<CharacterRange>** excluded) {

4889 ASSERT_EQ(NULL, *included);	4937 ASSERT_EQ(NULL, *included);

4890 ASSERT_EQ(NULL, *excluded);	4938 ASSERT_EQ(NULL, *excluded);

4891 DispatchTable table;	4939 DispatchTable table;

4892 for (int i = 0; i < base->length(); i++)	4940 for (int i = 0; i < base->length(); i++)

4893 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);	4941 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);

4894 for (int i = 0; i < overlay.length(); i += 2) {	4942 for (int i = 0; i < overlay.length(); i += 2) {

4895 table.AddRange(CharacterRange(overlay[i], overlay[i+1]),	4943 table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1),

4896 CharacterRangeSplitter::kInOverlay);	4944 CharacterRangeSplitter::kInOverlay);

4897 }	4945 }

4898 CharacterRangeSplitter callback(included, excluded);	4946 CharacterRangeSplitter callback(included, excluded);

4899 table.ForEach(&callback);	4947 table.ForEach(&callback);

4900 }	4948 }

4901	4949

4902	4950

4903 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,	4951 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,

4904 bool is_ascii) {	4952 bool is_ascii) {

4905 Isolate* isolate = Isolate::Current();	4953 Isolate* isolate = Isolate::Current();

(...skipping 65 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4971 if (n <= 1) return true;	5019 if (n <= 1) return true;

4972 int max = ranges->at(0).to();	5020 int max = ranges->at(0).to();

4973 for (int i = 1; i < n; i++) {	5021 for (int i = 1; i < n; i++) {

4974 CharacterRange next_range = ranges->at(i);	5022 CharacterRange next_range = ranges->at(i);

4975 if (next_range.from() <= max + 1) return false;	5023 if (next_range.from() <= max + 1) return false;

4976 max = next_range.to();	5024 max = next_range.to();

4977 }	5025 }

4978 return true;	5026 return true;

4979 }	5027 }

4980	5028

4981 SetRelation CharacterRange::WordCharacterRelation(

4982 ZoneList<CharacterRange>* range) {

4983 ASSERT(IsCanonical(range));

4984 int i = 0; // Word character range index.

4985 int j = 0; // Argument range index.

4986 ASSERT_NE(0, kWordRangeCount);

4987 SetRelation result;

4988 if (range->length() == 0) {

4989 result.SetElementsInSecondSet();

4990 return result;

4991 }

4992 CharacterRange argument_range = range->at(0);

4993 CharacterRange word_range = CharacterRange(kWordRanges[0], kWordRanges[1]);

4994 while (i < kWordRangeCount && j < range->length()) {

4995 // Check the two ranges for the five cases:

4996 // - no overlap.

4997 // - partial overlap (there are elements in both ranges that isn't

4998 // in the other, and there are also elements that are in both).

4999 // - argument range entirely inside word range.

5000 // - word range entirely inside argument range.

5001 // - ranges are completely equal.

5002

5003 // First check for no overlap. The earlier range is not in the other set.

5004 if (argument_range.from() > word_range.to()) {

5005 // Ranges are disjoint. The earlier word range contains elements that

5006 // cannot be in the argument set.

5007 result.SetElementsInSecondSet();

5008 } else if (word_range.from() > argument_range.to()) {

5009 // Ranges are disjoint. The earlier argument range contains elements that

5010 // cannot be in the word set.

5011 result.SetElementsInFirstSet();

5012 } else if (word_range.from() <= argument_range.from() &&

5013 word_range.to() >= argument_range.from()) {

5014 result.SetElementsInBothSets();

5015 // argument range completely inside word range.

5016 if (word_range.from() < argument_range.from() \|\|

5017 word_range.to() > argument_range.from()) {

5018 result.SetElementsInSecondSet();

5019 }

5020 } else if (word_range.from() >= argument_range.from() &&

5021 word_range.to() <= argument_range.from()) {

5022 result.SetElementsInBothSets();

5023 result.SetElementsInFirstSet();

5024 } else {

5025 // There is overlap, and neither is a subrange of the other

5026 result.SetElementsInFirstSet();

5027 result.SetElementsInSecondSet();

5028 result.SetElementsInBothSets();

5029 }

5030 if (result.NonTrivialIntersection()) {

5031 // The result is as (im)precise as we can possibly make it.

5032 return result;

5033 }

5034 // Progress the range(s) with minimal to-character.

5035 uc16 word_to = word_range.to();

5036 uc16 argument_to = argument_range.to();

5037 if (argument_to <= word_to) {

5038 j++;

5039 if (j < range->length()) {

5040 argument_range = range->at(j);

5041 }

5042 }

5043 if (word_to <= argument_to) {

5044 i += 2;

5045 if (i < kWordRangeCount) {

5046 word_range = CharacterRange(kWordRanges[i], kWordRanges[i + 1]);

5047 }

5048 }

5049 }

5050 // Check if anything wasn't compared in the loop.

5051 if (i < kWordRangeCount) {

5052 // word range contains something not in argument range.

5053 result.SetElementsInSecondSet();

5054 } else if (j < range->length()) {

5055 // Argument range contains something not in word range.

5056 result.SetElementsInFirstSet();

5057 }

5058

5059 return result;

5060 }

5061

5062	5029

5063 ZoneList<CharacterRange>* CharacterSet::ranges() {	5030 ZoneList<CharacterRange>* CharacterSet::ranges() {

5064 if (ranges_ == NULL) {	5031 if (ranges_ == NULL) {

5065 ranges_ = new ZoneList<CharacterRange>(2);	5032 ranges_ = new ZoneList<CharacterRange>(2);

5066 CharacterRange::AddClassEscape(standard_set_type_, ranges_);	5033 CharacterRange::AddClassEscape(standard_set_type_, ranges_);

5067 }	5034 }

5068 return ranges_;	5035 return ranges_;

5069 }	5036 }

5070	5037

5071	5038

(...skipping 112 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
5184 num_canonical,	5151 num_canonical,

5185 character_ranges->at(read));	5152 character_ranges->at(read));

5186 read++;	5153 read++;

5187 } while (read < n);	5154 } while (read < n);

5188 character_ranges->Rewind(num_canonical);	5155 character_ranges->Rewind(num_canonical);

5189	5156

5190 ASSERT(CharacterRange::IsCanonical(character_ranges));	5157 ASSERT(CharacterRange::IsCanonical(character_ranges));

5191 }	5158 }

5192	5159

5193	5160

5194 // Utility function for CharacterRange::Merge. Adds a range at the end of

5195 // a canonicalized range list, if necessary merging the range with the last

5196 // range of the list.

5197 static void AddRangeToSet(ZoneList<CharacterRange>* set, CharacterRange range) {

5198 if (set == NULL) return;

5199 ASSERT(set->length() == 0 \|\| set->at(set->length() - 1).to() < range.from());

5200 int n = set->length();

5201 if (n > 0) {

5202 CharacterRange lastRange = set->at(n - 1);

5203 if (lastRange.to() == range.from() - 1) {

5204 set->at(n - 1) = CharacterRange(lastRange.from(), range.to());

5205 return;

5206 }

5207 }

5208 set->Add(range);

5209 }

5210

5211

5212 static void AddRangeToSelectedSet(int selector,

5213 ZoneList<CharacterRange>* first_set,

5214 ZoneList<CharacterRange>* second_set,

5215 ZoneList<CharacterRange>* intersection_set,

5216 CharacterRange range) {

5217 switch (selector) {

5218 case kInsideFirst:

5219 AddRangeToSet(first_set, range);

5220 break;

5221 case kInsideSecond:

5222 AddRangeToSet(second_set, range);

5223 break;

5224 case kInsideBoth:

5225 AddRangeToSet(intersection_set, range);

5226 break;

5227 }

5228 }

5229

5230

5231

5232 void CharacterRange::Merge(ZoneList<CharacterRange>* first_set,

5233 ZoneList<CharacterRange>* second_set,

5234 ZoneList<CharacterRange>* first_set_only_out,

5235 ZoneList<CharacterRange>* second_set_only_out,

5236 ZoneList<CharacterRange>* both_sets_out) {

5237 // Inputs are canonicalized.

5238 ASSERT(CharacterRange::IsCanonical(first_set));

5239 ASSERT(CharacterRange::IsCanonical(second_set));

5240 // Outputs are empty, if applicable.

5241 ASSERT(first_set_only_out == NULL \|\| first_set_only_out->length() == 0);

5242 ASSERT(second_set_only_out == NULL \|\| second_set_only_out->length() == 0);

5243 ASSERT(both_sets_out == NULL \|\| both_sets_out->length() == 0);

5244

5245 // Merge sets by iterating through the lists in order of lowest "from" value,

5246 // and putting intervals into one of three sets.

5247

5248 if (first_set->length() == 0) {

5249 second_set_only_out->AddAll(*second_set);

5250 return;

5251 }

5252 if (second_set->length() == 0) {

5253 first_set_only_out->AddAll(*first_set);

5254 return;

5255 }

5256 // Indices into input lists.

5257 int i1 = 0;

5258 int i2 = 0;

5259 // Cache length of input lists.

5260 int n1 = first_set->length();

5261 int n2 = second_set->length();

5262 // Current range. May be invalid if state is kInsideNone.

5263 int from = 0;

5264 int to = -1;

5265 // Where current range comes from.

5266 int state = kInsideNone;

5267

5268 while (i1 < n1 \|\| i2 < n2) {

5269 CharacterRange next_range;

5270 int range_source;

5271 if (i2 == n2 \|\|

5272 (i1 < n1 && first_set->at(i1).from() < second_set->at(i2).from())) {

5273 // Next smallest element is in first set.

5274 next_range = first_set->at(i1++);

5275 range_source = kInsideFirst;

5276 } else {

5277 // Next smallest element is in second set.

5278 next_range = second_set->at(i2++);

5279 range_source = kInsideSecond;

5280 }

5281 if (to < next_range.from()) {

5282 // Ranges disjoint: \|current\| \|next\|

5283 AddRangeToSelectedSet(state,

5284 first_set_only_out,

5285 second_set_only_out,

5286 both_sets_out,

5287 CharacterRange(from, to));

5288 from = next_range.from();

5289 to = next_range.to();

5290 state = range_source;

5291 } else {

5292 if (from < next_range.from()) {

5293 AddRangeToSelectedSet(state,

5294 first_set_only_out,

5295 second_set_only_out,

5296 both_sets_out,

5297 CharacterRange(from, next_range.from()-1));

5298 }

5299 if (to < next_range.to()) {

5300 // Ranges overlap: \|current\|

5301 // \|next\|

5302 AddRangeToSelectedSet(state \| range_source,

5303 first_set_only_out,

5304 second_set_only_out,

5305 both_sets_out,

5306 CharacterRange(next_range.from(), to));

5307 from = to + 1;

5308 to = next_range.to();

5309 state = range_source;

5310 } else {

5311 // Range included: \|current\| , possibly ending at same character.

5312 // \|next\|

5313 AddRangeToSelectedSet(

5314 state \| range_source,

5315 first_set_only_out,

5316 second_set_only_out,

5317 both_sets_out,

5318 CharacterRange(next_range.from(), next_range.to()));

5319 from = next_range.to() + 1;

5320 // If ranges end at same character, both ranges are consumed completely.

5321 if (next_range.to() == to) state = kInsideNone;

5322 }

5323 }

5324 }

5325 AddRangeToSelectedSet(state,

5326 first_set_only_out,

5327 second_set_only_out,

5328 both_sets_out,

5329 CharacterRange(from, to));

5330 }

5331

5332

5333 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,	5161 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,

5334 ZoneList<CharacterRange>* negated_ranges) {	5162 ZoneList<CharacterRange>* negated_ranges) {

5335 ASSERT(CharacterRange::IsCanonical(ranges));	5163 ASSERT(CharacterRange::IsCanonical(ranges));

5336 ASSERT_EQ(0, negated_ranges->length());	5164 ASSERT_EQ(0, negated_ranges->length());

5337 int range_count = ranges->length();	5165 int range_count = ranges->length();

5338 uc16 from = 0;	5166 uc16 from = 0;

5339 int i = 0;	5167 int i = 0;

5340 if (range_count > 0 && ranges->at(0).from() == 0) {	5168 if (range_count > 0 && ranges->at(0).from() == 0) {

5341 from = ranges->at(0).to();	5169 from = ranges->at(0).to();

5342 i = 1;	5170 i = 1;

(...skipping 292 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
5635 }	5463 }

5636	5464

5637	5465

5638 void Analysis::VisitBackReference(BackReferenceNode* that) {	5466 void Analysis::VisitBackReference(BackReferenceNode* that) {

5639 EnsureAnalyzed(that->on_success());	5467 EnsureAnalyzed(that->on_success());

5640 }	5468 }

5641	5469

5642	5470

5643 void Analysis::VisitAssertion(AssertionNode* that) {	5471 void Analysis::VisitAssertion(AssertionNode* that) {

5644 EnsureAnalyzed(that->on_success());	5472 EnsureAnalyzed(that->on_success());

5645 AssertionNode::AssertionNodeType type = that->type();

5646 if (type == AssertionNode::AT_BOUNDARY \|\|

5647 type == AssertionNode::AT_NON_BOUNDARY) {

5648 // Check if the following character is known to be a word character

5649 // or known to not be a word character.

5650 ZoneList<CharacterRange>* following_chars = that->FirstCharacterSet();

5651

5652 CharacterRange::Canonicalize(following_chars);

5653

5654 SetRelation word_relation =

5655 CharacterRange::WordCharacterRelation(following_chars);

5656 if (word_relation.Disjoint()) {

5657 // Includes the case where following_chars is empty (e.g., end-of-input).

5658 // Following character is definitely not a word character.

5659 type = (type == AssertionNode::AT_BOUNDARY) ?

5660 AssertionNode::AFTER_WORD_CHARACTER :

5661 AssertionNode::AFTER_NONWORD_CHARACTER;

5662 that->set_type(type);

5663 } else if (word_relation.ContainedIn()) {

5664 // Following character is definitely a word character.

5665 type = (type == AssertionNode::AT_BOUNDARY) ?

5666 AssertionNode::AFTER_NONWORD_CHARACTER :

5667 AssertionNode::AFTER_WORD_CHARACTER;

5668 that->set_type(type);

5669 }

5670 }

5671 }	5473 }

5672	5474

5673	5475

5674 ZoneList<CharacterRange>* RegExpNode::FirstCharacterSet() {	5476 void BackReferenceNode::FillInBMInfo(

5675 if (first_character_set_ == NULL) {	5477 int offset, BoyerMooreLookahead* bm, bool not_at_start) {

5676 if (ComputeFirstCharacterSet(kFirstCharBudget) < 0) {	5478 // Working out the set of characters that a backreference can match is too

5677 // If we can't find an exact solution within the budget, we	5479 // hard, so we just say that any character can match.

5678 // set the value to the set of every character, i.e., all characters	5480 bm->SetRest(offset);

5679 // are possible.	5481 if (offset == 0) set_bm_info(not_at_start, bm);

5680 ZoneList<CharacterRange>* all_set = new ZoneList<CharacterRange>(1);

5681 all_set->Add(CharacterRange::Everything());

5682 first_character_set_ = all_set;

5683 }

5684 }

5685 return first_character_set_;

5686 }	5482 }

5687	5483

5688	5484

5689 int RegExpNode::ComputeFirstCharacterSet(int budget) {	5485 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==

5690 // Default behavior is to not be able to determine the first character.	5486 RegExpMacroAssembler::kTableSize);

5691 return kComputeFirstCharacterSetFail;

5692 }

5693

5694

5695 int LoopChoiceNode::ComputeFirstCharacterSet(int budget) {

5696 budget--;

5697 if (budget >= 0) {

5698 // Find loop min-iteration. It's the value of the guarded choice node

5699 // with a GEQ guard, if any.

5700 int min_repetition = 0;

5701

5702 for (int i = 0; i <= 1; i++) {

5703 GuardedAlternative alternative = alternatives()->at(i);

5704 ZoneList<Guard> guards = alternative.guards();

5705 if (guards != NULL && guards->length() > 0) {

5706 Guard* guard = guards->at(0);

5707 if (guard->op() == Guard::GEQ) {

5708 min_repetition = guard->value();

5709 break;

5710 }

5711 }

5712 }

5713

5714 budget = loop_node()->ComputeFirstCharacterSet(budget);

5715 if (budget >= 0) {

5716 ZoneList<CharacterRange>* character_set =

5717 loop_node()->first_character_set();

5718 if (body_can_be_zero_length() \|\| min_repetition == 0) {

5719 budget = continue_node()->ComputeFirstCharacterSet(budget);

5720 if (budget < 0) return budget;

5721 ZoneList<CharacterRange>* body_set =

5722 continue_node()->first_character_set();

5723 ZoneList<CharacterRange>* union_set =

5724 new ZoneList<CharacterRange>(Max(character_set->length(),

5725 body_set->length()));

5726 CharacterRange::Merge(character_set,

5727 body_set,

5728 union_set,

5729 union_set,

5730 union_set);

5731 character_set = union_set;

5732 }

5733 set_first_character_set(character_set);

5734 }

5735 }

5736 return budget;

5737 }

5738

5739

5740 int NegativeLookaheadChoiceNode::ComputeFirstCharacterSet(int budget) {

5741 budget--;

5742 if (budget >= 0) {

5743 GuardedAlternative successor = this->alternatives()->at(1);

5744 RegExpNode* successor_node = successor.node();

5745 budget = successor_node->ComputeFirstCharacterSet(budget);

5746 if (budget >= 0) {

5747 set_first_character_set(successor_node->first_character_set());

5748 }

5749 }

5750 return budget;

5751 }

5752

5753

5754 // The first character set of an EndNode is unknowable. Just use the

5755 // default implementation that fails and returns all characters as possible.

5756

5757

5758 int AssertionNode::ComputeFirstCharacterSet(int budget) {

5759 budget -= 1;

5760 if (budget >= 0) {

5761 switch (type_) {

5762 case AT_END: {

5763 set_first_character_set(new ZoneList<CharacterRange>(0));

5764 break;

5765 }

5766 case AT_START:

5767 case AT_BOUNDARY:

5768 case AT_NON_BOUNDARY:

5769 case AFTER_NEWLINE:

5770 case AFTER_NONWORD_CHARACTER:

5771 case AFTER_WORD_CHARACTER: {

5772 ASSERT_NOT_NULL(on_success());

5773 budget = on_success()->ComputeFirstCharacterSet(budget);

5774 if (budget >= 0) {

5775 set_first_character_set(on_success()->first_character_set());

5776 }

5777 break;

5778 }

5779 }

5780 }

5781 return budget;

5782 }

5783

5784

5785 int ActionNode::ComputeFirstCharacterSet(int budget) {

5786 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return kComputeFirstCharacterSetFail;

5787 budget--;

5788 if (budget >= 0) {

5789 ASSERT_NOT_NULL(on_success());

5790 budget = on_success()->ComputeFirstCharacterSet(budget);

5791 if (budget >= 0) {

5792 set_first_character_set(on_success()->first_character_set());

5793 }

5794 }

5795 return budget;

5796 }

5797

5798

5799 int BackReferenceNode::ComputeFirstCharacterSet(int budget) {

5800 // We don't know anything about the first character of a backreference

5801 // at this point.

5802 // The potential first characters are the first characters of the capture,

5803 // and the first characters of the on_success node, depending on whether the

5804 // capture can be empty and whether it is known to be participating or known

5805 // not to be.

5806 return kComputeFirstCharacterSetFail;

5807 }

5808	5487

5809	5488

5810 void ChoiceNode::FillInBMInfo(	5489 void ChoiceNode::FillInBMInfo(

5811 int offset, BoyerMooreLookahead* bm, bool not_at_start) {	5490 int offset, BoyerMooreLookahead* bm, bool not_at_start) {

5812 ZoneList<GuardedAlternative>* alts = alternatives();	5491 ZoneList<GuardedAlternative>* alts = alternatives();

5813 for (int i = 0; i < alts->length(); i++) {	5492 for (int i = 0; i < alts->length(); i++) {

5814 GuardedAlternative& alt = alts->at(i);	5493 GuardedAlternative& alt = alts->at(i);

5815 if (alt.guards() != NULL && alt.guards()->length() != 0) {	5494 if (alt.guards() != NULL && alt.guards()->length() != 0) {

5816 bm->SetRest(offset); // Give up trying to fill in info.	5495 bm->SetRest(offset); // Give up trying to fill in info.

	5496 if (offset == 0) set_bm_info(not_at_start, bm);

5817 return;	5497 return;

5818 }	5498 }

5819 alt.node()->FillInBMInfo(offset, bm, not_at_start);	5499 alt.node()->FillInBMInfo(offset, bm, not_at_start);

5820 }	5500 }

	5501 if (offset == 0) set_bm_info(not_at_start, bm);

5821 }	5502 }

5822	5503

5823	5504

5824 void TextNode::FillInBMInfo(	5505 void TextNode::FillInBMInfo(

5825 int offset, BoyerMooreLookahead* bm, bool not_at_start) {	5506 int initial_offset, BoyerMooreLookahead* bm, bool not_at_start) {

5826 if (offset >= bm->length()) return;	5507 if (initial_offset >= bm->length()) return;

	5508 int offset = initial_offset;

5827 int max_char = bm->max_char();	5509 int max_char = bm->max_char();

5828 for (int i = 0; i < elements()->length(); i++) {	5510 for (int i = 0; i < elements()->length(); i++) {

5829 if (offset >= bm->length()) return;	5511 if (offset >= bm->length()) {

	5512 if (initial_offset == 0) set_bm_info(not_at_start, bm);

	5513 return;

	5514 }

5830 TextElement text = elements()->at(i);	5515 TextElement text = elements()->at(i);

5831 if (text.type == TextElement::ATOM) {	5516 if (text.type == TextElement::ATOM) {

5832 RegExpAtom* atom = text.data.u_atom;	5517 RegExpAtom* atom = text.data.u_atom;

5833 for (int j = 0; j < atom->length(); j++, offset++) {	5518 for (int j = 0; j < atom->length(); j++, offset++) {

5834 if (offset >= bm->length()) return;	5519 if (offset >= bm->length()) {

	5520 if (initial_offset == 0) set_bm_info(not_at_start, bm);

	5521 return;

	5522 }

5835 uc16 character = atom->data()[j];	5523 uc16 character = atom->data()[j];

5836 if (bm->compiler()->ignore_case()) {	5524 if (bm->compiler()->ignore_case()) {

5837 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];	5525 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

5838 int length = GetCaseIndependentLetters(	5526 int length = GetCaseIndependentLetters(

5839 ISOLATE,	5527 ISOLATE,

5840 character,	5528 character,

5841 bm->max_char() == String::kMaxAsciiCharCode,	5529 bm->max_char() == String::kMaxAsciiCharCode,

5842 chars);	5530 chars);

5843 for (int j = 0; j < length; j++) {	5531 for (int j = 0; j < length; j++) {

5844 bm->Set(offset, chars[j]);	5532 bm->Set(offset, chars[j]);

5845 }	5533 }

5846 } else {	5534 } else {

5847 if (character <= max_char) bm->Set(offset, character);	5535 if (character <= max_char) bm->Set(offset, character);

5848 }	5536 }

5849 }	5537 }

5850 } else {	5538 } else {

5851 ASSERT(text.type == TextElement::CHAR_CLASS);	5539 ASSERT(text.type == TextElement::CHAR_CLASS);

5852 RegExpCharacterClass* char_class = text.data.u_char_class;	5540 RegExpCharacterClass* char_class = text.data.u_char_class;

5853 ZoneList<CharacterRange>* ranges = char_class->ranges();	5541 ZoneList<CharacterRange>* ranges = char_class->ranges();

5854 if (char_class->is_negated()) {	5542 if (char_class->is_negated()) {

5855 bm->SetAll(offset);	5543 bm->SetAll(offset);

5856 } else {	5544 } else {

5857 for (int k = 0; k < ranges->length(); k++) {	5545 for (int k = 0; k < ranges->length(); k++) {

5858 CharacterRange& range = ranges->at(k);	5546 CharacterRange& range = ranges->at(k);

5859 if (range.from() > max_char) continue;	5547 if (range.from() > max_char) continue;

5860 int to = Min(max_char, static_cast<int>(range.to()));	5548 int to = Min(max_char, static_cast<int>(range.to()));

5861 if (to - range.from() >= BoyerMooreLookahead::kTooManyCharacters) {	5549 bm->SetInterval(offset, Interval(range.from(), to));

5862 bm->SetAll(offset);

5863 break;

5864 }

5865 for (int m = range.from(); m <= to; m++) {

5866 bm->Set(offset, m);

5867 }

5868 }	5550 }

5869 }	5551 }

5870 offset++;	5552 offset++;

5871 }	5553 }

5872 }	5554 }

5873 if (offset >= bm->length()) return;	5555 if (offset >= bm->length()) {

	5556 if (initial_offset == 0) set_bm_info(not_at_start, bm);

	5557 return;

	5558 }

5874 on_success()->FillInBMInfo(offset,	5559 on_success()->FillInBMInfo(offset,

5875 bm,	5560 bm,

5876 true); // Not at start after a text node.	5561 true); // Not at start after a text node.

	5562 if (initial_offset == 0) set_bm_info(not_at_start, bm);

5877 }	5563 }

5878	5564

5879	5565

5880 int TextNode::ComputeFirstCharacterSet(int budget) {

5881 budget--;

5882 if (budget >= 0) {

5883 ASSERT_NE(0, elements()->length());

5884 TextElement text = elements()->at(0);

5885 if (text.type == TextElement::ATOM) {

5886 RegExpAtom* atom = text.data.u_atom;

5887 ASSERT_NE(0, atom->length());

5888 uc16 first_char = atom->data()[0];

5889 ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1);

5890 range->Add(CharacterRange(first_char, first_char));

5891 set_first_character_set(range);

5892 } else {

5893 ASSERT(text.type == TextElement::CHAR_CLASS);

5894 RegExpCharacterClass* char_class = text.data.u_char_class;

5895 ZoneList<CharacterRange>* ranges = char_class->ranges();

5896 // TODO(lrn): Canonicalize ranges when they are created

5897 // instead of waiting until now.

5898 CharacterRange::Canonicalize(ranges);

5899 if (char_class->is_negated()) {

5900 int length = ranges->length();

5901 int new_length = length + 1;

5902 if (length > 0) {

5903 if (ranges->at(0).from() == 0) new_length--;

5904 if (ranges->at(length - 1).to() == String::kMaxUtf16CodeUnit) {

5905 new_length--;

5906 }

5907 }

5908 ZoneList<CharacterRange>* negated_ranges =

5909 new ZoneList<CharacterRange>(new_length);

5910 CharacterRange::Negate(ranges, negated_ranges);

5911 set_first_character_set(negated_ranges);

5912 } else {

5913 set_first_character_set(ranges);

5914 }

5915 }

5916 }

5917 return budget;

5918 }

5919

5920

5921

5922 // -------------------------------------------------------------------	5566 // -------------------------------------------------------------------

5923 // Dispatch table construction	5567 // Dispatch table construction

5924	5568

5925	5569

5926 void DispatchTableConstructor::VisitEnd(EndNode* that) {	5570 void DispatchTableConstructor::VisitEnd(EndNode* that) {

5927 AddRange(CharacterRange::Everything());	5571 AddRange(CharacterRange::Everything());

5928 }	5572 }

5929	5573

5930	5574

5931 void DispatchTableConstructor::BuildTable(ChoiceNode* node) {	5575 void DispatchTableConstructor::BuildTable(ChoiceNode* node) {

(...skipping 201 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
6133 }	5777 }

6134	5778

6135 return compiler.Assemble(&macro_assembler,	5779 return compiler.Assemble(&macro_assembler,

6136 node,	5780 node,

6137 data->capture_count,	5781 data->capture_count,

6138 pattern);	5782 pattern);

6139 }	5783 }

6140	5784

6141	5785

6142 }} // namespace v8::internal	5786 }} // namespace v8::internal

OLD	NEW

« src/jsregexp.h ('K') | « src/jsregexp.h ('k') | test/cctest/test-regexp.cc » ('j') | no next file with comments »