src/jsregexp.cc - Issue 10174017: Regexp: Remove nodes from the regexp that cannot match because

Side by Side Diff: src/jsregexp.cc

Issue 10174017: Regexp: Remove nodes from the regexp that cannot match because (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: Created 8 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2011 the V8 project authors. All rights reserved.	1 // Copyright 2011 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 2408 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2419 char_mask = String::kMaxUtf16CodeUnit;	2419 char_mask = String::kMaxUtf16CodeUnit;

2420 }	2420 }

2421 for (int k = 0; k < elms_->length(); k++) {	2421 for (int k = 0; k < elms_->length(); k++) {

2422 TextElement elm = elms_->at(k);	2422 TextElement elm = elms_->at(k);

2423 if (elm.type == TextElement::ATOM) {	2423 if (elm.type == TextElement::ATOM) {

2424 Vector<const uc16> quarks = elm.data.u_atom->data();	2424 Vector<const uc16> quarks = elm.data.u_atom->data();

2425 for (int i = 0; i < characters && i < quarks.length(); i++) {	2425 for (int i = 0; i < characters && i < quarks.length(); i++) {

2426 QuickCheckDetails::Position* pos =	2426 QuickCheckDetails::Position* pos =

2427 details->positions(characters_filled_in);	2427 details->positions(characters_filled_in);

2428 uc16 c = quarks[i];	2428 uc16 c = quarks[i];

2429 if (c > char_mask) {	2429 // We should already have filtered out nodes that have non-ASCII

2430 // If we expect a non-ASCII character from an ASCII string,	2430 // characters if we are matching against an ASCII string.

2431 // there is no way we can match. Not even case independent	2431 ASSERT(c <= char_mask);

2432 // matching can turn an ASCII character into non-ASCII or

2433 // vice versa.

2434 details->set_cannot_match();

2435 pos->determines_perfectly = false;

2436 return;

2437 }

2438 if (compiler->ignore_case()) {	2432 if (compiler->ignore_case()) {

2439 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];	2433 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

2440 int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(),	2434 int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(),

2441 chars);	2435 chars);

2442 ASSERT(length != 0); // Can only happen if c > char_mask (see above).	2436 ASSERT(length != 0); // Can only happen if c > char_mask (see above).

2443 if (length == 1) {	2437 if (length == 1) {

2444 // This letter has no case equivalents, so it's nice and simple	2438 // This letter has no case equivalents, so it's nice and simple

2445 // and the mask-compare will determine definitely whether we have	2439 // and the mask-compare will determine definitely whether we have

2446 // a match at this character position.	2440 // a match at this character position.

2447 pos->mask = char_mask;	2441 pos->mask = char_mask;

(...skipping 41 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2489 // A quick check uses multi-character mask and compare. There is no	2483 // A quick check uses multi-character mask and compare. There is no

2490 // useful way to incorporate a negative char class into this scheme	2484 // useful way to incorporate a negative char class into this scheme

2491 // so we just conservatively create a mask and value that will always	2485 // so we just conservatively create a mask and value that will always

2492 // succeed.	2486 // succeed.

2493 pos->mask = 0;	2487 pos->mask = 0;

2494 pos->value = 0;	2488 pos->value = 0;

2495 } else {	2489 } else {

2496 int first_range = 0;	2490 int first_range = 0;

2497 while (ranges->at(first_range).from() > char_mask) {	2491 while (ranges->at(first_range).from() > char_mask) {

2498 first_range++;	2492 first_range++;

2499 if (first_range == ranges->length()) {	2493 // We should already have filtered out nodes that cannot match

2500 details->set_cannot_match();	2494 // so the first range should be a valid range.

2501 pos->determines_perfectly = false;	2495 ASSERT(first_range != ranges->length());

2502 return;

2503 }

2504 }	2496 }

2505 CharacterRange range = ranges->at(first_range);	2497 CharacterRange range = ranges->at(first_range);

2506 uc16 from = range.from();	2498 uc16 from = range.from();

2507 uc16 to = range.to();	2499 uc16 to = range.to();

2508 if (to > char_mask) {	2500 if (to > char_mask) {

2509 to = char_mask;	2501 to = char_mask;

2510 }	2502 }

2511 uint32_t differing_bits = (from ^ to);	2503 uint32_t differing_bits = (from ^ to);

2512 // A mask and compare is only perfect if the differing bits form a	2504 // A mask and compare is only perfect if the differing bits form a

2513 // number like 00011111 with one single block of trailing 1s.	2505 // number like 00011111 with one single block of trailing 1s.

(...skipping 108 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2622 info->visited = true;	2614 info->visited = true;

2623 }	2615 }

2624 ~VisitMarker() {	2616 ~VisitMarker() {

2625 info_->visited = false;	2617 info_->visited = false;

2626 }	2618 }

2627 private:	2619 private:

2628 NodeInfo* info_;	2620 NodeInfo* info_;

2629 };	2621 };

2630	2622

2631	2623

	2624 RegExpNode* SeqRegExpNode::FilterASCII(int depth) {

	2625 if (info()->replacement_calculated) return replacement();

	2626 if (depth < 0) return this;

	2627 ASSERT(!info()->visited);

	2628 VisitMarker marker(info());

	2629 return FilterSuccessor(depth - 1);

	2630 }

	2631

	2632

	2633 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) {

	2634 RegExpNode* next = on_success_->FilterASCII(depth - 1);

	2635 if (next == NULL) return set_replacement(NULL);

	2636 on_success_ = next;

	2637 return set_replacement(this);

	2638 }

	2639

	2640

	2641 RegExpNode* TextNode::FilterASCII(int depth) {

	2642 if (info()->replacement_calculated) return replacement();

	2643 if (depth < 0) return this;

	2644 ASSERT(!info()->visited);

	2645 VisitMarker marker(info());

	2646 int element_count = elms_->length();

	2647 for (int i = 0; i < element_count; i++) {

	2648 TextElement elm = elms_->at(i);

	2649 if (elm.type == TextElement::ATOM) {

	2650 Vector<const uc16> quarks = elm.data.u_atom->data();

	2651 for (int j = 0; j < quarks.length(); j++) {

	2652 // We don't need special handling for case independence

	2653 // because of the rule that case independence cannot make

	2654 // a non-ASCII character match an ASCII character.

	2655 if (quarks[j] > String::kMaxAsciiCharCode) {

	2656 return set_replacement(NULL);

	2657 }

	2658 }

	2659 } else {

	2660 ASSERT(elm.type == TextElement::CHAR_CLASS);

	2661 RegExpCharacterClass* cc = elm.data.u_char_class;

	2662 ZoneList<CharacterRange>* ranges = cc->ranges();

	2663 if (!CharacterRange::IsCanonical(ranges)) {

	2664 CharacterRange::Canonicalize(ranges);

	2665 }

	2666 // Now they are in order so we only need to look at the first.

	2667 int range_count = ranges->length();

	2668 if (cc->is_negated()) {

	2669 if (range_count != 0 &&

	2670 ranges->at(0).from() == 0 &&

	2671 ranges->at(0).to() >= String::kMaxAsciiCharCode) {

	2672 return set_replacement(NULL);

	2673 }

	2674 } else {

	2675 if (range_count == 0 \|\|

	2676 ranges->at(0).from() > String::kMaxAsciiCharCode) {

	2677 return set_replacement(NULL);

	2678 }

	2679 }

	2680 }

	2681 }

	2682 return FilterSuccessor(depth - 1);

	2683 }

	2684

	2685

	2686 RegExpNode* LoopChoiceNode::FilterASCII(int depth) {

	2687 if (info()->replacement_calculated) return replacement();

	2688 if (depth < 0) return this;

	2689 if (info()->visited) return this;

	2690 VisitMarker marker(info());

	2691

	2692 RegExpNode* continue_replacement = continue_node_->FilterASCII(depth - 1);

	2693 // If we can't continue after the loop then there is no sense in doing the

	2694 // loop.

	2695 if (continue_replacement == NULL) return set_replacement(NULL);

	2696

	2697 return ChoiceNode::FilterASCII(depth - 1);

	2698 }

	2699

	2700

	2701 RegExpNode* ChoiceNode::FilterASCII(int depth) {

	2702 if (info()->replacement_calculated) return replacement();

	2703 if (depth < 0) return this;

	2704 if (info()->visited) return this;

	2705 VisitMarker marker(info());

	2706 int choice_count = alternatives_->length();

	2707 int surviving = 0;

	2708 RegExpNode* survivor = NULL;

	2709 for (int i = 0; i < choice_count; i++) {

	2710 GuardedAlternative alternative = alternatives_->at(i);

	2711 RegExpNode* replacement = alternative.node()->FilterASCII(depth - 1);

	2712 ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK.

	2713 alternatives_->at(i).set_node(replacement);

	2714 if (replacement != NULL) {

	2715 surviving++;

	2716 survivor = replacement;

	2717 }

	2718 }

	2719 if (surviving < 2) return set_replacement(survivor);

	2720

	2721 set_replacement(this);

	2722 if (surviving == choice_count) {

	2723 return this;

	2724 }

	2725 // Only some of the nodes survived the filtering. We need to rebuild the

	2726 // alternatives list.

	2727 ZoneList<GuardedAlternative>* new_alternatives =

	2728 new ZoneList<GuardedAlternative>(surviving);

	2729 for (int i = 0; i < choice_count; i++) {

	2730 GuardedAlternative alternative = alternatives_->at(i);

	2731 if (alternative.node() != NULL) {

	2732 new_alternatives->Add(alternative);

	2733 }

	2734 }

	2735 alternatives_ = new_alternatives;

	2736 return this;

	2737 }

	2738

	2739

	2740 RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth) {

	2741 if (info()->replacement_calculated) return replacement();

	2742 if (depth < 0) return this;

	2743 if (info()->visited) return this;

	2744 VisitMarker marker(info());

	2745 // Alternative 0 is the negative lookahead, alternative 1 is what comes

	2746 // afterwards.

	2747 RegExpNode* node = alternatives_->at(1).node();

	2748 RegExpNode* replacement = node->FilterASCII(depth - 1);

	2749 if (replacement == NULL) return set_replacement(NULL);

	2750 alternatives_->at(1).set_node(replacement);

	2751

	2752 RegExpNode* neg_node = alternatives_->at(0).node();

	2753 RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1);

	2754 // If the negative lookahead is always going to fail then

	2755 // we don't need to check it.

	2756 if (neg_replacement == NULL) return set_replacement(replacement);

	2757 alternatives_->at(0).set_node(neg_replacement);

	2758 return set_replacement(this);

	2759 }

	2760

	2761

2632 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,	2762 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,

2633 RegExpCompiler* compiler,	2763 RegExpCompiler* compiler,

2634 int characters_filled_in,	2764 int characters_filled_in,

2635 bool not_at_start) {	2765 bool not_at_start) {

2636 if (body_can_be_zero_length_ \|\| info()->visited) return;	2766 if (body_can_be_zero_length_ \|\| info()->visited) return;

2637 VisitMarker marker(info());	2767 VisitMarker marker(info());

2638 return ChoiceNode::GetQuickCheckDetails(details,	2768 return ChoiceNode::GetQuickCheckDetails(details,

2639 compiler,	2769 compiler,

2640 characters_filled_in,	2770 characters_filled_in,

2641 not_at_start);	2771 not_at_start);

(...skipping 3041 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
5683 // at the start of input.	5813 // at the start of input.

5684 ChoiceNode* first_step_node = new ChoiceNode(2);	5814 ChoiceNode* first_step_node = new ChoiceNode(2);

5685 first_step_node->AddAlternative(GuardedAlternative(captured_body));	5815 first_step_node->AddAlternative(GuardedAlternative(captured_body));

5686 first_step_node->AddAlternative(GuardedAlternative(	5816 first_step_node->AddAlternative(GuardedAlternative(

5687 new TextNode(new RegExpCharacterClass('*'), loop_node)));	5817 new TextNode(new RegExpCharacterClass('*'), loop_node)));

5688 node = first_step_node;	5818 node = first_step_node;

5689 } else {	5819 } else {

5690 node = loop_node;	5820 node = loop_node;

5691 }	5821 }

5692 }	5822 }

	5823 if (is_ascii) node = node->FilterASCII(RegExpCompiler::kMaxRecursion);

	5824

	5825 if (node == NULL) node = new EndNode(EndNode::BACKTRACK);

5693 data->node = node;	5826 data->node = node;

5694 Analysis analysis(ignore_case, is_ascii);	5827 Analysis analysis(ignore_case, is_ascii);

5695 analysis.EnsureAnalyzed(node);	5828 analysis.EnsureAnalyzed(node);

5696 if (analysis.has_failed()) {	5829 if (analysis.has_failed()) {

5697 const char* error_message = analysis.error_message();	5830 const char* error_message = analysis.error_message();

5698 return CompilationResult(error_message);	5831 return CompilationResult(error_message);

5699 }	5832 }

5700	5833

5701 // Create the correct assembler for the architecture.	5834 // Create the correct assembler for the architecture.

5702 #ifndef V8_INTERPRETED_REGEXP	5835 #ifndef V8_INTERPRETED_REGEXP

(...skipping 29 matching lines...) Expand all Loading...
5732 }	5865 }

5733	5866

5734 return compiler.Assemble(&macro_assembler,	5867 return compiler.Assemble(&macro_assembler,

5735 node,	5868 node,

5736 data->capture_count,	5869 data->capture_count,

5737 pattern);	5870 pattern);

5738 }	5871 }

5739	5872

5740	5873

5741 }} // namespace v8::internal	5874 }} // namespace v8::internal

OLD	NEW

« no previous file with comments | « src/jsregexp.h ('k') | test/mjsunit/regexp-capture-3.js » ('j') | no next file with comments »