Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(63)

Side by Side Diff: src/jsregexp.cc

Issue 10174017: Regexp: Remove nodes from the regexp that cannot match because (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 8 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « src/jsregexp.h ('k') | test/mjsunit/regexp-capture-3.js » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2011 the V8 project authors. All rights reserved. 1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 2408 matching lines...) Expand 10 before | Expand all | Expand 10 after
2419 char_mask = String::kMaxUtf16CodeUnit; 2419 char_mask = String::kMaxUtf16CodeUnit;
2420 } 2420 }
2421 for (int k = 0; k < elms_->length(); k++) { 2421 for (int k = 0; k < elms_->length(); k++) {
2422 TextElement elm = elms_->at(k); 2422 TextElement elm = elms_->at(k);
2423 if (elm.type == TextElement::ATOM) { 2423 if (elm.type == TextElement::ATOM) {
2424 Vector<const uc16> quarks = elm.data.u_atom->data(); 2424 Vector<const uc16> quarks = elm.data.u_atom->data();
2425 for (int i = 0; i < characters && i < quarks.length(); i++) { 2425 for (int i = 0; i < characters && i < quarks.length(); i++) {
2426 QuickCheckDetails::Position* pos = 2426 QuickCheckDetails::Position* pos =
2427 details->positions(characters_filled_in); 2427 details->positions(characters_filled_in);
2428 uc16 c = quarks[i]; 2428 uc16 c = quarks[i];
2429 if (c > char_mask) { 2429 // We should already have filtered out nodes that have non-ASCII
2430 // If we expect a non-ASCII character from an ASCII string, 2430 // characters if we are matching against an ASCII string.
2431 // there is no way we can match. Not even case independent 2431 ASSERT(c <= char_mask);
2432 // matching can turn an ASCII character into non-ASCII or
2433 // vice versa.
2434 details->set_cannot_match();
2435 pos->determines_perfectly = false;
2436 return;
2437 }
2438 if (compiler->ignore_case()) { 2432 if (compiler->ignore_case()) {
2439 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 2433 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2440 int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(), 2434 int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(),
2441 chars); 2435 chars);
2442 ASSERT(length != 0); // Can only happen if c > char_mask (see above). 2436 ASSERT(length != 0); // Can only happen if c > char_mask (see above).
2443 if (length == 1) { 2437 if (length == 1) {
2444 // This letter has no case equivalents, so it's nice and simple 2438 // This letter has no case equivalents, so it's nice and simple
2445 // and the mask-compare will determine definitely whether we have 2439 // and the mask-compare will determine definitely whether we have
2446 // a match at this character position. 2440 // a match at this character position.
2447 pos->mask = char_mask; 2441 pos->mask = char_mask;
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
2489 // A quick check uses multi-character mask and compare. There is no 2483 // A quick check uses multi-character mask and compare. There is no
2490 // useful way to incorporate a negative char class into this scheme 2484 // useful way to incorporate a negative char class into this scheme
2491 // so we just conservatively create a mask and value that will always 2485 // so we just conservatively create a mask and value that will always
2492 // succeed. 2486 // succeed.
2493 pos->mask = 0; 2487 pos->mask = 0;
2494 pos->value = 0; 2488 pos->value = 0;
2495 } else { 2489 } else {
2496 int first_range = 0; 2490 int first_range = 0;
2497 while (ranges->at(first_range).from() > char_mask) { 2491 while (ranges->at(first_range).from() > char_mask) {
2498 first_range++; 2492 first_range++;
2499 if (first_range == ranges->length()) { 2493 // We should already have filtered out nodes that cannot match
2500 details->set_cannot_match(); 2494 // so the first range should be a valid range.
2501 pos->determines_perfectly = false; 2495 ASSERT(first_range != ranges->length());
2502 return;
2503 }
2504 } 2496 }
2505 CharacterRange range = ranges->at(first_range); 2497 CharacterRange range = ranges->at(first_range);
2506 uc16 from = range.from(); 2498 uc16 from = range.from();
2507 uc16 to = range.to(); 2499 uc16 to = range.to();
2508 if (to > char_mask) { 2500 if (to > char_mask) {
2509 to = char_mask; 2501 to = char_mask;
2510 } 2502 }
2511 uint32_t differing_bits = (from ^ to); 2503 uint32_t differing_bits = (from ^ to);
2512 // A mask and compare is only perfect if the differing bits form a 2504 // A mask and compare is only perfect if the differing bits form a
2513 // number like 00011111 with one single block of trailing 1s. 2505 // number like 00011111 with one single block of trailing 1s.
(...skipping 108 matching lines...) Expand 10 before | Expand all | Expand 10 after
2622 info->visited = true; 2614 info->visited = true;
2623 } 2615 }
2624 ~VisitMarker() { 2616 ~VisitMarker() {
2625 info_->visited = false; 2617 info_->visited = false;
2626 } 2618 }
2627 private: 2619 private:
2628 NodeInfo* info_; 2620 NodeInfo* info_;
2629 }; 2621 };
2630 2622
2631 2623
2624 RegExpNode* SeqRegExpNode::FilterASCII(int depth) {
2625 if (info()->replacement_calculated) return replacement();
2626 if (depth < 0) return this;
2627 ASSERT(!info()->visited);
2628 VisitMarker marker(info());
2629 return FilterSuccessor(depth - 1);
2630 }
2631
2632
2633 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) {
2634 RegExpNode* next = on_success_->FilterASCII(depth - 1);
2635 if (next == NULL) return set_replacement(NULL);
2636 on_success_ = next;
2637 return set_replacement(this);
2638 }
2639
2640
2641 RegExpNode* TextNode::FilterASCII(int depth) {
2642 if (info()->replacement_calculated) return replacement();
2643 if (depth < 0) return this;
2644 ASSERT(!info()->visited);
2645 VisitMarker marker(info());
2646 int element_count = elms_->length();
2647 for (int i = 0; i < element_count; i++) {
2648 TextElement elm = elms_->at(i);
2649 if (elm.type == TextElement::ATOM) {
2650 Vector<const uc16> quarks = elm.data.u_atom->data();
2651 for (int j = 0; j < quarks.length(); j++) {
2652 // We don't need special handling for case independence
2653 // because of the rule that case independence cannot make
2654 // a non-ASCII character match an ASCII character.
2655 if (quarks[j] > String::kMaxAsciiCharCode) {
2656 return set_replacement(NULL);
2657 }
2658 }
2659 } else {
2660 ASSERT(elm.type == TextElement::CHAR_CLASS);
2661 RegExpCharacterClass* cc = elm.data.u_char_class;
2662 ZoneList<CharacterRange>* ranges = cc->ranges();
2663 if (!CharacterRange::IsCanonical(ranges)) {
2664 CharacterRange::Canonicalize(ranges);
2665 }
2666 // Now they are in order so we only need to look at the first.
2667 int range_count = ranges->length();
2668 if (cc->is_negated()) {
2669 if (range_count != 0 &&
2670 ranges->at(0).from() == 0 &&
2671 ranges->at(0).to() >= String::kMaxAsciiCharCode) {
2672 return set_replacement(NULL);
2673 }
2674 } else {
2675 if (range_count == 0 ||
2676 ranges->at(0).from() > String::kMaxAsciiCharCode) {
2677 return set_replacement(NULL);
2678 }
2679 }
2680 }
2681 }
2682 return FilterSuccessor(depth - 1);
2683 }
2684
2685
2686 RegExpNode* LoopChoiceNode::FilterASCII(int depth) {
2687 if (info()->replacement_calculated) return replacement();
2688 if (depth < 0) return this;
2689 if (info()->visited) return this;
2690 VisitMarker marker(info());
2691
2692 RegExpNode* continue_replacement = continue_node_->FilterASCII(depth - 1);
2693 // If we can't continue after the loop then there is no sense in doing the
2694 // loop.
2695 if (continue_replacement == NULL) return set_replacement(NULL);
2696
2697 return ChoiceNode::FilterASCII(depth - 1);
2698 }
2699
2700
2701 RegExpNode* ChoiceNode::FilterASCII(int depth) {
2702 if (info()->replacement_calculated) return replacement();
2703 if (depth < 0) return this;
2704 if (info()->visited) return this;
2705 VisitMarker marker(info());
2706 int choice_count = alternatives_->length();
2707 int surviving = 0;
2708 RegExpNode* survivor = NULL;
2709 for (int i = 0; i < choice_count; i++) {
2710 GuardedAlternative alternative = alternatives_->at(i);
2711 RegExpNode* replacement = alternative.node()->FilterASCII(depth - 1);
2712 ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK.
2713 alternatives_->at(i).set_node(replacement);
2714 if (replacement != NULL) {
2715 surviving++;
2716 survivor = replacement;
2717 }
2718 }
2719 if (surviving < 2) return set_replacement(survivor);
2720
2721 set_replacement(this);
2722 if (surviving == choice_count) {
2723 return this;
2724 }
2725 // Only some of the nodes survived the filtering. We need to rebuild the
2726 // alternatives list.
2727 ZoneList<GuardedAlternative>* new_alternatives =
2728 new ZoneList<GuardedAlternative>(surviving);
2729 for (int i = 0; i < choice_count; i++) {
2730 GuardedAlternative alternative = alternatives_->at(i);
2731 if (alternative.node() != NULL) {
2732 new_alternatives->Add(alternative);
2733 }
2734 }
2735 alternatives_ = new_alternatives;
2736 return this;
2737 }
2738
2739
2740 RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth) {
2741 if (info()->replacement_calculated) return replacement();
2742 if (depth < 0) return this;
2743 if (info()->visited) return this;
2744 VisitMarker marker(info());
2745 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2746 // afterwards.
2747 RegExpNode* node = alternatives_->at(1).node();
2748 RegExpNode* replacement = node->FilterASCII(depth - 1);
2749 if (replacement == NULL) return set_replacement(NULL);
2750 alternatives_->at(1).set_node(replacement);
2751
2752 RegExpNode* neg_node = alternatives_->at(0).node();
2753 RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1);
2754 // If the negative lookahead is always going to fail then
2755 // we don't need to check it.
2756 if (neg_replacement == NULL) return set_replacement(replacement);
2757 alternatives_->at(0).set_node(neg_replacement);
2758 return set_replacement(this);
2759 }
2760
2761
2632 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2762 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2633 RegExpCompiler* compiler, 2763 RegExpCompiler* compiler,
2634 int characters_filled_in, 2764 int characters_filled_in,
2635 bool not_at_start) { 2765 bool not_at_start) {
2636 if (body_can_be_zero_length_ || info()->visited) return; 2766 if (body_can_be_zero_length_ || info()->visited) return;
2637 VisitMarker marker(info()); 2767 VisitMarker marker(info());
2638 return ChoiceNode::GetQuickCheckDetails(details, 2768 return ChoiceNode::GetQuickCheckDetails(details,
2639 compiler, 2769 compiler,
2640 characters_filled_in, 2770 characters_filled_in,
2641 not_at_start); 2771 not_at_start);
(...skipping 3041 matching lines...) Expand 10 before | Expand all | Expand 10 after
5683 // at the start of input. 5813 // at the start of input.
5684 ChoiceNode* first_step_node = new ChoiceNode(2); 5814 ChoiceNode* first_step_node = new ChoiceNode(2);
5685 first_step_node->AddAlternative(GuardedAlternative(captured_body)); 5815 first_step_node->AddAlternative(GuardedAlternative(captured_body));
5686 first_step_node->AddAlternative(GuardedAlternative( 5816 first_step_node->AddAlternative(GuardedAlternative(
5687 new TextNode(new RegExpCharacterClass('*'), loop_node))); 5817 new TextNode(new RegExpCharacterClass('*'), loop_node)));
5688 node = first_step_node; 5818 node = first_step_node;
5689 } else { 5819 } else {
5690 node = loop_node; 5820 node = loop_node;
5691 } 5821 }
5692 } 5822 }
5823 if (is_ascii) node = node->FilterASCII(RegExpCompiler::kMaxRecursion);
5824
5825 if (node == NULL) node = new EndNode(EndNode::BACKTRACK);
5693 data->node = node; 5826 data->node = node;
5694 Analysis analysis(ignore_case, is_ascii); 5827 Analysis analysis(ignore_case, is_ascii);
5695 analysis.EnsureAnalyzed(node); 5828 analysis.EnsureAnalyzed(node);
5696 if (analysis.has_failed()) { 5829 if (analysis.has_failed()) {
5697 const char* error_message = analysis.error_message(); 5830 const char* error_message = analysis.error_message();
5698 return CompilationResult(error_message); 5831 return CompilationResult(error_message);
5699 } 5832 }
5700 5833
5701 // Create the correct assembler for the architecture. 5834 // Create the correct assembler for the architecture.
5702 #ifndef V8_INTERPRETED_REGEXP 5835 #ifndef V8_INTERPRETED_REGEXP
(...skipping 29 matching lines...) Expand all
5732 } 5865 }
5733 5866
5734 return compiler.Assemble(&macro_assembler, 5867 return compiler.Assemble(&macro_assembler,
5735 node, 5868 node,
5736 data->capture_count, 5869 data->capture_count,
5737 pattern); 5870 pattern);
5738 } 5871 }
5739 5872
5740 5873
5741 }} // namespace v8::internal 5874 }} // namespace v8::internal
OLDNEW
« no previous file with comments | « src/jsregexp.h ('k') | test/mjsunit/regexp-capture-3.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698