Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(628)

Side by Side Diff: src/jsregexp.cc

Issue 10174017: Regexp: Remove nodes from the regexp that cannot match because (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 8 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright 2011 the V8 project authors. All rights reserved. 1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 362 matching lines...) Expand 10 before | Expand all | Expand 10 after
373 elements->set(0, re->Pattern()); 373 elements->set(0, re->Pattern());
374 elements->set(1, *error_message); 374 elements->set(1, *error_message);
375 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); 375 Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
376 Handle<Object> regexp_err = 376 Handle<Object> regexp_err =
377 factory->NewSyntaxError("malformed_regexp", array); 377 factory->NewSyntaxError("malformed_regexp", array);
378 isolate->Throw(*regexp_err); 378 isolate->Throw(*regexp_err);
379 return false; 379 return false;
380 } 380 }
381 381
382 382
383 int kUninitializedRegExpNodePlaceHolder;
384
385
383 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, 386 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
384 Handle<String> sample_subject, 387 Handle<String> sample_subject,
385 bool is_ascii) { 388 bool is_ascii) {
386 // Compile the RegExp. 389 // Compile the RegExp.
387 Isolate* isolate = re->GetIsolate(); 390 Isolate* isolate = re->GetIsolate();
388 ZoneScope zone_scope(isolate, DELETE_ON_EXIT); 391 ZoneScope zone_scope(isolate, DELETE_ON_EXIT);
389 PostponeInterruptsScope postpone(isolate); 392 PostponeInterruptsScope postpone(isolate);
390 // If we had a compilation error the last time this is saved at the 393 // If we had a compilation error the last time this is saved at the
391 // saved code index. 394 // saved code index.
392 Object* entry = re->DataAt(JSRegExp::code_index(is_ascii)); 395 Object* entry = re->DataAt(JSRegExp::code_index(is_ascii));
(...skipping 2026 matching lines...) Expand 10 before | Expand all | Expand 10 after
2419 char_mask = String::kMaxUtf16CodeUnit; 2422 char_mask = String::kMaxUtf16CodeUnit;
2420 } 2423 }
2421 for (int k = 0; k < elms_->length(); k++) { 2424 for (int k = 0; k < elms_->length(); k++) {
2422 TextElement elm = elms_->at(k); 2425 TextElement elm = elms_->at(k);
2423 if (elm.type == TextElement::ATOM) { 2426 if (elm.type == TextElement::ATOM) {
2424 Vector<const uc16> quarks = elm.data.u_atom->data(); 2427 Vector<const uc16> quarks = elm.data.u_atom->data();
2425 for (int i = 0; i < characters && i < quarks.length(); i++) { 2428 for (int i = 0; i < characters && i < quarks.length(); i++) {
2426 QuickCheckDetails::Position* pos = 2429 QuickCheckDetails::Position* pos =
2427 details->positions(characters_filled_in); 2430 details->positions(characters_filled_in);
2428 uc16 c = quarks[i]; 2431 uc16 c = quarks[i];
2429 if (c > char_mask) { 2432 // We should already have filtered out nodes that have non-ASCII
2430 // If we expect a non-ASCII character from an ASCII string, 2433 // characters if we are matching against an ASCII string.
2431 // there is no way we can match. Not even case independent 2434 ASSERT(c <= char_mask);
2432 // matching can turn an ASCII character into non-ASCII or
2433 // vice versa.
2434 details->set_cannot_match();
2435 pos->determines_perfectly = false;
2436 return;
2437 }
2438 if (compiler->ignore_case()) { 2435 if (compiler->ignore_case()) {
2439 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 2436 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2440 int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(), 2437 int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(),
2441 chars); 2438 chars);
2442 ASSERT(length != 0); // Can only happen if c > char_mask (see above). 2439 ASSERT(length != 0); // Can only happen if c > char_mask (see above).
2443 if (length == 1) { 2440 if (length == 1) {
2444 // This letter has no case equivalents, so it's nice and simple 2441 // This letter has no case equivalents, so it's nice and simple
2445 // and the mask-compare will determine definitely whether we have 2442 // and the mask-compare will determine definitely whether we have
2446 // a match at this character position. 2443 // a match at this character position.
2447 pos->mask = char_mask; 2444 pos->mask = char_mask;
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
2489 // A quick check uses multi-character mask and compare. There is no 2486 // A quick check uses multi-character mask and compare. There is no
2490 // useful way to incorporate a negative char class into this scheme 2487 // useful way to incorporate a negative char class into this scheme
2491 // so we just conservatively create a mask and value that will always 2488 // so we just conservatively create a mask and value that will always
2492 // succeed. 2489 // succeed.
2493 pos->mask = 0; 2490 pos->mask = 0;
2494 pos->value = 0; 2491 pos->value = 0;
2495 } else { 2492 } else {
2496 int first_range = 0; 2493 int first_range = 0;
2497 while (ranges->at(first_range).from() > char_mask) { 2494 while (ranges->at(first_range).from() > char_mask) {
2498 first_range++; 2495 first_range++;
2499 if (first_range == ranges->length()) { 2496 // We should already have filtered out nodes that cannot match
2500 details->set_cannot_match(); 2497 // so the first range should be a valid range.
2501 pos->determines_perfectly = false; 2498 ASSERT(first_range != ranges->length());
2502 return;
2503 }
2504 } 2499 }
2505 CharacterRange range = ranges->at(first_range); 2500 CharacterRange range = ranges->at(first_range);
2506 uc16 from = range.from(); 2501 uc16 from = range.from();
2507 uc16 to = range.to(); 2502 uc16 to = range.to();
2508 if (to > char_mask) { 2503 if (to > char_mask) {
2509 to = char_mask; 2504 to = char_mask;
2510 } 2505 }
2511 uint32_t differing_bits = (from ^ to); 2506 uint32_t differing_bits = (from ^ to);
2512 // A mask and compare is only perfect if the differing bits form a 2507 // A mask and compare is only perfect if the differing bits form a
2513 // number like 00011111 with one single block of trailing 1s. 2508 // number like 00011111 with one single block of trailing 1s.
(...skipping 108 matching lines...) Expand 10 before | Expand all | Expand 10 after
2622 info->visited = true; 2617 info->visited = true;
2623 } 2618 }
2624 ~VisitMarker() { 2619 ~VisitMarker() {
2625 info_->visited = false; 2620 info_->visited = false;
2626 } 2621 }
2627 private: 2622 private:
2628 NodeInfo* info_; 2623 NodeInfo* info_;
2629 }; 2624 };
2630 2625
2631 2626
2627 RegExpNode* SeqRegExpNode::FilterASCII(int depth) {
2628 if (replacement_ != Uninitialized()) return replacement_;
2629 if (depth < 0) return this;
2630 ASSERT(!info()->visited);
2631 VisitMarker marker(info());
2632 return FilterSuccessor(depth - 1);
2633 }
2634
2635
2636 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) {
2637 RegExpNode* next = on_success_->FilterASCII(depth - 1);
2638 if (next == NULL) {
2639 replacement_ = NULL;
2640 return NULL;
2641 }
2642 on_success_ = next;
2643 replacement_ = this;
2644 return this;
2645 }
2646
2647
2648 RegExpNode* TextNode::FilterASCII(int depth) {
2649 if (replacement_ != Uninitialized()) return replacement_;
2650 if (depth < 0) return this;
2651 ASSERT(!info()->visited);
2652 VisitMarker marker(info());
2653 int element_count = elms_->length();
2654 for (int i = 0; i < element_count; i++) {
2655 TextElement elm = elms_->at(i);
2656 if (elm.type == TextElement::ATOM) {
2657 Vector<const uc16> quarks = elm.data.u_atom->data();
2658 for (int j = 0; j < quarks.length(); j++) {
2659 // We don't need special handling for case independence
2660 // because of the rule that case independence cannot make
2661 // a non-ASCII character match an ASCII character.
2662 if (quarks[j] > String::kMaxAsciiCharCode) {
2663 replacement_ = NULL;
2664 return NULL;
2665 }
2666 }
2667 } else {
2668 ASSERT(elm.type == TextElement::CHAR_CLASS);
2669 RegExpCharacterClass* cc = elm.data.u_char_class;
2670 ZoneList<CharacterRange>* ranges = cc->ranges();
2671 if (!CharacterRange::IsCanonical(ranges)) {
2672 CharacterRange::Canonicalize(ranges);
2673 }
2674 // Now they are in order so we only need to look at the first.
2675 int range_count = ranges->length();
2676 if (cc->is_negated()) {
2677 if (range_count != 0 &&
2678 ranges->at(0).from() == 0 &&
2679 ranges->at(0).to() >= String::kMaxAsciiCharCode) {
2680 replacement_ = NULL;
2681 return NULL;
2682 }
2683 } else {
2684 if (range_count == 0 ||
2685 ranges->at(0).from() > String::kMaxAsciiCharCode) {
2686 replacement_ = NULL;
2687 return NULL;
2688 }
2689 }
2690 }
2691 }
2692 return FilterSuccessor(depth - 1);
2693 }
2694
2695
2696 RegExpNode* LoopChoiceNode::FilterASCII(int depth) {
2697 if (replacement_ != Uninitialized()) return replacement_;
2698 if (depth < 0) return this;
2699 if (info()->visited) return this;
2700 VisitMarker marker(info());
2701
2702 RegExpNode* continue_replacement = continue_node_->FilterASCII(depth - 1);
2703 if (continue_replacement == NULL) {
2704 // If we can't continue after the loop then there is no sense in doing the
2705 // loop.
2706 replacement_ = NULL;
2707 return NULL;
2708 }
2709
2710 return ChoiceNode::FilterASCII(depth - 1);
2711 }
2712
2713
2714 RegExpNode* ChoiceNode::FilterASCII(int depth) {
2715 if (replacement_ != Uninitialized()) return replacement_;
2716 if (depth < 0) return this;
2717 if (info()->visited) return this;
2718 VisitMarker marker(info());
2719 int choice_count = alternatives_->length();
2720 int surviving = 0;
2721 RegExpNode* survivor = NULL;
2722 for (int i = 0; i < choice_count; i++) {
ulan 2012/04/25 12:55:28 Instead of duplicating the same loop twice, can we
Erik Corry 2012/04/26 08:23:11 I can't see a way to do in-place filtering, since
2723 GuardedAlternative alternative = alternatives_->at(i);
2724 ZoneList<Guard*>* guards = alternative.guards();
2725 int guard_count = (guards == NULL) ? 0 : guards->length();
2726 if (guard_count == 0) {
2727 RegExpNode* replacement = alternative.node()->FilterASCII(depth - 1);
2728 if (replacement != NULL && replacement != this) {
2729 surviving++;
2730 alternatives_->at(i).set_node(replacement);
2731 survivor = replacement;
2732 }
2733 } else {
2734 surviving++;
2735 }
2736 }
2737 if (surviving == 0) {
2738 replacement_ = NULL;
2739 return NULL;
2740 }
2741 if (surviving == 1 && survivor != NULL) {
2742 replacement_ = survivor;
2743 return survivor;
2744 }
2745 replacement_ = this;
2746 if (surviving == choice_count) {
2747 return this;
2748 }
2749 // Only some of the nodes survived the filtering. We need to rebuild the
2750 // alternatives list.
2751 ZoneList<GuardedAlternative>* new_alternatives =
2752 new ZoneList<GuardedAlternative>(surviving);
2753 for (int i = 0; i < choice_count; i++) {
2754 GuardedAlternative alternative = alternatives_->at(i);
2755 ZoneList<Guard*>* guards = alternative.guards();
2756 int guard_count = (guards == NULL) ? 0 : guards->length();
2757 if (guard_count == 0) {
2758 RegExpNode* replacement = alternative.node()->FilterASCII(depth - 1);
2759 if (replacement != NULL && replacement != this) {
2760 GuardedAlternative new_alternative(replacement);
2761 new_alternatives->Add(new_alternative);
2762 }
2763 } else {
2764 new_alternatives->Add(alternative);
2765 }
2766 }
2767 alternatives_ = new_alternatives;
2768 return this;
2769 }
2770
2771
2772 RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth) {
2773 if (replacement_ != Uninitialized()) return replacement_;
2774 if (depth < 0) return this;
2775 if (info()->visited) return this;
2776 VisitMarker marker(info());
2777 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2778 // afterwards.
2779 RegExpNode* node = alternatives_->at(1).node();
2780 RegExpNode* replacement = node->FilterASCII(depth - 1);
2781 if (node == NULL) {
ulan 2012/04/25 12:55:28 Did you mean replacement == NULL?
Erik Corry 2012/04/26 08:23:11 Oops, yes.
2782 replacement_ = NULL;
2783 return NULL;
2784 }
2785 alternatives_->at(1).set_node(replacement);
2786
2787 RegExpNode* neg_node = alternatives_->at(0).node();
2788 RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1);
2789 // If the negative lookahead is always going to fail then
2790 // we don't need to check it.
2791 if (neg_replacement == NULL) {
2792 replacement_ = replacement;
2793 return replacement;
2794 }
2795 alternatives_->at(0).set_node(neg_replacement);
2796 replacement_ = this;
2797 return this;
2798 }
2799
2800
2632 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2801 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2633 RegExpCompiler* compiler, 2802 RegExpCompiler* compiler,
2634 int characters_filled_in, 2803 int characters_filled_in,
2635 bool not_at_start) { 2804 bool not_at_start) {
2636 if (body_can_be_zero_length_ || info()->visited) return; 2805 if (body_can_be_zero_length_ || info()->visited) return;
2637 VisitMarker marker(info()); 2806 VisitMarker marker(info());
2638 return ChoiceNode::GetQuickCheckDetails(details, 2807 return ChoiceNode::GetQuickCheckDetails(details,
2639 compiler, 2808 compiler,
2640 characters_filled_in, 2809 characters_filled_in,
2641 not_at_start); 2810 not_at_start);
(...skipping 3041 matching lines...) Expand 10 before | Expand all | Expand 10 after
5683 // at the start of input. 5852 // at the start of input.
5684 ChoiceNode* first_step_node = new ChoiceNode(2); 5853 ChoiceNode* first_step_node = new ChoiceNode(2);
5685 first_step_node->AddAlternative(GuardedAlternative(captured_body)); 5854 first_step_node->AddAlternative(GuardedAlternative(captured_body));
5686 first_step_node->AddAlternative(GuardedAlternative( 5855 first_step_node->AddAlternative(GuardedAlternative(
5687 new TextNode(new RegExpCharacterClass('*'), loop_node))); 5856 new TextNode(new RegExpCharacterClass('*'), loop_node)));
5688 node = first_step_node; 5857 node = first_step_node;
5689 } else { 5858 } else {
5690 node = loop_node; 5859 node = loop_node;
5691 } 5860 }
5692 } 5861 }
5862 if (is_ascii) node = node->FilterASCII(RegExpCompiler::kMaxRecursion);
5863
5864 if (node == NULL) node = new EndNode(EndNode::BACKTRACK);
5693 data->node = node; 5865 data->node = node;
5694 Analysis analysis(ignore_case, is_ascii); 5866 Analysis analysis(ignore_case, is_ascii);
5695 analysis.EnsureAnalyzed(node); 5867 analysis.EnsureAnalyzed(node);
5696 if (analysis.has_failed()) { 5868 if (analysis.has_failed()) {
5697 const char* error_message = analysis.error_message(); 5869 const char* error_message = analysis.error_message();
5698 return CompilationResult(error_message); 5870 return CompilationResult(error_message);
5699 } 5871 }
5700 5872
5701 // Create the correct assembler for the architecture. 5873 // Create the correct assembler for the architecture.
5702 #ifndef V8_INTERPRETED_REGEXP 5874 #ifndef V8_INTERPRETED_REGEXP
(...skipping 29 matching lines...) Expand all
5732 } 5904 }
5733 5905
5734 return compiler.Assemble(&macro_assembler, 5906 return compiler.Assemble(&macro_assembler,
5735 node, 5907 node,
5736 data->capture_count, 5908 data->capture_count,
5737 pattern); 5909 pattern);
5738 } 5910 }
5739 5911
5740 5912
5741 }} // namespace v8::internal 5913 }} // namespace v8::internal
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698