Chromium Code Reviews| Index: src/jsregexp.cc |
| =================================================================== |
| --- src/jsregexp.cc (revision 11428) |
| +++ src/jsregexp.cc (working copy) |
| @@ -380,6 +380,9 @@ |
| } |
| +int kUninitializedRegExpNodePlaceHolder; |
| + |
| + |
| bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, |
| Handle<String> sample_subject, |
| bool is_ascii) { |
| @@ -2426,15 +2429,9 @@ |
| QuickCheckDetails::Position* pos = |
| details->positions(characters_filled_in); |
| uc16 c = quarks[i]; |
| - if (c > char_mask) { |
| - // If we expect a non-ASCII character from an ASCII string, |
| - // there is no way we can match. Not even case independent |
| - // matching can turn an ASCII character into non-ASCII or |
| - // vice versa. |
| - details->set_cannot_match(); |
| - pos->determines_perfectly = false; |
| - return; |
| - } |
| + // We should already have filtered out nodes that have non-ASCII |
| + // characters if we are matching against an ASCII string. |
| + ASSERT(c <= char_mask); |
| if (compiler->ignore_case()) { |
| unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(), |
| @@ -2496,11 +2493,9 @@ |
| int first_range = 0; |
| while (ranges->at(first_range).from() > char_mask) { |
| first_range++; |
| - if (first_range == ranges->length()) { |
| - details->set_cannot_match(); |
| - pos->determines_perfectly = false; |
| - return; |
| - } |
| + // We should already have filtered out nodes that cannot match |
| + // so the first range should be a valid range. |
| + ASSERT(first_range != ranges->length()); |
| } |
| CharacterRange range = ranges->at(first_range); |
| uc16 from = range.from(); |
| @@ -2629,6 +2624,180 @@ |
| }; |
| +RegExpNode* SeqRegExpNode::FilterASCII(int depth) { |
| + if (replacement_ != Uninitialized()) return replacement_; |
| + if (depth < 0) return this; |
| + ASSERT(!info()->visited); |
| + VisitMarker marker(info()); |
| + return FilterSuccessor(depth - 1); |
| +} |
| + |
| + |
| +RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) { |
| + RegExpNode* next = on_success_->FilterASCII(depth - 1); |
| + if (next == NULL) { |
| + replacement_ = NULL; |
| + return NULL; |
| + } |
| + on_success_ = next; |
| + replacement_ = this; |
| + return this; |
| +} |
| + |
| + |
| +RegExpNode* TextNode::FilterASCII(int depth) { |
| + if (replacement_ != Uninitialized()) return replacement_; |
| + if (depth < 0) return this; |
| + ASSERT(!info()->visited); |
| + VisitMarker marker(info()); |
| + int element_count = elms_->length(); |
| + for (int i = 0; i < element_count; i++) { |
| + TextElement elm = elms_->at(i); |
| + if (elm.type == TextElement::ATOM) { |
| + Vector<const uc16> quarks = elm.data.u_atom->data(); |
| + for (int j = 0; j < quarks.length(); j++) { |
| + // We don't need special handling for case independence |
| + // because of the rule that case independence cannot make |
| + // a non-ASCII character match an ASCII character. |
| + if (quarks[j] > String::kMaxAsciiCharCode) { |
| + replacement_ = NULL; |
| + return NULL; |
| + } |
| + } |
| + } else { |
| + ASSERT(elm.type == TextElement::CHAR_CLASS); |
| + RegExpCharacterClass* cc = elm.data.u_char_class; |
| + ZoneList<CharacterRange>* ranges = cc->ranges(); |
| + if (!CharacterRange::IsCanonical(ranges)) { |
| + CharacterRange::Canonicalize(ranges); |
| + } |
| + // Now they are in order so we only need to look at the first. |
| + int range_count = ranges->length(); |
| + if (cc->is_negated()) { |
| + if (range_count != 0 && |
| + ranges->at(0).from() == 0 && |
| + ranges->at(0).to() >= String::kMaxAsciiCharCode) { |
| + replacement_ = NULL; |
| + return NULL; |
| + } |
| + } else { |
| + if (range_count == 0 || |
| + ranges->at(0).from() > String::kMaxAsciiCharCode) { |
| + replacement_ = NULL; |
| + return NULL; |
| + } |
| + } |
| + } |
| + } |
| + return FilterSuccessor(depth - 1); |
| +} |
| + |
| + |
| +RegExpNode* LoopChoiceNode::FilterASCII(int depth) { |
| + if (replacement_ != Uninitialized()) return replacement_; |
| + if (depth < 0) return this; |
| + if (info()->visited) return this; |
| + VisitMarker marker(info()); |
| + |
| + RegExpNode* continue_replacement = continue_node_->FilterASCII(depth - 1); |
| + if (continue_replacement == NULL) { |
| + // If we can't continue after the loop then there is no sense in doing the |
| + // loop. |
| + replacement_ = NULL; |
| + return NULL; |
| + } |
| + |
| + return ChoiceNode::FilterASCII(depth - 1); |
| +} |
| + |
| + |
| +RegExpNode* ChoiceNode::FilterASCII(int depth) { |
| + if (replacement_ != Uninitialized()) return replacement_; |
| + if (depth < 0) return this; |
| + if (info()->visited) return this; |
| + VisitMarker marker(info()); |
| + int choice_count = alternatives_->length(); |
| + int surviving = 0; |
| + RegExpNode* survivor = NULL; |
| + for (int i = 0; i < choice_count; i++) { |
|
ulan
2012/04/25 12:55:28
Instead of duplicating the same loop twice, can we
Erik Corry
2012/04/26 08:23:11
I can't see a way to do in-place filtering, since
|
| + GuardedAlternative alternative = alternatives_->at(i); |
| + ZoneList<Guard*>* guards = alternative.guards(); |
| + int guard_count = (guards == NULL) ? 0 : guards->length(); |
| + if (guard_count == 0) { |
| + RegExpNode* replacement = alternative.node()->FilterASCII(depth - 1); |
| + if (replacement != NULL && replacement != this) { |
| + surviving++; |
| + alternatives_->at(i).set_node(replacement); |
| + survivor = replacement; |
| + } |
| + } else { |
| + surviving++; |
| + } |
| + } |
| + if (surviving == 0) { |
| + replacement_ = NULL; |
| + return NULL; |
| + } |
| + if (surviving == 1 && survivor != NULL) { |
| + replacement_ = survivor; |
| + return survivor; |
| + } |
| + replacement_ = this; |
| + if (surviving == choice_count) { |
| + return this; |
| + } |
| + // Only some of the nodes survived the filtering. We need to rebuild the |
| + // alternatives list. |
| + ZoneList<GuardedAlternative>* new_alternatives = |
| + new ZoneList<GuardedAlternative>(surviving); |
| + for (int i = 0; i < choice_count; i++) { |
| + GuardedAlternative alternative = alternatives_->at(i); |
| + ZoneList<Guard*>* guards = alternative.guards(); |
| + int guard_count = (guards == NULL) ? 0 : guards->length(); |
| + if (guard_count == 0) { |
| + RegExpNode* replacement = alternative.node()->FilterASCII(depth - 1); |
| + if (replacement != NULL && replacement != this) { |
| + GuardedAlternative new_alternative(replacement); |
| + new_alternatives->Add(new_alternative); |
| + } |
| + } else { |
| + new_alternatives->Add(alternative); |
| + } |
| + } |
| + alternatives_ = new_alternatives; |
| + return this; |
| +} |
| + |
| + |
| +RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth) { |
| + if (replacement_ != Uninitialized()) return replacement_; |
| + if (depth < 0) return this; |
| + if (info()->visited) return this; |
| + VisitMarker marker(info()); |
| + // Alternative 0 is the negative lookahead, alternative 1 is what comes |
| + // afterwards. |
| + RegExpNode* node = alternatives_->at(1).node(); |
| + RegExpNode* replacement = node->FilterASCII(depth - 1); |
| + if (node == NULL) { |
|
ulan
2012/04/25 12:55:28
Did you mean replacement == NULL?
Erik Corry
2012/04/26 08:23:11
Oops, yes.
|
| + replacement_ = NULL; |
| + return NULL; |
| + } |
| + alternatives_->at(1).set_node(replacement); |
| + |
| + RegExpNode* neg_node = alternatives_->at(0).node(); |
| + RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1); |
| + // If the negative lookahead is always going to fail then |
| + // we don't need to check it. |
| + if (neg_replacement == NULL) { |
| + replacement_ = replacement; |
| + return replacement; |
| + } |
| + alternatives_->at(0).set_node(neg_replacement); |
| + replacement_ = this; |
| + return this; |
| +} |
| + |
| + |
| void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, |
| RegExpCompiler* compiler, |
| int characters_filled_in, |
| @@ -5690,6 +5859,9 @@ |
| node = loop_node; |
| } |
| } |
| + if (is_ascii) node = node->FilterASCII(RegExpCompiler::kMaxRecursion); |
| + |
| + if (node == NULL) node = new EndNode(EndNode::BACKTRACK); |
| data->node = node; |
| Analysis analysis(ignore_case, is_ascii); |
| analysis.EnsureAnalyzed(node); |