Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(119)

Unified Diff: src/jsregexp.cc

Issue 10174017: Regexp: Remove nodes from the regexp that cannot match because (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 8 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « src/jsregexp.h ('k') | test/mjsunit/regexp-capture-3.js » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/jsregexp.cc
===================================================================
--- src/jsregexp.cc (revision 11428)
+++ src/jsregexp.cc (working copy)
@@ -2426,15 +2426,9 @@
QuickCheckDetails::Position* pos =
details->positions(characters_filled_in);
uc16 c = quarks[i];
- if (c > char_mask) {
- // If we expect a non-ASCII character from an ASCII string,
- // there is no way we can match. Not even case independent
- // matching can turn an ASCII character into non-ASCII or
- // vice versa.
- details->set_cannot_match();
- pos->determines_perfectly = false;
- return;
- }
+ // We should already have filtered out nodes that have non-ASCII
+ // characters if we are matching against an ASCII string.
+ ASSERT(c <= char_mask);
if (compiler->ignore_case()) {
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(),
@@ -2496,11 +2490,9 @@
int first_range = 0;
while (ranges->at(first_range).from() > char_mask) {
first_range++;
- if (first_range == ranges->length()) {
- details->set_cannot_match();
- pos->determines_perfectly = false;
- return;
- }
+ // We should already have filtered out nodes that cannot match
+ // so the first range should be a valid range.
+ ASSERT(first_range != ranges->length());
}
CharacterRange range = ranges->at(first_range);
uc16 from = range.from();
@@ -2629,6 +2621,144 @@
};
+RegExpNode* SeqRegExpNode::FilterASCII(int depth) {
+ if (info()->replacement_calculated) return replacement();
+ if (depth < 0) return this;
+ ASSERT(!info()->visited);
+ VisitMarker marker(info());
+ return FilterSuccessor(depth - 1);
+}
+
+
+RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) {
+ RegExpNode* next = on_success_->FilterASCII(depth - 1);
+ if (next == NULL) return set_replacement(NULL);
+ on_success_ = next;
+ return set_replacement(this);
+}
+
+
+RegExpNode* TextNode::FilterASCII(int depth) {
+ if (info()->replacement_calculated) return replacement();
+ if (depth < 0) return this;
+ ASSERT(!info()->visited);
+ VisitMarker marker(info());
+ int element_count = elms_->length();
+ for (int i = 0; i < element_count; i++) {
+ TextElement elm = elms_->at(i);
+ if (elm.type == TextElement::ATOM) {
+ Vector<const uc16> quarks = elm.data.u_atom->data();
+ for (int j = 0; j < quarks.length(); j++) {
+ // We don't need special handling for case independence
+ // because of the rule that case independence cannot make
+ // a non-ASCII character match an ASCII character.
+ if (quarks[j] > String::kMaxAsciiCharCode) {
+ return set_replacement(NULL);
+ }
+ }
+ } else {
+ ASSERT(elm.type == TextElement::CHAR_CLASS);
+ RegExpCharacterClass* cc = elm.data.u_char_class;
+ ZoneList<CharacterRange>* ranges = cc->ranges();
+ if (!CharacterRange::IsCanonical(ranges)) {
+ CharacterRange::Canonicalize(ranges);
+ }
+ // Now they are in order so we only need to look at the first.
+ int range_count = ranges->length();
+ if (cc->is_negated()) {
+ if (range_count != 0 &&
+ ranges->at(0).from() == 0 &&
+ ranges->at(0).to() >= String::kMaxAsciiCharCode) {
+ return set_replacement(NULL);
+ }
+ } else {
+ if (range_count == 0 ||
+ ranges->at(0).from() > String::kMaxAsciiCharCode) {
+ return set_replacement(NULL);
+ }
+ }
+ }
+ }
+ return FilterSuccessor(depth - 1);
+}
+
+
+RegExpNode* LoopChoiceNode::FilterASCII(int depth) {
+ if (info()->replacement_calculated) return replacement();
+ if (depth < 0) return this;
+ if (info()->visited) return this;
+ VisitMarker marker(info());
+
+ RegExpNode* continue_replacement = continue_node_->FilterASCII(depth - 1);
+ // If we can't continue after the loop then there is no sense in doing the
+ // loop.
+ if (continue_replacement == NULL) return set_replacement(NULL);
+
+ return ChoiceNode::FilterASCII(depth - 1);
+}
+
+
+RegExpNode* ChoiceNode::FilterASCII(int depth) {
+ if (info()->replacement_calculated) return replacement();
+ if (depth < 0) return this;
+ if (info()->visited) return this;
+ VisitMarker marker(info());
+ int choice_count = alternatives_->length();
+ int surviving = 0;
+ RegExpNode* survivor = NULL;
+ for (int i = 0; i < choice_count; i++) {
+ GuardedAlternative alternative = alternatives_->at(i);
+ RegExpNode* replacement = alternative.node()->FilterASCII(depth - 1);
+ ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK.
+ alternatives_->at(i).set_node(replacement);
+ if (replacement != NULL) {
+ surviving++;
+ survivor = replacement;
+ }
+ }
+ if (surviving < 2) return set_replacement(survivor);
+
+ set_replacement(this);
+ if (surviving == choice_count) {
+ return this;
+ }
+ // Only some of the nodes survived the filtering. We need to rebuild the
+ // alternatives list.
+ ZoneList<GuardedAlternative>* new_alternatives =
+ new ZoneList<GuardedAlternative>(surviving);
+ for (int i = 0; i < choice_count; i++) {
+ GuardedAlternative alternative = alternatives_->at(i);
+ if (alternative.node() != NULL) {
+ new_alternatives->Add(alternative);
+ }
+ }
+ alternatives_ = new_alternatives;
+ return this;
+}
+
+
+RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth) {
+ if (info()->replacement_calculated) return replacement();
+ if (depth < 0) return this;
+ if (info()->visited) return this;
+ VisitMarker marker(info());
+ // Alternative 0 is the negative lookahead, alternative 1 is what comes
+ // afterwards.
+ RegExpNode* node = alternatives_->at(1).node();
+ RegExpNode* replacement = node->FilterASCII(depth - 1);
+ if (replacement == NULL) return set_replacement(NULL);
+ alternatives_->at(1).set_node(replacement);
+
+ RegExpNode* neg_node = alternatives_->at(0).node();
+ RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1);
+ // If the negative lookahead is always going to fail then
+ // we don't need to check it.
+ if (neg_replacement == NULL) return set_replacement(replacement);
+ alternatives_->at(0).set_node(neg_replacement);
+ return set_replacement(this);
+}
+
+
void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler,
int characters_filled_in,
@@ -5690,6 +5820,9 @@
node = loop_node;
}
}
+ if (is_ascii) node = node->FilterASCII(RegExpCompiler::kMaxRecursion);
+
+ if (node == NULL) node = new EndNode(EndNode::BACKTRACK);
data->node = node;
Analysis analysis(ignore_case, is_ascii);
analysis.EnsureAnalyzed(node);
« no previous file with comments | « src/jsregexp.h ('k') | test/mjsunit/regexp-capture-3.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698