src/jsregexp.cc - Issue 10174017: Regexp: Remove nodes from the regexp that cannot match because

Unified Diff: src/jsregexp.cc

Issue 10174017: Regexp: Remove nodes from the regexp that cannot match because (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: Created 8 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/jsregexp.cc

===================================================================

--- src/jsregexp.cc (revision 11428)

+++ src/jsregexp.cc (working copy)

@@ -380,6 +380,9 @@

}

+int kUninitializedRegExpNodePlaceHolder;

bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,

Handle<String> sample_subject,

bool is_ascii) {

@@ -2426,15 +2429,9 @@

QuickCheckDetails::Position* pos =

details->positions(characters_filled_in);

uc16 c = quarks[i];

- if (c > char_mask) {

- // If we expect a non-ASCII character from an ASCII string,

- // there is no way we can match. Not even case independent

- // matching can turn an ASCII character into non-ASCII or

- // vice versa.

- details->set_cannot_match();

- pos->determines_perfectly = false;

- return;

- }

+ // We should already have filtered out nodes that have non-ASCII

+ // characters if we are matching against an ASCII string.

+ ASSERT(c <= char_mask);

if (compiler->ignore_case()) {

unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(),

@@ -2496,11 +2493,9 @@

int first_range = 0;

while (ranges->at(first_range).from() > char_mask) {

first_range++;

- if (first_range == ranges->length()) {

- details->set_cannot_match();

- pos->determines_perfectly = false;

- return;

- }

+ // We should already have filtered out nodes that cannot match

+ // so the first range should be a valid range.

+ ASSERT(first_range != ranges->length());

}

CharacterRange range = ranges->at(first_range);

uc16 from = range.from();

@@ -2629,6 +2624,180 @@

};

+RegExpNode* SeqRegExpNode::FilterASCII(int depth) {

+ if (replacement_ != Uninitialized()) return replacement_;

+ if (depth < 0) return this;

+ ASSERT(!info()->visited);

+ VisitMarker marker(info());

+ return FilterSuccessor(depth - 1);

+RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) {

+ RegExpNode* next = on_success_->FilterASCII(depth - 1);

+ if (next == NULL) {

+ replacement_ = NULL;

+ return NULL;

+ }

+ on_success_ = next;

+ replacement_ = this;

+ return this;

+RegExpNode* TextNode::FilterASCII(int depth) {

+ if (replacement_ != Uninitialized()) return replacement_;

+ if (depth < 0) return this;

+ ASSERT(!info()->visited);

+ VisitMarker marker(info());

+ int element_count = elms_->length();

+ for (int i = 0; i < element_count; i++) {

+ TextElement elm = elms_->at(i);

+ if (elm.type == TextElement::ATOM) {

+ Vector<const uc16> quarks = elm.data.u_atom->data();

+ for (int j = 0; j < quarks.length(); j++) {

+ // We don't need special handling for case independence

+ // because of the rule that case independence cannot make

+ // a non-ASCII character match an ASCII character.

+ if (quarks[j] > String::kMaxAsciiCharCode) {

+ replacement_ = NULL;

+ return NULL;

+ }

+ } else {

+ ASSERT(elm.type == TextElement::CHAR_CLASS);

+ RegExpCharacterClass* cc = elm.data.u_char_class;

+ ZoneList<CharacterRange>* ranges = cc->ranges();

+ if (!CharacterRange::IsCanonical(ranges)) {

+ CharacterRange::Canonicalize(ranges);

+ }

+ // Now they are in order so we only need to look at the first.

+ int range_count = ranges->length();

+ if (cc->is_negated()) {

+ if (range_count != 0 &&

+ ranges->at(0).from() == 0 &&

+ ranges->at(0).to() >= String::kMaxAsciiCharCode) {

+ replacement_ = NULL;

+ return NULL;

+ }

+ } else {

+ if (range_count == 0 ||

+ ranges->at(0).from() > String::kMaxAsciiCharCode) {

+ replacement_ = NULL;

+ return NULL;

+ }

+ return FilterSuccessor(depth - 1);

+RegExpNode* LoopChoiceNode::FilterASCII(int depth) {

+ if (replacement_ != Uninitialized()) return replacement_;

+ if (depth < 0) return this;

+ if (info()->visited) return this;

+ VisitMarker marker(info());

+ RegExpNode* continue_replacement = continue_node_->FilterASCII(depth - 1);

+ if (continue_replacement == NULL) {

+ // If we can't continue after the loop then there is no sense in doing the

+ // loop.

+ replacement_ = NULL;

+ return NULL;

+ }

+ return ChoiceNode::FilterASCII(depth - 1);

+RegExpNode* ChoiceNode::FilterASCII(int depth) {

+ if (replacement_ != Uninitialized()) return replacement_;

+ if (depth < 0) return this;

+ if (info()->visited) return this;

+ VisitMarker marker(info());

+ int choice_count = alternatives_->length();

+ int surviving = 0;

+ RegExpNode* survivor = NULL;

+ for (int i = 0; i < choice_count; i++) {

ulan 2012/04/25 12:55:28 Instead of duplicating the same loop twice, can we

Erik Corry 2012/04/26 08:23:11 I can't see a way to do in-place filtering, since

+ GuardedAlternative alternative = alternatives_->at(i);

+ ZoneList<Guard*>* guards = alternative.guards();

+ int guard_count = (guards == NULL) ? 0 : guards->length();

+ if (guard_count == 0) {

+ RegExpNode* replacement = alternative.node()->FilterASCII(depth - 1);

+ if (replacement != NULL && replacement != this) {

+ surviving++;

+ alternatives_->at(i).set_node(replacement);

+ survivor = replacement;

+ }

+ } else {

+ surviving++;

+ }

+ if (surviving == 0) {

+ replacement_ = NULL;

+ return NULL;

+ }

+ if (surviving == 1 && survivor != NULL) {

+ replacement_ = survivor;

+ return survivor;

+ }

+ replacement_ = this;

+ if (surviving == choice_count) {

+ return this;

+ }

+ // Only some of the nodes survived the filtering. We need to rebuild the

+ // alternatives list.

+ ZoneList<GuardedAlternative>* new_alternatives =

+ new ZoneList<GuardedAlternative>(surviving);

+ for (int i = 0; i < choice_count; i++) {

+ GuardedAlternative alternative = alternatives_->at(i);

+ ZoneList<Guard*>* guards = alternative.guards();

+ int guard_count = (guards == NULL) ? 0 : guards->length();

+ if (guard_count == 0) {

+ RegExpNode* replacement = alternative.node()->FilterASCII(depth - 1);

+ if (replacement != NULL && replacement != this) {

+ GuardedAlternative new_alternative(replacement);

+ new_alternatives->Add(new_alternative);

+ }

+ } else {

+ new_alternatives->Add(alternative);

+ }

+ alternatives_ = new_alternatives;

+ return this;

+RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth) {

+ if (replacement_ != Uninitialized()) return replacement_;

+ if (depth < 0) return this;

+ if (info()->visited) return this;

+ VisitMarker marker(info());

+ // Alternative 0 is the negative lookahead, alternative 1 is what comes

+ // afterwards.

+ RegExpNode* node = alternatives_->at(1).node();

+ RegExpNode* replacement = node->FilterASCII(depth - 1);

+ if (node == NULL) {

ulan 2012/04/25 12:55:28 Did you mean replacement == NULL?

Erik Corry 2012/04/26 08:23:11 Oops, yes.

+ replacement_ = NULL;

+ return NULL;

+ }

+ alternatives_->at(1).set_node(replacement);

+ RegExpNode* neg_node = alternatives_->at(0).node();

+ RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1);

+ // If the negative lookahead is always going to fail then

+ // we don't need to check it.

+ if (neg_replacement == NULL) {

+ replacement_ = replacement;

+ return replacement;

+ }

+ alternatives_->at(0).set_node(neg_replacement);

+ replacement_ = this;

+ return this;

void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,

RegExpCompiler* compiler,

int characters_filled_in,

@@ -5690,6 +5859,9 @@

node = loop_node;

}

+ if (is_ascii) node = node->FilterASCII(RegExpCompiler::kMaxRecursion);

+ if (node == NULL) node = new EndNode(EndNode::BACKTRACK);

data->node = node;

Analysis analysis(ignore_case, is_ascii);

analysis.EnsureAnalyzed(node);

« src/jsregexp.h ('K') | « src/jsregexp.h ('k') | test/mjsunit/regexp-capture-3.js » ('j') | test/mjsunit/regexp-capture-3.js » ('J')