Index: src/jsregexp.cc |
diff --git a/src/jsregexp.cc b/src/jsregexp.cc |
index 813208c9590e651749321a818351488629da4f1f..347fc03e7b26d96751bc0e270296bf828a0a41e3 100644 |
--- a/src/jsregexp.cc |
+++ b/src/jsregexp.cc |
@@ -1681,7 +1681,7 @@ static int GetCaseIndependentLetters(Isolate* isolate, |
letters[0] = character; |
length = 1; |
} |
- if (!ascii_subject || character <= String::kMaxAsciiCharCode) { |
+ if (!ascii_subject || character <= String::kMaxOneByteCharCode) { |
return length; |
} |
// The standard requires that non-ASCII characters cannot have ASCII |
@@ -1732,7 +1732,7 @@ static inline bool EmitAtomNonLetter(Isolate* isolate, |
bool checked = false; |
// We handle the length > 1 case in a later pass. |
if (length == 1) { |
- if (ascii && c > String::kMaxAsciiCharCodeU) { |
+ if (ascii && c > String::kMaxOneByteCharCodeU) { |
// Can't match - see above. |
return false; // Bounds not checked. |
} |
@@ -1753,7 +1753,7 @@ static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, |
Label* on_failure) { |
uc16 char_mask; |
if (ascii) { |
- char_mask = String::kMaxAsciiCharCode; |
+ char_mask = String::kMaxOneByteCharCode; |
} else { |
char_mask = String::kMaxUtf16CodeUnit; |
} |
@@ -2007,7 +2007,7 @@ static void SplitSearchSpace(ZoneList<int>* ranges, |
// range with a single not-taken branch, speeding up this important |
// character range (even non-ASCII charset-based text has spaces and |
// punctuation). |
- if (*border - 1 > String::kMaxAsciiCharCode && // ASCII case. |
+ if (*border - 1 > String::kMaxOneByteCharCode && // ASCII case. |
end_index - start_index > (*new_start_index - start_index) * 2 && |
last - first > kSize * 2 && |
binary_chop_index > *new_start_index && |
@@ -2211,7 +2211,7 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler, |
int max_char; |
if (ascii) { |
- max_char = String::kMaxAsciiCharCode; |
+ max_char = String::kMaxOneByteCharCode; |
} else { |
max_char = String::kMaxUtf16CodeUnit; |
} |
@@ -2513,7 +2513,7 @@ bool QuickCheckDetails::Rationalize(bool asc) { |
bool found_useful_op = false; |
uint32_t char_mask; |
if (asc) { |
- char_mask = String::kMaxAsciiCharCode; |
+ char_mask = String::kMaxOneByteCharCode; |
} else { |
char_mask = String::kMaxUtf16CodeUnit; |
} |
@@ -2522,7 +2522,7 @@ bool QuickCheckDetails::Rationalize(bool asc) { |
int char_shift = 0; |
for (int i = 0; i < characters_; i++) { |
Position* pos = &positions_[i]; |
- if ((pos->mask & String::kMaxAsciiCharCode) != 0) { |
+ if ((pos->mask & String::kMaxOneByteCharCode) != 0) { |
found_useful_op = true; |
} |
mask_ |= (pos->mask & char_mask) << char_shift; |
@@ -2565,7 +2565,7 @@ bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler, |
// load so the value is already masked down. |
uint32_t char_mask; |
if (compiler->ascii()) { |
- char_mask = String::kMaxAsciiCharCode; |
+ char_mask = String::kMaxOneByteCharCode; |
} else { |
char_mask = String::kMaxUtf16CodeUnit; |
} |
@@ -2575,7 +2575,11 @@ bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler, |
// For 2-character preloads in ASCII mode or 1-character preloads in |
// TWO_BYTE mode we also use a 16 bit load with zero extend. |
if (details->characters() == 2 && compiler->ascii()) { |
- if ((mask & 0x7f7f) == 0x7f7f) need_mask = false; |
+#ifndef ENABLE_LATIN_1 |
+ if ((mask & 0x7f7f) == 0x7f7f) need_mask = false; |
+#else |
+ if ((mask & 0xffff) == 0xffff) need_mask = false; |
+#endif |
} else if (details->characters() == 1 && !compiler->ascii()) { |
if ((mask & 0xffff) == 0xffff) need_mask = false; |
} else { |
@@ -2617,7 +2621,7 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, |
int characters = details->characters(); |
int char_mask; |
if (compiler->ascii()) { |
- char_mask = String::kMaxAsciiCharCode; |
+ char_mask = String::kMaxOneByteCharCode; |
} else { |
char_mask = String::kMaxUtf16CodeUnit; |
} |
@@ -2834,24 +2838,24 @@ class VisitMarker { |
}; |
-RegExpNode* SeqRegExpNode::FilterASCII(int depth) { |
+RegExpNode* SeqRegExpNode::FilterASCII(int depth, bool ignore_case) { |
if (info()->replacement_calculated) return replacement(); |
if (depth < 0) return this; |
ASSERT(!info()->visited); |
VisitMarker marker(info()); |
- return FilterSuccessor(depth - 1); |
+ return FilterSuccessor(depth - 1, ignore_case); |
} |
-RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) { |
- RegExpNode* next = on_success_->FilterASCII(depth - 1); |
+RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) { |
+ RegExpNode* next = on_success_->FilterASCII(depth - 1, ignore_case); |
if (next == NULL) return set_replacement(NULL); |
on_success_ = next; |
return set_replacement(this); |
} |
-RegExpNode* TextNode::FilterASCII(int depth) { |
+RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) { |
if (info()->replacement_calculated) return replacement(); |
if (depth < 0) return this; |
ASSERT(!info()->visited); |
@@ -2862,15 +2866,40 @@ RegExpNode* TextNode::FilterASCII(int depth) { |
if (elm.type == TextElement::ATOM) { |
Vector<const uc16> quarks = elm.data.u_atom->data(); |
for (int j = 0; j < quarks.length(); j++) { |
- // We don't need special handling for case independence |
- // because of the rule that case independence cannot make |
- // a non-ASCII character match an ASCII character. |
- if (quarks[j] > String::kMaxAsciiCharCode) { |
+#ifndef ENABLE_LATIN_1 |
+ if (quarks[j] > String::kMaxOneByteCharCode) { |
return set_replacement(NULL); |
} |
+#else |
+ if (quarks[j] <= String::kMaxOneByteCharCode) continue; |
+ if (!ignore_case) return set_replacement(NULL); |
+ // Here the character itself is outside the Latin-1 range. Keep it only if |
+ // its lower- or upper-case form may fall within the Latin-1 range. |
+ // TODO(dcarney): Replace this code with a simple |
+ // table lookup in unibrow::Latin-1. |
+ // TODO(dcarney): Add test cases. |
+ unibrow::uchar result; |
+ int chars; |
+ chars = unibrow::ToLowercase::Convert(quarks[j], 0, &result, NULL); |
+ if (chars > 1 || |
+ (chars == 1 && result <= String::kMaxOneByteCharCodeU)) { |
+ continue; |
+ } |
+ chars = unibrow::ToUppercase::Convert(quarks[j], 0, &result, NULL); |
+ if (chars > 1 || |
+ (chars == 1 && result <= String::kMaxOneByteCharCodeU)) { |
+ continue; |
+ } |
+ // This character is definitely not in the Latin-1 range. |
+ return set_replacement(NULL); |
+#endif |
} |
} else { |
ASSERT(elm.type == TextElement::CHAR_CLASS); |
+#ifdef ENABLE_LATIN_1 |
+ // TODO(dcarney): Can this be improved? |
+ if (ignore_case) continue; |
+#endif |
RegExpCharacterClass* cc = elm.data.u_char_class; |
ZoneList<CharacterRange>* ranges = cc->ranges(zone()); |
if (!CharacterRange::IsCanonical(ranges)) { |
@@ -2881,39 +2910,40 @@ RegExpNode* TextNode::FilterASCII(int depth) { |
if (cc->is_negated()) { |
if (range_count != 0 && |
ranges->at(0).from() == 0 && |
- ranges->at(0).to() >= String::kMaxAsciiCharCode) { |
+ ranges->at(0).to() >= String::kMaxOneByteCharCode) { |
return set_replacement(NULL); |
} |
} else { |
if (range_count == 0 || |
- ranges->at(0).from() > String::kMaxAsciiCharCode) { |
+ ranges->at(0).from() > String::kMaxOneByteCharCode) { |
return set_replacement(NULL); |
} |
} |
} |
} |
- return FilterSuccessor(depth - 1); |
+ return FilterSuccessor(depth - 1, ignore_case); |
} |
-RegExpNode* LoopChoiceNode::FilterASCII(int depth) { |
+RegExpNode* LoopChoiceNode::FilterASCII(int depth, bool ignore_case) { |
if (info()->replacement_calculated) return replacement(); |
if (depth < 0) return this; |
if (info()->visited) return this; |
{ |
VisitMarker marker(info()); |
- RegExpNode* continue_replacement = continue_node_->FilterASCII(depth - 1); |
+ RegExpNode* continue_replacement = |
+ continue_node_->FilterASCII(depth - 1, ignore_case); |
// If we can't continue after the loop then there is no sense in doing the |
// loop. |
if (continue_replacement == NULL) return set_replacement(NULL); |
} |
- return ChoiceNode::FilterASCII(depth - 1); |
+ return ChoiceNode::FilterASCII(depth - 1, ignore_case); |
} |
-RegExpNode* ChoiceNode::FilterASCII(int depth) { |
+RegExpNode* ChoiceNode::FilterASCII(int depth, bool ignore_case) { |
if (info()->replacement_calculated) return replacement(); |
if (depth < 0) return this; |
if (info()->visited) return this; |
@@ -2932,7 +2962,8 @@ RegExpNode* ChoiceNode::FilterASCII(int depth) { |
RegExpNode* survivor = NULL; |
for (int i = 0; i < choice_count; i++) { |
GuardedAlternative alternative = alternatives_->at(i); |
- RegExpNode* replacement = alternative.node()->FilterASCII(depth - 1); |
+ RegExpNode* replacement = |
+ alternative.node()->FilterASCII(depth - 1, ignore_case); |
ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK. |
if (replacement != NULL) { |
alternatives_->at(i).set_node(replacement); |
@@ -2952,7 +2983,7 @@ RegExpNode* ChoiceNode::FilterASCII(int depth) { |
new(zone()) ZoneList<GuardedAlternative>(surviving, zone()); |
for (int i = 0; i < choice_count; i++) { |
RegExpNode* replacement = |
- alternatives_->at(i).node()->FilterASCII(depth - 1); |
+ alternatives_->at(i).node()->FilterASCII(depth - 1, ignore_case); |
if (replacement != NULL) { |
alternatives_->at(i).set_node(replacement); |
new_alternatives->Add(alternatives_->at(i), zone()); |
@@ -2963,7 +2994,8 @@ RegExpNode* ChoiceNode::FilterASCII(int depth) { |
} |
-RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth) { |
+RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth, |
+ bool ignore_case) { |
if (info()->replacement_calculated) return replacement(); |
if (depth < 0) return this; |
if (info()->visited) return this; |
@@ -2971,12 +3003,12 @@ RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth) { |
// Alternative 0 is the negative lookahead, alternative 1 is what comes |
// afterwards. |
RegExpNode* node = alternatives_->at(1).node(); |
- RegExpNode* replacement = node->FilterASCII(depth - 1); |
+ RegExpNode* replacement = node->FilterASCII(depth - 1, ignore_case); |
if (replacement == NULL) return set_replacement(NULL); |
alternatives_->at(1).set_node(replacement); |
RegExpNode* neg_node = alternatives_->at(0).node(); |
- RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1); |
+ RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1, ignore_case); |
// If the negative lookahead is always going to fail then |
// we don't need to check it. |
if (neg_replacement == NULL) return set_replacement(replacement); |
@@ -3299,7 +3331,7 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, |
switch (pass) { |
case NON_ASCII_MATCH: |
ASSERT(ascii); |
- if (quarks[j] > String::kMaxAsciiCharCode) { |
+ if (quarks[j] > String::kMaxOneByteCharCode) { |
assembler->GoTo(backtrack); |
return; |
} |
@@ -3498,7 +3530,7 @@ RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( |
if (ranges->length() != 1) return NULL; |
uint32_t max_char; |
if (compiler->ascii()) { |
- max_char = String::kMaxAsciiCharCode; |
+ max_char = String::kMaxOneByteCharCode; |
} else { |
max_char = String::kMaxUtf16CodeUnit; |
} |
@@ -3698,7 +3730,7 @@ BoyerMooreLookahead::BoyerMooreLookahead( |
: length_(length), |
compiler_(compiler) { |
if (compiler->ascii()) { |
- max_char_ = String::kMaxAsciiCharCode; |
+ max_char_ = String::kMaxOneByteCharCode; |
} else { |
max_char_ = String::kMaxUtf16CodeUnit; |
} |
@@ -5337,8 +5369,8 @@ void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, |
uc16 bottom = from(); |
uc16 top = to(); |
if (is_ascii) { |
- if (bottom > String::kMaxAsciiCharCode) return; |
- if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode; |
+ if (bottom > String::kMaxOneByteCharCode) return; |
+ if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; |
} |
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
if (top == bottom) { |
@@ -5885,7 +5917,7 @@ void TextNode::FillInBMInfo(int initial_offset, |
int length = GetCaseIndependentLetters( |
ISOLATE, |
character, |
- bm->max_char() == String::kMaxAsciiCharCode, |
+ bm->max_char() == String::kMaxOneByteCharCode, |
chars); |
for (int j = 0; j < length; j++) { |
bm->Set(offset, chars[j]); |
@@ -6099,10 +6131,12 @@ RegExpEngine::CompilationResult RegExpEngine::Compile( |
} |
} |
if (is_ascii) { |
- node = node->FilterASCII(RegExpCompiler::kMaxRecursion); |
+ node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case); |
// Do it again to propagate the new nodes to places where they were not |
// put because they had not been calculated yet. |
- if (node != NULL) node = node->FilterASCII(RegExpCompiler::kMaxRecursion); |
+ if (node != NULL) { |
+ node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case); |
+ } |
} |
if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone); |