Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(290)

Unified Diff: src/jsregexp.cc

Issue 11759008: Introduce ENABLE_LATIN_1 compile flag (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge
Patch Set: Fix FilterASCII (created 7 years, 11 months ago)
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « src/jsregexp.h ('k') | src/log.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments / Hide Comments ('s')
Index: src/jsregexp.cc
diff --git a/src/jsregexp.cc b/src/jsregexp.cc
index 813208c9590e651749321a818351488629da4f1f..347fc03e7b26d96751bc0e270296bf828a0a41e3 100644
--- a/src/jsregexp.cc
+++ b/src/jsregexp.cc
@@ -1681,7 +1681,7 @@ static int GetCaseIndependentLetters(Isolate* isolate,
letters[0] = character;
length = 1;
}
- if (!ascii_subject || character <= String::kMaxAsciiCharCode) {
+ if (!ascii_subject || character <= String::kMaxOneByteCharCode) {
return length;
}
// The standard requires that non-ASCII characters cannot have ASCII
@@ -1732,7 +1732,7 @@ static inline bool EmitAtomNonLetter(Isolate* isolate,
bool checked = false;
// We handle the length > 1 case in a later pass.
if (length == 1) {
- if (ascii && c > String::kMaxAsciiCharCodeU) {
+ if (ascii && c > String::kMaxOneByteCharCodeU) {
// Can't match - see above.
return false; // Bounds not checked.
}
@@ -1753,7 +1753,7 @@ static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
Label* on_failure) {
uc16 char_mask;
if (ascii) {
- char_mask = String::kMaxAsciiCharCode;
+ char_mask = String::kMaxOneByteCharCode;
} else {
char_mask = String::kMaxUtf16CodeUnit;
}
@@ -2007,7 +2007,7 @@ static void SplitSearchSpace(ZoneList<int>* ranges,
// range with a single not-taken branch, speeding up this important
// character range (even non-ASCII charset-based text has spaces and
// punctuation).
- if (*border - 1 > String::kMaxAsciiCharCode && // ASCII case.
+ if (*border - 1 > String::kMaxOneByteCharCode && // ASCII case.
end_index - start_index > (*new_start_index - start_index) * 2 &&
last - first > kSize * 2 &&
binary_chop_index > *new_start_index &&
@@ -2211,7 +2211,7 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
int max_char;
if (ascii) {
- max_char = String::kMaxAsciiCharCode;
+ max_char = String::kMaxOneByteCharCode;
} else {
max_char = String::kMaxUtf16CodeUnit;
}
@@ -2513,7 +2513,7 @@ bool QuickCheckDetails::Rationalize(bool asc) {
bool found_useful_op = false;
uint32_t char_mask;
if (asc) {
- char_mask = String::kMaxAsciiCharCode;
+ char_mask = String::kMaxOneByteCharCode;
} else {
char_mask = String::kMaxUtf16CodeUnit;
}
@@ -2522,7 +2522,7 @@ bool QuickCheckDetails::Rationalize(bool asc) {
int char_shift = 0;
for (int i = 0; i < characters_; i++) {
Position* pos = &positions_[i];
- if ((pos->mask & String::kMaxAsciiCharCode) != 0) {
+ if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
found_useful_op = true;
}
mask_ |= (pos->mask & char_mask) << char_shift;
@@ -2565,7 +2565,7 @@ bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
// load so the value is already masked down.
uint32_t char_mask;
if (compiler->ascii()) {
- char_mask = String::kMaxAsciiCharCode;
+ char_mask = String::kMaxOneByteCharCode;
} else {
char_mask = String::kMaxUtf16CodeUnit;
}
@@ -2575,7 +2575,11 @@ bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
// For 2-character preloads in ASCII mode or 1-character preloads in
// TWO_BYTE mode we also use a 16 bit load with zero extend.
if (details->characters() == 2 && compiler->ascii()) {
- if ((mask & 0x7f7f) == 0x7f7f) need_mask = false;
+#ifndef ENABLE_LATIN_1
+ if ((mask & 0x7f7f) == 0xffff) need_mask = false;
+#else
+ if ((mask & 0xffff) == 0xffff) need_mask = false;
+#endif
} else if (details->characters() == 1 && !compiler->ascii()) {
if ((mask & 0xffff) == 0xffff) need_mask = false;
} else {
@@ -2617,7 +2621,7 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
int characters = details->characters();
int char_mask;
if (compiler->ascii()) {
- char_mask = String::kMaxAsciiCharCode;
+ char_mask = String::kMaxOneByteCharCode;
} else {
char_mask = String::kMaxUtf16CodeUnit;
}
@@ -2834,24 +2838,24 @@ class VisitMarker {
};
-RegExpNode* SeqRegExpNode::FilterASCII(int depth) {
+RegExpNode* SeqRegExpNode::FilterASCII(int depth, bool ignore_case) {
if (info()->replacement_calculated) return replacement();
if (depth < 0) return this;
ASSERT(!info()->visited);
VisitMarker marker(info());
- return FilterSuccessor(depth - 1);
+ return FilterSuccessor(depth - 1, ignore_case);
}
-RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) {
- RegExpNode* next = on_success_->FilterASCII(depth - 1);
+RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
+ RegExpNode* next = on_success_->FilterASCII(depth - 1, ignore_case);
if (next == NULL) return set_replacement(NULL);
on_success_ = next;
return set_replacement(this);
}
-RegExpNode* TextNode::FilterASCII(int depth) {
+RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
if (info()->replacement_calculated) return replacement();
if (depth < 0) return this;
ASSERT(!info()->visited);
@@ -2862,15 +2866,40 @@ RegExpNode* TextNode::FilterASCII(int depth) {
if (elm.type == TextElement::ATOM) {
Vector<const uc16> quarks = elm.data.u_atom->data();
for (int j = 0; j < quarks.length(); j++) {
- // We don't need special handling for case independence
- // because of the rule that case independence cannot make
- // a non-ASCII character match an ASCII character.
- if (quarks[j] > String::kMaxAsciiCharCode) {
+#ifndef ENABLE_LATIN_1
+ if (quarks[j] > String::kMaxOneByteCharCode) {
return set_replacement(NULL);
}
+#else
+ if (quarks[j] <= String::kMaxOneByteCharCode) continue;
+ if (!ignore_case) return set_replacement(NULL);
+ // Here, we need to check for characters whose upper and lower cases
+ // are outside the Latin-1 range.
+ // TODO(dcarney): Replace this code with a simple
+ // table lookup in unibrow::Latin-1.
+ // TODO(dcarney): Add test cases.
+ unibrow::uchar result;
+ int chars;
+ chars = unibrow::ToLowercase::Convert(quarks[j], 0, &result, NULL);
+ if (chars > 1 ||
+ (chars == 1 && result <= String::kMaxOneByteCharCodeU)) {
+ continue;
+ }
+ chars = unibrow::ToUppercase::Convert(quarks[j], 0, &result, NULL);
+ if (chars > 1 ||
+ (chars == 1 && result <= String::kMaxOneByteCharCodeU)) {
+ continue;
+ }
+ // This character is definitely not in the Latin-1 range.
+ return set_replacement(NULL);
+#endif
}
} else {
ASSERT(elm.type == TextElement::CHAR_CLASS);
+#ifdef ENABLE_LATIN_1
+ // TODO(dcarney): Can this be improved?
+ if (ignore_case) continue;
+#endif
RegExpCharacterClass* cc = elm.data.u_char_class;
ZoneList<CharacterRange>* ranges = cc->ranges(zone());
if (!CharacterRange::IsCanonical(ranges)) {
@@ -2881,39 +2910,40 @@ RegExpNode* TextNode::FilterASCII(int depth) {
if (cc->is_negated()) {
if (range_count != 0 &&
ranges->at(0).from() == 0 &&
- ranges->at(0).to() >= String::kMaxAsciiCharCode) {
+ ranges->at(0).to() >= String::kMaxOneByteCharCode) {
return set_replacement(NULL);
}
} else {
if (range_count == 0 ||
- ranges->at(0).from() > String::kMaxAsciiCharCode) {
+ ranges->at(0).from() > String::kMaxOneByteCharCode) {
return set_replacement(NULL);
}
}
}
}
- return FilterSuccessor(depth - 1);
+ return FilterSuccessor(depth - 1, ignore_case);
}
-RegExpNode* LoopChoiceNode::FilterASCII(int depth) {
+RegExpNode* LoopChoiceNode::FilterASCII(int depth, bool ignore_case) {
if (info()->replacement_calculated) return replacement();
if (depth < 0) return this;
if (info()->visited) return this;
{
VisitMarker marker(info());
- RegExpNode* continue_replacement = continue_node_->FilterASCII(depth - 1);
+ RegExpNode* continue_replacement =
+ continue_node_->FilterASCII(depth - 1, ignore_case);
// If we can't continue after the loop then there is no sense in doing the
// loop.
if (continue_replacement == NULL) return set_replacement(NULL);
}
- return ChoiceNode::FilterASCII(depth - 1);
+ return ChoiceNode::FilterASCII(depth - 1, ignore_case);
}
-RegExpNode* ChoiceNode::FilterASCII(int depth) {
+RegExpNode* ChoiceNode::FilterASCII(int depth, bool ignore_case) {
if (info()->replacement_calculated) return replacement();
if (depth < 0) return this;
if (info()->visited) return this;
@@ -2932,7 +2962,8 @@ RegExpNode* ChoiceNode::FilterASCII(int depth) {
RegExpNode* survivor = NULL;
for (int i = 0; i < choice_count; i++) {
GuardedAlternative alternative = alternatives_->at(i);
- RegExpNode* replacement = alternative.node()->FilterASCII(depth - 1);
+ RegExpNode* replacement =
+ alternative.node()->FilterASCII(depth - 1, ignore_case);
ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK.
if (replacement != NULL) {
alternatives_->at(i).set_node(replacement);
@@ -2952,7 +2983,7 @@ RegExpNode* ChoiceNode::FilterASCII(int depth) {
new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
for (int i = 0; i < choice_count; i++) {
RegExpNode* replacement =
- alternatives_->at(i).node()->FilterASCII(depth - 1);
+ alternatives_->at(i).node()->FilterASCII(depth - 1, ignore_case);
if (replacement != NULL) {
alternatives_->at(i).set_node(replacement);
new_alternatives->Add(alternatives_->at(i), zone());
@@ -2963,7 +2994,8 @@ RegExpNode* ChoiceNode::FilterASCII(int depth) {
}
-RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth) {
+RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth,
+ bool ignore_case) {
if (info()->replacement_calculated) return replacement();
if (depth < 0) return this;
if (info()->visited) return this;
@@ -2971,12 +3003,12 @@ RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth) {
// Alternative 0 is the negative lookahead, alternative 1 is what comes
// afterwards.
RegExpNode* node = alternatives_->at(1).node();
- RegExpNode* replacement = node->FilterASCII(depth - 1);
+ RegExpNode* replacement = node->FilterASCII(depth - 1, ignore_case);
if (replacement == NULL) return set_replacement(NULL);
alternatives_->at(1).set_node(replacement);
RegExpNode* neg_node = alternatives_->at(0).node();
- RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1);
+ RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1, ignore_case);
// If the negative lookahead is always going to fail then
// we don't need to check it.
if (neg_replacement == NULL) return set_replacement(replacement);
@@ -3299,7 +3331,7 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler,
switch (pass) {
case NON_ASCII_MATCH:
ASSERT(ascii);
- if (quarks[j] > String::kMaxAsciiCharCode) {
+ if (quarks[j] > String::kMaxOneByteCharCode) {
assembler->GoTo(backtrack);
return;
}
@@ -3498,7 +3530,7 @@ RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
if (ranges->length() != 1) return NULL;
uint32_t max_char;
if (compiler->ascii()) {
- max_char = String::kMaxAsciiCharCode;
+ max_char = String::kMaxOneByteCharCode;
} else {
max_char = String::kMaxUtf16CodeUnit;
}
@@ -3698,7 +3730,7 @@ BoyerMooreLookahead::BoyerMooreLookahead(
: length_(length),
compiler_(compiler) {
if (compiler->ascii()) {
- max_char_ = String::kMaxAsciiCharCode;
+ max_char_ = String::kMaxOneByteCharCode;
} else {
max_char_ = String::kMaxUtf16CodeUnit;
}
@@ -5337,8 +5369,8 @@ void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
uc16 bottom = from();
uc16 top = to();
if (is_ascii) {
- if (bottom > String::kMaxAsciiCharCode) return;
- if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode;
+ if (bottom > String::kMaxOneByteCharCode) return;
+ if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
}
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
if (top == bottom) {
@@ -5885,7 +5917,7 @@ void TextNode::FillInBMInfo(int initial_offset,
int length = GetCaseIndependentLetters(
ISOLATE,
character,
- bm->max_char() == String::kMaxAsciiCharCode,
+ bm->max_char() == String::kMaxOneByteCharCode,
chars);
for (int j = 0; j < length; j++) {
bm->Set(offset, chars[j]);
@@ -6099,10 +6131,12 @@ RegExpEngine::CompilationResult RegExpEngine::Compile(
}
}
if (is_ascii) {
- node = node->FilterASCII(RegExpCompiler::kMaxRecursion);
+ node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case);
// Do it again to propagate the new nodes to places where they were not
// put because they had not been calculated yet.
- if (node != NULL) node = node->FilterASCII(RegExpCompiler::kMaxRecursion);
+ if (node != NULL) {
+ node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case);
+ }
}
if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
« no previous file with comments | « src/jsregexp.h ('k') | src/log.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698