Chromium Code Reviews

Unified Diff: src/jsregexp.cc

Issue 9854020: RegExp: Add support for table-based character class (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 8 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View side-by-side diff with in-line comments
Index: src/jsregexp.cc
===================================================================
--- src/jsregexp.cc (revision 11134)
+++ src/jsregexp.cc (working copy)
@@ -1534,6 +1534,353 @@
}
+static void EmitBoundaryTest(RegExpMacroAssembler* masm,
+ int border,
+ Label* fall_through,
+ Label* above_or_equal,
+ Label* below) {
+ if (below != fall_through) {
+ masm->CheckCharacterLT(border, below);
+ if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
+ } else {
+ masm->CheckCharacterGT(border - 1, above_or_equal);
+ }
+}
+
+
+static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
+ int first,
+ int last,
+ Label* fall_through,
+ Label* in_range,
+ Label* out_of_range) {
+ if (in_range == fall_through) {
+ if (first == last) {
+ masm->CheckNotCharacter(first, out_of_range);
+ } else {
+ masm->CheckCharacterNotInRange(first, last, out_of_range);
+ }
+ } else {
+ if (first == last) {
+ masm->CheckCharacter(first, in_range);
+ } else {
+ masm->CheckCharacterInRange(first, last, in_range);
+ }
+ if (out_of_range != fall_through) masm->GoTo(out_of_range);
+ }
+}
+
+
+// even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
+// odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
+static void EmitUseLookupTable(
+ RegExpMacroAssembler* masm,
+ ZoneList<int>* ranges,
+ int start_index,
+ int end_index,
+ int min_char,
+ Label* fall_through,
+ Label* even_label,
+ Label* odd_label) {
+ static const int kSize = RegExpMacroAssembler::kTableSize;
+ static const int kMask = RegExpMacroAssembler::kTableMask;
+
+ int base = (min_char & ~kMask);
+ USE(base);
+
+ // Assert that everything is on one kTableSize page.
+ for (int i = start_index; i <= end_index; i++) {
+ ASSERT_EQ(ranges->at(i) & ~kMask, base);
+ }
+ ASSERT(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
+
+ char templ[kSize];
+ Label* on_bit_set;
+ Label* on_bit_clear;
+ int bit;
+ if (even_label == fall_through) {
+ on_bit_set = odd_label;
+ on_bit_clear = even_label;
+ bit = 1;
+ } else {
+ on_bit_set = even_label;
+ on_bit_clear = odd_label;
+ bit = 0;
+ }
+ for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
+ templ[i] = bit;
+ }
+ int j = 0;
+ bit ^= 1;
+ for (int i = start_index; i < end_index; i++) {
+ for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
+ templ[j] = bit;
+ }
+ bit ^= 1;
+ }
+ for (int i = j; i < kSize; i++) {
+ templ[i] = bit;
+ }
+ // TODO(erikcorry): Cache these.
+ Handle<ByteArray> ba = FACTORY->NewByteArray(kSize, TENURED);
+ for (int i = 0; i < kSize; i++) {
+ ba->set(i, templ[i]);
+ }
+ masm->CheckBitInTable(ba, on_bit_set);
+ if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
+}
+
+
+static void CutOutRange(RegExpMacroAssembler* masm,
+ ZoneList<int>* ranges,
+ int start_index,
+ int end_index,
+ int cut_index,
+ Label* even_label,
+ Label* odd_label) {
+ bool odd = (((cut_index - start_index) & 1) == 1);
+ Label* in_range_label = odd ? odd_label : even_label;
+ Label dummy;
+ EmitDoubleBoundaryTest(masm,
+ ranges->at(cut_index),
+ ranges->at(cut_index + 1) - 1,
+ &dummy,
+ in_range_label,
+ &dummy);
+ ASSERT(!dummy.is_linked());
+ // Cut out the single range by rewriting the array. This creates a new
+ // range that is a merger of the two ranges on either side of the one we
+ // are cutting out. The oddity of the labels is preserved.
+ for (int j = cut_index; j > start_index; j--) {
+ ranges->at(j) = ranges->at(j - 1);
+ }
+ for (int j = cut_index + 1; j < end_index; j++) {
+ ranges->at(j) = ranges->at(j + 1);
+ }
+}
+
+
+// Unicode case. Split the search space into kSize spaces that are handled
+// with recursion.
+static void SplitSearchSpace(ZoneList<int>* ranges,
+ int start_index,
+ int end_index,
+ int* new_start_index,
+ int* new_end_index,
+ int* border) {
+ static const int kSize = RegExpMacroAssembler::kTableSize;
+ static const int kMask = RegExpMacroAssembler::kTableMask;
+
+ int first = ranges->at(start_index);
+ int last = ranges->at(end_index) - 1;
+
+ *new_start_index = start_index;
+ *border = (ranges->at(start_index) & ~kMask) + kSize;
+ while (*new_start_index < end_index) {
+ if (ranges->at(*new_start_index) > *border) break;
+ (*new_start_index)++;
+ }
+ // new_start_index is the index of the first edge that is beyond the
+ // current kSize space.
+
+ // For very large search spaces we do a binary chop search of the non-ASCII
+ // space instead of just going to the end of the current kSize space. The
+ // heuristics are complicated a little by the fact that any 128-character
+ // encoding space can be quickly tested with a table lookup, so we don't
+ // wish to do binary chop search at a smaller granularity than that. A
+ // 128-character space can take up a lot of space in the ranges array if,
+ // for example, we only want to match every second character (eg. the lower
+ // case characters on some Unicode pages).
+ int binary_chop_index = (end_index + start_index) / 2;
+ // The first test ensures that we get to the code that handles the ASCII
+ // range with a single not-taken branch, speeding up this important
+ // character range (even non-ASCII charset-based text has spaces and
+ // punctuation).
+ if (*border - 1 > String::kMaxAsciiCharCode && // ASCII case.
+ end_index - start_index > (*new_start_index - start_index) * 2 &&
+ last - first > kSize * 2 &&
+ binary_chop_index > *new_start_index &&
+ ranges->at(binary_chop_index) >= first + 2 * kSize) {
+ int scan_forward_for_section_border = binary_chop_index;;
+ int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
+
+ while (scan_forward_for_section_border < end_index) {
+ if (ranges->at(scan_forward_for_section_border) > new_border) {
+ *new_start_index = scan_forward_for_section_border;
+ *border = new_border;
+ break;
+ }
+ scan_forward_for_section_border++;
+ }
+ }
+
+ ASSERT(*new_start_index > start_index);
+ *new_end_index = *new_start_index - 1;
+ if (ranges->at(*new_end_index) == *border) {
+ (*new_end_index)--;
+ }
+ if (*border >= ranges->at(end_index)) {
+ *border = ranges->at(end_index);
+ *new_start_index = end_index; // Won't be used.
+ *new_end_index = end_index - 1;
+ }
+}
+
+
+// Gets a series of segment boundaries representing a character class. If the
+// character is in the range between an even and an odd boundary (counting from
+// start_index) then go to even_label, otherwise go to odd_label. We already
+// know that the character is in the range of min_char to max_char inclusive.
+// Either label can be NULL indicating backtracking. Either label can also be
+// equal to the fall_through label.
+static void GenerateBranches(RegExpMacroAssembler* masm,
+ ZoneList<int>* ranges,
+ int start_index,
+ int end_index,
+ uc16 min_char,
+ uc16 max_char,
+ Label* fall_through,
+ Label* even_label,
+ Label* odd_label) {
+ int first = ranges->at(start_index);
+ int last = ranges->at(end_index) - 1;
+
+ // Just need to test if the character is before or on-or-after
+ // a particular character.
+ if (start_index == end_index) {
+ EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
+ return;
+ }
+
+ // Another almost trivial case: There is one interval in the middle that is
+ // different from the end intervals.
+ if (start_index + 1 == end_index) {
+ EmitDoubleBoundaryTest(
+ masm, first, last, fall_through, even_label, odd_label);
+ return;
+ }
+
+ // It's not worth using table lookup if there are very few intervals in the
+ // character class.
+ if (end_index - start_index <= 6) {
+ // It is faster to test for individual characters, so we look for those
+ // first, then try arbitrary ranges in the second round.
+ static int kNoCutIndex = -1;
+ int cut = kNoCutIndex;
+ for (int i = start_index; i < end_index; i++) {
+ if (ranges->at(i) == ranges->at(i + 1) - 1) {
+ cut = i;
+ break;
+ }
+ }
+ if (cut == kNoCutIndex) cut = start_index;
+ CutOutRange(
+ masm, ranges, start_index, end_index, cut, even_label, odd_label);
+ ASSERT_GE(end_index - start_index, 2);
+ GenerateBranches(masm,
+ ranges,
+ start_index + 1,
+ end_index - 1,
+ min_char,
+ max_char,
+ fall_through,
+ even_label,
+ odd_label);
+ return;
+ }
+
+ // If there are a lot of intervals in the regexp, then we will use tables to
+ // determine whether the character is inside or outside the character class.
+ static const int kBits = RegExpMacroAssembler::kTableSizeBits;
+
+ if ((max_char >> kBits) == (min_char >> kBits)) {
+ EmitUseLookupTable(masm,
+ ranges,
+ start_index,
+ end_index,
+ min_char,
+ fall_through,
+ even_label,
+ odd_label);
+ return;
+ }
+
+ if ((min_char >> kBits) != (first >> kBits)) {
+ masm->CheckCharacterLT(first, odd_label);
+ GenerateBranches(masm,
+ ranges,
+ start_index + 1,
+ end_index,
+ first,
+ max_char,
+ fall_through,
+ odd_label,
+ even_label);
+ return;
+ }
+
+ int new_start_index = 0;
+ int new_end_index = 0;
+ int border = 0;
+
+ SplitSearchSpace(ranges,
+ start_index,
+ end_index,
+ &new_start_index,
+ &new_end_index,
+ &border);
+
+ Label handle_rest;
+ Label* above = &handle_rest;
+ if (border == last + 1) {
+ // We didn't find any section that started after the limit, so everything
+ // above the border is one of the terminal labels.
+ above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
+ ASSERT(new_end_index == end_index - 1);
+ }
+
+ ASSERT_LT(start_index, new_start_index);
+ ASSERT_LT(new_end_index, end_index);
ulan 2012/03/29 16:19:17 Also: start_index <= new_end_index && new_start_in
Erik Corry 2012/03/30 07:46:28 Done.
+ ASSERT(new_end_index + 1 == new_start_index ||
+ (new_end_index + 2 == new_start_index &&
+ border == ranges->at(new_end_index + 1)));
+ ASSERT_LT(min_char, border - 1);
+ ASSERT_LT(border, max_char);
+ ASSERT_LT(ranges->at(new_end_index), border);
+ ASSERT(border < ranges->at(new_start_index) ||
+ (border == ranges->at(new_start_index) &&
+ new_start_index == end_index &&
+ new_end_index == end_index - 1 &&
+ border == last + 1));
+ ASSERT(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
+
+ masm->CheckCharacterGT(border - 1, above);
+ Label dummy;
+ GenerateBranches(masm,
+ ranges,
+ start_index,
+ new_end_index,
+ min_char,
+ border - 1,
+ &dummy,
+ even_label,
+ odd_label);
+ if (handle_rest.is_linked()) {
+ masm->Bind(&handle_rest);
+ bool flip = (new_start_index & 1) != (start_index & 1);
+ GenerateBranches(masm,
+ ranges,
+ new_start_index,
+ end_index,
+ border,
+ max_char,
+ &dummy,
+ flip ? odd_label : even_label,
+ flip ? even_label : odd_label);
+ }
+}
+
+
static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
RegExpCharacterClass* cc,
bool ascii,
@@ -1542,6 +1889,10 @@
bool check_offset,
bool preloaded) {
ZoneList<CharacterRange>* ranges = cc->ranges();
+ if (!CharacterRange::IsCanonical(ranges)) {
+ CharacterRange::Canonicalize(ranges);
+ }
+
int max_char;
if (ascii) {
max_char = String::kMaxAsciiCharCode;
@@ -1549,11 +1900,6 @@
max_char = String::kMaxUtf16CodeUnit;
}
- Label success;
-
- Label* char_is_in_class =
- cc->is_negated() ? on_failure : &success;
-
int range_count = ranges->length();
int last_valid_range = range_count - 1;
@@ -1567,8 +1913,6 @@
if (last_valid_range < 0) {
if (!cc->is_negated()) {
- // TODO(plesner): We can remove this when the node level does our
- // ASCII optimizations for us.
macro_assembler->GoTo(on_failure);
}
if (check_offset) {
@@ -1578,6 +1922,18 @@
}
if (last_valid_range == 0 &&
+ ranges->at(0).IsEverything(max_char)) {
+ if (cc->is_negated()) {
+ macro_assembler->GoTo(on_failure);
+ } else {
+ // This is a common case hit by non-anchored expressions.
+ if (check_offset) {
+ macro_assembler->CheckPosition(cp_offset, on_failure);
+ }
+ }
+ return;
+ }
+ if (last_valid_range == 0 &&
!cc->is_negated() &&
ranges->at(0).IsEverything(max_char)) {
// This is a common case hit by non-anchored expressions.
@@ -1597,64 +1953,43 @@
return;
}
- for (int i = 0; i < last_valid_range; i++) {
- CharacterRange& range = ranges->at(i);
- Label next_range;
- uc16 from = range.from();
- uc16 to = range.to();
- if (from > max_char) {
- continue;
- }
- if (to > max_char) to = max_char;
- if (to == from) {
- macro_assembler->CheckCharacter(to, char_is_in_class);
- } else {
- if (from != 0) {
- macro_assembler->CheckCharacterLT(from, &next_range);
- }
- if (to != max_char) {
- macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
- } else {
- macro_assembler->GoTo(char_is_in_class);
- }
- }
- macro_assembler->Bind(&next_range);
- }
- CharacterRange& range = ranges->at(last_valid_range);
- uc16 from = range.from();
- uc16 to = range.to();
+ // A new list with ascending entries. Each entry is a code unit
+ // where there is a boundary between code units that are part of
+ // the class and code units that are not. Normally we insert an
+ // entry at zero which goes to the failure label, but if there
+ // was already one there we fall through for success on that entry.
+ // Subsequent entries have alternating meaning (success/failure).
+ ZoneList<int>* range_boundaries = new ZoneList<int>(last_valid_range);
- if (to > max_char) to = max_char;
- ASSERT(to >= from);
+ bool zeroth_entry_is_failure = !cc->is_negated();
- if (to == from) {
- if (cc->is_negated()) {
- macro_assembler->CheckCharacter(to, on_failure);
+ for (int i = 0; i <= last_valid_range; i++) {
+ CharacterRange& range = ranges->at(i);
+ if (range.from() == 0) {
ulan 2012/03/29 16:19:17 I think we don't need to check for range[i] == 0.
Erik Corry 2012/03/30 07:46:28 There is an explicit assumption in GenerateBranche
+ ASSERT_EQ(i, 0);
+ zeroth_entry_is_failure = !zeroth_entry_is_failure;
} else {
- macro_assembler->CheckNotCharacter(to, on_failure);
+ range_boundaries->Add(range.from());
}
- } else {
- if (from != 0) {
- if (cc->is_negated()) {
- macro_assembler->CheckCharacterLT(from, &success);
- } else {
- macro_assembler->CheckCharacterLT(from, on_failure);
- }
- }
- if (to != String::kMaxUtf16CodeUnit) {
- if (cc->is_negated()) {
- macro_assembler->CheckCharacterLT(to + 1, on_failure);
- } else {
- macro_assembler->CheckCharacterGT(to, on_failure);
- }
- } else {
- if (cc->is_negated()) {
- macro_assembler->GoTo(on_failure);
- }
- }
+ range_boundaries->Add(range.to() + 1);
}
- macro_assembler->Bind(&success);
+ int end_index = range_boundaries->length() - 1;
+ if (range_boundaries->at(end_index) > max_char) {
+ end_index--;
+ }
+
+ Label fall_through;
+ GenerateBranches(macro_assembler,
+ range_boundaries,
+ 0, // start_index.
+ end_index,
+ 0, // min_char.
+ max_char,
+ &fall_through,
+ zeroth_entry_is_failure ? &fall_through : on_failure,
+ zeroth_entry_is_failure ? on_failure : &fall_through);
+ macro_assembler->Bind(&fall_through);
}

Powered by Google App Engine