Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(164)

Side by Side Diff: src/jsregexp.cc

Issue 9854020: RegExp: Add support for table-based character class (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 8 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright 2011 the V8 project authors. All rights reserved. 1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 1516 matching lines...) Expand 10 before | Expand all | Expand 10 after
1527 macro_assembler->Bind(&ok); 1527 macro_assembler->Bind(&ok);
1528 break; 1528 break;
1529 default: 1529 default:
1530 UNREACHABLE(); 1530 UNREACHABLE();
1531 break; 1531 break;
1532 } 1532 }
1533 return true; 1533 return true;
1534 } 1534 }
1535 1535
1536 1536
1537 static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1538 int border,
1539 Label* fall_through,
1540 Label* above_or_equal,
1541 Label* below) {
1542 if (below != fall_through) {
1543 masm->CheckCharacterLT(border, below);
1544 if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1545 } else {
1546 masm->CheckCharacterGT(border - 1, above_or_equal);
1547 }
1548 }
1549
1550
1551 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1552 int first,
1553 int last,
1554 Label* fall_through,
1555 Label* in_range,
1556 Label* out_of_range) {
1557 if (in_range == fall_through) {
1558 if (first == last) {
1559 masm->CheckNotCharacter(first, out_of_range);
1560 } else {
1561 masm->CheckCharacterNotInRange(first, last, out_of_range);
1562 }
1563 } else {
1564 if (first == last) {
1565 masm->CheckCharacter(first, in_range);
1566 } else {
1567 masm->CheckCharacterInRange(first, last, in_range);
1568 }
1569 if (out_of_range != fall_through) masm->GoTo(out_of_range);
1570 }
1571 }
1572
1573
1574 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1575 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
1576 static void EmitUseLookupTable(
1577 RegExpMacroAssembler* masm,
1578 ZoneList<int>* ranges,
1579 int start_index,
1580 int end_index,
1581 int min_char,
1582 Label* fall_through,
1583 Label* even_label,
1584 Label* odd_label) {
1585 static const int kSize = RegExpMacroAssembler::kTableSize;
1586 static const int kMask = RegExpMacroAssembler::kTableMask;
1587
1588 int base = (min_char & ~kMask);
1589 USE(base);
1590
1591 // Assert that everything is on one kTableSize page.
1592 for (int i = start_index; i <= end_index; i++) {
1593 ASSERT_EQ(ranges->at(i) & ~kMask, base);
1594 }
1595 ASSERT(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
1596
1597 char templ[kSize];
1598 Label* on_bit_set;
1599 Label* on_bit_clear;
1600 int bit;
1601 if (even_label == fall_through) {
1602 on_bit_set = odd_label;
1603 on_bit_clear = even_label;
1604 bit = 1;
1605 } else {
1606 on_bit_set = even_label;
1607 on_bit_clear = odd_label;
1608 bit = 0;
1609 }
1610 for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
1611 templ[i] = bit;
1612 }
1613 int j = 0;
1614 bit ^= 1;
1615 for (int i = start_index; i < end_index; i++) {
1616 for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1617 templ[j] = bit;
1618 }
1619 bit ^= 1;
1620 }
1621 for (int i = j; i < kSize; i++) {
1622 templ[i] = bit;
1623 }
1624 // TODO(erikcorry): Cache these.
1625 Handle<ByteArray> ba = FACTORY->NewByteArray(kSize, TENURED);
1626 for (int i = 0; i < kSize; i++) {
1627 ba->set(i, templ[i]);
1628 }
1629 masm->CheckBitInTable(ba, on_bit_set);
1630 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1631 }
1632
1633
1634 static void CutOutRange(RegExpMacroAssembler* masm,
1635 ZoneList<int>* ranges,
1636 int start_index,
1637 int end_index,
1638 int cut_index,
1639 Label* even_label,
1640 Label* odd_label) {
1641 bool odd = (((cut_index - start_index) & 1) == 1);
1642 Label* in_range_label = odd ? odd_label : even_label;
1643 Label dummy;
1644 EmitDoubleBoundaryTest(masm,
1645 ranges->at(cut_index),
1646 ranges->at(cut_index + 1) - 1,
1647 &dummy,
1648 in_range_label,
1649 &dummy);
1650 ASSERT(!dummy.is_linked());
1651 // Cut out the single range by rewriting the array. This creates a new
1652 // range that is a merger of the two ranges on either side of the one we
1653 // are cutting out. The oddity of the labels is preserved.
1654 for (int j = cut_index; j > start_index; j--) {
1655 ranges->at(j) = ranges->at(j - 1);
1656 }
1657 for (int j = cut_index + 1; j < end_index; j++) {
1658 ranges->at(j) = ranges->at(j + 1);
1659 }
1660 }
1661
1662
1663 // Unicode case. Split the search space into kSize spaces that are handled
1664 // with recursion.
1665 static void SplitSearchSpace(ZoneList<int>* ranges,
1666 int start_index,
1667 int end_index,
1668 int* new_start_index,
1669 int* new_end_index,
1670 int* border) {
1671 static const int kSize = RegExpMacroAssembler::kTableSize;
1672 static const int kMask = RegExpMacroAssembler::kTableMask;
1673
1674 int first = ranges->at(start_index);
1675 int last = ranges->at(end_index) - 1;
1676
1677 *new_start_index = start_index;
1678 *border = (ranges->at(start_index) & ~kMask) + kSize;
1679 while (*new_start_index < end_index) {
1680 if (ranges->at(*new_start_index) > *border) break;
1681 (*new_start_index)++;
1682 }
1683 // new_start_index is the index of the first edge that is beyond the
1684 // current kSize space.
1685
1686 // For very large search spaces we do a binary chop search of the non-ASCII
1687 // space instead of just going to the end of the current kSize space. The
1688 // heuristics are complicated a little by the fact that any 128-character
1689 // encoding space can be quickly tested with a table lookup, so we don't
1690 // wish to do binary chop search at a smaller granularity than that. A
1691 // 128-character space can take up a lot of space in the ranges array if,
1692 // for example, we only want to match every second character (eg. the lower
1693 // case characters on some Unicode pages).
1694 int binary_chop_index = (end_index + start_index) / 2;
1695 // The first test ensures that we get to the code that handles the ASCII
1696 // range with a single not-taken branch, speeding up this important
1697 // character range (even non-ASCII charset-based text has spaces and
1698 // punctuation).
1699 if (*border - 1 > String::kMaxAsciiCharCode && // ASCII case.
1700 end_index - start_index > (*new_start_index - start_index) * 2 &&
1701 last - first > kSize * 2 &&
1702 binary_chop_index > *new_start_index &&
1703 ranges->at(binary_chop_index) >= first + 2 * kSize) {
1704 int scan_forward_for_section_border = binary_chop_index;;
1705 int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1706
1707 while (scan_forward_for_section_border < end_index) {
1708 if (ranges->at(scan_forward_for_section_border) > new_border) {
1709 *new_start_index = scan_forward_for_section_border;
1710 *border = new_border;
1711 break;
1712 }
1713 scan_forward_for_section_border++;
1714 }
1715 }
1716
1717 ASSERT(*new_start_index > start_index);
1718 *new_end_index = *new_start_index - 1;
1719 if (ranges->at(*new_end_index) == *border) {
1720 (*new_end_index)--;
1721 }
1722 if (*border >= ranges->at(end_index)) {
1723 *border = ranges->at(end_index);
1724 *new_start_index = end_index; // Won't be used.
1725 *new_end_index = end_index - 1;
1726 }
1727 }
1728
1729
1730 // Gets a series of segment boundaries representing a character class. If the
1731 // character is in the range between an even and an odd boundary (counting from
1732 // start_index) then go to even_label, otherwise go to odd_label. We already
1733 // know that the character is in the range of min_char to max_char inclusive.
1734 // Either label can be NULL indicating backtracking. Either label can also be
1735 // equal to the fall_through label.
1736 static void GenerateBranches(RegExpMacroAssembler* masm,
1737 ZoneList<int>* ranges,
1738 int start_index,
1739 int end_index,
1740 uc16 min_char,
1741 uc16 max_char,
1742 Label* fall_through,
1743 Label* even_label,
1744 Label* odd_label) {
1745 int first = ranges->at(start_index);
1746 int last = ranges->at(end_index) - 1;
1747
1748 // Just need to test if the character is before or on-or-after
1749 // a particular character.
1750 if (start_index == end_index) {
1751 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1752 return;
1753 }
1754
1755 // Another almost trivial case: There is one interval in the middle that is
1756 // different from the end intervals.
1757 if (start_index + 1 == end_index) {
1758 EmitDoubleBoundaryTest(
1759 masm, first, last, fall_through, even_label, odd_label);
1760 return;
1761 }
1762
1763 // It's not worth using table lookup if there are very few intervals in the
1764 // character class.
1765 if (end_index - start_index <= 6) {
1766 // It is faster to test for individual characters, so we look for those
1767 // first, then try arbitrary ranges in the second round.
1768 static int kNoCutIndex = -1;
1769 int cut = kNoCutIndex;
1770 for (int i = start_index; i < end_index; i++) {
1771 if (ranges->at(i) == ranges->at(i + 1) - 1) {
1772 cut = i;
1773 break;
1774 }
1775 }
1776 if (cut == kNoCutIndex) cut = start_index;
1777 CutOutRange(
1778 masm, ranges, start_index, end_index, cut, even_label, odd_label);
1779 ASSERT_GE(end_index - start_index, 2);
1780 GenerateBranches(masm,
1781 ranges,
1782 start_index + 1,
1783 end_index - 1,
1784 min_char,
1785 max_char,
1786 fall_through,
1787 even_label,
1788 odd_label);
1789 return;
1790 }
1791
1792 // If there are a lot of intervals in the regexp, then we will use tables to
1793 // determine whether the character is inside or outside the character class.
1794 static const int kBits = RegExpMacroAssembler::kTableSizeBits;
1795
1796 if ((max_char >> kBits) == (min_char >> kBits)) {
1797 EmitUseLookupTable(masm,
1798 ranges,
1799 start_index,
1800 end_index,
1801 min_char,
1802 fall_through,
1803 even_label,
1804 odd_label);
1805 return;
1806 }
1807
1808 if ((min_char >> kBits) != (first >> kBits)) {
1809 masm->CheckCharacterLT(first, odd_label);
1810 GenerateBranches(masm,
1811 ranges,
1812 start_index + 1,
1813 end_index,
1814 first,
1815 max_char,
1816 fall_through,
1817 odd_label,
1818 even_label);
1819 return;
1820 }
1821
1822 int new_start_index = 0;
1823 int new_end_index = 0;
1824 int border = 0;
1825
1826 SplitSearchSpace(ranges,
1827 start_index,
1828 end_index,
1829 &new_start_index,
1830 &new_end_index,
1831 &border);
1832
1833 Label handle_rest;
1834 Label* above = &handle_rest;
1835 if (border == last + 1) {
1836 // We didn't find any section that started after the limit, so everything
1837 // above the border is one of the terminal labels.
1838 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
1839 ASSERT(new_end_index == end_index - 1);
1840 }
1841
1842 ASSERT_LT(start_index, new_start_index);
1843 ASSERT_LT(new_end_index, end_index);
ulan 2012/03/29 16:19:17 Also: start_index <= new_end_index && new_start_in
Erik Corry 2012/03/30 07:46:28 Done.
1844 ASSERT(new_end_index + 1 == new_start_index ||
1845 (new_end_index + 2 == new_start_index &&
1846 border == ranges->at(new_end_index + 1)));
1847 ASSERT_LT(min_char, border - 1);
1848 ASSERT_LT(border, max_char);
1849 ASSERT_LT(ranges->at(new_end_index), border);
1850 ASSERT(border < ranges->at(new_start_index) ||
1851 (border == ranges->at(new_start_index) &&
1852 new_start_index == end_index &&
1853 new_end_index == end_index - 1 &&
1854 border == last + 1));
1855 ASSERT(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
1856
1857 masm->CheckCharacterGT(border - 1, above);
1858 Label dummy;
1859 GenerateBranches(masm,
1860 ranges,
1861 start_index,
1862 new_end_index,
1863 min_char,
1864 border - 1,
1865 &dummy,
1866 even_label,
1867 odd_label);
1868 if (handle_rest.is_linked()) {
1869 masm->Bind(&handle_rest);
1870 bool flip = (new_start_index & 1) != (start_index & 1);
1871 GenerateBranches(masm,
1872 ranges,
1873 new_start_index,
1874 end_index,
1875 border,
1876 max_char,
1877 &dummy,
1878 flip ? odd_label : even_label,
1879 flip ? even_label : odd_label);
1880 }
1881 }
1882
1883
1537 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, 1884 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1538 RegExpCharacterClass* cc, 1885 RegExpCharacterClass* cc,
1539 bool ascii, 1886 bool ascii,
1540 Label* on_failure, 1887 Label* on_failure,
1541 int cp_offset, 1888 int cp_offset,
1542 bool check_offset, 1889 bool check_offset,
1543 bool preloaded) { 1890 bool preloaded) {
1544 ZoneList<CharacterRange>* ranges = cc->ranges(); 1891 ZoneList<CharacterRange>* ranges = cc->ranges();
1892 if (!CharacterRange::IsCanonical(ranges)) {
1893 CharacterRange::Canonicalize(ranges);
1894 }
1895
1545 int max_char; 1896 int max_char;
1546 if (ascii) { 1897 if (ascii) {
1547 max_char = String::kMaxAsciiCharCode; 1898 max_char = String::kMaxAsciiCharCode;
1548 } else { 1899 } else {
1549 max_char = String::kMaxUtf16CodeUnit; 1900 max_char = String::kMaxUtf16CodeUnit;
1550 } 1901 }
1551 1902
1552 Label success;
1553
1554 Label* char_is_in_class =
1555 cc->is_negated() ? on_failure : &success;
1556
1557 int range_count = ranges->length(); 1903 int range_count = ranges->length();
1558 1904
1559 int last_valid_range = range_count - 1; 1905 int last_valid_range = range_count - 1;
1560 while (last_valid_range >= 0) { 1906 while (last_valid_range >= 0) {
1561 CharacterRange& range = ranges->at(last_valid_range); 1907 CharacterRange& range = ranges->at(last_valid_range);
1562 if (range.from() <= max_char) { 1908 if (range.from() <= max_char) {
1563 break; 1909 break;
1564 } 1910 }
1565 last_valid_range--; 1911 last_valid_range--;
1566 } 1912 }
1567 1913
1568 if (last_valid_range < 0) { 1914 if (last_valid_range < 0) {
1569 if (!cc->is_negated()) { 1915 if (!cc->is_negated()) {
1570 // TODO(plesner): We can remove this when the node level does our
1571 // ASCII optimizations for us.
1572 macro_assembler->GoTo(on_failure); 1916 macro_assembler->GoTo(on_failure);
1573 } 1917 }
1574 if (check_offset) { 1918 if (check_offset) {
1575 macro_assembler->CheckPosition(cp_offset, on_failure); 1919 macro_assembler->CheckPosition(cp_offset, on_failure);
1576 } 1920 }
1577 return; 1921 return;
1578 } 1922 }
1579 1923
1580 if (last_valid_range == 0 && 1924 if (last_valid_range == 0 &&
1925 ranges->at(0).IsEverything(max_char)) {
1926 if (cc->is_negated()) {
1927 macro_assembler->GoTo(on_failure);
1928 } else {
1929 // This is a common case hit by non-anchored expressions.
1930 if (check_offset) {
1931 macro_assembler->CheckPosition(cp_offset, on_failure);
1932 }
1933 }
1934 return;
1935 }
1936 if (last_valid_range == 0 &&
1581 !cc->is_negated() && 1937 !cc->is_negated() &&
1582 ranges->at(0).IsEverything(max_char)) { 1938 ranges->at(0).IsEverything(max_char)) {
1583 // This is a common case hit by non-anchored expressions. 1939 // This is a common case hit by non-anchored expressions.
1584 if (check_offset) { 1940 if (check_offset) {
1585 macro_assembler->CheckPosition(cp_offset, on_failure); 1941 macro_assembler->CheckPosition(cp_offset, on_failure);
1586 } 1942 }
1587 return; 1943 return;
1588 } 1944 }
1589 1945
1590 if (!preloaded) { 1946 if (!preloaded) {
1591 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset); 1947 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
1592 } 1948 }
1593 1949
1594 if (cc->is_standard() && 1950 if (cc->is_standard() &&
1595 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(), 1951 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
1596 on_failure)) { 1952 on_failure)) {
1597 return; 1953 return;
1598 } 1954 }
1599 1955
1600 for (int i = 0; i < last_valid_range; i++) { 1956
1957 // A new list with ascending entries. Each entry is a code unit
1958 // where there is a boundary between code units that are part of
1959 // the class and code units that are not. Normally we insert an
1960 // entry at zero which goes to the failure label, but if there
1961 // was already one there we fall through for success on that entry.
1962 // Subsequent entries have alternating meaning (success/failure).
1963 ZoneList<int>* range_boundaries = new ZoneList<int>(last_valid_range);
1964
1965 bool zeroth_entry_is_failure = !cc->is_negated();
1966
1967 for (int i = 0; i <= last_valid_range; i++) {
1601 CharacterRange& range = ranges->at(i); 1968 CharacterRange& range = ranges->at(i);
1602 Label next_range; 1969 if (range.from() == 0) {
ulan 2012/03/29 16:19:17 I think we don't need to check for range[i] == 0.
Erik Corry 2012/03/30 07:46:28 There is an explicit assumption in GenerateBranche
1603 uc16 from = range.from(); 1970 ASSERT_EQ(i, 0);
1604 uc16 to = range.to(); 1971 zeroth_entry_is_failure = !zeroth_entry_is_failure;
1605 if (from > max_char) { 1972 } else {
1606 continue; 1973 range_boundaries->Add(range.from());
1607 } 1974 }
1608 if (to > max_char) to = max_char; 1975 range_boundaries->Add(range.to() + 1);
1609 if (to == from) { 1976 }
1610 macro_assembler->CheckCharacter(to, char_is_in_class); 1977 int end_index = range_boundaries->length() - 1;
1611 } else { 1978 if (range_boundaries->at(end_index) > max_char) {
1612 if (from != 0) { 1979 end_index--;
1613 macro_assembler->CheckCharacterLT(from, &next_range);
1614 }
1615 if (to != max_char) {
1616 macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
1617 } else {
1618 macro_assembler->GoTo(char_is_in_class);
1619 }
1620 }
1621 macro_assembler->Bind(&next_range);
1622 } 1980 }
1623 1981
1624 CharacterRange& range = ranges->at(last_valid_range); 1982 Label fall_through;
1625 uc16 from = range.from(); 1983 GenerateBranches(macro_assembler,
1626 uc16 to = range.to(); 1984 range_boundaries,
1627 1985 0, // start_index.
1628 if (to > max_char) to = max_char; 1986 end_index,
1629 ASSERT(to >= from); 1987 0, // min_char.
1630 1988 max_char,
1631 if (to == from) { 1989 &fall_through,
1632 if (cc->is_negated()) { 1990 zeroth_entry_is_failure ? &fall_through : on_failure,
1633 macro_assembler->CheckCharacter(to, on_failure); 1991 zeroth_entry_is_failure ? on_failure : &fall_through);
1634 } else { 1992 macro_assembler->Bind(&fall_through);
1635 macro_assembler->CheckNotCharacter(to, on_failure);
1636 }
1637 } else {
1638 if (from != 0) {
1639 if (cc->is_negated()) {
1640 macro_assembler->CheckCharacterLT(from, &success);
1641 } else {
1642 macro_assembler->CheckCharacterLT(from, on_failure);
1643 }
1644 }
1645 if (to != String::kMaxUtf16CodeUnit) {
1646 if (cc->is_negated()) {
1647 macro_assembler->CheckCharacterLT(to + 1, on_failure);
1648 } else {
1649 macro_assembler->CheckCharacterGT(to, on_failure);
1650 }
1651 } else {
1652 if (cc->is_negated()) {
1653 macro_assembler->GoTo(on_failure);
1654 }
1655 }
1656 }
1657 macro_assembler->Bind(&success);
1658 } 1993 }
1659 1994
1660 1995
1661 RegExpNode::~RegExpNode() { 1996 RegExpNode::~RegExpNode() {
1662 } 1997 }
1663 1998
1664 1999
1665 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler, 2000 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
1666 Trace* trace) { 2001 Trace* trace) {
1667 // If we are generating a greedy loop then don't stop and don't reuse code. 2002 // If we are generating a greedy loop then don't stop and don't reuse code.
(...skipping 3666 matching lines...) Expand 10 before | Expand all | Expand 10 after
5334 } 5669 }
5335 5670
5336 return compiler.Assemble(&macro_assembler, 5671 return compiler.Assemble(&macro_assembler,
5337 node, 5672 node,
5338 data->capture_count, 5673 data->capture_count,
5339 pattern); 5674 pattern);
5340 } 5675 }
5341 5676
5342 5677
5343 }} // namespace v8::internal 5678 }} // namespace v8::internal
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698