Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(30)

Side by Side Diff: src/jsregexp.cc

Issue 11759008: Introduce ENABLE_LATIN_1 compile flag (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge
Patch Set: Fix FilterASCII Created 7 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « src/jsregexp.h ('k') | src/log.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 1663 matching lines...) Expand 10 before | Expand all | Expand 10 after
1674 bool ascii_subject, 1674 bool ascii_subject,
1675 unibrow::uchar* letters) { 1675 unibrow::uchar* letters) {
1676 int length = 1676 int length =
1677 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters); 1677 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
1678 // Unibrow returns 0 or 1 for characters where case independence is 1678 // Unibrow returns 0 or 1 for characters where case independence is
1679 // trivial. 1679 // trivial.
1680 if (length == 0) { 1680 if (length == 0) {
1681 letters[0] = character; 1681 letters[0] = character;
1682 length = 1; 1682 length = 1;
1683 } 1683 }
1684 if (!ascii_subject || character <= String::kMaxAsciiCharCode) { 1684 if (!ascii_subject || character <= String::kMaxOneByteCharCode) {
1685 return length; 1685 return length;
1686 } 1686 }
1687 // The standard requires that non-ASCII characters cannot have ASCII 1687 // The standard requires that non-ASCII characters cannot have ASCII
1688 // character codes in their equivalence class. 1688 // character codes in their equivalence class.
1689 return 0; 1689 return 0;
1690 } 1690 }
1691 1691
1692 1692
1693 static inline bool EmitSimpleCharacter(Isolate* isolate, 1693 static inline bool EmitSimpleCharacter(Isolate* isolate,
1694 RegExpCompiler* compiler, 1694 RegExpCompiler* compiler,
(...skipping 30 matching lines...) Expand all
1725 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1725 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1726 int length = GetCaseIndependentLetters(isolate, c, ascii, chars); 1726 int length = GetCaseIndependentLetters(isolate, c, ascii, chars);
1727 if (length < 1) { 1727 if (length < 1) {
1728 // This can't match. Must be an ASCII subject and a non-ASCII character. 1728 // This can't match. Must be an ASCII subject and a non-ASCII character.
1729 // We do not need to do anything since the ASCII pass already handled this. 1729 // We do not need to do anything since the ASCII pass already handled this.
1730 return false; // Bounds not checked. 1730 return false; // Bounds not checked.
1731 } 1731 }
1732 bool checked = false; 1732 bool checked = false;
1733 // We handle the length > 1 case in a later pass. 1733 // We handle the length > 1 case in a later pass.
1734 if (length == 1) { 1734 if (length == 1) {
1735 if (ascii && c > String::kMaxAsciiCharCodeU) { 1735 if (ascii && c > String::kMaxOneByteCharCodeU) {
1736 // Can't match - see above. 1736 // Can't match - see above.
1737 return false; // Bounds not checked. 1737 return false; // Bounds not checked.
1738 } 1738 }
1739 if (!preloaded) { 1739 if (!preloaded) {
1740 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1740 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1741 checked = check; 1741 checked = check;
1742 } 1742 }
1743 macro_assembler->CheckNotCharacter(c, on_failure); 1743 macro_assembler->CheckNotCharacter(c, on_failure);
1744 } 1744 }
1745 return checked; 1745 return checked;
1746 } 1746 }
1747 1747
1748 1748
1749 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, 1749 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1750 bool ascii, 1750 bool ascii,
1751 uc16 c1, 1751 uc16 c1,
1752 uc16 c2, 1752 uc16 c2,
1753 Label* on_failure) { 1753 Label* on_failure) {
1754 uc16 char_mask; 1754 uc16 char_mask;
1755 if (ascii) { 1755 if (ascii) {
1756 char_mask = String::kMaxAsciiCharCode; 1756 char_mask = String::kMaxOneByteCharCode;
1757 } else { 1757 } else {
1758 char_mask = String::kMaxUtf16CodeUnit; 1758 char_mask = String::kMaxUtf16CodeUnit;
1759 } 1759 }
1760 uc16 exor = c1 ^ c2; 1760 uc16 exor = c1 ^ c2;
1761 // Check whether exor has only one bit set. 1761 // Check whether exor has only one bit set.
1762 if (((exor - 1) & exor) == 0) { 1762 if (((exor - 1) & exor) == 0) {
1763 // If c1 and c2 differ only by one bit. 1763 // If c1 and c2 differ only by one bit.
1764 // Ecma262UnCanonicalize always gives the highest number last. 1764 // Ecma262UnCanonicalize always gives the highest number last.
1765 ASSERT(c2 > c1); 1765 ASSERT(c2 > c1);
1766 uc16 mask = char_mask ^ exor; 1766 uc16 mask = char_mask ^ exor;
(...skipping 233 matching lines...) Expand 10 before | Expand all | Expand 10 after
2000 // encoding space can be quickly tested with a table lookup, so we don't 2000 // encoding space can be quickly tested with a table lookup, so we don't
2001 // wish to do binary chop search at a smaller granularity than that. A 2001 // wish to do binary chop search at a smaller granularity than that. A
2002 // 128-character space can take up a lot of space in the ranges array if, 2002 // 128-character space can take up a lot of space in the ranges array if,
2003 // for example, we only want to match every second character (eg. the lower 2003 // for example, we only want to match every second character (eg. the lower
2004 // case characters on some Unicode pages). 2004 // case characters on some Unicode pages).
2005 int binary_chop_index = (end_index + start_index) / 2; 2005 int binary_chop_index = (end_index + start_index) / 2;
2006 // The first test ensures that we get to the code that handles the ASCII 2006 // The first test ensures that we get to the code that handles the ASCII
2007 // range with a single not-taken branch, speeding up this important 2007 // range with a single not-taken branch, speeding up this important
2008 // character range (even non-ASCII charset-based text has spaces and 2008 // character range (even non-ASCII charset-based text has spaces and
2009 // punctuation). 2009 // punctuation).
2010 if (*border - 1 > String::kMaxAsciiCharCode && // ASCII case. 2010 if (*border - 1 > String::kMaxOneByteCharCode && // ASCII case.
2011 end_index - start_index > (*new_start_index - start_index) * 2 && 2011 end_index - start_index > (*new_start_index - start_index) * 2 &&
2012 last - first > kSize * 2 && 2012 last - first > kSize * 2 &&
2013 binary_chop_index > *new_start_index && 2013 binary_chop_index > *new_start_index &&
2014 ranges->at(binary_chop_index) >= first + 2 * kSize) { 2014 ranges->at(binary_chop_index) >= first + 2 * kSize) {
2015 int scan_forward_for_section_border = binary_chop_index;; 2015 int scan_forward_for_section_border = binary_chop_index;;
2016 int new_border = (ranges->at(binary_chop_index) | kMask) + 1; 2016 int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
2017 2017
2018 while (scan_forward_for_section_border < end_index) { 2018 while (scan_forward_for_section_border < end_index) {
2019 if (ranges->at(scan_forward_for_section_border) > new_border) { 2019 if (ranges->at(scan_forward_for_section_border) > new_border) {
2020 *new_start_index = scan_forward_for_section_border; 2020 *new_start_index = scan_forward_for_section_border;
(...skipping 183 matching lines...) Expand 10 before | Expand all | Expand 10 after
2204 bool check_offset, 2204 bool check_offset,
2205 bool preloaded, 2205 bool preloaded,
2206 Zone* zone) { 2206 Zone* zone) {
2207 ZoneList<CharacterRange>* ranges = cc->ranges(zone); 2207 ZoneList<CharacterRange>* ranges = cc->ranges(zone);
2208 if (!CharacterRange::IsCanonical(ranges)) { 2208 if (!CharacterRange::IsCanonical(ranges)) {
2209 CharacterRange::Canonicalize(ranges); 2209 CharacterRange::Canonicalize(ranges);
2210 } 2210 }
2211 2211
2212 int max_char; 2212 int max_char;
2213 if (ascii) { 2213 if (ascii) {
2214 max_char = String::kMaxAsciiCharCode; 2214 max_char = String::kMaxOneByteCharCode;
2215 } else { 2215 } else {
2216 max_char = String::kMaxUtf16CodeUnit; 2216 max_char = String::kMaxUtf16CodeUnit;
2217 } 2217 }
2218 2218
2219 int range_count = ranges->length(); 2219 int range_count = ranges->length();
2220 2220
2221 int last_valid_range = range_count - 1; 2221 int last_valid_range = range_count - 1;
2222 while (last_valid_range >= 0) { 2222 while (last_valid_range >= 0) {
2223 CharacterRange& range = ranges->at(last_valid_range); 2223 CharacterRange& range = ranges->at(last_valid_range);
2224 if (range.from() <= max_char) { 2224 if (range.from() <= max_char) {
(...skipping 281 matching lines...) Expand 10 before | Expand all | Expand 10 after
2506 v |= v >> 8; 2506 v |= v >> 8;
2507 v |= v >> 16; 2507 v |= v >> 16;
2508 return v; 2508 return v;
2509 } 2509 }
2510 2510
2511 2511
2512 bool QuickCheckDetails::Rationalize(bool asc) { 2512 bool QuickCheckDetails::Rationalize(bool asc) {
2513 bool found_useful_op = false; 2513 bool found_useful_op = false;
2514 uint32_t char_mask; 2514 uint32_t char_mask;
2515 if (asc) { 2515 if (asc) {
2516 char_mask = String::kMaxAsciiCharCode; 2516 char_mask = String::kMaxOneByteCharCode;
2517 } else { 2517 } else {
2518 char_mask = String::kMaxUtf16CodeUnit; 2518 char_mask = String::kMaxUtf16CodeUnit;
2519 } 2519 }
2520 mask_ = 0; 2520 mask_ = 0;
2521 value_ = 0; 2521 value_ = 0;
2522 int char_shift = 0; 2522 int char_shift = 0;
2523 for (int i = 0; i < characters_; i++) { 2523 for (int i = 0; i < characters_; i++) {
2524 Position* pos = &positions_[i]; 2524 Position* pos = &positions_[i];
2525 if ((pos->mask & String::kMaxAsciiCharCode) != 0) { 2525 if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
2526 found_useful_op = true; 2526 found_useful_op = true;
2527 } 2527 }
2528 mask_ |= (pos->mask & char_mask) << char_shift; 2528 mask_ |= (pos->mask & char_mask) << char_shift;
2529 value_ |= (pos->value & char_mask) << char_shift; 2529 value_ |= (pos->value & char_mask) << char_shift;
2530 char_shift += asc ? 8 : 16; 2530 char_shift += asc ? 8 : 16;
2531 } 2531 }
2532 return found_useful_op; 2532 return found_useful_op;
2533 } 2533 }
2534 2534
2535 2535
(...skipping 22 matching lines...) Expand all
2558 } 2558 }
2559 2559
2560 2560
2561 bool need_mask = true; 2561 bool need_mask = true;
2562 2562
2563 if (details->characters() == 1) { 2563 if (details->characters() == 1) {
2564 // If number of characters preloaded is 1 then we used a byte or 16 bit 2564 // If number of characters preloaded is 1 then we used a byte or 16 bit
2565 // load so the value is already masked down. 2565 // load so the value is already masked down.
2566 uint32_t char_mask; 2566 uint32_t char_mask;
2567 if (compiler->ascii()) { 2567 if (compiler->ascii()) {
2568 char_mask = String::kMaxAsciiCharCode; 2568 char_mask = String::kMaxOneByteCharCode;
2569 } else { 2569 } else {
2570 char_mask = String::kMaxUtf16CodeUnit; 2570 char_mask = String::kMaxUtf16CodeUnit;
2571 } 2571 }
2572 if ((mask & char_mask) == char_mask) need_mask = false; 2572 if ((mask & char_mask) == char_mask) need_mask = false;
2573 mask &= char_mask; 2573 mask &= char_mask;
2574 } else { 2574 } else {
2575 // For 2-character preloads in ASCII mode or 1-character preloads in 2575 // For 2-character preloads in ASCII mode or 1-character preloads in
2576 // TWO_BYTE mode we also use a 16 bit load with zero extend. 2576 // TWO_BYTE mode we also use a 16 bit load with zero extend.
2577 if (details->characters() == 2 && compiler->ascii()) { 2577 if (details->characters() == 2 && compiler->ascii()) {
2578 if ((mask & 0x7f7f) == 0x7f7f) need_mask = false; 2578 #ifndef ENABLE_LATIN_1
2579 if ((mask & 0x7f7f) == 0xffff) need_mask = false;
2580 #else
2581 if ((mask & 0xffff) == 0xffff) need_mask = false;
2582 #endif
2579 } else if (details->characters() == 1 && !compiler->ascii()) { 2583 } else if (details->characters() == 1 && !compiler->ascii()) {
2580 if ((mask & 0xffff) == 0xffff) need_mask = false; 2584 if ((mask & 0xffff) == 0xffff) need_mask = false;
2581 } else { 2585 } else {
2582 if (mask == 0xffffffff) need_mask = false; 2586 if (mask == 0xffffffff) need_mask = false;
2583 } 2587 }
2584 } 2588 }
2585 2589
2586 if (fall_through_on_failure) { 2590 if (fall_through_on_failure) {
2587 if (need_mask) { 2591 if (need_mask) {
2588 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success); 2592 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
(...skipping 21 matching lines...) Expand all
2610 // generating a quick check. 2614 // generating a quick check.
2611 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, 2615 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2612 RegExpCompiler* compiler, 2616 RegExpCompiler* compiler,
2613 int characters_filled_in, 2617 int characters_filled_in,
2614 bool not_at_start) { 2618 bool not_at_start) {
2615 Isolate* isolate = Isolate::Current(); 2619 Isolate* isolate = Isolate::Current();
2616 ASSERT(characters_filled_in < details->characters()); 2620 ASSERT(characters_filled_in < details->characters());
2617 int characters = details->characters(); 2621 int characters = details->characters();
2618 int char_mask; 2622 int char_mask;
2619 if (compiler->ascii()) { 2623 if (compiler->ascii()) {
2620 char_mask = String::kMaxAsciiCharCode; 2624 char_mask = String::kMaxOneByteCharCode;
2621 } else { 2625 } else {
2622 char_mask = String::kMaxUtf16CodeUnit; 2626 char_mask = String::kMaxUtf16CodeUnit;
2623 } 2627 }
2624 for (int k = 0; k < elms_->length(); k++) { 2628 for (int k = 0; k < elms_->length(); k++) {
2625 TextElement elm = elms_->at(k); 2629 TextElement elm = elms_->at(k);
2626 if (elm.type == TextElement::ATOM) { 2630 if (elm.type == TextElement::ATOM) {
2627 Vector<const uc16> quarks = elm.data.u_atom->data(); 2631 Vector<const uc16> quarks = elm.data.u_atom->data();
2628 for (int i = 0; i < characters && i < quarks.length(); i++) { 2632 for (int i = 0; i < characters && i < quarks.length(); i++) {
2629 QuickCheckDetails::Position* pos = 2633 QuickCheckDetails::Position* pos =
2630 details->positions(characters_filled_in); 2634 details->positions(characters_filled_in);
(...skipping 196 matching lines...) Expand 10 before | Expand all | Expand 10 after
2827 info->visited = true; 2831 info->visited = true;
2828 } 2832 }
2829 ~VisitMarker() { 2833 ~VisitMarker() {
2830 info_->visited = false; 2834 info_->visited = false;
2831 } 2835 }
2832 private: 2836 private:
2833 NodeInfo* info_; 2837 NodeInfo* info_;
2834 }; 2838 };
2835 2839
2836 2840
2837 RegExpNode* SeqRegExpNode::FilterASCII(int depth) { 2841 RegExpNode* SeqRegExpNode::FilterASCII(int depth, bool ignore_case) {
2838 if (info()->replacement_calculated) return replacement(); 2842 if (info()->replacement_calculated) return replacement();
2839 if (depth < 0) return this; 2843 if (depth < 0) return this;
2840 ASSERT(!info()->visited); 2844 ASSERT(!info()->visited);
2841 VisitMarker marker(info()); 2845 VisitMarker marker(info());
2842 return FilterSuccessor(depth - 1); 2846 return FilterSuccessor(depth - 1, ignore_case);
2843 } 2847 }
2844 2848
2845 2849
2846 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) { 2850 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
2847 RegExpNode* next = on_success_->FilterASCII(depth - 1); 2851 RegExpNode* next = on_success_->FilterASCII(depth - 1, ignore_case);
2848 if (next == NULL) return set_replacement(NULL); 2852 if (next == NULL) return set_replacement(NULL);
2849 on_success_ = next; 2853 on_success_ = next;
2850 return set_replacement(this); 2854 return set_replacement(this);
2851 } 2855 }
2852 2856
2853 2857
2854 RegExpNode* TextNode::FilterASCII(int depth) { 2858 RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
2855 if (info()->replacement_calculated) return replacement(); 2859 if (info()->replacement_calculated) return replacement();
2856 if (depth < 0) return this; 2860 if (depth < 0) return this;
2857 ASSERT(!info()->visited); 2861 ASSERT(!info()->visited);
2858 VisitMarker marker(info()); 2862 VisitMarker marker(info());
2859 int element_count = elms_->length(); 2863 int element_count = elms_->length();
2860 for (int i = 0; i < element_count; i++) { 2864 for (int i = 0; i < element_count; i++) {
2861 TextElement elm = elms_->at(i); 2865 TextElement elm = elms_->at(i);
2862 if (elm.type == TextElement::ATOM) { 2866 if (elm.type == TextElement::ATOM) {
2863 Vector<const uc16> quarks = elm.data.u_atom->data(); 2867 Vector<const uc16> quarks = elm.data.u_atom->data();
2864 for (int j = 0; j < quarks.length(); j++) { 2868 for (int j = 0; j < quarks.length(); j++) {
2865 // We don't need special handling for case independence 2869 #ifndef ENABLE_LATIN_1
2866 // because of the rule that case independence cannot make 2870 if (quarks[j] > String::kMaxOneByteCharCode) {
2867 // a non-ASCII character match an ASCII character.
2868 if (quarks[j] > String::kMaxAsciiCharCode) {
2869 return set_replacement(NULL); 2871 return set_replacement(NULL);
2870 } 2872 }
2873 #else
2874 if (quarks[j] <= String::kMaxOneByteCharCode) continue;
2875 if (!ignore_case) return set_replacement(NULL);
2876 // Here, we need to check for characters whose upper and lower cases
2877 // are outside the Latin-1 range.
2878 // TODO(dcarney): Replace this code with a simple
2879 // table lookup in unibrow::Latin-1.
2880 // TODO(dcarney): Add test cases.
2881 unibrow::uchar result;
2882 int chars;
2883 chars = unibrow::ToLowercase::Convert(quarks[j], 0, &result, NULL);
2884 if (chars > 1 ||
2885 (chars == 1 && result <= String::kMaxOneByteCharCodeU)) {
2886 continue;
2887 }
2888 chars = unibrow::ToUppercase::Convert(quarks[j], 0, &result, NULL);
2889 if (chars > 1 ||
2890 (chars == 1 && result <= String::kMaxOneByteCharCodeU)) {
2891 continue;
2892 }
2893 // This character is definitely not in the Latin-1 range.
2894 return set_replacement(NULL);
2895 #endif
2871 } 2896 }
2872 } else { 2897 } else {
2873 ASSERT(elm.type == TextElement::CHAR_CLASS); 2898 ASSERT(elm.type == TextElement::CHAR_CLASS);
2899 #ifdef ENABLE_LATIN_1
2900 // TODO(dcarney): Can this be improved?
2901 if (ignore_case) continue;
2902 #endif
2874 RegExpCharacterClass* cc = elm.data.u_char_class; 2903 RegExpCharacterClass* cc = elm.data.u_char_class;
2875 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); 2904 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2876 if (!CharacterRange::IsCanonical(ranges)) { 2905 if (!CharacterRange::IsCanonical(ranges)) {
2877 CharacterRange::Canonicalize(ranges); 2906 CharacterRange::Canonicalize(ranges);
2878 } 2907 }
2879 // Now they are in order so we only need to look at the first. 2908 // Now they are in order so we only need to look at the first.
2880 int range_count = ranges->length(); 2909 int range_count = ranges->length();
2881 if (cc->is_negated()) { 2910 if (cc->is_negated()) {
2882 if (range_count != 0 && 2911 if (range_count != 0 &&
2883 ranges->at(0).from() == 0 && 2912 ranges->at(0).from() == 0 &&
2884 ranges->at(0).to() >= String::kMaxAsciiCharCode) { 2913 ranges->at(0).to() >= String::kMaxOneByteCharCode) {
2885 return set_replacement(NULL); 2914 return set_replacement(NULL);
2886 } 2915 }
2887 } else { 2916 } else {
2888 if (range_count == 0 || 2917 if (range_count == 0 ||
2889 ranges->at(0).from() > String::kMaxAsciiCharCode) { 2918 ranges->at(0).from() > String::kMaxOneByteCharCode) {
2890 return set_replacement(NULL); 2919 return set_replacement(NULL);
2891 } 2920 }
2892 } 2921 }
2893 } 2922 }
2894 } 2923 }
2895 return FilterSuccessor(depth - 1); 2924 return FilterSuccessor(depth - 1, ignore_case);
2896 } 2925 }
2897 2926
2898 2927
2899 RegExpNode* LoopChoiceNode::FilterASCII(int depth) { 2928 RegExpNode* LoopChoiceNode::FilterASCII(int depth, bool ignore_case) {
2900 if (info()->replacement_calculated) return replacement(); 2929 if (info()->replacement_calculated) return replacement();
2901 if (depth < 0) return this; 2930 if (depth < 0) return this;
2902 if (info()->visited) return this; 2931 if (info()->visited) return this;
2903 { 2932 {
2904 VisitMarker marker(info()); 2933 VisitMarker marker(info());
2905 2934
2906 RegExpNode* continue_replacement = continue_node_->FilterASCII(depth - 1); 2935 RegExpNode* continue_replacement =
2936 continue_node_->FilterASCII(depth - 1, ignore_case);
2907 // If we can't continue after the loop then there is no sense in doing the 2937 // If we can't continue after the loop then there is no sense in doing the
2908 // loop. 2938 // loop.
2909 if (continue_replacement == NULL) return set_replacement(NULL); 2939 if (continue_replacement == NULL) return set_replacement(NULL);
2910 } 2940 }
2911 2941
2912 return ChoiceNode::FilterASCII(depth - 1); 2942 return ChoiceNode::FilterASCII(depth - 1, ignore_case);
2913 } 2943 }
2914 2944
2915 2945
2916 RegExpNode* ChoiceNode::FilterASCII(int depth) { 2946 RegExpNode* ChoiceNode::FilterASCII(int depth, bool ignore_case) {
2917 if (info()->replacement_calculated) return replacement(); 2947 if (info()->replacement_calculated) return replacement();
2918 if (depth < 0) return this; 2948 if (depth < 0) return this;
2919 if (info()->visited) return this; 2949 if (info()->visited) return this;
2920 VisitMarker marker(info()); 2950 VisitMarker marker(info());
2921 int choice_count = alternatives_->length(); 2951 int choice_count = alternatives_->length();
2922 2952
2923 for (int i = 0; i < choice_count; i++) { 2953 for (int i = 0; i < choice_count; i++) {
2924 GuardedAlternative alternative = alternatives_->at(i); 2954 GuardedAlternative alternative = alternatives_->at(i);
2925 if (alternative.guards() != NULL && alternative.guards()->length() != 0) { 2955 if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
2926 set_replacement(this); 2956 set_replacement(this);
2927 return this; 2957 return this;
2928 } 2958 }
2929 } 2959 }
2930 2960
2931 int surviving = 0; 2961 int surviving = 0;
2932 RegExpNode* survivor = NULL; 2962 RegExpNode* survivor = NULL;
2933 for (int i = 0; i < choice_count; i++) { 2963 for (int i = 0; i < choice_count; i++) {
2934 GuardedAlternative alternative = alternatives_->at(i); 2964 GuardedAlternative alternative = alternatives_->at(i);
2935 RegExpNode* replacement = alternative.node()->FilterASCII(depth - 1); 2965 RegExpNode* replacement =
2966 alternative.node()->FilterASCII(depth - 1, ignore_case);
2936 ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK. 2967 ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK.
2937 if (replacement != NULL) { 2968 if (replacement != NULL) {
2938 alternatives_->at(i).set_node(replacement); 2969 alternatives_->at(i).set_node(replacement);
2939 surviving++; 2970 surviving++;
2940 survivor = replacement; 2971 survivor = replacement;
2941 } 2972 }
2942 } 2973 }
2943 if (surviving < 2) return set_replacement(survivor); 2974 if (surviving < 2) return set_replacement(survivor);
2944 2975
2945 set_replacement(this); 2976 set_replacement(this);
2946 if (surviving == choice_count) { 2977 if (surviving == choice_count) {
2947 return this; 2978 return this;
2948 } 2979 }
2949 // Only some of the nodes survived the filtering. We need to rebuild the 2980 // Only some of the nodes survived the filtering. We need to rebuild the
2950 // alternatives list. 2981 // alternatives list.
2951 ZoneList<GuardedAlternative>* new_alternatives = 2982 ZoneList<GuardedAlternative>* new_alternatives =
2952 new(zone()) ZoneList<GuardedAlternative>(surviving, zone()); 2983 new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
2953 for (int i = 0; i < choice_count; i++) { 2984 for (int i = 0; i < choice_count; i++) {
2954 RegExpNode* replacement = 2985 RegExpNode* replacement =
2955 alternatives_->at(i).node()->FilterASCII(depth - 1); 2986 alternatives_->at(i).node()->FilterASCII(depth - 1, ignore_case);
2956 if (replacement != NULL) { 2987 if (replacement != NULL) {
2957 alternatives_->at(i).set_node(replacement); 2988 alternatives_->at(i).set_node(replacement);
2958 new_alternatives->Add(alternatives_->at(i), zone()); 2989 new_alternatives->Add(alternatives_->at(i), zone());
2959 } 2990 }
2960 } 2991 }
2961 alternatives_ = new_alternatives; 2992 alternatives_ = new_alternatives;
2962 return this; 2993 return this;
2963 } 2994 }
2964 2995
2965 2996
2966 RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth) { 2997 RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth,
2998 bool ignore_case) {
2967 if (info()->replacement_calculated) return replacement(); 2999 if (info()->replacement_calculated) return replacement();
2968 if (depth < 0) return this; 3000 if (depth < 0) return this;
2969 if (info()->visited) return this; 3001 if (info()->visited) return this;
2970 VisitMarker marker(info()); 3002 VisitMarker marker(info());
2971 // Alternative 0 is the negative lookahead, alternative 1 is what comes 3003 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2972 // afterwards. 3004 // afterwards.
2973 RegExpNode* node = alternatives_->at(1).node(); 3005 RegExpNode* node = alternatives_->at(1).node();
2974 RegExpNode* replacement = node->FilterASCII(depth - 1); 3006 RegExpNode* replacement = node->FilterASCII(depth - 1, ignore_case);
2975 if (replacement == NULL) return set_replacement(NULL); 3007 if (replacement == NULL) return set_replacement(NULL);
2976 alternatives_->at(1).set_node(replacement); 3008 alternatives_->at(1).set_node(replacement);
2977 3009
2978 RegExpNode* neg_node = alternatives_->at(0).node(); 3010 RegExpNode* neg_node = alternatives_->at(0).node();
2979 RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1); 3011 RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1, ignore_case);
2980 // If the negative lookahead is always going to fail then 3012 // If the negative lookahead is always going to fail then
2981 // we don't need to check it. 3013 // we don't need to check it.
2982 if (neg_replacement == NULL) return set_replacement(replacement); 3014 if (neg_replacement == NULL) return set_replacement(replacement);
2983 alternatives_->at(0).set_node(neg_replacement); 3015 alternatives_->at(0).set_node(neg_replacement);
2984 return set_replacement(this); 3016 return set_replacement(this);
2985 } 3017 }
2986 3018
2987 3019
2988 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 3020 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2989 RegExpCompiler* compiler, 3021 RegExpCompiler* compiler,
(...skipping 302 matching lines...) Expand 10 before | Expand all | Expand 10 after
3292 int cp_offset = trace->cp_offset() + elm.cp_offset; 3324 int cp_offset = trace->cp_offset() + elm.cp_offset;
3293 if (elm.type == TextElement::ATOM) { 3325 if (elm.type == TextElement::ATOM) {
3294 Vector<const uc16> quarks = elm.data.u_atom->data(); 3326 Vector<const uc16> quarks = elm.data.u_atom->data();
3295 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { 3327 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3296 if (first_element_checked && i == 0 && j == 0) continue; 3328 if (first_element_checked && i == 0 && j == 0) continue;
3297 if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue; 3329 if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue;
3298 EmitCharacterFunction* emit_function = NULL; 3330 EmitCharacterFunction* emit_function = NULL;
3299 switch (pass) { 3331 switch (pass) {
3300 case NON_ASCII_MATCH: 3332 case NON_ASCII_MATCH:
3301 ASSERT(ascii); 3333 ASSERT(ascii);
3302 if (quarks[j] > String::kMaxAsciiCharCode) { 3334 if (quarks[j] > String::kMaxOneByteCharCode) {
3303 assembler->GoTo(backtrack); 3335 assembler->GoTo(backtrack);
3304 return; 3336 return;
3305 } 3337 }
3306 break; 3338 break;
3307 case NON_LETTER_CHARACTER_MATCH: 3339 case NON_LETTER_CHARACTER_MATCH:
3308 emit_function = &EmitAtomNonLetter; 3340 emit_function = &EmitAtomNonLetter;
3309 break; 3341 break;
3310 case SIMPLE_CHARACTER_MATCH: 3342 case SIMPLE_CHARACTER_MATCH:
3311 emit_function = &EmitSimpleCharacter; 3343 emit_function = &EmitSimpleCharacter;
3312 break; 3344 break;
(...skipping 178 matching lines...) Expand 10 before | Expand all | Expand 10 after
3491 ZoneList<CharacterRange>* ranges = node->ranges(zone()); 3523 ZoneList<CharacterRange>* ranges = node->ranges(zone());
3492 if (!CharacterRange::IsCanonical(ranges)) { 3524 if (!CharacterRange::IsCanonical(ranges)) {
3493 CharacterRange::Canonicalize(ranges); 3525 CharacterRange::Canonicalize(ranges);
3494 } 3526 }
3495 if (node->is_negated()) { 3527 if (node->is_negated()) {
3496 return ranges->length() == 0 ? on_success() : NULL; 3528 return ranges->length() == 0 ? on_success() : NULL;
3497 } 3529 }
3498 if (ranges->length() != 1) return NULL; 3530 if (ranges->length() != 1) return NULL;
3499 uint32_t max_char; 3531 uint32_t max_char;
3500 if (compiler->ascii()) { 3532 if (compiler->ascii()) {
3501 max_char = String::kMaxAsciiCharCode; 3533 max_char = String::kMaxOneByteCharCode;
3502 } else { 3534 } else {
3503 max_char = String::kMaxUtf16CodeUnit; 3535 max_char = String::kMaxUtf16CodeUnit;
3504 } 3536 }
3505 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL; 3537 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;
3506 } 3538 }
3507 3539
3508 3540
3509 // Finds the fixed match length of a sequence of nodes that goes from 3541 // Finds the fixed match length of a sequence of nodes that goes from
3510 // this alternative and back to this choice node. If there are variable 3542 // this alternative and back to this choice node. If there are variable
3511 // length nodes or other complications in the way then return a sentinel 3543 // length nodes or other complications in the way then return a sentinel
(...skipping 179 matching lines...) Expand 10 before | Expand all | Expand 10 after
3691 for (int i = 0; i < kMapSize; i++) map_->at(i) = true; 3723 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3692 } 3724 }
3693 } 3725 }
3694 3726
3695 3727
3696 BoyerMooreLookahead::BoyerMooreLookahead( 3728 BoyerMooreLookahead::BoyerMooreLookahead(
3697 int length, RegExpCompiler* compiler, Zone* zone) 3729 int length, RegExpCompiler* compiler, Zone* zone)
3698 : length_(length), 3730 : length_(length),
3699 compiler_(compiler) { 3731 compiler_(compiler) {
3700 if (compiler->ascii()) { 3732 if (compiler->ascii()) {
3701 max_char_ = String::kMaxAsciiCharCode; 3733 max_char_ = String::kMaxOneByteCharCode;
3702 } else { 3734 } else {
3703 max_char_ = String::kMaxUtf16CodeUnit; 3735 max_char_ = String::kMaxUtf16CodeUnit;
3704 } 3736 }
3705 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone); 3737 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
3706 for (int i = 0; i < length; i++) { 3738 for (int i = 0; i < length; i++) {
3707 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone); 3739 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
3708 } 3740 }
3709 } 3741 }
3710 3742
3711 3743
(...skipping 1618 matching lines...) Expand 10 before | Expand all | Expand 10 after
5330 } 5362 }
5331 5363
5332 5364
5333 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, 5365 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
5334 bool is_ascii, 5366 bool is_ascii,
5335 Zone* zone) { 5367 Zone* zone) {
5336 Isolate* isolate = Isolate::Current(); 5368 Isolate* isolate = Isolate::Current();
5337 uc16 bottom = from(); 5369 uc16 bottom = from();
5338 uc16 top = to(); 5370 uc16 top = to();
5339 if (is_ascii) { 5371 if (is_ascii) {
5340 if (bottom > String::kMaxAsciiCharCode) return; 5372 if (bottom > String::kMaxOneByteCharCode) return;
5341 if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode; 5373 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
5342 } 5374 }
5343 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 5375 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5344 if (top == bottom) { 5376 if (top == bottom) {
5345 // If this is a singleton we just expand the one character. 5377 // If this is a singleton we just expand the one character.
5346 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); 5378 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
5347 for (int i = 0; i < length; i++) { 5379 for (int i = 0; i < length; i++) {
5348 uc32 chr = chars[i]; 5380 uc32 chr = chars[i];
5349 if (chr != bottom) { 5381 if (chr != bottom) {
5350 ranges->Add(CharacterRange::Singleton(chars[i]), zone); 5382 ranges->Add(CharacterRange::Singleton(chars[i]), zone);
5351 } 5383 }
(...skipping 526 matching lines...) Expand 10 before | Expand all | Expand 10 after
5878 if (offset >= bm->length()) { 5910 if (offset >= bm->length()) {
5879 if (initial_offset == 0) set_bm_info(not_at_start, bm); 5911 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5880 return; 5912 return;
5881 } 5913 }
5882 uc16 character = atom->data()[j]; 5914 uc16 character = atom->data()[j];
5883 if (bm->compiler()->ignore_case()) { 5915 if (bm->compiler()->ignore_case()) {
5884 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 5916 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5885 int length = GetCaseIndependentLetters( 5917 int length = GetCaseIndependentLetters(
5886 ISOLATE, 5918 ISOLATE,
5887 character, 5919 character,
5888 bm->max_char() == String::kMaxAsciiCharCode, 5920 bm->max_char() == String::kMaxOneByteCharCode,
5889 chars); 5921 chars);
5890 for (int j = 0; j < length; j++) { 5922 for (int j = 0; j < length; j++) {
5891 bm->Set(offset, chars[j]); 5923 bm->Set(offset, chars[j]);
5892 } 5924 }
5893 } else { 5925 } else {
5894 if (character <= max_char) bm->Set(offset, character); 5926 if (character <= max_char) bm->Set(offset, character);
5895 } 5927 }
5896 } 5928 }
5897 } else { 5929 } else {
5898 ASSERT(text.type == TextElement::CHAR_CLASS); 5930 ASSERT(text.type == TextElement::CHAR_CLASS);
(...skipping 193 matching lines...) Expand 10 before | Expand all | Expand 10 after
6092 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone); 6124 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
6093 first_step_node->AddAlternative(GuardedAlternative(captured_body)); 6125 first_step_node->AddAlternative(GuardedAlternative(captured_body));
6094 first_step_node->AddAlternative(GuardedAlternative( 6126 first_step_node->AddAlternative(GuardedAlternative(
6095 new(zone) TextNode(new(zone) RegExpCharacterClass('*'), loop_node))); 6127 new(zone) TextNode(new(zone) RegExpCharacterClass('*'), loop_node)));
6096 node = first_step_node; 6128 node = first_step_node;
6097 } else { 6129 } else {
6098 node = loop_node; 6130 node = loop_node;
6099 } 6131 }
6100 } 6132 }
6101 if (is_ascii) { 6133 if (is_ascii) {
6102 node = node->FilterASCII(RegExpCompiler::kMaxRecursion); 6134 node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case);
6103 // Do it again to propagate the new nodes to places where they were not 6135 // Do it again to propagate the new nodes to places where they were not
6104 // put because they had not been calculated yet. 6136 // put because they had not been calculated yet.
6105 if (node != NULL) node = node->FilterASCII(RegExpCompiler::kMaxRecursion); 6137 if (node != NULL) {
6138 node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case);
6139 }
6106 } 6140 }
6107 6141
6108 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone); 6142 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
6109 data->node = node; 6143 data->node = node;
6110 Analysis analysis(ignore_case, is_ascii); 6144 Analysis analysis(ignore_case, is_ascii);
6111 analysis.EnsureAnalyzed(node); 6145 analysis.EnsureAnalyzed(node);
6112 if (analysis.has_failed()) { 6146 if (analysis.has_failed()) {
6113 const char* error_message = analysis.error_message(); 6147 const char* error_message = analysis.error_message();
6114 return CompilationResult(error_message); 6148 return CompilationResult(error_message);
6115 } 6149 }
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after
6159 } 6193 }
6160 6194
6161 return compiler.Assemble(&macro_assembler, 6195 return compiler.Assemble(&macro_assembler,
6162 node, 6196 node,
6163 data->capture_count, 6197 data->capture_count,
6164 pattern); 6198 pattern);
6165 } 6199 }
6166 6200
6167 6201
6168 }} // namespace v8::internal 6202 }} // namespace v8::internal
OLDNEW
« no previous file with comments | « src/jsregexp.h ('k') | src/log.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698