src/jsregexp.cc - Issue 11759008: Introduce ENABLE_LATIN_1 compile flag

Side by Side Diff: src/jsregexp.cc

Issue 11759008: Introduce ENABLE_LATIN_1 compile flag (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge

Patch Set: Fix FilterASCII Created 7 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2012 the V8 project authors. All rights reserved.	1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 1663 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
1674 bool ascii_subject,	1674 bool ascii_subject,

1675 unibrow::uchar* letters) {	1675 unibrow::uchar* letters) {

1676 int length =	1676 int length =

1677 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);	1677 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);

1678 // Unibrow returns 0 or 1 for characters where case independence is	1678 // Unibrow returns 0 or 1 for characters where case independence is

1679 // trivial.	1679 // trivial.

1680 if (length == 0) {	1680 if (length == 0) {

1681 letters[0] = character;	1681 letters[0] = character;

1682 length = 1;	1682 length = 1;

1683 }	1683 }

1684 if (!ascii_subject || character <= String::kMaxAsciiCharCode) {	1684 if (!ascii_subject || character <= String::kMaxOneByteCharCode) {

1685 return length;	1685 return length;

1686 }	1686 }

1687 // The standard requires that non-ASCII characters cannot have ASCII	1687 // The standard requires that non-ASCII characters cannot have ASCII

1688 // character codes in their equivalence class.	1688 // character codes in their equivalence class.

1689 return 0;	1689 return 0;

1690 }	1690 }

1691	1691

1692	1692

1693 static inline bool EmitSimpleCharacter(Isolate* isolate,	1693 static inline bool EmitSimpleCharacter(Isolate* isolate,

1694 RegExpCompiler* compiler,	1694 RegExpCompiler* compiler,

(...skipping 30 matching lines...) Expand all Loading...
1725 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];	1725 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

1726 int length = GetCaseIndependentLetters(isolate, c, ascii, chars);	1726 int length = GetCaseIndependentLetters(isolate, c, ascii, chars);

1727 if (length < 1) {	1727 if (length < 1) {

1728 // This can't match. Must be an ASCII subject and a non-ASCII character.	1728 // This can't match. Must be an ASCII subject and a non-ASCII character.

1729 // We do not need to do anything since the ASCII pass already handled this.	1729 // We do not need to do anything since the ASCII pass already handled this.

1730 return false; // Bounds not checked.	1730 return false; // Bounds not checked.

1731 }	1731 }

1732 bool checked = false;	1732 bool checked = false;

1733 // We handle the length > 1 case in a later pass.	1733 // We handle the length > 1 case in a later pass.

1734 if (length == 1) {	1734 if (length == 1) {

1735 if (ascii && c > String::kMaxAsciiCharCodeU) {	1735 if (ascii && c > String::kMaxOneByteCharCodeU) {

1736 // Can't match - see above.	1736 // Can't match - see above.

1737 return false; // Bounds not checked.	1737 return false; // Bounds not checked.

1738 }	1738 }

1739 if (!preloaded) {	1739 if (!preloaded) {

1740 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);	1740 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);

1741 checked = check;	1741 checked = check;

1742 }	1742 }

1743 macro_assembler->CheckNotCharacter(c, on_failure);	1743 macro_assembler->CheckNotCharacter(c, on_failure);

1744 }	1744 }

1745 return checked;	1745 return checked;

1746 }	1746 }

1747	1747

1748	1748

1749 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,	1749 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,

1750 bool ascii,	1750 bool ascii,

1751 uc16 c1,	1751 uc16 c1,

1752 uc16 c2,	1752 uc16 c2,

1753 Label* on_failure) {	1753 Label* on_failure) {

1754 uc16 char_mask;	1754 uc16 char_mask;

1755 if (ascii) {	1755 if (ascii) {

1756 char_mask = String::kMaxAsciiCharCode;	1756 char_mask = String::kMaxOneByteCharCode;

1757 } else {	1757 } else {

1758 char_mask = String::kMaxUtf16CodeUnit;	1758 char_mask = String::kMaxUtf16CodeUnit;

1759 }	1759 }

1760 uc16 exor = c1 ^ c2;	1760 uc16 exor = c1 ^ c2;

1761 // Check whether exor has only one bit set.	1761 // Check whether exor has only one bit set.

1762 if (((exor - 1) & exor) == 0) {	1762 if (((exor - 1) & exor) == 0) {

1763 // If c1 and c2 differ only by one bit.	1763 // If c1 and c2 differ only by one bit.

1764 // Ecma262UnCanonicalize always gives the highest number last.	1764 // Ecma262UnCanonicalize always gives the highest number last.

1765 ASSERT(c2 > c1);	1765 ASSERT(c2 > c1);

1766 uc16 mask = char_mask ^ exor;	1766 uc16 mask = char_mask ^ exor;

(...skipping 233 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
2000 // encoding space can be quickly tested with a table lookup, so we don't	2000 // encoding space can be quickly tested with a table lookup, so we don't

2001 // wish to do binary chop search at a smaller granularity than that. A	2001 // wish to do binary chop search at a smaller granularity than that. A

2002 // 128-character space can take up a lot of space in the ranges array if,	2002 // 128-character space can take up a lot of space in the ranges array if,

2003 // for example, we only want to match every second character (eg. the lower	2003 // for example, we only want to match every second character (eg. the lower

2004 // case characters on some Unicode pages).	2004 // case characters on some Unicode pages).

2005 int binary_chop_index = (end_index + start_index) / 2;	2005 int binary_chop_index = (end_index + start_index) / 2;

2006 // The first test ensures that we get to the code that handles the ASCII	2006 // The first test ensures that we get to the code that handles the ASCII

2007 // range with a single not-taken branch, speeding up this important	2007 // range with a single not-taken branch, speeding up this important

2008 // character range (even non-ASCII charset-based text has spaces and	2008 // character range (even non-ASCII charset-based text has spaces and

2009 // punctuation).	2009 // punctuation).

2010 if (*border - 1 > String::kMaxAsciiCharCode && // ASCII case.	2010 if (*border - 1 > String::kMaxOneByteCharCode && // ASCII case.

2011 end_index - start_index > (*new_start_index - start_index) * 2 &&	2011 end_index - start_index > (*new_start_index - start_index) * 2 &&

2012 last - first > kSize * 2 &&	2012 last - first > kSize * 2 &&

2013 binary_chop_index > *new_start_index &&	2013 binary_chop_index > *new_start_index &&

2014 ranges->at(binary_chop_index) >= first + 2 * kSize) {	2014 ranges->at(binary_chop_index) >= first + 2 * kSize) {

2015 int scan_forward_for_section_border = binary_chop_index;;	2015 int scan_forward_for_section_border = binary_chop_index;;

2016 int new_border = (ranges->at(binary_chop_index) | kMask) + 1;	2016 int new_border = (ranges->at(binary_chop_index) | kMask) + 1;

2017	2017

2018 while (scan_forward_for_section_border < end_index) {	2018 while (scan_forward_for_section_border < end_index) {

2019 if (ranges->at(scan_forward_for_section_border) > new_border) {	2019 if (ranges->at(scan_forward_for_section_border) > new_border) {

2020 *new_start_index = scan_forward_for_section_border;	2020 *new_start_index = scan_forward_for_section_border;

(...skipping 183 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
2204 bool check_offset,	2204 bool check_offset,

2205 bool preloaded,	2205 bool preloaded,

2206 Zone* zone) {	2206 Zone* zone) {

2207 ZoneList<CharacterRange>* ranges = cc->ranges(zone);	2207 ZoneList<CharacterRange>* ranges = cc->ranges(zone);

2208 if (!CharacterRange::IsCanonical(ranges)) {	2208 if (!CharacterRange::IsCanonical(ranges)) {

2209 CharacterRange::Canonicalize(ranges);	2209 CharacterRange::Canonicalize(ranges);

2210 }	2210 }

2211	2211

2212 int max_char;	2212 int max_char;

2213 if (ascii) {	2213 if (ascii) {

2214 max_char = String::kMaxAsciiCharCode;	2214 max_char = String::kMaxOneByteCharCode;

2215 } else {	2215 } else {

2216 max_char = String::kMaxUtf16CodeUnit;	2216 max_char = String::kMaxUtf16CodeUnit;

2217 }	2217 }

2218	2218

2219 int range_count = ranges->length();	2219 int range_count = ranges->length();

2220	2220

2221 int last_valid_range = range_count - 1;	2221 int last_valid_range = range_count - 1;

2222 while (last_valid_range >= 0) {	2222 while (last_valid_range >= 0) {

2223 CharacterRange& range = ranges->at(last_valid_range);	2223 CharacterRange& range = ranges->at(last_valid_range);

2224 if (range.from() <= max_char) {	2224 if (range.from() <= max_char) {

(...skipping 281 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
2506 v |= v >> 8;	2506 v |= v >> 8;

2507 v |= v >> 16;	2507 v |= v >> 16;

2508 return v;	2508 return v;

2509 }	2509 }

2510	2510

2511	2511

2512 bool QuickCheckDetails::Rationalize(bool asc) {	2512 bool QuickCheckDetails::Rationalize(bool asc) {

2513 bool found_useful_op = false;	2513 bool found_useful_op = false;

2514 uint32_t char_mask;	2514 uint32_t char_mask;

2515 if (asc) {	2515 if (asc) {

2516 char_mask = String::kMaxAsciiCharCode;	2516 char_mask = String::kMaxOneByteCharCode;

2517 } else {	2517 } else {

2518 char_mask = String::kMaxUtf16CodeUnit;	2518 char_mask = String::kMaxUtf16CodeUnit;

2519 }	2519 }

2520 mask_ = 0;	2520 mask_ = 0;

2521 value_ = 0;	2521 value_ = 0;

2522 int char_shift = 0;	2522 int char_shift = 0;

2523 for (int i = 0; i < characters_; i++) {	2523 for (int i = 0; i < characters_; i++) {

2524 Position* pos = &positions_[i];	2524 Position* pos = &positions_[i];

2525 if ((pos->mask & String::kMaxAsciiCharCode) != 0) {	2525 if ((pos->mask & String::kMaxOneByteCharCode) != 0) {

2526 found_useful_op = true;	2526 found_useful_op = true;

2527 }	2527 }

2528 mask_ |= (pos->mask & char_mask) << char_shift;	2528 mask_ |= (pos->mask & char_mask) << char_shift;

2529 value_ |= (pos->value & char_mask) << char_shift;	2529 value_ |= (pos->value & char_mask) << char_shift;

2530 char_shift += asc ? 8 : 16;	2530 char_shift += asc ? 8 : 16;

2531 }	2531 }

2532 return found_useful_op;	2532 return found_useful_op;

2533 }	2533 }

2534	2534

2535	2535

(...skipping 22 matching lines...) Expand all Loading...
2558 }	2558 }

2559	2559

2560	2560

2561 bool need_mask = true;	2561 bool need_mask = true;

2562	2562

2563 if (details->characters() == 1) {	2563 if (details->characters() == 1) {

2564 // If number of characters preloaded is 1 then we used a byte or 16 bit	2564 // If number of characters preloaded is 1 then we used a byte or 16 bit

2565 // load so the value is already masked down.	2565 // load so the value is already masked down.

2566 uint32_t char_mask;	2566 uint32_t char_mask;

2567 if (compiler->ascii()) {	2567 if (compiler->ascii()) {

2568 char_mask = String::kMaxAsciiCharCode;	2568 char_mask = String::kMaxOneByteCharCode;

2569 } else {	2569 } else {

2570 char_mask = String::kMaxUtf16CodeUnit;	2570 char_mask = String::kMaxUtf16CodeUnit;

2571 }	2571 }

2572 if ((mask & char_mask) == char_mask) need_mask = false;	2572 if ((mask & char_mask) == char_mask) need_mask = false;

2573 mask &= char_mask;	2573 mask &= char_mask;

2574 } else {	2574 } else {

2575 // For 2-character preloads in ASCII mode or 1-character preloads in	2575 // For 2-character preloads in ASCII mode or 1-character preloads in

2576 // TWO_BYTE mode we also use a 16 bit load with zero extend.	2576 // TWO_BYTE mode we also use a 16 bit load with zero extend.

2577 if (details->characters() == 2 && compiler->ascii()) {	2577 if (details->characters() == 2 && compiler->ascii()) {

2578 if ((mask & 0x7f7f) == 0x7f7f) need_mask = false;	2578 #ifndef ENABLE_LATIN_1

	2579 if ((mask & 0x7f7f) == 0xffff) need_mask = false;

	2580 #else

	2581 if ((mask & 0xffff) == 0xffff) need_mask = false;

	2582 #endif

2579 } else if (details->characters() == 1 && !compiler->ascii()) {	2583 } else if (details->characters() == 1 && !compiler->ascii()) {

2580 if ((mask & 0xffff) == 0xffff) need_mask = false;	2584 if ((mask & 0xffff) == 0xffff) need_mask = false;

2581 } else {	2585 } else {

2582 if (mask == 0xffffffff) need_mask = false;	2586 if (mask == 0xffffffff) need_mask = false;

2583 }	2587 }

2584 }	2588 }

2585	2589

2586 if (fall_through_on_failure) {	2590 if (fall_through_on_failure) {

2587 if (need_mask) {	2591 if (need_mask) {

2588 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);	2592 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);

(...skipping 21 matching lines...) Expand all Loading...
2610 // generating a quick check.	2614 // generating a quick check.

2611 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,	2615 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,

2612 RegExpCompiler* compiler,	2616 RegExpCompiler* compiler,

2613 int characters_filled_in,	2617 int characters_filled_in,

2614 bool not_at_start) {	2618 bool not_at_start) {

2615 Isolate* isolate = Isolate::Current();	2619 Isolate* isolate = Isolate::Current();

2616 ASSERT(characters_filled_in < details->characters());	2620 ASSERT(characters_filled_in < details->characters());

2617 int characters = details->characters();	2621 int characters = details->characters();

2618 int char_mask;	2622 int char_mask;

2619 if (compiler->ascii()) {	2623 if (compiler->ascii()) {

2620 char_mask = String::kMaxAsciiCharCode;	2624 char_mask = String::kMaxOneByteCharCode;

2621 } else {	2625 } else {

2622 char_mask = String::kMaxUtf16CodeUnit;	2626 char_mask = String::kMaxUtf16CodeUnit;

2623 }	2627 }

2624 for (int k = 0; k < elms_->length(); k++) {	2628 for (int k = 0; k < elms_->length(); k++) {

2625 TextElement elm = elms_->at(k);	2629 TextElement elm = elms_->at(k);

2626 if (elm.type == TextElement::ATOM) {	2630 if (elm.type == TextElement::ATOM) {

2627 Vector<const uc16> quarks = elm.data.u_atom->data();	2631 Vector<const uc16> quarks = elm.data.u_atom->data();

2628 for (int i = 0; i < characters && i < quarks.length(); i++) {	2632 for (int i = 0; i < characters && i < quarks.length(); i++) {

2629 QuickCheckDetails::Position* pos =	2633 QuickCheckDetails::Position* pos =

2630 details->positions(characters_filled_in);	2634 details->positions(characters_filled_in);

(...skipping 196 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
2827 info->visited = true;	2831 info->visited = true;

2828 }	2832 }

2829 ~VisitMarker() {	2833 ~VisitMarker() {

2830 info_->visited = false;	2834 info_->visited = false;

2831 }	2835 }

2832 private:	2836 private:

2833 NodeInfo* info_;	2837 NodeInfo* info_;

2834 };	2838 };

2835	2839

2836	2840

2837 RegExpNode* SeqRegExpNode::FilterASCII(int depth) {	2841 RegExpNode* SeqRegExpNode::FilterASCII(int depth, bool ignore_case) {

2838 if (info()->replacement_calculated) return replacement();	2842 if (info()->replacement_calculated) return replacement();

2839 if (depth < 0) return this;	2843 if (depth < 0) return this;

2840 ASSERT(!info()->visited);	2844 ASSERT(!info()->visited);

2841 VisitMarker marker(info());	2845 VisitMarker marker(info());

2842 return FilterSuccessor(depth - 1);	2846 return FilterSuccessor(depth - 1, ignore_case);

2843 }	2847 }

2844	2848

2845	2849

2846 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) {	2850 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {

2847 RegExpNode* next = on_success_->FilterASCII(depth - 1);	2851 RegExpNode* next = on_success_->FilterASCII(depth - 1, ignore_case);

2848 if (next == NULL) return set_replacement(NULL);	2852 if (next == NULL) return set_replacement(NULL);

2849 on_success_ = next;	2853 on_success_ = next;

2850 return set_replacement(this);	2854 return set_replacement(this);

2851 }	2855 }

2852	2856

2853	2857

2854 RegExpNode* TextNode::FilterASCII(int depth) {	2858 RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {

2855 if (info()->replacement_calculated) return replacement();	2859 if (info()->replacement_calculated) return replacement();

2856 if (depth < 0) return this;	2860 if (depth < 0) return this;

2857 ASSERT(!info()->visited);	2861 ASSERT(!info()->visited);

2858 VisitMarker marker(info());	2862 VisitMarker marker(info());

2859 int element_count = elms_->length();	2863 int element_count = elms_->length();

2860 for (int i = 0; i < element_count; i++) {	2864 for (int i = 0; i < element_count; i++) {

2861 TextElement elm = elms_->at(i);	2865 TextElement elm = elms_->at(i);

2862 if (elm.type == TextElement::ATOM) {	2866 if (elm.type == TextElement::ATOM) {

2863 Vector<const uc16> quarks = elm.data.u_atom->data();	2867 Vector<const uc16> quarks = elm.data.u_atom->data();

2864 for (int j = 0; j < quarks.length(); j++) {	2868 for (int j = 0; j < quarks.length(); j++) {

2865 // We don't need special handling for case independence	2869 #ifndef ENABLE_LATIN_1

2866 // because of the rule that case independence cannot make	2870 if (quarks[j] > String::kMaxOneByteCharCode) {

2867 // a non-ASCII character match an ASCII character.

2868 if (quarks[j] > String::kMaxAsciiCharCode) {

2869 return set_replacement(NULL);	2871 return set_replacement(NULL);

2870 }	2872 }

	2873 #else

	2874 if (quarks[j] <= String::kMaxOneByteCharCode) continue;

	2875 if (!ignore_case) return set_replacement(NULL);

	2876 // Here, we need to check for characters whose upper and lower cases

	2877 // are outside the Latin-1 range.

	2878 // TODO(dcarney): Replace this code with a simple

	2879 // table lookup in unibrow::Latin-1.

	2880 // TODO(dcarney): Test cases!.

	2881 unibrow::uchar result;

	2882 int chars;

	2883 chars = unibrow::ToLowercase::Convert(quarks[j], 0, &result, NULL);

	2884 if (chars > 1 ||

	2885 (chars == 1 && result <= String::kMaxOneByteCharCodeU)) {

	2886 continue;

	2887 }

	2888 chars = unibrow::ToUppercase::Convert(quarks[j], 0, &result, NULL);

	2889 if (chars > 1 ||

	2890 (chars == 1 && result <= String::kMaxOneByteCharCodeU)) {

	2891 continue;

	2892 }

	2893 // This character is definitely not in the Latin-1 range.

	2894 return set_replacement(NULL);

	2895 #endif

2871 }	2896 }

2872 } else {	2897 } else {

2873 ASSERT(elm.type == TextElement::CHAR_CLASS);	2898 ASSERT(elm.type == TextElement::CHAR_CLASS);

	2899 #ifdef ENABLE_LATIN_1

	2900 // TODO(dcarney): Can this be improved?

	2901 if (ignore_case) continue;

	2902 #endif

2874 RegExpCharacterClass* cc = elm.data.u_char_class;	2903 RegExpCharacterClass* cc = elm.data.u_char_class;

2875 ZoneList<CharacterRange>* ranges = cc->ranges(zone());	2904 ZoneList<CharacterRange>* ranges = cc->ranges(zone());

2876 if (!CharacterRange::IsCanonical(ranges)) {	2905 if (!CharacterRange::IsCanonical(ranges)) {

2877 CharacterRange::Canonicalize(ranges);	2906 CharacterRange::Canonicalize(ranges);

2878 }	2907 }

2879 // Now they are in order so we only need to look at the first.	2908 // Now they are in order so we only need to look at the first.

2880 int range_count = ranges->length();	2909 int range_count = ranges->length();

2881 if (cc->is_negated()) {	2910 if (cc->is_negated()) {

2882 if (range_count != 0 &&	2911 if (range_count != 0 &&

2883 ranges->at(0).from() == 0 &&	2912 ranges->at(0).from() == 0 &&

2884 ranges->at(0).to() >= String::kMaxAsciiCharCode) {	2913 ranges->at(0).to() >= String::kMaxOneByteCharCode) {

2885 return set_replacement(NULL);	2914 return set_replacement(NULL);

2886 }	2915 }

2887 } else {	2916 } else {

2888 if (range_count == 0 ||	2917 if (range_count == 0 ||

2889 ranges->at(0).from() > String::kMaxAsciiCharCode) {	2918 ranges->at(0).from() > String::kMaxOneByteCharCode) {

2890 return set_replacement(NULL);	2919 return set_replacement(NULL);

2891 }	2920 }

2892 }	2921 }

2893 }	2922 }

2894 }	2923 }

2895 return FilterSuccessor(depth - 1);	2924 return FilterSuccessor(depth - 1, ignore_case);

2896 }	2925 }

2897	2926

2898	2927

2899 RegExpNode* LoopChoiceNode::FilterASCII(int depth) {	2928 RegExpNode* LoopChoiceNode::FilterASCII(int depth, bool ignore_case) {

2900 if (info()->replacement_calculated) return replacement();	2929 if (info()->replacement_calculated) return replacement();

2901 if (depth < 0) return this;	2930 if (depth < 0) return this;

2902 if (info()->visited) return this;	2931 if (info()->visited) return this;

2903 {	2932 {

2904 VisitMarker marker(info());	2933 VisitMarker marker(info());

2905	2934

2906 RegExpNode* continue_replacement = continue_node_->FilterASCII(depth - 1);	2935 RegExpNode* continue_replacement =

	2936 continue_node_->FilterASCII(depth - 1, ignore_case);

2907 // If we can't continue after the loop then there is no sense in doing the	2937 // If we can't continue after the loop then there is no sense in doing the

2908 // loop.	2938 // loop.

2909 if (continue_replacement == NULL) return set_replacement(NULL);	2939 if (continue_replacement == NULL) return set_replacement(NULL);

2910 }	2940 }

2911	2941

2912 return ChoiceNode::FilterASCII(depth - 1);	2942 return ChoiceNode::FilterASCII(depth - 1, ignore_case);

2913 }	2943 }

2914	2944

2915	2945

2916 RegExpNode* ChoiceNode::FilterASCII(int depth) {	2946 RegExpNode* ChoiceNode::FilterASCII(int depth, bool ignore_case) {

2917 if (info()->replacement_calculated) return replacement();	2947 if (info()->replacement_calculated) return replacement();

2918 if (depth < 0) return this;	2948 if (depth < 0) return this;

2919 if (info()->visited) return this;	2949 if (info()->visited) return this;

2920 VisitMarker marker(info());	2950 VisitMarker marker(info());

2921 int choice_count = alternatives_->length();	2951 int choice_count = alternatives_->length();

2922	2952

2923 for (int i = 0; i < choice_count; i++) {	2953 for (int i = 0; i < choice_count; i++) {

2924 GuardedAlternative alternative = alternatives_->at(i);	2954 GuardedAlternative alternative = alternatives_->at(i);

2925 if (alternative.guards() != NULL && alternative.guards()->length() != 0) {	2955 if (alternative.guards() != NULL && alternative.guards()->length() != 0) {

2926 set_replacement(this);	2956 set_replacement(this);

2927 return this;	2957 return this;

2928 }	2958 }

2929 }	2959 }

2930	2960

2931 int surviving = 0;	2961 int surviving = 0;

2932 RegExpNode* survivor = NULL;	2962 RegExpNode* survivor = NULL;

2933 for (int i = 0; i < choice_count; i++) {	2963 for (int i = 0; i < choice_count; i++) {

2934 GuardedAlternative alternative = alternatives_->at(i);	2964 GuardedAlternative alternative = alternatives_->at(i);

2935 RegExpNode* replacement = alternative.node()->FilterASCII(depth - 1);	2965 RegExpNode* replacement =

	2966 alternative.node()->FilterASCII(depth - 1, ignore_case);

2936 ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK.	2967 ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK.

2937 if (replacement != NULL) {	2968 if (replacement != NULL) {

2938 alternatives_->at(i).set_node(replacement);	2969 alternatives_->at(i).set_node(replacement);

2939 surviving++;	2970 surviving++;

2940 survivor = replacement;	2971 survivor = replacement;

2941 }	2972 }

2942 }	2973 }

2943 if (surviving < 2) return set_replacement(survivor);	2974 if (surviving < 2) return set_replacement(survivor);

2944	2975

2945 set_replacement(this);	2976 set_replacement(this);

2946 if (surviving == choice_count) {	2977 if (surviving == choice_count) {

2947 return this;	2978 return this;

2948 }	2979 }

2949 // Only some of the nodes survived the filtering. We need to rebuild the	2980 // Only some of the nodes survived the filtering. We need to rebuild the

2950 // alternatives list.	2981 // alternatives list.

2951 ZoneList<GuardedAlternative>* new_alternatives =	2982 ZoneList<GuardedAlternative>* new_alternatives =

2952 new(zone()) ZoneList<GuardedAlternative>(surviving, zone());	2983 new(zone()) ZoneList<GuardedAlternative>(surviving, zone());

2953 for (int i = 0; i < choice_count; i++) {	2984 for (int i = 0; i < choice_count; i++) {

2954 RegExpNode* replacement =	2985 RegExpNode* replacement =

2955 alternatives_->at(i).node()->FilterASCII(depth - 1);	2986 alternatives_->at(i).node()->FilterASCII(depth - 1, ignore_case);

2956 if (replacement != NULL) {	2987 if (replacement != NULL) {

2957 alternatives_->at(i).set_node(replacement);	2988 alternatives_->at(i).set_node(replacement);

2958 new_alternatives->Add(alternatives_->at(i), zone());	2989 new_alternatives->Add(alternatives_->at(i), zone());

2959 }	2990 }

2960 }	2991 }

2961 alternatives_ = new_alternatives;	2992 alternatives_ = new_alternatives;

2962 return this;	2993 return this;

2963 }	2994 }

2964	2995

2965	2996

2966 RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth) {	2997 RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth,

	2998 bool ignore_case) {

2967 if (info()->replacement_calculated) return replacement();	2999 if (info()->replacement_calculated) return replacement();

2968 if (depth < 0) return this;	3000 if (depth < 0) return this;

2969 if (info()->visited) return this;	3001 if (info()->visited) return this;

2970 VisitMarker marker(info());	3002 VisitMarker marker(info());

2971 // Alternative 0 is the negative lookahead, alternative 1 is what comes	3003 // Alternative 0 is the negative lookahead, alternative 1 is what comes

2972 // afterwards.	3004 // afterwards.

2973 RegExpNode* node = alternatives_->at(1).node();	3005 RegExpNode* node = alternatives_->at(1).node();

2974 RegExpNode* replacement = node->FilterASCII(depth - 1);	3006 RegExpNode* replacement = node->FilterASCII(depth - 1, ignore_case);

2975 if (replacement == NULL) return set_replacement(NULL);	3007 if (replacement == NULL) return set_replacement(NULL);

2976 alternatives_->at(1).set_node(replacement);	3008 alternatives_->at(1).set_node(replacement);

2977	3009

2978 RegExpNode* neg_node = alternatives_->at(0).node();	3010 RegExpNode* neg_node = alternatives_->at(0).node();

2979 RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1);	3011 RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1, ignore_case);

2980 // If the negative lookahead is always going to fail then	3012 // If the negative lookahead is always going to fail then

2981 // we don't need to check it.	3013 // we don't need to check it.

2982 if (neg_replacement == NULL) return set_replacement(replacement);	3014 if (neg_replacement == NULL) return set_replacement(replacement);

2983 alternatives_->at(0).set_node(neg_replacement);	3015 alternatives_->at(0).set_node(neg_replacement);

2984 return set_replacement(this);	3016 return set_replacement(this);

2985 }	3017 }

2986	3018

2987	3019

2988 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,	3020 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,

2989 RegExpCompiler* compiler,	3021 RegExpCompiler* compiler,

(...skipping 302 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
3292 int cp_offset = trace->cp_offset() + elm.cp_offset;	3324 int cp_offset = trace->cp_offset() + elm.cp_offset;

3293 if (elm.type == TextElement::ATOM) {	3325 if (elm.type == TextElement::ATOM) {

3294 Vector<const uc16> quarks = elm.data.u_atom->data();	3326 Vector<const uc16> quarks = elm.data.u_atom->data();

3295 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {	3327 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {

3296 if (first_element_checked && i == 0 && j == 0) continue;	3328 if (first_element_checked && i == 0 && j == 0) continue;

3297 if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue;	3329 if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue;

3298 EmitCharacterFunction* emit_function = NULL;	3330 EmitCharacterFunction* emit_function = NULL;

3299 switch (pass) {	3331 switch (pass) {

3300 case NON_ASCII_MATCH:	3332 case NON_ASCII_MATCH:

3301 ASSERT(ascii);	3333 ASSERT(ascii);

3302 if (quarks[j] > String::kMaxAsciiCharCode) {	3334 if (quarks[j] > String::kMaxOneByteCharCode) {

3303 assembler->GoTo(backtrack);	3335 assembler->GoTo(backtrack);

3304 return;	3336 return;

3305 }	3337 }

3306 break;	3338 break;

3307 case NON_LETTER_CHARACTER_MATCH:	3339 case NON_LETTER_CHARACTER_MATCH:

3308 emit_function = &EmitAtomNonLetter;	3340 emit_function = &EmitAtomNonLetter;

3309 break;	3341 break;

3310 case SIMPLE_CHARACTER_MATCH:	3342 case SIMPLE_CHARACTER_MATCH:

3311 emit_function = &EmitSimpleCharacter;	3343 emit_function = &EmitSimpleCharacter;

3312 break;	3344 break;

(...skipping 178 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
3491 ZoneList<CharacterRange>* ranges = node->ranges(zone());	3523 ZoneList<CharacterRange>* ranges = node->ranges(zone());

3492 if (!CharacterRange::IsCanonical(ranges)) {	3524 if (!CharacterRange::IsCanonical(ranges)) {

3493 CharacterRange::Canonicalize(ranges);	3525 CharacterRange::Canonicalize(ranges);

3494 }	3526 }

3495 if (node->is_negated()) {	3527 if (node->is_negated()) {

3496 return ranges->length() == 0 ? on_success() : NULL;	3528 return ranges->length() == 0 ? on_success() : NULL;

3497 }	3529 }

3498 if (ranges->length() != 1) return NULL;	3530 if (ranges->length() != 1) return NULL;

3499 uint32_t max_char;	3531 uint32_t max_char;

3500 if (compiler->ascii()) {	3532 if (compiler->ascii()) {

3501 max_char = String::kMaxAsciiCharCode;	3533 max_char = String::kMaxOneByteCharCode;

3502 } else {	3534 } else {

3503 max_char = String::kMaxUtf16CodeUnit;	3535 max_char = String::kMaxUtf16CodeUnit;

3504 }	3536 }

3505 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;	3537 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;

3506 }	3538 }

3507	3539

3508	3540

3509 // Finds the fixed match length of a sequence of nodes that goes from	3541 // Finds the fixed match length of a sequence of nodes that goes from

3510 // this alternative and back to this choice node. If there are variable	3542 // this alternative and back to this choice node. If there are variable

3511 // length nodes or other complications in the way then return a sentinel	3543 // length nodes or other complications in the way then return a sentinel

(...skipping 179 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
3691 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;	3723 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;

3692 }	3724 }

3693 }	3725 }

3694	3726

3695	3727

3696 BoyerMooreLookahead::BoyerMooreLookahead(	3728 BoyerMooreLookahead::BoyerMooreLookahead(

3697 int length, RegExpCompiler* compiler, Zone* zone)	3729 int length, RegExpCompiler* compiler, Zone* zone)

3698 : length_(length),	3730 : length_(length),

3699 compiler_(compiler) {	3731 compiler_(compiler) {

3700 if (compiler->ascii()) {	3732 if (compiler->ascii()) {

3701 max_char_ = String::kMaxAsciiCharCode;	3733 max_char_ = String::kMaxOneByteCharCode;

3702 } else {	3734 } else {

3703 max_char_ = String::kMaxUtf16CodeUnit;	3735 max_char_ = String::kMaxUtf16CodeUnit;

3704 }	3736 }

3705 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);	3737 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);

3706 for (int i = 0; i < length; i++) {	3738 for (int i = 0; i < length; i++) {

3707 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);	3739 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);

3708 }	3740 }

3709 }	3741 }

3710	3742

3711	3743

(...skipping 1618 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
5330 }	5362 }

5331	5363

5332	5364

5333 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,	5365 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,

5334 bool is_ascii,	5366 bool is_ascii,

5335 Zone* zone) {	5367 Zone* zone) {

5336 Isolate* isolate = Isolate::Current();	5368 Isolate* isolate = Isolate::Current();

5337 uc16 bottom = from();	5369 uc16 bottom = from();

5338 uc16 top = to();	5370 uc16 top = to();

5339 if (is_ascii) {	5371 if (is_ascii) {

5340 if (bottom > String::kMaxAsciiCharCode) return;	5372 if (bottom > String::kMaxOneByteCharCode) return;

5341 if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode;	5373 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;

5342 }	5374 }

5343 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];	5375 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

5344 if (top == bottom) {	5376 if (top == bottom) {

5345 // If this is a singleton we just expand the one character.	5377 // If this is a singleton we just expand the one character.

5346 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);	5378 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);

5347 for (int i = 0; i < length; i++) {	5379 for (int i = 0; i < length; i++) {

5348 uc32 chr = chars[i];	5380 uc32 chr = chars[i];

5349 if (chr != bottom) {	5381 if (chr != bottom) {

5350 ranges->Add(CharacterRange::Singleton(chars[i]), zone);	5382 ranges->Add(CharacterRange::Singleton(chars[i]), zone);

5351 }	5383 }

(...skipping 526 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
5878 if (offset >= bm->length()) {	5910 if (offset >= bm->length()) {

5879 if (initial_offset == 0) set_bm_info(not_at_start, bm);	5911 if (initial_offset == 0) set_bm_info(not_at_start, bm);

5880 return;	5912 return;

5881 }	5913 }

5882 uc16 character = atom->data()[j];	5914 uc16 character = atom->data()[j];

5883 if (bm->compiler()->ignore_case()) {	5915 if (bm->compiler()->ignore_case()) {

5884 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];	5916 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

5885 int length = GetCaseIndependentLetters(	5917 int length = GetCaseIndependentLetters(

5886 ISOLATE,	5918 ISOLATE,

5887 character,	5919 character,

5888 bm->max_char() == String::kMaxAsciiCharCode,	5920 bm->max_char() == String::kMaxOneByteCharCode,

5889 chars);	5921 chars);

5890 for (int j = 0; j < length; j++) {	5922 for (int j = 0; j < length; j++) {

5891 bm->Set(offset, chars[j]);	5923 bm->Set(offset, chars[j]);

5892 }	5924 }

5893 } else {	5925 } else {

5894 if (character <= max_char) bm->Set(offset, character);	5926 if (character <= max_char) bm->Set(offset, character);

5895 }	5927 }

5896 }	5928 }

5897 } else {	5929 } else {

5898 ASSERT(text.type == TextElement::CHAR_CLASS);	5930 ASSERT(text.type == TextElement::CHAR_CLASS);

(...skipping 193 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
6092 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);	6124 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);

6093 first_step_node->AddAlternative(GuardedAlternative(captured_body));	6125 first_step_node->AddAlternative(GuardedAlternative(captured_body));

6094 first_step_node->AddAlternative(GuardedAlternative(	6126 first_step_node->AddAlternative(GuardedAlternative(

6095 new(zone) TextNode(new(zone) RegExpCharacterClass('*'), loop_node)));	6127 new(zone) TextNode(new(zone) RegExpCharacterClass('*'), loop_node)));

6096 node = first_step_node;	6128 node = first_step_node;

6097 } else {	6129 } else {

6098 node = loop_node;	6130 node = loop_node;

6099 }	6131 }

6100 }	6132 }

6101 if (is_ascii) {	6133 if (is_ascii) {

6102 node = node->FilterASCII(RegExpCompiler::kMaxRecursion);	6134 node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case);

6103 // Do it again to propagate the new nodes to places where they were not	6135 // Do it again to propagate the new nodes to places where they were not

6104 // put because they had not been calculated yet.	6136 // put because they had not been calculated yet.

6105 if (node != NULL) node = node->FilterASCII(RegExpCompiler::kMaxRecursion);	6137 if (node != NULL) {

	6138 node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case);

	6139 }

6106 }	6140 }

6107	6141

6108 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);	6142 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);

6109 data->node = node;	6143 data->node = node;

6110 Analysis analysis(ignore_case, is_ascii);	6144 Analysis analysis(ignore_case, is_ascii);

6111 analysis.EnsureAnalyzed(node);	6145 analysis.EnsureAnalyzed(node);

6112 if (analysis.has_failed()) {	6146 if (analysis.has_failed()) {

6113 const char* error_message = analysis.error_message();	6147 const char* error_message = analysis.error_message();

6114 return CompilationResult(error_message);	6148 return CompilationResult(error_message);

6115 }	6149 }

(...skipping 43 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
6159 }	6193 }

6160	6194

6161 return compiler.Assemble(&macro_assembler,	6195 return compiler.Assemble(&macro_assembler,

6162 node,	6196 node,

6163 data->capture_count,	6197 data->capture_count,

6164 pattern);	6198 pattern);

6165 }	6199 }

6166	6200

6167	6201

6168 }} // namespace v8::internal	6202 }} // namespace v8::internal

OLD	NEW

« no previous file with comments | « src/jsregexp.h ('k') | src/log.cc » ('j') | no next file with comments »