OLD | NEW |
1 // Copyright 2012 the V8 project authors. All rights reserved. | 1 // Copyright 2012 the V8 project authors. All rights reserved. |
2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
4 // met: | 4 // met: |
5 // | 5 // |
6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
(...skipping 2837 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2848 | 2848 |
2849 | 2849 |
2850 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) { | 2850 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) { |
2851 RegExpNode* next = on_success_->FilterASCII(depth - 1, ignore_case); | 2851 RegExpNode* next = on_success_->FilterASCII(depth - 1, ignore_case); |
2852 if (next == NULL) return set_replacement(NULL); | 2852 if (next == NULL) return set_replacement(NULL); |
2853 on_success_ = next; | 2853 on_success_ = next; |
2854 return set_replacement(this); | 2854 return set_replacement(this); |
2855 } | 2855 } |
2856 | 2856 |
2857 | 2857 |
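| 2858 // We need to check for the following characters, which lie outside Latin-1 but have a case equivalent inside it: 0x39c and 0x3bc (Greek capital/small mu, equivalent to 0xb5) and 0x178 (capital Y with diaeresis, equivalent to 0xff). |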
| 2859 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) { |
| 2860 #ifdef ENABLE_LATIN_1 |
| 2861 // TODO(dcarney): this could be a lot more efficient. |
| 2862 return range.Contains(0x39c) || |
| 2863 range.Contains(0x3bc) || range.Contains(0x178); |
| 2864 #else |
| 2865 return false; |
| 2866 #endif |
| 2867 } |
| 2868 |
| 2869 |
| 2870 #ifdef ENABLE_LATIN_1 |
| 2871 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) { |
| 2872 for (int i = 0; i < ranges->length(); i++) { |
| 2873 // TODO(dcarney): this could be a lot more efficient. |
| 2874 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true; |
| 2875 } |
| 2876 return false; |
| 2877 } |
| 2878 #endif |
| 2879 |
| 2880 |
2858 RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) { | 2881 RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) { |
2859 if (info()->replacement_calculated) return replacement(); | 2882 if (info()->replacement_calculated) return replacement(); |
2860 if (depth < 0) return this; | 2883 if (depth < 0) return this; |
2861 ASSERT(!info()->visited); | 2884 ASSERT(!info()->visited); |
2862 VisitMarker marker(info()); | 2885 VisitMarker marker(info()); |
2863 int element_count = elms_->length(); | 2886 int element_count = elms_->length(); |
2864 for (int i = 0; i < element_count; i++) { | 2887 for (int i = 0; i < element_count; i++) { |
2865 TextElement elm = elms_->at(i); | 2888 TextElement elm = elms_->at(i); |
2866 if (elm.type == TextElement::ATOM) { | 2889 if (elm.type == TextElement::ATOM) { |
2867 Vector<const uc16> quarks = elm.data.u_atom->data(); | 2890 Vector<const uc16> quarks = elm.data.u_atom->data(); |
2868 for (int j = 0; j < quarks.length(); j++) { | 2891 for (int j = 0; j < quarks.length(); j++) { |
2869 #ifndef ENABLE_LATIN_1 | 2892 #ifndef ENABLE_LATIN_1 |
2870 if (quarks[j] > String::kMaxOneByteCharCode) { | 2893 if (quarks[j] > String::kMaxOneByteCharCode) { |
2871 return set_replacement(NULL); | 2894 return set_replacement(NULL); |
2872 } | 2895 } |
2873 #else | 2896 #else |
2874 if (quarks[j] <= String::kMaxOneByteCharCode) continue; | 2897 uint16_t c = quarks[j]; |
| 2898 if (c <= String::kMaxOneByteCharCode) continue; |
2875 if (!ignore_case) return set_replacement(NULL); | 2899 if (!ignore_case) return set_replacement(NULL); |
2876 // Here, we need to check for characters whose upper and lower cases | 2900 // Here, we need to check for characters whose upper and lower cases |
2877 // are outside the Latin-1 range. | 2901 // are outside the Latin-1 range. |
2878 if (!unibrow::Latin1::NonLatin1CanBeConvertedToLatin1(quarks[j])) { | 2902 uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c); |
2879 return set_replacement(NULL); | 2903 // Character is outside Latin-1 completely: it has no Latin-1 case equivalent. |
2880 } | 2904 if (converted == 0) return set_replacement(NULL); |
| 2905 // Convert quark to Latin-1 in place. |
| 2906 uint16_t* copy = const_cast<uint16_t*>(quarks.start()); |
| 2907 copy[j] = converted; |
2881 #endif | 2908 #endif |
2882 } | 2909 } |
2883 } else { | 2910 } else { |
2884 ASSERT(elm.type == TextElement::CHAR_CLASS); | 2911 ASSERT(elm.type == TextElement::CHAR_CLASS); |
2885 #ifdef ENABLE_LATIN_1 | |
2886 // TODO(dcarney): Can this be improved? | |
2887 if (ignore_case) continue; | |
2888 #endif | |
2889 RegExpCharacterClass* cc = elm.data.u_char_class; | 2912 RegExpCharacterClass* cc = elm.data.u_char_class; |
2890 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); | 2913 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); |
2891 if (!CharacterRange::IsCanonical(ranges)) { | 2914 if (!CharacterRange::IsCanonical(ranges)) { |
2892 CharacterRange::Canonicalize(ranges); | 2915 CharacterRange::Canonicalize(ranges); |
2893 } | 2916 } |
2894 // Now they are in order so we only need to look at the first. | 2917 // Now they are in order so we only need to look at the first. |
2895 int range_count = ranges->length(); | 2918 int range_count = ranges->length(); |
2896 if (cc->is_negated()) { | 2919 if (cc->is_negated()) { |
2897 if (range_count != 0 && | 2920 if (range_count != 0 && |
2898 ranges->at(0).from() == 0 && | 2921 ranges->at(0).from() == 0 && |
2899 ranges->at(0).to() >= String::kMaxOneByteCharCode) { | 2922 ranges->at(0).to() >= String::kMaxOneByteCharCode) { |
| 2923 #ifdef ENABLE_LATIN_1 |
| 2924 // This will be handled in a later filter. |
| 2925 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; |
| 2926 #endif |
2900 return set_replacement(NULL); | 2927 return set_replacement(NULL); |
2901 } | 2928 } |
2902 } else { | 2929 } else { |
2903 if (range_count == 0 || | 2930 if (range_count == 0 || |
2904 ranges->at(0).from() > String::kMaxOneByteCharCode) { | 2931 ranges->at(0).from() > String::kMaxOneByteCharCode) { |
| 2932 #ifdef ENABLE_LATIN_1 |
| 2933 // This will be handled in a later filter. |
| 2934 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; |
| 2935 #endif |
2905 return set_replacement(NULL); | 2936 return set_replacement(NULL); |
2906 } | 2937 } |
2907 } | 2938 } |
2908 } | 2939 } |
2909 } | 2940 } |
2910 return FilterSuccessor(depth - 1, ignore_case); | 2941 return FilterSuccessor(depth - 1, ignore_case); |
2911 } | 2942 } |
2912 | 2943 |
2913 | 2944 |
2914 RegExpNode* LoopChoiceNode::FilterASCII(int depth, bool ignore_case) { | 2945 RegExpNode* LoopChoiceNode::FilterASCII(int depth, bool ignore_case) { |
(...skipping 2432 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5347 table.ForEach(&callback); | 5378 table.ForEach(&callback); |
5348 } | 5379 } |
5349 | 5380 |
5350 | 5381 |
5351 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, | 5382 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, |
5352 bool is_ascii, | 5383 bool is_ascii, |
5353 Zone* zone) { | 5384 Zone* zone) { |
5354 Isolate* isolate = Isolate::Current(); | 5385 Isolate* isolate = Isolate::Current(); |
5355 uc16 bottom = from(); | 5386 uc16 bottom = from(); |
5356 uc16 top = to(); | 5387 uc16 top = to(); |
5357 if (is_ascii) { | 5388 if (is_ascii && !RangeContainsLatin1Equivalents(*this)) { |
5358 if (bottom > String::kMaxOneByteCharCode) return; | 5389 if (bottom > String::kMaxOneByteCharCode) return; |
5359 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; | 5390 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; |
5360 } | 5391 } |
5361 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 5392 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
5362 if (top == bottom) { | 5393 if (top == bottom) { |
5363 // If this is a singleton we just expand the one character. | 5394 // If this is a singleton we just expand the one character. |
5364 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); | 5395 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); |
5365 for (int i = 0; i < length; i++) { | 5396 for (int i = 0; i < length; i++) { |
5366 uc32 chr = chars[i]; | 5397 uc32 chr = chars[i]; |
5367 if (chr != bottom) { | 5398 if (chr != bottom) { |
(...skipping 811 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
6179 } | 6210 } |
6180 | 6211 |
6181 return compiler.Assemble(&macro_assembler, | 6212 return compiler.Assemble(&macro_assembler, |
6182 node, | 6213 node, |
6183 data->capture_count, | 6214 data->capture_count, |
6184 pattern); | 6215 pattern); |
6185 } | 6216 } |
6186 | 6217 |
6187 | 6218 |
6188 }} // namespace v8::internal | 6219 }} // namespace v8::internal |
OLD | NEW |