src/jsregexp.cc - Issue 11962035: Fix some latin-1 webkit units tests

Side by Side Diff: src/jsregexp.cc

Issue 11962035: Fix some latin-1 webkit units tests (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge

Patch Set: Fixed last latin-1 webkit test failure Created 7 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2012 the V8 project authors. All rights reserved.	1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 2837 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
2848	2848

2849	2849

2850 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {	2850 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {

2851 RegExpNode* next = on_success_->FilterASCII(depth - 1, ignore_case);	2851 RegExpNode* next = on_success_->FilterASCII(depth - 1, ignore_case);

2852 if (next == NULL) return set_replacement(NULL);	2852 if (next == NULL) return set_replacement(NULL);

2853 on_success_ = next;	2853 on_success_ = next;

2854 return set_replacement(this);	2854 return set_replacement(this);

2855 }	2855 }

2856	2856

2857	2857

	2858 // We need to check for the following characters: 0x39c 0x3bc 0x178.

	2859 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {

	2860 #ifdef ENABLE_LATIN_1

	2861 // TODO(dcarney): this could be a lot more efficient.

	2862 return range.Contains(0x39c) ||

	2863 range.Contains(0x3bc) || range.Contains(0x178);

	2864 #else

	2865 return false;

	2866 #endif

	2867 }

	2868

	2869

	2870 #ifdef ENABLE_LATIN_1

	2871 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {

	2872 for (int i = 0; i < ranges->length(); i++) {

	2873 // TODO(dcarney): this could be a lot more efficient.

	2874 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;

	2875 }

	2876 return false;

	2877 }

	2878 #endif

	2879

	2880

2858 RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {	2881 RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {

2859 if (info()->replacement_calculated) return replacement();	2882 if (info()->replacement_calculated) return replacement();

2860 if (depth < 0) return this;	2883 if (depth < 0) return this;

2861 ASSERT(!info()->visited);	2884 ASSERT(!info()->visited);

2862 VisitMarker marker(info());	2885 VisitMarker marker(info());

2863 int element_count = elms_->length();	2886 int element_count = elms_->length();

2864 for (int i = 0; i < element_count; i++) {	2887 for (int i = 0; i < element_count; i++) {

2865 TextElement elm = elms_->at(i);	2888 TextElement elm = elms_->at(i);

2866 if (elm.type == TextElement::ATOM) {	2889 if (elm.type == TextElement::ATOM) {

2867 Vector<const uc16> quarks = elm.data.u_atom->data();	2890 Vector<const uc16> quarks = elm.data.u_atom->data();

2868 for (int j = 0; j < quarks.length(); j++) {	2891 for (int j = 0; j < quarks.length(); j++) {

2869 #ifndef ENABLE_LATIN_1	2892 #ifndef ENABLE_LATIN_1

2870 if (quarks[j] > String::kMaxOneByteCharCode) {	2893 if (quarks[j] > String::kMaxOneByteCharCode) {

2871 return set_replacement(NULL);	2894 return set_replacement(NULL);

2872 }	2895 }

2873 #else	2896 #else

2874 if (quarks[j] <= String::kMaxOneByteCharCode) continue;	2897 uint16_t c = quarks[j];

	2898 if (c <= String::kMaxOneByteCharCode) continue;

2875 if (!ignore_case) return set_replacement(NULL);	2899 if (!ignore_case) return set_replacement(NULL);

2876 // Here, we need to check for characters whose upper and lower cases	2900 // Here, we need to check for characters whose upper and lower cases

2877 // are outside the Latin-1 range.	2901 // are outside the Latin-1 range.

2878 if (!unibrow::Latin1::NonLatin1CanBeConvertedToLatin1(quarks[j])) {	2902 uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);

2879 return set_replacement(NULL);	2903 // Character is outside Latin-1 completely

2880 }	2904 if (converted == 0) return set_replacement(NULL);

	2905 // Convert quark to Latin-1 in place.

	2906 uint16_t* copy = const_cast<uint16_t*>(quarks.start());

	2907 copy[j] = converted;

2881 #endif	2908 #endif

2882 }	2909 }

2883 } else {	2910 } else {

2884 ASSERT(elm.type == TextElement::CHAR_CLASS);	2911 ASSERT(elm.type == TextElement::CHAR_CLASS);

2885 #ifdef ENABLE_LATIN_1

2886 // TODO(dcarney): Can this be improved?

2887 if (ignore_case) continue;

2888 #endif

2889 RegExpCharacterClass* cc = elm.data.u_char_class;	2912 RegExpCharacterClass* cc = elm.data.u_char_class;

2890 ZoneList<CharacterRange>* ranges = cc->ranges(zone());	2913 ZoneList<CharacterRange>* ranges = cc->ranges(zone());

2891 if (!CharacterRange::IsCanonical(ranges)) {	2914 if (!CharacterRange::IsCanonical(ranges)) {

2892 CharacterRange::Canonicalize(ranges);	2915 CharacterRange::Canonicalize(ranges);

2893 }	2916 }

2894 // Now they are in order so we only need to look at the first.	2917 // Now they are in order so we only need to look at the first.

2895 int range_count = ranges->length();	2918 int range_count = ranges->length();

2896 if (cc->is_negated()) {	2919 if (cc->is_negated()) {

2897 if (range_count != 0 &&	2920 if (range_count != 0 &&

2898 ranges->at(0).from() == 0 &&	2921 ranges->at(0).from() == 0 &&

2899 ranges->at(0).to() >= String::kMaxOneByteCharCode) {	2922 ranges->at(0).to() >= String::kMaxOneByteCharCode) {

	2923 #ifdef ENABLE_LATIN_1

	2924 // This will be handled in a later filter.

	2925 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;

	2926 #endif

2900 return set_replacement(NULL);	2927 return set_replacement(NULL);

2901 }	2928 }

2902 } else {	2929 } else {

2903 if (range_count == 0 ||	2930 if (range_count == 0 ||

2904 ranges->at(0).from() > String::kMaxOneByteCharCode) {	2931 ranges->at(0).from() > String::kMaxOneByteCharCode) {

	2932 #ifdef ENABLE_LATIN_1

	2933 // This will be handled in a later filter.

	2934 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;

	2935 #endif

2905 return set_replacement(NULL);	2936 return set_replacement(NULL);

2906 }	2937 }

2907 }	2938 }

2908 }	2939 }

2909 }	2940 }

2910 return FilterSuccessor(depth - 1, ignore_case);	2941 return FilterSuccessor(depth - 1, ignore_case);

2911 }	2942 }

2912	2943

2913	2944

2914 RegExpNode* LoopChoiceNode::FilterASCII(int depth, bool ignore_case) {	2945 RegExpNode* LoopChoiceNode::FilterASCII(int depth, bool ignore_case) {

(...skipping 2432 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
5347 table.ForEach(&callback);	5378 table.ForEach(&callback);

5348 }	5379 }

5349	5380

5350	5381

5351 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,	5382 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,

5352 bool is_ascii,	5383 bool is_ascii,

5353 Zone* zone) {	5384 Zone* zone) {

5354 Isolate* isolate = Isolate::Current();	5385 Isolate* isolate = Isolate::Current();

5355 uc16 bottom = from();	5386 uc16 bottom = from();

5356 uc16 top = to();	5387 uc16 top = to();

5357 if (is_ascii) {	5388 if (is_ascii && !RangeContainsLatin1Equivalents(*this)) {

5358 if (bottom > String::kMaxOneByteCharCode) return;	5389 if (bottom > String::kMaxOneByteCharCode) return;

5359 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;	5390 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;

5360 }	5391 }

5361 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];	5392 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

5362 if (top == bottom) {	5393 if (top == bottom) {

5363 // If this is a singleton we just expand the one character.	5394 // If this is a singleton we just expand the one character.

5364 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);	5395 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);

5365 for (int i = 0; i < length; i++) {	5396 for (int i = 0; i < length; i++) {

5366 uc32 chr = chars[i];	5397 uc32 chr = chars[i];

5367 if (chr != bottom) {	5398 if (chr != bottom) {

(...skipping 811 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
6179 }	6210 }

6180	6211

6181 return compiler.Assemble(&macro_assembler,	6212 return compiler.Assemble(&macro_assembler,

6182 node,	6213 node,

6183 data->capture_count,	6214 data->capture_count,

6184 pattern);	6215 pattern);

6185 }	6216 }

6186	6217

6187	6218

6188 }} // namespace v8::internal	6219 }} // namespace v8::internal

OLD	NEW

« no previous file with comments | « no previous file | src/regexp-macro-assembler.h » ('j') | no next file with comments »