Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(719)

Side by Side Diff: src/jsregexp.cc

Issue 9600009: Fix input and output to handle UTF16 surrogate pairs. (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 8 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright 2011 the V8 project authors. All rights reserved. 1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 1426 matching lines...) Expand 10 before | Expand all | Expand 10 after
1437 1437
1438 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, 1438 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1439 bool ascii, 1439 bool ascii,
1440 uc16 c1, 1440 uc16 c1,
1441 uc16 c2, 1441 uc16 c2,
1442 Label* on_failure) { 1442 Label* on_failure) {
1443 uc16 char_mask; 1443 uc16 char_mask;
1444 if (ascii) { 1444 if (ascii) {
1445 char_mask = String::kMaxAsciiCharCode; 1445 char_mask = String::kMaxAsciiCharCode;
1446 } else { 1446 } else {
1447 char_mask = String::kMaxUC16CharCode; 1447 char_mask = String::kMaxUtf16CodeUnit;
1448 } 1448 }
1449 uc16 exor = c1 ^ c2; 1449 uc16 exor = c1 ^ c2;
1450 // Check whether exor has only one bit set. 1450 // Check whether exor has only one bit set.
1451 if (((exor - 1) & exor) == 0) { 1451 if (((exor - 1) & exor) == 0) {
1452 // If c1 and c2 differ only by one bit. 1452 // If c1 and c2 differ only by one bit.
1453 // Ecma262UnCanonicalize always gives the highest number last. 1453 // Ecma262UnCanonicalize always gives the highest number last.
1454 ASSERT(c2 > c1); 1454 ASSERT(c2 > c1);
1455 uc16 mask = char_mask ^ exor; 1455 uc16 mask = char_mask ^ exor;
1456 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure); 1456 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1457 return true; 1457 return true;
(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after
1539 bool ascii, 1539 bool ascii,
1540 Label* on_failure, 1540 Label* on_failure,
1541 int cp_offset, 1541 int cp_offset,
1542 bool check_offset, 1542 bool check_offset,
1543 bool preloaded) { 1543 bool preloaded) {
1544 ZoneList<CharacterRange>* ranges = cc->ranges(); 1544 ZoneList<CharacterRange>* ranges = cc->ranges();
1545 int max_char; 1545 int max_char;
1546 if (ascii) { 1546 if (ascii) {
1547 max_char = String::kMaxAsciiCharCode; 1547 max_char = String::kMaxAsciiCharCode;
1548 } else { 1548 } else {
1549 max_char = String::kMaxUC16CharCode; 1549 max_char = String::kMaxUtf16CodeUnit;
1550 } 1550 }
1551 1551
1552 Label success; 1552 Label success;
1553 1553
1554 Label* char_is_in_class = 1554 Label* char_is_in_class =
1555 cc->is_negated() ? on_failure : &success; 1555 cc->is_negated() ? on_failure : &success;
1556 1556
1557 int range_count = ranges->length(); 1557 int range_count = ranges->length();
1558 1558
1559 int last_valid_range = range_count - 1; 1559 int last_valid_range = range_count - 1;
(...skipping 75 matching lines...) Expand 10 before | Expand all | Expand 10 after
1635 macro_assembler->CheckNotCharacter(to, on_failure); 1635 macro_assembler->CheckNotCharacter(to, on_failure);
1636 } 1636 }
1637 } else { 1637 } else {
1638 if (from != 0) { 1638 if (from != 0) {
1639 if (cc->is_negated()) { 1639 if (cc->is_negated()) {
1640 macro_assembler->CheckCharacterLT(from, &success); 1640 macro_assembler->CheckCharacterLT(from, &success);
1641 } else { 1641 } else {
1642 macro_assembler->CheckCharacterLT(from, on_failure); 1642 macro_assembler->CheckCharacterLT(from, on_failure);
1643 } 1643 }
1644 } 1644 }
1645 if (to != String::kMaxUC16CharCode) { 1645 if (to != String::kMaxUtf16CodeUnit) {
1646 if (cc->is_negated()) { 1646 if (cc->is_negated()) {
1647 macro_assembler->CheckCharacterLT(to + 1, on_failure); 1647 macro_assembler->CheckCharacterLT(to + 1, on_failure);
1648 } else { 1648 } else {
1649 macro_assembler->CheckCharacterGT(to, on_failure); 1649 macro_assembler->CheckCharacterGT(to, on_failure);
1650 } 1650 }
1651 } else { 1651 } else {
1652 if (cc->is_negated()) { 1652 if (cc->is_negated()) {
1653 macro_assembler->GoTo(on_failure); 1653 macro_assembler->GoTo(on_failure);
1654 } 1654 }
1655 } 1655 }
(...skipping 172 matching lines...) Expand 10 before | Expand all | Expand 10 after
1828 return v; 1828 return v;
1829 } 1829 }
1830 1830
1831 1831
1832 bool QuickCheckDetails::Rationalize(bool asc) { 1832 bool QuickCheckDetails::Rationalize(bool asc) {
1833 bool found_useful_op = false; 1833 bool found_useful_op = false;
1834 uint32_t char_mask; 1834 uint32_t char_mask;
1835 if (asc) { 1835 if (asc) {
1836 char_mask = String::kMaxAsciiCharCode; 1836 char_mask = String::kMaxAsciiCharCode;
1837 } else { 1837 } else {
1838 char_mask = String::kMaxUC16CharCode; 1838 char_mask = String::kMaxUtf16CodeUnit;
1839 } 1839 }
1840 mask_ = 0; 1840 mask_ = 0;
1841 value_ = 0; 1841 value_ = 0;
1842 int char_shift = 0; 1842 int char_shift = 0;
1843 for (int i = 0; i < characters_; i++) { 1843 for (int i = 0; i < characters_; i++) {
1844 Position* pos = &positions_[i]; 1844 Position* pos = &positions_[i];
1845 if ((pos->mask & String::kMaxAsciiCharCode) != 0) { 1845 if ((pos->mask & String::kMaxAsciiCharCode) != 0) {
1846 found_useful_op = true; 1846 found_useful_op = true;
1847 } 1847 }
1848 mask_ |= (pos->mask & char_mask) << char_shift; 1848 mask_ |= (pos->mask & char_mask) << char_shift;
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after
1880 1880
1881 bool need_mask = true; 1881 bool need_mask = true;
1882 1882
1883 if (details->characters() == 1) { 1883 if (details->characters() == 1) {
1884 // If number of characters preloaded is 1 then we used a byte or 16 bit 1884 // If number of characters preloaded is 1 then we used a byte or 16 bit
1885 // load so the value is already masked down. 1885 // load so the value is already masked down.
1886 uint32_t char_mask; 1886 uint32_t char_mask;
1887 if (compiler->ascii()) { 1887 if (compiler->ascii()) {
1888 char_mask = String::kMaxAsciiCharCode; 1888 char_mask = String::kMaxAsciiCharCode;
1889 } else { 1889 } else {
1890 char_mask = String::kMaxUC16CharCode; 1890 char_mask = String::kMaxUtf16CodeUnit;
1891 } 1891 }
1892 if ((mask & char_mask) == char_mask) need_mask = false; 1892 if ((mask & char_mask) == char_mask) need_mask = false;
1893 mask &= char_mask; 1893 mask &= char_mask;
1894 } else { 1894 } else {
1895 // For 2-character preloads in ASCII mode or 1-character preloads in 1895 // For 2-character preloads in ASCII mode or 1-character preloads in
1896 // TWO_BYTE mode we also use a 16 bit load with zero extend. 1896 // TWO_BYTE mode we also use a 16 bit load with zero extend.
1897 if (details->characters() == 2 && compiler->ascii()) { 1897 if (details->characters() == 2 && compiler->ascii()) {
1898 if ((mask & 0x7f7f) == 0x7f7f) need_mask = false; 1898 if ((mask & 0x7f7f) == 0x7f7f) need_mask = false;
1899 } else if (details->characters() == 1 && !compiler->ascii()) { 1899 } else if (details->characters() == 1 && !compiler->ascii()) {
1900 if ((mask & 0xffff) == 0xffff) need_mask = false; 1900 if ((mask & 0xffff) == 0xffff) need_mask = false;
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after
1932 RegExpCompiler* compiler, 1932 RegExpCompiler* compiler,
1933 int characters_filled_in, 1933 int characters_filled_in,
1934 bool not_at_start) { 1934 bool not_at_start) {
1935 Isolate* isolate = Isolate::Current(); 1935 Isolate* isolate = Isolate::Current();
1936 ASSERT(characters_filled_in < details->characters()); 1936 ASSERT(characters_filled_in < details->characters());
1937 int characters = details->characters(); 1937 int characters = details->characters();
1938 int char_mask; 1938 int char_mask;
1939 if (compiler->ascii()) { 1939 if (compiler->ascii()) {
1940 char_mask = String::kMaxAsciiCharCode; 1940 char_mask = String::kMaxAsciiCharCode;
1941 } else { 1941 } else {
1942 char_mask = String::kMaxUC16CharCode; 1942 char_mask = String::kMaxUtf16CodeUnit;
1943 } 1943 }
1944 for (int k = 0; k < elms_->length(); k++) { 1944 for (int k = 0; k < elms_->length(); k++) {
1945 TextElement elm = elms_->at(k); 1945 TextElement elm = elms_->at(k);
1946 if (elm.type == TextElement::ATOM) { 1946 if (elm.type == TextElement::ATOM) {
1947 Vector<const uc16> quarks = elm.data.u_atom->data(); 1947 Vector<const uc16> quarks = elm.data.u_atom->data();
1948 for (int i = 0; i < characters && i < quarks.length(); i++) { 1948 for (int i = 0; i < characters && i < quarks.length(); i++) {
1949 QuickCheckDetails::Position* pos = 1949 QuickCheckDetails::Position* pos =
1950 details->positions(characters_filled_in); 1950 details->positions(characters_filled_in);
1951 uc16 c = quarks[i]; 1951 uc16 c = quarks[i];
1952 if (c > char_mask) { 1952 if (c > char_mask) {
(...skipping 2119 matching lines...) Expand 10 before | Expand all | Expand 10 after
4072 ASSERT(elmv[i] <= elmv[i + 1]); 4072 ASSERT(elmv[i] <= elmv[i + 1]);
4073 ranges->Add(CharacterRange(elmv[i], elmv[i + 1])); 4073 ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));
4074 } 4074 }
4075 } 4075 }
4076 4076
4077 4077
4078 static void AddClassNegated(const uc16 *elmv, 4078 static void AddClassNegated(const uc16 *elmv,
4079 int elmc, 4079 int elmc,
4080 ZoneList<CharacterRange>* ranges) { 4080 ZoneList<CharacterRange>* ranges) {
4081 ASSERT(elmv[0] != 0x0000); 4081 ASSERT(elmv[0] != 0x0000);
4082 ASSERT(elmv[elmc-1] != String::kMaxUC16CharCode); 4082 ASSERT(elmv[elmc-1] != String::kMaxUtf16CodeUnit);
4083 uc16 last = 0x0000; 4083 uc16 last = 0x0000;
4084 for (int i = 0; i < elmc; i += 2) { 4084 for (int i = 0; i < elmc; i += 2) {
4085 ASSERT(last <= elmv[i] - 1); 4085 ASSERT(last <= elmv[i] - 1);
4086 ASSERT(elmv[i] <= elmv[i + 1]); 4086 ASSERT(elmv[i] <= elmv[i + 1]);
4087 ranges->Add(CharacterRange(last, elmv[i] - 1)); 4087 ranges->Add(CharacterRange(last, elmv[i] - 1));
4088 last = elmv[i + 1] + 1; 4088 last = elmv[i + 1] + 1;
4089 } 4089 }
4090 ranges->Add(CharacterRange(last, String::kMaxUC16CharCode)); 4090 ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit));
4091 } 4091 }
4092 4092
4093 4093
4094 void CharacterRange::AddClassEscape(uc16 type, 4094 void CharacterRange::AddClassEscape(uc16 type,
4095 ZoneList<CharacterRange>* ranges) { 4095 ZoneList<CharacterRange>* ranges) {
4096 switch (type) { 4096 switch (type) {
4097 case 's': 4097 case 's':
4098 AddClass(kSpaceRanges, kSpaceRangeCount, ranges); 4098 AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
4099 break; 4099 break;
4100 case 'S': 4100 case 'S':
(...skipping 525 matching lines...) Expand 10 before | Expand all | Expand 10 after
4626 if (range_count > 0 && ranges->at(0).from() == 0) { 4626 if (range_count > 0 && ranges->at(0).from() == 0) {
4627 from = ranges->at(0).to(); 4627 from = ranges->at(0).to();
4628 i = 1; 4628 i = 1;
4629 } 4629 }
4630 while (i < range_count) { 4630 while (i < range_count) {
4631 CharacterRange range = ranges->at(i); 4631 CharacterRange range = ranges->at(i);
4632 negated_ranges->Add(CharacterRange(from + 1, range.from() - 1)); 4632 negated_ranges->Add(CharacterRange(from + 1, range.from() - 1));
4633 from = range.to(); 4633 from = range.to();
4634 i++; 4634 i++;
4635 } 4635 }
4636 if (from < String::kMaxUC16CharCode) { 4636 if (from < String::kMaxUtf16CodeUnit) {
4637 negated_ranges->Add(CharacterRange(from + 1, String::kMaxUC16CharCode)); 4637 negated_ranges->Add(CharacterRange(from + 1, String::kMaxUtf16CodeUnit));
4638 } 4638 }
4639 } 4639 }
4640 4640
4641 4641
4642 4642
4643 // ------------------------------------------------------------------- 4643 // -------------------------------------------------------------------
4644 // Interest propagation 4644 // Interest propagation
4645 4645
4646 4646
4647 RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) { 4647 RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) {
(...skipping 142 matching lines...) Expand 10 before | Expand all | Expand 10 after
4790 entry->out_set())); 4790 entry->out_set()));
4791 entry->set_to(current.to()); 4791 entry->set_to(current.to());
4792 } 4792 }
4793 ASSERT(entry->to() <= current.to()); 4793 ASSERT(entry->to() <= current.to());
4794 // The overlapping range is now completely contained by the range 4794 // The overlapping range is now completely contained by the range
4795 // we're adding so we can just update it and move the start point 4795 // we're adding so we can just update it and move the start point
4796 // of the range we're adding just past it. 4796 // of the range we're adding just past it.
4797 entry->AddValue(value); 4797 entry->AddValue(value);
4798 // Bail out if the last interval ended at 0xFFFF since otherwise 4798 // Bail out if the last interval ended at 0xFFFF since otherwise
4799 // adding 1 will wrap around to 0. 4799 // adding 1 will wrap around to 0.
4800 if (entry->to() == String::kMaxUC16CharCode) 4800 if (entry->to() == String::kMaxUtf16CodeUnit)
4801 break; 4801 break;
4802 ASSERT(entry->to() + 1 > current.from()); 4802 ASSERT(entry->to() + 1 > current.from());
4803 current.set_from(entry->to() + 1); 4803 current.set_from(entry->to() + 1);
4804 } else { 4804 } else {
4805 // There is no overlap so we can just add the range 4805 // There is no overlap so we can just add the range
4806 ZoneSplayTree<Config>::Locator ins; 4806 ZoneSplayTree<Config>::Locator ins;
4807 ASSERT_RESULT(tree()->Insert(current.from(), &ins)); 4807 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
4808 ins.set_value(Entry(current.from(), 4808 ins.set_value(Entry(current.from(),
4809 current.to(), 4809 current.to(),
4810 empty()->Extend(value))); 4810 empty()->Extend(value)));
(...skipping 299 matching lines...) Expand 10 before | Expand all | Expand 10 after
5110 RegExpCharacterClass* char_class = text.data.u_char_class; 5110 RegExpCharacterClass* char_class = text.data.u_char_class;
5111 ZoneList<CharacterRange>* ranges = char_class->ranges(); 5111 ZoneList<CharacterRange>* ranges = char_class->ranges();
5112 // TODO(lrn): Canonicalize ranges when they are created 5112 // TODO(lrn): Canonicalize ranges when they are created
5113 // instead of waiting until now. 5113 // instead of waiting until now.
5114 CharacterRange::Canonicalize(ranges); 5114 CharacterRange::Canonicalize(ranges);
5115 if (char_class->is_negated()) { 5115 if (char_class->is_negated()) {
5116 int length = ranges->length(); 5116 int length = ranges->length();
5117 int new_length = length + 1; 5117 int new_length = length + 1;
5118 if (length > 0) { 5118 if (length > 0) {
5119 if (ranges->at(0).from() == 0) new_length--; 5119 if (ranges->at(0).from() == 0) new_length--;
5120 if (ranges->at(length - 1).to() == String::kMaxUC16CharCode) { 5120 if (ranges->at(length - 1).to() == String::kMaxUtf16CodeUnit) {
5121 new_length--; 5121 new_length--;
5122 } 5122 }
5123 } 5123 }
5124 ZoneList<CharacterRange>* negated_ranges = 5124 ZoneList<CharacterRange>* negated_ranges =
5125 new ZoneList<CharacterRange>(new_length); 5125 new ZoneList<CharacterRange>(new_length);
5126 CharacterRange::Negate(ranges, negated_ranges); 5126 CharacterRange::Negate(ranges, negated_ranges);
5127 set_first_character_set(negated_ranges); 5127 set_first_character_set(negated_ranges);
5128 } else { 5128 } else {
5129 set_first_character_set(ranges); 5129 set_first_character_set(ranges);
5130 } 5130 }
(...skipping 69 matching lines...) Expand 10 before | Expand all | Expand 10 after
5200 5200
5201 5201
5202 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) { 5202 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
5203 ranges->Sort(CompareRangeByFrom); 5203 ranges->Sort(CompareRangeByFrom);
5204 uc16 last = 0; 5204 uc16 last = 0;
5205 for (int i = 0; i < ranges->length(); i++) { 5205 for (int i = 0; i < ranges->length(); i++) {
5206 CharacterRange range = ranges->at(i); 5206 CharacterRange range = ranges->at(i);
5207 if (last < range.from()) 5207 if (last < range.from())
5208 AddRange(CharacterRange(last, range.from() - 1)); 5208 AddRange(CharacterRange(last, range.from() - 1));
5209 if (range.to() >= last) { 5209 if (range.to() >= last) {
5210 if (range.to() == String::kMaxUC16CharCode) { 5210 if (range.to() == String::kMaxUtf16CodeUnit) {
5211 return; 5211 return;
5212 } else { 5212 } else {
5213 last = range.to() + 1; 5213 last = range.to() + 1;
5214 } 5214 }
5215 } 5215 }
5216 } 5216 }
5217 AddRange(CharacterRange(last, String::kMaxUC16CharCode)); 5217 AddRange(CharacterRange(last, String::kMaxUtf16CodeUnit));
5218 } 5218 }
5219 5219
5220 5220
5221 void DispatchTableConstructor::VisitText(TextNode* that) { 5221 void DispatchTableConstructor::VisitText(TextNode* that) {
5222 TextElement elm = that->elements()->at(0); 5222 TextElement elm = that->elements()->at(0);
5223 switch (elm.type) { 5223 switch (elm.type) {
5224 case TextElement::ATOM: { 5224 case TextElement::ATOM: {
5225 uc16 c = elm.data.u_atom->data()[0]; 5225 uc16 c = elm.data.u_atom->data()[0];
5226 AddRange(CharacterRange(c, c)); 5226 AddRange(CharacterRange(c, c));
5227 break; 5227 break;
(...skipping 106 matching lines...) Expand 10 before | Expand all | Expand 10 after
5334 } 5334 }
5335 5335
5336 return compiler.Assemble(&macro_assembler, 5336 return compiler.Assemble(&macro_assembler,
5337 node, 5337 node,
5338 data->capture_count, 5338 data->capture_count,
5339 pattern); 5339 pattern);
5340 } 5340 }
5341 5341
5342 5342
5343 }} // namespace v8::internal 5343 }} // namespace v8::internal
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698