src/objects.cc - Issue 9600009: Fix input and output to handle UTF16 surrogate pairs.

Side by Side Diff: src/objects.cc

Issue 9600009: Fix input and output to handle UTF16 surrogate pairs. (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: Created 8 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2012 the V8 project authors. All rights reserved.	1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 6025 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
6036	6036

6037 // Negative length means to the end of the string.	6037 // Negative length means to the end of the string.

6038 if (length < 0) length = kMaxInt - offset;	6038 if (length < 0) length = kMaxInt - offset;

6039	6039

6040 // Compute the size of the UTF-8 string. Start at the specified offset.	6040 // Compute the size of the UTF-8 string. Start at the specified offset.

6041 Access<StringInputBuffer> buffer(	6041 Access<StringInputBuffer> buffer(

6042 heap->isolate()->objects_string_input_buffer());	6042 heap->isolate()->objects_string_input_buffer());

6043 buffer->Reset(offset, this);	6043 buffer->Reset(offset, this);

6044 int character_position = offset;	6044 int character_position = offset;

6045 int utf8_bytes = 0;	6045 int utf8_bytes = 0;

	6046 int last = unibrow::Utf16::kNoPreviousCharacter;

6046 while (buffer->has_more() && character_position++ < offset + length) {	6047 while (buffer->has_more() && character_position++ < offset + length) {

6047 uint16_t character = buffer->GetNext();	6048 uint16_t character = buffer->GetNext();

6048 utf8_bytes += unibrow::Utf8::Length(character);	6049 utf8_bytes += unibrow::Utf8::Length(character, last);

	6050 last = character;

6049 }	6051 }

6050	6052

6051 if (length_return) {	6053 if (length_return) {

6052 *length_return = utf8_bytes;	6054 *length_return = utf8_bytes;

6053 }	6055 }

6054	6056

6055 char* result = NewArray<char>(utf8_bytes + 1);	6057 char* result = NewArray<char>(utf8_bytes + 1);

6056	6058

6057 // Convert the UTF-16 string to a UTF-8 buffer. Start at the specified offset.	6059 // Convert the UTF-16 string to a UTF-8 buffer. Start at the specified offset.

6058 buffer->Rewind();	6060 buffer->Rewind();

6059 buffer->Seek(offset);	6061 buffer->Seek(offset);

6060 character_position = offset;	6062 character_position = offset;

6061 int utf8_byte_position = 0;	6063 int utf8_byte_position = 0;

	6064 last = unibrow::Utf16::kNoPreviousCharacter;

6062 while (buffer->has_more() && character_position++ < offset + length) {	6065 while (buffer->has_more() && character_position++ < offset + length) {

6063 uint16_t character = buffer->GetNext();	6066 uint16_t character = buffer->GetNext();

6064 if (allow_nulls == DISALLOW_NULLS && character == 0) {	6067 if (allow_nulls == DISALLOW_NULLS && character == 0) {

6065 character = ' ';	6068 character = ' ';

6066 }	6069 }

6067 utf8_byte_position +=	6070 utf8_byte_position +=

6068 unibrow::Utf8::Encode(result + utf8_byte_position, character);	6071 unibrow::Utf8::Encode(result + utf8_byte_position, character, last);

	6072 last = character;

6069 }	6073 }

6070 result[utf8_byte_position] = 0;	6074 result[utf8_byte_position] = 0;

6071 return SmartArrayPointer<char>(result);	6075 return SmartArrayPointer<char>(result);

6072 }	6076 }

6073	6077

6074	6078

6075 SmartArrayPointer<char> String::ToCString(AllowNullsFlag allow_nulls,	6079 SmartArrayPointer<char> String::ToCString(AllowNullsFlag allow_nulls,

6076 RobustnessFlag robust_flag,	6080 RobustnessFlag robust_flag,

6077 int* length_return) {	6081 int* length_return) {

6078 return ToCString(allow_nulls, robust_flag, 0, -1, length_return);	6082 return ToCString(allow_nulls, robust_flag, 0, -1, length_return);

(...skipping 293 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
6372 max_chars);	6376 max_chars);

6373 default:	6377 default:

6374 break;	6378 break;

6375 }	6379 }

6376	6380

6377 UNREACHABLE();	6381 UNREACHABLE();

6378 return 0;	6382 return 0;

6379 }	6383 }

6380	6384

6381	6385

6382 // This method determines the type of string involved and then gets the UTF8

6383 // length of the string. It doesn't flatten the string and has log(n) recursion

6384 // for a string of length n.

6385 int String::Utf8Length(String* input, int from, int to) {

6386 if (from == to) return 0;

6387 int total = 0;

6388 while (true) {

6389 if (input->IsAsciiRepresentation()) return total + to - from;

6390 switch (StringShape(input).representation_tag()) {

6391 case kConsStringTag: {

6392 ConsString* str = ConsString::cast(input);

6393 String* first = str->first();

6394 String* second = str->second();

6395 int first_length = first->length();

6396 if (first_length - from < to - first_length) {

6397 if (first_length > from) {

6398 // Left hand side is shorter.

6399 total += Utf8Length(first, from, first_length);

6400 input = second;

6401 from = 0;

6402 to -= first_length;

6403 } else {

6404 // We only need the right hand side.

6405 input = second;

6406 from -= first_length;

6407 to -= first_length;

6408 }

6409 } else {

6410 if (first_length <= to) {

6411 // Right hand side is shorter.

6412 total += Utf8Length(second, 0, to - first_length);

6413 input = first;

6414 to = first_length;

6415 } else {

6416 // We only need the left hand side.

6417 input = first;

6418 }

6419 }

6420 continue;

6421 }

6422 case kExternalStringTag:

6423 case kSeqStringTag: {

6424 Vector<const uc16> vector = input->GetFlatContent().ToUC16Vector();

6425 const uc16* p = vector.start();

6426 for (int i = from; i < to; i++) {

6427 total += unibrow::Utf8::Length(p[i]);

6428 }

6429 return total;

6430 }

6431 case kSlicedStringTag: {

6432 SlicedString* str = SlicedString::cast(input);

6433 int offset = str->offset();

6434 input = str->parent();

6435 from += offset;

6436 to += offset;

6437 continue;

6438 }

6439 default:

6440 break;

6441 }

6442 UNREACHABLE();

6443 return 0;

6444 }

6445 return 0;

6446 }

6447

6448

6449 void Relocatable::PostGarbageCollectionProcessing() {	6386 void Relocatable::PostGarbageCollectionProcessing() {

6450 Isolate* isolate = Isolate::Current();	6387 Isolate* isolate = Isolate::Current();

6451 Relocatable* current = isolate->relocatable_top();	6388 Relocatable* current = isolate->relocatable_top();

6452 while (current != NULL) {	6389 while (current != NULL) {

6453 current->PostGarbageCollection();	6390 current->PostGarbageCollection();

6454 current = current->prev_;	6391 current = current->prev_;

6455 }	6392 }

6456 }	6393 }

6457	6394

6458	6395

(...skipping 373 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
6832 }	6769 }

6833 }	6770 }

6834 }	6771 }

6835	6772

6836	6773

6837 template <typename IteratorA, typename IteratorB>	6774 template <typename IteratorA, typename IteratorB>

6838 static inline bool CompareStringContents(IteratorA* ia, IteratorB* ib) {	6775 static inline bool CompareStringContents(IteratorA* ia, IteratorB* ib) {

6839 // General slow case check. We know that the ia and ib iterators	6776 // General slow case check. We know that the ia and ib iterators

6840 // have the same length.	6777 // have the same length.

6841 while (ia->has_more()) {	6778 while (ia->has_more()) {

6842 uc32 ca = ia->GetNext();	6779 uint32_t ca = ia->GetNext();

6843 uc32 cb = ib->GetNext();	6780 uint32_t cb = ib->GetNext();

	6781 ASSERT(ca <= unibrow::Utf16::kMaxNonSurrogateCharCode);

	6782 ASSERT(cb <= unibrow::Utf16::kMaxNonSurrogateCharCode);

6844 if (ca != cb)	6783 if (ca != cb)

6845 return false;	6784 return false;

6846 }	6785 }

6847 return true;	6786 return true;

6848 }	6787 }

6849	6788

6850	6789

6851 // Compares the contents of two strings by reading and comparing	6790 // Compares the contents of two strings by reading and comparing

6852 // int-sized blocks of characters.	6791 // int-sized blocks of characters.

6853 template <typename Char>	6792 template <typename Char>

(...skipping 162 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
7016	6955

7017	6956

7018 bool String::IsEqualTo(Vector<const char> str) {	6957 bool String::IsEqualTo(Vector<const char> str) {

7019 Isolate* isolate = GetIsolate();	6958 Isolate* isolate = GetIsolate();

7020 int slen = length();	6959 int slen = length();

7021 Access<UnicodeCache::Utf8Decoder>	6960 Access<UnicodeCache::Utf8Decoder>

7022 decoder(isolate->unicode_cache()->utf8_decoder());	6961 decoder(isolate->unicode_cache()->utf8_decoder());

7023 decoder->Reset(str.start(), str.length());	6962 decoder->Reset(str.start(), str.length());

7024 int i;	6963 int i;

7025 for (i = 0; i < slen && decoder->has_more(); i++) {	6964 for (i = 0; i < slen && decoder->has_more(); i++) {

7026 uc32 r = decoder->GetNext();	6965 uint32_t r = decoder->GetNext();

7027 if (Get(i) != r) return false;	6966 if (r > unibrow::Utf16::kMaxNonSurrogateCharCode) {

	6967 if (i > slen - 1) return false;

	6968 if (Get(i++) != unibrow::Utf16::LeadSurrogate(r)) return false;

	6969 if (Get(i) != unibrow::Utf16::TrailSurrogate(r)) return false;

	6970 } else {

	6971 if (Get(i) != r) return false;

	6972 }

7028 }	6973 }

7029 return i == slen && !decoder->has_more();	6974 return i == slen && !decoder->has_more();

7030 }	6975 }

7031	6976

7032	6977

7033 bool String::IsAsciiEqualTo(Vector<const char> str) {	6978 bool String::IsAsciiEqualTo(Vector<const char> str) {

7034 int slen = length();	6979 int slen = length();

7035 if (str.length() != slen) return false;	6980 if (str.length() != slen) return false;

7036 FlatContent content = GetFlatContent();	6981 FlatContent content = GetFlatContent();

7037 if (content.IsAscii()) {	6982 if (content.IsAscii()) {

(...skipping 109 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
7147 value <<= String::kHashShift;	7092 value <<= String::kHashShift;

7148 value |= length << String::kArrayIndexHashLengthShift;	7093 value |= length << String::kArrayIndexHashLengthShift;

7149	7094

7150 ASSERT((value & String::kIsNotArrayIndexMask) == 0);	7095 ASSERT((value & String::kIsNotArrayIndexMask) == 0);

7151 ASSERT((length > String::kMaxCachedArrayIndexLength) ||	7096 ASSERT((length > String::kMaxCachedArrayIndexLength) ||

7152 (value & String::kContainsCachedArrayIndexMask) == 0);	7097 (value & String::kContainsCachedArrayIndexMask) == 0);

7153 return value;	7098 return value;

7154 }	7099 }

7155	7100

7156	7101

	7102 void StringHasher::AddSurrogatePair(uc32 c) {

	7103 uint16_t lead = unibrow::Utf16::LeadSurrogate(c);

	7104 AddCharacter(lead);

	7105 uint16_t trail = unibrow::Utf16::TrailSurrogate(c);

	7106 AddCharacter(trail);

	7107 }

	7108

	7109

	7110 void StringHasher::AddSurrogatePairNoIndex(uc32 c) {

	7111 uint16_t lead = unibrow::Utf16::LeadSurrogate(c);

	7112 AddCharacterNoIndex(lead);

	7113 uint16_t trail = unibrow::Utf16::TrailSurrogate(c);

	7114 AddCharacterNoIndex(trail);

	7115 }

	7116

	7117

7157 uint32_t StringHasher::GetHashField() {	7118 uint32_t StringHasher::GetHashField() {

7158 ASSERT(is_valid());	7119 ASSERT(is_valid());

7159 if (length_ <= String::kMaxHashCalcLength) {	7120 if (length_ <= String::kMaxHashCalcLength) {

7160 if (is_array_index()) {	7121 if (is_array_index()) {

7161 return MakeArrayIndexHash(array_index(), length_);	7122 return MakeArrayIndexHash(array_index(), length_);

7162 }	7123 }

7163 return (GetHash() << String::kHashShift) | String::kIsNotArrayIndexMask;	7124 return (GetHash() << String::kHashShift) | String::kIsNotArrayIndexMask;

7164 } else {	7125 } else {

7165 return (length_ << String::kHashShift) | String::kIsNotArrayIndexMask;	7126 return (length_ << String::kHashShift) | String::kIsNotArrayIndexMask;

7166 }	7127 }

(...skipping 3572 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
10739 : string_(string), hash_field_(0), seed_(seed) { }	10700 : string_(string), hash_field_(0), seed_(seed) { }

10740	10701

10741 bool IsMatch(Object* string) {	10702 bool IsMatch(Object* string) {

10742 return String::cast(string)->IsEqualTo(string_);	10703 return String::cast(string)->IsEqualTo(string_);

10743 }	10704 }

10744	10705

10745 uint32_t Hash() {	10706 uint32_t Hash() {

10746 if (hash_field_ != 0) return hash_field_ >> String::kHashShift;	10707 if (hash_field_ != 0) return hash_field_ >> String::kHashShift;

10747 unibrow::Utf8InputBuffer<> buffer(string_.start(),	10708 unibrow::Utf8InputBuffer<> buffer(string_.start(),

10748 static_cast<unsigned>(string_.length()));	10709 static_cast<unsigned>(string_.length()));

10749 chars_ = buffer.Length();	10710 chars_ = buffer.Utf16Length();

10750 hash_field_ = String::ComputeHashField(&buffer, chars_, seed_);	10711 hash_field_ = String::ComputeHashField(&buffer, chars_, seed_);

10751 uint32_t result = hash_field_ >> String::kHashShift;	10712 uint32_t result = hash_field_ >> String::kHashShift;

10752 ASSERT(result != 0); // Ensure that the hash value of 0 is never computed.	10713 ASSERT(result != 0); // Ensure that the hash value of 0 is never computed.

10753 return result;	10714 return result;

10754 }	10715 }

10755	10716

10756 uint32_t HashForObject(Object* other) {	10717 uint32_t HashForObject(Object* other) {

10757 return String::cast(other)->Hash();	10718 return String::cast(other)->Hash();

10758 }	10719 }

10759	10720

(...skipping 2197 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
12957 if (break_point_objects()->IsUndefined()) return 0;	12918 if (break_point_objects()->IsUndefined()) return 0;

12958 // Single break point.	12919 // Single break point.

12959 if (!break_point_objects()->IsFixedArray()) return 1;	12920 if (!break_point_objects()->IsFixedArray()) return 1;

12960 // Multiple break points.	12921 // Multiple break points.

12961 return FixedArray::cast(break_point_objects())->length();	12922 return FixedArray::cast(break_point_objects())->length();

12962 }	12923 }

12963 #endif // ENABLE_DEBUGGER_SUPPORT	12924 #endif // ENABLE_DEBUGGER_SUPPORT

12964	12925

12965	12926

12966 } } // namespace v8::internal	12927 } } // namespace v8::internal

OLD	NEW

« src/handles.cc ('K') | « src/objects.h ('k') | src/objects-inl.h » ('j') | src/unicode.h » ('J')