src/objects.cc - Issue 9600009: Fix input and output to handle UTF16 surrogate pairs.

Unified Diff: src/objects.cc

Issue 9600009: Fix input and output to handle UTF16 surrogate pairs. (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: Created 8 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/objects.cc

===================================================================

--- src/objects.cc (revision 10944)

+++ src/objects.cc (working copy)

@@ -6043,9 +6043,11 @@

buffer->Reset(offset, this);

int character_position = offset;

int utf8_bytes = 0;

+ int last = unibrow::Utf16::kNoPreviousCharacter;

while (buffer->has_more() && character_position++ < offset + length) {

uint16_t character = buffer->GetNext();

- utf8_bytes += unibrow::Utf8::Length(character);

+ utf8_bytes += unibrow::Utf8::Length(character, last);

+ last = character;

}

if (length_return) {

@@ -6059,13 +6061,15 @@

buffer->Seek(offset);

character_position = offset;

int utf8_byte_position = 0;

+ last = unibrow::Utf16::kNoPreviousCharacter;

while (buffer->has_more() && character_position++ < offset + length) {

uint16_t character = buffer->GetNext();

if (allow_nulls == DISALLOW_NULLS && character == 0) {

character = ' ';

}

utf8_byte_position +=

- unibrow::Utf8::Encode(result + utf8_byte_position, character);

+ unibrow::Utf8::Encode(result + utf8_byte_position, character, last);

+ last = character;

}

result[utf8_byte_position] = 0;

return SmartArrayPointer<char>(result);

@@ -6379,73 +6383,6 @@

}

-// This method determines the type of string involved and then gets the UTF8

-// length of the string. It doesn't flatten the string and has log(n) recursion

-// for a string of length n.

-int String::Utf8Length(String* input, int from, int to) {

- if (from == to) return 0;

- int total = 0;

- while (true) {

- if (input->IsAsciiRepresentation()) return total + to - from;

- switch (StringShape(input).representation_tag()) {

- case kConsStringTag: {

- ConsString* str = ConsString::cast(input);

- String* first = str->first();

- String* second = str->second();

- int first_length = first->length();

- if (first_length - from < to - first_length) {

- if (first_length > from) {

- // Left hand side is shorter.

- total += Utf8Length(first, from, first_length);

- input = second;

- from = 0;

- to -= first_length;

- } else {

- // We only need the right hand side.

- input = second;

- from -= first_length;

- to -= first_length;

- }

- } else {

- if (first_length <= to) {

- // Right hand side is shorter.

- total += Utf8Length(second, 0, to - first_length);

- input = first;

- to = first_length;

- } else {

- // We only need the left hand side.

- input = first;

- }

- }

- continue;

- }

- case kExternalStringTag:

- case kSeqStringTag: {

- Vector<const uc16> vector = input->GetFlatContent().ToUC16Vector();

- const uc16* p = vector.start();

- for (int i = from; i < to; i++) {

- total += unibrow::Utf8::Length(p[i]);

- }

- return total;

- }

- case kSlicedStringTag: {

- SlicedString* str = SlicedString::cast(input);

- int offset = str->offset();

- input = str->parent();

- from += offset;

- to += offset;

- continue;

- }

- default:

- break;

- }

- UNREACHABLE();

- return 0;

- }

- return 0;

-}

void Relocatable::PostGarbageCollectionProcessing() {

Isolate* isolate = Isolate::Current();

Relocatable* current = isolate->relocatable_top();

@@ -6839,8 +6776,10 @@

// General slow case check. We know that the ia and ib iterators

// have the same length.

while (ia->has_more()) {

- uc32 ca = ia->GetNext();

- uc32 cb = ib->GetNext();

+ uint32_t ca = ia->GetNext();

+ uint32_t cb = ib->GetNext();

+ ASSERT(ca <= unibrow::Utf16::kMaxNonSurrogateCharCode);

+ ASSERT(cb <= unibrow::Utf16::kMaxNonSurrogateCharCode);

if (ca != cb)

return false;

}

@@ -7023,8 +6962,14 @@

decoder->Reset(str.start(), str.length());

int i;

for (i = 0; i < slen && decoder->has_more(); i++) {

- uc32 r = decoder->GetNext();

- if (Get(i) != r) return false;

+ uint32_t r = decoder->GetNext();

+ if (r > unibrow::Utf16::kMaxNonSurrogateCharCode) {

+ if (i > slen - 1) return false;

+ if (Get(i++) != unibrow::Utf16::LeadSurrogate(r)) return false;

+ if (Get(i) != unibrow::Utf16::TrailSurrogate(r)) return false;

+ } else {

+ if (Get(i) != r) return false;

+ }

}

return i == slen && !decoder->has_more();

}

@@ -7154,6 +7099,22 @@

}

+void StringHasher::AddSurrogatePair(uc32 c) {

+ uint16_t lead = unibrow::Utf16::LeadSurrogate(c);

+ AddCharacter(lead);

+ uint16_t trail = unibrow::Utf16::TrailSurrogate(c);

+ AddCharacter(trail);

+}

+void StringHasher::AddSurrogatePairNoIndex(uc32 c) {

+ uint16_t lead = unibrow::Utf16::LeadSurrogate(c);

+ AddCharacterNoIndex(lead);

+ uint16_t trail = unibrow::Utf16::TrailSurrogate(c);

+ AddCharacterNoIndex(trail);

+}

uint32_t StringHasher::GetHashField() {

ASSERT(is_valid());

if (length_ <= String::kMaxHashCalcLength) {

@@ -10746,7 +10707,7 @@

if (hash_field_ != 0) return hash_field_ >> String::kHashShift;

unibrow::Utf8InputBuffer<> buffer(string_.start(),

static_cast<unsigned>(string_.length()));

- chars_ = buffer.Length();

+ chars_ = buffer.Utf16Length();

hash_field_ = String::ComputeHashField(&buffer, chars_, seed_);

uint32_t result = hash_field_ >> String::kHashShift;

ASSERT(result != 0); // Ensure that the hash value of 0 is never computed.

« src/handles.cc ('K') | « src/objects.h ('k') | src/objects-inl.h » ('j') | src/unicode.h » ('J')