src/api.cc - Issue 9600009: Fix input and output to handle UTF16 surrogate pairs.

Unified Diff: src/api.cc

Issue 9600009: Fix input and output to handle UTF16 surrogate pairs. (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: '' Created 8 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/api.cc

===================================================================

--- src/api.cc (revision 10944)

+++ src/api.cc (working copy)

@@ -1429,7 +1429,7 @@

ScriptData* ScriptData::PreCompile(const char* input, int length) {

- i::Utf8ToUC16CharacterStream stream(

+ i::Utf8ToUtf16CharacterStream stream(

reinterpret_cast<const unsigned char*>(input), length);

return i::ParserApi::PreParse(&stream, NULL, i::FLAG_harmony_scoping);

}

@@ -1438,11 +1438,11 @@

ScriptData* ScriptData::PreCompile(v8::Handle<String> source) {

i::Handle<i::String> str = Utils::OpenHandle(*source);

if (str->IsExternalTwoByteString()) {

- i::ExternalTwoByteStringUC16CharacterStream stream(

+ i::ExternalTwoByteStringUtf16CharacterStream stream(

i::Handle<i::ExternalTwoByteString>::cast(str), 0, str->length());

return i::ParserApi::PreParse(&stream, NULL, i::FLAG_harmony_scoping);

} else {

- i::GenericStringUC16CharacterStream stream(str, 0, str->length());

+ i::GenericStringUtf16CharacterStream stream(str, 0, str->length());

return i::ParserApi::PreParse(&stream, NULL, i::FLAG_harmony_scoping);

}

@@ -3689,7 +3689,13 @@

int String::Utf8Length() const {

i::Handle<i::String> str = Utils::OpenHandle(this);

if (IsDeadCheck(str->GetIsolate(), "v8::String::Utf8Length()")) return 0;

- return str->Utf8Length();

+ int length = str->Utf8Length();

+ if (length < 0) {

+ FlattenString(str);

+ length = str->Utf8Length();

+ }

+ ASSERT(length >= 0);

+ return length;

}

@@ -3735,11 +3741,13 @@

int i;

int pos = 0;

int nchars = 0;

+ int previous = unibrow::Utf8::kNoPreviousCharacter;

for (i = 0; i < len && (capacity == -1 || pos < fast_end); i++) {

i::uc32 c = write_input_buffer.GetNext();

- int written = unibrow::Utf8::Encode(buffer + pos, c);

+ int written = unibrow::Utf8::Encode(buffer + pos, c, previous);

pos += written;

nchars++;

+ previous = c;

}

if (i < len) {

// For the last characters we need to check the length for each one

@@ -3748,16 +3756,34 @@

char intermediate[unibrow::Utf8::kMaxEncodedSize];

for (; i < len && pos < capacity; i++) {

i::uc32 c = write_input_buffer.GetNext();

- int written = unibrow::Utf8::Encode(intermediate, c);

- if (pos + written <= capacity) {

- for (int j = 0; j < written; j++)

- buffer[pos + j] = intermediate[j];

+ if (unibrow::Utf16::IsTrailSurrogate(c) &&

+ previous != unibrow::Utf8::kNoPreviousCharacter &&

+ unibrow::Utf16::IsLeadSurrogate(previous)) {

+ // We can't use the intermediate buffer here because the encoding

+ // of surrogate pairs is done under the assumption that you can step

+ // back and fix the UTF8 stream. Luckily we only need space for one

+ // more byte, so there is always space.

+ ASSERT(pos < capacity);

+ int written = unibrow::Utf8::Encode(buffer + pos, c, previous);

+ ASSERT(written == 1);

pos += written;

nchars++;

} else {

- // We've reached the end of the buffer

- break;

+ int written =

+ unibrow::Utf8::Encode(intermediate,

+ c,

+ unibrow::Utf8::kNoPreviousCharacter);

+ if (pos + written <= capacity) {

+ for (int j = 0; j < written; j++)

+ buffer[pos + j] = intermediate[j];

+ pos += written;

+ nchars++;

+ } else {

+ // We've reached the end of the buffer

+ break;

+ }

}

+ previous = c;

}

if (nchars_ref != NULL) *nchars_ref = nchars;

« no previous file with comments | « no previous file | src/arm/regexp-macro-assembler-arm.cc » ('j') | src/debug-agent.cc » ('J')