vm/unicode.cc - Issue 11275008: - Represent strings internally in UTF-16 format, this makes it

Unified Diff: vm/unicode.cc

Issue 11275008: - Represent strings internally in UTF-16 format, this makes it (Closed) Base URL: http://dart.googlecode.com/svn/branches/bleeding_edge/dart/runtime/

Patch Set: Created 8 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: vm/unicode.cc

===================================================================

--- vm/unicode.cc (revision 14046)

+++ vm/unicode.cc (working copy)

@@ -10,7 +10,7 @@

namespace dart {

-static const uint8_t kTrailBytes[256] = {

+static const int8_t kTrailBytes[256] = {

cshapiro 2012/10/24 23:52:29 If you are going to mess with this table at all we

siva 2012/10/26 21:38:29 maybe for another CL. On 2012/10/24 23:52:29, csh

1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

@@ -58,6 +58,18 @@

}

+static bool IsIsoLatin1(uint8_t code_unit) {

cshapiro 2012/10/24 23:52:29 See below. This function and the one below it sni

siva 2012/10/26 21:38:29 Renamed to IsIsoLatin1SequenceStart but left it he

+ // Check if codepoint is <= U+00FF

+ return (code_unit < 0xC3);

+static bool IsSMP(uint8_t code_unit) {

cshapiro 2012/10/24 23:52:29 I think this is a somewhat sketchy name. This cod

siva 2012/10/26 21:38:29 Renamed to IsSmpSequenceStart but left it here as

+ // Check if codepoint is >= U+10000.

+ return (code_unit >= 0xF0);

// Returns true if the code point is a high- or low-surrogate.

static bool IsSurrogate(uint32_t code_point) {

return (code_point & 0xfffff800) == 0xd800;

@@ -76,31 +88,35 @@

}

+void Utf8::ConvertUTF32ToUTF16(int32_t codepoint, uint16_t* dst) {

+ ASSERT(codepoint >= 0x10000);

cshapiro 2012/10/24 23:52:29 I think you need a constant for kMaxBmpCodePoint

siva 2012/10/26 21:38:29 Done.

+ ASSERT(dst != NULL);

+ dst[0] = (Utf8::kLeadOffset + (codepoint >> 10));

+ dst[1] = (0xDC00 + (codepoint & 0x3FF));

// Returns a count of the number of UTF-8 trail bytes.

-intptr_t Utf8::CodePointCount(const char* str, intptr_t* width) {

- bool is_two_byte_string = false;

- bool is_four_byte_string = false;

+intptr_t Utf8::CodePointCount(const uint8_t* utf8_array,

+ intptr_t array_len,

+ Type* type) {

intptr_t len = 0;

- for (; *str != '\0'; ++str) {

- uint8_t code_unit = *str;

+ Type char_type = kISOLatin1;

+ for (intptr_t i = 0; i < array_len; i++) {

+ uint8_t code_unit = utf8_array[i];

if (!IsTrailByte(code_unit)) {

++len;

}

- if (code_unit > 0xC3) { // > U+00FF

- if (code_unit < 0xF0) { // < U+10000

- is_two_byte_string = true;

+ if (!IsIsoLatin1(code_unit)) { // > U+00FF

+ if (IsSMP(code_unit)) { // >= U+10000

+ char_type = kSMP;

+ ++len;

} else {

- is_four_byte_string = true;

+ char_type = kBMP;

}

- if (is_four_byte_string) {

- *width = 4;

- } else if (is_two_byte_string) {

- *width = 2;

- } else {

- *width = 1;

- }

+ *type = char_type;

return len;

}

@@ -112,7 +128,7 @@

uint32_t ch = str[i] & 0xFF;

intptr_t j = 1;

if (ch >= 0x80) {

- uint8_t num_trail_bytes = kTrailBytes[ch];

+ int8_t num_trail_bytes = kTrailBytes[ch];

bool is_malformed = false;

for (; j < num_trail_bytes; ++j) {

if (str[i + j] != '\0') {

@@ -202,15 +218,17 @@

}

-intptr_t Utf8::Decode(const char* src, int32_t* dst) {

- uint32_t ch = src[0] & 0xFF;

- uint32_t i = 1;

+intptr_t Utf8::Decode(const uint8_t* utf8_array,

+ intptr_t array_len,

+ int32_t* dst) {

+ uint32_t ch = utf8_array[0] & 0xFF;

cshapiro 2012/10/24 23:52:29 The & is probably unnecessary now as the lhs and r

siva 2012/10/26 21:38:29 Done.

+ intptr_t i = 1;

if (ch >= 0x80) {

- uint32_t num_trail_bytes = kTrailBytes[ch];

+ int32_t num_trail_bytes = kTrailBytes[ch];

cshapiro 2012/10/24 23:52:29 This has no significance as an int32, why not just

siva 2012/10/26 21:38:29 Changed to int8_t to match the type of kTrailBytes

bool is_malformed = false;

for (; i < num_trail_bytes; ++i) {

- if (src[i] != '\0') {

- uint8_t code_unit = src[i];

+ if (i < array_len) {

+ uint8_t code_unit = utf8_array[i];

is_malformed |= !IsTrailByte(code_unit);

ch = (ch << 6) + code_unit;

} else {

@@ -233,38 +251,77 @@

}

-template<typename T>

-static bool DecodeImpl(const char* src, T* dst, intptr_t len) {

+bool Utf8::DecodeToISOLatin1(const uint8_t* utf8_array,

+ intptr_t array_len,

+ uint8_t* dst,

+ intptr_t len) {

intptr_t i = 0;

intptr_t j = 0;

intptr_t num_bytes;

- for (; src[i] != '\0' && j < len; i += num_bytes, ++j) {

+ for (; (i < array_len) && (j < len); i += num_bytes, ++j) {

int32_t ch;

- num_bytes = Utf8::Decode(&src[i], &ch);

+ ASSERT(IsIsoLatin1(utf8_array[i]));

+ num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);

if (ch == -1) {

return false; // invalid input

}

+ ASSERT(ch <= 0xff);

cshapiro 2012/10/24 23:52:29 Replace 0xFF with kMaxOneByteCharacter

siva 2012/10/26 21:38:29 As discussed offline this is 0xff unless we decide

dst[j] = ch;

}

- if (src[i] != '\0' && j == len) {

+ if ((i < array_len) && (j == len)) {

return false; // output overflow

}

return true; // success

}

-bool Utf8::Decode(const char* src, uint8_t* dst, intptr_t len) {

- return DecodeImpl(src, dst, len);

+bool Utf8::DecodeToUTF16(const uint8_t* utf8_array,

+ intptr_t array_len,

+ uint16_t* dst,

+ intptr_t len) {

+ intptr_t i = 0;

+ intptr_t j = 0;

+ intptr_t num_bytes;

+ for (; (i < array_len) && (j < len); i += num_bytes, ++j) {

+ int32_t ch;

+ bool is_smp = IsSMP(utf8_array[i]);

+ num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);

+ if (ch == -1) {

+ return false; // invalid input

+ }

+ if (is_smp) {

+ ConvertUTF32ToUTF16(ch, &(dst[j]));

+ j = j + 1;

+ } else {

+ dst[j] = ch;

+ }

+ if ((i < array_len) && (j == len)) {

+ return false; // output overflow

+ }

+ return true; // success

}

-bool Utf8::Decode(const char* src, uint16_t* dst, intptr_t len) {

- return DecodeImpl(src, dst, len);

+bool Utf8::DecodeToUTF32(const uint8_t* utf8_array,

+ intptr_t array_len,

+ uint32_t* dst,

+ intptr_t len) {

+ intptr_t i = 0;

+ intptr_t j = 0;

+ intptr_t num_bytes;

+ for (; (i < array_len) && (j < len); i += num_bytes, ++j) {

+ int32_t ch;

+ num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);

+ if (ch == -1) {

+ return false; // invalid input

+ }

+ dst[j] = ch;

+ }

+ if ((i < array_len) && (j == len)) {

+ return false; // output overflow

+ }

+ return true; // success

}

-bool Utf8::Decode(const char* src, uint32_t* dst, intptr_t len) {

- return DecodeImpl(src, dst, len);

} // namespace dart

« vm/unicode.h ('K') | « vm/unicode.h ('k') | vm/unicode_test.cc » ('j') | vm/unit_test.h » ('J')