Chromium Code Reviews| Index: vm/unicode.cc |
| =================================================================== |
| --- vm/unicode.cc (revision 14046) |
| +++ vm/unicode.cc (working copy) |
| @@ -10,7 +10,7 @@ |
| namespace dart { |
| -static const uint8_t kTrailBytes[256] = { |
| +static const int8_t kTrailBytes[256] = { |
|
cshapiro
2012/10/24 23:52:29
If you are going to mess with this table at all we
siva
2012/10/26 21:38:29
maybe for another CL.
On 2012/10/24 23:52:29, csh
|
| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| @@ -58,6 +58,18 @@ |
| } |
| +static bool IsIsoLatin1(uint8_t code_unit) { |
|
cshapiro
2012/10/24 23:52:29
See below. This function and the one below it sni
siva
2012/10/26 21:38:29
Renamed to IsIsoLatin1SequenceStart but left it he
|
| + // Check if codepoint is <= U+00FF |
| + return (code_unit < 0xC3); |
| +} |
| + |
| + |
| +static bool IsSMP(uint8_t code_unit) { |
|
cshapiro
2012/10/24 23:52:29
I think this is a somewhat sketchy name. This cod
siva
2012/10/26 21:38:29
Renamed to IsSmpSequenceStart but left it here as
|
| + // Check if codepoint is >= U+10000. |
| + return (code_unit >= 0xF0); |
| +} |
| + |
| + |
| // Returns true if the code point is a high- or low-surrogate. |
| static bool IsSurrogate(uint32_t code_point) { |
| return (code_point & 0xfffff800) == 0xd800; |
| @@ -76,31 +88,35 @@ |
| } |
| +void Utf8::ConvertUTF32ToUTF16(int32_t codepoint, uint16_t* dst) { |
| + ASSERT(codepoint >= 0x10000); |
|
cshapiro
2012/10/24 23:52:29
I think you need a constant for kMaxBmpCodePoint
siva
2012/10/26 21:38:29
Done.
|
| + ASSERT(dst != NULL); |
| + dst[0] = (Utf8::kLeadOffset + (codepoint >> 10)); |
| + dst[1] = (0xDC00 + (codepoint & 0x3FF)); |
| +} |
| + |
| + |
| // Returns a count of the number of UTF-8 trail bytes. |
| -intptr_t Utf8::CodePointCount(const char* str, intptr_t* width) { |
| - bool is_two_byte_string = false; |
| - bool is_four_byte_string = false; |
| +intptr_t Utf8::CodePointCount(const uint8_t* utf8_array, |
| + intptr_t array_len, |
| + Type* type) { |
| intptr_t len = 0; |
| - for (; *str != '\0'; ++str) { |
| - uint8_t code_unit = *str; |
| + Type char_type = kISOLatin1; |
| + for (intptr_t i = 0; i < array_len; i++) { |
| + uint8_t code_unit = utf8_array[i]; |
| if (!IsTrailByte(code_unit)) { |
| ++len; |
| } |
| - if (code_unit > 0xC3) { // > U+00FF |
| - if (code_unit < 0xF0) { // < U+10000 |
| - is_two_byte_string = true; |
| + if (!IsIsoLatin1(code_unit)) { // > U+00FF |
| + if (IsSMP(code_unit)) { // >= U+10000 |
| + char_type = kSMP; |
| + ++len; |
| } else { |
| - is_four_byte_string = true; |
| + char_type = kBMP; |
| } |
| } |
| } |
| - if (is_four_byte_string) { |
| - *width = 4; |
| - } else if (is_two_byte_string) { |
| - *width = 2; |
| - } else { |
| - *width = 1; |
| - } |
| + *type = char_type; |
| return len; |
| } |
| @@ -112,7 +128,7 @@ |
| uint32_t ch = str[i] & 0xFF; |
| intptr_t j = 1; |
| if (ch >= 0x80) { |
| - uint8_t num_trail_bytes = kTrailBytes[ch]; |
| + int8_t num_trail_bytes = kTrailBytes[ch]; |
| bool is_malformed = false; |
| for (; j < num_trail_bytes; ++j) { |
| if (str[i + j] != '\0') { |
| @@ -202,15 +218,17 @@ |
| } |
| -intptr_t Utf8::Decode(const char* src, int32_t* dst) { |
| - uint32_t ch = src[0] & 0xFF; |
| - uint32_t i = 1; |
| +intptr_t Utf8::Decode(const uint8_t* utf8_array, |
| + intptr_t array_len, |
| + int32_t* dst) { |
| + uint32_t ch = utf8_array[0] & 0xFF; |
|
cshapiro
2012/10/24 23:52:29
The & is probably unnecessary now as the lhs and r
siva
2012/10/26 21:38:29
Done.
|
| + intptr_t i = 1; |
| if (ch >= 0x80) { |
| - uint32_t num_trail_bytes = kTrailBytes[ch]; |
| + int32_t num_trail_bytes = kTrailBytes[ch]; |
|
cshapiro
2012/10/24 23:52:29
This has no significance as an int32, why not just
siva
2012/10/26 21:38:29
Changed to int8_t to match the type of kTrailBytes
|
| bool is_malformed = false; |
| for (; i < num_trail_bytes; ++i) { |
| - if (src[i] != '\0') { |
| - uint8_t code_unit = src[i]; |
| + if (i < array_len) { |
| + uint8_t code_unit = utf8_array[i]; |
| is_malformed |= !IsTrailByte(code_unit); |
| ch = (ch << 6) + code_unit; |
| } else { |
| @@ -233,38 +251,77 @@ |
| } |
| -template<typename T> |
| -static bool DecodeImpl(const char* src, T* dst, intptr_t len) { |
| +bool Utf8::DecodeToISOLatin1(const uint8_t* utf8_array, |
| + intptr_t array_len, |
| + uint8_t* dst, |
| + intptr_t len) { |
| intptr_t i = 0; |
| intptr_t j = 0; |
| intptr_t num_bytes; |
| - for (; src[i] != '\0' && j < len; i += num_bytes, ++j) { |
| + for (; (i < array_len) && (j < len); i += num_bytes, ++j) { |
| int32_t ch; |
| - num_bytes = Utf8::Decode(&src[i], &ch); |
| + ASSERT(IsIsoLatin1(utf8_array[i])); |
| + num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); |
| if (ch == -1) { |
| return false; // invalid input |
| } |
| + ASSERT(ch <= 0xff); |
|
cshapiro
2012/10/24 23:52:29
Replace 0xFF with kMaxOneByteCharacter
siva
2012/10/26 21:38:29
As discussed offline this is 0xff unless we decide
|
| dst[j] = ch; |
| } |
| - if (src[i] != '\0' && j == len) { |
| + if ((i < array_len) && (j == len)) { |
| return false; // output overflow |
| } |
| return true; // success |
| } |
| -bool Utf8::Decode(const char* src, uint8_t* dst, intptr_t len) { |
| - return DecodeImpl(src, dst, len); |
| +bool Utf8::DecodeToUTF16(const uint8_t* utf8_array, |
| + intptr_t array_len, |
| + uint16_t* dst, |
| + intptr_t len) { |
| + intptr_t i = 0; |
| + intptr_t j = 0; |
| + intptr_t num_bytes; |
| + for (; (i < array_len) && (j < len); i += num_bytes, ++j) { |
| + int32_t ch; |
| + bool is_smp = IsSMP(utf8_array[i]); |
| + num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); |
| + if (ch == -1) { |
| + return false; // invalid input |
| + } |
| + if (is_smp) { |
| + ConvertUTF32ToUTF16(ch, &(dst[j])); |
| + j = j + 1; |
| + } else { |
| + dst[j] = ch; |
| + } |
| + } |
| + if ((i < array_len) && (j == len)) { |
| + return false; // output overflow |
| + } |
| + return true; // success |
| } |
| -bool Utf8::Decode(const char* src, uint16_t* dst, intptr_t len) { |
| - return DecodeImpl(src, dst, len); |
| +bool Utf8::DecodeToUTF32(const uint8_t* utf8_array, |
| + intptr_t array_len, |
| + uint32_t* dst, |
| + intptr_t len) { |
| + intptr_t i = 0; |
| + intptr_t j = 0; |
| + intptr_t num_bytes; |
| + for (; (i < array_len) && (j < len); i += num_bytes, ++j) { |
| + int32_t ch; |
| + num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); |
| + if (ch == -1) { |
| + return false; // invalid input |
| + } |
| + dst[j] = ch; |
| + } |
| + if ((i < array_len) && (j == len)) { |
| + return false; // output overflow |
| + } |
| + return true; // success |
| } |
| - |
| -bool Utf8::Decode(const char* src, uint32_t* dst, intptr_t len) { |
| - return DecodeImpl(src, dst, len); |
| -} |
| - |
| } // namespace dart |