Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(640)

Unified Diff: vm/unicode.cc

Issue 11275008: - Represent strings internally in UTF-16 format, this makes it (Closed) Base URL: http://dart.googlecode.com/svn/branches/bleeding_edge/dart/runtime/
Patch Set: Created 8 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: vm/unicode.cc
===================================================================
--- vm/unicode.cc (revision 14046)
+++ vm/unicode.cc (working copy)
@@ -10,7 +10,7 @@
namespace dart {
-static const uint8_t kTrailBytes[256] = {
+static const int8_t kTrailBytes[256] = {
cshapiro 2012/10/24 23:52:29 If you are going to mess with this table at all we
siva 2012/10/26 21:38:29 maybe for another CL. On 2012/10/24 23:52:29, csh
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -58,6 +58,18 @@
}
+static bool IsIsoLatin1(uint8_t code_unit) {
cshapiro 2012/10/24 23:52:29 See below. This function and the one below it sni
siva 2012/10/26 21:38:29 Renamed to IsIsoLatin1SequenceStart but left it he
+ // Check if the code point is <= U+00FF.
+ return (code_unit < 0xC3);
+}
+
+
+static bool IsSMP(uint8_t code_unit) {
cshapiro 2012/10/24 23:52:29 I think this is a somewhat sketchy name. This cod
siva 2012/10/26 21:38:29 Renamed to IsSmpSequenceStart but left it here as
+ // Check if the code point is >= U+10000.
+ return (code_unit >= 0xF0);
+}
+
+
// Returns true if the code point is a high- or low-surrogate.
static bool IsSurrogate(uint32_t code_point) {
return (code_point & 0xfffff800) == 0xd800;
@@ -76,31 +88,35 @@
}
+void Utf8::ConvertUTF32ToUTF16(int32_t codepoint, uint16_t* dst) {
+ ASSERT(codepoint >= 0x10000);
cshapiro 2012/10/24 23:52:29 I think you need a constant for kMaxBmpCodePoint
siva 2012/10/26 21:38:29 Done.
+ ASSERT(dst != NULL);
+ dst[0] = (Utf8::kLeadOffset + (codepoint >> 10));
+ dst[1] = (0xDC00 + (codepoint & 0x3FF));
+}
+
+
// Returns a count of the number of UTF-8 trail bytes.
-intptr_t Utf8::CodePointCount(const char* str, intptr_t* width) {
- bool is_two_byte_string = false;
- bool is_four_byte_string = false;
+intptr_t Utf8::CodePointCount(const uint8_t* utf8_array,
+ intptr_t array_len,
+ Type* type) {
intptr_t len = 0;
- for (; *str != '\0'; ++str) {
- uint8_t code_unit = *str;
+ Type char_type = kISOLatin1;
+ for (intptr_t i = 0; i < array_len; i++) {
+ uint8_t code_unit = utf8_array[i];
if (!IsTrailByte(code_unit)) {
++len;
}
- if (code_unit > 0xC3) { // > U+00FF
- if (code_unit < 0xF0) { // < U+10000
- is_two_byte_string = true;
+ if (!IsIsoLatin1(code_unit)) { // > U+00FF
+ if (IsSMP(code_unit)) { // >= U+10000
+ char_type = kSMP;
+ ++len;
} else {
- is_four_byte_string = true;
+ char_type = kBMP;
}
}
}
- if (is_four_byte_string) {
- *width = 4;
- } else if (is_two_byte_string) {
- *width = 2;
- } else {
- *width = 1;
- }
+ *type = char_type;
return len;
}
@@ -112,7 +128,7 @@
uint32_t ch = str[i] & 0xFF;
intptr_t j = 1;
if (ch >= 0x80) {
- uint8_t num_trail_bytes = kTrailBytes[ch];
+ int8_t num_trail_bytes = kTrailBytes[ch];
bool is_malformed = false;
for (; j < num_trail_bytes; ++j) {
if (str[i + j] != '\0') {
@@ -202,15 +218,17 @@
}
-intptr_t Utf8::Decode(const char* src, int32_t* dst) {
- uint32_t ch = src[0] & 0xFF;
- uint32_t i = 1;
+intptr_t Utf8::Decode(const uint8_t* utf8_array,
+ intptr_t array_len,
+ int32_t* dst) {
+ uint32_t ch = utf8_array[0] & 0xFF;
cshapiro 2012/10/24 23:52:29 The & is probably unnecessary now as the lhs and r
siva 2012/10/26 21:38:29 Done.
+ intptr_t i = 1;
if (ch >= 0x80) {
- uint32_t num_trail_bytes = kTrailBytes[ch];
+ int32_t num_trail_bytes = kTrailBytes[ch];
cshapiro 2012/10/24 23:52:29 This has no significance as an int32, why not just
siva 2012/10/26 21:38:29 Changed to int8_t to match the type of kTrailBytes
bool is_malformed = false;
for (; i < num_trail_bytes; ++i) {
- if (src[i] != '\0') {
- uint8_t code_unit = src[i];
+ if (i < array_len) {
+ uint8_t code_unit = utf8_array[i];
is_malformed |= !IsTrailByte(code_unit);
ch = (ch << 6) + code_unit;
} else {
@@ -233,38 +251,77 @@
}
-template<typename T>
-static bool DecodeImpl(const char* src, T* dst, intptr_t len) {
+bool Utf8::DecodeToISOLatin1(const uint8_t* utf8_array,
+ intptr_t array_len,
+ uint8_t* dst,
+ intptr_t len) {
intptr_t i = 0;
intptr_t j = 0;
intptr_t num_bytes;
- for (; src[i] != '\0' && j < len; i += num_bytes, ++j) {
+ for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
int32_t ch;
- num_bytes = Utf8::Decode(&src[i], &ch);
+ ASSERT(IsIsoLatin1(utf8_array[i]));
+ num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
if (ch == -1) {
return false; // invalid input
}
+ ASSERT(ch <= 0xff);
cshapiro 2012/10/24 23:52:29 Replace 0xFF with kMaxOneByteCharacter
siva 2012/10/26 21:38:29 As discussed offline this is 0xff unless we decide
dst[j] = ch;
}
- if (src[i] != '\0' && j == len) {
+ if ((i < array_len) && (j == len)) {
return false; // output overflow
}
return true; // success
}
-bool Utf8::Decode(const char* src, uint8_t* dst, intptr_t len) {
- return DecodeImpl(src, dst, len);
+bool Utf8::DecodeToUTF16(const uint8_t* utf8_array,
+ intptr_t array_len,
+ uint16_t* dst,
+ intptr_t len) {
+ intptr_t i = 0;
+ intptr_t j = 0;
+ intptr_t num_bytes;
+ for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
+ int32_t ch;
+ bool is_smp = IsSMP(utf8_array[i]);
+ num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
+ if (ch == -1) {
+ return false; // invalid input
+ }
+ if (is_smp) {
+ ConvertUTF32ToUTF16(ch, &(dst[j]));
+ j = j + 1;
+ } else {
+ dst[j] = ch;
+ }
+ }
+ if ((i < array_len) && (j == len)) {
+ return false; // output overflow
+ }
+ return true; // success
}
-bool Utf8::Decode(const char* src, uint16_t* dst, intptr_t len) {
- return DecodeImpl(src, dst, len);
+bool Utf8::DecodeToUTF32(const uint8_t* utf8_array,
+ intptr_t array_len,
+ uint32_t* dst,
+ intptr_t len) {
+ intptr_t i = 0;
+ intptr_t j = 0;
+ intptr_t num_bytes;
+ for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
+ int32_t ch;
+ num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
+ if (ch == -1) {
+ return false; // invalid input
+ }
+ dst[j] = ch;
+ }
+ if ((i < array_len) && (j == len)) {
+ return false; // output overflow
+ }
+ return true; // success
}
-
-bool Utf8::Decode(const char* src, uint32_t* dst, intptr_t len) {
- return DecodeImpl(src, dst, len);
-}
-
} // namespace dart

Powered by Google App Engine
This is Rietveld 408576698