Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1)

Side by Side Diff: vm/unicode.cc

Issue 11275008: - Represent strings internally in UTF-16 format, this makes it (Closed) Base URL: http://dart.googlecode.com/svn/branches/bleeding_edge/dart/runtime/
Patch Set: Created 8 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 #include "vm/unicode.h" 5 #include "vm/unicode.h"
6 6
7 #include "vm/allocation.h" 7 #include "vm/allocation.h"
8 #include "vm/globals.h" 8 #include "vm/globals.h"
9 #include "vm/object.h" 9 #include "vm/object.h"
10 10
11 namespace dart { 11 namespace dart {
12 12
13 static const uint8_t kTrailBytes[256] = { 13 static const int8_t kTrailBytes[256] = {
cshapiro 2012/10/24 23:52:29 If you are going to mess with this table at all we
siva 2012/10/26 21:38:29 maybe for another CL. On 2012/10/24 23:52:29, csh
14 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 14 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
15 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 15 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
16 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 16 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
17 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 17 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
18 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 18 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
19 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 19 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
20 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 20 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
21 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 21 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
22 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 22 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
23 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 23 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
(...skipping 27 matching lines...) Expand all
51 0xFFFFFFFF, 51 0xFFFFFFFF,
52 0xFFFFFFFF 52 0xFFFFFFFF
53 }; 53 };
54 54
55 55
56 static bool IsTrailByte(uint8_t code_unit) { 56 static bool IsTrailByte(uint8_t code_unit) {
57 return (code_unit & 0xc0) == 0x80; 57 return (code_unit & 0xc0) == 0x80;
58 } 58 }
59 59
60 60
61 static bool IsIsoLatin1(uint8_t code_unit) {
cshapiro 2012/10/24 23:52:29 See below. This function and the one below it sni
siva 2012/10/26 21:38:29 Renamed to IsIsoLatin1SequenceStart but left it he
62 // Check if codepoint is <= U+00FF
63 return (code_unit < 0xC3);
64 }
65
66
67 static bool IsSMP(uint8_t code_unit) {
cshapiro 2012/10/24 23:52:29 I think this is a somewhat sketchy name. This cod
siva 2012/10/26 21:38:29 Renamed to IsSmpSequenceStart but left it here as
68 // Check if codepoint is >= U+10000.
69 return (code_unit >= 0xF0);
70 }
71
72
61 // Returns true if the code point is a high- or low-surrogate. 73 // Returns true if the code point is a high- or low-surrogate.
62 static bool IsSurrogate(uint32_t code_point) { 74 static bool IsSurrogate(uint32_t code_point) {
63 return (code_point & 0xfffff800) == 0xd800; 75 return (code_point & 0xfffff800) == 0xd800;
64 } 76 }
65 77
66 78
67 // Returns true if the code point value is above Plane 17. 79 // Returns true if the code point value is above Plane 17.
68 static bool IsOutOfRange(uint32_t code_point) { 80 static bool IsOutOfRange(uint32_t code_point) {
69 return code_point > 0x10FFFF; 81 return code_point > 0x10FFFF;
70 } 82 }
71 83
72 84
73 // Returns true if the byte sequence is ill-formed. 85 // Returns true if the byte sequence is ill-formed.
74 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) { 86 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) {
75 return code_point < kOverlongMinimum[num_bytes]; 87 return code_point < kOverlongMinimum[num_bytes];
76 } 88 }
77 89
78 90
91 void Utf8::ConvertUTF32ToUTF16(int32_t codepoint, uint16_t* dst) {
92 ASSERT(codepoint >= 0x10000);
cshapiro 2012/10/24 23:52:29 I think you need a constant for kMaxBmpCodePoint
siva 2012/10/26 21:38:29 Done.
93 ASSERT(dst != NULL);
94 dst[0] = (Utf8::kLeadOffset + (codepoint >> 10));
95 dst[1] = (0xDC00 + (codepoint & 0x3FF));
96 }
97
98
79 // Returns a count of the number of UTF-8 trail bytes. 99 // Returns a count of the number of UTF-8 trail bytes.
80 intptr_t Utf8::CodePointCount(const char* str, intptr_t* width) { 100 intptr_t Utf8::CodePointCount(const uint8_t* utf8_array,
81 bool is_two_byte_string = false; 101 intptr_t array_len,
82 bool is_four_byte_string = false; 102 Type* type) {
83 intptr_t len = 0; 103 intptr_t len = 0;
84 for (; *str != '\0'; ++str) { 104 Type char_type = kISOLatin1;
85 uint8_t code_unit = *str; 105 for (intptr_t i = 0; i < array_len; i++) {
106 uint8_t code_unit = utf8_array[i];
86 if (!IsTrailByte(code_unit)) { 107 if (!IsTrailByte(code_unit)) {
87 ++len; 108 ++len;
88 } 109 }
89 if (code_unit > 0xC3) { // > U+00FF 110 if (!IsIsoLatin1(code_unit)) { // > U+00FF
90 if (code_unit < 0xF0) { // < U+10000 111 if (IsSMP(code_unit)) { // >= U+10000
91 is_two_byte_string = true; 112 char_type = kSMP;
113 ++len;
92 } else { 114 } else {
93 is_four_byte_string = true; 115 char_type = kBMP;
94 } 116 }
95 } 117 }
96 } 118 }
97 if (is_four_byte_string) { 119 *type = char_type;
98 *width = 4;
99 } else if (is_two_byte_string) {
100 *width = 2;
101 } else {
102 *width = 1;
103 }
104 return len; 120 return len;
105 } 121 }
106 122
107 123
108 // Returns true if str is a valid NUL-terminated UTF-8 string. 124 // Returns true if str is a valid NUL-terminated UTF-8 string.
109 bool Utf8::IsValid(const char* str) { 125 bool Utf8::IsValid(const char* str) {
110 intptr_t i = 0; 126 intptr_t i = 0;
111 while (str[i] != '\0') { 127 while (str[i] != '\0') {
112 uint32_t ch = str[i] & 0xFF; 128 uint32_t ch = str[i] & 0xFF;
113 intptr_t j = 1; 129 intptr_t j = 1;
114 if (ch >= 0x80) { 130 if (ch >= 0x80) {
115 uint8_t num_trail_bytes = kTrailBytes[ch]; 131 int8_t num_trail_bytes = kTrailBytes[ch];
116 bool is_malformed = false; 132 bool is_malformed = false;
117 for (; j < num_trail_bytes; ++j) { 133 for (; j < num_trail_bytes; ++j) {
118 if (str[i + j] != '\0') { 134 if (str[i + j] != '\0') {
119 uint8_t code_unit = str[i + j]; 135 uint8_t code_unit = str[i + j];
120 is_malformed |= !IsTrailByte(code_unit); 136 is_malformed |= !IsTrailByte(code_unit);
121 ch = (ch << 6) + code_unit; 137 ch = (ch << 6) + code_unit;
122 } else { 138 } else {
123 return false; 139 return false;
124 } 140 }
125 } 141 }
(...skipping 69 matching lines...) Expand 10 before | Expand all | Expand 10 after
195 if (pos + num_bytes > len) { 211 if (pos + num_bytes > len) {
196 break; 212 break;
197 } 213 }
198 Utf8::Encode(ch, &dst[pos]); 214 Utf8::Encode(ch, &dst[pos]);
199 pos += num_bytes; 215 pos += num_bytes;
200 } 216 }
201 return pos; 217 return pos;
202 } 218 }
203 219
204 220
205 intptr_t Utf8::Decode(const char* src, int32_t* dst) { 221 intptr_t Utf8::Decode(const uint8_t* utf8_array,
206 uint32_t ch = src[0] & 0xFF; 222 intptr_t array_len,
207 uint32_t i = 1; 223 int32_t* dst) {
224 uint32_t ch = utf8_array[0] & 0xFF;
cshapiro 2012/10/24 23:52:29 The & is probably unnecessary now as the lhs and r
siva 2012/10/26 21:38:29 Done.
225 intptr_t i = 1;
208 if (ch >= 0x80) { 226 if (ch >= 0x80) {
209 uint32_t num_trail_bytes = kTrailBytes[ch]; 227 int32_t num_trail_bytes = kTrailBytes[ch];
cshapiro 2012/10/24 23:52:29 This has no significance as an int32, why not just
siva 2012/10/26 21:38:29 Changed to int8_t to match the type of kTrailBytes
210 bool is_malformed = false; 228 bool is_malformed = false;
211 for (; i < num_trail_bytes; ++i) { 229 for (; i < num_trail_bytes; ++i) {
212 if (src[i] != '\0') { 230 if (i < array_len) {
213 uint8_t code_unit = src[i]; 231 uint8_t code_unit = utf8_array[i];
214 is_malformed |= !IsTrailByte(code_unit); 232 is_malformed |= !IsTrailByte(code_unit);
215 ch = (ch << 6) + code_unit; 233 ch = (ch << 6) + code_unit;
216 } else { 234 } else {
217 *dst = -1; 235 *dst = -1;
218 return 0; 236 return 0;
219 } 237 }
220 } 238 }
221 ch -= kMagicBits[num_trail_bytes]; 239 ch -= kMagicBits[num_trail_bytes];
222 if (!((is_malformed == false) && 240 if (!((is_malformed == false) &&
223 (i == num_trail_bytes) && 241 (i == num_trail_bytes) &&
224 !IsOutOfRange(ch) && 242 !IsOutOfRange(ch) &&
225 !IsNonShortestForm(ch, i) && 243 !IsNonShortestForm(ch, i) &&
226 !IsSurrogate(ch))) { 244 !IsSurrogate(ch))) {
227 *dst = -1; 245 *dst = -1;
228 return 0; 246 return 0;
229 } 247 }
230 } 248 }
231 *dst = ch; 249 *dst = ch;
232 return i; 250 return i;
233 } 251 }
234 252
235 253
236 template<typename T> 254 bool Utf8::DecodeToISOLatin1(const uint8_t* utf8_array,
237 static bool DecodeImpl(const char* src, T* dst, intptr_t len) { 255 intptr_t array_len,
256 uint8_t* dst,
257 intptr_t len) {
238 intptr_t i = 0; 258 intptr_t i = 0;
239 intptr_t j = 0; 259 intptr_t j = 0;
240 intptr_t num_bytes; 260 intptr_t num_bytes;
241 for (; src[i] != '\0' && j < len; i += num_bytes, ++j) { 261 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
242 int32_t ch; 262 int32_t ch;
243 num_bytes = Utf8::Decode(&src[i], &ch); 263 ASSERT(IsIsoLatin1(utf8_array[i]));
264 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
244 if (ch == -1) { 265 if (ch == -1) {
245 return false; // invalid input 266 return false; // invalid input
246 } 267 }
268 ASSERT(ch <= 0xff);
cshapiro 2012/10/24 23:52:29 Replace 0xFF with kMaxOneByteCharacter
siva 2012/10/26 21:38:29 As discussed offline this is 0xff unless we decide
247 dst[j] = ch; 269 dst[j] = ch;
248 } 270 }
249 if (src[i] != '\0' && j == len) { 271 if ((i < array_len) && (j == len)) {
250 return false; // output overflow 272 return false; // output overflow
251 } 273 }
252 return true; // success 274 return true; // success
253 } 275 }
254 276
255 277
256 bool Utf8::Decode(const char* src, uint8_t* dst, intptr_t len) { 278 bool Utf8::DecodeToUTF16(const uint8_t* utf8_array,
257 return DecodeImpl(src, dst, len); 279 intptr_t array_len,
280 uint16_t* dst,
281 intptr_t len) {
282 intptr_t i = 0;
283 intptr_t j = 0;
284 intptr_t num_bytes;
285 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
286 int32_t ch;
287 bool is_smp = IsSMP(utf8_array[i]);
288 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
289 if (ch == -1) {
290 return false; // invalid input
291 }
292 if (is_smp) {
293 ConvertUTF32ToUTF16(ch, &(dst[j]));
294 j = j + 1;
295 } else {
296 dst[j] = ch;
297 }
298 }
299 if ((i < array_len) && (j == len)) {
300 return false; // output overflow
301 }
302 return true; // success
258 } 303 }
259 304
260 305
261 bool Utf8::Decode(const char* src, uint16_t* dst, intptr_t len) { 306 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array,
262 return DecodeImpl(src, dst, len); 307 intptr_t array_len,
263 } 308 uint32_t* dst,
264 309 intptr_t len) {
265 310 intptr_t i = 0;
266 bool Utf8::Decode(const char* src, uint32_t* dst, intptr_t len) { 311 intptr_t j = 0;
267 return DecodeImpl(src, dst, len); 312 intptr_t num_bytes;
313 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
314 int32_t ch;
315 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
316 if (ch == -1) {
317 return false; // invalid input
318 }
319 dst[j] = ch;
320 }
321 if ((i < array_len) && (j == len)) {
322 return false; // output overflow
323 }
324 return true; // success
268 } 325 }
269 326
270 } // namespace dart 327 } // namespace dart
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698