OLD | NEW |
---|---|
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 #include "vm/unicode.h" | 5 #include "vm/unicode.h" |
6 | 6 |
7 #include "vm/allocation.h" | 7 #include "vm/allocation.h" |
8 #include "vm/globals.h" | 8 #include "vm/globals.h" |
9 #include "vm/object.h" | 9 #include "vm/object.h" |
10 | 10 |
11 namespace dart { | 11 namespace dart { |
12 | 12 |
13 static const uint8_t kTrailBytes[256] = { | 13 static const int8_t kTrailBytes[256] = { |
cshapiro
2012/10/24 23:52:29
If you are going to mess with this table at all we
siva
2012/10/26 21:38:29
maybe for another CL.
On 2012/10/24 23:52:29, csh
| |
14 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 14 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
15 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 15 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
16 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 16 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
17 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 17 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
18 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 18 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
19 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 19 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
20 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 20 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
21 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 21 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
22 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 22 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
23 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 23 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
(...skipping 27 matching lines...) Expand all Loading... | |
51 0xFFFFFFFF, | 51 0xFFFFFFFF, |
52 0xFFFFFFFF | 52 0xFFFFFFFF |
53 }; | 53 }; |
54 | 54 |
55 | 55 |
56 static bool IsTrailByte(uint8_t code_unit) { | 56 static bool IsTrailByte(uint8_t code_unit) { |
57 return (code_unit & 0xc0) == 0x80; | 57 return (code_unit & 0xc0) == 0x80; |
58 } | 58 } |
59 | 59 |
60 | 60 |
61 static bool IsIsoLatin1(uint8_t code_unit) { | |
cshapiro
2012/10/24 23:52:29
See below. This function and the one below it sni
siva
2012/10/26 21:38:29
Renamed to IsIsoLatin1SequenceStart but left it he
| |
62 // Check if codepoint is <= U+00FF | |
63 return (code_unit < 0xC3); | |
64 } | |
65 | |
66 | |
67 static bool IsSMP(uint8_t code_unit) { | |
cshapiro
2012/10/24 23:52:29
I think this is a somewhat sketchy name. This cod
siva
2012/10/26 21:38:29
Renamed to IsSmpSequenceStart but left it here as
| |
68 // Check if codepoint is >= U+10000. | |
69 return (code_unit >= 0xF0); | |
70 } | |
71 | |
72 | |
61 // Returns true if the code point is a high- or low-surrogate. | 73 // Returns true if the code point is a high- or low-surrogate. |
62 static bool IsSurrogate(uint32_t code_point) { | 74 static bool IsSurrogate(uint32_t code_point) { |
63 return (code_point & 0xfffff800) == 0xd800; | 75 return (code_point & 0xfffff800) == 0xd800; |
64 } | 76 } |
65 | 77 |
66 | 78 |
67 // Returns true if the code point value is above Plane 17. | 79 // Returns true if the code point value is above Plane 17. |
68 static bool IsOutOfRange(uint32_t code_point) { | 80 static bool IsOutOfRange(uint32_t code_point) { |
69 return code_point > 0x10FFFF; | 81 return code_point > 0x10FFFF; |
70 } | 82 } |
71 | 83 |
72 | 84 |
73 // Returns true if the byte sequence is ill-formed. | 85 // Returns true if the byte sequence is ill-formed. |
74 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) { | 86 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) { |
75 return code_point < kOverlongMinimum[num_bytes]; | 87 return code_point < kOverlongMinimum[num_bytes]; |
76 } | 88 } |
77 | 89 |
78 | 90 |
91 void Utf8::ConvertUTF32ToUTF16(int32_t codepoint, uint16_t* dst) { | |
92 ASSERT(codepoint >= 0x10000); | |
cshapiro
2012/10/24 23:52:29
I think you need a constant for kMaxBmpCodePoint
siva
2012/10/26 21:38:29
Done.
| |
93 ASSERT(dst != NULL); | |
94 dst[0] = (Utf8::kLeadOffset + (codepoint >> 10)); | |
95 dst[1] = (0xDC00 + (codepoint & 0x3FF)); | |
96 } | |
97 | |
98 | |
79 // Returns a count of the number of UTF-8 trail bytes. | 99 // Returns a count of the number of UTF-8 trail bytes. |
80 intptr_t Utf8::CodePointCount(const char* str, intptr_t* width) { | 100 intptr_t Utf8::CodePointCount(const uint8_t* utf8_array, |
81 bool is_two_byte_string = false; | 101 intptr_t array_len, |
82 bool is_four_byte_string = false; | 102 Type* type) { |
83 intptr_t len = 0; | 103 intptr_t len = 0; |
84 for (; *str != '\0'; ++str) { | 104 Type char_type = kISOLatin1; |
85 uint8_t code_unit = *str; | 105 for (intptr_t i = 0; i < array_len; i++) { |
106 uint8_t code_unit = utf8_array[i]; | |
86 if (!IsTrailByte(code_unit)) { | 107 if (!IsTrailByte(code_unit)) { |
87 ++len; | 108 ++len; |
88 } | 109 } |
89 if (code_unit > 0xC3) { // > U+00FF | 110 if (!IsIsoLatin1(code_unit)) { // > U+00FF |
90 if (code_unit < 0xF0) { // < U+10000 | 111 if (IsSMP(code_unit)) { // >= U+10000 |
91 is_two_byte_string = true; | 112 char_type = kSMP; |
113 ++len; | |
92 } else { | 114 } else { |
93 is_four_byte_string = true; | 115 char_type = kBMP; |
94 } | 116 } |
95 } | 117 } |
96 } | 118 } |
97 if (is_four_byte_string) { | 119 *type = char_type; |
98 *width = 4; | |
99 } else if (is_two_byte_string) { | |
100 *width = 2; | |
101 } else { | |
102 *width = 1; | |
103 } | |
104 return len; | 120 return len; |
105 } | 121 } |
106 | 122 |
107 | 123 |
108 // Returns true if str is a valid NUL-terminated UTF-8 string. | 124 // Returns true if str is a valid NUL-terminated UTF-8 string. |
109 bool Utf8::IsValid(const char* str) { | 125 bool Utf8::IsValid(const char* str) { |
110 intptr_t i = 0; | 126 intptr_t i = 0; |
111 while (str[i] != '\0') { | 127 while (str[i] != '\0') { |
112 uint32_t ch = str[i] & 0xFF; | 128 uint32_t ch = str[i] & 0xFF; |
113 intptr_t j = 1; | 129 intptr_t j = 1; |
114 if (ch >= 0x80) { | 130 if (ch >= 0x80) { |
115 uint8_t num_trail_bytes = kTrailBytes[ch]; | 131 int8_t num_trail_bytes = kTrailBytes[ch]; |
116 bool is_malformed = false; | 132 bool is_malformed = false; |
117 for (; j < num_trail_bytes; ++j) { | 133 for (; j < num_trail_bytes; ++j) { |
118 if (str[i + j] != '\0') { | 134 if (str[i + j] != '\0') { |
119 uint8_t code_unit = str[i + j]; | 135 uint8_t code_unit = str[i + j]; |
120 is_malformed |= !IsTrailByte(code_unit); | 136 is_malformed |= !IsTrailByte(code_unit); |
121 ch = (ch << 6) + code_unit; | 137 ch = (ch << 6) + code_unit; |
122 } else { | 138 } else { |
123 return false; | 139 return false; |
124 } | 140 } |
125 } | 141 } |
(...skipping 69 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
195 if (pos + num_bytes > len) { | 211 if (pos + num_bytes > len) { |
196 break; | 212 break; |
197 } | 213 } |
198 Utf8::Encode(ch, &dst[pos]); | 214 Utf8::Encode(ch, &dst[pos]); |
199 pos += num_bytes; | 215 pos += num_bytes; |
200 } | 216 } |
201 return pos; | 217 return pos; |
202 } | 218 } |
203 | 219 |
204 | 220 |
205 intptr_t Utf8::Decode(const char* src, int32_t* dst) { | 221 intptr_t Utf8::Decode(const uint8_t* utf8_array, |
206 uint32_t ch = src[0] & 0xFF; | 222 intptr_t array_len, |
207 uint32_t i = 1; | 223 int32_t* dst) { |
224 uint32_t ch = utf8_array[0] & 0xFF; | |
cshapiro
2012/10/24 23:52:29
The & is probably unnecessary now as the lhs and r
siva
2012/10/26 21:38:29
Done.
| |
225 intptr_t i = 1; | |
208 if (ch >= 0x80) { | 226 if (ch >= 0x80) { |
209 uint32_t num_trail_bytes = kTrailBytes[ch]; | 227 int32_t num_trail_bytes = kTrailBytes[ch]; |
cshapiro
2012/10/24 23:52:29
This has no significance as an int32, why not just
siva
2012/10/26 21:38:29
Changed to int8_t to match the type of kTrailBytes
| |
210 bool is_malformed = false; | 228 bool is_malformed = false; |
211 for (; i < num_trail_bytes; ++i) { | 229 for (; i < num_trail_bytes; ++i) { |
212 if (src[i] != '\0') { | 230 if (i < array_len) { |
213 uint8_t code_unit = src[i]; | 231 uint8_t code_unit = utf8_array[i]; |
214 is_malformed |= !IsTrailByte(code_unit); | 232 is_malformed |= !IsTrailByte(code_unit); |
215 ch = (ch << 6) + code_unit; | 233 ch = (ch << 6) + code_unit; |
216 } else { | 234 } else { |
217 *dst = -1; | 235 *dst = -1; |
218 return 0; | 236 return 0; |
219 } | 237 } |
220 } | 238 } |
221 ch -= kMagicBits[num_trail_bytes]; | 239 ch -= kMagicBits[num_trail_bytes]; |
222 if (!((is_malformed == false) && | 240 if (!((is_malformed == false) && |
223 (i == num_trail_bytes) && | 241 (i == num_trail_bytes) && |
224 !IsOutOfRange(ch) && | 242 !IsOutOfRange(ch) && |
225 !IsNonShortestForm(ch, i) && | 243 !IsNonShortestForm(ch, i) && |
226 !IsSurrogate(ch))) { | 244 !IsSurrogate(ch))) { |
227 *dst = -1; | 245 *dst = -1; |
228 return 0; | 246 return 0; |
229 } | 247 } |
230 } | 248 } |
231 *dst = ch; | 249 *dst = ch; |
232 return i; | 250 return i; |
233 } | 251 } |
234 | 252 |
235 | 253 |
236 template<typename T> | 254 bool Utf8::DecodeToISOLatin1(const uint8_t* utf8_array, |
237 static bool DecodeImpl(const char* src, T* dst, intptr_t len) { | 255 intptr_t array_len, |
256 uint8_t* dst, | |
257 intptr_t len) { | |
238 intptr_t i = 0; | 258 intptr_t i = 0; |
239 intptr_t j = 0; | 259 intptr_t j = 0; |
240 intptr_t num_bytes; | 260 intptr_t num_bytes; |
241 for (; src[i] != '\0' && j < len; i += num_bytes, ++j) { | 261 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { |
242 int32_t ch; | 262 int32_t ch; |
243 num_bytes = Utf8::Decode(&src[i], &ch); | 263 ASSERT(IsIsoLatin1(utf8_array[i])); |
264 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); | |
244 if (ch == -1) { | 265 if (ch == -1) { |
245 return false; // invalid input | 266 return false; // invalid input |
246 } | 267 } |
268 ASSERT(ch <= 0xff); | |
cshapiro
2012/10/24 23:52:29
Replace 0xFF with kMaxOneByteCharacter
siva
2012/10/26 21:38:29
As discussed offline this is 0xff unless we decide
| |
247 dst[j] = ch; | 269 dst[j] = ch; |
248 } | 270 } |
249 if (src[i] != '\0' && j == len) { | 271 if ((i < array_len) && (j == len)) { |
250 return false; // output overflow | 272 return false; // output overflow |
251 } | 273 } |
252 return true; // success | 274 return true; // success |
253 } | 275 } |
254 | 276 |
255 | 277 |
256 bool Utf8::Decode(const char* src, uint8_t* dst, intptr_t len) { | 278 bool Utf8::DecodeToUTF16(const uint8_t* utf8_array, |
257 return DecodeImpl(src, dst, len); | 279 intptr_t array_len, |
280 uint16_t* dst, | |
281 intptr_t len) { | |
282 intptr_t i = 0; | |
283 intptr_t j = 0; | |
284 intptr_t num_bytes; | |
285 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { | |
286 int32_t ch; | |
287 bool is_smp = IsSMP(utf8_array[i]); | |
288 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); | |
289 if (ch == -1) { | |
290 return false; // invalid input | |
291 } | |
292 if (is_smp) { | |
293 ConvertUTF32ToUTF16(ch, &(dst[j])); | |
294 j = j + 1; | |
295 } else { | |
296 dst[j] = ch; | |
297 } | |
298 } | |
299 if ((i < array_len) && (j == len)) { | |
300 return false; // output overflow | |
301 } | |
302 return true; // success | |
258 } | 303 } |
259 | 304 |
260 | 305 |
261 bool Utf8::Decode(const char* src, uint16_t* dst, intptr_t len) { | 306 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array, |
262 return DecodeImpl(src, dst, len); | 307 intptr_t array_len, |
263 } | 308 uint32_t* dst, |
264 | 309 intptr_t len) { |
265 | 310 intptr_t i = 0; |
266 bool Utf8::Decode(const char* src, uint32_t* dst, intptr_t len) { | 311 intptr_t j = 0; |
267 return DecodeImpl(src, dst, len); | 312 intptr_t num_bytes; |
313 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { | |
314 int32_t ch; | |
315 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); | |
316 if (ch == -1) { | |
317 return false; // invalid input | |
318 } | |
319 dst[j] = ch; | |
320 } | |
321 if ((i < array_len) && (j == len)) { | |
322 return false; // output overflow | |
323 } | |
324 return true; // success | |
268 } | 325 } |
269 | 326 |
270 } // namespace dart | 327 } // namespace dart |
OLD | NEW |