vm/unicode.cc - Issue 11275008: - Represent strings internally in UTF-16 format, this makes it

Side by Side Diff: vm/unicode.cc

Issue 11275008: - Represent strings internally in UTF-16 format, this makes it (Closed) Base URL: http://dart.googlecode.com/svn/branches/bleeding_edge/dart/runtime/

Patch Set: Created 8 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file	1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a	2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.	3 // BSD-style license that can be found in the LICENSE file.

4	4

5 #include "vm/unicode.h"	5 #include "vm/unicode.h"

6	6

7 #include "vm/allocation.h"	7 #include "vm/allocation.h"

8 #include "vm/globals.h"	8 #include "vm/globals.h"

9 #include "vm/object.h"	9 #include "vm/object.h"

10	10

11 namespace dart {	11 namespace dart {

12	12

13 static const uint8_t kTrailBytes[256] = {	13 static const int8_t kTrailBytes[256] = {
	cshapiro 2012/10/24 23:52:29 If you are going to mess with this table at all we should fold it in half and index by "val >> 1". siva 2012/10/26 21:38:29 maybe for another CL. On 2012/10/24 23:52:29, cshapiro wrote: > If you are going to mess with this table at all we should fold it in half and > index by "val >> 1".
14 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	14 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

15 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	15 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

16 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	16 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

17 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	17 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

18 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	18 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

19 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	19 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

20 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	20 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

21 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	21 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

22 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	22 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

23 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	23 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

(...skipping 27 matching lines...) Expand all Loading...
51 0xFFFFFFFF,	51 0xFFFFFFFF,

52 0xFFFFFFFF	52 0xFFFFFFFF

53 };	53 };

54	54

55	55

56 static bool IsTrailByte(uint8_t code_unit) {	56 static bool IsTrailByte(uint8_t code_unit) {

57 return (code_unit & 0xc0) == 0x80;	57 return (code_unit & 0xc0) == 0x80;

58 }	58 }

59	59

60	60

	61 static bool IsIsoLatin1(uint8_t code_unit) {
	cshapiro 2012/10/24 23:52:29 See below. This function and the one below it sniff the first code unit of a (potentially) multi-code-unit sequence. They should be named as such. siva 2012/10/26 21:38:29 Renamed to IsIsoLatin1SequenceStart but left it here as we generally collect all static functions at the top of the file. On 2012/10/24 23:52:29, cshapiro wrote: > See below. This function and the one below it sniff the first code unit of a > (potentially) multi-code-unit sequence. They should be named as such.
	62 // Check is codepoint is <= U+00FF

	63 return (code_unit < 0xC3);

	64 }

	65

	66

	67 static bool IsSMP(uint8_t code_unit) {
	cshapiro 2012/10/24 23:52:29 I think this is a somewhat sketchy name. This code unit is the first byte of a supplementary character. Why not call this IsSmpFirstByte or IsSmpStart or some other such thing? Also, these are only used by one routine. Why not move them next to it? siva 2012/10/26 21:38:29 Renamed to IsSmpSequenceStart but left it here as we generally collect all static functions at the top of the file. On 2012/10/24 23:52:29, cshapiro wrote: > I think this is a somewhat sketchy name. This code unit is the first byte of a > supplementary character. Why not call this IsSmpFirstByte or IsSmpStart or some > other such thing? > > Also, these are only used by one routine. Why not move them next to it?
	68 // Check is codepoint is >= U+10000.

	69 return (code_unit >= 0xF0);

	70 }

	71

	72

61 // Returns true if the code point is a high- or low-surrogate.	73 // Returns true if the code point is a high- or low-surrogate.

62 static bool IsSurrogate(uint32_t code_point) {	74 static bool IsSurrogate(uint32_t code_point) {

63 return (code_point & 0xfffff800) == 0xd800;	75 return (code_point & 0xfffff800) == 0xd800;

64 }	76 }

65	77

66	78

67 // Returns true if the code point value is above Plane 17.	79 // Returns true if the code point value is above Plane 17.

68 static bool IsOutOfRange(uint32_t code_point) {	80 static bool IsOutOfRange(uint32_t code_point) {

69 return code_point > 0x10FFFF;	81 return code_point > 0x10FFFF;

70 }	82 }

71	83

72	84

73 // Returns true if the byte sequence is ill-formed.	85 // Returns true if the byte sequence is ill-formed.

74 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) {	86 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) {

75 return code_point < kOverlongMinimum[num_bytes];	87 return code_point < kOverlongMinimum[num_bytes];

76 }	88 }

77	89

78	90

	91 void Utf8::ConvertUTF32ToUTF16(int32_t codepoint, uint16_t* dst) {

	92 ASSERT(codepoint >= 0x10000);
	cshapiro 2012/10/24 23:52:29 I think you need a constant for kMaxBmpCodePoint. siva 2012/10/26 21:38:29 On 2012/10/24 23:52:29, cshapiro wrote: > I think you need a constant for kMaxBmpCodePoint Done.
	93 ASSERT(dst != NULL);

	94 dst[0] = (Utf8::kLeadOffset + (codepoint >> 10));

	95 dst[1] = (0xDC00 + (codepoint & 0x3FF));

	96 }

	97

	98

79 // Returns a count of the number of UTF-8 trail bytes.	99 // Returns a count of the number of UTF-8 trail bytes.

80 intptr_t Utf8::CodePointCount(const char* str, intptr_t* width) {	100 intptr_t Utf8::CodePointCount(const uint8_t* utf8_array,

81 bool is_two_byte_string = false;	101 intptr_t array_len,

82 bool is_four_byte_string = false;	102 Type* type) {

83 intptr_t len = 0;	103 intptr_t len = 0;

84 for (; *str != '\0'; ++str) {	104 Type char_type = kISOLatin1;

85 uint8_t code_unit = *str;	105 for (intptr_t i = 0; i < array_len; i++) {

	106 uint8_t code_unit = utf8_array[i];

86 if (!IsTrailByte(code_unit)) {	107 if (!IsTrailByte(code_unit)) {

87 ++len;	108 ++len;

88 }	109 }

89 if (code_unit > 0xC3) { // > U+00FF	110 if (!IsIsoLatin1(code_unit)) { // > U+00FF

90 if (code_unit < 0xF0) { // < U+10000	111 if (IsSMP(code_unit)) { // >= U+10000

91 is_two_byte_string = true;	112 char_type = kSMP;

	113 ++len;

92 } else {	114 } else {

93 is_four_byte_string = true;	115 char_type = kBMP;

94 }	116 }

95 }	117 }

96 }	118 }

97 if (is_four_byte_string) {	119 *type = char_type;

98 *width = 4;

99 } else if (is_two_byte_string) {

100 *width = 2;

101 } else {

102 *width = 1;

103 }

104 return len;	120 return len;

105 }	121 }

106	122

107	123

108 // Returns true if str is a valid NUL-terminated UTF-8 string.	124 // Returns true if str is a valid NUL-terminated UTF-8 string.

109 bool Utf8::IsValid(const char* str) {	125 bool Utf8::IsValid(const char* str) {

110 intptr_t i = 0;	126 intptr_t i = 0;

111 while (str[i] != '\0') {	127 while (str[i] != '\0') {

112 uint32_t ch = str[i] & 0xFF;	128 uint32_t ch = str[i] & 0xFF;

113 intptr_t j = 1;	129 intptr_t j = 1;

114 if (ch >= 0x80) {	130 if (ch >= 0x80) {

115 uint8_t num_trail_bytes = kTrailBytes[ch];	131 int8_t num_trail_bytes = kTrailBytes[ch];

116 bool is_malformed = false;	132 bool is_malformed = false;

117 for (; j < num_trail_bytes; ++j) {	133 for (; j < num_trail_bytes; ++j) {

118 if (str[i + j] != '\0') {	134 if (str[i + j] != '\0') {

119 uint8_t code_unit = str[i + j];	135 uint8_t code_unit = str[i + j];

120 is_malformed \|= !IsTrailByte(code_unit);	136 is_malformed \|= !IsTrailByte(code_unit);

121 ch = (ch << 6) + code_unit;	137 ch = (ch << 6) + code_unit;

122 } else {	138 } else {

123 return false;	139 return false;

124 }	140 }

125 }	141 }

(...skipping 69 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
195 if (pos + num_bytes > len) {	211 if (pos + num_bytes > len) {

196 break;	212 break;

197 }	213 }

198 Utf8::Encode(ch, &dst[pos]);	214 Utf8::Encode(ch, &dst[pos]);

199 pos += num_bytes;	215 pos += num_bytes;

200 }	216 }

201 return pos;	217 return pos;

202 }	218 }

203	219

204	220

205 intptr_t Utf8::Decode(const char* src, int32_t* dst) {	221 intptr_t Utf8::Decode(const uint8_t* utf8_array,

206 uint32_t ch = src[0] & 0xFF;	222 intptr_t array_len,

207 uint32_t i = 1;	223 int32_t* dst) {

	224 uint32_t ch = utf8_array[0] & 0xFF;
	cshapiro 2012/10/24 23:52:29 The & is probably unnecessary now as the lhs and rhs are both unsigned. siva 2012/10/26 21:38:29 On 2012/10/24 23:52:29, cshapiro wrote: > The & is probably unnecessary now as the lhs and rhs are both unsigned. Done.
	225 intptr_t i = 1;

208 if (ch >= 0x80) {	226 if (ch >= 0x80) {

209 uint32_t num_trail_bytes = kTrailBytes[ch];	227 int32_t num_trail_bytes = kTrailBytes[ch];
	cshapiro 2012/10/24 23:52:29 This has no significance as an int32, why not just an int? siva 2012/10/26 21:38:29 Changed to int8_t to match the type of kTrailBytes. On 2012/10/24 23:52:29, cshapiro wrote: > This has no significance as an int32, why not just an int?
210 bool is_malformed = false;	228 bool is_malformed = false;

211 for (; i < num_trail_bytes; ++i) {	229 for (; i < num_trail_bytes; ++i) {

212 if (src[i] != '\0') {	230 if (i < array_len) {

213 uint8_t code_unit = src[i];	231 uint8_t code_unit = utf8_array[i];

214 is_malformed \|= !IsTrailByte(code_unit);	232 is_malformed \|= !IsTrailByte(code_unit);

215 ch = (ch << 6) + code_unit;	233 ch = (ch << 6) + code_unit;

216 } else {	234 } else {

217 *dst = -1;	235 *dst = -1;

218 return 0;	236 return 0;

219 }	237 }

220 }	238 }

221 ch -= kMagicBits[num_trail_bytes];	239 ch -= kMagicBits[num_trail_bytes];

222 if (!((is_malformed == false) &&	240 if (!((is_malformed == false) &&

223 (i == num_trail_bytes) &&	241 (i == num_trail_bytes) &&

224 !IsOutOfRange(ch) &&	242 !IsOutOfRange(ch) &&

225 !IsNonShortestForm(ch, i) &&	243 !IsNonShortestForm(ch, i) &&

226 !IsSurrogate(ch))) {	244 !IsSurrogate(ch))) {

227 *dst = -1;	245 *dst = -1;

228 return 0;	246 return 0;

229 }	247 }

230 }	248 }

231 *dst = ch;	249 *dst = ch;

232 return i;	250 return i;

233 }	251 }

234	252

235	253

236 template<typename T>	254 bool Utf8::DecodeToISOLatin1(const uint8_t* utf8_array,

237 static bool DecodeImpl(const char* src, T* dst, intptr_t len) {	255 intptr_t array_len,

	256 uint8_t* dst,

	257 intptr_t len) {

238 intptr_t i = 0;	258 intptr_t i = 0;

239 intptr_t j = 0;	259 intptr_t j = 0;

240 intptr_t num_bytes;	260 intptr_t num_bytes;

241 for (; src[i] != '\0' && j < len; i += num_bytes, ++j) {	261 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {

242 int32_t ch;	262 int32_t ch;

243 num_bytes = Utf8::Decode(&src[i], &ch);	263 ASSERT(IsIsoLatin1(utf8_array[i]));

	264 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);

244 if (ch == -1) {	265 if (ch == -1) {

245 return false; // invalid input	266 return false; // invalid input

246 }	267 }

	268 ASSERT(ch <= 0xff);
	cshapiro 2012/10/24 23:52:29 Replace 0xFF with kMaxOneByteCharacter. siva 2012/10/26 21:38:29 As discussed offline this is 0xff unless we decide to go with ASCII only characters for OneByteString. On 2012/10/24 23:52:29, cshapiro wrote: > Replace 0xFF with kMaxOneByteCharacter
247 dst[j] = ch;	269 dst[j] = ch;

248 }	270 }

249 if (src[i] != '\0' && j == len) {	271 if ((i < array_len) && (j == len)) {

250 return false; // output overflow	272 return false; // output overflow

251 }	273 }

252 return true; // success	274 return true; // success

253 }	275 }

254	276

255	277

256 bool Utf8::Decode(const char* src, uint8_t* dst, intptr_t len) {	278 bool Utf8::DecodeToUTF16(const uint8_t* utf8_array,

257 return DecodeImpl(src, dst, len);	279 intptr_t array_len,

	280 uint16_t* dst,

	281 intptr_t len) {

	282 intptr_t i = 0;

	283 intptr_t j = 0;

	284 intptr_t num_bytes;

	285 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {

	286 int32_t ch;

	287 bool is_smp = IsSMP(utf8_array[i]);

	288 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);

	289 if (ch == -1) {

	290 return false; // invalid input

	291 }

	292 if (is_smp) {

	293 ConvertUTF32ToUTF16(ch, &(dst[j]));

	294 j = j + 1;

	295 } else {

	296 dst[j] = ch;

	297 }

	298 }

	299 if ((i < array_len) && (j == len)) {

	300 return false; // output overflow

	301 }

	302 return true; // success

258 }	303 }

259	304

260	305

261 bool Utf8::Decode(const char* src, uint16_t* dst, intptr_t len) {	306 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array,

262 return DecodeImpl(src, dst, len);	307 intptr_t array_len,

263 }	308 uint32_t* dst,

264	309 intptr_t len) {

265	310 intptr_t i = 0;

266 bool Utf8::Decode(const char* src, uint32_t* dst, intptr_t len) {	311 intptr_t j = 0;

267 return DecodeImpl(src, dst, len);	312 intptr_t num_bytes;

	313 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {

	314 int32_t ch;

	315 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);

	316 if (ch == -1) {

	317 return false; // invalid input

	318 }

	319 dst[j] = ch;

	320 }

	321 if ((i < array_len) && (j == len)) {

	322 return false; // output overflow

	323 }

	324 return true; // success

268 }	325 }

269	326

270 } // namespace dart	327 } // namespace dart

OLD	NEW

« vm/unicode.h ('K') | « vm/unicode.h ('k') | vm/unicode_test.cc » ('j') | vm/unit_test.h » ('J')