OLD | NEW |
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 #include "vm/unicode.h" | 5 #include "vm/unicode.h" |
6 | 6 |
7 #include "vm/allocation.h" | 7 #include "vm/allocation.h" |
8 #include "vm/globals.h" | 8 #include "vm/globals.h" |
9 #include "vm/object.h" | 9 #include "vm/object.h" |
10 | 10 |
(...skipping 58 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
69 return code_point > 0x10FFFF; | 69 return code_point > 0x10FFFF; |
70 } | 70 } |
71 | 71 |
72 | 72 |
73 // Returns true if the byte sequence is ill-formed. | 73 // Returns true if the byte sequence is ill-formed. |
// More precisely: reports an overlong (non-shortest-form) encoding, i.e. true
// when code_point is smaller than the kOverlongMinimum entry selected by
// num_bytes. Other kinds of ill-formedness are not checked by this helper.
// num_bytes is used directly as an array index with no bounds check, so the
// caller must pass a value that is a valid index into kOverlongMinimum
// (the table is defined outside this excerpt).
74 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) { | 74 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) { |
75 return code_point < kOverlongMinimum[num_bytes]; | 75 return code_point < kOverlongMinimum[num_bytes]; |
76 } | 76 } |
77 | 77 |
78 | 78 |
| 79 // Returns the number of code points in str, counted as the bytes that are NOT UTF-8 trail bytes. |
// str is scanned byte by byte up to its NUL terminator; the input is not
// validated here. As a side effect *width (dereferenced unconditionally, so it
// must not be null) is set to:
//   4 if any byte is 0xF0 or greater,
//   2 otherwise if any byte is greater than 0xC3,
//   1 otherwise.
79 intptr_t Utf8::CodePointCount(const char* str, intptr_t* width) { | 80 intptr_t Utf8::CodePointCount(const char* str, intptr_t* width) { |
80 bool is_two_byte_string = false; | 81 bool is_two_byte_string = false; |
81 bool is_four_byte_string = false; | 82 bool is_four_byte_string = false; |
82 intptr_t len = 0; | 83 intptr_t len = 0; |
83 for (; *str != '\0'; ++str) { | 84 for (; *str != '\0'; ++str) { |
84 uint8_t code_unit = *str; | 85 uint8_t code_unit = *str; |
// Only lead bytes and single-byte characters contribute to the count.
85 if (!IsTrailByte(code_unit)) { | 86 if (!IsTrailByte(code_unit)) { |
86 ++len; | 87 ++len; |
87 } | 88 } |
88 if (code_unit > 0xC3) { // > U+00FF | 89 if (code_unit > 0xC3) { // > U+00FF |
89 if (code_unit < 0xF0) { // < U+10000 | 90 if (code_unit < 0xF0) { // < U+10000 |
90 is_two_byte_string = true; | 91 is_two_byte_string = true; |
91 } else { | 92 } else { |
92 is_four_byte_string = true; | 93 is_four_byte_string = true; |
93 } | 94 } |
94 } | 95 } |
95 } | 96 } |
// The widest requirement seen anywhere in the string wins.
96 if (is_four_byte_string) { | 97 if (is_four_byte_string) { |
97 *width = 4; | 98 *width = 4; |
98 } else if (is_two_byte_string) { | 99 } else if (is_two_byte_string) { |
99 *width = 2; | 100 *width = 2; |
100 } else { | 101 } else { |
101 *width = 1; | 102 *width = 1; |
102 } | 103 } |
103 return len; | 104 return len; |
104 } | 105 } |
105 | 106 |
106 | 107 |
| 108 // Returns true if str is a valid NUL-terminated UTF-8 string. |
// Walks str one encoded character at a time. Bytes below 0x80 are accepted
// as-is. For any other lead byte the sequence is rejected (false is returned)
// when the string ends before the sequence is complete, when a continuation
// byte fails IsTrailByte(), when the number of bytes consumed does not match
// the kTrailBytes entry for the lead byte, or when IsOutOfRange(),
// IsNonShortestForm() or IsSurrogate() reports the decoded value as bad.
// An empty string yields true.
// NOTE(review): num_trail_bytes is compared against j, which starts at 1 for
// the lead byte, so it appears to hold the total sequence length rather than
// only the trail-byte count -- confirm against the kTrailBytes table.
| 109 bool Utf8::IsValid(const char* str) { |
| 110 intptr_t i = 0; |
| 111 while (str[i] != '\0') { |
// The mask keeps ch in 0..255 before it is used to index kTrailBytes.
| 112 uint32_t ch = str[i] & 0xFF; |
// j counts the bytes consumed for this character; the lead byte is the first.
| 113 intptr_t j = 1; |
| 114 if (ch >= 0x80) { |
| 115 uint8_t num_trail_bytes = kTrailBytes[ch]; |
| 116 bool is_malformed = false; |
| 117 for (; j < num_trail_bytes; ++j) { |
// Stop at the terminator instead of reading past the end of the string.
| 118 if (str[i + j] != '\0') { |
| 119 uint8_t code_unit = str[i + j]; |
| 120 is_malformed |= !IsTrailByte(code_unit); |
| 121 ch = (ch << 6) + code_unit; |
| 122 } else { |
| 123 return false; |
| 124 } |
| 125 } |
// Subtract the kMagicBits entry for this sequence length from the accumulated
// value (table not visible here; presumably it removes the lead/trail marker
// bits so that ch becomes the code point -- confirm).
| 126 ch -= kMagicBits[num_trail_bytes]; |
| 127 if (!((is_malformed == false) && |
| 128 (j == num_trail_bytes) && |
| 129 !IsOutOfRange(ch) && |
| 130 !IsNonShortestForm(ch, j) && |
| 131 !IsSurrogate(ch))) { |
| 132 return false; |
| 133 } |
| 134 } |
// Advance past every byte consumed for this character.
| 135 i += j; |
| 136 } |
| 137 return true; |
| 138 } |
| 139 |
| 140 |
107 intptr_t Utf8::Length(int32_t ch) { | 141 intptr_t Utf8::Length(int32_t ch) { |
108 if (ch <= kMaxOneByteChar) { | 142 if (ch <= kMaxOneByteChar) { |
109 return 1; | 143 return 1; |
110 } else if (ch <= kMaxTwoByteChar) { | 144 } else if (ch <= kMaxTwoByteChar) { |
111 return 2; | 145 return 2; |
112 } else if (ch <= kMaxThreeByteChar) { | 146 } else if (ch <= kMaxThreeByteChar) { |
113 return 3; | 147 return 3; |
114 } | 148 } |
115 ASSERT(ch <= kMaxFourByteChar); | 149 ASSERT(ch <= kMaxFourByteChar); |
116 return 4; | 150 return 4; |
(...skipping 110 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
// Overload writing into a buffer of 16-bit units: forwards src, dst and len to
// DecodeImpl and returns its result unchanged. The meaning of len and of the
// boolean result is defined by DecodeImpl, which is outside this excerpt.
227 bool Utf8::Decode(const char* src, uint16_t* dst, intptr_t len) { | 261 bool Utf8::Decode(const char* src, uint16_t* dst, intptr_t len) { |
228 return DecodeImpl(src, dst, len); | 262 return DecodeImpl(src, dst, len); |
229 } | 263 } |
230 | 264 |
231 | 265 |
// Overload writing into a buffer of 32-bit units: forwards src, dst and len to
// DecodeImpl and returns its result unchanged. The meaning of len and of the
// boolean result is defined by DecodeImpl, which is outside this excerpt.
232 bool Utf8::Decode(const char* src, uint32_t* dst, intptr_t len) { | 266 bool Utf8::Decode(const char* src, uint32_t* dst, intptr_t len) { |
233 return DecodeImpl(src, dst, len); | 267 return DecodeImpl(src, dst, len); |
234 } | 268 } |
235 | 269 |
236 } // namespace dart | 270 } // namespace dart |
OLD | NEW |