OLD | NEW |
1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. |
2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
4 // met: | 4 // met: |
5 // | 5 // |
6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
(...skipping 11 matching lines...) Expand all Loading... |
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | 22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | 23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
27 | 27 |
28 #ifndef V8_UNICODE_H_ | 28 #ifndef V8_UNICODE_H_ |
29 #define V8_UNICODE_H_ | 29 #define V8_UNICODE_H_ |
30 | 30 |
31 #include <sys/types.h> | 31 #include <sys/types.h> |
32 | 32 #include <stdint.h> |
| 33 #include <globals.h> |
33 /** | 34 /** |
34 * \file | 35 * \file |
35 * Definitions and convenience functions for working with Unicode. | 36 * Definitions and convenience functions for working with Unicode. |
36 */ | 37 */ |
37 | 38 |
38 namespace unibrow { | 39 namespace unibrow { |
39 | 40 |
40 typedef unsigned int uchar; | 41 typedef unsigned int uchar; |
41 typedef unsigned char byte; | 42 typedef unsigned char byte; |
42 | 43 |
(...skipping 90 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
133 static const uchar kMaxNonSurrogateCharCode = 0xffff; | 134 static const uchar kMaxNonSurrogateCharCode = 0xffff; |
134 // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes | 135 // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes |
135 // of UTF-8 data. The special case where the unit is a surrogate | 136 // of UTF-8 data. The special case where the unit is a surrogate |
136 // trail produces 1 byte net, because the encoding of the pair is | 137 // trail produces 1 byte net, because the encoding of the pair is |
137 // 4 bytes and the 3 bytes that were used to encode the lead surrogate | 138 // 4 bytes and the 3 bytes that were used to encode the lead surrogate |
138 // can be reclaimed. | 139 // can be reclaimed. |
139 static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3; | 140 static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3; |
140 // One UTF-16 surrogate is encoded (illegally) as 3 UTF-8 bytes. | 141 // One UTF-16 surrogate is encoded (illegally) as 3 UTF-8 bytes. |
141 // The illegality stems from the surrogate not being part of a pair. | 142 // The illegality stems from the surrogate not being part of a pair. |
142 static const int kUtf8BytesToCodeASurrogate = 3; | 143 static const int kUtf8BytesToCodeASurrogate = 3; |
143 static inline uchar LeadSurrogate(int char_code) { | 144 static inline uint16_t LeadSurrogate(uint32_t char_code) { |
144 return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff); | 145 return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff); |
145 } | 146 } |
146 static inline uchar TrailSurrogate(int char_code) { | 147 static inline uint16_t TrailSurrogate(uint32_t char_code) { |
147 return 0xdc00 + (char_code & 0x3ff); | 148 return 0xdc00 + (char_code & 0x3ff); |
148 } | 149 } |
149 }; | 150 }; |
150 | 151 |
151 | 152 |
152 class Utf8 { | 153 class Utf8 { |
153 public: | 154 public: |
154 static inline uchar Length(uchar chr, int previous); | 155 static inline uchar Length(uchar chr, int previous); |
155 static inline unsigned Encode( | 156 static inline unsigned Encode( |
156 char* out, uchar c, int previous); | 157 char* out, uchar c, int previous); |
157 static const byte* ReadBlock(Buffer<const char*> str, byte* buffer, | |
158 unsigned capacity, unsigned* chars_read, unsigned* offset); | |
159 static uchar CalculateValue(const byte* str, | 158 static uchar CalculateValue(const byte* str, |
160 unsigned length, | 159 unsigned length, |
161 unsigned* cursor); | 160 unsigned* cursor); |
162 static const uchar kBadChar = 0xFFFD; | 161 static const uchar kBadChar = 0xFFFD; |
163 static const unsigned kMaxEncodedSize = 4; | 162 static const unsigned kMaxEncodedSize = 4; |
164 static const unsigned kMaxOneByteChar = 0x7f; | 163 static const unsigned kMaxOneByteChar = 0x7f; |
165 static const unsigned kMaxTwoByteChar = 0x7ff; | 164 static const unsigned kMaxTwoByteChar = 0x7ff; |
166 static const unsigned kMaxThreeByteChar = 0xffff; | 165 static const unsigned kMaxThreeByteChar = 0xffff; |
167 static const unsigned kMaxFourByteChar = 0x1fffff; | 166 static const unsigned kMaxFourByteChar = 0x1fffff; |
168 | 167 |
(...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
234 unsigned offset_; | 233 unsigned offset_; |
235 // The input string | 234 // The input string |
236 Input input_; | 235 Input input_; |
237 // To avoid heap allocation, we keep an internal buffer to which | 236 // To avoid heap allocation, we keep an internal buffer to which |
238 // the encoded string can write its characters. The string | 237 // the encoded string can write its characters. The string |
239 // implementation is free to decide whether it wants to use this | 238 // implementation is free to decide whether it wants to use this |
240 // buffer or not. | 239 // buffer or not. |
241 byte util_buffer_[kSize]; | 240 byte util_buffer_[kSize]; |
242 }; | 241 }; |
243 | 242 |
244 // --- U t f 8 I n p u t B u f f e r --- | |
245 | 243 |
246 template <unsigned s = 256> | 244 class Utf8DecoderBase { |
247 class Utf8InputBuffer : public InputBuffer<Utf8, Buffer<const char*>, s> { | |
248 public: | 245 public: |
249 inline Utf8InputBuffer() { } | 246 // Initialization done in subclass. |
250 inline Utf8InputBuffer(const char* data, unsigned length); | 247 inline Utf8DecoderBase(); |
251 inline void Reset(const char* data, unsigned length) { | 248 inline Utf8DecoderBase(uint16_t* buffer, |
252 InputBuffer<Utf8, Buffer<const char*>, s>::Reset( | 249 unsigned buffer_length, |
253 Buffer<const char*>(data, length)); | 250 const uint8_t* stream, |
254 } | 251 unsigned stream_length); |
| 252 inline unsigned Utf16Length() const { return utf16_length_; } |
| 253 protected: |
| 254 // This reads all characters and sets utf16_length_. |
| 255 // The first buffer_length UTF-16 code units are cached in the buffer. |
| 256 void Reset(uint16_t* buffer, |
| 257 unsigned buffer_length, |
| 258 const uint8_t* stream, |
| 259 unsigned stream_length); |
| 260 static void WriteUtf16Slow(const uint8_t* stream, |
| 261 uint16_t* data, |
| 262 unsigned length); |
| 263 const uint8_t* unbuffered_start_; |
| 264 unsigned utf16_length_; |
| 265 bool last_byte_of_buffer_unused_; |
| 266 private: |
| 267 DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase); |
| 268 }; |
| 269 |
| 270 template <unsigned kBufferSize> |
| 271 class Utf8Decoder : public Utf8DecoderBase { |
| 272 public: |
| 273 inline Utf8Decoder() {} |
| 274 inline Utf8Decoder(const char* stream, unsigned length); |
| 275 inline void Reset(const char* stream, unsigned length); |
| 276 inline unsigned WriteUtf16(uint16_t* data, unsigned length) const; |
| 277 private: |
| 278 uint16_t buffer_[kBufferSize]; |
255 }; | 279 }; |
256 | 280 |
257 | 281 |
258 struct Uppercase { | 282 struct Uppercase { |
259 static bool Is(uchar c); | 283 static bool Is(uchar c); |
260 }; | 284 }; |
261 struct Lowercase { | 285 struct Lowercase { |
262 static bool Is(uchar c); | 286 static bool Is(uchar c); |
263 }; | 287 }; |
264 struct Letter { | 288 struct Letter { |
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
314 static const int kMaxWidth = 1; | 338 static const int kMaxWidth = 1; |
315 static int Convert(uchar c, | 339 static int Convert(uchar c, |
316 uchar n, | 340 uchar n, |
317 uchar* result, | 341 uchar* result, |
318 bool* allow_caching_ptr); | 342 bool* allow_caching_ptr); |
319 }; | 343 }; |
320 | 344 |
321 } // namespace unibrow | 345 } // namespace unibrow |
322 | 346 |
323 #endif // V8_UNICODE_H_ | 347 #endif // V8_UNICODE_H_ |
OLD | NEW |