| OLD | NEW |
| 1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. |
| 2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
| 3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
| 4 // met: | 4 // met: |
| 5 // | 5 // |
| 6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
| 7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
| 8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
| 9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
| 10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
| (...skipping 11 matching lines...) Expand all Loading... |
| 22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | 22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| 23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | 23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| 24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 27 | 27 |
| 28 #ifndef V8_UNICODE_H_ | 28 #ifndef V8_UNICODE_H_ |
| 29 #define V8_UNICODE_H_ | 29 #define V8_UNICODE_H_ |
| 30 | 30 |
| 31 #include <sys/types.h> | 31 #include <sys/types.h> |
| 32 | 32 #include <stdint.h> |
| 33 #include <globals.h> |
| 33 /** | 34 /** |
| 34 * \file | 35 * \file |
| 35 * Definitions and convenience functions for working with unicode. | 36 * Definitions and convenience functions for working with unicode. |
| 36 */ | 37 */ |
| 37 | 38 |
| 38 namespace unibrow { | 39 namespace unibrow { |
| 39 | 40 |
| 40 typedef unsigned int uchar; | 41 typedef unsigned int uchar; |
| 41 typedef unsigned char byte; | 42 typedef unsigned char byte; |
| 42 | 43 |
| (...skipping 90 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 133 static const uchar kMaxNonSurrogateCharCode = 0xffff; | 134 static const uchar kMaxNonSurrogateCharCode = 0xffff; |
| 134 // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes | 135 // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes |
| 135 // of UTF-8 data. The special case where the unit is a surrogate | 136 // of UTF-8 data. The special case where the unit is a surrogate |
| 136 // trail produces 1 byte net, because the encoding of the pair is | 137 // trail produces 1 byte net, because the encoding of the pair is |
| 137 // 4 bytes and the 3 bytes that were used to encode the lead surrogate | 138 // 4 bytes and the 3 bytes that were used to encode the lead surrogate |
| 138 // can be reclaimed. | 139 // can be reclaimed. |
| 139 static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3; | 140 static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3; |
| 140 // One UTF-16 surrogate is encoded (illegally) as 3 UTF-8 bytes. | 141 // One UTF-16 surrogate is encoded (illegally) as 3 UTF-8 bytes. |
| 141 // The illegality stems from the surrogate not being part of a pair. | 142 // The illegality stems from the surrogate not being part of a pair. |
| 142 static const int kUtf8BytesToCodeASurrogate = 3; | 143 static const int kUtf8BytesToCodeASurrogate = 3; |
| 143 static inline uchar LeadSurrogate(int char_code) { | 144 static inline uint16_t LeadSurrogate(uint32_t char_code) { |
| 144 return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff); | 145 return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff); |
| 145 } | 146 } |
| 146 static inline uchar TrailSurrogate(int char_code) { | 147 static inline uint16_t TrailSurrogate(uint32_t char_code) { |
| 147 return 0xdc00 + (char_code & 0x3ff); | 148 return 0xdc00 + (char_code & 0x3ff); |
| 148 } | 149 } |
| 149 }; | 150 }; |
| 150 | 151 |
| 151 | 152 |
| 152 class Utf8 { | 153 class Utf8 { |
| 153 public: | 154 public: |
| 154 static inline uchar Length(uchar chr, int previous); | 155 static inline uchar Length(uchar chr, int previous); |
| 155 static inline unsigned Encode( | 156 static inline unsigned Encode( |
| 156 char* out, uchar c, int previous); | 157 char* out, uchar c, int previous); |
| 157 static const byte* ReadBlock(Buffer<const char*> str, byte* buffer, | |
| 158 unsigned capacity, unsigned* chars_read, unsigned* offset); | |
| 159 static uchar CalculateValue(const byte* str, | 158 static uchar CalculateValue(const byte* str, |
| 160 unsigned length, | 159 unsigned length, |
| 161 unsigned* cursor); | 160 unsigned* cursor); |
| 162 static const uchar kBadChar = 0xFFFD; | 161 static const uchar kBadChar = 0xFFFD; |
| 163 static const unsigned kMaxEncodedSize = 4; | 162 static const unsigned kMaxEncodedSize = 4; |
| 164 static const unsigned kMaxOneByteChar = 0x7f; | 163 static const unsigned kMaxOneByteChar = 0x7f; |
| 165 static const unsigned kMaxTwoByteChar = 0x7ff; | 164 static const unsigned kMaxTwoByteChar = 0x7ff; |
| 166 static const unsigned kMaxThreeByteChar = 0xffff; | 165 static const unsigned kMaxThreeByteChar = 0xffff; |
| 167 static const unsigned kMaxFourByteChar = 0x1fffff; | 166 static const unsigned kMaxFourByteChar = 0x1fffff; |
| 168 | 167 |
| (...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 234 unsigned offset_; | 233 unsigned offset_; |
| 235 // The input string | 234 // The input string |
| 236 Input input_; | 235 Input input_; |
| 237 // To avoid heap allocation, we keep an internal buffer to which | 236 // To avoid heap allocation, we keep an internal buffer to which |
| 238 // the encoded string can write its characters. The string | 237 // the encoded string can write its characters. The string |
| 239 // implementation is free to decide whether it wants to use this | 238 // implementation is free to decide whether it wants to use this |
| 240 // buffer or not. | 239 // buffer or not. |
| 241 byte util_buffer_[kSize]; | 240 byte util_buffer_[kSize]; |
| 242 }; | 241 }; |
| 243 | 242 |
| 244 // --- U t f 8 I n p u t B u f f e r --- | |
| 245 | 243 |
| 246 template <unsigned s = 256> | 244 class Utf8DecoderBase { |
| 247 class Utf8InputBuffer : public InputBuffer<Utf8, Buffer<const char*>, s> { | |
| 248 public: | 245 public: |
| 249 inline Utf8InputBuffer() { } | 246 // Initialization done in subclass. |
| 250 inline Utf8InputBuffer(const char* data, unsigned length); | 247 inline Utf8DecoderBase(); |
| 251 inline void Reset(const char* data, unsigned length) { | 248 inline Utf8DecoderBase(uint16_t* buffer, |
| 252 InputBuffer<Utf8, Buffer<const char*>, s>::Reset( | 249 unsigned buffer_length, |
| 253 Buffer<const char*>(data, length)); | 250 const uint8_t* stream, |
| 254 } | 251 unsigned stream_length); |
| 252 inline unsigned Utf16Length() const { return utf16_length_; } |
| 253 protected: |
| 254 // This reads all characters and sets the utf16_length_. |
| 255 // The first buffer_length utf16 chars are cached in the buffer. |
| 256 void Reset(uint16_t* buffer, |
| 257 unsigned buffer_length, |
| 258 const uint8_t* stream, |
| 259 unsigned stream_length); |
| 260 static void WriteUtf16Slow(const uint8_t* stream, |
| 261 uint16_t* data, |
| 262 unsigned length); |
| 263 const uint8_t* unbuffered_start_; |
| 264 unsigned utf16_length_; |
| 265 bool last_byte_of_buffer_unused_; |
| 266 private: |
| 267 DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase); |
| 268 }; |
| 269 |
| 270 template <unsigned kBufferSize> |
| 271 class Utf8Decoder : public Utf8DecoderBase { |
| 272 public: |
| 273 inline Utf8Decoder() {} |
| 274 inline Utf8Decoder(const char* stream, unsigned length); |
| 275 inline void Reset(const char* stream, unsigned length); |
| 276 inline unsigned WriteUtf16(uint16_t* data, unsigned length) const; |
| 277 private: |
| 278 uint16_t buffer_[kBufferSize]; |
| 255 }; | 279 }; |
| 256 | 280 |
| 257 | 281 |
| 258 struct Uppercase { | 282 struct Uppercase { |
| 259 static bool Is(uchar c); | 283 static bool Is(uchar c); |
| 260 }; | 284 }; |
| 261 struct Lowercase { | 285 struct Lowercase { |
| 262 static bool Is(uchar c); | 286 static bool Is(uchar c); |
| 263 }; | 287 }; |
| 264 struct Letter { | 288 struct Letter { |
| (...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 314 static const int kMaxWidth = 1; | 338 static const int kMaxWidth = 1; |
| 315 static int Convert(uchar c, | 339 static int Convert(uchar c, |
| 316 uchar n, | 340 uchar n, |
| 317 uchar* result, | 341 uchar* result, |
| 318 bool* allow_caching_ptr); | 342 bool* allow_caching_ptr); |
| 319 }; | 343 }; |
| 320 | 344 |
| 321 } // namespace unibrow | 345 } // namespace unibrow |
| 322 | 346 |
| 323 #endif // V8_UNICODE_H_ | 347 #endif // V8_UNICODE_H_ |
| OLD | NEW |