Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. |
| 2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
| 3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
| 4 // met: | 4 // met: |
| 5 // | 5 // |
| 6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
| 7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
| 8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
| 9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
| 10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
| (...skipping 82 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading... | |
| 93 CacheEntry entries_[kSize]; | 93 CacheEntry entries_[kSize]; |
| 94 }; | 94 }; |
| 95 | 95 |
| 96 class UnicodeData { | 96 class UnicodeData { |
| 97 private: | 97 private: |
| 98 friend class Test; | 98 friend class Test; |
| 99 static int GetByteCount(); | 99 static int GetByteCount(); |
| 100 static const uchar kMaxCodePoint; | 100 static const uchar kMaxCodePoint; |
| 101 }; | 101 }; |
| 102 | 102 |
| 103 // --- U t f 8 --- | 103 // --- U t f 8 a n d 16 --- |
| 104 | 104 |
| 105 template <typename Data> | 105 template <typename Data> |
| 106 class Buffer { | 106 class Buffer { |
| 107 public: | 107 public: |
| 108 inline Buffer(Data data, unsigned length) : data_(data), length_(length) { } | 108 inline Buffer(Data data, unsigned length) : data_(data), length_(length) { } |
| 109 inline Buffer() : data_(0), length_(0) { } | 109 inline Buffer() : data_(0), length_(0) { } |
| 110 Data data() { return data_; } | 110 Data data() { return data_; } |
| 111 unsigned length() { return length_; } | 111 unsigned length() { return length_; } |
| 112 private: | 112 private: |
| 113 Data data_; | 113 Data data_; |
| 114 unsigned length_; | 114 unsigned length_; |
| 115 }; | 115 }; |
| 116 | 116 |
| 117 | |
| 118 class Utf16 { | |
| 119 public: | |
| 120 static inline bool IsLeadSurrogate(int32_t code) { | |
| 121 if (code == kNoPreviousCharacter) return false; | |
|
rossberg
2012/03/12 10:55:05
I still think this is implied by the bit masking b
Erik Corry
2012/03/12 12:34:10
Yes, I think that would be too implicit.
| |
| 122 return (code & 0xfc00) == 0xd800; | |
| 123 } | |
| 124 static inline bool IsTrailSurrogate(int32_t code) { | |
| 125 if (code == kNoPreviousCharacter) return false; | |
| 126 return (code & 0xfc00) == 0xdc00; | |
| 127 } | |
| 128 | |
| 129 static inline int32_t CombineSurrogatePair(uchar lead, uchar trail) { | |
| 130 return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff); | |
| 131 } | |
| 132 static const int kNoPreviousCharacter = -1; | |
|
rossberg
2012/03/12 10:55:05
int32_t?
Erik Corry
2012/03/12 12:34:10
Done.
| |
| 133 static const uchar kMaxNonSurrogateCharCode = 0xffff; | |
| 134 // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes | |
| 135 // of UTF-8 data. The special case where the unit is a surrogate | |
| 136 // trail produces 1 byte net, because the encoding of the pair is | |
| 137 // 4 bytes and the 3 bytes that were used to encode the lead surrogate | |
| 138 // can be reclaimed. | |
| 139 static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3; | |
| 140 // One UTF-16 surrogate is encoded (illegally) as 3 UTF-8 bytes. | |
| 141 // The illegality stems from the surrogate not being part of a pair. | |
| 142 static const int kUtf8BytesToCodeASurrogate = 3; | |
| 143 static inline uchar LeadSurrogate(int32_t char_code) { | |
| 144 return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff); | |
| 145 } | |
| 146 static inline uchar TrailSurrogate(int32_t char_code) { | |
| 147 return 0xdc00 + (char_code & 0x3ff); | |
| 148 } | |
| 149 }; | |
| 150 | |
| 151 | |
| 117 class Utf8 { | 152 class Utf8 { |
| 118 public: | 153 public: |
| 119 static inline uchar Length(uchar chr); | 154 static inline uchar Length(uchar chr, int previous); |
| 120 static inline unsigned Encode(char* out, uchar c); | 155 static inline unsigned Encode( |
| 156 char* out, uchar c, int previous); | |
| 121 static const byte* ReadBlock(Buffer<const char*> str, byte* buffer, | 157 static const byte* ReadBlock(Buffer<const char*> str, byte* buffer, |
| 122 unsigned capacity, unsigned* chars_read, unsigned* offset); | 158 unsigned capacity, unsigned* chars_read, unsigned* offset); |
| 123 static uchar CalculateValue(const byte* str, | 159 static uchar CalculateValue(const byte* str, |
| 124 unsigned length, | 160 unsigned length, |
| 125 unsigned* cursor); | 161 unsigned* cursor); |
| 126 static const uchar kBadChar = 0xFFFD; | 162 static const uchar kBadChar = 0xFFFD; |
| 127 static const unsigned kMaxEncodedSize = 4; | 163 static const unsigned kMaxEncodedSize = 4; |
| 128 static const unsigned kMaxOneByteChar = 0x7f; | 164 static const unsigned kMaxOneByteChar = 0x7f; |
| 129 static const unsigned kMaxTwoByteChar = 0x7ff; | 165 static const unsigned kMaxTwoByteChar = 0x7ff; |
| 130 static const unsigned kMaxThreeByteChar = 0xffff; | 166 static const unsigned kMaxThreeByteChar = 0xffff; |
| 131 static const unsigned kMaxFourByteChar = 0x1fffff; | 167 static const unsigned kMaxFourByteChar = 0x1fffff; |
| 132 | 168 |
| 169 // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together | |
| 170 // that match are coded as a 4 byte UTF-8 sequence. | |
| 171 static const unsigned kBytesSavedByCombiningSurrogates = 2; | |
| 172 static const unsigned kSizeOfUnmatchedSurrogate = 3; | |
| 173 | |
| 133 private: | 174 private: |
| 134 template <unsigned s> friend class Utf8InputBuffer; | 175 template <unsigned s> friend class Utf8InputBuffer; |
| 135 friend class Test; | 176 friend class Test; |
| 136 static inline uchar ValueOf(const byte* str, | 177 static inline uchar ValueOf(const byte* str, |
| 137 unsigned length, | 178 unsigned length, |
| 138 unsigned* cursor); | 179 unsigned* cursor); |
| 139 }; | 180 }; |
| 140 | 181 |
| 141 // --- C h a r a c t e r S t r e a m --- | 182 // --- C h a r a c t e r S t r e a m --- |
| 142 | 183 |
| 143 class CharacterStream { | 184 class CharacterStream { |
| 144 public: | 185 public: |
| 145 inline uchar GetNext(); | 186 inline uchar GetNext(); |
| 146 inline bool has_more() { return remaining_ != 0; } | 187 inline bool has_more() { return remaining_ != 0; } |
| 147 // Note that default implementation is not efficient. | 188 // Note that default implementation is not efficient. |
| 148 virtual void Seek(unsigned); | 189 virtual void Seek(unsigned); |
| 149 unsigned Length(); | 190 unsigned Length(); |
| 191 unsigned Utf16Length(); | |
| 150 virtual ~CharacterStream() { } | 192 virtual ~CharacterStream() { } |
| 151 static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity, | 193 static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity, |
| 152 unsigned& offset); | 194 unsigned& offset); |
| 153 static inline bool EncodeAsciiCharacter(uchar c, byte* buffer, | 195 static inline bool EncodeAsciiCharacter(uchar c, byte* buffer, |
| 154 unsigned capacity, unsigned& offset); | 196 unsigned capacity, unsigned& offset); |
| 155 static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer, | 197 static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer, |
| 156 unsigned capacity, unsigned& offset); | 198 unsigned capacity, unsigned& offset); |
| 157 static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset); | 199 static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset); |
| 158 virtual void Rewind() = 0; | 200 virtual void Rewind() = 0; |
| 201 | |
| 159 protected: | 202 protected: |
| 160 virtual void FillBuffer() = 0; | 203 virtual void FillBuffer() = 0; |
| 161 // The number of characters left in the current buffer | 204 // The number of characters left in the current buffer |
| 162 unsigned remaining_; | 205 unsigned remaining_; |
| 163 // The current offset within the buffer | 206 // The current offset within the buffer |
| 164 unsigned cursor_; | 207 unsigned cursor_; |
| 165 // The buffer containing the decoded characters. | 208 // The buffer containing the decoded characters. |
| 166 const byte* buffer_; | 209 const byte* buffer_; |
| 167 }; | 210 }; |
| 168 | 211 |
| (...skipping 102 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading... | |
| 271 static const int kMaxWidth = 1; | 314 static const int kMaxWidth = 1; |
| 272 static int Convert(uchar c, | 315 static int Convert(uchar c, |
| 273 uchar n, | 316 uchar n, |
| 274 uchar* result, | 317 uchar* result, |
| 275 bool* allow_caching_ptr); | 318 bool* allow_caching_ptr); |
| 276 }; | 319 }; |
| 277 | 320 |
| 278 } // namespace unibrow | 321 } // namespace unibrow |
| 279 | 322 |
| 280 #endif // V8_UNICODE_H_ | 323 #endif // V8_UNICODE_H_ |
| OLD | NEW |