| OLD | NEW |
| 1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. |
| 2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
| 3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
| 4 // met: | 4 // met: |
| 5 // | 5 // |
| 6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
| 7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
| 8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
| 9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
| 10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
| (...skipping 82 matching lines...) | (...skipping 82 matching lines...) |
| 93 CacheEntry entries_[kSize]; | 93 CacheEntry entries_[kSize]; |
| 94 }; | 94 }; |
| 95 | 95 |
| 96 class UnicodeData { | 96 class UnicodeData { |
| 97 private: | 97 private: |
| 98 friend class Test; | 98 friend class Test; |
| 99 static int GetByteCount(); | 99 static int GetByteCount(); |
| 100 static const uchar kMaxCodePoint; | 100 static const uchar kMaxCodePoint; |
| 101 }; | 101 }; |
| 102 | 102 |
| 103 // --- U t f 8 a n d 16 --- | |
| 104 | |
| 105 template <typename Data> | |
| 106 class Buffer { | |
| 107 public: | |
| 108 inline Buffer(Data data, unsigned length) : data_(data), length_(length) { } | |
| 109 inline Buffer() : data_(0), length_(0) { } | |
| 110 Data data() { return data_; } | |
| 111 unsigned length() { return length_; } | |
| 112 private: | |
| 113 Data data_; | |
| 114 unsigned length_; | |
| 115 }; | |
| 116 | |
| 117 | |
| 118 class Utf16 { | 103 class Utf16 { |
| 119 public: | 104 public: |
| 120 static inline bool IsLeadSurrogate(int code) { | 105 static inline bool IsLeadSurrogate(int code) { |
| 121 if (code == kNoPreviousCharacter) return false; | 106 if (code == kNoPreviousCharacter) return false; |
| 122 return (code & 0xfc00) == 0xd800; | 107 return (code & 0xfc00) == 0xd800; |
| 123 } | 108 } |
| 124 static inline bool IsTrailSurrogate(int code) { | 109 static inline bool IsTrailSurrogate(int code) { |
| 125 if (code == kNoPreviousCharacter) return false; | 110 if (code == kNoPreviousCharacter) return false; |
| 126 return (code & 0xfc00) == 0xdc00; | 111 return (code & 0xfc00) == 0xdc00; |
| 127 } | 112 } |
| (...skipping 38 matching lines...) | (...skipping 38 matching lines...) |
| 166 | 151 |
| 167 // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together | 152 // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together |
| 168 // that match are coded as a 4 byte UTF-8 sequence. | 153 // that match are coded as a 4 byte UTF-8 sequence. |
| 169 static const unsigned kBytesSavedByCombiningSurrogates = 2; | 154 static const unsigned kBytesSavedByCombiningSurrogates = 2; |
| 170 static const unsigned kSizeOfUnmatchedSurrogate = 3; | 155 static const unsigned kSizeOfUnmatchedSurrogate = 3; |
| 171 static inline uchar ValueOf(const byte* str, | 156 static inline uchar ValueOf(const byte* str, |
| 172 unsigned length, | 157 unsigned length, |
| 173 unsigned* cursor); | 158 unsigned* cursor); |
| 174 }; | 159 }; |
| 175 | 160 |
| 176 // --- C h a r a c t e r S t r e a m --- | |
| 177 | |
| 178 class CharacterStream { | |
| 179 public: | |
| 180 inline uchar GetNext(); | |
| 181 inline bool has_more() { return remaining_ != 0; } | |
| 182 // Note that default implementation is not efficient. | |
| 183 virtual void Seek(unsigned); | |
| 184 unsigned Length(); | |
| 185 unsigned Utf16Length(); | |
| 186 virtual ~CharacterStream() { } | |
| 187 static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity, | |
| 188 unsigned& offset); | |
| 189 static inline bool EncodeAsciiCharacter(uchar c, byte* buffer, | |
| 190 unsigned capacity, unsigned& offset); | |
| 191 static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer, | |
| 192 unsigned capacity, unsigned& offset); | |
| 193 static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset); | |
| 194 virtual void Rewind() = 0; | |
| 195 | |
| 196 protected: | |
| 197 virtual void FillBuffer() = 0; | |
| 198 virtual bool BoundsCheck(unsigned offset) = 0; | |
| 199 // The number of characters left in the current buffer | |
| 200 unsigned remaining_; | |
| 201 // The current offset within the buffer | |
| 202 unsigned cursor_; | |
| 203 // The buffer containing the decoded characters. | |
| 204 const byte* buffer_; | |
| 205 }; | |
| 206 | |
| 207 // --- I n p u t B u f f e r --- | |
| 208 | |
| 209 /** | |
| 210 * Provides efficient access to encoded characters in strings. It | |
| 211 * does so by reading characters one block at a time, rather than one | |
| 212 * character at a time, which gives string implementations an | |
| 213 * opportunity to optimize the decoding. | |
| 214 */ | |
| 215 template <class Reader, class Input = Reader*, unsigned kSize = 256> | |
| 216 class InputBuffer : public CharacterStream { | |
| 217 public: | |
| 218 virtual void Rewind(); | |
| 219 inline void Reset(Input input); | |
| 220 void Seek(unsigned position); | |
| 221 inline void Reset(unsigned position, Input input); | |
| 222 protected: | |
| 223 InputBuffer() { } | |
| 224 explicit InputBuffer(Input input) { Reset(input); } | |
| 225 virtual void FillBuffer(); | |
| 226 virtual bool BoundsCheck(unsigned offset) { | |
| 227 return (buffer_ != util_buffer_) || (offset < kSize); | |
| 228 } | |
| 229 | |
| 230 // A custom offset that can be used by the string implementation to | |
| 231 // mark progress within the encoded string. | |
| 232 unsigned offset_; | |
| 233 // The input string | |
| 234 Input input_; | |
| 235 // To avoid heap allocation, we keep an internal buffer to which | |
| 236 // the encoded string can write its characters. The string | |
| 237 // implementation is free to decide whether it wants to use this | |
| 238 // buffer or not. | |
| 239 byte util_buffer_[kSize]; | |
| 240 }; | |
| 241 | |
| 242 | 161 |
| 243 class Utf8DecoderBase { | 162 class Utf8DecoderBase { |
| 244 public: | 163 public: |
| 245 // Initialization done in subclass. | 164 // Initialization done in subclass. |
| 246 inline Utf8DecoderBase(); | 165 inline Utf8DecoderBase(); |
| 247 inline Utf8DecoderBase(uint16_t* buffer, | 166 inline Utf8DecoderBase(uint16_t* buffer, |
| 248 unsigned buffer_length, | 167 unsigned buffer_length, |
| 249 const uint8_t* stream, | 168 const uint8_t* stream, |
| 250 unsigned stream_length); | 169 unsigned stream_length); |
| 251 inline unsigned Utf16Length() const { return utf16_length_; } | 170 inline unsigned Utf16Length() const { return utf16_length_; } |
| (...skipping 85 matching lines...) | (...skipping 85 matching lines...) |
| 337 static const int kMaxWidth = 1; | 256 static const int kMaxWidth = 1; |
| 338 static int Convert(uchar c, | 257 static int Convert(uchar c, |
| 339 uchar n, | 258 uchar n, |
| 340 uchar* result, | 259 uchar* result, |
| 341 bool* allow_caching_ptr); | 260 bool* allow_caching_ptr); |
| 342 }; | 261 }; |
| 343 | 262 |
| 344 } // namespace unibrow | 263 } // namespace unibrow |
| 345 | 264 |
| 346 #endif // V8_UNICODE_H_ | 265 #endif // V8_UNICODE_H_ |
| OLD | NEW |