OLD | NEW |
---|---|
1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. |
2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
4 // met: | 4 // met: |
5 // | 5 // |
6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
(...skipping 82 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
93 CacheEntry entries_[kSize]; | 93 CacheEntry entries_[kSize]; |
94 }; | 94 }; |
95 | 95 |
96 class UnicodeData { | 96 class UnicodeData { |
97 private: | 97 private: |
98 friend class Test; | 98 friend class Test; |
99 static int GetByteCount(); | 99 static int GetByteCount(); |
100 static const uchar kMaxCodePoint; | 100 static const uchar kMaxCodePoint; |
101 }; | 101 }; |
102 | 102 |
103 // --- U t f 8 --- | 103 // --- U t f 8 a n d 16 --- |
104 | 104 |
105 template <typename Data> | 105 template <typename Data> |
106 class Buffer { | 106 class Buffer { |
107 public: | 107 public: |
108 inline Buffer(Data data, unsigned length) : data_(data), length_(length) { } | 108 inline Buffer(Data data, unsigned length) : data_(data), length_(length) { } |
109 inline Buffer() : data_(0), length_(0) { } | 109 inline Buffer() : data_(0), length_(0) { } |
110 Data data() { return data_; } | 110 Data data() { return data_; } |
111 unsigned length() { return length_; } | 111 unsigned length() { return length_; } |
112 private: | 112 private: |
113 Data data_; | 113 Data data_; |
114 unsigned length_; | 114 unsigned length_; |
115 }; | 115 }; |
116 | 116 |
117 | |
118 class Utf16 { | |
119 public: | |
120 static inline bool IsLeadSurrogate(int32_t code) { | |
121 if (code == kNoPreviousCharacter) return false; | |
rossberg
2012/03/12 10:55:05
I still think this is implied by the bit masking b
Erik Corry
2012/03/12 12:34:10
Yes, I think that would be too implicit.
| |
122 return (code & 0xfc00) == 0xd800; | |
123 } | |
124 static inline bool IsTrailSurrogate(int32_t code) { | |
125 if (code == kNoPreviousCharacter) return false; | |
126 return (code & 0xfc00) == 0xdc00; | |
127 } | |
128 | |
129 static inline int32_t CombineSurrogatePair(uchar lead, uchar trail) { | |
130 return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff); | |
131 } | |
132 static const int kNoPreviousCharacter = -1; | |
rossberg
2012/03/12 10:55:05
int32_t?
Erik Corry
2012/03/12 12:34:10
Done.
| |
133 static const uchar kMaxNonSurrogateCharCode = 0xffff; | |
134 // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes | |
135 // of UTF-8 data. The special case where the unit is a surrogate | |
136 // trail produces 1 byte net, because the encoding of the pair is | |
137 // 4 bytes and the 3 bytes that were used to encode the lead surrogate | |
138 // can be reclaimed. | |
139 static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3; | |
140 // One UTF-16 surrogate is encoded (illegally) as 3 UTF-8 bytes. | |
141 // The illegality stems from the surrogate not being part of a pair. | |
142 static const int kUtf8BytesToCodeASurrogate = 3; | |
143 static inline uchar LeadSurrogate(int32_t char_code) { | |
144 return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff); | |
145 } | |
146 static inline uchar TrailSurrogate(int32_t char_code) { | |
147 return 0xdc00 + (char_code & 0x3ff); | |
148 } | |
149 }; | |
150 | |
151 | |
117 class Utf8 { | 152 class Utf8 { |
118 public: | 153 public: |
119 static inline uchar Length(uchar chr); | 154 static inline uchar Length(uchar chr, int previous); |
120 static inline unsigned Encode(char* out, uchar c); | 155 static inline unsigned Encode( |
156 char* out, uchar c, int previous); | |
121 static const byte* ReadBlock(Buffer<const char*> str, byte* buffer, | 157 static const byte* ReadBlock(Buffer<const char*> str, byte* buffer, |
122 unsigned capacity, unsigned* chars_read, unsigned* offset); | 158 unsigned capacity, unsigned* chars_read, unsigned* offset); |
123 static uchar CalculateValue(const byte* str, | 159 static uchar CalculateValue(const byte* str, |
124 unsigned length, | 160 unsigned length, |
125 unsigned* cursor); | 161 unsigned* cursor); |
126 static const uchar kBadChar = 0xFFFD; | 162 static const uchar kBadChar = 0xFFFD; |
127 static const unsigned kMaxEncodedSize = 4; | 163 static const unsigned kMaxEncodedSize = 4; |
128 static const unsigned kMaxOneByteChar = 0x7f; | 164 static const unsigned kMaxOneByteChar = 0x7f; |
129 static const unsigned kMaxTwoByteChar = 0x7ff; | 165 static const unsigned kMaxTwoByteChar = 0x7ff; |
130 static const unsigned kMaxThreeByteChar = 0xffff; | 166 static const unsigned kMaxThreeByteChar = 0xffff; |
131 static const unsigned kMaxFourByteChar = 0x1fffff; | 167 static const unsigned kMaxFourByteChar = 0x1fffff; |
132 | 168 |
169 // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together | |
170 // that match are coded as a 4 byte UTF-8 sequence. | |
171 static const unsigned kBytesSavedByCombiningSurrogates = 2; | |
172 static const unsigned kSizeOfUnmatchedSurrogate = 3; | |
173 | |
133 private: | 174 private: |
134 template <unsigned s> friend class Utf8InputBuffer; | 175 template <unsigned s> friend class Utf8InputBuffer; |
135 friend class Test; | 176 friend class Test; |
136 static inline uchar ValueOf(const byte* str, | 177 static inline uchar ValueOf(const byte* str, |
137 unsigned length, | 178 unsigned length, |
138 unsigned* cursor); | 179 unsigned* cursor); |
139 }; | 180 }; |
140 | 181 |
141 // --- C h a r a c t e r S t r e a m --- | 182 // --- C h a r a c t e r S t r e a m --- |
142 | 183 |
143 class CharacterStream { | 184 class CharacterStream { |
144 public: | 185 public: |
145 inline uchar GetNext(); | 186 inline uchar GetNext(); |
146 inline bool has_more() { return remaining_ != 0; } | 187 inline bool has_more() { return remaining_ != 0; } |
147 // Note that the default implementation is not efficient. | 188 // Note that the default implementation is not efficient. |
148 virtual void Seek(unsigned); | 189 virtual void Seek(unsigned); |
149 unsigned Length(); | 190 unsigned Length(); |
191 unsigned Utf16Length(); | |
150 virtual ~CharacterStream() { } | 192 virtual ~CharacterStream() { } |
151 static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity, | 193 static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity, |
152 unsigned& offset); | 194 unsigned& offset); |
153 static inline bool EncodeAsciiCharacter(uchar c, byte* buffer, | 195 static inline bool EncodeAsciiCharacter(uchar c, byte* buffer, |
154 unsigned capacity, unsigned& offset); | 196 unsigned capacity, unsigned& offset); |
155 static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer, | 197 static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer, |
156 unsigned capacity, unsigned& offset); | 198 unsigned capacity, unsigned& offset); |
157 static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset); | 199 static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset); |
158 virtual void Rewind() = 0; | 200 virtual void Rewind() = 0; |
201 | |
159 protected: | 202 protected: |
160 virtual void FillBuffer() = 0; | 203 virtual void FillBuffer() = 0; |
161 // The number of characters left in the current buffer | 204 // The number of characters left in the current buffer |
162 unsigned remaining_; | 205 unsigned remaining_; |
163 // The current offset within the buffer | 206 // The current offset within the buffer |
164 unsigned cursor_; | 207 unsigned cursor_; |
165 // The buffer containing the decoded characters. | 208 // The buffer containing the decoded characters. |
166 const byte* buffer_; | 209 const byte* buffer_; |
167 }; | 210 }; |
168 | 211 |
(...skipping 102 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
271 static const int kMaxWidth = 1; | 314 static const int kMaxWidth = 1; |
272 static int Convert(uchar c, | 315 static int Convert(uchar c, |
273 uchar n, | 316 uchar n, |
274 uchar* result, | 317 uchar* result, |
275 bool* allow_caching_ptr); | 318 bool* allow_caching_ptr); |
276 }; | 319 }; |
277 | 320 |
278 } // namespace unibrow | 321 } // namespace unibrow |
279 | 322 |
280 #endif // V8_UNICODE_H_ | 323 #endif // V8_UNICODE_H_ |
OLD | NEW |