OLD | NEW |
---|---|
1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. |
2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
4 // met: | 4 // met: |
5 // | 5 // |
6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
(...skipping 96 matching lines...) | |
107 public: | 107 public: |
108 inline Buffer(Data data, unsigned length) : data_(data), length_(length) { } | 108 inline Buffer(Data data, unsigned length) : data_(data), length_(length) { } |
109 inline Buffer() : data_(0), length_(0) { } | 109 inline Buffer() : data_(0), length_(0) { } |
110 Data data() { return data_; } | 110 Data data() { return data_; } |
111 unsigned length() { return length_; } | 111 unsigned length() { return length_; } |
112 private: | 112 private: |
113 Data data_; | 113 Data data_; |
114 unsigned length_; | 114 unsigned length_; |
115 }; | 115 }; |
116 | 116 |
117 | |
118 class Utf16 { | |
rossberg
2012/03/07 13:32:47
Nit: this doesn't quite fit into the above Utf8 section… [comment truncated in this view]
Erik Corry
2012/03/11 19:29:22
Done.
| |
119 public: | |
120 static inline bool IsLeadSurrogate(uchar code) { | |
121 return (code & 0xfc00) == 0xd800; | |
122 } | |
123 static inline bool IsTrailSurrogate(uchar code) { | |
124 return (code & 0xfc00) == 0xdc00; | |
125 } | |
126 static inline int CombineSurrogatePair(uchar lead, uchar trail) { | |
rossberg
2012/03/07 13:32:47
Isn't int32_t more accurate as result type?
Erik Corry
2012/03/11 19:29:22
Done.
| |
127 return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff); | |
128 } | |
129 static const uchar kMaxNonSurrogateCharCode = 0xffff; | |
130 static inline uchar LeadSurrogate(int char_code) { | |
rossberg
2012/03/07 13:32:47
Similar here (and below), isn't char_code an int32_t? [comment truncated in this view]
Erik Corry
2012/03/11 19:29:22
Done.
| |
131 return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff); | |
132 } | |
133 static inline uchar TrailSurrogate(int char_code) { | |
134 return 0xdc00 + (char_code & 0x3ff); | |
135 } | |
136 }; | |
137 | |
138 | |
117 class Utf8 { | 139 class Utf8 { |
118 public: | 140 public: |
119 static inline uchar Length(uchar chr); | 141 static inline uchar Length(uchar chr, int previous); |
120 static inline unsigned Encode(char* out, uchar c); | 142 static inline unsigned Encode( |
143 char* out, uchar c, int previous); | |
121 static const byte* ReadBlock(Buffer<const char*> str, byte* buffer, | 144 static const byte* ReadBlock(Buffer<const char*> str, byte* buffer, |
122 unsigned capacity, unsigned* chars_read, unsigned* offset); | 145 unsigned capacity, unsigned* chars_read, unsigned* offset); |
123 static uchar CalculateValue(const byte* str, | 146 static uchar CalculateValue(const byte* str, |
124 unsigned length, | 147 unsigned length, |
125 unsigned* cursor); | 148 unsigned* cursor); |
126 static const uchar kBadChar = 0xFFFD; | 149 static const uchar kBadChar = 0xFFFD; |
127 static const unsigned kMaxEncodedSize = 4; | 150 static const unsigned kMaxEncodedSize = 4; |
128 static const unsigned kMaxOneByteChar = 0x7f; | 151 static const unsigned kMaxOneByteChar = 0x7f; |
129 static const unsigned kMaxTwoByteChar = 0x7ff; | 152 static const unsigned kMaxTwoByteChar = 0x7ff; |
130 static const unsigned kMaxThreeByteChar = 0xffff; | 153 static const unsigned kMaxThreeByteChar = 0xffff; |
131 static const unsigned kMaxFourByteChar = 0x1fffff; | 154 static const unsigned kMaxFourByteChar = 0x1fffff; |
132 | 155 |
156 static const int kNoPreviousCharacter = -1; | |
157 | |
158 // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together | |
159 // that match are coded as a 4 byte UTF-8 sequence. | |
160 static const unsigned kBytesSavedByCombiningSurrogates = 2; | |
161 static const unsigned kSizeOfUnmatchedSurrogate = 3; | |
162 | |
133 private: | 163 private: |
134 template <unsigned s> friend class Utf8InputBuffer; | 164 template <unsigned s> friend class Utf8InputBuffer; |
135 friend class Test; | 165 friend class Test; |
136 static inline uchar ValueOf(const byte* str, | 166 static inline uchar ValueOf(const byte* str, |
137 unsigned length, | 167 unsigned length, |
138 unsigned* cursor); | 168 unsigned* cursor); |
139 }; | 169 }; |
140 | 170 |
141 // --- C h a r a c t e r S t r e a m --- | 171 // --- C h a r a c t e r S t r e a m --- |
142 | 172 |
143 class CharacterStream { | 173 class CharacterStream { |
144 public: | 174 public: |
145 inline uchar GetNext(); | 175 inline uchar GetNext(); |
146 inline bool has_more() { return remaining_ != 0; } | 176 inline bool has_more() { return remaining_ != 0; } |
147 // Note that default implementation is not efficient. | 177 // Note that default implementation is not efficient. |
148 virtual void Seek(unsigned); | 178 virtual void Seek(unsigned); |
149 unsigned Length(); | 179 unsigned Length(); |
180 unsigned Utf16Length(); | |
150 virtual ~CharacterStream() { } | 181 virtual ~CharacterStream() { } |
151 static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity, | 182 static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity, |
152 unsigned& offset); | 183 unsigned& offset); |
153 static inline bool EncodeAsciiCharacter(uchar c, byte* buffer, | 184 static inline bool EncodeAsciiCharacter(uchar c, byte* buffer, |
154 unsigned capacity, unsigned& offset); | 185 unsigned capacity, unsigned& offset); |
155 static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer, | 186 static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer, |
156 unsigned capacity, unsigned& offset); | 187 unsigned capacity, unsigned& offset); |
157 static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset); | 188 static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset); |
158 virtual void Rewind() = 0; | 189 virtual void Rewind() = 0; |
190 | |
159 protected: | 191 protected: |
160 virtual void FillBuffer() = 0; | 192 virtual void FillBuffer() = 0; |
161 // The number of characters left in the current buffer | 193 // The number of characters left in the current buffer |
162 unsigned remaining_; | 194 unsigned remaining_; |
163 // The current offset within the buffer | 195 // The current offset within the buffer |
164 unsigned cursor_; | 196 unsigned cursor_; |
165 // The buffer containing the decoded characters. | 197 // The buffer containing the decoded characters. |
166 const byte* buffer_; | 198 const byte* buffer_; |
167 }; | 199 }; |
168 | 200 |
(...skipping 102 matching lines...) | |
271 static const int kMaxWidth = 1; | 303 static const int kMaxWidth = 1; |
272 static int Convert(uchar c, | 304 static int Convert(uchar c, |
273 uchar n, | 305 uchar n, |
274 uchar* result, | 306 uchar* result, |
275 bool* allow_caching_ptr); | 307 bool* allow_caching_ptr); |
276 }; | 308 }; |
277 | 309 |
278 } // namespace unibrow | 310 } // namespace unibrow |
279 | 311 |
280 #endif // V8_UNICODE_H_ | 312 #endif // V8_UNICODE_H_ |
OLD | NEW |