Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(222)

Side by Side Diff: src/unicode.h

Issue 11649018: Remove Utf8InputBuffer (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge
Patch Set: Created 8 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright 2011 the V8 project authors. All rights reserved. 1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 11 matching lines...) Expand all
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 27
28 #ifndef V8_UNICODE_H_ 28 #ifndef V8_UNICODE_H_
29 #define V8_UNICODE_H_ 29 #define V8_UNICODE_H_
30 30
31 #include <sys/types.h> 31 #include <sys/types.h>
32 32 #include <stdint.h>
33 #include <globals.h>
33 /** 34 /**
34 * \file 35 * \file
35 * Definitions and convenience functions for working with unicode. 36 * Definitions and convenience functions for working with unicode.
36 */ 37 */
37 38
38 namespace unibrow { 39 namespace unibrow {
39 40
40 typedef unsigned int uchar; 41 typedef unsigned int uchar;
41 typedef unsigned char byte; 42 typedef unsigned char byte;
42 43
(...skipping 90 matching lines...) Expand 10 before | Expand all | Expand 10 after
133 static const uchar kMaxNonSurrogateCharCode = 0xffff; 134 static const uchar kMaxNonSurrogateCharCode = 0xffff;
134 // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes 135 // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
135 // of UTF-8 data. The special case where the unit is a surrogate 136 // of UTF-8 data. The special case where the unit is a surrogate
136 // trail produces 1 byte net, because the encoding of the pair is 137 // trail produces 1 byte net, because the encoding of the pair is
137 // 4 bytes and the 3 bytes that were used to encode the lead surrogate 138 // 4 bytes and the 3 bytes that were used to encode the lead surrogate
138 // can be reclaimed. 139 // can be reclaimed.
139 static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3; 140 static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;
140 // One UTF-16 surrogate is encoded (illegally) as 3 UTF-8 bytes. 141 // One UTF-16 surrogate is encoded (illegally) as 3 UTF-8 bytes.
141 // The illegality stems from the surrogate not being part of a pair. 142 // The illegality stems from the surrogate not being part of a pair.
142 static const int kUtf8BytesToCodeASurrogate = 3; 143 static const int kUtf8BytesToCodeASurrogate = 3;
143 static inline uchar LeadSurrogate(int char_code) { 144 static inline uint16_t LeadSurrogate(uint32_t char_code) {
144 return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff); 145 return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
145 } 146 }
146 static inline uchar TrailSurrogate(int char_code) { 147 static inline uint16_t TrailSurrogate(uint32_t char_code) {
147 return 0xdc00 + (char_code & 0x3ff); 148 return 0xdc00 + (char_code & 0x3ff);
148 } 149 }
149 }; 150 };
150 151
151 152
152 class Utf8 { 153 class Utf8 {
153 public: 154 public:
154 static inline uchar Length(uchar chr, int previous); 155 static inline uchar Length(uchar chr, int previous);
155 static inline unsigned Encode( 156 static inline unsigned Encode(
156 char* out, uchar c, int previous); 157 char* out, uchar c, int previous);
157 static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
158 unsigned capacity, unsigned* chars_read, unsigned* offset);
159 static uchar CalculateValue(const byte* str, 158 static uchar CalculateValue(const byte* str,
160 unsigned length, 159 unsigned length,
161 unsigned* cursor); 160 unsigned* cursor);
162 static const uchar kBadChar = 0xFFFD; 161 static const uchar kBadChar = 0xFFFD;
163 static const unsigned kMaxEncodedSize = 4; 162 static const unsigned kMaxEncodedSize = 4;
164 static const unsigned kMaxOneByteChar = 0x7f; 163 static const unsigned kMaxOneByteChar = 0x7f;
165 static const unsigned kMaxTwoByteChar = 0x7ff; 164 static const unsigned kMaxTwoByteChar = 0x7ff;
166 static const unsigned kMaxThreeByteChar = 0xffff; 165 static const unsigned kMaxThreeByteChar = 0xffff;
167 static const unsigned kMaxFourByteChar = 0x1fffff; 166 static const unsigned kMaxFourByteChar = 0x1fffff;
168 167
(...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after
234 unsigned offset_; 233 unsigned offset_;
235 // The input string 234 // The input string
236 Input input_; 235 Input input_;
237 // To avoid heap allocation, we keep an internal buffer to which 236 // To avoid heap allocation, we keep an internal buffer to which
238 // the encoded string can write its characters. The string 237 // the encoded string can write its characters. The string
239 // implementation is free to decide whether it wants to use this 238 // implementation is free to decide whether it wants to use this
240 // buffer or not. 239 // buffer or not.
241 byte util_buffer_[kSize]; 240 byte util_buffer_[kSize];
242 }; 241 };
243 242
244 // --- U t f 8 I n p u t B u f f e r ---
245 243
246 template <unsigned s = 256> 244 class Utf8DecoderBase {
247 class Utf8InputBuffer : public InputBuffer<Utf8, Buffer<const char*>, s> {
248 public: 245 public:
249 inline Utf8InputBuffer() { } 246 // Initialization done in subclass.
250 inline Utf8InputBuffer(const char* data, unsigned length); 247 inline Utf8DecoderBase();
251 inline void Reset(const char* data, unsigned length) { 248 inline Utf8DecoderBase(uint16_t* buffer,
252 InputBuffer<Utf8, Buffer<const char*>, s>::Reset( 249 unsigned buffer_length,
253 Buffer<const char*>(data, length)); 250 const uint8_t* stream,
254 } 251 unsigned stream_length);
252 inline unsigned Utf16Length() const { return utf16_length_; }
253 protected:
254 // This reads all characters and sets the utf16_length_.
255 // The first buffer_length utf16 chars are cached in the buffer.
256 void Reset(uint16_t* buffer,
257 unsigned buffer_length,
258 const uint8_t* stream,
259 unsigned stream_length);
260 static void WriteUtf16Slow(const uint8_t* stream,
261 uint16_t* data,
262 unsigned length);
263 const uint8_t* unbuffered_start_;
264 unsigned utf16_length_;
265 bool last_byte_of_buffer_unused_;
266 private:
267 DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
268 };
269
270 template <unsigned kBufferSize>
271 class Utf8Decoder : public Utf8DecoderBase {
272 public:
273 inline Utf8Decoder() {}
274 inline Utf8Decoder(const char* stream, unsigned length);
275 inline void Reset(const char* stream, unsigned length);
276 inline unsigned WriteUtf16(uint16_t* data, unsigned length) const;
277 private:
278 uint16_t buffer_[kBufferSize];
255 }; 279 };
256 280
257 281
258 struct Uppercase { 282 struct Uppercase {
259 static bool Is(uchar c); 283 static bool Is(uchar c);
260 }; 284 };
261 struct Lowercase { 285 struct Lowercase {
262 static bool Is(uchar c); 286 static bool Is(uchar c);
263 }; 287 };
264 struct Letter { 288 struct Letter {
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after
314 static const int kMaxWidth = 1; 338 static const int kMaxWidth = 1;
315 static int Convert(uchar c, 339 static int Convert(uchar c,
316 uchar n, 340 uchar n,
317 uchar* result, 341 uchar* result,
318 bool* allow_caching_ptr); 342 bool* allow_caching_ptr);
319 }; 343 };
320 344
321 } // namespace unibrow 345 } // namespace unibrow
322 346
323 #endif // V8_UNICODE_H_ 347 #endif // V8_UNICODE_H_
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698