src/unicode.h - Issue 11649018: Remove Utf8InputBuffer

Side by Side Diff: src/unicode.h

Issue 11649018: Remove Utf8InputBuffer (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge

Patch Set: Created 8 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2011 the V8 project authors. All rights reserved.	1 // Copyright 2011 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 11 matching lines...) Expand all Loading...
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,	22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY	23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT	24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE	25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.	26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

27	27

28 #ifndef V8_UNICODE_H_	28 #ifndef V8_UNICODE_H_

29 #define V8_UNICODE_H_	29 #define V8_UNICODE_H_

30	30

31 #include <sys/types.h>	31 #include <sys/types.h>

32	32 #include <stdint.h>

	33 #include <globals.h>

33 /**	34 /**

34 * \file	35 * \file

35 * Definitions and convenience functions for working with unicode.	36 * Definitions and convenience functions for working with unicode.

36 */	37 */

37	38

38 namespace unibrow {	39 namespace unibrow {

39	40

40 typedef unsigned int uchar;	41 typedef unsigned int uchar;

41 typedef unsigned char byte;	42 typedef unsigned char byte;

42	43

(...skipping 90 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
133 static const uchar kMaxNonSurrogateCharCode = 0xffff;	134 static const uchar kMaxNonSurrogateCharCode = 0xffff;

134 // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes	135 // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes

135 // of UTF-8 data. The special case where the unit is a surrogate	136 // of UTF-8 data. The special case where the unit is a surrogate

136 // trail produces 1 byte net, because the encoding of the pair is	137 // trail produces 1 byte net, because the encoding of the pair is

137 // 4 bytes and the 3 bytes that were used to encode the lead surrogate	138 // 4 bytes and the 3 bytes that were used to encode the lead surrogate

138 // can be reclaimed.	139 // can be reclaimed.

139 static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;	140 static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;

140 // One UTF-16 surrogate is encoded (illegally) as 3 UTF-8 bytes.	141 // One UTF-16 surrogate is encoded (illegally) as 3 UTF-8 bytes.

141 // The illegality stems from the surrogate not being part of a pair.	142 // The illegality stems from the surrogate not being part of a pair.

142 static const int kUtf8BytesToCodeASurrogate = 3;	143 static const int kUtf8BytesToCodeASurrogate = 3;

143 static inline uchar LeadSurrogate(int char_code) {	144 static inline uint16_t LeadSurrogate(uint32_t char_code) {

144 return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);	145 return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);

145 }	146 }

146 static inline uchar TrailSurrogate(int char_code) {	147 static inline uint16_t TrailSurrogate(uint32_t char_code) {

147 return 0xdc00 + (char_code & 0x3ff);	148 return 0xdc00 + (char_code & 0x3ff);

148 }	149 }

149 };	150 };

150	151

151	152

152 class Utf8 {	153 class Utf8 {

153 public:	154 public:

154 static inline uchar Length(uchar chr, int previous);	155 static inline uchar Length(uchar chr, int previous);

155 static inline unsigned Encode(	156 static inline unsigned Encode(

156 char* out, uchar c, int previous);	157 char* out, uchar c, int previous);

157 static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,

158 unsigned capacity, unsigned* chars_read, unsigned* offset);

159 static uchar CalculateValue(const byte* str,	158 static uchar CalculateValue(const byte* str,

160 unsigned length,	159 unsigned length,

161 unsigned* cursor);	160 unsigned* cursor);

162 static const uchar kBadChar = 0xFFFD;	161 static const uchar kBadChar = 0xFFFD;

163 static const unsigned kMaxEncodedSize = 4;	162 static const unsigned kMaxEncodedSize = 4;

164 static const unsigned kMaxOneByteChar = 0x7f;	163 static const unsigned kMaxOneByteChar = 0x7f;

165 static const unsigned kMaxTwoByteChar = 0x7ff;	164 static const unsigned kMaxTwoByteChar = 0x7ff;

166 static const unsigned kMaxThreeByteChar = 0xffff;	165 static const unsigned kMaxThreeByteChar = 0xffff;

167 static const unsigned kMaxFourByteChar = 0x1fffff;	166 static const unsigned kMaxFourByteChar = 0x1fffff;

168	167

(...skipping 65 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
234 unsigned offset_;	233 unsigned offset_;

235 // The input string	234 // The input string

236 Input input_;	235 Input input_;

237 // To avoid heap allocation, we keep an internal buffer to which	236 // To avoid heap allocation, we keep an internal buffer to which

238 // the encoded string can write its characters. The string	237 // the encoded string can write its characters. The string

239 // implementation is free to decide whether it wants to use this	238 // implementation is free to decide whether it wants to use this

240 // buffer or not.	239 // buffer or not.

241 byte util_buffer_[kSize];	240 byte util_buffer_[kSize];

242 };	241 };

243	242

244 // --- U t f 8 I n p u t B u f f e r ---

245	243

246 template <unsigned s = 256>	244 class Utf8DecoderBase {

247 class Utf8InputBuffer : public InputBuffer<Utf8, Buffer<const char*>, s> {

248 public:	245 public:

249 inline Utf8InputBuffer() { }	246 // Initialization done in subclass.

250 inline Utf8InputBuffer(const char* data, unsigned length);	247 inline Utf8DecoderBase();

251 inline void Reset(const char* data, unsigned length) {	248 inline Utf8DecoderBase(uint16_t* buffer,

252 InputBuffer<Utf8, Buffer<const char*>, s>::Reset(	249 unsigned buffer_length,

253 Buffer<const char*>(data, length));	250 const uint8_t* stream,

254 }	251 unsigned stream_length);

	252 inline unsigned Utf16Length() const { return utf16_length_; }

	253 protected:

	254 // This reads all characters and sets the utf16_length_.

	255 // The first buffer_length utf16 chars are cached in the buffer.

	256 void Reset(uint16_t* buffer,

	257 unsigned buffer_length,

	258 const uint8_t* stream,

	259 unsigned stream_length);

	260 static void WriteUtf16Slow(const uint8_t* stream,

	261 uint16_t* data,

	262 unsigned length);

	263 const uint8_t* unbuffered_start_;

	264 unsigned utf16_length_;

	265 bool last_byte_of_buffer_unused_;

	266 private:

	267 DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);

	268 };

	269

	270 template <unsigned kBufferSize>

	271 class Utf8Decoder : public Utf8DecoderBase {

	272 public:

	273 inline Utf8Decoder() {}

	274 inline Utf8Decoder(const char* stream, unsigned length);

	275 inline void Reset(const char* stream, unsigned length);

	276 inline unsigned WriteUtf16(uint16_t* data, unsigned length) const;

	277 private:

	278 uint16_t buffer_[kBufferSize];

255 };	279 };

256	280

257	281

258 struct Uppercase {	282 struct Uppercase {

259 static bool Is(uchar c);	283 static bool Is(uchar c);

260 };	284 };

261 struct Lowercase {	285 struct Lowercase {

262 static bool Is(uchar c);	286 static bool Is(uchar c);

263 };	287 };

264 struct Letter {	288 struct Letter {

(...skipping 49 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
314 static const int kMaxWidth = 1;	338 static const int kMaxWidth = 1;

315 static int Convert(uchar c,	339 static int Convert(uchar c,

316 uchar n,	340 uchar n,

317 uchar* result,	341 uchar* result,

318 bool* allow_caching_ptr);	342 bool* allow_caching_ptr);

319 };	343 };

320	344

321 } // namespace unibrow	345 } // namespace unibrow

322	346

323 #endif // V8_UNICODE_H_	347 #endif // V8_UNICODE_H_

OLD	NEW

« src/objects.cc ('K') | « src/scanner.h ('k') | src/unicode.cc » ('j') | src/unicode.cc » ('J')