src/unicode.h - Issue 11727004: Remove InputBuffer

Side by Side Diff: src/unicode.h

Issue 11727004: Remove InputBuffer (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge

Patch Set: Created 7 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2011 the V8 project authors. All rights reserved.	1 // Copyright 2011 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 82 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
93 CacheEntry entries_[kSize];	93 CacheEntry entries_[kSize];

94 };	94 };

95	95

96 class UnicodeData {	96 class UnicodeData {

97 private:	97 private:

98 friend class Test;	98 friend class Test;

99 static int GetByteCount();	99 static int GetByteCount();

100 static const uchar kMaxCodePoint;	100 static const uchar kMaxCodePoint;

101 };	101 };

102	102

103 // --- U t f 8 a n d 16 ---

104

105 template <typename Data>

106 class Buffer {

107 public:

108 inline Buffer(Data data, unsigned length) : data_(data), length_(length) { }

109 inline Buffer() : data_(0), length_(0) { }

110 Data data() { return data_; }

111 unsigned length() { return length_; }

112 private:

113 Data data_;

114 unsigned length_;

115 };

116

117

118 class Utf16 {	103 class Utf16 {

119 public:	104 public:

120 static inline bool IsLeadSurrogate(int code) {	105 static inline bool IsLeadSurrogate(int code) {

121 if (code == kNoPreviousCharacter) return false;	106 if (code == kNoPreviousCharacter) return false;

122 return (code & 0xfc00) == 0xd800;	107 return (code & 0xfc00) == 0xd800;

123 }	108 }

124 static inline bool IsTrailSurrogate(int code) {	109 static inline bool IsTrailSurrogate(int code) {

125 if (code == kNoPreviousCharacter) return false;	110 if (code == kNoPreviousCharacter) return false;

126 return (code & 0xfc00) == 0xdc00;	111 return (code & 0xfc00) == 0xdc00;

127 }	112 }

(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
166	151

167 // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together	152 // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together

168 // that match are coded as a 4 byte UTF-8 sequence.	153 // that match are coded as a 4 byte UTF-8 sequence.

169 static const unsigned kBytesSavedByCombiningSurrogates = 2;	154 static const unsigned kBytesSavedByCombiningSurrogates = 2;

170 static const unsigned kSizeOfUnmatchedSurrogate = 3;	155 static const unsigned kSizeOfUnmatchedSurrogate = 3;

171 static inline uchar ValueOf(const byte* str,	156 static inline uchar ValueOf(const byte* str,

172 unsigned length,	157 unsigned length,

173 unsigned* cursor);	158 unsigned* cursor);

174 };	159 };

175	160

176 // --- C h a r a c t e r S t r e a m ---

177

178 class CharacterStream {

179 public:

180 inline uchar GetNext();

181 inline bool has_more() { return remaining_ != 0; }

182 // Note that default implementation is not efficient.

183 virtual void Seek(unsigned);

184 unsigned Length();

185 unsigned Utf16Length();

186 virtual ~CharacterStream() { }

187 static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity,

188 unsigned& offset);

189 static inline bool EncodeAsciiCharacter(uchar c, byte* buffer,

190 unsigned capacity, unsigned& offset);

191 static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer,

192 unsigned capacity, unsigned& offset);

193 static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset);

194 virtual void Rewind() = 0;

195

196 protected:

197 virtual void FillBuffer() = 0;

198 virtual bool BoundsCheck(unsigned offset) = 0;

199 // The number of characters left in the current buffer

200 unsigned remaining_;

201 // The current offset within the buffer

202 unsigned cursor_;

203 // The buffer containing the decoded characters.

204 const byte* buffer_;

205 };

206

207 // --- I n p u t B u f f e r ---

208

209 /**

210 * Provides efficient access to encoded characters in strings. It

211 * does so by reading characters one block at a time, rather than one

212 * character at a time, which gives string implementations an

213 * opportunity to optimize the decoding.

214 */

215 template <class Reader, class Input = Reader*, unsigned kSize = 256>

216 class InputBuffer : public CharacterStream {

217 public:

218 virtual void Rewind();

219 inline void Reset(Input input);

220 void Seek(unsigned position);

221 inline void Reset(unsigned position, Input input);

222 protected:

223 InputBuffer() { }

224 explicit InputBuffer(Input input) { Reset(input); }

225 virtual void FillBuffer();

226 virtual bool BoundsCheck(unsigned offset) {

227 return (buffer_ != util_buffer_) || (offset < kSize);

228 }

229

230 // A custom offset that can be used by the string implementation to

231 // mark progress within the encoded string.

232 unsigned offset_;

233 // The input string

234 Input input_;

235 // To avoid heap allocation, we keep an internal buffer to which

236 // the encoded string can write its characters. The string

237 // implementation is free to decide whether it wants to use this

238 // buffer or not.

239 byte util_buffer_[kSize];

240 };

241

242	161

243 class Utf8DecoderBase {	162 class Utf8DecoderBase {

244 public:	163 public:

245 // Initialization done in subclass.	164 // Initialization done in subclass.

246 inline Utf8DecoderBase();	165 inline Utf8DecoderBase();

247 inline Utf8DecoderBase(uint16_t* buffer,	166 inline Utf8DecoderBase(uint16_t* buffer,

248 unsigned buffer_length,	167 unsigned buffer_length,

249 const uint8_t* stream,	168 const uint8_t* stream,

250 unsigned stream_length);	169 unsigned stream_length);

251 inline unsigned Utf16Length() const { return utf16_length_; }	170 inline unsigned Utf16Length() const { return utf16_length_; }

(...skipping 85 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
337 static const int kMaxWidth = 1;	256 static const int kMaxWidth = 1;

338 static int Convert(uchar c,	257 static int Convert(uchar c,

339 uchar n,	258 uchar n,

340 uchar* result,	259 uchar* result,

341 bool* allow_caching_ptr);	260 bool* allow_caching_ptr);

342 };	261 };

343	262

344 } // namespace unibrow	263 } // namespace unibrow

345	264

346 #endif // V8_UNICODE_H_	265 #endif // V8_UNICODE_H_

OLD	NEW

« no previous file with comments | « src/scanner.h ('k') | src/unicode.cc » ('j') | no next file with comments »