src/scanner.h - Issue 9600009: Fix input and output to handle UTF16 surrogate pairs.

Side by Side Diff: src/scanner.h

Issue 9600009: Fix input and output to handle UTF16 surrogate pairs. (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: Created 8 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2011 the V8 project authors. All rights reserved.	1 // Copyright 2011 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 55 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
66 inline int HexValue(uc32 c) {	66 inline int HexValue(uc32 c) {

67 c -= '0';	67 c -= '0';

68 if (static_cast<unsigned>(c) <= 9) return c;	68 if (static_cast<unsigned>(c) <= 9) return c;

69 c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36.	69 c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36.

70 if (static_cast<unsigned>(c) <= 5) return c + 10;	70 if (static_cast<unsigned>(c) <= 5) return c + 10;

71 return -1;	71 return -1;

72 }	72 }

73	73

74	74

75 // ---------------------------------------------------------------------	75 // ---------------------------------------------------------------------

76 // Buffered stream of characters, using an internal UC16 buffer.	76 // Buffered stream of UTF-16 code units, using an internal UTF-16 buffer.

	77 // A code unit is a 16 bit value representing either a 16 bit code point

	78 // or one part of a surrogate pair that make a single 21 bit code point.

77	79

78 class UC16CharacterStream {	80 class Utf16CharacterStream {

79 public:	81 public:

80 UC16CharacterStream() : pos_(0) { }	82 Utf16CharacterStream() : pos_(0) { }

81 virtual ~UC16CharacterStream() { }	83 virtual ~Utf16CharacterStream() { }

82	84

83 // Returns and advances past the next UC16 character in the input	85 // Returns and advances past the next UTF-16 code unit in the input

84 // stream. If there are no more characters, it returns a negative	86 // stream. If there are no more code units, it returns a negative

85 // value.	87 // value.

86 inline uc32 Advance() {	88 inline uc32 Advance() {

87 if (buffer_cursor_ < buffer_end_ || ReadBlock()) {	89 if (buffer_cursor_ < buffer_end_ || ReadBlock()) {

88 pos_++;	90 pos_++;

89 return static_cast<uc32>(*(buffer_cursor_++));	91 return static_cast<uc32>(*(buffer_cursor_++));

90 }	92 }

91 // Note: currently the following increment is necessary to avoid a	93 // Note: currently the following increment is necessary to avoid a

92 // parser problem! The scanner treats the final kEndOfInput as	94 // parser problem! The scanner treats the final kEndOfInput as

93 // a character with a position, and does math relative to that	95 // a code unit with a position, and does math relative to that

94 // position.	96 // position.

95 pos_++;	97 pos_++;

96	98

97 return kEndOfInput;	99 return kEndOfInput;

98 }	100 }

99	101

100 // Return the current position in the character stream.	102 // Return the current position in the code unit stream.

101 // Starts at zero.	103 // Starts at zero.

102 inline unsigned pos() const { return pos_; }	104 inline unsigned pos() const { return pos_; }

103	105

104 // Skips forward past the next character_count UC16 characters	106 // Skips forward past the next code_unit_count UTF-16 code units

105 // in the input, or until the end of input if that comes sooner.	107 // in the input, or until the end of input if that comes sooner.

106 // Returns the number of characters actually skipped. If less	108 // Returns the number of code units actually skipped. If less

107 // than character_count,	109 // than code_unit_count,

108 inline unsigned SeekForward(unsigned character_count) {	110 inline unsigned SeekForward(unsigned code_unit_count) {

109 unsigned buffered_chars =	111 unsigned buffered_chars =

110 static_cast<unsigned>(buffer_end_ - buffer_cursor_);	112 static_cast<unsigned>(buffer_end_ - buffer_cursor_);

111 if (character_count <= buffered_chars) {	113 if (code_unit_count <= buffered_chars) {

112 buffer_cursor_ += character_count;	114 buffer_cursor_ += code_unit_count;

113 pos_ += character_count;	115 pos_ += code_unit_count;

114 return character_count;	116 return code_unit_count;

115 }	117 }

116 return SlowSeekForward(character_count);	118 return SlowSeekForward(code_unit_count);

117 }	119 }

118	120

119 // Pushes back the most recently read UC16 character (or negative	121 // Pushes back the most recently read UTF-16 code unit (or negative

120 // value if at end of input), i.e., the value returned by the most recent	122 // value if at end of input), i.e., the value returned by the most recent

121 // call to Advance.	123 // call to Advance.

122 // Must not be used right after calling SeekForward.	124 // Must not be used right after calling SeekForward.

123 virtual void PushBack(int32_t character) = 0;	125 virtual void PushBack(int32_t code_unit) = 0;

124	126

125 protected:	127 protected:

126 static const uc32 kEndOfInput = -1;	128 static const uc32 kEndOfInput = -1;

127	129

128 // Ensures that the buffer_cursor_ points to the character at	130 // Ensures that the buffer_cursor_ points to the code_unit at

129 // position pos_ of the input, if possible. If the position	131 // position pos_ of the input, if possible. If the position

130 // is at or after the end of the input, return false. If there	132 // is at or after the end of the input, return false. If there

131 // are more characters available, return true.	133 // are more code_units available, return true.

132 virtual bool ReadBlock() = 0;	134 virtual bool ReadBlock() = 0;

133 virtual unsigned SlowSeekForward(unsigned character_count) = 0;	135 virtual unsigned SlowSeekForward(unsigned code_unit_count) = 0;

134	136

135 const uc16* buffer_cursor_;	137 const uc16* buffer_cursor_;

136 const uc16* buffer_end_;	138 const uc16* buffer_end_;

137 unsigned pos_;	139 unsigned pos_;

138 };	140 };

139	141

140	142

141 class UnicodeCache {	143 class UnicodeCache {

142 // ---------------------------------------------------------------------	144 // ---------------------------------------------------------------------

143 // Caching predicates used by scanners.	145 // Caching predicates used by scanners.

(...skipping 27 matching lines...) Expand all Loading...
171 class LiteralBuffer {	173 class LiteralBuffer {

172 public:	174 public:

173 LiteralBuffer() : is_ascii_(true), position_(0), backing_store_() { }	175 LiteralBuffer() : is_ascii_(true), position_(0), backing_store_() { }

174	176

175 ~LiteralBuffer() {	177 ~LiteralBuffer() {

176 if (backing_store_.length() > 0) {	178 if (backing_store_.length() > 0) {

177 backing_store_.Dispose();	179 backing_store_.Dispose();

178 }	180 }

179 }	181 }

180	182

181 INLINE(void AddChar(uc16 character)) {	183 INLINE(void AddChar(uint32_t code_unit)) {

182 if (position_ >= backing_store_.length()) ExpandBuffer();	184 if (position_ >= backing_store_.length()) ExpandBuffer();

183 if (is_ascii_) {	185 if (is_ascii_) {

184 if (character < kMaxAsciiCharCodeU) {	186 if (code_unit < kMaxAsciiCharCodeU) {

185 backing_store_[position_] = static_cast<byte>(character);	187 backing_store_[position_] = static_cast<byte>(code_unit);

186 position_ += kASCIISize;	188 position_ += kASCIISize;

187 return;	189 return;

188 }	190 }

189 ConvertToUC16();	191 ConvertToUtf16();

190 }	192 }

191 *reinterpret_cast<uc16*>(&backing_store_[position_]) = character;	193 ASSERT(code_unit < 0x10000u);

	194 *reinterpret_cast<uc16*>(&backing_store_[position_]) = code_unit;

192 position_ += kUC16Size;	195 position_ += kUC16Size;

193 }	196 }

194	197

195 bool is_ascii() { return is_ascii_; }	198 bool is_ascii() { return is_ascii_; }

196	199

197 Vector<const uc16> uc16_literal() {	200 Vector<const uc16> utf16_literal() {

198 ASSERT(!is_ascii_);	201 ASSERT(!is_ascii_);

199 ASSERT((position_ & 0x1) == 0);	202 ASSERT((position_ & 0x1) == 0);

200 return Vector<const uc16>(	203 return Vector<const uc16>(

201 reinterpret_cast<const uc16*>(backing_store_.start()),	204 reinterpret_cast<const uc16*>(backing_store_.start()),

202 position_ >> 1);	205 position_ >> 1);

203 }	206 }

204	207

205 Vector<const char> ascii_literal() {	208 Vector<const char> ascii_literal() {

206 ASSERT(is_ascii_);	209 ASSERT(is_ascii_);

207 return Vector<const char>(	210 return Vector<const char>(

(...skipping 21 matching lines...) Expand all Loading...
229 return new_capacity;	232 return new_capacity;

230 }	233 }

231	234

232 void ExpandBuffer() {	235 void ExpandBuffer() {

233 Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));	236 Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));

234 memcpy(new_store.start(), backing_store_.start(), position_);	237 memcpy(new_store.start(), backing_store_.start(), position_);

235 backing_store_.Dispose();	238 backing_store_.Dispose();

236 backing_store_ = new_store;	239 backing_store_ = new_store;

237 }	240 }

238	241

239 void ConvertToUC16() {	242 void ConvertToUtf16() {

240 ASSERT(is_ascii_);	243 ASSERT(is_ascii_);

241 Vector<byte> new_store;	244 Vector<byte> new_store;

242 int new_content_size = position_ * kUC16Size;	245 int new_content_size = position_ * kUC16Size;

243 if (new_content_size >= backing_store_.length()) {	246 if (new_content_size >= backing_store_.length()) {

244 // Ensure room for all currently read characters as UC16 as well	247 // Ensure room for all currently read code units as UC16 as well

245 // as the character about to be stored.	248 // as the code unit about to be stored.

246 new_store = Vector<byte>::New(NewCapacity(new_content_size));	249 new_store = Vector<byte>::New(NewCapacity(new_content_size));

247 } else {	250 } else {

248 new_store = backing_store_;	251 new_store = backing_store_;

249 }	252 }

250 char* src = reinterpret_cast<char*>(backing_store_.start());	253 char* src = reinterpret_cast<char*>(backing_store_.start());

251 uc16* dst = reinterpret_cast<uc16*>(new_store.start());	254 uc16* dst = reinterpret_cast<uc16*>(new_store.start());

252 for (int i = position_ - 1; i >= 0; i--) {	255 for (int i = position_ - 1; i >= 0; i--) {

253 dst[i] = src[i];	256 dst[i] = src[i];

254 }	257 }

255 if (new_store.start() != backing_store_.start()) {	258 if (new_store.start() != backing_store_.start()) {

(...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
309 int end_pos;	312 int end_pos;

310 };	313 };

311	314

312 // -1 is outside of the range of any real source code.	315 // -1 is outside of the range of any real source code.

313 static const int kNoOctalLocation = -1;	316 static const int kNoOctalLocation = -1;

314	317

315 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;	318 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;

316	319

317 explicit Scanner(UnicodeCache* scanner_contants);	320 explicit Scanner(UnicodeCache* scanner_contants);

318	321

319 void Initialize(UC16CharacterStream* source);	322 void Initialize(Utf16CharacterStream* source);

320	323

321 // Returns the next token and advances input.	324 // Returns the next token and advances input.

322 Token::Value Next();	325 Token::Value Next();

323 // Returns the current token again.	326 // Returns the current token again.

324 Token::Value current_token() { return current_.token; }	327 Token::Value current_token() { return current_.token; }

325 // Returns the location information for the current token	328 // Returns the location information for the current token

326 // (the token last returned by Next()).	329 // (the token last returned by Next()).

327 Location location() const { return current_.location; }	330 Location location() const { return current_.location; }

328 // Returns the literal string, if any, for the current token (the	331 // Returns the literal string, if any, for the current token (the

329 // token last returned by Next()). The string is 0-terminated.	332 // token last returned by Next()). The string is 0-terminated.

330 // Literal strings are collected for identifiers, strings, and	333 // Literal strings are collected for identifiers, strings, and

331 // numbers.	334 // numbers.

332 // These functions only give the correct result if the literal	335 // These functions only give the correct result if the literal

333 // was scanned between calls to StartLiteral() and TerminateLiteral().	336 // was scanned between calls to StartLiteral() and TerminateLiteral().

334 Vector<const char> literal_ascii_string() {	337 Vector<const char> literal_ascii_string() {

335 ASSERT_NOT_NULL(current_.literal_chars);	338 ASSERT_NOT_NULL(current_.literal_chars);

336 return current_.literal_chars->ascii_literal();	339 return current_.literal_chars->ascii_literal();

337 }	340 }

338 Vector<const uc16> literal_uc16_string() {	341 Vector<const uc16> literal_utf16_string() {

339 ASSERT_NOT_NULL(current_.literal_chars);	342 ASSERT_NOT_NULL(current_.literal_chars);

340 return current_.literal_chars->uc16_literal();	343 return current_.literal_chars->utf16_literal();

341 }	344 }

342 bool is_literal_ascii() {	345 bool is_literal_ascii() {

343 ASSERT_NOT_NULL(current_.literal_chars);	346 ASSERT_NOT_NULL(current_.literal_chars);

344 return current_.literal_chars->is_ascii();	347 return current_.literal_chars->is_ascii();

345 }	348 }

346 int literal_length() const {	349 int literal_length() const {

347 ASSERT_NOT_NULL(current_.literal_chars);	350 ASSERT_NOT_NULL(current_.literal_chars);

348 return current_.literal_chars->length();	351 return current_.literal_chars->length();

349 }	352 }

350	353

(...skipping 13 matching lines...) Expand all Loading...
364 Token::Value peek() const { return next_.token; }	367 Token::Value peek() const { return next_.token; }

365	368

366 Location peek_location() const { return next_.location; }	369 Location peek_location() const { return next_.location; }

367	370

368 // Returns the literal string for the next token (the token that	371 // Returns the literal string for the next token (the token that

369 // would be returned if Next() were called).	372 // would be returned if Next() were called).

370 Vector<const char> next_literal_ascii_string() {	373 Vector<const char> next_literal_ascii_string() {

371 ASSERT_NOT_NULL(next_.literal_chars);	374 ASSERT_NOT_NULL(next_.literal_chars);

372 return next_.literal_chars->ascii_literal();	375 return next_.literal_chars->ascii_literal();

373 }	376 }

374 Vector<const uc16> next_literal_uc16_string() {	377 Vector<const uc16> next_literal_utf16_string() {

375 ASSERT_NOT_NULL(next_.literal_chars);	378 ASSERT_NOT_NULL(next_.literal_chars);

376 return next_.literal_chars->uc16_literal();	379 return next_.literal_chars->utf16_literal();

377 }	380 }

378 bool is_next_literal_ascii() {	381 bool is_next_literal_ascii() {

379 ASSERT_NOT_NULL(next_.literal_chars);	382 ASSERT_NOT_NULL(next_.literal_chars);

380 return next_.literal_chars->is_ascii();	383 return next_.literal_chars->is_ascii();

381 }	384 }

382 int next_literal_length() const {	385 int next_literal_length() const {

383 ASSERT_NOT_NULL(next_.literal_chars);	386 ASSERT_NOT_NULL(next_.literal_chars);

384 return next_.literal_chars->length();	387 return next_.literal_chars->length();

385 }	388 }

386	389

(...skipping 148 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
535	538

536 UnicodeCache* unicode_cache_;	539 UnicodeCache* unicode_cache_;

537	540

538 // Buffers collecting literal strings, numbers, etc.	541 // Buffers collecting literal strings, numbers, etc.

539 LiteralBuffer literal_buffer1_;	542 LiteralBuffer literal_buffer1_;

540 LiteralBuffer literal_buffer2_;	543 LiteralBuffer literal_buffer2_;

541	544

542 TokenDesc current_; // desc for current token (as returned by Next())	545 TokenDesc current_; // desc for current token (as returned by Next())

543 TokenDesc next_; // desc for next token (one token look-ahead)	546 TokenDesc next_; // desc for next token (one token look-ahead)

544	547

545 // Input stream. Must be initialized to an UC16CharacterStream.	548 // Input stream. Must be initialized to an Utf16CharacterStream.

546 UC16CharacterStream* source_;	549 Utf16CharacterStream* source_;

547	550

548	551

549 // Start position of the octal literal last scanned.	552 // Start position of the octal literal last scanned.

550 Location octal_pos_;	553 Location octal_pos_;

551	554

552 // One Unicode character look-ahead; c0_ < 0 at the end of the input.	555 // One Unicode character look-ahead; c0_ < 0 at the end of the input.

553 uc32 c0_;	556 uc32 c0_;

554	557

555 // Whether there is a line terminator whitespace character after	558 // Whether there is a line terminator whitespace character after

556 // the current token, and before the next. Does not count newlines	559 // the current token, and before the next. Does not count newlines

557 // inside multiline comments.	560 // inside multiline comments.

558 bool has_line_terminator_before_next_;	561 bool has_line_terminator_before_next_;

559 // Whether there is a multi-line comment that contains a	562 // Whether there is a multi-line comment that contains a

560 // line-terminator after the current token, and before the next.	563 // line-terminator after the current token, and before the next.

561 bool has_multiline_comment_before_next_;	564 bool has_multiline_comment_before_next_;

562 // Whether we scan 'let' as a keyword for harmony block-scoped let bindings.	565 // Whether we scan 'let' as a keyword for harmony block-scoped let bindings.

563 bool harmony_scoping_;	566 bool harmony_scoping_;

564 // Whether we scan 'module', 'import', 'export' as keywords.	567 // Whether we scan 'module', 'import', 'export' as keywords.

565 bool harmony_modules_;	568 bool harmony_modules_;

566 };	569 };

567	570

568 } } // namespace v8::internal	571 } } // namespace v8::internal

569	572

570 #endif // V8_SCANNER_H_	573 #endif // V8_SCANNER_H_

OLD	NEW

« src/handles.cc ('K') | « src/preparser-api.cc ('k') | src/scanner.cc » ('j') | src/unicode.h » ('J')