Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(611)

Side by Side Diff: src/scanner.h

Issue 9600009: Fix input and output to handle UTF16 surrogate pairs. (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 8 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright 2011 the V8 project authors. All rights reserved. 1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 55 matching lines...) Expand 10 before | Expand all | Expand 10 after
66 inline int HexValue(uc32 c) { 66 inline int HexValue(uc32 c) {
67 c -= '0'; 67 c -= '0';
68 if (static_cast<unsigned>(c) <= 9) return c; 68 if (static_cast<unsigned>(c) <= 9) return c;
69 c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36. 69 c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36.
70 if (static_cast<unsigned>(c) <= 5) return c + 10; 70 if (static_cast<unsigned>(c) <= 5) return c + 10;
71 return -1; 71 return -1;
72 } 72 }
73 73
74 74
75 // --------------------------------------------------------------------- 75 // ---------------------------------------------------------------------
76 // Buffered stream of characters, using an internal UC16 buffer. 76 // Buffered stream of UTF-16 code units, using an internal UTF-16 buffer.
77 // A code unit is a 16 bit value representing either a 16 bit code point
78 // or one part of a surrogate pair that makes a single 21 bit code point.
77 79
78 class UC16CharacterStream { 80 class Utf16CharacterStream {
79 public: 81 public:
80 UC16CharacterStream() : pos_(0) { } 82 Utf16CharacterStream() : pos_(0) { }
81 virtual ~UC16CharacterStream() { } 83 virtual ~Utf16CharacterStream() { }
82 84
83 // Returns and advances past the next UC16 character in the input 85 // Returns and advances past the next UTF-16 code unit in the input
84 // stream. If there are no more characters, it returns a negative 86 // stream. If there are no more code units, it returns a negative
85 // value. 87 // value.
86 inline uc32 Advance() { 88 inline uc32 Advance() {
87 if (buffer_cursor_ < buffer_end_ || ReadBlock()) { 89 if (buffer_cursor_ < buffer_end_ || ReadBlock()) {
88 pos_++; 90 pos_++;
89 return static_cast<uc32>(*(buffer_cursor_++)); 91 return static_cast<uc32>(*(buffer_cursor_++));
90 } 92 }
91 // Note: currently the following increment is necessary to avoid a 93 // Note: currently the following increment is necessary to avoid a
92 // parser problem! The scanner treats the final kEndOfInput as 94 // parser problem! The scanner treats the final kEndOfInput as
93 // a character with a position, and does math relative to that 95 // a code unit with a position, and does math relative to that
94 // position. 96 // position.
95 pos_++; 97 pos_++;
96 98
97 return kEndOfInput; 99 return kEndOfInput;
98 } 100 }
99 101
100 // Return the current position in the character stream. 102 // Return the current position in the code unit stream.
101 // Starts at zero. 103 // Starts at zero.
102 inline unsigned pos() const { return pos_; } 104 inline unsigned pos() const { return pos_; }
103 105
104 // Skips forward past the next character_count UC16 characters 106 // Skips forward past the next code_unit_count UTF-16 code units
105 // in the input, or until the end of input if that comes sooner. 107 // in the input, or until the end of input if that comes sooner.
106 // Returns the number of characters actually skipped. If less 108 // Returns the number of code units actually skipped. If less
107 // than character_count, the end of input was reached. 109 // than code_unit_count, the end of input was reached.
108 inline unsigned SeekForward(unsigned character_count) { 110 inline unsigned SeekForward(unsigned code_unit_count) {
109 unsigned buffered_chars = 111 unsigned buffered_chars =
110 static_cast<unsigned>(buffer_end_ - buffer_cursor_); 112 static_cast<unsigned>(buffer_end_ - buffer_cursor_);
111 if (character_count <= buffered_chars) { 113 if (code_unit_count <= buffered_chars) {
112 buffer_cursor_ += character_count; 114 buffer_cursor_ += code_unit_count;
113 pos_ += character_count; 115 pos_ += code_unit_count;
114 return character_count; 116 return code_unit_count;
115 } 117 }
116 return SlowSeekForward(character_count); 118 return SlowSeekForward(code_unit_count);
117 } 119 }
118 120
119 // Pushes back the most recently read UC16 character (or negative 121 // Pushes back the most recently read UTF-16 code unit (or negative
120 // value if at end of input), i.e., the value returned by the most recent 122 // value if at end of input), i.e., the value returned by the most recent
121 // call to Advance. 123 // call to Advance.
122 // Must not be used right after calling SeekForward. 124 // Must not be used right after calling SeekForward.
123 virtual void PushBack(int32_t character) = 0; 125 virtual void PushBack(int32_t code_unit) = 0;
124 126
125 protected: 127 protected:
126 static const uc32 kEndOfInput = -1; 128 static const uc32 kEndOfInput = -1;
127 129
128 // Ensures that the buffer_cursor_ points to the character at 130 // Ensures that the buffer_cursor_ points to the code unit at
129 // position pos_ of the input, if possible. If the position 131 // position pos_ of the input, if possible. If the position
130 // is at or after the end of the input, return false. If there 132 // is at or after the end of the input, return false. If there
131 // are more characters available, return true. 133 // are more code units available, return true.
132 virtual bool ReadBlock() = 0; 134 virtual bool ReadBlock() = 0;
133 virtual unsigned SlowSeekForward(unsigned character_count) = 0; 135 virtual unsigned SlowSeekForward(unsigned code_unit_count) = 0;
134 136
135 const uc16* buffer_cursor_; 137 const uc16* buffer_cursor_;
136 const uc16* buffer_end_; 138 const uc16* buffer_end_;
137 unsigned pos_; 139 unsigned pos_;
138 }; 140 };
139 141
140 142
141 class UnicodeCache { 143 class UnicodeCache {
142 // --------------------------------------------------------------------- 144 // ---------------------------------------------------------------------
143 // Caching predicates used by scanners. 145 // Caching predicates used by scanners.
(...skipping 27 matching lines...) Expand all
171 class LiteralBuffer { 173 class LiteralBuffer {
172 public: 174 public:
173 LiteralBuffer() : is_ascii_(true), position_(0), backing_store_() { } 175 LiteralBuffer() : is_ascii_(true), position_(0), backing_store_() { }
174 176
175 ~LiteralBuffer() { 177 ~LiteralBuffer() {
176 if (backing_store_.length() > 0) { 178 if (backing_store_.length() > 0) {
177 backing_store_.Dispose(); 179 backing_store_.Dispose();
178 } 180 }
179 } 181 }
180 182
181 INLINE(void AddChar(uc16 character)) { 183 INLINE(void AddChar(uint32_t code_unit)) {
182 if (position_ >= backing_store_.length()) ExpandBuffer(); 184 if (position_ >= backing_store_.length()) ExpandBuffer();
183 if (is_ascii_) { 185 if (is_ascii_) {
184 if (character < kMaxAsciiCharCodeU) { 186 if (code_unit < kMaxAsciiCharCodeU) {
185 backing_store_[position_] = static_cast<byte>(character); 187 backing_store_[position_] = static_cast<byte>(code_unit);
186 position_ += kASCIISize; 188 position_ += kASCIISize;
187 return; 189 return;
188 } 190 }
189 ConvertToUC16(); 191 ConvertToUtf16();
190 } 192 }
191 *reinterpret_cast<uc16*>(&backing_store_[position_]) = character; 193 ASSERT(code_unit < 0x10000u);
194 *reinterpret_cast<uc16*>(&backing_store_[position_]) = code_unit;
192 position_ += kUC16Size; 195 position_ += kUC16Size;
193 } 196 }
194 197
195 bool is_ascii() { return is_ascii_; } 198 bool is_ascii() { return is_ascii_; }
196 199
197 Vector<const uc16> uc16_literal() { 200 Vector<const uc16> utf16_literal() {
198 ASSERT(!is_ascii_); 201 ASSERT(!is_ascii_);
199 ASSERT((position_ & 0x1) == 0); 202 ASSERT((position_ & 0x1) == 0);
200 return Vector<const uc16>( 203 return Vector<const uc16>(
201 reinterpret_cast<const uc16*>(backing_store_.start()), 204 reinterpret_cast<const uc16*>(backing_store_.start()),
202 position_ >> 1); 205 position_ >> 1);
203 } 206 }
204 207
205 Vector<const char> ascii_literal() { 208 Vector<const char> ascii_literal() {
206 ASSERT(is_ascii_); 209 ASSERT(is_ascii_);
207 return Vector<const char>( 210 return Vector<const char>(
(...skipping 21 matching lines...) Expand all
229 return new_capacity; 232 return new_capacity;
230 } 233 }
231 234
232 void ExpandBuffer() { 235 void ExpandBuffer() {
233 Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity)); 236 Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));
234 memcpy(new_store.start(), backing_store_.start(), position_); 237 memcpy(new_store.start(), backing_store_.start(), position_);
235 backing_store_.Dispose(); 238 backing_store_.Dispose();
236 backing_store_ = new_store; 239 backing_store_ = new_store;
237 } 240 }
238 241
239 void ConvertToUC16() { 242 void ConvertToUtf16() {
240 ASSERT(is_ascii_); 243 ASSERT(is_ascii_);
241 Vector<byte> new_store; 244 Vector<byte> new_store;
242 int new_content_size = position_ * kUC16Size; 245 int new_content_size = position_ * kUC16Size;
243 if (new_content_size >= backing_store_.length()) { 246 if (new_content_size >= backing_store_.length()) {
244 // Ensure room for all currently read characters as UC16 as well 247 // Ensure room for all currently read code units as UC16 as well
245 // as the character about to be stored. 248 // as the code unit about to be stored.
246 new_store = Vector<byte>::New(NewCapacity(new_content_size)); 249 new_store = Vector<byte>::New(NewCapacity(new_content_size));
247 } else { 250 } else {
248 new_store = backing_store_; 251 new_store = backing_store_;
249 } 252 }
250 char* src = reinterpret_cast<char*>(backing_store_.start()); 253 char* src = reinterpret_cast<char*>(backing_store_.start());
251 uc16* dst = reinterpret_cast<uc16*>(new_store.start()); 254 uc16* dst = reinterpret_cast<uc16*>(new_store.start());
252 for (int i = position_ - 1; i >= 0; i--) { 255 for (int i = position_ - 1; i >= 0; i--) {
253 dst[i] = src[i]; 256 dst[i] = src[i];
254 } 257 }
255 if (new_store.start() != backing_store_.start()) { 258 if (new_store.start() != backing_store_.start()) {
(...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after
309 int end_pos; 312 int end_pos;
310 }; 313 };
311 314
312 // -1 is outside of the range of any real source code. 315 // -1 is outside of the range of any real source code.
313 static const int kNoOctalLocation = -1; 316 static const int kNoOctalLocation = -1;
314 317
315 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; 318 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
316 319
317 explicit Scanner(UnicodeCache* scanner_contants); 320 explicit Scanner(UnicodeCache* scanner_contants);
318 321
319 void Initialize(UC16CharacterStream* source); 322 void Initialize(Utf16CharacterStream* source);
320 323
321 // Returns the next token and advances input. 324 // Returns the next token and advances input.
322 Token::Value Next(); 325 Token::Value Next();
323 // Returns the current token again. 326 // Returns the current token again.
324 Token::Value current_token() { return current_.token; } 327 Token::Value current_token() { return current_.token; }
325 // Returns the location information for the current token 328 // Returns the location information for the current token
326 // (the token last returned by Next()). 329 // (the token last returned by Next()).
327 Location location() const { return current_.location; } 330 Location location() const { return current_.location; }
328 // Returns the literal string, if any, for the current token (the 331 // Returns the literal string, if any, for the current token (the
329 // token last returned by Next()). The string is 0-terminated. 332 // token last returned by Next()). The string is 0-terminated.
330 // Literal strings are collected for identifiers, strings, and 333 // Literal strings are collected for identifiers, strings, and
331 // numbers. 334 // numbers.
332 // These functions only give the correct result if the literal 335 // These functions only give the correct result if the literal
333 // was scanned between calls to StartLiteral() and TerminateLiteral(). 336 // was scanned between calls to StartLiteral() and TerminateLiteral().
334 Vector<const char> literal_ascii_string() { 337 Vector<const char> literal_ascii_string() {
335 ASSERT_NOT_NULL(current_.literal_chars); 338 ASSERT_NOT_NULL(current_.literal_chars);
336 return current_.literal_chars->ascii_literal(); 339 return current_.literal_chars->ascii_literal();
337 } 340 }
338 Vector<const uc16> literal_uc16_string() { 341 Vector<const uc16> literal_utf16_string() {
339 ASSERT_NOT_NULL(current_.literal_chars); 342 ASSERT_NOT_NULL(current_.literal_chars);
340 return current_.literal_chars->uc16_literal(); 343 return current_.literal_chars->utf16_literal();
341 } 344 }
342 bool is_literal_ascii() { 345 bool is_literal_ascii() {
343 ASSERT_NOT_NULL(current_.literal_chars); 346 ASSERT_NOT_NULL(current_.literal_chars);
344 return current_.literal_chars->is_ascii(); 347 return current_.literal_chars->is_ascii();
345 } 348 }
346 int literal_length() const { 349 int literal_length() const {
347 ASSERT_NOT_NULL(current_.literal_chars); 350 ASSERT_NOT_NULL(current_.literal_chars);
348 return current_.literal_chars->length(); 351 return current_.literal_chars->length();
349 } 352 }
350 353
(...skipping 13 matching lines...) Expand all
364 Token::Value peek() const { return next_.token; } 367 Token::Value peek() const { return next_.token; }
365 368
366 Location peek_location() const { return next_.location; } 369 Location peek_location() const { return next_.location; }
367 370
368 // Returns the literal string for the next token (the token that 371 // Returns the literal string for the next token (the token that
369 // would be returned if Next() were called). 372 // would be returned if Next() were called).
370 Vector<const char> next_literal_ascii_string() { 373 Vector<const char> next_literal_ascii_string() {
371 ASSERT_NOT_NULL(next_.literal_chars); 374 ASSERT_NOT_NULL(next_.literal_chars);
372 return next_.literal_chars->ascii_literal(); 375 return next_.literal_chars->ascii_literal();
373 } 376 }
374 Vector<const uc16> next_literal_uc16_string() { 377 Vector<const uc16> next_literal_utf16_string() {
375 ASSERT_NOT_NULL(next_.literal_chars); 378 ASSERT_NOT_NULL(next_.literal_chars);
376 return next_.literal_chars->uc16_literal(); 379 return next_.literal_chars->utf16_literal();
377 } 380 }
378 bool is_next_literal_ascii() { 381 bool is_next_literal_ascii() {
379 ASSERT_NOT_NULL(next_.literal_chars); 382 ASSERT_NOT_NULL(next_.literal_chars);
380 return next_.literal_chars->is_ascii(); 383 return next_.literal_chars->is_ascii();
381 } 384 }
382 int next_literal_length() const { 385 int next_literal_length() const {
383 ASSERT_NOT_NULL(next_.literal_chars); 386 ASSERT_NOT_NULL(next_.literal_chars);
384 return next_.literal_chars->length(); 387 return next_.literal_chars->length();
385 } 388 }
386 389
(...skipping 148 matching lines...) Expand 10 before | Expand all | Expand 10 after
535 538
536 UnicodeCache* unicode_cache_; 539 UnicodeCache* unicode_cache_;
537 540
538 // Buffers collecting literal strings, numbers, etc. 541 // Buffers collecting literal strings, numbers, etc.
539 LiteralBuffer literal_buffer1_; 542 LiteralBuffer literal_buffer1_;
540 LiteralBuffer literal_buffer2_; 543 LiteralBuffer literal_buffer2_;
541 544
542 TokenDesc current_; // desc for current token (as returned by Next()) 545 TokenDesc current_; // desc for current token (as returned by Next())
543 TokenDesc next_; // desc for next token (one token look-ahead) 546 TokenDesc next_; // desc for next token (one token look-ahead)
544 547
545 // Input stream. Must be initialized to a UC16CharacterStream. 548 // Input stream. Must be initialized to a Utf16CharacterStream.
546 UC16CharacterStream* source_; 549 Utf16CharacterStream* source_;
547 550
548 551
549 // Start position of the octal literal last scanned. 552 // Start position of the octal literal last scanned.
550 Location octal_pos_; 553 Location octal_pos_;
551 554
552 // One Unicode character look-ahead; c0_ < 0 at the end of the input. 555 // One Unicode character look-ahead; c0_ < 0 at the end of the input.
553 uc32 c0_; 556 uc32 c0_;
554 557
555 // Whether there is a line terminator whitespace character after 558 // Whether there is a line terminator whitespace character after
556 // the current token, and before the next. Does not count newlines 559 // the current token, and before the next. Does not count newlines
557 // inside multiline comments. 560 // inside multiline comments.
558 bool has_line_terminator_before_next_; 561 bool has_line_terminator_before_next_;
559 // Whether there is a multi-line comment that contains a 562 // Whether there is a multi-line comment that contains a
560 // line-terminator after the current token, and before the next. 563 // line-terminator after the current token, and before the next.
561 bool has_multiline_comment_before_next_; 564 bool has_multiline_comment_before_next_;
562 // Whether we scan 'let' as a keyword for harmony block-scoped let bindings. 565 // Whether we scan 'let' as a keyword for harmony block-scoped let bindings.
563 bool harmony_scoping_; 566 bool harmony_scoping_;
564 // Whether we scan 'module', 'import', 'export' as keywords. 567 // Whether we scan 'module', 'import', 'export' as keywords.
565 bool harmony_modules_; 568 bool harmony_modules_;
566 }; 569 };
567 570
568 } } // namespace v8::internal 571 } } // namespace v8::internal
569 572
570 #endif // V8_SCANNER_H_ 573 #endif // V8_SCANNER_H_
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698