src/scanner.h - Issue 9600009: Fix input and output to handle UTF16 surrogate pairs.

Unified Diff: src/scanner.h

Issue 9600009: Fix input and output to handle UTF16 surrogate pairs. (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: Created 8 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/scanner.h

===================================================================

--- src/scanner.h (revision 10944)

+++ src/scanner.h (working copy)

@@ -73,15 +73,17 @@

// ---------------------------------------------------------------------

-// Buffered stream of characters, using an internal UC16 buffer.

+// Buffered stream of UTF-16 code units, using an internal UTF-16 buffer.

+// A code unit is a 16 bit value representing either a 16 bit code point

+// or one part of a surrogate pair that makes up a single 21-bit code point.

-class UC16CharacterStream {

+class Utf16CharacterStream {

public:

- UC16CharacterStream() : pos_(0) { }

- virtual ~UC16CharacterStream() { }

+ Utf16CharacterStream() : pos_(0) { }

+ virtual ~Utf16CharacterStream() { }

- // Returns and advances past the next UC16 character in the input

- // stream. If there are no more characters, it returns a negative

+ // Returns and advances past the next UTF-16 code unit in the input

+ // stream. If there are no more code units, it returns a negative

// value.

inline uc32 Advance() {

if (buffer_cursor_ < buffer_end_ || ReadBlock()) {

@@ -90,47 +92,47 @@

}

// Note: currently the following increment is necessary to avoid a

// parser problem! The scanner treats the final kEndOfInput as

- // a character with a position, and does math relative to that

+ // a code unit with a position, and does math relative to that

// position.

pos_++;

return kEndOfInput;

}

- // Return the current position in the character stream.

+ // Return the current position in the code unit stream.

// Starts at zero.

inline unsigned pos() const { return pos_; }

- // Skips forward past the next character_count UC16 characters

+ // Skips forward past the next code_unit_count UTF-16 code units

// in the input, or until the end of input if that comes sooner.

- // Returns the number of characters actually skipped. If less

- // than character_count,

- inline unsigned SeekForward(unsigned character_count) {

+ // Returns the number of code units actually skipped. If that is less

+ // than code_unit_count, the end of input was reached.

+ inline unsigned SeekForward(unsigned code_unit_count) {

unsigned buffered_chars =

static_cast<unsigned>(buffer_end_ - buffer_cursor_);

- if (character_count <= buffered_chars) {

- buffer_cursor_ += character_count;

- pos_ += character_count;

- return character_count;

+ if (code_unit_count <= buffered_chars) {

+ buffer_cursor_ += code_unit_count;

+ pos_ += code_unit_count;

+ return code_unit_count;

}

- return SlowSeekForward(character_count);

+ return SlowSeekForward(code_unit_count);

}

- // Pushes back the most recently read UC16 character (or negative

+ // Pushes back the most recently read UTF-16 code unit (or negative

// value if at end of input), i.e., the value returned by the most recent

// call to Advance.

// Must not be used right after calling SeekForward.

- virtual void PushBack(int32_t character) = 0;

+ virtual void PushBack(int32_t code_unit) = 0;

protected:

static const uc32 kEndOfInput = -1;

- // Ensures that the buffer_cursor_ points to the character at

+ // Ensures that the buffer_cursor_ points to the code unit at

// position pos_ of the input, if possible. If the position

// is at or after the end of the input, return false. If there

- // are more characters available, return true.

+ // are more code units available, return true.

virtual bool ReadBlock() = 0;

- virtual unsigned SlowSeekForward(unsigned character_count) = 0;

+ virtual unsigned SlowSeekForward(unsigned code_unit_count) = 0;

const uc16* buffer_cursor_;

const uc16* buffer_end_;

@@ -178,23 +180,24 @@

}

- INLINE(void AddChar(uc16 character)) {

+ INLINE(void AddChar(uint32_t code_unit)) {

if (position_ >= backing_store_.length()) ExpandBuffer();

if (is_ascii_) {

- if (character < kMaxAsciiCharCodeU) {

- backing_store_[position_] = static_cast<byte>(character);

+ if (code_unit < kMaxAsciiCharCodeU) {

+ backing_store_[position_] = static_cast<byte>(code_unit);

position_ += kASCIISize;

return;

}

- ConvertToUC16();

+ ConvertToUtf16();

}

- *reinterpret_cast<uc16*>(&backing_store_[position_]) = character;

+ ASSERT(code_unit < 0x10000u);

+ *reinterpret_cast<uc16*>(&backing_store_[position_]) = code_unit;

position_ += kUC16Size;

}

bool is_ascii() { return is_ascii_; }

- Vector<const uc16> uc16_literal() {

+ Vector<const uc16> utf16_literal() {

ASSERT(!is_ascii_);

ASSERT((position_ & 0x1) == 0);

return Vector<const uc16>(

@@ -236,13 +239,13 @@

backing_store_ = new_store;

}

- void ConvertToUC16() {

+ void ConvertToUtf16() {

ASSERT(is_ascii_);

Vector<byte> new_store;

int new_content_size = position_ * kUC16Size;

if (new_content_size >= backing_store_.length()) {

- // Ensure room for all currently read characters as UC16 as well

- // as the character about to be stored.

+ // Ensure room for all currently read code units as UTF-16 as well

+ // as the code unit about to be stored.

new_store = Vector<byte>::New(NewCapacity(new_content_size));

} else {

new_store = backing_store_;

@@ -316,7 +319,7 @@

explicit Scanner(UnicodeCache* scanner_contants);

- void Initialize(UC16CharacterStream* source);

+ void Initialize(Utf16CharacterStream* source);

// Returns the next token and advances input.

Token::Value Next();

@@ -335,9 +338,9 @@

ASSERT_NOT_NULL(current_.literal_chars);

return current_.literal_chars->ascii_literal();

}

- Vector<const uc16> literal_uc16_string() {

+ Vector<const uc16> literal_utf16_string() {

ASSERT_NOT_NULL(current_.literal_chars);

- return current_.literal_chars->uc16_literal();

+ return current_.literal_chars->utf16_literal();

}

bool is_literal_ascii() {

ASSERT_NOT_NULL(current_.literal_chars);

@@ -371,9 +374,9 @@

ASSERT_NOT_NULL(next_.literal_chars);

return next_.literal_chars->ascii_literal();

}

- Vector<const uc16> next_literal_uc16_string() {

+ Vector<const uc16> next_literal_utf16_string() {

ASSERT_NOT_NULL(next_.literal_chars);

- return next_.literal_chars->uc16_literal();

+ return next_.literal_chars->utf16_literal();

}

bool is_next_literal_ascii() {

ASSERT_NOT_NULL(next_.literal_chars);

@@ -542,8 +545,8 @@

TokenDesc current_; // desc for current token (as returned by Next())

TokenDesc next_; // desc for next token (one token look-ahead)

- // Input stream. Must be initialized to an UC16CharacterStream.

- UC16CharacterStream* source_;

+ // Input stream. Must be initialized to a Utf16CharacterStream.

+ Utf16CharacterStream* source_;

// Start position of the octal literal last scanned.

« src/handles.cc ('K') | « src/preparser-api.cc ('k') | src/scanner.cc » ('j') | src/unicode.h » ('J')