Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(168)

Unified Diff: src/scanner.h

Issue 9600009: Fix input and output to handle UTF16 surrogate pairs. (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 8 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: src/scanner.h
===================================================================
--- src/scanner.h (revision 10944)
+++ src/scanner.h (working copy)
@@ -73,15 +73,17 @@
// ---------------------------------------------------------------------
-// Buffered stream of characters, using an internal UC16 buffer.
+// Buffered stream of UTF-16 code units, using an internal UTF-16 buffer.
+// A code unit is a 16-bit value representing either a 16-bit code point
+// or one half of a surrogate pair that encodes a single 21-bit code point.
-class UC16CharacterStream {
+class Utf16CharacterStream {
public:
- UC16CharacterStream() : pos_(0) { }
- virtual ~UC16CharacterStream() { }
+ Utf16CharacterStream() : pos_(0) { }
+ virtual ~Utf16CharacterStream() { }
- // Returns and advances past the next UC16 character in the input
- // stream. If there are no more characters, it returns a negative
+ // Returns and advances past the next UTF-16 code unit in the input
+ // stream. If there are no more code units, it returns a negative
// value.
inline uc32 Advance() {
if (buffer_cursor_ < buffer_end_ || ReadBlock()) {
@@ -90,47 +92,47 @@
}
// Note: currently the following increment is necessary to avoid a
// parser problem! The scanner treats the final kEndOfInput as
- // a character with a position, and does math relative to that
+ // a code unit with a position, and does math relative to that
// position.
pos_++;
return kEndOfInput;
}
- // Return the current position in the character stream.
+ // Return the current position in the code unit stream.
// Starts at zero.
inline unsigned pos() const { return pos_; }
- // Skips forward past the next character_count UC16 characters
+ // Skips forward past the next code_unit_count UTF-16 code units
// in the input, or until the end of input if that comes sooner.
- // Returns the number of characters actually skipped. If less
- // than character_count,
- inline unsigned SeekForward(unsigned character_count) {
+ // Returns the number of code units actually skipped. If that is less
+ // than code_unit_count, the end of the input was reached.
+ inline unsigned SeekForward(unsigned code_unit_count) {
unsigned buffered_chars =
static_cast<unsigned>(buffer_end_ - buffer_cursor_);
- if (character_count <= buffered_chars) {
- buffer_cursor_ += character_count;
- pos_ += character_count;
- return character_count;
+ if (code_unit_count <= buffered_chars) {
+ buffer_cursor_ += code_unit_count;
+ pos_ += code_unit_count;
+ return code_unit_count;
}
- return SlowSeekForward(character_count);
+ return SlowSeekForward(code_unit_count);
}
- // Pushes back the most recently read UC16 character (or negative
+ // Pushes back the most recently read UTF-16 code unit (or negative
// value if at end of input), i.e., the value returned by the most recent
// call to Advance.
// Must not be used right after calling SeekForward.
- virtual void PushBack(int32_t character) = 0;
+ virtual void PushBack(int32_t code_unit) = 0;
protected:
static const uc32 kEndOfInput = -1;
- // Ensures that the buffer_cursor_ points to the character at
+ // Ensures that the buffer_cursor_ points to the code unit at
// position pos_ of the input, if possible. If the position
// is at or after the end of the input, return false. If there
- // are more characters available, return true.
+ // are more code units available, return true.
virtual bool ReadBlock() = 0;
- virtual unsigned SlowSeekForward(unsigned character_count) = 0;
+ virtual unsigned SlowSeekForward(unsigned code_unit_count) = 0;
const uc16* buffer_cursor_;
const uc16* buffer_end_;
@@ -178,23 +180,24 @@
}
}
- INLINE(void AddChar(uc16 character)) {
+ INLINE(void AddChar(uint32_t code_unit)) {
if (position_ >= backing_store_.length()) ExpandBuffer();
if (is_ascii_) {
- if (character < kMaxAsciiCharCodeU) {
- backing_store_[position_] = static_cast<byte>(character);
+ if (code_unit < kMaxAsciiCharCodeU) {
+ backing_store_[position_] = static_cast<byte>(code_unit);
position_ += kASCIISize;
return;
}
- ConvertToUC16();
+ ConvertToUtf16();
}
- *reinterpret_cast<uc16*>(&backing_store_[position_]) = character;
+ ASSERT(code_unit < 0x10000u);
+ *reinterpret_cast<uc16*>(&backing_store_[position_]) = code_unit;
position_ += kUC16Size;
}
bool is_ascii() { return is_ascii_; }
- Vector<const uc16> uc16_literal() {
+ Vector<const uc16> utf16_literal() {
ASSERT(!is_ascii_);
ASSERT((position_ & 0x1) == 0);
return Vector<const uc16>(
@@ -236,13 +239,13 @@
backing_store_ = new_store;
}
- void ConvertToUC16() {
+ void ConvertToUtf16() {
ASSERT(is_ascii_);
Vector<byte> new_store;
int new_content_size = position_ * kUC16Size;
if (new_content_size >= backing_store_.length()) {
- // Ensure room for all currently read characters as UC16 as well
- // as the character about to be stored.
+ // Ensure room for all currently read code units as UTF-16 as well
+ // as the code unit about to be stored.
new_store = Vector<byte>::New(NewCapacity(new_content_size));
} else {
new_store = backing_store_;
@@ -316,7 +319,7 @@
explicit Scanner(UnicodeCache* scanner_contants);
- void Initialize(UC16CharacterStream* source);
+ void Initialize(Utf16CharacterStream* source);
// Returns the next token and advances input.
Token::Value Next();
@@ -335,9 +338,9 @@
ASSERT_NOT_NULL(current_.literal_chars);
return current_.literal_chars->ascii_literal();
}
- Vector<const uc16> literal_uc16_string() {
+ Vector<const uc16> literal_utf16_string() {
ASSERT_NOT_NULL(current_.literal_chars);
- return current_.literal_chars->uc16_literal();
+ return current_.literal_chars->utf16_literal();
}
bool is_literal_ascii() {
ASSERT_NOT_NULL(current_.literal_chars);
@@ -371,9 +374,9 @@
ASSERT_NOT_NULL(next_.literal_chars);
return next_.literal_chars->ascii_literal();
}
- Vector<const uc16> next_literal_uc16_string() {
+ Vector<const uc16> next_literal_utf16_string() {
ASSERT_NOT_NULL(next_.literal_chars);
- return next_.literal_chars->uc16_literal();
+ return next_.literal_chars->utf16_literal();
}
bool is_next_literal_ascii() {
ASSERT_NOT_NULL(next_.literal_chars);
@@ -542,8 +545,8 @@
TokenDesc current_; // desc for current token (as returned by Next())
TokenDesc next_; // desc for next token (one token look-ahead)
- // Input stream. Must be initialized to an UC16CharacterStream.
- UC16CharacterStream* source_;
+ // Input stream. Must be initialized to a Utf16CharacterStream.
+ Utf16CharacterStream* source_;
// Start position of the octal literal last scanned.

Powered by Google App Engine
This is Rietveld 408576698