Index: src/unicode.cc |
diff --git a/src/unicode.cc b/src/unicode.cc |
index 14f380642a7a8dcd8286f93a3dbd176b59a68b99..3897f8f8931957d93ca184931c927215af1b94c9 100644 |
--- a/src/unicode.cc |
+++ b/src/unicode.cc |
@@ -277,58 +277,6 @@ uchar Utf8::CalculateValue(const byte* str, |
} |
-const byte* Utf8::ReadBlock(Buffer<const char*> str, byte* buffer, |
- unsigned capacity, unsigned* chars_read_ptr, unsigned* offset_ptr) { |
- unsigned offset = *offset_ptr; |
- // Bail out early if we've reached the end of the string. |
- if (offset == str.length()) { |
- *chars_read_ptr = 0; |
- return NULL; |
- } |
- const byte* data = reinterpret_cast<const byte*>(str.data()); |
- if (data[offset] <= kMaxOneByteChar) { |
- // The next character is an ASCII char so we scan forward over |
- // the following ASCII characters and return the next pure ASCII |
- // substring |
- const byte* result = data + offset; |
- offset++; |
- while ((offset < str.length()) && (data[offset] <= kMaxOneByteChar)) |
- offset++; |
- *chars_read_ptr = offset - *offset_ptr; |
- *offset_ptr = offset; |
- return result; |
- } else { |
- // The next character is non-ASCII so we just fill the buffer |
- unsigned cursor = 0; |
- unsigned chars_read = 0; |
- while (offset < str.length()) { |
- uchar c = data[offset]; |
- if (c <= kMaxOneByteChar) { |
- // Fast case for ASCII characters |
- if (!CharacterStream::EncodeAsciiCharacter(c, |
- buffer, |
- capacity, |
- cursor)) |
- break; |
- offset += 1; |
- } else { |
- unsigned chars = 0; |
- c = Utf8::ValueOf(data + offset, str.length() - offset, &chars); |
- if (!CharacterStream::EncodeNonAsciiCharacter(c, |
- buffer, |
- capacity, |
- cursor)) |
- break; |
- offset += chars; |
- } |
- chars_read++; |
- } |
- *offset_ptr = offset; |
- *chars_read_ptr = chars_read; |
- return buffer; |
- } |
-} |
- |
unsigned CharacterStream::Length() { |
unsigned result = 0; |
while (has_more()) { |
@@ -356,6 +304,75 @@ void CharacterStream::Seek(unsigned position) { |
} |
} |
+void Utf8DecoderBase::Reset(uint16_t* buffer, |
+ unsigned buffer_length, |
+ const uint8_t* stream, |
+ unsigned stream_length) { |
+ // Start by assuming everything fits in the buffer and stream is not needed. |
+ last_byte_of_buffer_unused_ = false; |
+ unbuffered_start_ = NULL; |
+ bool writing_to_buffer = true; |
+ // Decode the whole stream to count utf16_length; write while buffer has space. |
+ unsigned utf16_length = 0; |
+ while (stream_length != 0) { |
+ unsigned cursor = 0; |
+ uint32_t character = Utf8::ValueOf(stream, stream_length, &cursor); |
+ ASSERT(cursor > 0 && cursor <= stream_length); |
+ stream += cursor; |
+ stream_length -= cursor; |
+ bool is_two_byte = character > Utf16::kMaxNonSurrogateCharCode; |
+ utf16_length += is_two_byte ? 2 : 1; |
+ // Buffering has stopped: skip the write but keep counting utf16_length. |
+ if (!writing_to_buffer) continue; |
+ // Write the UTF-16 code unit(s); is_two_byte means a surrogate pair here. |
+ // utf16_length already includes this character, so the fit test is <=. |
+ if (utf16_length <= buffer_length) { |
+ if (is_two_byte) { |
Yang
2012/12/20 09:20:27
misnomer?
|
+ *buffer++ = Utf16::LeadSurrogate(character); |
+ *buffer++ = Utf16::TrailSurrogate(character); |
+ } else { |
+ *buffer++ = character; |
+ } |
+ if (utf16_length == buffer_length) { |
+ // Buffer is exactly full: record where the unread rest of stream begins. |
+ writing_to_buffer = false; |
+ unbuffered_start_ = stream; |
+ } |
+ continue; |
+ } |
+ // Have gone over the buffer: a surrogate pair needs two slots, one is left. |
+ // Leave that slot unused and rewind to the start of this character. |
+ ASSERT(is_two_byte); |
+ writing_to_buffer = false; |
+ last_byte_of_buffer_unused_ = true; |
+ unbuffered_start_ = stream - cursor; |
+ } |
+ utf16_length_ = utf16_length; |
+} |
+ |
+ |
+void Utf8DecoderBase::WriteUtf16Slow(const uint8_t* stream, |
+ uint16_t* data, |
+ unsigned data_length) { |
+ while (data_length != 0) { |
+ unsigned cursor = 0; |
+ uint32_t character = Utf8::ValueOf(stream, Utf8::kMaxEncodedSize, &cursor); |
+ // stream is not bounds-checked: ValueOf is always given kMaxEncodedSize. |
+ // Relies on the scan done in Reset. NOTE(review): verify at end of stream. |
+ stream += cursor; |
+ if (character > unibrow::Utf16::kMaxNonSurrogateCharCode) { |
+ *data++ = Utf16::LeadSurrogate(character); |
+ *data++ = Utf16::TrailSurrogate(character); |
+ ASSERT(data_length > 1); |
+ data_length -= 2; |
+ } else { |
+ *data++ = character; |
+ data_length -= 1; |
+ } |
+ } |
+} |
+ |
+ |
// Uppercase: point.category == 'Lu' |
static const uint16_t kUppercaseTable0Size = 450; |