Chromium Code Reviews| Index: src/unicode.cc |
| diff --git a/src/unicode.cc b/src/unicode.cc |
| index 14f380642a7a8dcd8286f93a3dbd176b59a68b99..3897f8f8931957d93ca184931c927215af1b94c9 100644 |
| --- a/src/unicode.cc |
| +++ b/src/unicode.cc |
| @@ -277,58 +277,6 @@ uchar Utf8::CalculateValue(const byte* str, |
| } |
| -const byte* Utf8::ReadBlock(Buffer<const char*> str, byte* buffer, |
| - unsigned capacity, unsigned* chars_read_ptr, unsigned* offset_ptr) { |
| - unsigned offset = *offset_ptr; |
| - // Bail out early if we've reached the end of the string. |
| - if (offset == str.length()) { |
| - *chars_read_ptr = 0; |
| - return NULL; |
| - } |
| - const byte* data = reinterpret_cast<const byte*>(str.data()); |
| - if (data[offset] <= kMaxOneByteChar) { |
| - // The next character is an ASCII char so we scan forward over |
| - // the following ASCII characters and return the next pure ASCII |
| - // substring |
| - const byte* result = data + offset; |
| - offset++; |
| - while ((offset < str.length()) && (data[offset] <= kMaxOneByteChar)) |
| - offset++; |
| - *chars_read_ptr = offset - *offset_ptr; |
| - *offset_ptr = offset; |
| - return result; |
| - } else { |
| - // The next character is non-ASCII so we just fill the buffer |
| - unsigned cursor = 0; |
| - unsigned chars_read = 0; |
| - while (offset < str.length()) { |
| - uchar c = data[offset]; |
| - if (c <= kMaxOneByteChar) { |
| - // Fast case for ASCII characters |
| - if (!CharacterStream::EncodeAsciiCharacter(c, |
| - buffer, |
| - capacity, |
| - cursor)) |
| - break; |
| - offset += 1; |
| - } else { |
| - unsigned chars = 0; |
| - c = Utf8::ValueOf(data + offset, str.length() - offset, &chars); |
| - if (!CharacterStream::EncodeNonAsciiCharacter(c, |
| - buffer, |
| - capacity, |
| - cursor)) |
| - break; |
| - offset += chars; |
| - } |
| - chars_read++; |
| - } |
| - *offset_ptr = offset; |
| - *chars_read_ptr = chars_read; |
| - return buffer; |
| - } |
| -} |
| - |
| unsigned CharacterStream::Length() { |
| unsigned result = 0; |
| while (has_more()) { |
| @@ -356,6 +304,75 @@ void CharacterStream::Seek(unsigned position) { |
| } |
| } |
| +// Decodes the UTF-8 bytes in |stream| and records the length of the result,
| +// counted in UTF-16 code units, in utf16_length_. Decoded units are stored in
| +// |buffer| for as long as they fit within |buffer_length|. Once storing
| +// stops, unbuffered_start_ points at the stream position just past the last
| +// character that was stored; it stays NULL if the buffer was never filled.
| +void Utf8DecoderBase::Reset(uint16_t* buffer,
| +                            unsigned buffer_length,
| +                            const uint8_t* stream,
| +                            unsigned stream_length) {
| +  // Start from the optimistic state: the whole input fits in |buffer|.
| +  last_byte_of_buffer_unused_ = false;
| +  unbuffered_start_ = NULL;
| +  bool buffer_has_room = true;
| +  unsigned units_so_far = 0;
| +  // Always walk the entire stream: even after storing has stopped, the total
| +  // UTF-16 length is still required.
| +  while (stream_length != 0) {
| +    unsigned consumed = 0;
| +    uint32_t code_point = Utf8::ValueOf(stream, stream_length, &consumed);
| +    ASSERT(consumed > 0 && consumed <= stream_length);
| +    stream += consumed;
| +    stream_length -= consumed;
| +    // Values above kMaxNonSurrogateCharCode are written as a lead/trail
| +    // surrogate pair, i.e. two UTF-16 code units instead of one.
| +    bool needs_surrogate_pair = code_point > Utf16::kMaxNonSurrogateCharCode;
| +    units_so_far += needs_surrogate_pair ? 2 : 1;
| +    // Counting only from here on; nothing more is stored.
| +    if (!buffer_has_room) continue;
| +    if (units_so_far > buffer_length) {
| +      // The count already includes this character, so going past the
| +      // capacity means a surrogate pair would straddle the end of the
| +      // buffer. The final slot is left unused and this character becomes
| +      // the first unbuffered one.
| +      ASSERT(needs_surrogate_pair);
| +      buffer_has_room = false;
| +      last_byte_of_buffer_unused_ = true;
| +      unbuffered_start_ = stream - consumed;
| +      continue;
| +    }
| +    // The character fits; store its one or two code units.
| +    if (needs_surrogate_pair) {
|
Yang
2012/12/20 09:20:27
misnomer?
|
| +      *buffer++ = Utf16::LeadSurrogate(code_point);
| +      *buffer++ = Utf16::TrailSurrogate(code_point);
| +    } else {
| +      *buffer++ = code_point;
| +    }
| +    if (units_so_far == buffer_length) {
| +      // That store filled the buffer exactly; whatever follows is unbuffered.
| +      buffer_has_room = false;
| +      unbuffered_start_ = stream;
| +    }
| +  }
| +  utf16_length_ = units_so_far;
| +}
| + |
| + |
| +// Decodes UTF-8 from |stream| and writes UTF-16 code units to |data| until
| +// |data_length| units have been produced. Code points above
| +// Utf16::kMaxNonSurrogateCharCode are emitted as a lead/trail surrogate pair
| +// (two units); every other value is emitted as a single unit.
| +void Utf8DecoderBase::WriteUtf16Slow(const uint8_t* stream,
| +                                     uint16_t* data,
| +                                     unsigned data_length) {
| +  while (data_length != 0) {
| +    unsigned cursor = 0;
| +    uint32_t character = Utf8::ValueOf(stream, Utf8::kMaxEncodedSize, &cursor);
| +    // There's a total lack of bounds checking for stream
| +    // as it was already done in Reset.
| +    stream += cursor;
| +    if (character > Utf16::kMaxNonSurrogateCharCode) {
| +      // A surrogate pair occupies two output slots. Check that before
| +      // storing, so a violated invariant is reported ahead of the
| +      // out-of-bounds write rather than after it.
| +      ASSERT(data_length > 1);
| +      *data++ = Utf16::LeadSurrogate(character);
| +      *data++ = Utf16::TrailSurrogate(character);
| +      data_length -= 2;
| +    } else {
| +      *data++ = character;
| +      data_length -= 1;
| +    }
| +  }
| +}
| + |
| + |
| // Uppercase: point.category == 'Lu' |
| static const uint16_t kUppercaseTable0Size = 450; |