src/unicode.cc - Issue 11649018: Remove Utf8InputBuffer

Unified Diff: src/unicode.cc

Issue 11649018: Remove Utf8InputBuffer (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge

Patch Set: Created 8 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: src/unicode.cc

diff --git a/src/unicode.cc b/src/unicode.cc

index 14f380642a7a8dcd8286f93a3dbd176b59a68b99..3897f8f8931957d93ca184931c927215af1b94c9 100644

--- a/src/unicode.cc

+++ b/src/unicode.cc

@@ -277,58 +277,6 @@ uchar Utf8::CalculateValue(const byte* str,

}

-const byte* Utf8::ReadBlock(Buffer<const char*> str, byte* buffer,

- unsigned capacity, unsigned* chars_read_ptr, unsigned* offset_ptr) {

- unsigned offset = *offset_ptr;

- // Bail out early if we've reached the end of the string.

- if (offset == str.length()) {

- *chars_read_ptr = 0;

- return NULL;

- }

- const byte* data = reinterpret_cast<const byte*>(str.data());

- if (data[offset] <= kMaxOneByteChar) {

- // The next character is an ASCII char so we scan forward over

- // the following ASCII characters and return the next pure ASCII

- // substring

- const byte* result = data + offset;

- offset++;

- while ((offset < str.length()) && (data[offset] <= kMaxOneByteChar))

- offset++;

- *chars_read_ptr = offset - *offset_ptr;

- *offset_ptr = offset;

- return result;

- } else {

- // The next character is non-ASCII so we just fill the buffer

- unsigned cursor = 0;

- unsigned chars_read = 0;

- while (offset < str.length()) {

- uchar c = data[offset];

- if (c <= kMaxOneByteChar) {

- // Fast case for ASCII characters

- if (!CharacterStream::EncodeAsciiCharacter(c,

- buffer,

- capacity,

- cursor))

- break;

- offset += 1;

- } else {

- unsigned chars = 0;

- c = Utf8::ValueOf(data + offset, str.length() - offset, &chars);

- if (!CharacterStream::EncodeNonAsciiCharacter(c,

- buffer,

- capacity,

- cursor))

- break;

- offset += chars;

- }

- chars_read++;

- }

- *offset_ptr = offset;

- *chars_read_ptr = chars_read;

- return buffer;

- }

unsigned CharacterStream::Length() {

unsigned result = 0;

while (has_more()) {

@@ -356,6 +304,75 @@ void CharacterStream::Seek(unsigned position) {

}

+void Utf8DecoderBase::Reset(uint16_t* buffer,

+ unsigned buffer_length,

+ const uint8_t* stream,

+ unsigned stream_length) {

+ // Assume everything will fit in the buffer and stream won't be needed.

+ last_byte_of_buffer_unused_ = false;

+ unbuffered_start_ = NULL;

+ bool writing_to_buffer = true;

+ // Loop until stream is read, writing to buffer as long as buffer has space.

+ unsigned utf16_length = 0;

+ while (stream_length != 0) {

+ unsigned cursor = 0;

+ uint32_t character = Utf8::ValueOf(stream, stream_length, &cursor);

+ ASSERT(cursor > 0 && cursor <= stream_length);

+ stream += cursor;

+ stream_length -= cursor;

+ bool is_two_byte = character > Utf16::kMaxNonSurrogateCharCode;

+ utf16_length += is_two_byte ? 2 : 1;

+ // Don't need to write to the buffer, but still need utf16_length.

+ if (!writing_to_buffer) continue;

+ // Write out the characters to the buffer.

+ // Must check for equality with buffer_length as we've already updated it.

+ if (utf16_length <= buffer_length) {

+ if (is_two_byte) {

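Yang 2012/12/20 09:20:27 misnomer? [reviewer comment on `is_two_byte`: the flag is set when the character is above Utf16::kMaxNonSurrogateCharCode and therefore needs two UTF-16 code units (a lead/trail surrogate pair), not two bytes]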

+ *buffer++ = Utf16::LeadSurrogate(character);

+ *buffer++ = Utf16::TrailSurrogate(character);

+ } else {

+ *buffer++ = character;

+ }

+ if (utf16_length == buffer_length) {

+ // Just wrote last character of buffer

+ writing_to_buffer = false;

+ unbuffered_start_ = stream;

+ }

+ continue;

+ }

+ // Have gone over buffer.

+ // Last char of buffer is unused, set cursor back.

+ ASSERT(is_two_byte);

+ writing_to_buffer = false;

+ last_byte_of_buffer_unused_ = true;

+ unbuffered_start_ = stream - cursor;

+ }

+ utf16_length_ = utf16_length;

+void Utf8DecoderBase::WriteUtf16Slow(const uint8_t* stream,

+ uint16_t* data,

+ unsigned data_length) {

+ while (data_length != 0) {

+ unsigned cursor = 0;

+ uint32_t character = Utf8::ValueOf(stream, Utf8::kMaxEncodedSize, &cursor);

+ // There's a total lack of bounds checking for stream

+ // as it was already done in Reset.

+ stream += cursor;

+ if (character > unibrow::Utf16::kMaxNonSurrogateCharCode) {

+ *data++ = Utf16::LeadSurrogate(character);

+ *data++ = Utf16::TrailSurrogate(character);

+ ASSERT(data_length > 1);

+ data_length -= 2;

+ } else {

+ *data++ = character;

+ data_length -= 1;

+ }

// Uppercase: point.category == 'Lu'

static const uint16_t kUppercaseTable0Size = 450;

« src/objects.cc ('K') | « src/unicode.h ('k') | src/unicode-inl.h » ('j') | src/unicode-inl.h » ('J')