src/api.cc - Issue 11725006: Refactor out assumption that one byte strings are ascii in utf8 processing.

Unified Diff: src/api.cc

Issue 11725006: Refactor out assumption that one byte strings are ascii in utf8 processing. (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge

Patch Set: Fix array bounds issue Created 7 years, 12 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/api.cc

diff --git a/src/api.cc b/src/api.cc

index cf3c0835b545190aba087dbe6cc49e60333b3b34..e8c01e6585af64b23c37d47aafab2acde7726f74 100644

--- a/src/api.cc

+++ b/src/api.cc

@@ -3873,109 +3873,229 @@ int String::Length() const {

return str->length();

}

+bool String::MayContainNonAscii() const {

+ i::Handle<i::String> str = Utils::OpenHandle(this);

+ if (IsDeadCheck(str->GetIsolate(), "v8::String::MayContainNonAscii()")) {

+ return false;

+ }

+ return !str->HasOnlyAsciiChars();

+class Utf8LengthVisitor {  // Sums the UTF-8 length of every character chunk handed to Visit().

+ public:

+ explicit Utf8LengthVisitor()

+ : utf8_length_(0),  // Nothing counted yet.

+ last_character_(unibrow::Utf16::kNoPreviousCharacter) {}  // No chunk has been visited yet.

+ inline int GetLength() {  // Returns the total accumulated by all Visit() calls so far.

+ return utf8_length_;

+ }

+ template<typename Char>

+ inline void Visit(const Char* chars, unsigned length) {  // Adds the UTF-8 length of chars[0..length) to the running total.

+ ASSERT(length > 0);  // Callers must not pass an empty chunk.

+ // TODO(dcarney) Add back ascii fast path.

+ int utf8_length = 0;  // Length of this chunk only; folded into the member after the loop.

+ int last_character = last_character_;  // Work on a local copy; the member is written back after the loop.

+ for (unsigned i = 0; i < length; i++) {

+ uint16_t c = chars[i];  // Each unit is held as uint16_t whichever Char type was supplied.

+ utf8_length += unibrow::Utf8::Length(c, last_character);  // Length depends on the preceding unit too; presumably for surrogate pairs -- confirm in unibrow.

+ last_character = c;

+ }

+ last_character_ = last_character;  // Carried over so the next chunk's first unit sees this chunk's last unit.

+ utf8_length_ += utf8_length;

+ }

+ inline void VisitOneByteString(const uint8_t* chars, unsigned length) {  // One-byte entry point; forwards to Visit().

+ Visit(chars, length);

+ }

+ inline void VisitTwoByteString(const uint16_t* chars, unsigned length) {  // Two-byte entry point; forwards to Visit().

+ Visit(chars, length);

+ }

+ private:

+ int utf8_length_;  // Running total returned by GetLength().

+ int last_character_;  // Last unit seen by Visit(), or Utf16::kNoPreviousCharacter before the first call.

+ DISALLOW_COPY_AND_ASSIGN(Utf8LengthVisitor);  // NOTE(review): presumably disables copying; macro is defined outside this view.

+};

+static int Utf8Length(i::String* str, i::Isolate* isolate) {

+ unsigned length = static_cast<unsigned>(str->length());

+ if (length == 0) return 0;

+ int32_t type = str->map()->instance_type();

+ Utf8LengthVisitor visitor;

+ // Non ConsString branch.

+ if ((type & i::kStringRepresentationMask) != i::kConsStringTag) {

+ i::ConsStringNullOp null_op;

+ i::String::Visit(str, 0, visitor, null_op, type, length);

+ return visitor.GetLength();

+ }

+ i::ConsStringIteratorOp* op = isolate->write_iterator();

+ unsigned offset = 0;

+ i::String* leaf = op->Operate(str, &offset, &type, &length);

+ ASSERT(leaf != NULL);

+ while (leaf != NULL) {

+ i::ConsStringNullOp null_op;

+ ASSERT(offset == 0);

+ i::String::Visit(leaf, 0, visitor, null_op, type, length);

+ leaf = op->ContinueOperation(&type, &length);

+ }

+ return visitor.GetLength();

int String::Utf8Length() const {

i::Handle<i::String> str = Utils::OpenHandle(this);

- if (IsDeadCheck(str->GetIsolate(), "v8::String::Utf8Length()")) return 0;

- return i::Utf8Length(str);

-// Will fail with a negative answer if the recursion depth is too high.

-static int RecursivelySerializeToUtf8(i::String* string,

- char* buffer,

- int start,

- int end,

- int recursion_budget,

- int32_t previous_character,

- int32_t* last_character) {

- int utf8_bytes = 0;

- while (true) {

- if (string->IsOneByteRepresentation()) {

- i::String::WriteToFlat(string, buffer, start, end);

- *last_character = unibrow::Utf16::kNoPreviousCharacter;

- return utf8_bytes + end - start;

+ i::Isolate* isolate = str->GetIsolate();

+ if (IsDeadCheck(isolate, "v8::String::Utf8Length()")) return 0;

+ return v8::Utf8Length(*str, isolate);

+class Utf8WriterVisitor {

+ public:

+ Utf8WriterVisitor(char* buffer, int capacity)

+ : early_termination_(false),

+ last_character_(unibrow::Utf16::kNoPreviousCharacter),

+ buffer_(buffer),

+ start_(buffer),

+ capacity_(capacity),

+ utf16_chars_read_(0) {

+ }

+ static int WriteEndCharacter(uint16_t character,

+ int last_character,

+ int remaining,

+ char* const buffer) {

+ using namespace unibrow;

+ ASSERT(remaining > 0);

+ // We can't use a local buffer here because Encode needs to modify

+ // previous characters in the stream. We know, however, that

+ // exactly one character will be advanced.

+ if (Utf16::IsTrailSurrogate(character) &&

+ Utf16::IsLeadSurrogate(last_character)) {

+ int written = Utf8::Encode(buffer, character, last_character);

+ ASSERT(written == 1);

+ return written;

}

- switch (i::StringShape(string).representation_tag()) {

- case i::kExternalStringTag: {

- const uint16_t* data = i::ExternalTwoByteString::cast(string)->

- ExternalTwoByteStringGetData(0);

- char* current = buffer;

- for (int i = start; i < end; i++) {

- uint16_t character = data[i];

- current +=

- unibrow::Utf8::Encode(current, character, previous_character);

- previous_character = character;

- }

- *last_character = previous_character;

- return static_cast<int>(utf8_bytes + current - buffer);

+ // Use a scratch buffer to check the required characters.

+ char temp_buffer[Utf8::kMaxEncodedSize];

+ // Can't encode using last_character as gcc has array bounds issues.

+ int written = Utf8::Encode(temp_buffer,

+ character,

+ unibrow::Utf16::kNoPreviousCharacter);

+ // Won't fit.

+ if (written > remaining) return 0;

+ // Copy over the character from temp_buffer.

+ for (int j = 0; j < written; j++) {

+ buffer[j] = temp_buffer[j];

+ }

+ return written;

+ }

+ template<typename Char>

+ void Visit(const Char* chars, const int length) {

+ using namespace unibrow;

+ // TODO(dcarney): Add back ascii fast path.

+ ASSERT(!early_termination_);

+ ASSERT(length > 0);

+ // Copy state to stack.

+ char* buffer = buffer_;

+ int last_character = last_character_;

+ int i = 0;

+ // Do a fast loop where there is no exit capacity check.

+ while (true) {

+ int fast_length;

+ if (capacity_ == -1) {

+ fast_length = length;

+ } else {

+ int remaing_capacity = capacity_ - (buffer - start_);

+ // Need enough space to write everything but one character.

+ STATIC_ASSERT(Utf16::kMaxExtraUtf8BytesForOneUtf16CodeUnit == 3);

+ int writable_length = (remaing_capacity - 3)/3;

+ // Need to drop into slow loop.

+ if (writable_length <= 0) break;

+ fast_length = i + writable_length;

+ if (fast_length > length) fast_length = length;

}

- case i::kSeqStringTag: {

- const uint16_t* data =

- i::SeqTwoByteString::cast(string)->SeqTwoByteStringGetData(0);

- char* current = buffer;

- for (int i = start; i < end; i++) {

- uint16_t character = data[i];

- current +=

- unibrow::Utf8::Encode(current, character, previous_character);

- previous_character = character;

- }

- *last_character = previous_character;

- return static_cast<int>(utf8_bytes + current - buffer);

+ // Write the characters to the stream.

+ for (; i < fast_length; i++) {

+ uint16_t character = *chars++;

+ buffer += Utf8::Encode(buffer, character, last_character);

+ last_character = character;

+ ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);

}

- case i::kSlicedStringTag: {

- i::SlicedString* slice = i::SlicedString::cast(string);

- unsigned offset = slice->offset();

- string = slice->parent();

- start += offset;

- end += offset;

- continue;

+ // Array is fully written. Exit.

+ if (fast_length == length) {

+ // Write state back out to object.

+ last_character_ = last_character;

+ buffer_ = buffer;

+ utf16_chars_read_ += i;

+ return;

}

- case i::kConsStringTag: {

- i::ConsString* cons_string = i::ConsString::cast(string);

- i::String* first = cons_string->first();

- int boundary = first->length();

- if (start >= boundary) {

- // Only need RHS.

- string = cons_string->second();

- start -= boundary;

- end -= boundary;

- continue;

- } else if (end <= boundary) {

- // Only need LHS.

- string = first;

- } else {

- if (recursion_budget == 0) return -1;

- int extra_utf8_bytes =

- RecursivelySerializeToUtf8(first,

- buffer,

- start,

- boundary,

- recursion_budget - 1,

- previous_character,

- &previous_character);

- if (extra_utf8_bytes < 0) return extra_utf8_bytes;

- buffer += extra_utf8_bytes;

- utf8_bytes += extra_utf8_bytes;

- string = cons_string->second();

- start = 0;

- end -= boundary;

- }

+ }

+ ASSERT(capacity_ != -1);

+ // Slow loop. Must check capacity on each iteration.

+ int remaining_capacity = capacity_ - (buffer - start_);

+ ASSERT(remaining_capacity >= 0);

+ for (; i < length && remaining_capacity > 0; i++) {

+ uint16_t character = *chars++;

+ int written = WriteEndCharacter(character,

+ last_character,

+ remaining_capacity,

+ buffer);

+ if (written == 0) {

+ early_termination_ = true;

+ break;

}

+ buffer += written;

+ remaining_capacity -= written;

+ last_character = character;

}

+ // Write state back out to object.

+ last_character_ = last_character;

+ buffer_ = buffer;

+ utf16_chars_read_ += i;

}

- UNREACHABLE();

- return 0;

+ inline bool IsDone() {

+ return early_termination_;

+ }

-bool String::MayContainNonAscii() const {

- i::Handle<i::String> str = Utils::OpenHandle(this);

- if (IsDeadCheck(str->GetIsolate(), "v8::String::MayContainNonAscii()")) {

- return false;

+ inline void VisitOneByteString(const uint8_t* chars, unsigned length) {

+ Visit(chars, static_cast<int>(length));

}

- return !str->HasOnlyAsciiChars();

+ inline void VisitTwoByteString(const uint16_t* chars, unsigned length) {

+ Visit(chars, static_cast<int>(length));

+ }

+ inline int CompleteWrite(bool write_null, int* utf16_chars_read_out) {

+ // Write out number of utf16 characters written to the stream.

+ if (utf16_chars_read_out != NULL) {

+ *utf16_chars_read_out = utf16_chars_read_;

+ }

+ // Only null terminate if all of the string was written and there's space.

+ if (write_null &&

+ !early_termination_ &&

+ (capacity_ == -1 || (buffer_ - start_) < capacity_)) {

+ *buffer_++ = '\0';

+ }

+ return buffer_ - start_;

+ }

+ private:

+ bool early_termination_;

+ int last_character_;

+ char* buffer_;

+ char* const start_;

+ int capacity_;

+ int utf16_chars_read_;

+ DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor);

+};

int String::WriteUtf8(char* buffer,

@@ -3990,122 +4110,23 @@ int String::WriteUtf8(char* buffer,

if (options & HINT_MANY_WRITES_EXPECTED) {

FlattenString(str); // Flatten the string for efficiency.

}

- int string_length = str->length();

- if (str->IsOneByteRepresentation()) {

- int len;

- if (capacity == -1) {

- capacity = str->length() + 1;

- len = string_length;

- } else {

- len = i::Min(capacity, str->length());

- }

- i::String::WriteToFlat(*str, buffer, 0, len);

- if (nchars_ref != NULL) *nchars_ref = len;

- if (!(options & NO_NULL_TERMINATION) && capacity > len) {

- buffer[len] = '\0';

- return len + 1;

- }

- return len;

- }

- if (capacity == -1 || capacity / 3 >= string_length) {

- int32_t previous = unibrow::Utf16::kNoPreviousCharacter;

- const int kMaxRecursion = 100;

- int utf8_bytes =

- RecursivelySerializeToUtf8(*str,

- buffer,

- 0,

- string_length,

- kMaxRecursion,

- previous,

- &previous);

- if (utf8_bytes >= 0) {

- // Success serializing with recursion.

- if ((options & NO_NULL_TERMINATION) == 0 &&

- (capacity > utf8_bytes || capacity == -1)) {

- buffer[utf8_bytes++] = '\0';

- }

- if (nchars_ref != NULL) *nchars_ref = string_length;

- return utf8_bytes;

+ Utf8WriterVisitor writer(buffer, capacity);

+ i::ConsStringIteratorOp* op = isolate->write_iterator();

+ op->Reset();

+ int32_t type = str->map()->instance_type();

+ unsigned str_length = static_cast<unsigned>(str->length());

+ if (str_length != 0) {

+ i::String::Visit(*str, 0, writer, *op, type, str_length);

+ while (!writer.IsDone()) {

+ unsigned length_out;

+ i::String* next = op->ContinueOperation(&type, &length_out);

+ if (next == NULL) break;

+ // TODO(dcarney): need an asserting null op.

+ i::ConsStringNullOp null_op;

+ i::String::Visit(next, 0, writer, null_op, type, length_out);

}

- FlattenString(str);

- // Recurse once. This time around the string is flat and the serializing

- // with recursion will certainly succeed.

- return WriteUtf8(buffer, capacity, nchars_ref, options);

- } else if (capacity >= string_length) {

- // First check that the buffer is large enough. If it is, then recurse

- // once without a capacity limit, which will get into the other branch of

- // this 'if'.

- int utf8_bytes = i::Utf8Length(str);

- if ((options & NO_NULL_TERMINATION) == 0) utf8_bytes++;

- if (utf8_bytes <= capacity) {

- return WriteUtf8(buffer, -1, nchars_ref, options);

- }

- // Slow case.

- i::StringCharacterStream stream(*str, isolate->write_iterator());

- isolate->string_tracker()->RecordWrite(str);

- int len = str->length();

- // Encode the first K - 3 bytes directly into the buffer since we

- // know there's room for them. If no capacity is given we copy all

- // of them here.

- int fast_end = capacity - (unibrow::Utf8::kMaxEncodedSize - 1);

- int i;

- int pos = 0;

- int nchars = 0;

- int previous = unibrow::Utf16::kNoPreviousCharacter;

- for (i = 0; i < len && (capacity == -1 || pos < fast_end); i++) {

- i::uc32 c = stream.GetNext();

- int written = unibrow::Utf8::Encode(buffer + pos, c, previous);

- pos += written;

- nchars++;

- previous = c;

- }

- if (i < len) {

- // For the last characters we need to check the length for each one

- // because they may be longer than the remaining space in the

- // buffer.

- char intermediate[unibrow::Utf8::kMaxEncodedSize];

- for (; i < len && pos < capacity; i++) {

- i::uc32 c = stream.GetNext();

- if (unibrow::Utf16::IsTrailSurrogate(c) &&

- unibrow::Utf16::IsLeadSurrogate(previous)) {

- // We can't use the intermediate buffer here because the encoding

- // of surrogate pairs is done under assumption that you can step

- // back and fix the UTF8 stream. Luckily we only need space for one

- // more byte, so there is always space.

- ASSERT(pos < capacity);

- int written = unibrow::Utf8::Encode(buffer + pos, c, previous);

- ASSERT(written == 1);

- pos += written;

- nchars++;

- } else {

- int written =

- unibrow::Utf8::Encode(intermediate,

- c,

- unibrow::Utf16::kNoPreviousCharacter);

- if (pos + written <= capacity) {

- for (int j = 0; j < written; j++) {

- buffer[pos + j] = intermediate[j];

- }

- pos += written;

- nchars++;

- } else {

- // We've reached the end of the buffer

- break;

- }

- previous = c;

- }

- if (nchars_ref != NULL) *nchars_ref = nchars;

- if (!(options & NO_NULL_TERMINATION) &&

- (i == len && (capacity == -1 || pos < capacity))) {

- buffer[pos++] = '\0';

}

- return pos;

+ return writer.CompleteWrite(!(options & NO_NULL_TERMINATION), nchars_ref);

}

@@ -5637,7 +5658,7 @@ String::Utf8Value::Utf8Value(v8::Handle<v8::Value> obj)

Handle<String> str = obj->ToString();

if (str.IsEmpty()) return;

i::Handle<i::String> i_str = Utils::OpenHandle(*str);

- length_ = i::Utf8Length(i_str);

+ length_ = v8::Utf8Length(*i_str, isolate);

str_ = i::NewArray<char>(length_ + 1);

str->WriteUtf8(str_);

}

« no previous file with comments | « no previous file | src/handles.h » ('j') | no next file with comments »