| Index: src/api.cc
|
| diff --git a/src/api.cc b/src/api.cc
|
| index cf3c0835b545190aba087dbe6cc49e60333b3b34..e8c01e6585af64b23c37d47aafab2acde7726f74 100644
|
| --- a/src/api.cc
|
| +++ b/src/api.cc
|
| @@ -3873,109 +3873,229 @@ int String::Length() const {
|
| return str->length();
|
| }
|
|
|
| +bool String::MayContainNonAscii() const {
|
| + i::Handle<i::String> str = Utils::OpenHandle(this);
|
| + if (IsDeadCheck(str->GetIsolate(), "v8::String::MayContainNonAscii()")) {
|
| + return false;
|
| + }
|
| + return !str->HasOnlyAsciiChars();
|
| +}
|
| +
|
| +
|
| +class Utf8LengthVisitor {
|
| + public:
|
| + explicit Utf8LengthVisitor()
|
| + : utf8_length_(0),
|
| + last_character_(unibrow::Utf16::kNoPreviousCharacter) {}
|
| +
|
| + inline int GetLength() {
|
| + return utf8_length_;
|
| + }
|
| +
|
| + template<typename Char>
|
| + inline void Visit(const Char* chars, unsigned length) {
|
| + ASSERT(length > 0);
|
| + // TODO(dcarney): Add back ascii fast path.
|
| + int utf8_length = 0;
|
| + int last_character = last_character_;
|
| + for (unsigned i = 0; i < length; i++) {
|
| + uint16_t c = chars[i];
|
| + utf8_length += unibrow::Utf8::Length(c, last_character);
|
| + last_character = c;
|
| + }
|
| + last_character_ = last_character;
|
| + utf8_length_ += utf8_length;
|
| + }
|
| +
|
| + inline void VisitOneByteString(const uint8_t* chars, unsigned length) {
|
| + Visit(chars, length);
|
| + }
|
| +
|
| + inline void VisitTwoByteString(const uint16_t* chars, unsigned length) {
|
| + Visit(chars, length);
|
| + }
|
| +
|
| + private:
|
| + int utf8_length_;
|
| + int last_character_;
|
| + DISALLOW_COPY_AND_ASSIGN(Utf8LengthVisitor);
|
| +};
|
| +
|
| +
|
| +static int Utf8Length(i::String* str, i::Isolate* isolate) {
|
| + unsigned length = static_cast<unsigned>(str->length());
|
| + if (length == 0) return 0;
|
| + int32_t type = str->map()->instance_type();
|
| + Utf8LengthVisitor visitor;
|
| + // Non ConsString branch.
|
| + if ((type & i::kStringRepresentationMask) != i::kConsStringTag) {
|
| + i::ConsStringNullOp null_op;
|
| + i::String::Visit(str, 0, visitor, null_op, type, length);
|
| + return visitor.GetLength();
|
| + }
|
| + i::ConsStringIteratorOp* op = isolate->write_iterator();
|
| + unsigned offset = 0;
|
| + i::String* leaf = op->Operate(str, &offset, &type, &length);
|
| + ASSERT(leaf != NULL);
|
| + while (leaf != NULL) {
|
| + i::ConsStringNullOp null_op;
|
| + ASSERT(offset == 0);
|
| + i::String::Visit(leaf, 0, visitor, null_op, type, length);
|
| + leaf = op->ContinueOperation(&type, &length);
|
| + }
|
| + return visitor.GetLength();
|
| +}
|
| +
|
|
|
| int String::Utf8Length() const {
|
| i::Handle<i::String> str = Utils::OpenHandle(this);
|
| - if (IsDeadCheck(str->GetIsolate(), "v8::String::Utf8Length()")) return 0;
|
| - return i::Utf8Length(str);
|
| -}
|
| -
|
| -
|
| -// Will fail with a negative answer if the recursion depth is too high.
|
| -static int RecursivelySerializeToUtf8(i::String* string,
|
| - char* buffer,
|
| - int start,
|
| - int end,
|
| - int recursion_budget,
|
| - int32_t previous_character,
|
| - int32_t* last_character) {
|
| - int utf8_bytes = 0;
|
| - while (true) {
|
| - if (string->IsOneByteRepresentation()) {
|
| - i::String::WriteToFlat(string, buffer, start, end);
|
| - *last_character = unibrow::Utf16::kNoPreviousCharacter;
|
| - return utf8_bytes + end - start;
|
| + i::Isolate* isolate = str->GetIsolate();
|
| + if (IsDeadCheck(isolate, "v8::String::Utf8Length()")) return 0;
|
| + return v8::Utf8Length(*str, isolate);
|
| +}
|
| +
|
| +
|
| +class Utf8WriterVisitor {
|
| + public:
|
| + Utf8WriterVisitor(char* buffer, int capacity)
|
| + : early_termination_(false),
|
| + last_character_(unibrow::Utf16::kNoPreviousCharacter),
|
| + buffer_(buffer),
|
| + start_(buffer),
|
| + capacity_(capacity),
|
| + utf16_chars_read_(0) {
|
| + }
|
| +
|
| + static int WriteEndCharacter(uint16_t character,
|
| + int last_character,
|
| + int remaining,
|
| + char* const buffer) {
|
| + using namespace unibrow;
|
| + ASSERT(remaining > 0);
|
| + // We can't use a local buffer here because Encode needs to modify
|
| + // previous characters in the stream. We know, however, that
|
| + // exactly one more byte will be written, so it fits (remaining > 0).
|
| + if (Utf16::IsTrailSurrogate(character) &&
|
| + Utf16::IsLeadSurrogate(last_character)) {
|
| + int written = Utf8::Encode(buffer, character, last_character);
|
| + ASSERT(written == 1);
|
| + return written;
|
| }
|
| - switch (i::StringShape(string).representation_tag()) {
|
| - case i::kExternalStringTag: {
|
| - const uint16_t* data = i::ExternalTwoByteString::cast(string)->
|
| - ExternalTwoByteStringGetData(0);
|
| - char* current = buffer;
|
| - for (int i = start; i < end; i++) {
|
| - uint16_t character = data[i];
|
| - current +=
|
| - unibrow::Utf8::Encode(current, character, previous_character);
|
| - previous_character = character;
|
| - }
|
| - *last_character = previous_character;
|
| - return static_cast<int>(utf8_bytes + current - buffer);
|
| + // Use a scratch buffer to find out how many bytes the character needs.
|
| + char temp_buffer[Utf8::kMaxEncodedSize];
|
| + // Can't encode using last_character as gcc has array bounds issues.
|
| + int written = Utf8::Encode(temp_buffer,
|
| + character,
|
| + unibrow::Utf16::kNoPreviousCharacter);
|
| + // Won't fit.
|
| + if (written > remaining) return 0;
|
| + // Copy over the character from temp_buffer.
|
| + for (int j = 0; j < written; j++) {
|
| + buffer[j] = temp_buffer[j];
|
| + }
|
| + return written;
|
| + }
|
| +
|
| + template<typename Char>
|
| + void Visit(const Char* chars, const int length) {
|
| + using namespace unibrow;
|
| + // TODO(dcarney): Add back ascii fast path.
|
| + ASSERT(!early_termination_);
|
| + ASSERT(length > 0);
|
| + // Copy state to stack.
|
| + char* buffer = buffer_;
|
| + int last_character = last_character_;
|
| + int i = 0;
|
| + // Do a fast loop where there is no per-character capacity check.
|
| + while (true) {
|
| + int fast_length;
|
| + if (capacity_ == -1) {
|
| + fast_length = length;
|
| + } else {
|
| + int remaing_capacity = capacity_ - (buffer - start_);
|
| + // Need enough space to write everything but one character.
|
| + STATIC_ASSERT(Utf16::kMaxExtraUtf8BytesForOneUtf16CodeUnit == 3);
|
| + int writable_length = (remaing_capacity - 3)/3;
|
| + // Need to drop into slow loop.
|
| + if (writable_length <= 0) break;
|
| + fast_length = i + writable_length;
|
| + if (fast_length > length) fast_length = length;
|
| }
|
| - case i::kSeqStringTag: {
|
| - const uint16_t* data =
|
| - i::SeqTwoByteString::cast(string)->SeqTwoByteStringGetData(0);
|
| - char* current = buffer;
|
| - for (int i = start; i < end; i++) {
|
| - uint16_t character = data[i];
|
| - current +=
|
| - unibrow::Utf8::Encode(current, character, previous_character);
|
| - previous_character = character;
|
| - }
|
| - *last_character = previous_character;
|
| - return static_cast<int>(utf8_bytes + current - buffer);
|
| + // Write the characters to the stream.
|
| + for (; i < fast_length; i++) {
|
| + uint16_t character = *chars++;
|
| + buffer += Utf8::Encode(buffer, character, last_character);
|
| + last_character = character;
|
| + ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);
|
| }
|
| - case i::kSlicedStringTag: {
|
| - i::SlicedString* slice = i::SlicedString::cast(string);
|
| - unsigned offset = slice->offset();
|
| - string = slice->parent();
|
| - start += offset;
|
| - end += offset;
|
| - continue;
|
| + // Array is fully written. Exit.
|
| + if (fast_length == length) {
|
| + // Write state back out to object.
|
| + last_character_ = last_character;
|
| + buffer_ = buffer;
|
| + utf16_chars_read_ += i;
|
| + return;
|
| }
|
| - case i::kConsStringTag: {
|
| - i::ConsString* cons_string = i::ConsString::cast(string);
|
| - i::String* first = cons_string->first();
|
| - int boundary = first->length();
|
| - if (start >= boundary) {
|
| - // Only need RHS.
|
| - string = cons_string->second();
|
| - start -= boundary;
|
| - end -= boundary;
|
| - continue;
|
| - } else if (end <= boundary) {
|
| - // Only need LHS.
|
| - string = first;
|
| - } else {
|
| - if (recursion_budget == 0) return -1;
|
| - int extra_utf8_bytes =
|
| - RecursivelySerializeToUtf8(first,
|
| - buffer,
|
| - start,
|
| - boundary,
|
| - recursion_budget - 1,
|
| - previous_character,
|
| - &previous_character);
|
| - if (extra_utf8_bytes < 0) return extra_utf8_bytes;
|
| - buffer += extra_utf8_bytes;
|
| - utf8_bytes += extra_utf8_bytes;
|
| - string = cons_string->second();
|
| - start = 0;
|
| - end -= boundary;
|
| - }
|
| + }
|
| + ASSERT(capacity_ != -1);
|
| + // Slow loop. Must check capacity on each iteration.
|
| + int remaining_capacity = capacity_ - (buffer - start_);
|
| + ASSERT(remaining_capacity >= 0);
|
| + for (; i < length && remaining_capacity > 0; i++) {
|
| + uint16_t character = *chars++;
|
| + int written = WriteEndCharacter(character,
|
| + last_character,
|
| + remaining_capacity,
|
| + buffer);
|
| + if (written == 0) {
|
| + early_termination_ = true;
|
| + break;
|
| }
|
| + buffer += written;
|
| + remaining_capacity -= written;
|
| + last_character = character;
|
| }
|
| + // Write state back out to object.
|
| + last_character_ = last_character;
|
| + buffer_ = buffer;
|
| + utf16_chars_read_ += i;
|
| }
|
| - UNREACHABLE();
|
| - return 0;
|
| -}
|
|
|
| + inline bool IsDone() {
|
| + return early_termination_;
|
| + }
|
|
|
| -bool String::MayContainNonAscii() const {
|
| - i::Handle<i::String> str = Utils::OpenHandle(this);
|
| - if (IsDeadCheck(str->GetIsolate(), "v8::String::MayContainNonAscii()")) {
|
| - return false;
|
| + inline void VisitOneByteString(const uint8_t* chars, unsigned length) {
|
| + Visit(chars, static_cast<int>(length));
|
| }
|
| - return !str->HasOnlyAsciiChars();
|
| -}
|
| +
|
| + inline void VisitTwoByteString(const uint16_t* chars, unsigned length) {
|
| + Visit(chars, static_cast<int>(length));
|
| + }
|
| +
|
| + inline int CompleteWrite(bool write_null, int* utf16_chars_read_out) {
|
| + // Report the number of utf16 code units consumed from the string.
|
| + if (utf16_chars_read_out != NULL) {
|
| + *utf16_chars_read_out = utf16_chars_read_;
|
| + }
|
| + // Only null terminate if all of the string was written and there's space.
|
| + if (write_null &&
|
| + !early_termination_ &&
|
| + (capacity_ == -1 || (buffer_ - start_) < capacity_)) {
|
| + *buffer_++ = '\0';
|
| + }
|
| + return buffer_ - start_;
|
| + }
|
| +
|
| + private:
|
| + bool early_termination_;
|
| + int last_character_;
|
| + char* buffer_;
|
| + char* const start_;
|
| + int capacity_;
|
| + int utf16_chars_read_;
|
| + DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor);
|
| +};
|
|
|
|
|
| int String::WriteUtf8(char* buffer,
|
| @@ -3990,122 +4110,23 @@ int String::WriteUtf8(char* buffer,
|
| if (options & HINT_MANY_WRITES_EXPECTED) {
|
| FlattenString(str); // Flatten the string for efficiency.
|
| }
|
| - int string_length = str->length();
|
| - if (str->IsOneByteRepresentation()) {
|
| - int len;
|
| - if (capacity == -1) {
|
| - capacity = str->length() + 1;
|
| - len = string_length;
|
| - } else {
|
| - len = i::Min(capacity, str->length());
|
| - }
|
| - i::String::WriteToFlat(*str, buffer, 0, len);
|
| - if (nchars_ref != NULL) *nchars_ref = len;
|
| - if (!(options & NO_NULL_TERMINATION) && capacity > len) {
|
| - buffer[len] = '\0';
|
| - return len + 1;
|
| - }
|
| - return len;
|
| - }
|
| -
|
| - if (capacity == -1 || capacity / 3 >= string_length) {
|
| - int32_t previous = unibrow::Utf16::kNoPreviousCharacter;
|
| - const int kMaxRecursion = 100;
|
| - int utf8_bytes =
|
| - RecursivelySerializeToUtf8(*str,
|
| - buffer,
|
| - 0,
|
| - string_length,
|
| - kMaxRecursion,
|
| - previous,
|
| - &previous);
|
| - if (utf8_bytes >= 0) {
|
| - // Success serializing with recursion.
|
| - if ((options & NO_NULL_TERMINATION) == 0 &&
|
| - (capacity > utf8_bytes || capacity == -1)) {
|
| - buffer[utf8_bytes++] = '\0';
|
| - }
|
| - if (nchars_ref != NULL) *nchars_ref = string_length;
|
| - return utf8_bytes;
|
| + Utf8WriterVisitor writer(buffer, capacity);
|
| + i::ConsStringIteratorOp* op = isolate->write_iterator();
|
| + op->Reset();
|
| + int32_t type = str->map()->instance_type();
|
| + unsigned str_length = static_cast<unsigned>(str->length());
|
| + if (str_length != 0) {
|
| + i::String::Visit(*str, 0, writer, *op, type, str_length);
|
| + while (!writer.IsDone()) {
|
| + unsigned length_out;
|
| + i::String* next = op->ContinueOperation(&type, &length_out);
|
| + if (next == NULL) break;
|
| + // TODO(dcarney): need an asserting null op.
|
| + i::ConsStringNullOp null_op;
|
| + i::String::Visit(next, 0, writer, null_op, type, length_out);
|
| }
|
| - FlattenString(str);
|
| - // Recurse once. This time around the string is flat and the serializing
|
| - // with recursion will certainly succeed.
|
| - return WriteUtf8(buffer, capacity, nchars_ref, options);
|
| - } else if (capacity >= string_length) {
|
| - // First check that the buffer is large enough. If it is, then recurse
|
| - // once without a capacity limit, which will get into the other branch of
|
| - // this 'if'.
|
| - int utf8_bytes = i::Utf8Length(str);
|
| - if ((options & NO_NULL_TERMINATION) == 0) utf8_bytes++;
|
| - if (utf8_bytes <= capacity) {
|
| - return WriteUtf8(buffer, -1, nchars_ref, options);
|
| - }
|
| - }
|
| -
|
| - // Slow case.
|
| - i::StringCharacterStream stream(*str, isolate->write_iterator());
|
| - isolate->string_tracker()->RecordWrite(str);
|
| -
|
| - int len = str->length();
|
| - // Encode the first K - 3 bytes directly into the buffer since we
|
| - // know there's room for them. If no capacity is given we copy all
|
| - // of them here.
|
| - int fast_end = capacity - (unibrow::Utf8::kMaxEncodedSize - 1);
|
| - int i;
|
| - int pos = 0;
|
| - int nchars = 0;
|
| - int previous = unibrow::Utf16::kNoPreviousCharacter;
|
| - for (i = 0; i < len && (capacity == -1 || pos < fast_end); i++) {
|
| - i::uc32 c = stream.GetNext();
|
| - int written = unibrow::Utf8::Encode(buffer + pos, c, previous);
|
| - pos += written;
|
| - nchars++;
|
| - previous = c;
|
| - }
|
| - if (i < len) {
|
| - // For the last characters we need to check the length for each one
|
| - // because they may be longer than the remaining space in the
|
| - // buffer.
|
| - char intermediate[unibrow::Utf8::kMaxEncodedSize];
|
| - for (; i < len && pos < capacity; i++) {
|
| - i::uc32 c = stream.GetNext();
|
| - if (unibrow::Utf16::IsTrailSurrogate(c) &&
|
| - unibrow::Utf16::IsLeadSurrogate(previous)) {
|
| - // We can't use the intermediate buffer here because the encoding
|
| - // of surrogate pairs is done under assumption that you can step
|
| - // back and fix the UTF8 stream. Luckily we only need space for one
|
| - // more byte, so there is always space.
|
| - ASSERT(pos < capacity);
|
| - int written = unibrow::Utf8::Encode(buffer + pos, c, previous);
|
| - ASSERT(written == 1);
|
| - pos += written;
|
| - nchars++;
|
| - } else {
|
| - int written =
|
| - unibrow::Utf8::Encode(intermediate,
|
| - c,
|
| - unibrow::Utf16::kNoPreviousCharacter);
|
| - if (pos + written <= capacity) {
|
| - for (int j = 0; j < written; j++) {
|
| - buffer[pos + j] = intermediate[j];
|
| - }
|
| - pos += written;
|
| - nchars++;
|
| - } else {
|
| - // We've reached the end of the buffer
|
| - break;
|
| - }
|
| - }
|
| - previous = c;
|
| - }
|
| - }
|
| - if (nchars_ref != NULL) *nchars_ref = nchars;
|
| - if (!(options & NO_NULL_TERMINATION) &&
|
| - (i == len && (capacity == -1 || pos < capacity))) {
|
| - buffer[pos++] = '\0';
|
| }
|
| - return pos;
|
| + return writer.CompleteWrite(!(options & NO_NULL_TERMINATION), nchars_ref);
|
| }
|
|
|
|
|
| @@ -5637,7 +5658,7 @@ String::Utf8Value::Utf8Value(v8::Handle<v8::Value> obj)
|
| Handle<String> str = obj->ToString();
|
| if (str.IsEmpty()) return;
|
| i::Handle<i::String> i_str = Utils::OpenHandle(*str);
|
| - length_ = i::Utf8Length(i_str);
|
| + length_ = v8::Utf8Length(*i_str, isolate);
|
| str_ = i::NewArray<char>(length_ + 1);
|
| str->WriteUtf8(str_);
|
| }
|
|
|