Index: src/api.cc |
diff --git a/src/api.cc b/src/api.cc |
index cf3c0835b545190aba087dbe6cc49e60333b3b34..e8c01e6585af64b23c37d47aafab2acde7726f74 100644 |
--- a/src/api.cc |
+++ b/src/api.cc |
@@ -3873,109 +3873,229 @@ int String::Length() const { |
return str->length(); |
} |
+// Returns false when IsDeadCheck() trips for the string's isolate; |
+// otherwise returns the negation of HasOnlyAsciiChars() on the string. |
+bool String::MayContainNonAscii() const { |
+  i::Handle<i::String> self = Utils::OpenHandle(this); |
+  const bool check_failed = |
+      IsDeadCheck(self->GetIsolate(), "v8::String::MayContainNonAscii()"); |
+  if (check_failed) return false; |
+  const bool ascii_only = self->HasOnlyAsciiChars(); |
+  return !ascii_only; |
+} |
+ |
+ |
+// Visitor that sums unibrow::Utf8::Length() over every code unit it is |
+// handed. The previously seen code unit is remembered between calls so |
+// that it can be passed to Utf8::Length() for the next chunk as well. |
+class Utf8LengthVisitor { |
+ public: |
+  explicit Utf8LengthVisitor() |
+      : utf8_length_(0), |
+        last_character_(unibrow::Utf16::kNoPreviousCharacter) {} |
+ |
+  // Running total accumulated by all Visit() calls so far. |
+  inline int GetLength() { return utf8_length_; } |
+ |
+  // Adds the length of |length| code units starting at |chars|. |
+  template<typename Char> |
+  inline void Visit(const Char* chars, unsigned length) { |
+    ASSERT(length > 0); |
+    // TODO(dcarney) Add back ascii fast path. |
+    // Work on stack copies and publish them once the chunk is done. |
+    int previous = last_character_; |
+    int chunk_bytes = 0; |
+    const Char* const end = chars + length; |
+    for (const Char* cursor = chars; cursor != end; ++cursor) { |
+      const uint16_t code_unit = *cursor; |
+      chunk_bytes += unibrow::Utf8::Length(code_unit, previous); |
+      previous = code_unit; |
+    } |
+    utf8_length_ += chunk_bytes; |
+    last_character_ = previous; |
+  } |
+ |
+  // One-byte chunks go through the shared Visit() implementation. |
+  inline void VisitOneByteString(const uint8_t* chars, unsigned length) { |
+    Visit<uint8_t>(chars, length); |
+  } |
+ |
+  // Two-byte chunks go through the shared Visit() implementation. |
+  inline void VisitTwoByteString(const uint16_t* chars, unsigned length) { |
+    Visit<uint16_t>(chars, length); |
+  } |
+ |
+ private: |
+  int utf8_length_; |
+  int last_character_; |
+  DISALLOW_COPY_AND_ASSIGN(Utf8LengthVisitor); |
+}; |
+ |
+ |
+// Returns the total reported by a Utf8LengthVisitor after it has visited |
+// all of |str|; an empty string yields 0 without visiting anything. |
+// Non-cons strings are visited directly. Cons strings are walked leaf by |
+// leaf using the isolate's write iterator. |
+static int Utf8Length(i::String* str, i::Isolate* isolate) { |
+  unsigned length = static_cast<unsigned>(str->length()); |
+  if (length == 0) return 0; |
+  int32_t type = str->map()->instance_type(); |
+  Utf8LengthVisitor visitor; |
+  // Non ConsString branch. |
+  if ((type & i::kStringRepresentationMask) != i::kConsStringTag) { |
+    i::ConsStringNullOp null_op; |
+    i::String::Visit(str, 0, visitor, null_op, type, length); |
+    return visitor.GetLength(); |
+  } |
+  i::ConsStringIteratorOp* op = isolate->write_iterator(); |
+  // String::WriteUtf8() uses this same iterator, resets it before use and |
+  // can stop before exhausting it. Reset it here as well rather than |
+  // assuming it was left in its initial state. |
+  op->Reset(); |
+  unsigned offset = 0; |
+  i::String* leaf = op->Operate(str, &offset, &type, &length); |
+  ASSERT(leaf != NULL); |
+  while (leaf != NULL) { |
+    i::ConsStringNullOp null_op; |
+    // Traversal started at offset 0, so every leaf is visited in full. |
+    ASSERT(offset == 0); |
+    i::String::Visit(leaf, 0, visitor, null_op, type, length); |
+    leaf = op->ContinueOperation(&type, &length); |
+  } |
+  return visitor.GetLength(); |
+} |
+ |
+// Returns 0 when IsDeadCheck() trips for the string's isolate; otherwise |
+// returns whatever the static v8::Utf8Length() helper computes for it. |
int String::Utf8Length() const { |
i::Handle<i::String> str = Utils::OpenHandle(this); |
- if (IsDeadCheck(str->GetIsolate(), "v8::String::Utf8Length()")) return 0; |
- return i::Utf8Length(str); |
-} |
- |
- |
-// Will fail with a negative answer if the recursion depth is too high. |
-static int RecursivelySerializeToUtf8(i::String* string, |
- char* buffer, |
- int start, |
- int end, |
- int recursion_budget, |
- int32_t previous_character, |
- int32_t* last_character) { |
- int utf8_bytes = 0; |
- while (true) { |
- if (string->IsOneByteRepresentation()) { |
- i::String::WriteToFlat(string, buffer, start, end); |
- *last_character = unibrow::Utf16::kNoPreviousCharacter; |
- return utf8_bytes + end - start; |
+ // The isolate is needed twice: for the dead check and for the helper. |
+ i::Isolate* isolate = str->GetIsolate(); |
+ if (IsDeadCheck(isolate, "v8::String::Utf8Length()")) return 0; |
+ // Pass the raw string pointer; the helper takes i::String*, not a handle. |
+ return v8::Utf8Length(*str, isolate); |
+} |
+ |
+ |
+// Visitor that encodes the code units it is handed into a caller-supplied |
+// char buffer via unibrow::Utf8::Encode(). It tracks the write position, |
+// the previously seen code unit, and how many code units were consumed. |
+// A capacity of -1 disables every capacity check. |
+class Utf8WriterVisitor { |
+ public: |
+ // |buffer| is both the start marker and the initial write position. |
+ Utf8WriterVisitor(char* buffer, int capacity) |
+ : early_termination_(false), |
+ last_character_(unibrow::Utf16::kNoPreviousCharacter), |
+ buffer_(buffer), |
+ start_(buffer), |
+ capacity_(capacity), |
+ utf16_chars_read_(0) { |
+ } |
+ |
+ // Writes one code unit when at most |remaining| bytes are left. |
+ // Returns the number of bytes added at |buffer|, or 0 if the encoded |
+ // character needs more than |remaining| bytes (nothing is copied then). |
+ static int WriteEndCharacter(uint16_t character, |
+ int last_character, |
+ int remaining, |
+ char* const buffer) { |
+ using namespace unibrow; |
+ ASSERT(remaining > 0); |
+ // We can't use a local buffer here because Encode needs to modify |
+ // previous characters in the stream. We know, however, that |
+ // exactly one character will be advanced. |
+ if (Utf16::IsTrailSurrogate(character) && |
+ Utf16::IsLeadSurrogate(last_character)) { |
+ int written = Utf8::Encode(buffer, character, last_character); |
+ ASSERT(written == 1); |
+ return written; |
} |
- switch (i::StringShape(string).representation_tag()) { |
- case i::kExternalStringTag: { |
- const uint16_t* data = i::ExternalTwoByteString::cast(string)-> |
- ExternalTwoByteStringGetData(0); |
- char* current = buffer; |
- for (int i = start; i < end; i++) { |
- uint16_t character = data[i]; |
- current += |
- unibrow::Utf8::Encode(current, character, previous_character); |
- previous_character = character; |
- } |
- *last_character = previous_character; |
- return static_cast<int>(utf8_bytes + current - buffer); |
+ // Use a scratch buffer to check the required characters. |
+ char temp_buffer[Utf8::kMaxEncodedSize]; |
+ // Can't encode using last_character as gcc has array bounds issues. |
+ int written = Utf8::Encode(temp_buffer, |
+ character, |
+ unibrow::Utf16::kNoPreviousCharacter); |
+ // Won't fit. |
+ if (written > remaining) return 0; |
+ // Copy over the character from temp_buffer. |
+ for (int j = 0; j < written; j++) { |
+ buffer[j] = temp_buffer[j]; |
+ } |
+ return written; |
+ } |
+ |
+ // Encodes |length| code units starting at |chars|. A fast loop handles |
+ // stretches that are guaranteed to fit at 3 bytes per code unit; the |
+ // slow loop then checks the remaining capacity for every code unit. |
+ // Sets early_termination_ when a character does not fit. |
+ template<typename Char> |
+ void Visit(const Char* chars, const int length) { |
+ using namespace unibrow; |
+ // TODO(dcarney): Add back ascii fast path. |
+ ASSERT(!early_termination_); |
+ ASSERT(length > 0); |
+ // Copy state to stack. |
+ char* buffer = buffer_; |
+ int last_character = last_character_; |
+ int i = 0; |
+ // Do a fast loop where there is no exit capacity check. |
+ while (true) { |
+ int fast_length; |
+ if (capacity_ == -1) { |
+ fast_length = length; |
+ } else { |
+ int remaing_capacity = capacity_ - (buffer - start_); |
+ // Need enough space to write everything but one character. |
+ STATIC_ASSERT(Utf16::kMaxExtraUtf8BytesForOneUtf16CodeUnit == 3); |
+ int writable_length = (remaing_capacity - 3)/3; |
+ // Need to drop into slow loop. |
+ if (writable_length <= 0) break; |
+ fast_length = i + writable_length; |
+ if (fast_length > length) fast_length = length; |
} |
- case i::kSeqStringTag: { |
- const uint16_t* data = |
- i::SeqTwoByteString::cast(string)->SeqTwoByteStringGetData(0); |
- char* current = buffer; |
- for (int i = start; i < end; i++) { |
- uint16_t character = data[i]; |
- current += |
- unibrow::Utf8::Encode(current, character, previous_character); |
- previous_character = character; |
- } |
- *last_character = previous_character; |
- return static_cast<int>(utf8_bytes + current - buffer); |
+ // Write the characters to the stream. |
+ for (; i < fast_length; i++) { |
+ uint16_t character = *chars++; |
+ buffer += Utf8::Encode(buffer, character, last_character); |
+ last_character = character; |
+ ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_); |
} |
- case i::kSlicedStringTag: { |
- i::SlicedString* slice = i::SlicedString::cast(string); |
- unsigned offset = slice->offset(); |
- string = slice->parent(); |
- start += offset; |
- end += offset; |
- continue; |
+ // Array is fully written. Exit. |
+ if (fast_length == length) { |
+ // Write state back out to object. |
+ last_character_ = last_character; |
+ buffer_ = buffer; |
+ utf16_chars_read_ += i; |
+ return; |
} |
- case i::kConsStringTag: { |
- i::ConsString* cons_string = i::ConsString::cast(string); |
- i::String* first = cons_string->first(); |
- int boundary = first->length(); |
- if (start >= boundary) { |
- // Only need RHS. |
- string = cons_string->second(); |
- start -= boundary; |
- end -= boundary; |
- continue; |
- } else if (end <= boundary) { |
- // Only need LHS. |
- string = first; |
- } else { |
- if (recursion_budget == 0) return -1; |
- int extra_utf8_bytes = |
- RecursivelySerializeToUtf8(first, |
- buffer, |
- start, |
- boundary, |
- recursion_budget - 1, |
- previous_character, |
- &previous_character); |
- if (extra_utf8_bytes < 0) return extra_utf8_bytes; |
- buffer += extra_utf8_bytes; |
- utf8_bytes += extra_utf8_bytes; |
- string = cons_string->second(); |
- start = 0; |
- end -= boundary; |
- } |
+ } |
+ ASSERT(capacity_ != -1); |
+ // Slow loop. Must check capacity on each iteration. |
+ // NOTE(review): if the buffer fills up exactly, this loop ends through |
+ // its condition and early_termination_ stays false, so IsDone() does |
+ // not report completion even though nothing more can be written. |
+ int remaining_capacity = capacity_ - (buffer - start_); |
+ ASSERT(remaining_capacity >= 0); |
+ for (; i < length && remaining_capacity > 0; i++) { |
+ uint16_t character = *chars++; |
+ int written = WriteEndCharacter(character, |
+ last_character, |
+ remaining_capacity, |
+ buffer); |
+ if (written == 0) { |
+ early_termination_ = true; |
+ break; |
} |
+ buffer += written; |
+ remaining_capacity -= written; |
+ last_character = character; |
} |
+ // Write state back out to object. |
+ last_character_ = last_character; |
+ buffer_ = buffer; |
+ utf16_chars_read_ += i; |
} |
- UNREACHABLE(); |
- return 0; |
-} |
+ // True once a character has failed to fit into the remaining capacity. |
+ inline bool IsDone() { |
+ return early_termination_; |
+ } |
-bool String::MayContainNonAscii() const { |
- i::Handle<i::String> str = Utils::OpenHandle(this); |
- if (IsDeadCheck(str->GetIsolate(), "v8::String::MayContainNonAscii()")) { |
- return false; |
+ // Forwards to Visit(); the unsigned length is narrowed to int. |
+ inline void VisitOneByteString(const uint8_t* chars, unsigned length) { |
+ Visit(chars, static_cast<int>(length)); |
} |
- return !str->HasOnlyAsciiChars(); |
-} |
+ |
+ // Forwards to Visit(); the unsigned length is narrowed to int. |
+ inline void VisitTwoByteString(const uint16_t* chars, unsigned length) { |
+ Visit(chars, static_cast<int>(length)); |
+ } |
+ |
+ // Finishes the write. Stores the number of consumed code units in |
+ // |utf16_chars_read_out| when it is non-NULL, optionally appends a '\0', |
+ // and returns the number of bytes between the start of the buffer and |
+ // the final write position (including the '\0' if one was appended). |
+ inline int CompleteWrite(bool write_null, int* utf16_chars_read_out) { |
+ // Write out number of utf16 characters written to the stream. |
+ if (utf16_chars_read_out != NULL) { |
+ *utf16_chars_read_out = utf16_chars_read_; |
+ } |
+ // Only null terminate if all of the string was written and there's space. |
+ if (write_null && |
+ !early_termination_ && |
+ (capacity_ == -1 || (buffer_ - start_) < capacity_)) { |
+ *buffer_++ = '\0'; |
+ } |
+ return buffer_ - start_; |
+ } |
+ |
+ private: |
+ // Set when a character did not fit; no further Visit() calls are allowed. |
+ bool early_termination_; |
+ // Code unit seen last, or unibrow::Utf16::kNoPreviousCharacter initially. |
+ int last_character_; |
+ // Current write position. |
+ char* buffer_; |
+ // Start of the output buffer; never changes. |
+ char* const start_; |
+ // Size limit of the output buffer, or -1 for no limit. |
+ int capacity_; |
+ // Number of code units consumed across all Visit() calls. |
+ int utf16_chars_read_; |
+ DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor); |
+}; |
int String::WriteUtf8(char* buffer, |
@@ -3990,122 +4110,23 @@ int String::WriteUtf8(char* buffer, |
if (options & HINT_MANY_WRITES_EXPECTED) { |
FlattenString(str); // Flatten the string for efficiency. |
} |
- int string_length = str->length(); |
- if (str->IsOneByteRepresentation()) { |
- int len; |
- if (capacity == -1) { |
- capacity = str->length() + 1; |
- len = string_length; |
- } else { |
- len = i::Min(capacity, str->length()); |
- } |
- i::String::WriteToFlat(*str, buffer, 0, len); |
- if (nchars_ref != NULL) *nchars_ref = len; |
- if (!(options & NO_NULL_TERMINATION) && capacity > len) { |
- buffer[len] = '\0'; |
- return len + 1; |
- } |
- return len; |
- } |
- |
- if (capacity == -1 || capacity / 3 >= string_length) { |
- int32_t previous = unibrow::Utf16::kNoPreviousCharacter; |
- const int kMaxRecursion = 100; |
- int utf8_bytes = |
- RecursivelySerializeToUtf8(*str, |
- buffer, |
- 0, |
- string_length, |
- kMaxRecursion, |
- previous, |
- &previous); |
- if (utf8_bytes >= 0) { |
- // Success serializing with recursion. |
- if ((options & NO_NULL_TERMINATION) == 0 && |
- (capacity > utf8_bytes || capacity == -1)) { |
- buffer[utf8_bytes++] = '\0'; |
- } |
- if (nchars_ref != NULL) *nchars_ref = string_length; |
- return utf8_bytes; |
+ Utf8WriterVisitor writer(buffer, capacity); |
+ i::ConsStringIteratorOp* op = isolate->write_iterator(); |
+ op->Reset(); |
+ int32_t type = str->map()->instance_type(); |
+ unsigned str_length = static_cast<unsigned>(str->length()); |
+ if (str_length != 0) { |
+ i::String::Visit(*str, 0, writer, *op, type, str_length); |
+ while (!writer.IsDone()) { |
+ unsigned length_out; |
+ i::String* next = op->ContinueOperation(&type, &length_out); |
+ if (next == NULL) break; |
+ // TODO(dcarney): need an asserting null op. |
+ i::ConsStringNullOp null_op; |
+ i::String::Visit(next, 0, writer, null_op, type, length_out); |
} |
- FlattenString(str); |
- // Recurse once. This time around the string is flat and the serializing |
- // with recursion will certainly succeed. |
- return WriteUtf8(buffer, capacity, nchars_ref, options); |
- } else if (capacity >= string_length) { |
- // First check that the buffer is large enough. If it is, then recurse |
- // once without a capacity limit, which will get into the other branch of |
- // this 'if'. |
- int utf8_bytes = i::Utf8Length(str); |
- if ((options & NO_NULL_TERMINATION) == 0) utf8_bytes++; |
- if (utf8_bytes <= capacity) { |
- return WriteUtf8(buffer, -1, nchars_ref, options); |
- } |
- } |
- |
- // Slow case. |
- i::StringCharacterStream stream(*str, isolate->write_iterator()); |
- isolate->string_tracker()->RecordWrite(str); |
- |
- int len = str->length(); |
- // Encode the first K - 3 bytes directly into the buffer since we |
- // know there's room for them. If no capacity is given we copy all |
- // of them here. |
- int fast_end = capacity - (unibrow::Utf8::kMaxEncodedSize - 1); |
- int i; |
- int pos = 0; |
- int nchars = 0; |
- int previous = unibrow::Utf16::kNoPreviousCharacter; |
- for (i = 0; i < len && (capacity == -1 || pos < fast_end); i++) { |
- i::uc32 c = stream.GetNext(); |
- int written = unibrow::Utf8::Encode(buffer + pos, c, previous); |
- pos += written; |
- nchars++; |
- previous = c; |
- } |
- if (i < len) { |
- // For the last characters we need to check the length for each one |
- // because they may be longer than the remaining space in the |
- // buffer. |
- char intermediate[unibrow::Utf8::kMaxEncodedSize]; |
- for (; i < len && pos < capacity; i++) { |
- i::uc32 c = stream.GetNext(); |
- if (unibrow::Utf16::IsTrailSurrogate(c) && |
- unibrow::Utf16::IsLeadSurrogate(previous)) { |
- // We can't use the intermediate buffer here because the encoding |
- // of surrogate pairs is done under assumption that you can step |
- // back and fix the UTF8 stream. Luckily we only need space for one |
- // more byte, so there is always space. |
- ASSERT(pos < capacity); |
- int written = unibrow::Utf8::Encode(buffer + pos, c, previous); |
- ASSERT(written == 1); |
- pos += written; |
- nchars++; |
- } else { |
- int written = |
- unibrow::Utf8::Encode(intermediate, |
- c, |
- unibrow::Utf16::kNoPreviousCharacter); |
- if (pos + written <= capacity) { |
- for (int j = 0; j < written; j++) { |
- buffer[pos + j] = intermediate[j]; |
- } |
- pos += written; |
- nchars++; |
- } else { |
- // We've reached the end of the buffer |
- break; |
- } |
- } |
- previous = c; |
- } |
- } |
- if (nchars_ref != NULL) *nchars_ref = nchars; |
- if (!(options & NO_NULL_TERMINATION) && |
- (i == len && (capacity == -1 || pos < capacity))) { |
- buffer[pos++] = '\0'; |
} |
- return pos; |
+ return writer.CompleteWrite(!(options & NO_NULL_TERMINATION), nchars_ref); |
} |
@@ -5637,7 +5658,7 @@ String::Utf8Value::Utf8Value(v8::Handle<v8::Value> obj) |
Handle<String> str = obj->ToString(); |
if (str.IsEmpty()) return; |
i::Handle<i::String> i_str = Utils::OpenHandle(*str); |
- length_ = i::Utf8Length(i_str); |
+ length_ = v8::Utf8Length(*i_str, isolate); |
str_ = i::NewArray<char>(length_ + 1); |
str->WriteUtf8(str_); |
} |