Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(339)

Unified Diff: src/api.cc

Issue 11725006: Refactor out assumption that one byte strings are ascii in utf8 processing. (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge
Patch Set: Fix array bounds issue Created 7 years, 12 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | src/handles.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/api.cc
diff --git a/src/api.cc b/src/api.cc
index cf3c0835b545190aba087dbe6cc49e60333b3b34..e8c01e6585af64b23c37d47aafab2acde7726f74 100644
--- a/src/api.cc
+++ b/src/api.cc
@@ -3873,109 +3873,229 @@ int String::Length() const {
return str->length();
}
+bool String::MayContainNonAscii() const {  // Returns the negation of HasOnlyAsciiChars() on the internal string.
+ i::Handle<i::String> str = Utils::OpenHandle(this);
+ if (IsDeadCheck(str->GetIsolate(), "v8::String::MayContainNonAscii()")) {
+ return false;  // IsDeadCheck() reported true: answer false without inspecting the string.
+ }
+ return !str->HasOnlyAsciiChars();
+}
+
+
+class Utf8LengthVisitor {  // Visitor that sums the UTF-8 byte length of every chunk handed to it.
+ public:
+ explicit Utf8LengthVisitor()
+ : utf8_length_(0),
+ last_character_(unibrow::Utf16::kNoPreviousCharacter) {}  // No code unit has been seen yet.
+
+ inline int GetLength() {  // Total accumulated by all Visit() calls so far.
+ return utf8_length_;
+ }
+
+ template<typename Char>
+ inline void Visit(const Char* chars, unsigned length) {  // Adds the UTF-8 length of chars[0..length) to the total.
+ ASSERT(length > 0);
+ // TODO(dcarney) Add back ascii fast path.
+ int utf8_length = 0;
+ int last_character = last_character_;  // Continue from the code unit that ended the previous chunk.
+ for (unsigned i = 0; i < length; i++) {
+ uint16_t c = chars[i];  // One-byte input is widened to a 16-bit code unit here.
+ utf8_length += unibrow::Utf8::Length(c, last_character);  // Previous unit is passed along; presumably for surrogate pairs - TODO confirm.
+ last_character = c;
+ }
+ last_character_ = last_character;  // Persist state so the next chunk continues from here.
+ utf8_length_ += utf8_length;
+ }
+
+ inline void VisitOneByteString(const uint8_t* chars, unsigned length) {  // One-byte entry point; forwards to Visit().
+ Visit(chars, length);
+ }
+
+ inline void VisitTwoByteString(const uint16_t* chars, unsigned length) {  // Two-byte entry point; forwards to Visit().
+ Visit(chars, length);
+ }
+
+ private:
+ int utf8_length_;  // Running UTF-8 byte count.
+ int last_character_;  // Last code unit visited, or kNoPreviousCharacter initially.
+ DISALLOW_COPY_AND_ASSIGN(Utf8LengthVisitor);
+};
+
+
+static int Utf8Length(i::String* str, i::Isolate* isolate) {  // Returns the UTF-8 length of |str| as summed by Utf8LengthVisitor.
+ unsigned length = static_cast<unsigned>(str->length());
+ if (length == 0) return 0;  // Nothing to visit; Utf8LengthVisitor::Visit asserts length > 0.
+ int32_t type = str->map()->instance_type();
+ Utf8LengthVisitor visitor;
+ // Non ConsString branch.
+ if ((type & i::kStringRepresentationMask) != i::kConsStringTag) {
+ i::ConsStringNullOp null_op;
+ i::String::Visit(str, 0, visitor, null_op, type, length);  // The whole string is handled by this one call.
+ return visitor.GetLength();
+ }
+ i::ConsStringIteratorOp* op = isolate->write_iterator();  // Cons string: leaves are walked with the isolate's iterator op.
+ unsigned offset = 0;
+ i::String* leaf = op->Operate(str, &offset, &type, &length);  // Presumably fills offset/type/length for the returned leaf - TODO confirm.
+ ASSERT(leaf != NULL);
+ while (leaf != NULL) {
+ i::ConsStringNullOp null_op;
+ ASSERT(offset == 0);
+ i::String::Visit(leaf, 0, visitor, null_op, type, length);
+ leaf = op->ContinueOperation(&type, &length);  // A NULL result ends the loop.
+ }
+ return visitor.GetLength();
+}
+
int String::Utf8Length() const {
i::Handle<i::String> str = Utils::OpenHandle(this);
- if (IsDeadCheck(str->GetIsolate(), "v8::String::Utf8Length()")) return 0;
- return i::Utf8Length(str);
-}
-
-
-// Will fail with a negative answer if the recursion depth is too high.
-static int RecursivelySerializeToUtf8(i::String* string,
- char* buffer,
- int start,
- int end,
- int recursion_budget,
- int32_t previous_character,
- int32_t* last_character) {
- int utf8_bytes = 0;
- while (true) {
- if (string->IsOneByteRepresentation()) {
- i::String::WriteToFlat(string, buffer, start, end);
- *last_character = unibrow::Utf16::kNoPreviousCharacter;
- return utf8_bytes + end - start;
+ i::Isolate* isolate = str->GetIsolate();
+ if (IsDeadCheck(isolate, "v8::String::Utf8Length()")) return 0;
+ return v8::Utf8Length(*str, isolate);
+}
+
+
+class Utf8WriterVisitor {  // Visitor that UTF-8 encodes visited chunks into a caller-supplied buffer.
+ public:
+ Utf8WriterVisitor(char* buffer, int capacity)  // capacity == -1 disables the capacity checks in Visit().
+ : early_termination_(false),
+ last_character_(unibrow::Utf16::kNoPreviousCharacter),
+ buffer_(buffer),
+ start_(buffer),
+ capacity_(capacity),
+ utf16_chars_read_(0) {
+ }
+
+ static int WriteEndCharacter(uint16_t character,  // Writes one code unit; returns bytes written, or 0 if it exceeds |remaining|.
+ int last_character,
+ int remaining,
+ char* const buffer) {
+ using namespace unibrow;
+ ASSERT(remaining > 0);
+ // We can't use a local buffer here because Encode needs to modify
+ // previous characters in the stream. We know, however, that
+ // exactly one character will be advanced.
+ if (Utf16::IsTrailSurrogate(character) &&
+ Utf16::IsLeadSurrogate(last_character)) {
+ int written = Utf8::Encode(buffer, character, last_character);
+ ASSERT(written == 1);
+ return written;
}
- switch (i::StringShape(string).representation_tag()) {
- case i::kExternalStringTag: {
- const uint16_t* data = i::ExternalTwoByteString::cast(string)->
- ExternalTwoByteStringGetData(0);
- char* current = buffer;
- for (int i = start; i < end; i++) {
- uint16_t character = data[i];
- current +=
- unibrow::Utf8::Encode(current, character, previous_character);
- previous_character = character;
- }
- *last_character = previous_character;
- return static_cast<int>(utf8_bytes + current - buffer);
+ // Use a scratch buffer to check the required characters.
+ char temp_buffer[Utf8::kMaxEncodedSize];
+ // Can't encode using last_character as gcc has array bounds issues.
+ int written = Utf8::Encode(temp_buffer,
+ character,
+ unibrow::Utf16::kNoPreviousCharacter);
+ // Won't fit.
+ if (written > remaining) return 0;
+ // Copy over the character from temp_buffer.
+ for (int j = 0; j < written; j++) {
+ buffer[j] = temp_buffer[j];
+ }
+ return written;
+ }
+
+ template<typename Char>
+ void Visit(const Char* chars, const int length) {  // Encodes chars[0..length) into the buffer; stops early when a character does not fit.
+ using namespace unibrow;
+ // TODO(dcarney): Add back ascii fast path.
+ ASSERT(!early_termination_);
+ ASSERT(length > 0);
+ // Copy state to stack.
+ char* buffer = buffer_;
+ int last_character = last_character_;
+ int i = 0;
+ // Do a fast loop where there is no exit capacity check.
+ while (true) {
+ int fast_length;
+ if (capacity_ == -1) {
+ fast_length = length;  // No limit: the whole chunk goes through the unchecked loop.
+ } else {
+ int remaing_capacity = capacity_ - (buffer - start_);  // NOTE(review): "remaing" is a typo for "remaining".
+ // Need enough space to write everything but one character.
+ STATIC_ASSERT(Utf16::kMaxExtraUtf8BytesForOneUtf16CodeUnit == 3);
+ int writable_length = (remaing_capacity - 3)/3;  // Holds back 3 bytes, then budgets 3 bytes per code unit.
+ // Need to drop into slow loop.
+ if (writable_length <= 0) break;
+ fast_length = i + writable_length;  // Index at which the unchecked run must stop.
+ if (fast_length > length) fast_length = length;
}
- case i::kSeqStringTag: {
- const uint16_t* data =
- i::SeqTwoByteString::cast(string)->SeqTwoByteStringGetData(0);
- char* current = buffer;
- for (int i = start; i < end; i++) {
- uint16_t character = data[i];
- current +=
- unibrow::Utf8::Encode(current, character, previous_character);
- previous_character = character;
- }
- *last_character = previous_character;
- return static_cast<int>(utf8_bytes + current - buffer);
+ // Write the characters to the stream.
+ for (; i < fast_length; i++) {
+ uint16_t character = *chars++;
+ buffer += Utf8::Encode(buffer, character, last_character);
+ last_character = character;
+ ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);
}
- case i::kSlicedStringTag: {
- i::SlicedString* slice = i::SlicedString::cast(string);
- unsigned offset = slice->offset();
- string = slice->parent();
- start += offset;
- end += offset;
- continue;
+ // Array is fully written. Exit.
+ if (fast_length == length) {
+ // Write state back out to object.
+ last_character_ = last_character;
+ buffer_ = buffer;
+ utf16_chars_read_ += i;  // All i code units of this chunk were consumed.
+ return;
}
- case i::kConsStringTag: {
- i::ConsString* cons_string = i::ConsString::cast(string);
- i::String* first = cons_string->first();
- int boundary = first->length();
- if (start >= boundary) {
- // Only need RHS.
- string = cons_string->second();
- start -= boundary;
- end -= boundary;
- continue;
- } else if (end <= boundary) {
- // Only need LHS.
- string = first;
- } else {
- if (recursion_budget == 0) return -1;
- int extra_utf8_bytes =
- RecursivelySerializeToUtf8(first,
- buffer,
- start,
- boundary,
- recursion_budget - 1,
- previous_character,
- &previous_character);
- if (extra_utf8_bytes < 0) return extra_utf8_bytes;
- buffer += extra_utf8_bytes;
- utf8_bytes += extra_utf8_bytes;
- string = cons_string->second();
- start = 0;
- end -= boundary;
- }
+ }
+ ASSERT(capacity_ != -1);  // With no limit the fast loop always returns above.
+ // Slow loop. Must check capacity on each iteration.
+ int remaining_capacity = capacity_ - (buffer - start_);
+ ASSERT(remaining_capacity >= 0);
+ for (; i < length && remaining_capacity > 0; i++) {
+ uint16_t character = *chars++;
+ int written = WriteEndCharacter(character,
+ last_character,
+ remaining_capacity,
+ buffer);
+ if (written == 0) {
+ early_termination_ = true;  // Character did not fit; IsDone() reports true from now on.
+ break;
}
+ buffer += written;
+ remaining_capacity -= written;
+ last_character = character;
}
+ // Write state back out to object.
+ last_character_ = last_character;
+ buffer_ = buffer;
+ utf16_chars_read_ += i;  // Counts only the code units consumed before stopping.
}
- UNREACHABLE();
- return 0;
-}
+ inline bool IsDone() {  // True once a character failed to fit in the remaining capacity.
+ return early_termination_;
+ }
-bool String::MayContainNonAscii() const {
- i::Handle<i::String> str = Utils::OpenHandle(this);
- if (IsDeadCheck(str->GetIsolate(), "v8::String::MayContainNonAscii()")) {
- return false;
+ inline void VisitOneByteString(const uint8_t* chars, unsigned length) {  // One-byte entry point; forwards to Visit().
+ Visit(chars, static_cast<int>(length));
}
- return !str->HasOnlyAsciiChars();
-}
+
+ inline void VisitTwoByteString(const uint16_t* chars, unsigned length) {  // Two-byte entry point; forwards to Visit().
+ Visit(chars, static_cast<int>(length));
+ }
+
+ inline int CompleteWrite(bool write_null, int* utf16_chars_read_out) {  // Finishes the write; returns total bytes written, terminator included.
+ // Write out number of utf16 characters written to the stream.
+ if (utf16_chars_read_out != NULL) {
+ *utf16_chars_read_out = utf16_chars_read_;
+ }
+ // Only null terminate if all of the string was written and there's space.
+ if (write_null &&
+ !early_termination_ &&
+ (capacity_ == -1 || (buffer_ - start_) < capacity_)) {
+ *buffer_++ = '\0';
+ }
+ return buffer_ - start_;
+ }
+
+ private:
+ bool early_termination_;  // Set when a character did not fit in the buffer.
+ int last_character_;  // Previous UTF-16 code unit, carried across Visit() calls.
+ char* buffer_;  // Next write position.
+ char* const start_;  // Beginning of the output buffer.
+ int capacity_;  // Buffer size in bytes, or -1 for no limit.
+ int utf16_chars_read_;  // UTF-16 code units consumed so far.
+ DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor);
+};
int String::WriteUtf8(char* buffer,
@@ -3990,122 +4110,23 @@ int String::WriteUtf8(char* buffer,
if (options & HINT_MANY_WRITES_EXPECTED) {
FlattenString(str); // Flatten the string for efficiency.
}
- int string_length = str->length();
- if (str->IsOneByteRepresentation()) {
- int len;
- if (capacity == -1) {
- capacity = str->length() + 1;
- len = string_length;
- } else {
- len = i::Min(capacity, str->length());
- }
- i::String::WriteToFlat(*str, buffer, 0, len);
- if (nchars_ref != NULL) *nchars_ref = len;
- if (!(options & NO_NULL_TERMINATION) && capacity > len) {
- buffer[len] = '\0';
- return len + 1;
- }
- return len;
- }
-
- if (capacity == -1 || capacity / 3 >= string_length) {
- int32_t previous = unibrow::Utf16::kNoPreviousCharacter;
- const int kMaxRecursion = 100;
- int utf8_bytes =
- RecursivelySerializeToUtf8(*str,
- buffer,
- 0,
- string_length,
- kMaxRecursion,
- previous,
- &previous);
- if (utf8_bytes >= 0) {
- // Success serializing with recursion.
- if ((options & NO_NULL_TERMINATION) == 0 &&
- (capacity > utf8_bytes || capacity == -1)) {
- buffer[utf8_bytes++] = '\0';
- }
- if (nchars_ref != NULL) *nchars_ref = string_length;
- return utf8_bytes;
+ Utf8WriterVisitor writer(buffer, capacity);
+ i::ConsStringIteratorOp* op = isolate->write_iterator();
+ op->Reset();
+ int32_t type = str->map()->instance_type();
+ unsigned str_length = static_cast<unsigned>(str->length());
+ if (str_length != 0) {
+ i::String::Visit(*str, 0, writer, *op, type, str_length);
+ while (!writer.IsDone()) {
+ unsigned length_out;
+ i::String* next = op->ContinueOperation(&type, &length_out);
+ if (next == NULL) break;
+ // TODO(dcarney): need an asserting null op.
+ i::ConsStringNullOp null_op;
+ i::String::Visit(next, 0, writer, null_op, type, length_out);
}
- FlattenString(str);
- // Recurse once. This time around the string is flat and the serializing
- // with recursion will certainly succeed.
- return WriteUtf8(buffer, capacity, nchars_ref, options);
- } else if (capacity >= string_length) {
- // First check that the buffer is large enough. If it is, then recurse
- // once without a capacity limit, which will get into the other branch of
- // this 'if'.
- int utf8_bytes = i::Utf8Length(str);
- if ((options & NO_NULL_TERMINATION) == 0) utf8_bytes++;
- if (utf8_bytes <= capacity) {
- return WriteUtf8(buffer, -1, nchars_ref, options);
- }
- }
-
- // Slow case.
- i::StringCharacterStream stream(*str, isolate->write_iterator());
- isolate->string_tracker()->RecordWrite(str);
-
- int len = str->length();
- // Encode the first K - 3 bytes directly into the buffer since we
- // know there's room for them. If no capacity is given we copy all
- // of them here.
- int fast_end = capacity - (unibrow::Utf8::kMaxEncodedSize - 1);
- int i;
- int pos = 0;
- int nchars = 0;
- int previous = unibrow::Utf16::kNoPreviousCharacter;
- for (i = 0; i < len && (capacity == -1 || pos < fast_end); i++) {
- i::uc32 c = stream.GetNext();
- int written = unibrow::Utf8::Encode(buffer + pos, c, previous);
- pos += written;
- nchars++;
- previous = c;
- }
- if (i < len) {
- // For the last characters we need to check the length for each one
- // because they may be longer than the remaining space in the
- // buffer.
- char intermediate[unibrow::Utf8::kMaxEncodedSize];
- for (; i < len && pos < capacity; i++) {
- i::uc32 c = stream.GetNext();
- if (unibrow::Utf16::IsTrailSurrogate(c) &&
- unibrow::Utf16::IsLeadSurrogate(previous)) {
- // We can't use the intermediate buffer here because the encoding
- // of surrogate pairs is done under assumption that you can step
- // back and fix the UTF8 stream. Luckily we only need space for one
- // more byte, so there is always space.
- ASSERT(pos < capacity);
- int written = unibrow::Utf8::Encode(buffer + pos, c, previous);
- ASSERT(written == 1);
- pos += written;
- nchars++;
- } else {
- int written =
- unibrow::Utf8::Encode(intermediate,
- c,
- unibrow::Utf16::kNoPreviousCharacter);
- if (pos + written <= capacity) {
- for (int j = 0; j < written; j++) {
- buffer[pos + j] = intermediate[j];
- }
- pos += written;
- nchars++;
- } else {
- // We've reached the end of the buffer
- break;
- }
- }
- previous = c;
- }
- }
- if (nchars_ref != NULL) *nchars_ref = nchars;
- if (!(options & NO_NULL_TERMINATION) &&
- (i == len && (capacity == -1 || pos < capacity))) {
- buffer[pos++] = '\0';
}
- return pos;
+ return writer.CompleteWrite(!(options & NO_NULL_TERMINATION), nchars_ref);
}
@@ -5637,7 +5658,7 @@ String::Utf8Value::Utf8Value(v8::Handle<v8::Value> obj)
Handle<String> str = obj->ToString();
if (str.IsEmpty()) return;
i::Handle<i::String> i_str = Utils::OpenHandle(*str);
- length_ = i::Utf8Length(i_str);
+ length_ = v8::Utf8Length(*i_str, isolate);
str_ = i::NewArray<char>(length_ + 1);
str->WriteUtf8(str_);
}
« no previous file with comments | « no previous file | src/handles.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698