Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1543)

Side by Side Diff: src/api.cc

Issue 11725006: Refactor out assumption that one byte strings are ascii in utf8 processing. (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge
Patch Set: Fix array bounds issue Created 7 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | src/handles.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 3855 matching lines...) Expand 10 before | Expand all | Expand 10 after
3866 i::Handle<i::Script> script(i::Script::cast(func->shared()->script())); 3866 i::Handle<i::Script> script(i::Script::cast(func->shared()->script()));
3867 return Utils::ToLocal(i::Handle<i::Object>(script->id())); 3867 return Utils::ToLocal(i::Handle<i::Object>(script->id()));
3868 } 3868 }
3869 3869
3870 int String::Length() const { 3870 int String::Length() const {
3871 i::Handle<i::String> str = Utils::OpenHandle(this); 3871 i::Handle<i::String> str = Utils::OpenHandle(this);
3872 if (IsDeadCheck(str->GetIsolate(), "v8::String::Length()")) return 0; 3872 if (IsDeadCheck(str->GetIsolate(), "v8::String::Length()")) return 0;
3873 return str->length(); 3873 return str->length();
3874 } 3874 }
3875 3875
3876
3877 int String::Utf8Length() const {
3878 i::Handle<i::String> str = Utils::OpenHandle(this);
3879 if (IsDeadCheck(str->GetIsolate(), "v8::String::Utf8Length()")) return 0;
3880 return i::Utf8Length(str);
3881 }
3882
3883
3884 // Will fail with a negative answer if the recursion depth is too high.
3885 static int RecursivelySerializeToUtf8(i::String* string,
3886 char* buffer,
3887 int start,
3888 int end,
3889 int recursion_budget,
3890 int32_t previous_character,
3891 int32_t* last_character) {
3892 int utf8_bytes = 0;
3893 while (true) {
3894 if (string->IsOneByteRepresentation()) {
3895 i::String::WriteToFlat(string, buffer, start, end);
3896 *last_character = unibrow::Utf16::kNoPreviousCharacter;
3897 return utf8_bytes + end - start;
3898 }
3899 switch (i::StringShape(string).representation_tag()) {
3900 case i::kExternalStringTag: {
3901 const uint16_t* data = i::ExternalTwoByteString::cast(string)->
3902 ExternalTwoByteStringGetData(0);
3903 char* current = buffer;
3904 for (int i = start; i < end; i++) {
3905 uint16_t character = data[i];
3906 current +=
3907 unibrow::Utf8::Encode(current, character, previous_character);
3908 previous_character = character;
3909 }
3910 *last_character = previous_character;
3911 return static_cast<int>(utf8_bytes + current - buffer);
3912 }
3913 case i::kSeqStringTag: {
3914 const uint16_t* data =
3915 i::SeqTwoByteString::cast(string)->SeqTwoByteStringGetData(0);
3916 char* current = buffer;
3917 for (int i = start; i < end; i++) {
3918 uint16_t character = data[i];
3919 current +=
3920 unibrow::Utf8::Encode(current, character, previous_character);
3921 previous_character = character;
3922 }
3923 *last_character = previous_character;
3924 return static_cast<int>(utf8_bytes + current - buffer);
3925 }
3926 case i::kSlicedStringTag: {
3927 i::SlicedString* slice = i::SlicedString::cast(string);
3928 unsigned offset = slice->offset();
3929 string = slice->parent();
3930 start += offset;
3931 end += offset;
3932 continue;
3933 }
3934 case i::kConsStringTag: {
3935 i::ConsString* cons_string = i::ConsString::cast(string);
3936 i::String* first = cons_string->first();
3937 int boundary = first->length();
3938 if (start >= boundary) {
3939 // Only need RHS.
3940 string = cons_string->second();
3941 start -= boundary;
3942 end -= boundary;
3943 continue;
3944 } else if (end <= boundary) {
3945 // Only need LHS.
3946 string = first;
3947 } else {
3948 if (recursion_budget == 0) return -1;
3949 int extra_utf8_bytes =
3950 RecursivelySerializeToUtf8(first,
3951 buffer,
3952 start,
3953 boundary,
3954 recursion_budget - 1,
3955 previous_character,
3956 &previous_character);
3957 if (extra_utf8_bytes < 0) return extra_utf8_bytes;
3958 buffer += extra_utf8_bytes;
3959 utf8_bytes += extra_utf8_bytes;
3960 string = cons_string->second();
3961 start = 0;
3962 end -= boundary;
3963 }
3964 }
3965 }
3966 }
3967 UNREACHABLE();
3968 return 0;
3969 }
3970
3971
3972 bool String::MayContainNonAscii() const { 3876 bool String::MayContainNonAscii() const {
3973 i::Handle<i::String> str = Utils::OpenHandle(this); 3877 i::Handle<i::String> str = Utils::OpenHandle(this);
3974 if (IsDeadCheck(str->GetIsolate(), "v8::String::MayContainNonAscii()")) { 3878 if (IsDeadCheck(str->GetIsolate(), "v8::String::MayContainNonAscii()")) {
3975 return false; 3879 return false;
3976 } 3880 }
3977 return !str->HasOnlyAsciiChars(); 3881 return !str->HasOnlyAsciiChars();
3978 } 3882 }
3979 3883
3980 3884
3885 class Utf8LengthVisitor {
3886 public:
3887 explicit Utf8LengthVisitor()
3888 : utf8_length_(0),
3889 last_character_(unibrow::Utf16::kNoPreviousCharacter) {}
3890
3891 inline int GetLength() {
3892 return utf8_length_;
3893 }
3894
3895 template<typename Char>
3896 inline void Visit(const Char* chars, unsigned length) {
3897 ASSERT(length > 0);
3898 // TODO(dcarney) Add back ascii fast path.
3899 int utf8_length = 0;
3900 int last_character = last_character_;
3901 for (unsigned i = 0; i < length; i++) {
3902 uint16_t c = chars[i];
3903 utf8_length += unibrow::Utf8::Length(c, last_character);
3904 last_character = c;
3905 }
3906 last_character_ = last_character;
3907 utf8_length_ += utf8_length;
3908 }
3909
3910 inline void VisitOneByteString(const uint8_t* chars, unsigned length) {
3911 Visit(chars, length);
3912 }
3913
3914 inline void VisitTwoByteString(const uint16_t* chars, unsigned length) {
3915 Visit(chars, length);
3916 }
3917
3918 private:
3919 int utf8_length_;
3920 int last_character_;
3921 DISALLOW_COPY_AND_ASSIGN(Utf8LengthVisitor);
3922 };
3923
3924
3925 static int Utf8Length(i::String* str, i::Isolate* isolate) {
3926 unsigned length = static_cast<unsigned>(str->length());
3927 if (length == 0) return 0;
3928 int32_t type = str->map()->instance_type();
3929 Utf8LengthVisitor visitor;
3930 // Non ConsString branch.
3931 if ((type & i::kStringRepresentationMask) != i::kConsStringTag) {
3932 i::ConsStringNullOp null_op;
3933 i::String::Visit(str, 0, visitor, null_op, type, length);
3934 return visitor.GetLength();
3935 }
3936 i::ConsStringIteratorOp* op = isolate->write_iterator();
3937 unsigned offset = 0;
3938 i::String* leaf = op->Operate(str, &offset, &type, &length);
3939 ASSERT(leaf != NULL);
3940 while (leaf != NULL) {
3941 i::ConsStringNullOp null_op;
3942 ASSERT(offset == 0);
3943 i::String::Visit(leaf, 0, visitor, null_op, type, length);
3944 leaf = op->ContinueOperation(&type, &length);
3945 }
3946 return visitor.GetLength();
3947 }
3948
3949
3950 int String::Utf8Length() const {
3951 i::Handle<i::String> str = Utils::OpenHandle(this);
3952 i::Isolate* isolate = str->GetIsolate();
3953 if (IsDeadCheck(isolate, "v8::String::Utf8Length()")) return 0;
3954 return v8::Utf8Length(*str, isolate);
3955 }
3956
3957
3958 class Utf8WriterVisitor {
3959 public:
3960 Utf8WriterVisitor(char* buffer, int capacity)
3961 : early_termination_(false),
3962 last_character_(unibrow::Utf16::kNoPreviousCharacter),
3963 buffer_(buffer),
3964 start_(buffer),
3965 capacity_(capacity),
3966 utf16_chars_read_(0) {
3967 }
3968
3969 static int WriteEndCharacter(uint16_t character,
3970 int last_character,
3971 int remaining,
3972 char* const buffer) {
3973 using namespace unibrow;
3974 ASSERT(remaining > 0);
3975 // We can't use a local buffer here because Encode needs to modify
3976 // previous characters in the stream. We know, however, that
3977 // exactly one character will be advanced.
3978 if (Utf16::IsTrailSurrogate(character) &&
3979 Utf16::IsLeadSurrogate(last_character)) {
3980 int written = Utf8::Encode(buffer, character, last_character);
3981 ASSERT(written == 1);
3982 return written;
3983 }
3984 // Use a scratch buffer to check the required characters.
3985 char temp_buffer[Utf8::kMaxEncodedSize];
3986 // Can't encode using last_character as gcc has array bounds issues.
3987 int written = Utf8::Encode(temp_buffer,
3988 character,
3989 unibrow::Utf16::kNoPreviousCharacter);
3990 // Won't fit.
3991 if (written > remaining) return 0;
3992 // Copy over the character from temp_buffer.
3993 for (int j = 0; j < written; j++) {
3994 buffer[j] = temp_buffer[j];
3995 }
3996 return written;
3997 }
3998
3999 template<typename Char>
4000 void Visit(const Char* chars, const int length) {
4001 using namespace unibrow;
4002 // TODO(dcarney): Add back ascii fast path.
4003 ASSERT(!early_termination_);
4004 ASSERT(length > 0);
4005 // Copy state to stack.
4006 char* buffer = buffer_;
4007 int last_character = last_character_;
4008 int i = 0;
4009 // Do a fast loop where there is no exit capacity check.
4010 while (true) {
4011 int fast_length;
4012 if (capacity_ == -1) {
4013 fast_length = length;
4014 } else {
4015 int remaing_capacity = capacity_ - (buffer - start_);
4016 // Need enough space to write everything but one character.
4017 STATIC_ASSERT(Utf16::kMaxExtraUtf8BytesForOneUtf16CodeUnit == 3);
4018 int writable_length = (remaing_capacity - 3)/3;
4019 // Need to drop into slow loop.
4020 if (writable_length <= 0) break;
4021 fast_length = i + writable_length;
4022 if (fast_length > length) fast_length = length;
4023 }
4024 // Write the characters to the stream.
4025 for (; i < fast_length; i++) {
4026 uint16_t character = *chars++;
4027 buffer += Utf8::Encode(buffer, character, last_character);
4028 last_character = character;
4029 ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);
4030 }
4031 // Array is fully written. Exit.
4032 if (fast_length == length) {
4033 // Write state back out to object.
4034 last_character_ = last_character;
4035 buffer_ = buffer;
4036 utf16_chars_read_ += i;
4037 return;
4038 }
4039 }
4040 ASSERT(capacity_ != -1);
4041 // Slow loop. Must check capacity on each iteration.
4042 int remaining_capacity = capacity_ - (buffer - start_);
4043 ASSERT(remaining_capacity >= 0);
4044 for (; i < length && remaining_capacity > 0; i++) {
4045 uint16_t character = *chars++;
4046 int written = WriteEndCharacter(character,
4047 last_character,
4048 remaining_capacity,
4049 buffer);
4050 if (written == 0) {
4051 early_termination_ = true;
4052 break;
4053 }
4054 buffer += written;
4055 remaining_capacity -= written;
4056 last_character = character;
4057 }
4058 // Write state back out to object.
4059 last_character_ = last_character;
4060 buffer_ = buffer;
4061 utf16_chars_read_ += i;
4062 }
4063
4064 inline bool IsDone() {
4065 return early_termination_;
4066 }
4067
4068 inline void VisitOneByteString(const uint8_t* chars, unsigned length) {
4069 Visit(chars, static_cast<int>(length));
4070 }
4071
4072 inline void VisitTwoByteString(const uint16_t* chars, unsigned length) {
4073 Visit(chars, static_cast<int>(length));
4074 }
4075
4076 inline int CompleteWrite(bool write_null, int* utf16_chars_read_out) {
4077 // Write out number of utf16 characters written to the stream.
4078 if (utf16_chars_read_out != NULL) {
4079 *utf16_chars_read_out = utf16_chars_read_;
4080 }
4081 // Only null terminate if all of the string was written and there's space.
4082 if (write_null &&
4083 !early_termination_ &&
4084 (capacity_ == -1 || (buffer_ - start_) < capacity_)) {
4085 *buffer_++ = '\0';
4086 }
4087 return buffer_ - start_;
4088 }
4089
4090 private:
4091 bool early_termination_;
4092 int last_character_;
4093 char* buffer_;
4094 char* const start_;
4095 int capacity_;
4096 int utf16_chars_read_;
4097 DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor);
4098 };
4099
4100
3981 int String::WriteUtf8(char* buffer, 4101 int String::WriteUtf8(char* buffer,
3982 int capacity, 4102 int capacity,
3983 int* nchars_ref, 4103 int* nchars_ref,
3984 int options) const { 4104 int options) const {
3985 i::Isolate* isolate = Utils::OpenHandle(this)->GetIsolate(); 4105 i::Isolate* isolate = Utils::OpenHandle(this)->GetIsolate();
3986 if (IsDeadCheck(isolate, "v8::String::WriteUtf8()")) return 0; 4106 if (IsDeadCheck(isolate, "v8::String::WriteUtf8()")) return 0;
3987 LOG_API(isolate, "String::WriteUtf8"); 4107 LOG_API(isolate, "String::WriteUtf8");
3988 ENTER_V8(isolate); 4108 ENTER_V8(isolate);
3989 i::Handle<i::String> str = Utils::OpenHandle(this); 4109 i::Handle<i::String> str = Utils::OpenHandle(this);
3990 if (options & HINT_MANY_WRITES_EXPECTED) { 4110 if (options & HINT_MANY_WRITES_EXPECTED) {
3991 FlattenString(str); // Flatten the string for efficiency. 4111 FlattenString(str); // Flatten the string for efficiency.
3992 } 4112 }
3993 int string_length = str->length(); 4113 Utf8WriterVisitor writer(buffer, capacity);
3994 if (str->IsOneByteRepresentation()) { 4114 i::ConsStringIteratorOp* op = isolate->write_iterator();
3995 int len; 4115 op->Reset();
3996 if (capacity == -1) { 4116 int32_t type = str->map()->instance_type();
3997 capacity = str->length() + 1; 4117 unsigned str_length = static_cast<unsigned>(str->length());
3998 len = string_length; 4118 if (str_length != 0) {
3999 } else { 4119 i::String::Visit(*str, 0, writer, *op, type, str_length);
4000 len = i::Min(capacity, str->length()); 4120 while (!writer.IsDone()) {
4001 } 4121 unsigned length_out;
4002 i::String::WriteToFlat(*str, buffer, 0, len); 4122 i::String* next = op->ContinueOperation(&type, &length_out);
4003 if (nchars_ref != NULL) *nchars_ref = len; 4123 if (next == NULL) break;
4004 if (!(options & NO_NULL_TERMINATION) && capacity > len) { 4124 // TODO(dcarney): need an asserting null op.
4005 buffer[len] = '\0'; 4125 i::ConsStringNullOp null_op;
4006 return len + 1; 4126 i::String::Visit(next, 0, writer, null_op, type, length_out);
4007 }
4008 return len;
4009 }
4010
4011 if (capacity == -1 || capacity / 3 >= string_length) {
4012 int32_t previous = unibrow::Utf16::kNoPreviousCharacter;
4013 const int kMaxRecursion = 100;
4014 int utf8_bytes =
4015 RecursivelySerializeToUtf8(*str,
4016 buffer,
4017 0,
4018 string_length,
4019 kMaxRecursion,
4020 previous,
4021 &previous);
4022 if (utf8_bytes >= 0) {
4023 // Success serializing with recursion.
4024 if ((options & NO_NULL_TERMINATION) == 0 &&
4025 (capacity > utf8_bytes || capacity == -1)) {
4026 buffer[utf8_bytes++] = '\0';
4027 }
4028 if (nchars_ref != NULL) *nchars_ref = string_length;
4029 return utf8_bytes;
4030 }
4031 FlattenString(str);
4032 // Recurse once. This time around the string is flat and the serializing
4033 // with recursion will certainly succeed.
4034 return WriteUtf8(buffer, capacity, nchars_ref, options);
4035 } else if (capacity >= string_length) {
4036 // First check that the buffer is large enough. If it is, then recurse
4037 // once without a capacity limit, which will get into the other branch of
4038 // this 'if'.
4039 int utf8_bytes = i::Utf8Length(str);
4040 if ((options & NO_NULL_TERMINATION) == 0) utf8_bytes++;
4041 if (utf8_bytes <= capacity) {
4042 return WriteUtf8(buffer, -1, nchars_ref, options);
4043 } 4127 }
4044 } 4128 }
4045 4129 return writer.CompleteWrite(!(options & NO_NULL_TERMINATION), nchars_ref);
4046 // Slow case.
4047 i::StringCharacterStream stream(*str, isolate->write_iterator());
4048 isolate->string_tracker()->RecordWrite(str);
4049
4050 int len = str->length();
4051 // Encode the first K - 3 bytes directly into the buffer since we
4052 // know there's room for them. If no capacity is given we copy all
4053 // of them here.
4054 int fast_end = capacity - (unibrow::Utf8::kMaxEncodedSize - 1);
4055 int i;
4056 int pos = 0;
4057 int nchars = 0;
4058 int previous = unibrow::Utf16::kNoPreviousCharacter;
4059 for (i = 0; i < len && (capacity == -1 || pos < fast_end); i++) {
4060 i::uc32 c = stream.GetNext();
4061 int written = unibrow::Utf8::Encode(buffer + pos, c, previous);
4062 pos += written;
4063 nchars++;
4064 previous = c;
4065 }
4066 if (i < len) {
4067 // For the last characters we need to check the length for each one
4068 // because they may be longer than the remaining space in the
4069 // buffer.
4070 char intermediate[unibrow::Utf8::kMaxEncodedSize];
4071 for (; i < len && pos < capacity; i++) {
4072 i::uc32 c = stream.GetNext();
4073 if (unibrow::Utf16::IsTrailSurrogate(c) &&
4074 unibrow::Utf16::IsLeadSurrogate(previous)) {
4075 // We can't use the intermediate buffer here because the encoding
4076 // of surrogate pairs is done under assumption that you can step
4077 // back and fix the UTF8 stream. Luckily we only need space for one
4078 // more byte, so there is always space.
4079 ASSERT(pos < capacity);
4080 int written = unibrow::Utf8::Encode(buffer + pos, c, previous);
4081 ASSERT(written == 1);
4082 pos += written;
4083 nchars++;
4084 } else {
4085 int written =
4086 unibrow::Utf8::Encode(intermediate,
4087 c,
4088 unibrow::Utf16::kNoPreviousCharacter);
4089 if (pos + written <= capacity) {
4090 for (int j = 0; j < written; j++) {
4091 buffer[pos + j] = intermediate[j];
4092 }
4093 pos += written;
4094 nchars++;
4095 } else {
4096 // We've reached the end of the buffer
4097 break;
4098 }
4099 }
4100 previous = c;
4101 }
4102 }
4103 if (nchars_ref != NULL) *nchars_ref = nchars;
4104 if (!(options & NO_NULL_TERMINATION) &&
4105 (i == len && (capacity == -1 || pos < capacity))) {
4106 buffer[pos++] = '\0';
4107 }
4108 return pos;
4109 } 4130 }
4110 4131
4111 4132
4112 int String::WriteAscii(char* buffer, 4133 int String::WriteAscii(char* buffer,
4113 int start, 4134 int start,
4114 int length, 4135 int length,
4115 int options) const { 4136 int options) const {
4116 i::Isolate* isolate = Utils::OpenHandle(this)->GetIsolate(); 4137 i::Isolate* isolate = Utils::OpenHandle(this)->GetIsolate();
4117 if (IsDeadCheck(isolate, "v8::String::WriteAscii()")) return 0; 4138 if (IsDeadCheck(isolate, "v8::String::WriteAscii()")) return 0;
4118 LOG_API(isolate, "String::WriteAscii"); 4139 LOG_API(isolate, "String::WriteAscii");
(...skipping 1511 matching lines...) Expand 10 before | Expand all | Expand 10 after
5630 : str_(NULL), length_(0) { 5651 : str_(NULL), length_(0) {
5631 i::Isolate* isolate = i::Isolate::Current(); 5652 i::Isolate* isolate = i::Isolate::Current();
5632 if (IsDeadCheck(isolate, "v8::String::Utf8Value::Utf8Value()")) return; 5653 if (IsDeadCheck(isolate, "v8::String::Utf8Value::Utf8Value()")) return;
5633 if (obj.IsEmpty()) return; 5654 if (obj.IsEmpty()) return;
5634 ENTER_V8(isolate); 5655 ENTER_V8(isolate);
5635 i::HandleScope scope(isolate); 5656 i::HandleScope scope(isolate);
5636 TryCatch try_catch; 5657 TryCatch try_catch;
5637 Handle<String> str = obj->ToString(); 5658 Handle<String> str = obj->ToString();
5638 if (str.IsEmpty()) return; 5659 if (str.IsEmpty()) return;
5639 i::Handle<i::String> i_str = Utils::OpenHandle(*str); 5660 i::Handle<i::String> i_str = Utils::OpenHandle(*str);
5640 length_ = i::Utf8Length(i_str); 5661 length_ = v8::Utf8Length(*i_str, isolate);
5641 str_ = i::NewArray<char>(length_ + 1); 5662 str_ = i::NewArray<char>(length_ + 1);
5642 str->WriteUtf8(str_); 5663 str->WriteUtf8(str_);
5643 } 5664 }
5644 5665
5645 5666
5646 String::Utf8Value::~Utf8Value() { 5667 String::Utf8Value::~Utf8Value() {
5647 i::DeleteArray(str_); 5668 i::DeleteArray(str_);
5648 } 5669 }
5649 5670
5650 5671
(...skipping 1061 matching lines...) Expand 10 before | Expand all | Expand 10 after
6712 6733
6713 v->VisitPointers(blocks_.first(), first_block_limit_); 6734 v->VisitPointers(blocks_.first(), first_block_limit_);
6714 6735
6715 for (int i = 1; i < blocks_.length(); i++) { 6736 for (int i = 1; i < blocks_.length(); i++) {
6716 v->VisitPointers(blocks_[i], &blocks_[i][kHandleBlockSize]); 6737 v->VisitPointers(blocks_[i], &blocks_[i][kHandleBlockSize]);
6717 } 6738 }
6718 } 6739 }
6719 6740
6720 6741
6721 } } // namespace v8::internal 6742 } } // namespace v8::internal
OLDNEW
« no previous file with comments | « no previous file | src/handles.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698