Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(402)

Unified Diff: src/objects.cc

Issue 9600009: Fix input and output to handle UTF16 surrogate pairs. (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 8 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: src/objects.cc
===================================================================
--- src/objects.cc (revision 10944)
+++ src/objects.cc (working copy)
@@ -6043,9 +6043,11 @@
buffer->Reset(offset, this);
int character_position = offset;
int utf8_bytes = 0;
+ int last = unibrow::Utf16::kNoPreviousCharacter;
while (buffer->has_more() && character_position++ < offset + length) {
uint16_t character = buffer->GetNext();
- utf8_bytes += unibrow::Utf8::Length(character);
+ utf8_bytes += unibrow::Utf8::Length(character, last);
+ last = character;
}
if (length_return) {
@@ -6059,13 +6061,15 @@
buffer->Seek(offset);
character_position = offset;
int utf8_byte_position = 0;
+ last = unibrow::Utf16::kNoPreviousCharacter;
while (buffer->has_more() && character_position++ < offset + length) {
uint16_t character = buffer->GetNext();
if (allow_nulls == DISALLOW_NULLS && character == 0) {
character = ' ';
}
utf8_byte_position +=
- unibrow::Utf8::Encode(result + utf8_byte_position, character);
+ unibrow::Utf8::Encode(result + utf8_byte_position, character, last);
+ last = character;
}
result[utf8_byte_position] = 0;
return SmartArrayPointer<char>(result);
@@ -6379,73 +6383,6 @@
}
-// This method determines the type of string involved and then gets the UTF8
-// length of the string. It doesn't flatten the string and has log(n) recursion
-// for a string of length n.
-int String::Utf8Length(String* input, int from, int to) {
- if (from == to) return 0;
- int total = 0;
- while (true) {
- if (input->IsAsciiRepresentation()) return total + to - from;
- switch (StringShape(input).representation_tag()) {
- case kConsStringTag: {
- ConsString* str = ConsString::cast(input);
- String* first = str->first();
- String* second = str->second();
- int first_length = first->length();
- if (first_length - from < to - first_length) {
- if (first_length > from) {
- // Left hand side is shorter.
- total += Utf8Length(first, from, first_length);
- input = second;
- from = 0;
- to -= first_length;
- } else {
- // We only need the right hand side.
- input = second;
- from -= first_length;
- to -= first_length;
- }
- } else {
- if (first_length <= to) {
- // Right hand side is shorter.
- total += Utf8Length(second, 0, to - first_length);
- input = first;
- to = first_length;
- } else {
- // We only need the left hand side.
- input = first;
- }
- }
- continue;
- }
- case kExternalStringTag:
- case kSeqStringTag: {
- Vector<const uc16> vector = input->GetFlatContent().ToUC16Vector();
- const uc16* p = vector.start();
- for (int i = from; i < to; i++) {
- total += unibrow::Utf8::Length(p[i]);
- }
- return total;
- }
- case kSlicedStringTag: {
- SlicedString* str = SlicedString::cast(input);
- int offset = str->offset();
- input = str->parent();
- from += offset;
- to += offset;
- continue;
- }
- default:
- break;
- }
- UNREACHABLE();
- return 0;
- }
- return 0;
-}
-
-
void Relocatable::PostGarbageCollectionProcessing() {
Isolate* isolate = Isolate::Current();
Relocatable* current = isolate->relocatable_top();
@@ -6839,8 +6776,10 @@
// General slow case check. We know that the ia and ib iterators
// have the same length.
while (ia->has_more()) {
- uc32 ca = ia->GetNext();
- uc32 cb = ib->GetNext();
+ uint32_t ca = ia->GetNext();
+ uint32_t cb = ib->GetNext();
+ ASSERT(ca <= unibrow::Utf16::kMaxNonSurrogateCharCode);
+ ASSERT(cb <= unibrow::Utf16::kMaxNonSurrogateCharCode);
if (ca != cb)
return false;
}
@@ -7023,8 +6962,14 @@
decoder->Reset(str.start(), str.length());
int i;
for (i = 0; i < slen && decoder->has_more(); i++) {
- uc32 r = decoder->GetNext();
- if (Get(i) != r) return false;
+ uint32_t r = decoder->GetNext();
+ if (r > unibrow::Utf16::kMaxNonSurrogateCharCode) {
+ if (i > slen - 1) return false;
+ if (Get(i++) != unibrow::Utf16::LeadSurrogate(r)) return false;
+ if (Get(i) != unibrow::Utf16::TrailSurrogate(r)) return false;
+ } else {
+ if (Get(i) != r) return false;
+ }
}
return i == slen && !decoder->has_more();
}
@@ -7154,6 +7099,22 @@
}
+void StringHasher::AddSurrogatePair(uc32 c) {
+ uint16_t lead = unibrow::Utf16::LeadSurrogate(c);
+ AddCharacter(lead);
+ uint16_t trail = unibrow::Utf16::TrailSurrogate(c);
+ AddCharacter(trail);
+}
+
+
+void StringHasher::AddSurrogatePairNoIndex(uc32 c) {
+ uint16_t lead = unibrow::Utf16::LeadSurrogate(c);
+ AddCharacterNoIndex(lead);
+ uint16_t trail = unibrow::Utf16::TrailSurrogate(c);
+ AddCharacterNoIndex(trail);
+}
+
+
uint32_t StringHasher::GetHashField() {
ASSERT(is_valid());
if (length_ <= String::kMaxHashCalcLength) {
@@ -10746,7 +10707,7 @@
if (hash_field_ != 0) return hash_field_ >> String::kHashShift;
unibrow::Utf8InputBuffer<> buffer(string_.start(),
static_cast<unsigned>(string_.length()));
- chars_ = buffer.Length();
+ chars_ = buffer.Utf16Length();
hash_field_ = String::ComputeHashField(&buffer, chars_, seed_);
uint32_t result = hash_field_ >> String::kHashShift;
ASSERT(result != 0); // Ensure that the hash value of 0 is never computed.

Powered by Google App Engine
This is Rietveld 408576698