Index: src/objects.cc |
=================================================================== |
--- src/objects.cc (revision 10944) |
+++ src/objects.cc (working copy) |
@@ -6043,9 +6043,11 @@ |
buffer->Reset(offset, this); |
int character_position = offset; |
int utf8_bytes = 0; |
+ int last = unibrow::Utf8::kNoPreviousCharacter; |
while (buffer->has_more() && character_position++ < offset + length) { |
uint16_t character = buffer->GetNext(); |
- utf8_bytes += unibrow::Utf8::Length(character); |
+ utf8_bytes += unibrow::Utf8::Length(character, last); |
+ last = character; |
} |
if (length_return) { |
@@ -6059,13 +6061,15 @@ |
buffer->Seek(offset); |
character_position = offset; |
int utf8_byte_position = 0; |
+ last = unibrow::Utf8::kNoPreviousCharacter; |
while (buffer->has_more() && character_position++ < offset + length) { |
uint16_t character = buffer->GetNext(); |
if (allow_nulls == DISALLOW_NULLS && character == 0) { |
character = ' '; |
} |
utf8_byte_position += |
- unibrow::Utf8::Encode(result + utf8_byte_position, character); |
+ unibrow::Utf8::Encode(result + utf8_byte_position, character, last); |
+ last = character; |
} |
result[utf8_byte_position] = 0; |
return SmartArrayPointer<char>(result); |
@@ -6381,41 +6385,88 @@ |
// This method determines the type of string involved and then gets the UTF8 |
// length of the string. It doesn't flatten the string and has log(n) recursion |
-// for a string of length n. |
-int String::Utf8Length(String* input, int from, int to) { |
+// for a string of length n. If the failure flag gets set, then we have to |
+// flatten the string and retry. Failures are caused by surrogate pairs in deep |
rossberg
2012/03/07 13:32:47
Not sure I understand the failure mode. Why does t
Erik Corry
2012/03/11 19:29:22
The old version of this function was constructed a
|
+// cons strings. |
+int String::Utf8Length(String* input, |
+ int from, |
+ int to, |
+ bool followed_by_surrogate, |
+ int max_recursion, |
+ bool* failure, |
+ bool* starts_with_surrogate) { |
if (from == to) return 0; |
int total = 0; |
+ bool dummy; |
while (true) { |
- if (input->IsAsciiRepresentation()) return total + to - from; |
+ if (input->IsAsciiRepresentation()) { |
+ *starts_with_surrogate = false; |
+ return total + to - from; |
+ } |
switch (StringShape(input).representation_tag()) { |
case kConsStringTag: { |
ConsString* str = ConsString::cast(input); |
String* first = str->first(); |
String* second = str->second(); |
int first_length = first->length(); |
- if (first_length - from < to - first_length) { |
+ if (first_length - from > to - first_length) { |
+ if (first_length < to) { |
+ // Right hand side is shorter. |
+ bool right_starts_with_surrogate = false; |
+ total += Utf8Length(second, |
+ 0, |
+ to - first_length, |
+ followed_by_surrogate, |
+ max_recursion - 1, |
+ failure, |
+ &right_starts_with_surrogate); |
+ if (*failure) return 0; |
+ followed_by_surrogate = right_starts_with_surrogate; |
+ input = first; |
+ to = first_length; |
+ } else { |
+ // We only need the left hand side. |
+ input = first; |
+ } |
+ } else { |
if (first_length > from) { |
// Left hand side is shorter. |
- total += Utf8Length(first, from, first_length); |
- input = second; |
- from = 0; |
- to -= first_length; |
+ if (first->IsAsciiRepresentation()) { |
+ total += first_length - from; |
+ *starts_with_surrogate = false; |
+ starts_with_surrogate = &dummy; |
+ input = second; |
+ from = 0; |
+ to -= first_length; |
+ } else if (second->IsAsciiRepresentation()) { |
+ followed_by_surrogate = false; |
+ total += to - first_length; |
+ input = first; |
+ to = first_length; |
+ } else if (max_recursion > 0) { |
rossberg
2012/03/07 13:32:47
Why is this the only recursive path actually check
Erik Corry
2012/03/11 19:29:22
See above.
|
+ bool right_starts_with_surrogate = false; |
+ // Recursing on the long one. This may fail. |
+ total += Utf8Length(second, |
+ 0, |
+ to - first_length, |
+ followed_by_surrogate, |
+ max_recursion - 1, |
+ failure, |
+ &right_starts_with_surrogate); |
+ if (*failure) return 0; |
+ input = first; |
+ to = first_length; |
+ followed_by_surrogate = right_starts_with_surrogate; |
+ } else { |
+ *failure = true; |
+ return 0; |
+ } |
} else { |
// We only need the right hand side. |
input = second; |
- from -= first_length; |
+ from = 0; |
to -= first_length; |
} |
- } else { |
- if (first_length <= to) { |
- // Right hand side is shorter. |
- total += Utf8Length(second, 0, to - first_length); |
- input = first; |
- to = first_length; |
- } else { |
- // We only need the left hand side. |
- input = first; |
- } |
} |
continue; |
} |
@@ -6423,9 +6474,21 @@ |
case kSeqStringTag: { |
Vector<const uc16> vector = input->GetFlatContent().ToUC16Vector(); |
const uc16* p = vector.start(); |
+ int previous = unibrow::Utf8::kNoPreviousCharacter; |
for (int i = from; i < to; i++) { |
- total += unibrow::Utf8::Length(p[i]); |
+ uc16 c = p[i]; |
+ total += unibrow::Utf8::Length(c, previous); |
+ previous = c; |
} |
+ if (to - from > 0) { |
+ if (unibrow::Utf16::IsLeadSurrogate(previous) && |
+ followed_by_surrogate) { |
+ total -= 2; |
rossberg
2012/03/07 13:32:47
I wouldn't mind a comment here, why -2? Is that kS
Erik Corry
2012/03/11 19:29:22
No, it's kBytesSavedByCombiningSurrogates. Fixed.
|
+ } |
+ if (unibrow::Utf16::IsTrailSurrogate(p[from])) { |
+ *starts_with_surrogate = true; |
+ } |
+ } |
return total; |
} |
case kSlicedStringTag: { |
@@ -6839,8 +6902,10 @@ |
// General slow case check. We know that the ia and ib iterators |
// have the same length. |
while (ia->has_more()) { |
- uc32 ca = ia->GetNext(); |
- uc32 cb = ib->GetNext(); |
+ uint32_t ca = ia->GetNext(); |
+ uint32_t cb = ib->GetNext(); |
+ ASSERT(ca <= unibrow::Utf16::kMaxNonSurrogateCharCode); |
+ ASSERT(cb <= unibrow::Utf16::kMaxNonSurrogateCharCode); |
if (ca != cb) |
return false; |
} |
@@ -7023,8 +7088,14 @@ |
decoder->Reset(str.start(), str.length()); |
int i; |
for (i = 0; i < slen && decoder->has_more(); i++) { |
- uc32 r = decoder->GetNext(); |
- if (Get(i) != r) return false; |
+ uint32_t r = decoder->GetNext(); |
+ if (r > unibrow::Utf16::kMaxNonSurrogateCharCode) { |
+ if (i > slen - 1) return false; |
+ if (Get(i++) != unibrow::Utf16::LeadSurrogate(r)) return false; |
+ if (Get(i) != unibrow::Utf16::TrailSurrogate(r)) return false; |
+ } else { |
+ if (Get(i) != r) return false; |
+ } |
} |
return i == slen && !decoder->has_more(); |
} |
@@ -7154,6 +7225,22 @@ |
} |
+void StringHasher::AddSurrogatePair(uc32 c) { |
+ uint16_t lead = unibrow::Utf16::LeadSurrogate(c); |
+ AddCharacter(lead); |
+ uint16_t trail = unibrow::Utf16::TrailSurrogate(c); |
+ AddCharacter(trail); |
+} |
+ |
+ |
+void StringHasher::AddSurrogatePairNoIndex(uc32 c) { |
+ uint16_t lead = unibrow::Utf16::LeadSurrogate(c); |
+ AddCharacterNoIndex(lead); |
+ uint16_t trail = unibrow::Utf16::TrailSurrogate(c); |
+ AddCharacterNoIndex(trail); |
+} |
+ |
+ |
uint32_t StringHasher::GetHashField() { |
ASSERT(is_valid()); |
if (length_ <= String::kMaxHashCalcLength) { |
@@ -10746,7 +10833,7 @@ |
if (hash_field_ != 0) return hash_field_ >> String::kHashShift; |
unibrow::Utf8InputBuffer<> buffer(string_.start(), |
static_cast<unsigned>(string_.length())); |
- chars_ = buffer.Length(); |
+ chars_ = buffer.Utf16Length(); |
hash_field_ = String::ComputeHashField(&buffer, chars_, seed_); |
uint32_t result = hash_field_ >> String::kHashShift; |
ASSERT(result != 0); // Ensure that the hash value of 0 is never computed. |