Index: src/handles.cc |
=================================================================== |
--- src/handles.cc (revision 10944) |
+++ src/handles.cc (working copy) |
@@ -800,4 +800,162 @@ |
} |
+// This method determines the type of string involved and then gets the UTF8 |
+// length of the string. It doesn't flatten the string and has log(n) recursion |
+// for a string of length n. If the failure flag gets set, then we have to |
+// flatten the string and retry. Failures are caused by surrogate pairs in deep |
+// cons strings. |
+ |
+// Single surrogate characters that are encountered in the UTF-16 character |
+// sequence of the input string get counted as 3 UTF-8 bytes, because that |
+// is the way that WriteUtf8 will encode them. Surrogate pairs are counted and |
+// encoded as one 4-byte UTF-8 sequence. |
+ |
+// This function conceptually uses recursion on the two halves of cons strings. |
+// However, in order to avoid the recursion going too deep it recurses on the |
+// second string of the cons, but iterates on the first substring (by manually |
+// eliminating it as a tail recursion). This means it counts the UTF-8 length |
+// from the end to the start, which makes no difference to the total. |
+ |
+// Surrogate pairs are recognized even if they are split across two sides of a |
+// cons, which complicates the implementation somewhat. Therefore, too deep |
+// recursion cannot always be avoided. This case is detected, and the failure |
+// flag is set, a signal to the caller that the string should be flattened and |
+// the operation retried. |
+int Utf8LengthHelper(String* input, |
+ int from, |
+ int to, |
+ bool followed_by_surrogate, |
+ int max_recursion, |
+ bool* failure, |
+ bool* starts_with_surrogate) { |
+ if (from == to) return 0; |
+ int total = 0; |
+ bool dummy; |
+ while (true) { |
+ if (input->IsAsciiRepresentation()) { |
+ *starts_with_surrogate = false; |
+ return total + to - from; |
+ } |
+ switch (StringShape(input).representation_tag()) { |
+ case kConsStringTag: { |
+ ConsString* str = ConsString::cast(input); |
+ String* first = str->first(); |
+ String* second = str->second(); |
+ int first_length = first->length(); |
+ if (first_length - from > to - first_length) { |
+ if (first_length < to) { |
+ // Right hand side is shorter. No need to check the recursion depth |
+ // since this can only happen log(n) times. |
+ bool right_starts_with_surrogate = false; |
+ total += Utf8LengthHelper(second, |
+ 0, |
+ to - first_length, |
+ followed_by_surrogate, |
+ max_recursion - 1, |
+ failure, |
+ &right_starts_with_surrogate); |
+ if (*failure) return 0; |
+ followed_by_surrogate = right_starts_with_surrogate; |
+ input = first; |
+ to = first_length; |
+ } else { |
+ // We only need the left hand side. |
+ input = first; |
+ } |
+ } else { |
+ if (first_length > from) { |
+ // Left hand side is shorter. |
+ if (first->IsAsciiRepresentation()) { |
+ total += first_length - from; |
+ *starts_with_surrogate = false; |
+ starts_with_surrogate = &dummy; |
+ input = second; |
+ from = 0; |
+ to -= first_length; |
+ } else if (second->IsAsciiRepresentation()) { |
+ followed_by_surrogate = false; |
+ total += to - first_length; |
+ input = first; |
+ to = first_length; |
+ } else if (max_recursion > 0) { |
+ bool right_starts_with_surrogate = false; |
+ // Recursing on the long one. This may fail. |
+ total += Utf8LengthHelper(second, |
+ 0, |
+ to - first_length, |
+ followed_by_surrogate, |
+ max_recursion - 1, |
+ failure, |
+ &right_starts_with_surrogate); |
+ if (*failure) return 0; |
+ input = first; |
+ to = first_length; |
+ followed_by_surrogate = right_starts_with_surrogate; |
+ } else { |
+ *failure = true; |
+ return 0; |
+ } |
+ } else { |
+ // We only need the right hand side. |
+ input = second; |
+ from = 0; |
+ to -= first_length; |
+ } |
+ } |
+ continue; |
+ } |
+ case kExternalStringTag: |
+ case kSeqStringTag: { |
+ Vector<const uc16> vector = input->GetFlatContent().ToUC16Vector(); |
+ const uc16* p = vector.start(); |
+ int previous = unibrow::Utf16::kNoPreviousCharacter; |
+ for (int i = from; i < to; i++) { |
+ uc16 c = p[i]; |
+ total += unibrow::Utf8::Length(c, previous); |
+ previous = c; |
+ } |
+ if (to - from > 0) { |
+ if (unibrow::Utf16::IsLeadSurrogate(previous) && |
+ followed_by_surrogate) { |
+ total -= unibrow::Utf8::kBytesSavedByCombiningSurrogates; |
+ } |
+ if (unibrow::Utf16::IsTrailSurrogate(p[from])) { |
+ *starts_with_surrogate = true; |
+ } |
+ } |
+ return total; |
+ } |
+ case kSlicedStringTag: { |
+ SlicedString* str = SlicedString::cast(input); |
+ int offset = str->offset(); |
+ input = str->parent(); |
+ from += offset; |
+ to += offset; |
+ continue; |
+ } |
+ default: |
+ break; |
+ } |
+ UNREACHABLE(); |
+ return 0; |
+ } |
+ return 0; |
+} |
+ |
+ |
+int Utf8Length(Handle<String> str) { |
+ bool dummy; |
+ bool failure = true; |
+ int len; |
+ const int kRecursionBudget = 100; |
+ while (failure) { |
rossberg
2012/03/12 10:55:05
Nit: this seems more natural as a do-while loop.
Erik Corry
2012/03/12 12:34:10
I made it into a do-while, but it's no better (sam
Sven Panne
2012/03/12 13:00:44
Just an unsolicited comment from my side: For stuf
|
+ failure = false; |
+ len = Utf8LengthHelper( |
+ *str, 0, str->length(), false, kRecursionBudget, &failure, &dummy); |
+ if (failure) FlattenString(str); |
+ } |
+ return len; |
+} |
+ |
} } // namespace v8::internal |