Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(641)

Unified Diff: src/handles.cc

Issue 9600009: Fix input and output to handle UTF16 surrogate pairs. (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 8 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: src/handles.cc
===================================================================
--- src/handles.cc (revision 10944)
+++ src/handles.cc (working copy)
@@ -800,4 +800,162 @@
}
+// This method determines the type of string involved and then gets the UTF8
+// length of the string. It doesn't flatten the string and has log(n) recursion
+// for a string of length n. If the failure flag gets set, then we have to
+// flatten the string and retry. Failures are caused by surrogate pairs in deep
+// cons strings.
+
+// Single surrogate characters that are encountered in the UTF-16 character
+// sequence of the input string get counted as 3 UTF-8 bytes, because that
+// is the way that WriteUtf8 will encode them. Surrogate pairs are counted and
+// encoded as one 4-byte UTF-8 sequence.
+
+// This function conceptually uses recursion on the two halves of cons strings.
+// However, in order to avoid the recursion going too deep it recurses on the
+// second string of the cons, but iterates on the first substring (by manually
+// eliminating it as a tail recursion). This means it counts the UTF-8 length
+// from the end to the start, which makes no difference to the total.
+
+// Surrogate pairs are recognized even if they are split across two sides of a
+// cons, which complicates the implementation somewhat. Therefore, too deep
+// recursion cannot always be avoided. This case is detected, and the failure
+// flag is set, a signal to the caller that the string should be flattened and
+// the operation retried.
+int Utf8LengthHelper(String* input,
+ int from,
+ int to,
+ bool followed_by_surrogate,
+ int max_recursion,
+ bool* failure,
+ bool* starts_with_surrogate) {
+ if (from == to) return 0;
+ int total = 0;
+ bool dummy;
+ while (true) {
+ if (input->IsAsciiRepresentation()) {
+ *starts_with_surrogate = false;
+ return total + to - from;
+ }
+ switch (StringShape(input).representation_tag()) {
+ case kConsStringTag: {
+ ConsString* str = ConsString::cast(input);
+ String* first = str->first();
+ String* second = str->second();
+ int first_length = first->length();
+ if (first_length - from > to - first_length) {
+ if (first_length < to) {
+ // Right hand side is shorter. No need to check the recursion depth
+ // since this can only happen log(n) times.
+ bool right_starts_with_surrogate = false;
+ total += Utf8LengthHelper(second,
+ 0,
+ to - first_length,
+ followed_by_surrogate,
+ max_recursion - 1,
+ failure,
+ &right_starts_with_surrogate);
+ if (*failure) return 0;
+ followed_by_surrogate = right_starts_with_surrogate;
+ input = first;
+ to = first_length;
+ } else {
+ // We only need the left hand side.
+ input = first;
+ }
+ } else {
+ if (first_length > from) {
+ // Left hand side is shorter.
+ if (first->IsAsciiRepresentation()) {
+ total += first_length - from;
+ *starts_with_surrogate = false;
+ starts_with_surrogate = &dummy;
+ input = second;
+ from = 0;
+ to -= first_length;
+ } else if (second->IsAsciiRepresentation()) {
+ followed_by_surrogate = false;
+ total += to - first_length;
+ input = first;
+ to = first_length;
+ } else if (max_recursion > 0) {
+ bool right_starts_with_surrogate = false;
+ // Recursing on the long one. This may fail.
+ total += Utf8LengthHelper(second,
+ 0,
+ to - first_length,
+ followed_by_surrogate,
+ max_recursion - 1,
+ failure,
+ &right_starts_with_surrogate);
+ if (*failure) return 0;
+ input = first;
+ to = first_length;
+ followed_by_surrogate = right_starts_with_surrogate;
+ } else {
+ *failure = true;
+ return 0;
+ }
+ } else {
+ // We only need the right hand side.
+ input = second;
+ from = 0;
+ to -= first_length;
+ }
+ }
+ continue;
+ }
+ case kExternalStringTag:
+ case kSeqStringTag: {
+ Vector<const uc16> vector = input->GetFlatContent().ToUC16Vector();
+ const uc16* p = vector.start();
+ int previous = unibrow::Utf16::kNoPreviousCharacter;
+ for (int i = from; i < to; i++) {
+ uc16 c = p[i];
+ total += unibrow::Utf8::Length(c, previous);
+ previous = c;
+ }
+ if (to - from > 0) {
+ if (unibrow::Utf16::IsLeadSurrogate(previous) &&
+ followed_by_surrogate) {
+ total -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;
+ }
+ if (unibrow::Utf16::IsTrailSurrogate(p[from])) {
+ *starts_with_surrogate = true;
+ }
+ }
+ return total;
+ }
+ case kSlicedStringTag: {
+ SlicedString* str = SlicedString::cast(input);
+ int offset = str->offset();
+ input = str->parent();
+ from += offset;
+ to += offset;
+ continue;
+ }
+ default:
+ break;
+ }
+ UNREACHABLE();
+ return 0;
+ }
+ return 0;
+}
+
+
+int Utf8Length(Handle<String> str) {
+ bool dummy;
+ bool failure = true;
+ int len;
+ const int kRecursionBudget = 100;
+ while (failure) {
rossberg 2012/03/12 10:55:05 Nit: this seems more natural as a do-while loop.
Erik Corry 2012/03/12 12:34:10 I made it into a do-while, but it's no better (sam
Sven Panne 2012/03/12 13:00:44 Just an unsolicited comment from my side: For stuf
+ failure = false;
+ len = Utf8LengthHelper(
+ *str, 0, str->length(), false, kRecursionBudget, &failure, &dummy);
+ if (failure) FlattenString(str);
+ }
+ return len;
+}
+
} } // namespace v8::internal

Powered by Google App Engine
This is Rietveld 408576698