src/handles.cc - Issue 9600009: Fix input and output to handle UTF16 surrogate pairs.

Unified Diff: src/handles.cc

Issue 9600009: Fix input and output to handle UTF16 surrogate pairs. (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: Created 8 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/handles.cc

===================================================================

--- src/handles.cc (revision 10944)

+++ src/handles.cc (working copy)

@@ -800,4 +800,162 @@

}

+// This method determines the type of string involved and then gets the UTF8

+// length of the string. It doesn't flatten the string and has log(n) recursion

+// for a string of length n. If the failure flag gets set, then we have to

+// flatten the string and retry. Failures are caused by surrogate pairs in deep

+// cons strings.

+// Single surrogate characters that are encountered in the UTF-16 character

+// sequence of the input string get counted as 3 UTF-8 bytes, because that

+// is the way that WriteUtf8 will encode them. Surrogate pairs are counted and

+// encoded as one 4-byte UTF-8 sequence.

+// This function conceptually uses recursion on the two halves of cons strings.

+// However, in order to avoid the recursion going too deep it recurses on the

+// second string of the cons, but iterates on the first substring (by manually

+// eliminating it as a tail recursion). This means it counts the UTF-8 length

+// from the end to the start, which makes no difference to the total.

+// Surrogate pairs are recognized even if they are split across two sides of a

+// cons, which complicates the implementation somewhat. Therefore, too deep

+// recursion cannot always be avoided. This case is detected, and the failure

+// flag is set, a signal to the caller that the string should be flattened and

+// the operation retried.

+int Utf8LengthHelper(String* input,

+ int from,

+ int to,

+ bool followed_by_surrogate,

+ int max_recursion,

+ bool* failure,

+ bool* starts_with_surrogate) {

+ if (from == to) return 0;

+ int total = 0;

+ bool dummy;

+ while (true) {

+ if (input->IsAsciiRepresentation()) {

+ *starts_with_surrogate = false;

+ return total + to - from;

+ }

+ switch (StringShape(input).representation_tag()) {

+ case kConsStringTag: {

+ ConsString* str = ConsString::cast(input);

+ String* first = str->first();

+ String* second = str->second();

+ int first_length = first->length();

+ if (first_length - from > to - first_length) {

+ if (first_length < to) {

+ // Right hand side is shorter. No need to check the recursion depth

+ // since this can only happen log(n) times.

+ bool right_starts_with_surrogate = false;

+ total += Utf8LengthHelper(second,

+ 0,

+ to - first_length,

+ followed_by_surrogate,

+ max_recursion - 1,

+ failure,

+ &right_starts_with_surrogate);

+ if (*failure) return 0;

+ followed_by_surrogate = right_starts_with_surrogate;

+ input = first;

+ to = first_length;

+ } else {

+ // We only need the left hand side.

+ input = first;

+ }

+ } else {

+ if (first_length > from) {

+ // Left hand side is shorter.

+ if (first->IsAsciiRepresentation()) {

+ total += first_length - from;

+ *starts_with_surrogate = false;

+ starts_with_surrogate = &dummy;

+ input = second;

+ from = 0;

+ to -= first_length;

+ } else if (second->IsAsciiRepresentation()) {

+ followed_by_surrogate = false;

+ total += to - first_length;

+ input = first;

+ to = first_length;

+ } else if (max_recursion > 0) {

+ bool right_starts_with_surrogate = false;

+ // Recursing on the long one. This may fail.

+ total += Utf8LengthHelper(second,

+ 0,

+ to - first_length,

+ followed_by_surrogate,

+ max_recursion - 1,

+ failure,

+ &right_starts_with_surrogate);

+ if (*failure) return 0;

+ input = first;

+ to = first_length;

+ followed_by_surrogate = right_starts_with_surrogate;

+ } else {

+ *failure = true;

+ return 0;

+ }

+ } else {

+ // We only need the right hand side.

+ input = second;

+ from = 0;

+ to -= first_length;

+ }

+ continue;

+ }

+ case kExternalStringTag:

+ case kSeqStringTag: {

+ Vector<const uc16> vector = input->GetFlatContent().ToUC16Vector();

+ const uc16* p = vector.start();

+ int previous = unibrow::Utf16::kNoPreviousCharacter;

+ for (int i = from; i < to; i++) {

+ uc16 c = p[i];

+ total += unibrow::Utf8::Length(c, previous);

+ previous = c;

+ }

+ if (to - from > 0) {

+ if (unibrow::Utf16::IsLeadSurrogate(previous) &&

+ followed_by_surrogate) {

+ total -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;

+ }

+ if (unibrow::Utf16::IsTrailSurrogate(p[from])) {

+ *starts_with_surrogate = true;

+ }

+ return total;

+ }

+ case kSlicedStringTag: {

+ SlicedString* str = SlicedString::cast(input);

+ int offset = str->offset();

+ input = str->parent();

+ from += offset;

+ to += offset;

+ continue;

+ }

+ default:

+ break;

+ }

+ UNREACHABLE();

+ return 0;

+ }

+ return 0;

+int Utf8Length(Handle<String> str) {

+ bool dummy;

+ bool failure = true;

+ int len;

+ const int kRecursionBudget = 100;

+ while (failure) {

rossberg 2012/03/12 10:55:05 Nit: this seems more natural as a do-while loop.

Erik Corry 2012/03/12 12:34:10 I made it into a do-while, but it's no better (sam

Sven Panne 2012/03/12 13:00:44 Just an unsolicited comment from my side: For stuf

+ failure = false;

+ len = Utf8LengthHelper(

+ *str, 0, str->length(), false, kRecursionBudget, &failure, &dummy);

+ if (failure) FlattenString(str);

+ }

+ return len;

} } // namespace v8::internal

« src/handles.h ('K') | « src/handles.h ('k') | src/heap.cc » ('j') | src/unicode.h » ('J')