src/handles.cc - Issue 9600009: Fix input and output to handle UTF16 surrogate pairs.

Side by Side Diff: src/handles.cc

Issue 9600009: Fix input and output to handle UTF16 surrogate pairs. (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: Created 8 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2011 the V8 project authors. All rights reserved.	1 // Copyright 2011 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 782 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
793	793

794 Handle<ObjectHashTable> PutIntoObjectHashTable(Handle<ObjectHashTable> table,	794 Handle<ObjectHashTable> PutIntoObjectHashTable(Handle<ObjectHashTable> table,

795 Handle<Object> key,	795 Handle<Object> key,

796 Handle<Object> value) {	796 Handle<Object> value) {

797 CALL_HEAP_FUNCTION(table->GetIsolate(),	797 CALL_HEAP_FUNCTION(table->GetIsolate(),

798 table->Put(key, value),	798 table->Put(key, value),

799 ObjectHashTable);	799 ObjectHashTable);

800 }	800 }

801	801

802	802

	803 // This method determines the type of string involved and then gets the UTF8

	804 // length of the string. It doesn't flatten the string and has log(n) recursion

	805 // for a string of length n. If the failure flag gets set, then we have to

	806 // flatten the string and retry. Failures are caused by surrogate pairs in deep

	807 // cons strings.

	808

	809 // Single surrogate characters that are encountered in the UTF-16 character

	810 // sequence of the input string get counted as 3 UTF-8 bytes, because that

	811 // is the way that WriteUtf8 will encode them. Surrogate pairs are counted and

	812 // encoded as one 4-byte UTF-8 sequence.

	813

	814 // This function conceptually uses recursion on the two halves of cons strings.

	815 // However, in order to avoid the recursion going too deep it recurses on the

	816 // second string of the cons, but iterates on the first substring (by manually

	817 // eliminating it as a tail recursion). This means it counts the UTF-8 length

	818 // from the end to the start, which makes no difference to the total.

	819

	820 // Surrogate pairs are recognized even if they are split across two sides of a

	821 // cons, which complicates the implementation somewhat. Therefore, too deep

	822 // recursion cannot always be avoided. This case is detected, and the failure

	823 // flag is set, a signal to the caller that the string should be flattened and

	824 // the operation retried.

	825 int Utf8LengthHelper(String* input,

	826 int from,

	827 int to,

	828 bool followed_by_surrogate,

	829 int max_recursion,

	830 bool* failure,

	831 bool* starts_with_surrogate) {

	832 if (from == to) return 0;

	833 int total = 0;

	834 bool dummy;

	835 while (true) {

	836 if (input->IsAsciiRepresentation()) {

	837 *starts_with_surrogate = false;

	838 return total + to - from;

	839 }

	840 switch (StringShape(input).representation_tag()) {

	841 case kConsStringTag: {

	842 ConsString* str = ConsString::cast(input);

	843 String* first = str->first();

	844 String* second = str->second();

	845 int first_length = first->length();

	846 if (first_length - from > to - first_length) {

	847 if (first_length < to) {

	848 // Right hand side is shorter. No need to check the recursion depth

	849 // since this can only happen log(n) times.

	850 bool right_starts_with_surrogate = false;

	851 total += Utf8LengthHelper(second,

	852 0,

	853 to - first_length,

	854 followed_by_surrogate,

	855 max_recursion - 1,

	856 failure,

	857 &right_starts_with_surrogate);

	858 if (*failure) return 0;

	859 followed_by_surrogate = right_starts_with_surrogate;

	860 input = first;

	861 to = first_length;

	862 } else {

	863 // We only need the left hand side.

	864 input = first;

	865 }

	866 } else {

	867 if (first_length > from) {

	868 // Left hand side is shorter.

	869 if (first->IsAsciiRepresentation()) {

	870 total += first_length - from;

	871 *starts_with_surrogate = false;

	872 starts_with_surrogate = &dummy;

	873 input = second;

	874 from = 0;

	875 to -= first_length;

	876 } else if (second->IsAsciiRepresentation()) {

	877 followed_by_surrogate = false;

	878 total += to - first_length;

	879 input = first;

	880 to = first_length;

	881 } else if (max_recursion > 0) {

	882 bool right_starts_with_surrogate = false;

	883 // Recursing on the long one. This may fail.

	884 total += Utf8LengthHelper(second,

	885 0,

	886 to - first_length,

	887 followed_by_surrogate,

	888 max_recursion - 1,

	889 failure,

	890 &right_starts_with_surrogate);

	891 if (*failure) return 0;

	892 input = first;

	893 to = first_length;

	894 followed_by_surrogate = right_starts_with_surrogate;

	895 } else {

	896 *failure = true;

	897 return 0;

	898 }

	899 } else {

	900 // We only need the right hand side.

	901 input = second;

	902 from = 0;

	903 to -= first_length;

	904 }

	905 }

	906 continue;

	907 }

	908 case kExternalStringTag:

	909 case kSeqStringTag: {

	910 Vector<const uc16> vector = input->GetFlatContent().ToUC16Vector();

	911 const uc16* p = vector.start();

	912 int previous = unibrow::Utf16::kNoPreviousCharacter;

	913 for (int i = from; i < to; i++) {

	914 uc16 c = p[i];

	915 total += unibrow::Utf8::Length(c, previous);

	916 previous = c;

	917 }

	918 if (to - from > 0) {

	919 if (unibrow::Utf16::IsLeadSurrogate(previous) &&

	920 followed_by_surrogate) {

	921 total -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;

	922 }

	923 if (unibrow::Utf16::IsTrailSurrogate(p[from])) {

	924 *starts_with_surrogate = true;

	925 }

	926 }

	927 return total;

	928 }

	929 case kSlicedStringTag: {

	930 SlicedString* str = SlicedString::cast(input);

	931 int offset = str->offset();

	932 input = str->parent();

	933 from += offset;

	934 to += offset;

	935 continue;

	936 }

	937 default:

	938 break;

	939 }

	940 UNREACHABLE();

	941 return 0;

	942 }

	943 return 0;

	944 }

	945

	946

	947 int Utf8Length(Handle<String> str) {

	948 bool dummy;

	949 bool failure = true;

	950 int len;

	951 const int kRecursionBudget = 100;

	952 while (failure) {
	rossberg 2012/03/12 10:55:05 Nit: this seems more natural as a do-while loop. Nit: this seems more natural as a do-while loop. Erik Corry 2012/03/12 12:34:10 I made it into a do-while, but it's no better (sam Show quoted text On 2012/03/12 10:55:05, rossberg wrote: > Nit: this seems more natural as a do-while loop. I made it into a do-while, but it's no better (same line count, flag must still be declared outside the loop, still have to test the flag twice). Sven Panne 2012/03/12 13:00:44 Just an unsolicited comment from my side: For stuf Show quoted text On 2012/03/12 12:34:10, Erik Corry wrote: > On 2012/03/12 10:55:05, rossberg wrote: > > Nit: this seems more natural as a do-while loop. > > I made it into a do-while, but it's no better (same line count, flag must still > be declared outside the loop, still have to test the flag twice). Just an unsolicited comment from my side: For stuff like that I usually prefer a C/C++ emulation of Dahl's loop/while/repeat. Alas, compilers are sometimes a bit too dumb, so an additional line has to be appended: int Utf8Length(Handle<String> str) { const int kRecursionBudget = 100; while (true) { bool failure = false, dummy; int len = Utf8LengthHelper( *str, 0, str->length(), false, kRecursionBudget, &failure, &dummy); if (!failure) return len; FlattenString(str); } return 0; // Keep the compiler happy. } Using a "Remove Double Negative" refactoring might make this even more clear (failure => ok).
	953 failure = false;

	954 len = Utf8LengthHelper(

	955 *str, 0, str->length(), false, kRecursionBudget, &failure, &dummy);

	956 if (failure) FlattenString(str);

	957 }

	958 return len;

	959 }

	960

803 } } // namespace v8::internal	961 } } // namespace v8::internal

OLD	NEW

« src/handles.h ('K') | « src/handles.h ('k') | src/heap.cc » ('j') | src/unicode.h » ('J')