Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(422)

Side by Side Diff: src/handles.cc

Issue 9600009: Fix input and output to handle UTF16 surrogate pairs. (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 8 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright 2011 the V8 project authors. All rights reserved. 1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 782 matching lines...) Expand 10 before | Expand all | Expand 10 after
793 793
794 Handle<ObjectHashTable> PutIntoObjectHashTable(Handle<ObjectHashTable> table, 794 Handle<ObjectHashTable> PutIntoObjectHashTable(Handle<ObjectHashTable> table,
795 Handle<Object> key, 795 Handle<Object> key,
796 Handle<Object> value) { 796 Handle<Object> value) {
797 CALL_HEAP_FUNCTION(table->GetIsolate(), 797 CALL_HEAP_FUNCTION(table->GetIsolate(),
798 table->Put(*key, *value), 798 table->Put(*key, *value),
799 ObjectHashTable); 799 ObjectHashTable);
800 } 800 }
801 801
802 802
803 // This method determines the type of string involved and then gets the UTF8
804 // length of the string. It doesn't flatten the string and has log(n) recursion
805 // for a string of length n. If the failure flag gets set, then we have to
806 // flatten the string and retry. Failures are caused by surrogate pairs in deep
807 // cons strings.
808
809 // Single surrogate characters that are encountered in the UTF-16 character
810 // sequence of the input string get counted as 3 UTF-8 bytes, because that
811 // is the way that WriteUtf8 will encode them. Surrogate pairs are counted and
812 // encoded as one 4-byte UTF-8 sequence.
813
814 // This function conceptually uses recursion on the two halves of cons strings.
815 // However, in order to avoid the recursion going too deep it recurses on the
816 // second string of the cons, but iterates on the first substring (by manually
817 // eliminating it as a tail recursion). This means it counts the UTF-8 length
818 // from the end to the start, which makes no difference to the total.
819
820 // Surrogate pairs are recognized even if they are split across two sides of a
821 // cons, which complicates the implementation somewhat. Therefore, too deep
822 // recursion cannot always be avoided. This case is detected, and the failure
823 // flag is set, a signal to the caller that the string should be flattened and
824 // the operation retried.
825 int Utf8LengthHelper(String* input,
826 int from,
827 int to,
828 bool followed_by_surrogate,
829 int max_recursion,
830 bool* failure,
831 bool* starts_with_surrogate) {
832 if (from == to) return 0;
833 int total = 0;
834 bool dummy;
835 while (true) {
836 if (input->IsAsciiRepresentation()) {
837 *starts_with_surrogate = false;
838 return total + to - from;
839 }
840 switch (StringShape(input).representation_tag()) {
841 case kConsStringTag: {
842 ConsString* str = ConsString::cast(input);
843 String* first = str->first();
844 String* second = str->second();
845 int first_length = first->length();
846 if (first_length - from > to - first_length) {
847 if (first_length < to) {
848 // Right hand side is shorter. No need to check the recursion depth
849 // since this can only happen log(n) times.
850 bool right_starts_with_surrogate = false;
851 total += Utf8LengthHelper(second,
852 0,
853 to - first_length,
854 followed_by_surrogate,
855 max_recursion - 1,
856 failure,
857 &right_starts_with_surrogate);
858 if (*failure) return 0;
859 followed_by_surrogate = right_starts_with_surrogate;
860 input = first;
861 to = first_length;
862 } else {
863 // We only need the left hand side.
864 input = first;
865 }
866 } else {
867 if (first_length > from) {
868 // Left hand side is shorter.
869 if (first->IsAsciiRepresentation()) {
870 total += first_length - from;
871 *starts_with_surrogate = false;
872 starts_with_surrogate = &dummy;
873 input = second;
874 from = 0;
875 to -= first_length;
876 } else if (second->IsAsciiRepresentation()) {
877 followed_by_surrogate = false;
878 total += to - first_length;
879 input = first;
880 to = first_length;
881 } else if (max_recursion > 0) {
882 bool right_starts_with_surrogate = false;
883 // Recursing on the long one. This may fail.
884 total += Utf8LengthHelper(second,
885 0,
886 to - first_length,
887 followed_by_surrogate,
888 max_recursion - 1,
889 failure,
890 &right_starts_with_surrogate);
891 if (*failure) return 0;
892 input = first;
893 to = first_length;
894 followed_by_surrogate = right_starts_with_surrogate;
895 } else {
896 *failure = true;
897 return 0;
898 }
899 } else {
900 // We only need the right hand side.
901 input = second;
902 from = 0;
903 to -= first_length;
904 }
905 }
906 continue;
907 }
908 case kExternalStringTag:
909 case kSeqStringTag: {
910 Vector<const uc16> vector = input->GetFlatContent().ToUC16Vector();
911 const uc16* p = vector.start();
912 int previous = unibrow::Utf16::kNoPreviousCharacter;
913 for (int i = from; i < to; i++) {
914 uc16 c = p[i];
915 total += unibrow::Utf8::Length(c, previous);
916 previous = c;
917 }
918 if (to - from > 0) {
919 if (unibrow::Utf16::IsLeadSurrogate(previous) &&
920 followed_by_surrogate) {
921 total -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;
922 }
923 if (unibrow::Utf16::IsTrailSurrogate(p[from])) {
924 *starts_with_surrogate = true;
925 }
926 }
927 return total;
928 }
929 case kSlicedStringTag: {
930 SlicedString* str = SlicedString::cast(input);
931 int offset = str->offset();
932 input = str->parent();
933 from += offset;
934 to += offset;
935 continue;
936 }
937 default:
938 break;
939 }
940 UNREACHABLE();
941 return 0;
942 }
943 return 0;
944 }
945
946
947 int Utf8Length(Handle<String> str) {
948 bool dummy;
949 bool failure = true;
950 int len;
951 const int kRecursionBudget = 100;
952 while (failure) {
rossberg 2012/03/12 10:55:05 Nit: this seems more natural as a do-while loop.
Erik Corry 2012/03/12 12:34:10 I made it into a do-while, but it's no better (sam
Sven Panne 2012/03/12 13:00:44 Just an unsolicited comment from my side: For stuf
953 failure = false;
954 len = Utf8LengthHelper(
955 *str, 0, str->length(), false, kRecursionBudget, &failure, &dummy);
956 if (failure) FlattenString(str);
957 }
958 return len;
959 }
960
803 } } // namespace v8::internal 961 } } // namespace v8::internal
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698