OLD | NEW |
---|---|
1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. |
2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
4 // met: | 4 // met: |
5 // | 5 // |
6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
(...skipping 782 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
793 | 793 |
794 Handle<ObjectHashTable> PutIntoObjectHashTable(Handle<ObjectHashTable> table, | 794 Handle<ObjectHashTable> PutIntoObjectHashTable(Handle<ObjectHashTable> table, |
795 Handle<Object> key, | 795 Handle<Object> key, |
796 Handle<Object> value) { | 796 Handle<Object> value) { |
797 CALL_HEAP_FUNCTION(table->GetIsolate(), | 797 CALL_HEAP_FUNCTION(table->GetIsolate(), |
798 table->Put(*key, *value), | 798 table->Put(*key, *value), |
799 ObjectHashTable); | 799 ObjectHashTable); |
800 } | 800 } |
801 | 801 |
802 | 802 |
803 // This method determines the type of string involved and then gets the UTF8 | |
804 // length of the string. It doesn't flatten the string and has log(n) recursion | |
805 // for a string of length n. If the failure flag gets set, then we have to | |
806 // flatten the string and retry. Failures are caused by surrogate pairs in deep | |
807 // cons strings. | |
808 | |
809 // Single surrogate characters that are encountered in the UTF-16 character | |
810 // sequence of the input string get counted as 3 UTF-8 bytes, because that | |
811 // is the way that WriteUtf8 will encode them. Surrogate pairs are counted and | |
812 // encoded as one 4-byte UTF-8 sequence. | |
813 | |
814 // This function conceptually uses recursion on the two halves of cons strings. | |
815 // However, in order to avoid the recursion going too deep it recurses on the | |
816 // second string of the cons, but iterates on the first substring (by manually | |
817 // eliminating it as a tail recursion). This means it counts the UTF-8 length | |
818 // from the end to the start, which makes no difference to the total. | |
819 | |
820 // Surrogate pairs are recognized even if they are split across two sides of a | |
821 // cons, which complicates the implementation somewhat. Therefore, too deep | |
822 // recursion cannot always be avoided. This case is detected, and the failure | |
823 // flag is set, a signal to the caller that the string should be flattened and | |
824 // the operation retried. | |
825 int Utf8LengthHelper(String* input, | |
826 int from, | |
827 int to, | |
828 bool followed_by_surrogate, | |
829 int max_recursion, | |
830 bool* failure, | |
831 bool* starts_with_surrogate) { | |
832 if (from == to) return 0; | |
833 int total = 0; | |
834 bool dummy; | |
835 while (true) { | |
836 if (input->IsAsciiRepresentation()) { | |
837 *starts_with_surrogate = false; | |
838 return total + to - from; | |
839 } | |
840 switch (StringShape(input).representation_tag()) { | |
841 case kConsStringTag: { | |
842 ConsString* str = ConsString::cast(input); | |
843 String* first = str->first(); | |
844 String* second = str->second(); | |
845 int first_length = first->length(); | |
846 if (first_length - from > to - first_length) { | |
847 if (first_length < to) { | |
848 // Right hand side is shorter. No need to check the recursion depth | |
849 // since this can only happen log(n) times. | |
850 bool right_starts_with_surrogate = false; | |
851 total += Utf8LengthHelper(second, | |
852 0, | |
853 to - first_length, | |
854 followed_by_surrogate, | |
855 max_recursion - 1, | |
856 failure, | |
857 &right_starts_with_surrogate); | |
858 if (*failure) return 0; | |
859 followed_by_surrogate = right_starts_with_surrogate; | |
860 input = first; | |
861 to = first_length; | |
862 } else { | |
863 // We only need the left hand side. | |
864 input = first; | |
865 } | |
866 } else { | |
867 if (first_length > from) { | |
868 // Left hand side is shorter. | |
869 if (first->IsAsciiRepresentation()) { | |
870 total += first_length - from; | |
871 *starts_with_surrogate = false; | |
872 starts_with_surrogate = &dummy; | |
873 input = second; | |
874 from = 0; | |
875 to -= first_length; | |
876 } else if (second->IsAsciiRepresentation()) { | |
877 followed_by_surrogate = false; | |
878 total += to - first_length; | |
879 input = first; | |
880 to = first_length; | |
881 } else if (max_recursion > 0) { | |
882 bool right_starts_with_surrogate = false; | |
883 // Recursing on the long one. This may fail. | |
884 total += Utf8LengthHelper(second, | |
885 0, | |
886 to - first_length, | |
887 followed_by_surrogate, | |
888 max_recursion - 1, | |
889 failure, | |
890 &right_starts_with_surrogate); | |
891 if (*failure) return 0; | |
892 input = first; | |
893 to = first_length; | |
894 followed_by_surrogate = right_starts_with_surrogate; | |
895 } else { | |
896 *failure = true; | |
897 return 0; | |
898 } | |
899 } else { | |
900 // We only need the right hand side. | |
901 input = second; | |
902 from = 0; | |
903 to -= first_length; | |
904 } | |
905 } | |
906 continue; | |
907 } | |
908 case kExternalStringTag: | |
909 case kSeqStringTag: { | |
910 Vector<const uc16> vector = input->GetFlatContent().ToUC16Vector(); | |
911 const uc16* p = vector.start(); | |
912 int previous = unibrow::Utf16::kNoPreviousCharacter; | |
913 for (int i = from; i < to; i++) { | |
914 uc16 c = p[i]; | |
915 total += unibrow::Utf8::Length(c, previous); | |
916 previous = c; | |
917 } | |
918 if (to - from > 0) { | |
919 if (unibrow::Utf16::IsLeadSurrogate(previous) && | |
920 followed_by_surrogate) { | |
921 total -= unibrow::Utf8::kBytesSavedByCombiningSurrogates; | |
922 } | |
923 if (unibrow::Utf16::IsTrailSurrogate(p[from])) { | |
924 *starts_with_surrogate = true; | |
925 } | |
926 } | |
927 return total; | |
928 } | |
929 case kSlicedStringTag: { | |
930 SlicedString* str = SlicedString::cast(input); | |
931 int offset = str->offset(); | |
932 input = str->parent(); | |
933 from += offset; | |
934 to += offset; | |
935 continue; | |
936 } | |
937 default: | |
938 break; | |
939 } | |
940 UNREACHABLE(); | |
941 return 0; | |
942 } | |
943 return 0; | |
944 } | |
945 | |
946 | |
947 int Utf8Length(Handle<String> str) { | |
948 bool dummy; | |
949 bool failure = true; | |
950 int len; | |
951 const int kRecursionBudget = 100; | |
952 while (failure) { | |
rossberg
2012/03/12 10:55:05
Nit: this seems more natural as a do-while loop.
Erik Corry
2012/03/12 12:34:10
I made it into a do-while, but it's no better (sam
Sven Panne
2012/03/12 13:00:44
Just an unsolicited comment from my side: For stuf
| |
953 failure = false; | |
954 len = Utf8LengthHelper( | |
955 *str, 0, str->length(), false, kRecursionBudget, &failure, &dummy); | |
956 if (failure) FlattenString(str); | |
957 } | |
958 return len; | |
959 } | |
960 | |
803 } } // namespace v8::internal | 961 } } // namespace v8::internal |
OLD | NEW |