Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. |
| 2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
| 3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
| 4 // met: | 4 // met: |
| 5 // | 5 // |
| 6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
| 7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
| 8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
| 9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
| 10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
| (...skipping 782 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 793 | 793 |
| 794 Handle<ObjectHashTable> PutIntoObjectHashTable(Handle<ObjectHashTable> table, | 794 Handle<ObjectHashTable> PutIntoObjectHashTable(Handle<ObjectHashTable> table, |
| 795 Handle<Object> key, | 795 Handle<Object> key, |
| 796 Handle<Object> value) { | 796 Handle<Object> value) { |
| 797 CALL_HEAP_FUNCTION(table->GetIsolate(), | 797 CALL_HEAP_FUNCTION(table->GetIsolate(), |
| 798 table->Put(*key, *value), | 798 table->Put(*key, *value), |
| 799 ObjectHashTable); | 799 ObjectHashTable); |
| 800 } | 800 } |
| 801 | 801 |
| 802 | 802 |
| 803 // This method determines the type of string involved and then gets the UTF8 | |
| 804 // length of the string. It doesn't flatten the string and has log(n) recursion | |
| 805 // for a string of length n. If the failure flag gets set, then we have to | |
| 806 // flatten the string and retry. Failures are caused by surrogate pairs in deep | |
| 807 // cons strings. | |
| 808 | |
| 809 // Single surrogate characters that are encountered in the UTF-16 character | |
| 810 // sequence of the input string get counted as 3 UTF-8 bytes, because that | |
| 811 // is the way that WriteUtf8 will encode them. Surrogate pairs are counted and | |
| 812 // encoded as one 4-byte UTF-8 sequence. | |
| 813 | |
| 814 // This function conceptually uses recursion on the two halves of cons strings. | |
| 815 // However, in order to avoid the recursion going too deep it recurses on the | |
| 816 // second string of the cons, but iterates on the first substring (by manually | |
| 817 // eliminating it as a tail recursion). This means it counts the UTF-8 length | |
| 818 // from the end to the start, which makes no difference to the total. | |
| 819 | |
| 820 // Surrogate pairs are recognized even if they are split across two sides of a | |
| 821 // cons, which complicates the implementation somewhat. Therefore, too deep | |
| 822 // recursion cannot always be avoided. This case is detected, and the failure | |
| 823 // flag is set, a signal to the caller that the string should be flattened and | |
| 824 // the operation retried. | |
| 825 int Utf8LengthHelper(String* input, | |
| 826 int from, | |
| 827 int to, | |
| 828 bool followed_by_surrogate, | |
| 829 int max_recursion, | |
| 830 bool* failure, | |
| 831 bool* starts_with_surrogate) { | |
| 832 if (from == to) return 0; | |
| 833 int total = 0; | |
| 834 bool dummy; | |
| 835 while (true) { | |
| 836 if (input->IsAsciiRepresentation()) { | |
| 837 *starts_with_surrogate = false; | |
| 838 return total + to - from; | |
| 839 } | |
| 840 switch (StringShape(input).representation_tag()) { | |
| 841 case kConsStringTag: { | |
| 842 ConsString* str = ConsString::cast(input); | |
| 843 String* first = str->first(); | |
| 844 String* second = str->second(); | |
| 845 int first_length = first->length(); | |
| 846 if (first_length - from > to - first_length) { | |
| 847 if (first_length < to) { | |
| 848 // Right hand side is shorter. No need to check the recursion depth | |
| 849 // since this can only happen log(n) times. | |
| 850 bool right_starts_with_surrogate = false; | |
| 851 total += Utf8LengthHelper(second, | |
| 852 0, | |
| 853 to - first_length, | |
| 854 followed_by_surrogate, | |
| 855 max_recursion - 1, | |
| 856 failure, | |
| 857 &right_starts_with_surrogate); | |
| 858 if (*failure) return 0; | |
| 859 followed_by_surrogate = right_starts_with_surrogate; | |
| 860 input = first; | |
| 861 to = first_length; | |
| 862 } else { | |
| 863 // We only need the left hand side. | |
| 864 input = first; | |
| 865 } | |
| 866 } else { | |
| 867 if (first_length > from) { | |
| 868 // Left hand side is shorter. | |
| 869 if (first->IsAsciiRepresentation()) { | |
| 870 total += first_length - from; | |
| 871 *starts_with_surrogate = false; | |
| 872 starts_with_surrogate = &dummy; | |
| 873 input = second; | |
| 874 from = 0; | |
| 875 to -= first_length; | |
| 876 } else if (second->IsAsciiRepresentation()) { | |
| 877 followed_by_surrogate = false; | |
| 878 total += to - first_length; | |
| 879 input = first; | |
| 880 to = first_length; | |
| 881 } else if (max_recursion > 0) { | |
| 882 bool right_starts_with_surrogate = false; | |
| 883 // Recursing on the long one. This may fail. | |
| 884 total += Utf8LengthHelper(second, | |
| 885 0, | |
| 886 to - first_length, | |
| 887 followed_by_surrogate, | |
| 888 max_recursion - 1, | |
| 889 failure, | |
| 890 &right_starts_with_surrogate); | |
| 891 if (*failure) return 0; | |
| 892 input = first; | |
| 893 to = first_length; | |
| 894 followed_by_surrogate = right_starts_with_surrogate; | |
| 895 } else { | |
| 896 *failure = true; | |
| 897 return 0; | |
| 898 } | |
| 899 } else { | |
| 900 // We only need the right hand side. | |
| 901 input = second; | |
| 902 from = 0; | |
| 903 to -= first_length; | |
| 904 } | |
| 905 } | |
| 906 continue; | |
| 907 } | |
| 908 case kExternalStringTag: | |
| 909 case kSeqStringTag: { | |
| 910 Vector<const uc16> vector = input->GetFlatContent().ToUC16Vector(); | |
| 911 const uc16* p = vector.start(); | |
| 912 int previous = unibrow::Utf16::kNoPreviousCharacter; | |
| 913 for (int i = from; i < to; i++) { | |
| 914 uc16 c = p[i]; | |
| 915 total += unibrow::Utf8::Length(c, previous); | |
| 916 previous = c; | |
| 917 } | |
| 918 if (to - from > 0) { | |
| 919 if (unibrow::Utf16::IsLeadSurrogate(previous) && | |
| 920 followed_by_surrogate) { | |
| 921 total -= unibrow::Utf8::kBytesSavedByCombiningSurrogates; | |
| 922 } | |
| 923 if (unibrow::Utf16::IsTrailSurrogate(p[from])) { | |
| 924 *starts_with_surrogate = true; | |
| 925 } | |
| 926 } | |
| 927 return total; | |
| 928 } | |
| 929 case kSlicedStringTag: { | |
| 930 SlicedString* str = SlicedString::cast(input); | |
| 931 int offset = str->offset(); | |
| 932 input = str->parent(); | |
| 933 from += offset; | |
| 934 to += offset; | |
| 935 continue; | |
| 936 } | |
| 937 default: | |
| 938 break; | |
| 939 } | |
| 940 UNREACHABLE(); | |
| 941 return 0; | |
| 942 } | |
| 943 return 0; | |
| 944 } | |
| 945 | |
| 946 | |
| 947 int Utf8Length(Handle<String> str) { | |
| 948 bool dummy; | |
| 949 bool failure = true; | |
| 950 int len; | |
| 951 const int kRecursionBudget = 100; | |
| 952 while (failure) { | |
|
rossberg
2012/03/12 10:55:05
Nit: this seems more natural as a do-while loop.
Erik Corry
2012/03/12 12:34:10
I made it into a do-while, but it's no better (sam
Sven Panne
2012/03/12 13:00:44
Just an unsolicited comment from my side: For stuf
| |
| 953 failure = false; | |
| 954 len = Utf8LengthHelper( | |
| 955 *str, 0, str->length(), false, kRecursionBudget, &failure, &dummy); | |
| 956 if (failure) FlattenString(str); | |
| 957 } | |
| 958 return len; | |
| 959 } | |
| 960 | |
| 803 } } // namespace v8::internal | 961 } } // namespace v8::internal |
| OLD | NEW |