Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1002)

Unified Diff: src/handles.cc

Issue 11725006: Refactor out assumption that one byte strings are ascii in utf8 processing. (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge
Patch Set: Fix array bounds issue Created 7 years, 12 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « src/handles.h ('k') | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/handles.cc
diff --git a/src/handles.cc b/src/handles.cc
index 3bc1f4bea03d831cf802c453eb69c15e6a13f49f..16fe0c795c0271d6be8711261925fc96db017445 100644
--- a/src/handles.cc
+++ b/src/handles.cc
@@ -883,165 +883,6 @@ Handle<ObjectHashTable> PutIntoObjectHashTable(Handle<ObjectHashTable> table,
}
-// This method determines the type of string involved and then gets the UTF8
-// length of the string. It doesn't flatten the string and has log(n) recursion
-// for a string of length n. If the failure flag gets set, then we have to
-// flatten the string and retry. Failures are caused by surrogate pairs in deep
-// cons strings.
-
-// Single surrogate characters that are encountered in the UTF-16 character
-// sequence of the input string get counted as 3 UTF-8 bytes, because that
-// is the way that WriteUtf8 will encode them. Surrogate pairs are counted and
-// encoded as one 4-byte UTF-8 sequence.
-
-// This function conceptually uses recursion on the two halves of cons strings.
-// However, in order to avoid the recursion going too deep it recurses on the
-// second string of the cons, but iterates on the first substring (by manually
-// eliminating it as a tail recursion). This means it counts the UTF-8 length
-// from the end to the start, which makes no difference to the total.
-
-// Surrogate pairs are recognized even if they are split across two sides of a
-// cons, which complicates the implementation somewhat. Therefore, too deep
-// recursion cannot always be avoided. This case is detected, and the failure
-// flag is set, a signal to the caller that the string should be flattened and
-// the operation retried.
-int Utf8LengthHelper(String* input,
- int from,
- int to,
- bool followed_by_surrogate,
- int max_recursion,
- bool* failure,
- bool* starts_with_surrogate) {
- if (from == to) return 0;
- int total = 0;
- bool dummy;
- while (true) {
- if (input->IsOneByteRepresentation()) {
- *starts_with_surrogate = false;
- return total + to - from;
- }
- switch (StringShape(input).representation_tag()) {
- case kConsStringTag: {
- ConsString* str = ConsString::cast(input);
- String* first = str->first();
- String* second = str->second();
- int first_length = first->length();
- if (first_length - from > to - first_length) {
- if (first_length < to) {
- // Right hand side is shorter. No need to check the recursion depth
- // since this can only happen log(n) times.
- bool right_starts_with_surrogate = false;
- total += Utf8LengthHelper(second,
- 0,
- to - first_length,
- followed_by_surrogate,
- max_recursion - 1,
- failure,
- &right_starts_with_surrogate);
- if (*failure) return 0;
- followed_by_surrogate = right_starts_with_surrogate;
- input = first;
- to = first_length;
- } else {
- // We only need the left hand side.
- input = first;
- }
- } else {
- if (first_length > from) {
- // Left hand side is shorter.
- if (first->IsOneByteRepresentation()) {
- total += first_length - from;
- *starts_with_surrogate = false;
- starts_with_surrogate = &dummy;
- input = second;
- from = 0;
- to -= first_length;
- } else if (second->IsOneByteRepresentation()) {
- followed_by_surrogate = false;
- total += to - first_length;
- input = first;
- to = first_length;
- } else if (max_recursion > 0) {
- bool right_starts_with_surrogate = false;
- // Recursing on the long one. This may fail.
- total += Utf8LengthHelper(second,
- 0,
- to - first_length,
- followed_by_surrogate,
- max_recursion - 1,
- failure,
- &right_starts_with_surrogate);
- if (*failure) return 0;
- input = first;
- to = first_length;
- followed_by_surrogate = right_starts_with_surrogate;
- } else {
- *failure = true;
- return 0;
- }
- } else {
- // We only need the right hand side.
- input = second;
- from = 0;
- to -= first_length;
- }
- }
- continue;
- }
- case kExternalStringTag:
- case kSeqStringTag: {
- Vector<const uc16> vector = input->GetFlatContent().ToUC16Vector();
- const uc16* p = vector.start();
- int previous = unibrow::Utf16::kNoPreviousCharacter;
- for (int i = from; i < to; i++) {
- uc16 c = p[i];
- total += unibrow::Utf8::Length(c, previous);
- previous = c;
- }
- if (to - from > 0) {
- if (unibrow::Utf16::IsLeadSurrogate(previous) &&
- followed_by_surrogate) {
- total -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;
- }
- if (unibrow::Utf16::IsTrailSurrogate(p[from])) {
- *starts_with_surrogate = true;
- }
- }
- return total;
- }
- case kSlicedStringTag: {
- SlicedString* str = SlicedString::cast(input);
- int offset = str->offset();
- input = str->parent();
- from += offset;
- to += offset;
- continue;
- }
- default:
- break;
- }
- UNREACHABLE();
- return 0;
- }
- return 0;
-}
-
-
-int Utf8Length(Handle<String> str) {
- bool dummy;
- bool failure;
- int len;
- const int kRecursionBudget = 100;
- do {
- failure = false;
- len = Utf8LengthHelper(
- *str, 0, str->length(), false, kRecursionBudget, &failure, &dummy);
- if (failure) FlattenString(str);
- } while (failure);
- return len;
-}
-
-
DeferredHandleScope::DeferredHandleScope(Isolate* isolate)
: impl_(isolate->handle_scope_implementer()) {
ASSERT(impl_->isolate() == Isolate::Current());
« no previous file with comments | « src/handles.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698