src/handles.cc - Issue 11725006: Refactor out assumption that one byte strings are ascii in utf8 processing.

Unified Diff: src/handles.cc

Issue 11725006: Refactor out assumption that one byte strings are ascii in utf8 processing. (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge

Patch Set: Fix array bounds issue Created 7 years, 12 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/handles.cc

diff --git a/src/handles.cc b/src/handles.cc

index 3bc1f4bea03d831cf802c453eb69c15e6a13f49f..16fe0c795c0271d6be8711261925fc96db017445 100644

--- a/src/handles.cc

+++ b/src/handles.cc

@@ -883,165 +883,6 @@ Handle<ObjectHashTable> PutIntoObjectHashTable(Handle<ObjectHashTable> table,

}

-// This method determines the type of string involved and then gets the UTF8

-// length of the string. It doesn't flatten the string and has log(n) recursion

-// for a string of length n. If the failure flag gets set, then we have to

-// flatten the string and retry. Failures are caused by surrogate pairs in deep

-// cons strings.

-// Single surrogate characters that are encountered in the UTF-16 character

-// sequence of the input string get counted as 3 UTF-8 bytes, because that

-// is the way that WriteUtf8 will encode them. Surrogate pairs are counted and

-// encoded as one 4-byte UTF-8 sequence.

-// This function conceptually uses recursion on the two halves of cons strings.

-// However, in order to avoid the recursion going too deep it recurses on the

-// second string of the cons, but iterates on the first substring (by manually

-// eliminating it as a tail recursion). This means it counts the UTF-8 length

-// from the end to the start, which makes no difference to the total.

-// Surrogate pairs are recognized even if they are split across two sides of a

-// cons, which complicates the implementation somewhat. Therefore, too deep

-// recursion cannot always be avoided. This case is detected, and the failure

-// flag is set, a signal to the caller that the string should be flattened and

-// the operation retried.

-int Utf8LengthHelper(String* input,

- int from,

- int to,

- bool followed_by_surrogate,

- int max_recursion,

- bool* failure,

- bool* starts_with_surrogate) {

- if (from == to) return 0;

- int total = 0;

- bool dummy;

- while (true) {

- if (input->IsOneByteRepresentation()) {

- *starts_with_surrogate = false;

- return total + to - from;

- }

- switch (StringShape(input).representation_tag()) {

- case kConsStringTag: {

- ConsString* str = ConsString::cast(input);

- String* first = str->first();

- String* second = str->second();

- int first_length = first->length();

- if (first_length - from > to - first_length) {

- if (first_length < to) {

- // Right hand side is shorter. No need to check the recursion depth

- // since this can only happen log(n) times.

- bool right_starts_with_surrogate = false;

- total += Utf8LengthHelper(second,

- 0,

- to - first_length,

- followed_by_surrogate,

- max_recursion - 1,

- failure,

- &right_starts_with_surrogate);

- if (*failure) return 0;

- followed_by_surrogate = right_starts_with_surrogate;

- input = first;

- to = first_length;

- } else {

- // We only need the left hand side.

- input = first;

- }

- } else {

- if (first_length > from) {

- // Left hand side is shorter.

- if (first->IsOneByteRepresentation()) {

- total += first_length - from;

- *starts_with_surrogate = false;

- starts_with_surrogate = &dummy;

- input = second;

- from = 0;

- to -= first_length;

- } else if (second->IsOneByteRepresentation()) {

- followed_by_surrogate = false;

- total += to - first_length;

- input = first;

- to = first_length;

- } else if (max_recursion > 0) {

- bool right_starts_with_surrogate = false;

- // Recursing on the long one. This may fail.

- total += Utf8LengthHelper(second,

- 0,

- to - first_length,

- followed_by_surrogate,

- max_recursion - 1,

- failure,

- &right_starts_with_surrogate);

- if (*failure) return 0;

- input = first;

- to = first_length;

- followed_by_surrogate = right_starts_with_surrogate;

- } else {

- *failure = true;

- return 0;

- }

- } else {

- // We only need the right hand side.

- input = second;

- from = 0;

- to -= first_length;

- }

- continue;

- }

- case kExternalStringTag:

- case kSeqStringTag: {

- Vector<const uc16> vector = input->GetFlatContent().ToUC16Vector();

- const uc16* p = vector.start();

- int previous = unibrow::Utf16::kNoPreviousCharacter;

- for (int i = from; i < to; i++) {

- uc16 c = p[i];

- total += unibrow::Utf8::Length(c, previous);

- previous = c;

- }

- if (to - from > 0) {

- if (unibrow::Utf16::IsLeadSurrogate(previous) &&

- followed_by_surrogate) {

- total -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;

- }

- if (unibrow::Utf16::IsTrailSurrogate(p[from])) {

- *starts_with_surrogate = true;

- }

- return total;

- }

- case kSlicedStringTag: {

- SlicedString* str = SlicedString::cast(input);

- int offset = str->offset();

- input = str->parent();

- from += offset;

- to += offset;

- continue;

- }

- default:

- break;

- }

- UNREACHABLE();

- return 0;

- }

- return 0;

-int Utf8Length(Handle<String> str) {

- bool dummy;

- bool failure;

- int len;

- const int kRecursionBudget = 100;

- do {

- failure = false;

- len = Utf8LengthHelper(

- *str, 0, str->length(), false, kRecursionBudget, &failure, &dummy);

- if (failure) FlattenString(str);

- } while (failure);

- return len;

DeferredHandleScope::DeferredHandleScope(Isolate* isolate)

: impl_(isolate->handle_scope_implementer()) {

ASSERT(impl_->isolate() == Isolate::Current());

« no previous file with comments | « src/handles.h ('k') | no next file » | no next file with comments »