Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1103)

Side by Side Diff: src/unicode.cc

Issue 11649018: Remove Utf8InputBuffer (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge
Patch Set: Created 8 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 259 matching lines...) Expand 10 before | Expand all | Expand 10 after
270 return kBadChar; 270 return kBadChar;
271 } 271 }
272 *cursor += 4; 272 *cursor += 4;
273 return code_point; 273 return code_point;
274 } 274 }
275 *cursor += 1; 275 *cursor += 1;
276 return kBadChar; 276 return kBadChar;
277 } 277 }
278 278
279 279
280 const byte* Utf8::ReadBlock(Buffer<const char*> str, byte* buffer,
281 unsigned capacity, unsigned* chars_read_ptr, unsigned* offset_ptr) {
282 unsigned offset = *offset_ptr;
283 // Bail out early if we've reached the end of the string.
284 if (offset == str.length()) {
285 *chars_read_ptr = 0;
286 return NULL;
287 }
288 const byte* data = reinterpret_cast<const byte*>(str.data());
289 if (data[offset] <= kMaxOneByteChar) {
290 // The next character is an ASCII char so we scan forward over
291 // the following ASCII characters and return the next pure ASCII
292 // substring
293 const byte* result = data + offset;
294 offset++;
295 while ((offset < str.length()) && (data[offset] <= kMaxOneByteChar))
296 offset++;
297 *chars_read_ptr = offset - *offset_ptr;
298 *offset_ptr = offset;
299 return result;
300 } else {
301 // The next character is non-ASCII so we just fill the buffer
302 unsigned cursor = 0;
303 unsigned chars_read = 0;
304 while (offset < str.length()) {
305 uchar c = data[offset];
306 if (c <= kMaxOneByteChar) {
307 // Fast case for ASCII characters
308 if (!CharacterStream::EncodeAsciiCharacter(c,
309 buffer,
310 capacity,
311 cursor))
312 break;
313 offset += 1;
314 } else {
315 unsigned chars = 0;
316 c = Utf8::ValueOf(data + offset, str.length() - offset, &chars);
317 if (!CharacterStream::EncodeNonAsciiCharacter(c,
318 buffer,
319 capacity,
320 cursor))
321 break;
322 offset += chars;
323 }
324 chars_read++;
325 }
326 *offset_ptr = offset;
327 *chars_read_ptr = chars_read;
328 return buffer;
329 }
330 }
331
332 unsigned CharacterStream::Length() { 280 unsigned CharacterStream::Length() {
333 unsigned result = 0; 281 unsigned result = 0;
334 while (has_more()) { 282 while (has_more()) {
335 result++; 283 result++;
336 GetNext(); 284 GetNext();
337 } 285 }
338 Rewind(); 286 Rewind();
339 return result; 287 return result;
340 } 288 }
341 289
342 unsigned CharacterStream::Utf16Length() { 290 unsigned CharacterStream::Utf16Length() {
343 unsigned result = 0; 291 unsigned result = 0;
344 while (has_more()) { 292 while (has_more()) {
345 uchar c = GetNext(); 293 uchar c = GetNext();
346 result += c > Utf16::kMaxNonSurrogateCharCode ? 2 : 1; 294 result += c > Utf16::kMaxNonSurrogateCharCode ? 2 : 1;
347 } 295 }
348 Rewind(); 296 Rewind();
349 return result; 297 return result;
350 } 298 }
351 299
352 void CharacterStream::Seek(unsigned position) { 300 void CharacterStream::Seek(unsigned position) {
353 Rewind(); 301 Rewind();
354 for (unsigned i = 0; i < position; i++) { 302 for (unsigned i = 0; i < position; i++) {
355 GetNext(); 303 GetNext();
356 } 304 }
357 } 305 }
358 306
307 void Utf8DecoderBase::Reset(uint16_t* buffer,
308 unsigned buffer_length,
309 const uint8_t* stream,
310 unsigned stream_length) {
311 // Assume everything will fit in the buffer and stream won't be needed.
312 last_byte_of_buffer_unused_ = false;
313 unbuffered_start_ = NULL;
314 bool writing_to_buffer = true;
315 // Loop until stream is read, writing to buffer as long as buffer has space.
316 unsigned utf16_length = 0;
317 while (stream_length != 0) {
318 unsigned cursor = 0;
319 uint32_t character = Utf8::ValueOf(stream, stream_length, &cursor);
320 ASSERT(cursor > 0 && cursor <= stream_length);
321 stream += cursor;
322 stream_length -= cursor;
323 bool is_two_byte = character > Utf16::kMaxNonSurrogateCharCode;
324 utf16_length += is_two_byte ? 2 : 1;
325 // Don't need to write to the buffer, but still need utf16_length.
326 if (!writing_to_buffer) continue;
327 // Write out the characters to the buffer.
328 // Use <= here: utf16_length has already been updated to include this character.
329 if (utf16_length <= buffer_length) {
330 if (is_two_byte) {
Yang 2012/12/20 09:20:27 misnomer?
331 *buffer++ = Utf16::LeadSurrogate(character);
332 *buffer++ = Utf16::TrailSurrogate(character);
333 } else {
334 *buffer++ = character;
335 }
336 if (utf16_length == buffer_length) {
337 // Just wrote last character of buffer
338 writing_to_buffer = false;
339 unbuffered_start_ = stream;
340 }
341 continue;
342 }
343 // Have gone over buffer.
344 // Last uint16_t of buffer is unused; the unbuffered data restarts at this character.
345 ASSERT(is_two_byte);
346 writing_to_buffer = false;
347 last_byte_of_buffer_unused_ = true;
348 unbuffered_start_ = stream - cursor;
349 }
350 utf16_length_ = utf16_length;
351 }
352
353
354 void Utf8DecoderBase::WriteUtf16Slow(const uint8_t* stream,
355 uint16_t* data,
356 unsigned data_length) {
357 while (data_length != 0) {
358 unsigned cursor = 0;
359 uint32_t character = Utf8::ValueOf(stream, Utf8::kMaxEncodedSize, &cursor);
360 // There's a total lack of bounds checking for stream
361 // as it was already done in Reset.
362 stream += cursor;
363 if (character > unibrow::Utf16::kMaxNonSurrogateCharCode) {
364 *data++ = Utf16::LeadSurrogate(character);
365 *data++ = Utf16::TrailSurrogate(character);
366 ASSERT(data_length > 1);
367 data_length -= 2;
368 } else {
369 *data++ = character;
370 data_length -= 1;
371 }
372 }
373 }
374
375
359 // Uppercase: point.category == 'Lu' 376 // Uppercase: point.category == 'Lu'
360 377
361 static const uint16_t kUppercaseTable0Size = 450; 378 static const uint16_t kUppercaseTable0Size = 450;
362 static const int32_t kUppercaseTable0[450] = { 379 static const int32_t kUppercaseTable0[450] = {
363 1073741889, 90, 1073742016, 214, 1073742040, 222, 256, 258, // NOLINT 380 1073741889, 90, 1073742016, 214, 1073742040, 222, 256, 258, // NOLINT
364 260, 262, 264, 266, 268, 270, 272, 274, // NOLINT 381 260, 262, 264, 266, 268, 270, 272, 274, // NOLINT
365 276, 278, 280, 282, 284, 286, 288, 290, // NOLINT 382 276, 278, 280, 282, 284, 286, 288, 290, // NOLINT
366 292, 294, 296, 298, 300, 302, 304, 306, // NOLINT 383 292, 294, 296, 298, 300, 302, 304, 306, // NOLINT
367 308, 310, 313, 315, 317, 319, 321, 323, // NOLINT 384 308, 310, 313, 315, 317, 319, 321, 323, // NOLINT
368 325, 327, 330, 332, 334, 336, 338, 340, // NOLINT 385 325, 327, 330, 332, 334, 336, 338, 340, // NOLINT
(...skipping 1493 matching lines...) Expand 10 before | Expand all | Expand 10 after
1862 + kEcma262UnCanonicalizeMultiStrings0Size * sizeof(MultiCharacterSpecialCase<4>) // NOLINT 1879 + kEcma262UnCanonicalizeMultiStrings0Size * sizeof(MultiCharacterSpecialCase<4>) // NOLINT
1863 + kEcma262UnCanonicalizeMultiStrings1Size * sizeof(MultiCharacterSpecialCase<2>) // NOLINT 1880 + kEcma262UnCanonicalizeMultiStrings1Size * sizeof(MultiCharacterSpecialCase<2>) // NOLINT
1864 + kEcma262UnCanonicalizeMultiStrings5Size * sizeof(MultiCharacterSpecialCase<2>) // NOLINT 1881 + kEcma262UnCanonicalizeMultiStrings5Size * sizeof(MultiCharacterSpecialCase<2>) // NOLINT
1865 + kEcma262UnCanonicalizeMultiStrings7Size * sizeof(MultiCharacterSpecialCase<2>) // NOLINT 1882 + kEcma262UnCanonicalizeMultiStrings7Size * sizeof(MultiCharacterSpecialCase<2>) // NOLINT
1866 + kCanonicalizationRangeMultiStrings0Size * sizeof(MultiCharacterSpecialCase<1>) // NOLINT 1883 + kCanonicalizationRangeMultiStrings0Size * sizeof(MultiCharacterSpecialCase<1>) // NOLINT
1867 + kCanonicalizationRangeMultiStrings1Size * sizeof(MultiCharacterSpecialCase<1>) // NOLINT 1884 + kCanonicalizationRangeMultiStrings1Size * sizeof(MultiCharacterSpecialCase<1>) // NOLINT
1868 + kCanonicalizationRangeMultiStrings7Size * sizeof(MultiCharacterSpecialCase<1>); // NOLINT 1885 + kCanonicalizationRangeMultiStrings7Size * sizeof(MultiCharacterSpecialCase<1>); // NOLINT
1869 } 1886 }
1870 1887
1871 } // namespace unicode 1888 } // namespace unicode
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698