src/unicode.cc - Issue 11649018: Remove Utf8InputBuffer

Side by Side Diff: src/unicode.cc

Issue 11649018: Remove Utf8InputBuffer (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge

Patch Set: Created 8 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2012 the V8 project authors. All rights reserved.	1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 259 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
270 return kBadChar;	270 return kBadChar;

271 }	271 }

272 *cursor += 4;	272 *cursor += 4;

273 return code_point;	273 return code_point;

274 }	274 }

275 *cursor += 1;	275 *cursor += 1;

276 return kBadChar;	276 return kBadChar;

277 }	277 }

278	278

279	279

280 const byte* Utf8::ReadBlock(Buffer<const char> str, byte* buffer,

281 unsigned capacity, unsigned* chars_read_ptr, unsigned* offset_ptr) {

282 unsigned offset = *offset_ptr;

283 // Bail out early if we've reached the end of the string.

284 if (offset == str.length()) {

285 *chars_read_ptr = 0;

286 return NULL;

287 }

288 const byte* data = reinterpret_cast<const byte*>(str.data());

289 if (data[offset] <= kMaxOneByteChar) {

290 // The next character is an ASCII char so we scan forward over

291 // the following ASCII characters and return the next pure ASCII

292 // substring

293 const byte* result = data + offset;

294 offset++;

295 while ((offset < str.length()) && (data[offset] <= kMaxOneByteChar))

296 offset++;

297 *chars_read_ptr = offset - *offset_ptr;

298 *offset_ptr = offset;

299 return result;

300 } else {

301 // The next character is non-ASCII so we just fill the buffer

302 unsigned cursor = 0;

303 unsigned chars_read = 0;

304 while (offset < str.length()) {

305 uchar c = data[offset];

306 if (c <= kMaxOneByteChar) {

307 // Fast case for ASCII characters

308 if (!CharacterStream::EncodeAsciiCharacter(c,

309 buffer,

310 capacity,

311 cursor))

312 break;

313 offset += 1;

314 } else {

315 unsigned chars = 0;

316 c = Utf8::ValueOf(data + offset, str.length() - offset, &chars);

317 if (!CharacterStream::EncodeNonAsciiCharacter(c,

318 buffer,

319 capacity,

320 cursor))

321 break;

322 offset += chars;

323 }

324 chars_read++;

325 }

326 *offset_ptr = offset;

327 *chars_read_ptr = chars_read;

328 return buffer;

329 }

330 }

331

332 unsigned CharacterStream::Length() {	280 unsigned CharacterStream::Length() {

333 unsigned result = 0;	281 unsigned result = 0;

334 while (has_more()) {	282 while (has_more()) {

335 result++;	283 result++;

336 GetNext();	284 GetNext();

337 }	285 }

338 Rewind();	286 Rewind();

339 return result;	287 return result;

340 }	288 }

341	289

342 unsigned CharacterStream::Utf16Length() {	290 unsigned CharacterStream::Utf16Length() {

343 unsigned result = 0;	291 unsigned result = 0;

344 while (has_more()) {	292 while (has_more()) {

345 uchar c = GetNext();	293 uchar c = GetNext();

346 result += c > Utf16::kMaxNonSurrogateCharCode ? 2 : 1;	294 result += c > Utf16::kMaxNonSurrogateCharCode ? 2 : 1;

347 }	295 }

348 Rewind();	296 Rewind();

349 return result;	297 return result;

350 }	298 }

351	299

352 void CharacterStream::Seek(unsigned position) {	300 void CharacterStream::Seek(unsigned position) {

353 Rewind();	301 Rewind();

354 for (unsigned i = 0; i < position; i++) {	302 for (unsigned i = 0; i < position; i++) {

355 GetNext();	303 GetNext();

356 }	304 }

357 }	305 }

358	306

	307 void Utf8DecoderBase::Reset(uint16_t* buffer,

	308 unsigned buffer_length,

	309 const uint8_t* stream,

	310 unsigned stream_length) {

	311 // Assume everything will fit in the buffer and stream won't be needed.

	312 last_byte_of_buffer_unused_ = false;

	313 unbuffered_start_ = NULL;

	314 bool writing_to_buffer = true;

	315 // Loop until stream is read, writing to buffer as long as buffer has space.

	316 unsigned utf16_length = 0;

	317 while (stream_length != 0) {

	318 unsigned cursor = 0;

	319 uint32_t character = Utf8::ValueOf(stream, stream_length, &cursor);

	320 ASSERT(cursor > 0 && cursor <= stream_length);

	321 stream += cursor;

	322 stream_length -= cursor;

	323 bool is_two_byte = character > Utf16::kMaxNonSurrogateCharCode;

	324 utf16_length += is_two_byte ? 2 : 1;

	325 // Don't need to write to the buffer, but still need utf16_length.

	326 if (!writing_to_buffer) continue;

	327 // Write out the characters to the buffer.

	328 // Must check for equality with buffer_length as we've already updated it.

	329 if (utf16_length <= buffer_length) {

	330 if (is_two_byte) {
	Yang 2012/12/20 09:20:27 misnomer?
	331 *buffer++ = Utf16::LeadSurrogate(character);

	332 *buffer++ = Utf16::TrailSurrogate(character);

	333 } else {

	334 *buffer++ = character;

	335 }

	336 if (utf16_length == buffer_length) {

	337 // Just wrote last character of buffer

	338 writing_to_buffer = false;

	339 unbuffered_start_ = stream;

	340 }

	341 continue;

	342 }

	343 // Have gone over buffer.

	344 // Last char of buffer is unused, set cursor back.

	345 ASSERT(is_two_byte);

	346 writing_to_buffer = false;

	347 last_byte_of_buffer_unused_ = true;

	348 unbuffered_start_ = stream - cursor;

	349 }

	350 utf16_length_ = utf16_length;

	351 }

	352

	353

	354 void Utf8DecoderBase::WriteUtf16Slow(const uint8_t* stream,

	355 uint16_t* data,

	356 unsigned data_length) {

	357 while (data_length != 0) {

	358 unsigned cursor = 0;

	359 uint32_t character = Utf8::ValueOf(stream, Utf8::kMaxEncodedSize, &cursor);

	360 // There's a total lack of bounds checking for stream

	361 // as it was already done in Reset.

	362 stream += cursor;

	363 if (character > unibrow::Utf16::kMaxNonSurrogateCharCode) {

	364 *data++ = Utf16::LeadSurrogate(character);

	365 *data++ = Utf16::TrailSurrogate(character);

	366 ASSERT(data_length > 1);

	367 data_length -= 2;

	368 } else {

	369 *data++ = character;

	370 data_length -= 1;

	371 }

	372 }

	373 }

	374

	375

359 // Uppercase: point.category == 'Lu'	376 // Uppercase: point.category == 'Lu'

360	377

361 static const uint16_t kUppercaseTable0Size = 450;	378 static const uint16_t kUppercaseTable0Size = 450;

362 static const int32_t kUppercaseTable0[450] = {	379 static const int32_t kUppercaseTable0[450] = {

363 1073741889, 90, 1073742016, 214, 1073742040, 222, 256, 258, // NOLINT	380 1073741889, 90, 1073742016, 214, 1073742040, 222, 256, 258, // NOLINT

364 260, 262, 264, 266, 268, 270, 272, 274, // NOLINT	381 260, 262, 264, 266, 268, 270, 272, 274, // NOLINT

365 276, 278, 280, 282, 284, 286, 288, 290, // NOLINT	382 276, 278, 280, 282, 284, 286, 288, 290, // NOLINT

366 292, 294, 296, 298, 300, 302, 304, 306, // NOLINT	383 292, 294, 296, 298, 300, 302, 304, 306, // NOLINT

367 308, 310, 313, 315, 317, 319, 321, 323, // NOLINT	384 308, 310, 313, 315, 317, 319, 321, 323, // NOLINT

368 325, 327, 330, 332, 334, 336, 338, 340, // NOLINT	385 325, 327, 330, 332, 334, 336, 338, 340, // NOLINT

(...skipping 1493 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
1862 + kEcma262UnCanonicalizeMultiStrings0Size * sizeof(MultiCharacterSpecialCase<4>) // NOLINT	1879 + kEcma262UnCanonicalizeMultiStrings0Size * sizeof(MultiCharacterSpecialCase<4>) // NOLINT

1863 + kEcma262UnCanonicalizeMultiStrings1Size * sizeof(MultiCharacterSpecialCase<2>) // NOLINT	1880 + kEcma262UnCanonicalizeMultiStrings1Size * sizeof(MultiCharacterSpecialCase<2>) // NOLINT

1864 + kEcma262UnCanonicalizeMultiStrings5Size * sizeof(MultiCharacterSpecialCase<2>) // NOLINT	1881 + kEcma262UnCanonicalizeMultiStrings5Size * sizeof(MultiCharacterSpecialCase<2>) // NOLINT

1865 + kEcma262UnCanonicalizeMultiStrings7Size * sizeof(MultiCharacterSpecialCase<2>) // NOLINT	1882 + kEcma262UnCanonicalizeMultiStrings7Size * sizeof(MultiCharacterSpecialCase<2>) // NOLINT

1866 + kCanonicalizationRangeMultiStrings0Size * sizeof(MultiCharacterSpecialCase<1>) // NOLINT	1883 + kCanonicalizationRangeMultiStrings0Size * sizeof(MultiCharacterSpecialCase<1>) // NOLINT

1867 + kCanonicalizationRangeMultiStrings1Size * sizeof(MultiCharacterSpecialCase<1>) // NOLINT	1884 + kCanonicalizationRangeMultiStrings1Size * sizeof(MultiCharacterSpecialCase<1>) // NOLINT

1868 + kCanonicalizationRangeMultiStrings7Size * sizeof(MultiCharacterSpecialCase<1>); // NOLINT	1885 + kCanonicalizationRangeMultiStrings7Size * sizeof(MultiCharacterSpecialCase<1>); // NOLINT

1869 }	1886 }

1870	1887

1871 } // namespace unicode	1888 } // namespace unicode

OLD	NEW

« src/objects.cc ('K') | « src/unicode.h ('k') | src/unicode-inl.h » ('j') | src/unicode-inl.h » ('J')