OLD | NEW |
---|---|
1 // Copyright 2012 the V8 project authors. All rights reserved. | 1 // Copyright 2012 the V8 project authors. All rights reserved. |
2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
4 // met: | 4 // met: |
5 // | 5 // |
6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
(...skipping 259 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
270 return kBadChar; | 270 return kBadChar; |
271 } | 271 } |
272 *cursor += 4; | 272 *cursor += 4; |
273 return code_point; | 273 return code_point; |
274 } | 274 } |
275 *cursor += 1; | 275 *cursor += 1; |
276 return kBadChar; | 276 return kBadChar; |
277 } | 277 } |
278 | 278 |
279 | 279 |
280 const byte* Utf8::ReadBlock(Buffer<const char*> str, byte* buffer, | |
281 unsigned capacity, unsigned* chars_read_ptr, unsigned* offset_ptr) { | |
282 unsigned offset = *offset_ptr; | |
283 // Bail out early if we've reached the end of the string. | |
284 if (offset == str.length()) { | |
285 *chars_read_ptr = 0; | |
286 return NULL; | |
287 } | |
288 const byte* data = reinterpret_cast<const byte*>(str.data()); | |
289 if (data[offset] <= kMaxOneByteChar) { | |
290 // The next character is an ASCII char so we scan forward over | |
291 // the following ASCII characters and return the next pure ASCII | |
292 // substring | |
293 const byte* result = data + offset; | |
294 offset++; | |
295 while ((offset < str.length()) && (data[offset] <= kMaxOneByteChar)) | |
296 offset++; | |
297 *chars_read_ptr = offset - *offset_ptr; | |
298 *offset_ptr = offset; | |
299 return result; | |
300 } else { | |
301 // The next character is non-ASCII so we just fill the buffer | |
302 unsigned cursor = 0; | |
303 unsigned chars_read = 0; | |
304 while (offset < str.length()) { | |
305 uchar c = data[offset]; | |
306 if (c <= kMaxOneByteChar) { | |
307 // Fast case for ASCII characters | |
308 if (!CharacterStream::EncodeAsciiCharacter(c, | |
309 buffer, | |
310 capacity, | |
311 cursor)) | |
312 break; | |
313 offset += 1; | |
314 } else { | |
315 unsigned chars = 0; | |
316 c = Utf8::ValueOf(data + offset, str.length() - offset, &chars); | |
317 if (!CharacterStream::EncodeNonAsciiCharacter(c, | |
318 buffer, | |
319 capacity, | |
320 cursor)) | |
321 break; | |
322 offset += chars; | |
323 } | |
324 chars_read++; | |
325 } | |
326 *offset_ptr = offset; | |
327 *chars_read_ptr = chars_read; | |
328 return buffer; | |
329 } | |
330 } | |
331 | |
332 unsigned CharacterStream::Length() { | 280 unsigned CharacterStream::Length() { |
333 unsigned result = 0; | 281 unsigned result = 0; |
334 while (has_more()) { | 282 while (has_more()) { |
335 result++; | 283 result++; |
336 GetNext(); | 284 GetNext(); |
337 } | 285 } |
338 Rewind(); | 286 Rewind(); |
339 return result; | 287 return result; |
340 } | 288 } |
341 | 289 |
342 unsigned CharacterStream::Utf16Length() { | 290 unsigned CharacterStream::Utf16Length() { |
343 unsigned result = 0; | 291 unsigned result = 0; |
344 while (has_more()) { | 292 while (has_more()) { |
345 uchar c = GetNext(); | 293 uchar c = GetNext(); |
346 result += c > Utf16::kMaxNonSurrogateCharCode ? 2 : 1; | 294 result += c > Utf16::kMaxNonSurrogateCharCode ? 2 : 1; |
347 } | 295 } |
348 Rewind(); | 296 Rewind(); |
349 return result; | 297 return result; |
350 } | 298 } |
351 | 299 |
352 void CharacterStream::Seek(unsigned position) { | 300 void CharacterStream::Seek(unsigned position) { |
353 Rewind(); | 301 Rewind(); |
354 for (unsigned i = 0; i < position; i++) { | 302 for (unsigned i = 0; i < position; i++) { |
355 GetNext(); | 303 GetNext(); |
356 } | 304 } |
357 } | 305 } |
358 | 306 |
307 void Utf8DecoderBase::Reset(uint16_t* buffer, | |
308 unsigned buffer_length, | |
309 const uint8_t* stream, | |
310 unsigned stream_length) { | |
311 // Assume everything will fit in the buffer and stream won't be needed. | |
312 last_byte_of_buffer_unused_ = false; | |
313 unbuffered_start_ = NULL; | |
314 bool writing_to_buffer = true; | |
315 // Loop until stream is read, writing to buffer as long as buffer has space. | |
316 unsigned utf16_length = 0; | |
317 while (stream_length != 0) { | |
318 unsigned cursor = 0; | |
319 uint32_t character = Utf8::ValueOf(stream, stream_length, &cursor); | |
320 ASSERT(cursor > 0 && cursor <= stream_length); | |
321 stream += cursor; | |
322 stream_length -= cursor; | |
323 bool is_two_byte = character > Utf16::kMaxNonSurrogateCharCode; | |
324 utf16_length += is_two_byte ? 2 : 1; | |
325 // Don't need to write to the buffer, but still need utf16_length. | |
326 if (!writing_to_buffer) continue; | |
327 // Write out the characters to the buffer. | |
328 // Must check for equality with buffer_length as we've already updated it. | |
329 if (utf16_length <= buffer_length) { | |
330 if (is_two_byte) { | |
[Review comment on NEW line 330 — Yang, 2012/12/20 09:20:27: "misnomer?"]
331 *buffer++ = Utf16::LeadSurrogate(character); | |
332 *buffer++ = Utf16::TrailSurrogate(character); | |
333 } else { | |
334 *buffer++ = character; | |
335 } | |
336 if (utf16_length == buffer_length) { | |
337 // Just wrote last character of buffer | |
338 writing_to_buffer = false; | |
339 unbuffered_start_ = stream; | |
340 } | |
341 continue; | |
342 } | |
343 // Have gone over buffer. | |
344 // Last char of buffer is unused, set cursor back. | |
345 ASSERT(is_two_byte); | |
346 writing_to_buffer = false; | |
347 last_byte_of_buffer_unused_ = true; | |
348 unbuffered_start_ = stream - cursor; | |
349 } | |
350 utf16_length_ = utf16_length; | |
351 } | |
352 | |
353 | |
354 void Utf8DecoderBase::WriteUtf16Slow(const uint8_t* stream, | |
355 uint16_t* data, | |
356 unsigned data_length) { | |
357 while (data_length != 0) { | |
358 unsigned cursor = 0; | |
359 uint32_t character = Utf8::ValueOf(stream, Utf8::kMaxEncodedSize, &cursor); | |
360 // There's a total lack of bounds checking for stream | |
361 // as it was already done in Reset. | |
362 stream += cursor; | |
363 if (character > unibrow::Utf16::kMaxNonSurrogateCharCode) { | |
364 *data++ = Utf16::LeadSurrogate(character); | |
365 *data++ = Utf16::TrailSurrogate(character); | |
366 ASSERT(data_length > 1); | |
367 data_length -= 2; | |
368 } else { | |
369 *data++ = character; | |
370 data_length -= 1; | |
371 } | |
372 } | |
373 } | |
374 | |
375 | |
359 // Uppercase: point.category == 'Lu' | 376 // Uppercase: point.category == 'Lu' |
360 | 377 |
361 static const uint16_t kUppercaseTable0Size = 450; | 378 static const uint16_t kUppercaseTable0Size = 450; |
362 static const int32_t kUppercaseTable0[450] = { | 379 static const int32_t kUppercaseTable0[450] = { |
363 1073741889, 90, 1073742016, 214, 1073742040, 222, 256, 258, // NOLINT | 380 1073741889, 90, 1073742016, 214, 1073742040, 222, 256, 258, // NOLINT |
364 260, 262, 264, 266, 268, 270, 272, 274, // NOLINT | 381 260, 262, 264, 266, 268, 270, 272, 274, // NOLINT |
365 276, 278, 280, 282, 284, 286, 288, 290, // NOLINT | 382 276, 278, 280, 282, 284, 286, 288, 290, // NOLINT |
366 292, 294, 296, 298, 300, 302, 304, 306, // NOLINT | 383 292, 294, 296, 298, 300, 302, 304, 306, // NOLINT |
367 308, 310, 313, 315, 317, 319, 321, 323, // NOLINT | 384 308, 310, 313, 315, 317, 319, 321, 323, // NOLINT |
368 325, 327, 330, 332, 334, 336, 338, 340, // NOLINT | 385 325, 327, 330, 332, 334, 336, 338, 340, // NOLINT |
(...skipping 1493 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1862 + kEcma262UnCanonicalizeMultiStrings0Size * sizeof(MultiCharacterSpecialCase<4>) // NOLINT | 1879 + kEcma262UnCanonicalizeMultiStrings0Size * sizeof(MultiCharacterSpecialCase<4>) // NOLINT |
1863 + kEcma262UnCanonicalizeMultiStrings1Size * sizeof(MultiCharacterSpecialCase<2>) // NOLINT | 1880 + kEcma262UnCanonicalizeMultiStrings1Size * sizeof(MultiCharacterSpecialCase<2>) // NOLINT |
1864 + kEcma262UnCanonicalizeMultiStrings5Size * sizeof(MultiCharacterSpecialCase<2>) // NOLINT | 1881 + kEcma262UnCanonicalizeMultiStrings5Size * sizeof(MultiCharacterSpecialCase<2>) // NOLINT |
1865 + kEcma262UnCanonicalizeMultiStrings7Size * sizeof(MultiCharacterSpecialCase<2>) // NOLINT | 1882 + kEcma262UnCanonicalizeMultiStrings7Size * sizeof(MultiCharacterSpecialCase<2>) // NOLINT |
1866 + kCanonicalizationRangeMultiStrings0Size * sizeof(MultiCharacterSpecialCase<1>) // NOLINT | 1883 + kCanonicalizationRangeMultiStrings0Size * sizeof(MultiCharacterSpecialCase<1>) // NOLINT |
1867 + kCanonicalizationRangeMultiStrings1Size * sizeof(MultiCharacterSpecialCase<1>) // NOLINT | 1884 + kCanonicalizationRangeMultiStrings1Size * sizeof(MultiCharacterSpecialCase<1>) // NOLINT |
1868 + kCanonicalizationRangeMultiStrings7Size * sizeof(MultiCharacterSpecialCase<1>); // NOLINT | 1885 + kCanonicalizationRangeMultiStrings7Size * sizeof(MultiCharacterSpecialCase<1>); // NOLINT |
1869 } | 1886 } |
1870 | 1887 |
1871 } // namespace unicode | 1888 } // namespace unicode |
OLD | NEW |