OLD | NEW |
1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 part of dart.convert; | 5 part of dart.convert; |
6 | 6 |
7 /** The Unicode Replacement character `U+FFFD` (�). */ | 7 /** The Unicode Replacement character `U+FFFD` (�). */ |
8 const UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD; | 8 const UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD; |
9 | 9 |
| 10 /** The Unicode Byte Order Marker (BOM) character `U+FEFF`. */ |
| 11 const UNICODE_BOM_CHARACTER_RUNE = 0xFEFF; |
| 12 |
10 /** | 13 /** |
11 * An instance of the default implementation of the [Utf8Codec]. | 14 * An instance of the default implementation of the [Utf8Codec]. |
12 * | 15 * |
13 * This instance provides a convenient access to the most common UTF-8 | 16 * This instance provides a convenient access to the most common UTF-8 |
14 * use cases. | 17 * use cases. |
15 * | 18 * |
16 * Examples: | 19 * Examples: |
17 * | 20 * |
18 * var encoded = UTF8.encode("Îñţérñåţîöñåļîžåţîờñ"); | 21 * var encoded = UTF8.encode("Îñţérñåţîöñåļîžåţîờñ"); |
19 * var decoded = UTF8.decode([0x62, 0x6c, 0xc3, 0xa5, 0x62, 0xc3, 0xa6, | 22 * var decoded = UTF8.decode([0x62, 0x6c, 0xc3, 0xa5, 0x62, 0xc3, 0xa6, |
(...skipping 21 matching lines...) Expand all Loading... |
41 */ | 44 */ |
42 const Utf8Codec({ bool allowMalformed: false }) | 45 const Utf8Codec({ bool allowMalformed: false }) |
43 : _allowMalformed = allowMalformed; | 46 : _allowMalformed = allowMalformed; |
44 | 47 |
45 String get name => "utf-8"; | 48 String get name => "utf-8"; |
46 | 49 |
47 /** | 50 /** |
48 * Decodes the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the | 51 * Decodes the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the |
49 * corresponding string. | 52 * corresponding string. |
50 * | 53 * |
| 54 * If the [codeUnits] start with a leading [UNICODE_BOM_CHARACTER_RUNE] this |
| 55 * character is discarded. |
| 56 * |
51 * If [allowMalformed] is `true` the decoder replaces invalid (or | 57 * If [allowMalformed] is `true` the decoder replaces invalid (or |
52 * unterminated) character sequences with the Unicode Replacement character | 58 * unterminated) character sequences with the Unicode Replacement character |
53 * `U+FFFD` (�). Otherwise it throws a [FormatException]. | 59 * `U+FFFD` (�). Otherwise it throws a [FormatException]. |
54 * | 60 * |
55 * If [allowMalformed] is not given, it defaults to the `allowMalformed` that | 61 * If [allowMalformed] is not given, it defaults to the `allowMalformed` that |
56 * was used to instantiate `this`. | 62 * was used to instantiate `this`. |
57 */ | 63 */ |
58 String decode(List<int> codeUnits, { bool allowMalformed }) { | 64 String decode(List<int> codeUnits, { bool allowMalformed }) { |
59 if (allowMalformed == null) allowMalformed = _allowMalformed; | 65 if (allowMalformed == null) allowMalformed = _allowMalformed; |
60 return new Utf8Decoder(allowMalformed: allowMalformed).convert(codeUnits); | 66 return new Utf8Decoder(allowMalformed: allowMalformed).convert(codeUnits); |
(...skipping 235 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
296 * If it is `true` [convert] replaces invalid (or unterminated) character | 302 * If it is `true` [convert] replaces invalid (or unterminated) character |
297 * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise | 303 * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise |
298 * it throws a [FormatException]. | 304 * it throws a [FormatException]. |
299 */ | 305 */ |
300 const Utf8Decoder({ bool allowMalformed: false }) | 306 const Utf8Decoder({ bool allowMalformed: false }) |
301 : this._allowMalformed = allowMalformed; | 307 : this._allowMalformed = allowMalformed; |
302 | 308 |
303 /** | 309 /** |
304 * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the | 310 * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the |
305 * corresponding string. | 311 * corresponding string. |
| 312 * |
| 313 * If the [codeUnits] start with a leading [UNICODE_BOM_CHARACTER_RUNE] this |
| 314 * character is discarded. |
306 */ | 315 */ |
307 String convert(List<int> codeUnits) { | 316 String convert(List<int> codeUnits) { |
308 StringBuffer buffer = new StringBuffer(); | 317 StringBuffer buffer = new StringBuffer(); |
309 _Utf8Decoder decoder = new _Utf8Decoder(buffer, _allowMalformed); | 318 _Utf8Decoder decoder = new _Utf8Decoder(buffer, _allowMalformed); |
310 decoder.convert(codeUnits, 0, codeUnits.length); | 319 decoder.convert(codeUnits, 0, codeUnits.length); |
311 decoder.close(); | 320 decoder.close(); |
312 return buffer.toString(); | 321 return buffer.toString(); |
313 } | 322 } |
314 | 323 |
315 /** | 324 /** |
(...skipping 23 matching lines...) Expand all Loading... |
339 const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits | 348 const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits |
340 const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max. | 349 const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max. |
341 | 350 |
342 // UTF-16 constants. | 351 // UTF-16 constants. |
343 const int _SURROGATE_MASK = 0xF800; | 352 const int _SURROGATE_MASK = 0xF800; |
344 const int _SURROGATE_TAG_MASK = 0xFC00; | 353 const int _SURROGATE_TAG_MASK = 0xFC00; |
345 const int _SURROGATE_VALUE_MASK = 0x3FF; | 354 const int _SURROGATE_VALUE_MASK = 0x3FF; |
346 const int _LEAD_SURROGATE_MIN = 0xD800; | 355 const int _LEAD_SURROGATE_MIN = 0xD800; |
347 const int _TAIL_SURROGATE_MIN = 0xDC00; | 356 const int _TAIL_SURROGATE_MIN = 0xDC00; |
348 | 357 |
349 const int _REPLACEMENT_CHARACTER = 0xFFFD; | |
350 const int _BOM_CHARACTER = 0xFEFF; | |
351 | |
352 bool _isSurrogate(int codeUnit) => | 358 bool _isSurrogate(int codeUnit) => |
353 (codeUnit & _SURROGATE_MASK) == _LEAD_SURROGATE_MIN; | 359 (codeUnit & _SURROGATE_MASK) == _LEAD_SURROGATE_MIN; |
354 bool _isLeadSurrogate(int codeUnit) => | 360 bool _isLeadSurrogate(int codeUnit) => |
355 (codeUnit & _SURROGATE_TAG_MASK) == _LEAD_SURROGATE_MIN; | 361 (codeUnit & _SURROGATE_TAG_MASK) == _LEAD_SURROGATE_MIN; |
356 bool _isTailSurrogate(int codeUnit) => | 362 bool _isTailSurrogate(int codeUnit) => |
357 (codeUnit & _SURROGATE_TAG_MASK) == _TAIL_SURROGATE_MIN; | 363 (codeUnit & _SURROGATE_TAG_MASK) == _TAIL_SURROGATE_MIN; |
358 int _combineSurrogatePair(int lead, int tail) => | 364 int _combineSurrogatePair(int lead, int tail) => |
359 0x10000 | ((lead & _SURROGATE_VALUE_MASK) << 10) | 365 0x10000 + ((lead & _SURROGATE_VALUE_MASK) << 10) |
360 | (tail & _SURROGATE_VALUE_MASK); | 366 | (tail & _SURROGATE_VALUE_MASK); |
361 | 367 |
362 | 368 |
363 /** | 369 /** |
364 * Decodes UTF-8. | 370 * Decodes UTF-8. |
365 * | 371 * |
366 * The decoder handles chunked input. | 372 * The decoder handles chunked input. |
367 */ | 373 */ |
368 // TODO(floitsch): make this class public. | 374 // TODO(floitsch): make this class public. |
369 class _Utf8Decoder { | 375 class _Utf8Decoder { |
(...skipping 23 matching lines...) Expand all Loading... |
393 * Flushes this decoder as if closed. | 399 * Flushes this decoder as if closed. |
394 * | 400 * |
395 * This method throws if the input was partial and the decoder was | 401 * This method throws if the input was partial and the decoder was |
396 * constructed with `allowMalformed` set to `false`. | 402 * constructed with `allowMalformed` set to `false`. |
397 */ | 403 */ |
398 void flush() { | 404 void flush() { |
399 if (hasPartialInput) { | 405 if (hasPartialInput) { |
400 if (!_allowMalformed) { | 406 if (!_allowMalformed) { |
401 throw new FormatException("Unfinished UTF-8 octet sequence"); | 407 throw new FormatException("Unfinished UTF-8 octet sequence"); |
402 } | 408 } |
403 _stringSink.writeCharCode(_REPLACEMENT_CHARACTER); | 409 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE); |
404 _value = 0; | 410 _value = 0; |
405 _expectedUnits = 0; | 411 _expectedUnits = 0; |
406 _extraUnits = 0; | 412 _extraUnits = 0; |
407 } | 413 } |
408 } | 414 } |
409 | 415 |
410 void convert(List<int> codeUnits, int startIndex, int endIndex) { | 416 void convert(List<int> codeUnits, int startIndex, int endIndex) { |
411 int value = _value; | 417 int value = _value; |
412 int expectedUnits = _expectedUnits; | 418 int expectedUnits = _expectedUnits; |
413 int extraUnits = _extraUnits; | 419 int extraUnits = _extraUnits; |
414 _value = 0; | 420 _value = 0; |
415 _expectedUnits = 0; | 421 _expectedUnits = 0; |
416 _extraUnits = 0; | 422 _extraUnits = 0; |
417 | 423 |
418 int i = startIndex; | 424 int i = startIndex; |
419 loop: while (true) { | 425 loop: while (true) { |
420 multibyte: if (expectedUnits > 0) { | 426 multibyte: if (expectedUnits > 0) { |
421 do { | 427 do { |
422 if (i == endIndex) { | 428 if (i == endIndex) { |
423 break loop; | 429 break loop; |
424 } | 430 } |
425 int unit = codeUnits[i]; | 431 int unit = codeUnits[i]; |
426 if ((unit & 0xC0) != 0x80) { | 432 if ((unit & 0xC0) != 0x80) { |
427 expectedUnits = 0; | 433 expectedUnits = 0; |
428 if (!_allowMalformed) { | 434 if (!_allowMalformed) { |
429 throw new FormatException( | 435 throw new FormatException( |
430 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}"); | 436 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}"); |
431 } | 437 } |
432 _isFirstCharacter = false; | 438 _isFirstCharacter = false; |
433 _stringSink.writeCharCode(_REPLACEMENT_CHARACTER); | 439 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE); |
434 break multibyte; | 440 break multibyte; |
435 } else { | 441 } else { |
436 value = (value << 6) | (unit & 0x3f); | 442 value = (value << 6) | (unit & 0x3f); |
437 expectedUnits--; | 443 expectedUnits--; |
438 i++; | 444 i++; |
439 } | 445 } |
440 } while (expectedUnits > 0); | 446 } while (expectedUnits > 0); |
441 if (value <= _LIMITS[extraUnits - 1]) { | 447 if (value <= _LIMITS[extraUnits - 1]) { |
442 // Overly long encoding. The value could be encoded with a shorter | 448 // Overly long encoding. The value could be encoded with a shorter |
443 // encoding. | 449 // encoding. |
444 if (!_allowMalformed) { | 450 if (!_allowMalformed) { |
445 throw new FormatException( | 451 throw new FormatException( |
446 "Overlong encoding of 0x${value.toRadixString(16)}"); | 452 "Overlong encoding of 0x${value.toRadixString(16)}"); |
447 } | 453 } |
448 expectedUnits = extraUnits = 0; | 454 expectedUnits = extraUnits = 0; |
449 value = _REPLACEMENT_CHARACTER; | 455 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; |
450 } | 456 } |
451 if (value > _FOUR_BYTE_LIMIT) { | 457 if (value > _FOUR_BYTE_LIMIT) { |
452 if (!_allowMalformed) { | 458 if (!_allowMalformed) { |
453 throw new FormatException("Character outside valid Unicode range: " | 459 throw new FormatException("Character outside valid Unicode range: " |
454 "0x${value.toRadixString(16)}"); | 460 "0x${value.toRadixString(16)}"); |
455 } | 461 } |
456 value = _REPLACEMENT_CHARACTER; | 462 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; |
457 } | 463 } |
458 if (!_isFirstCharacter || value != _BOM_CHARACTER) { | 464 if (!_isFirstCharacter || value != UNICODE_BOM_CHARACTER_RUNE) { |
459 _stringSink.writeCharCode(value); | 465 _stringSink.writeCharCode(value); |
460 } | 466 } |
461 _isFirstCharacter = false; | 467 _isFirstCharacter = false; |
462 } | 468 } |
463 | 469 |
464 while (i < endIndex) { | 470 while (i < endIndex) { |
465 int unit = codeUnits[i++]; | 471 int unit = codeUnits[i++]; |
466 // TODO(floitsch): the way we test we could potentially allow | 472 // TODO(floitsch): the way we test we could potentially allow |
467 // units that are too large, if they happen to have the | 473 // units that are too large, if they happen to have the |
468 // right bit-pattern. (Same is true for the multibyte loop above). | 474 // right bit-pattern. (Same is true for the multibyte loop above). |
469 // TODO(floitsch): optimize this loop. See: | 475 // TODO(floitsch): optimize this loop. See: |
470 // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.d
art?column_width=80 | 476 // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.d
art?column_width=80 |
471 if (unit < 0) { | 477 if (unit < 0) { |
472 // TODO(floitsch): should this be unit <= 0 ? | 478 // TODO(floitsch): should this be unit <= 0 ? |
473 if (!_allowMalformed) { | 479 if (!_allowMalformed) { |
474 throw new FormatException( | 480 throw new FormatException( |
475 "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}"); | 481 "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}"); |
476 } | 482 } |
477 _stringSink.writeCharCode(_REPLACEMENT_CHARACTER); | 483 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE); |
478 } else if (unit <= _ONE_BYTE_LIMIT) { | 484 } else if (unit <= _ONE_BYTE_LIMIT) { |
479 _isFirstCharacter = false; | 485 _isFirstCharacter = false; |
480 _stringSink.writeCharCode(unit); | 486 _stringSink.writeCharCode(unit); |
481 } else { | 487 } else { |
482 if ((unit & 0xE0) == 0xC0) { | 488 if ((unit & 0xE0) == 0xC0) { |
483 value = unit & 0x1F; | 489 value = unit & 0x1F; |
484 expectedUnits = extraUnits = 1; | 490 expectedUnits = extraUnits = 1; |
485 continue loop; | 491 continue loop; |
486 } | 492 } |
487 if ((unit & 0xF0) == 0xE0) { | 493 if ((unit & 0xF0) == 0xE0) { |
488 value = unit & 0x0F; | 494 value = unit & 0x0F; |
489 expectedUnits = extraUnits = 2; | 495 expectedUnits = extraUnits = 2; |
490 continue loop; | 496 continue loop; |
491 } | 497 } |
492 // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences. | 498 // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences. |
493 if ((unit & 0xF8) == 0xF0 && unit < 0xF5) { | 499 if ((unit & 0xF8) == 0xF0 && unit < 0xF5) { |
494 value = unit & 0x07; | 500 value = unit & 0x07; |
495 expectedUnits = extraUnits = 3; | 501 expectedUnits = extraUnits = 3; |
496 continue loop; | 502 continue loop; |
497 } | 503 } |
498 if (!_allowMalformed) { | 504 if (!_allowMalformed) { |
499 throw new FormatException( | 505 throw new FormatException( |
500 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}"); | 506 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}"); |
501 } | 507 } |
502 value = _REPLACEMENT_CHARACTER; | 508 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; |
503 expectedUnits = extraUnits = 0; | 509 expectedUnits = extraUnits = 0; |
504 _isFirstCharacter = false; | 510 _isFirstCharacter = false; |
505 _stringSink.writeCharCode(value); | 511 _stringSink.writeCharCode(value); |
506 } | 512 } |
507 } | 513 } |
508 break loop; | 514 break loop; |
509 } | 515 } |
510 if (expectedUnits > 0) { | 516 if (expectedUnits > 0) { |
511 _value = value; | 517 _value = value; |
512 _expectedUnits = expectedUnits; | 518 _expectedUnits = expectedUnits; |
513 _extraUnits = extraUnits; | 519 _extraUnits = extraUnits; |
514 } | 520 } |
515 } | 521 } |
516 } | 522 } |
OLD | NEW |