| OLD | NEW |
| 1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file |
| 2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
| 3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
| 4 | 4 |
| 5 part of dart.convert; | 5 part of dart.convert; |
| 6 | 6 |
| 7 /** The Unicode Replacement character `U+FFFD` (�). */ | 7 /** The Unicode Replacement character `U+FFFD` (�). */ |
| 8 const UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD; | 8 const UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD; |
| 9 | 9 |
| 10 /** The Unicode Byte Order Marker (BOM) character `U+FEFF`. */ |
| 11 const UNICODE_BOM_CHARACTER_RUNE = 0xFEFF; |
| 12 |
| 10 /** | 13 /** |
| 11 * An instance of the default implementation of the [Utf8Codec]. | 14 * An instance of the default implementation of the [Utf8Codec]. |
| 12 * | 15 * |
| 13 * This instance provides a convenient access to the most common UTF-8 | 16 * This instance provides a convenient access to the most common UTF-8 |
| 14 * use cases. | 17 * use cases. |
| 15 * | 18 * |
| 16 * Examples: | 19 * Examples: |
| 17 * | 20 * |
| 18 * var encoded = UTF8.encode("Îñţérñåţîöñåļîžåţîờñ"); | 21 * var encoded = UTF8.encode("Îñţérñåţîöñåļîžåţîờñ"); |
| 19 * var decoded = UTF8.decode([0x62, 0x6c, 0xc3, 0xa5, 0x62, 0xc3, 0xa6, | 22 * var decoded = UTF8.decode([0x62, 0x6c, 0xc3, 0xa5, 0x62, 0xc3, 0xa6, |
| (...skipping 21 matching lines...) Expand all Loading... |
| 41 */ | 44 */ |
| 42 const Utf8Codec({ bool allowMalformed: false }) | 45 const Utf8Codec({ bool allowMalformed: false }) |
| 43 : _allowMalformed = allowMalformed; | 46 : _allowMalformed = allowMalformed; |
| 44 | 47 |
| 45 String get name => "utf-8"; | 48 String get name => "utf-8"; |
| 46 | 49 |
| 47 /** | 50 /** |
| 48 * Decodes the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the | 51 * Decodes the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the |
| 49 * corresponding string. | 52 * corresponding string. |
| 50 * | 53 * |
| 54 * If the [codeUnits] start with a leading [UNICODE_BOM_CHARACTER_RUNE] this |
| 55 * character is discarded. |
| 56 * |
| 51 * If [allowMalformed] is `true` the decoder replaces invalid (or | 57 * If [allowMalformed] is `true` the decoder replaces invalid (or |
| 52 * unterminated) character sequences with the Unicode Replacement character | 58 * unterminated) character sequences with the Unicode Replacement character |
| 53 * `U+FFFD` (�). Otherwise it throws a [FormatException]. | 59 * `U+FFFD` (�). Otherwise it throws a [FormatException]. |
| 54 * | 60 * |
| 55 * If [allowMalformed] is not given, it defaults to the `allowMalformed` that | 61 * If [allowMalformed] is not given, it defaults to the `allowMalformed` that |
| 56 * was used to instantiate `this`. | 62 * was used to instantiate `this`. |
| 57 */ | 63 */ |
| 58 String decode(List<int> codeUnits, { bool allowMalformed }) { | 64 String decode(List<int> codeUnits, { bool allowMalformed }) { |
| 59 if (allowMalformed == null) allowMalformed = _allowMalformed; | 65 if (allowMalformed == null) allowMalformed = _allowMalformed; |
| 60 return new Utf8Decoder(allowMalformed: allowMalformed).convert(codeUnits); | 66 return new Utf8Decoder(allowMalformed: allowMalformed).convert(codeUnits); |
| (...skipping 235 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 296 * If it is `true` [convert] replaces invalid (or unterminated) character | 302 * If it is `true` [convert] replaces invalid (or unterminated) character |
| 297 * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise | 303 * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise |
| 298 * it throws a [FormatException]. | 304 * it throws a [FormatException]. |
| 299 */ | 305 */ |
| 300 const Utf8Decoder({ bool allowMalformed: false }) | 306 const Utf8Decoder({ bool allowMalformed: false }) |
| 301 : this._allowMalformed = allowMalformed; | 307 : this._allowMalformed = allowMalformed; |
| 302 | 308 |
| 303 /** | 309 /** |
| 304 * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the | 310 * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the |
| 305 * corresponding string. | 311 * corresponding string. |
| 312 * |
| 313 * If the [codeUnits] start with a leading [UNICODE_BOM_CHARACTER_RUNE] this |
| 314 * character is discarded. |
| 306 */ | 315 */ |
| 307 String convert(List<int> codeUnits) { | 316 String convert(List<int> codeUnits) { |
| 308 StringBuffer buffer = new StringBuffer(); | 317 StringBuffer buffer = new StringBuffer(); |
| 309 _Utf8Decoder decoder = new _Utf8Decoder(buffer, _allowMalformed); | 318 _Utf8Decoder decoder = new _Utf8Decoder(buffer, _allowMalformed); |
| 310 decoder.convert(codeUnits, 0, codeUnits.length); | 319 decoder.convert(codeUnits, 0, codeUnits.length); |
| 311 decoder.close(); | 320 decoder.close(); |
| 312 return buffer.toString(); | 321 return buffer.toString(); |
| 313 } | 322 } |
| 314 | 323 |
| 315 /** | 324 /** |
| (...skipping 23 matching lines...) Expand all Loading... |
| 339 const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits | 348 const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits |
| 340 const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max. | 349 const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max. |
| 341 | 350 |
| 342 // UTF-16 constants. | 351 // UTF-16 constants. |
| 343 const int _SURROGATE_MASK = 0xF800; | 352 const int _SURROGATE_MASK = 0xF800; |
| 344 const int _SURROGATE_TAG_MASK = 0xFC00; | 353 const int _SURROGATE_TAG_MASK = 0xFC00; |
| 345 const int _SURROGATE_VALUE_MASK = 0x3FF; | 354 const int _SURROGATE_VALUE_MASK = 0x3FF; |
| 346 const int _LEAD_SURROGATE_MIN = 0xD800; | 355 const int _LEAD_SURROGATE_MIN = 0xD800; |
| 347 const int _TAIL_SURROGATE_MIN = 0xDC00; | 356 const int _TAIL_SURROGATE_MIN = 0xDC00; |
| 348 | 357 |
| 349 const int _REPLACEMENT_CHARACTER = 0xFFFD; | |
| 350 const int _BOM_CHARACTER = 0xFEFF; | |
| 351 | |
| 352 bool _isSurrogate(int codeUnit) => | 358 bool _isSurrogate(int codeUnit) => |
| 353 (codeUnit & _SURROGATE_MASK) == _LEAD_SURROGATE_MIN; | 359 (codeUnit & _SURROGATE_MASK) == _LEAD_SURROGATE_MIN; |
| 354 bool _isLeadSurrogate(int codeUnit) => | 360 bool _isLeadSurrogate(int codeUnit) => |
| 355 (codeUnit & _SURROGATE_TAG_MASK) == _LEAD_SURROGATE_MIN; | 361 (codeUnit & _SURROGATE_TAG_MASK) == _LEAD_SURROGATE_MIN; |
| 356 bool _isTailSurrogate(int codeUnit) => | 362 bool _isTailSurrogate(int codeUnit) => |
| 357 (codeUnit & _SURROGATE_TAG_MASK) == _TAIL_SURROGATE_MIN; | 363 (codeUnit & _SURROGATE_TAG_MASK) == _TAIL_SURROGATE_MIN; |
| 358 int _combineSurrogatePair(int lead, int tail) => | 364 int _combineSurrogatePair(int lead, int tail) => |
| 359 0x10000 | ((lead & _SURROGATE_VALUE_MASK) << 10) | 365 0x10000 + ((lead & _SURROGATE_VALUE_MASK) << 10) |
| 360 | (tail & _SURROGATE_VALUE_MASK); | 366 | (tail & _SURROGATE_VALUE_MASK); |
| 361 | 367 |
| 362 | 368 |
| 363 /** | 369 /** |
| 364 * Decodes UTF-8. | 370 * Decodes UTF-8. |
| 365 * | 371 * |
| 366 * The decoder handles chunked input. | 372 * The decoder handles chunked input. |
| 367 */ | 373 */ |
| 368 // TODO(floitsch): make this class public. | 374 // TODO(floitsch): make this class public. |
| 369 class _Utf8Decoder { | 375 class _Utf8Decoder { |
| (...skipping 23 matching lines...) Expand all Loading... |
| 393 * Flushes this decoder as if closed. | 399 * Flushes this decoder as if closed. |
| 394 * | 400 * |
| 395 * This method throws if the input was partial and the decoder was | 401 * This method throws if the input was partial and the decoder was |
| 396 * constructed with `allowMalformed` set to `false`. | 402 * constructed with `allowMalformed` set to `false`. |
| 397 */ | 403 */ |
| 398 void flush() { | 404 void flush() { |
| 399 if (hasPartialInput) { | 405 if (hasPartialInput) { |
| 400 if (!_allowMalformed) { | 406 if (!_allowMalformed) { |
| 401 throw new FormatException("Unfinished UTF-8 octet sequence"); | 407 throw new FormatException("Unfinished UTF-8 octet sequence"); |
| 402 } | 408 } |
| 403 _stringSink.writeCharCode(_REPLACEMENT_CHARACTER); | 409 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE); |
| 404 _value = 0; | 410 _value = 0; |
| 405 _expectedUnits = 0; | 411 _expectedUnits = 0; |
| 406 _extraUnits = 0; | 412 _extraUnits = 0; |
| 407 } | 413 } |
| 408 } | 414 } |
| 409 | 415 |
| 410 void convert(List<int> codeUnits, int startIndex, int endIndex) { | 416 void convert(List<int> codeUnits, int startIndex, int endIndex) { |
| 411 int value = _value; | 417 int value = _value; |
| 412 int expectedUnits = _expectedUnits; | 418 int expectedUnits = _expectedUnits; |
| 413 int extraUnits = _extraUnits; | 419 int extraUnits = _extraUnits; |
| 414 _value = 0; | 420 _value = 0; |
| 415 _expectedUnits = 0; | 421 _expectedUnits = 0; |
| 416 _extraUnits = 0; | 422 _extraUnits = 0; |
| 417 | 423 |
| 418 int i = startIndex; | 424 int i = startIndex; |
| 419 loop: while (true) { | 425 loop: while (true) { |
| 420 multibyte: if (expectedUnits > 0) { | 426 multibyte: if (expectedUnits > 0) { |
| 421 do { | 427 do { |
| 422 if (i == endIndex) { | 428 if (i == endIndex) { |
| 423 break loop; | 429 break loop; |
| 424 } | 430 } |
| 425 int unit = codeUnits[i]; | 431 int unit = codeUnits[i]; |
| 426 if ((unit & 0xC0) != 0x80) { | 432 if ((unit & 0xC0) != 0x80) { |
| 427 expectedUnits = 0; | 433 expectedUnits = 0; |
| 428 if (!_allowMalformed) { | 434 if (!_allowMalformed) { |
| 429 throw new FormatException( | 435 throw new FormatException( |
| 430 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}"); | 436 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}"); |
| 431 } | 437 } |
| 432 _isFirstCharacter = false; | 438 _isFirstCharacter = false; |
| 433 _stringSink.writeCharCode(_REPLACEMENT_CHARACTER); | 439 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE); |
| 434 break multibyte; | 440 break multibyte; |
| 435 } else { | 441 } else { |
| 436 value = (value << 6) | (unit & 0x3f); | 442 value = (value << 6) | (unit & 0x3f); |
| 437 expectedUnits--; | 443 expectedUnits--; |
| 438 i++; | 444 i++; |
| 439 } | 445 } |
| 440 } while (expectedUnits > 0); | 446 } while (expectedUnits > 0); |
| 441 if (value <= _LIMITS[extraUnits - 1]) { | 447 if (value <= _LIMITS[extraUnits - 1]) { |
| 442 // Overly long encoding. The value could be encoded with a shorter | 448 // Overly long encoding. The value could be encoded with a shorter |
| 443 // encoding. | 449 // encoding. |
| 444 if (!_allowMalformed) { | 450 if (!_allowMalformed) { |
| 445 throw new FormatException( | 451 throw new FormatException( |
| 446 "Overlong encoding of 0x${value.toRadixString(16)}"); | 452 "Overlong encoding of 0x${value.toRadixString(16)}"); |
| 447 } | 453 } |
| 448 expectedUnits = extraUnits = 0; | 454 expectedUnits = extraUnits = 0; |
| 449 value = _REPLACEMENT_CHARACTER; | 455 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; |
| 450 } | 456 } |
| 451 if (value > _FOUR_BYTE_LIMIT) { | 457 if (value > _FOUR_BYTE_LIMIT) { |
| 452 if (!_allowMalformed) { | 458 if (!_allowMalformed) { |
| 453 throw new FormatException("Character outside valid Unicode range: " | 459 throw new FormatException("Character outside valid Unicode range: " |
| 454 "0x${value.toRadixString(16)}"); | 460 "0x${value.toRadixString(16)}"); |
| 455 } | 461 } |
| 456 value = _REPLACEMENT_CHARACTER; | 462 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; |
| 457 } | 463 } |
| 458 if (!_isFirstCharacter || value != _BOM_CHARACTER) { | 464 if (!_isFirstCharacter || value != UNICODE_BOM_CHARACTER_RUNE) { |
| 459 _stringSink.writeCharCode(value); | 465 _stringSink.writeCharCode(value); |
| 460 } | 466 } |
| 461 _isFirstCharacter = false; | 467 _isFirstCharacter = false; |
| 462 } | 468 } |
| 463 | 469 |
| 464 while (i < endIndex) { | 470 while (i < endIndex) { |
| 465 int unit = codeUnits[i++]; | 471 int unit = codeUnits[i++]; |
| 466 // TODO(floitsch): the way we test we could potentially allow | 472 // TODO(floitsch): the way we test we could potentially allow |
| 467 // units that are too large, if they happen to have the | 473 // units that are too large, if they happen to have the |
| 468 // right bit-pattern. (Same is true for the multibyte loop above). | 474 // right bit-pattern. (Same is true for the multibyte loop above). |
| 469 // TODO(floitsch): optimize this loop. See: | 475 // TODO(floitsch): optimize this loop. See: |
| 470 // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.dart?column_width=80 | 476 // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.dart?column_width=80 |
| 471 if (unit < 0) { | 477 if (unit < 0) { |
| 472 // TODO(floitsch): should this be unit <= 0 ? | 478 // TODO(floitsch): should this be unit <= 0 ? |
| 473 if (!_allowMalformed) { | 479 if (!_allowMalformed) { |
| 474 throw new FormatException( | 480 throw new FormatException( |
| 475 "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}"); | 481 "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}"); |
| 476 } | 482 } |
| 477 _stringSink.writeCharCode(_REPLACEMENT_CHARACTER); | 483 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE); |
| 478 } else if (unit <= _ONE_BYTE_LIMIT) { | 484 } else if (unit <= _ONE_BYTE_LIMIT) { |
| 479 _isFirstCharacter = false; | 485 _isFirstCharacter = false; |
| 480 _stringSink.writeCharCode(unit); | 486 _stringSink.writeCharCode(unit); |
| 481 } else { | 487 } else { |
| 482 if ((unit & 0xE0) == 0xC0) { | 488 if ((unit & 0xE0) == 0xC0) { |
| 483 value = unit & 0x1F; | 489 value = unit & 0x1F; |
| 484 expectedUnits = extraUnits = 1; | 490 expectedUnits = extraUnits = 1; |
| 485 continue loop; | 491 continue loop; |
| 486 } | 492 } |
| 487 if ((unit & 0xF0) == 0xE0) { | 493 if ((unit & 0xF0) == 0xE0) { |
| 488 value = unit & 0x0F; | 494 value = unit & 0x0F; |
| 489 expectedUnits = extraUnits = 2; | 495 expectedUnits = extraUnits = 2; |
| 490 continue loop; | 496 continue loop; |
| 491 } | 497 } |
| 492 // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences. | 498 // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences. |
| 493 if ((unit & 0xF8) == 0xF0 && unit < 0xF5) { | 499 if ((unit & 0xF8) == 0xF0 && unit < 0xF5) { |
| 494 value = unit & 0x07; | 500 value = unit & 0x07; |
| 495 expectedUnits = extraUnits = 3; | 501 expectedUnits = extraUnits = 3; |
| 496 continue loop; | 502 continue loop; |
| 497 } | 503 } |
| 498 if (!_allowMalformed) { | 504 if (!_allowMalformed) { |
| 499 throw new FormatException( | 505 throw new FormatException( |
| 500 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}"); | 506 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}"); |
| 501 } | 507 } |
| 502 value = _REPLACEMENT_CHARACTER; | 508 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; |
| 503 expectedUnits = extraUnits = 0; | 509 expectedUnits = extraUnits = 0; |
| 504 _isFirstCharacter = false; | 510 _isFirstCharacter = false; |
| 505 _stringSink.writeCharCode(value); | 511 _stringSink.writeCharCode(value); |
| 506 } | 512 } |
| 507 } | 513 } |
| 508 break loop; | 514 break loop; |
| 509 } | 515 } |
| 510 if (expectedUnits > 0) { | 516 if (expectedUnits > 0) { |
| 511 _value = value; | 517 _value = value; |
| 512 _expectedUnits = expectedUnits; | 518 _expectedUnits = expectedUnits; |
| 513 _extraUnits = extraUnits; | 519 _extraUnits = extraUnits; |
| 514 } | 520 } |
| 515 } | 521 } |
| 516 } | 522 } |
| OLD | NEW |