Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(522)

Side by Side Diff: sdk/lib/convert/utf.dart

Issue 25463003: Fix UTF8 encoder for Unicode runes > 0xFFFF. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Reupload due to error. Created 7 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | tests/lib/convert/unicode_tests.dart » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 part of dart.convert; 5 part of dart.convert;
6 6
7 /** The Unicode Replacement character `U+FFFD` (�). */ 7 /** The Unicode Replacement character `U+FFFD` (�). */
8 const UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD; 8 const UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD;
9 9
10 /** The Unicode Byte Order Marker (BOM) character `U+FEFF`. */
11 const UNICODE_BOM_CHARACTER_RUNE = 0xFEFF;
12
10 /** 13 /**
11 * An instance of the default implementation of the [Utf8Codec]. 14 * An instance of the default implementation of the [Utf8Codec].
12 * 15 *
13 * This instance provides convenient access to the most common UTF-8 16 * This instance provides convenient access to the most common UTF-8
14 * use cases. 17 * use cases.
15 * 18 *
16 * Examples: 19 * Examples:
17 * 20 *
18 * var encoded = UTF8.encode("Îñţérñåţîöñåļîžåţîờñ"); 21 * var encoded = UTF8.encode("Îñţérñåţîöñåļîžåţîờñ");
19 * var decoded = UTF8.decode([0x62, 0x6c, 0xc3, 0xa5, 0x62, 0xc3, 0xa6, 22 * var decoded = UTF8.decode([0x62, 0x6c, 0xc3, 0xa5, 0x62, 0xc3, 0xa6,
(...skipping 21 matching lines...) Expand all
41 */ 44 */
42 const Utf8Codec({ bool allowMalformed: false }) 45 const Utf8Codec({ bool allowMalformed: false })
43 : _allowMalformed = allowMalformed; 46 : _allowMalformed = allowMalformed;
44 47
45 String get name => "utf-8"; 48 String get name => "utf-8";
46 49
47 /** 50 /**
48 * Decodes the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the 51 * Decodes the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the
49 * corresponding string. 52 * corresponding string.
50 * 53 *
54 * If the [codeUnits] start with a leading [UNICODE_BOM_CHARACTER_RUNE] this
55 * character is discarded.
56 *
51 * If [allowMalformed] is `true` the decoder replaces invalid (or 57 * If [allowMalformed] is `true` the decoder replaces invalid (or
52 * unterminated) character sequences with the Unicode Replacement character 58 * unterminated) character sequences with the Unicode Replacement character
53 * `U+FFFD` (�). Otherwise it throws a [FormatException]. 59 * `U+FFFD` (�). Otherwise it throws a [FormatException].
54 * 60 *
55 * If [allowMalformed] is not given, it defaults to the `allowMalformed` that 61 * If [allowMalformed] is not given, it defaults to the `allowMalformed` that
56 * was used to instantiate `this`. 62 * was used to instantiate `this`.
57 */ 63 */
58 String decode(List<int> codeUnits, { bool allowMalformed }) { 64 String decode(List<int> codeUnits, { bool allowMalformed }) {
59 if (allowMalformed == null) allowMalformed = _allowMalformed; 65 if (allowMalformed == null) allowMalformed = _allowMalformed;
60 return new Utf8Decoder(allowMalformed: allowMalformed).convert(codeUnits); 66 return new Utf8Decoder(allowMalformed: allowMalformed).convert(codeUnits);
(...skipping 235 matching lines...) Expand 10 before | Expand all | Expand 10 after
296 * If it is `true` [convert] replaces invalid (or unterminated) character 302 * If it is `true` [convert] replaces invalid (or unterminated) character
297 * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise 303 * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise
298 * it throws a [FormatException]. 304 * it throws a [FormatException].
299 */ 305 */
300 const Utf8Decoder({ bool allowMalformed: false }) 306 const Utf8Decoder({ bool allowMalformed: false })
301 : this._allowMalformed = allowMalformed; 307 : this._allowMalformed = allowMalformed;
302 308
303 /** 309 /**
304 * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the 310 * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the
305 * corresponding string. 311 * corresponding string.
312 *
313 * If the [codeUnits] start with a leading [UNICODE_BOM_CHARACTER_RUNE] this
314 * character is discarded.
306 */ 315 */
307 String convert(List<int> codeUnits) { 316 String convert(List<int> codeUnits) {
308 StringBuffer buffer = new StringBuffer(); 317 StringBuffer buffer = new StringBuffer();
309 _Utf8Decoder decoder = new _Utf8Decoder(buffer, _allowMalformed); 318 _Utf8Decoder decoder = new _Utf8Decoder(buffer, _allowMalformed);
310 decoder.convert(codeUnits, 0, codeUnits.length); 319 decoder.convert(codeUnits, 0, codeUnits.length);
311 decoder.close(); 320 decoder.close();
312 return buffer.toString(); 321 return buffer.toString();
313 } 322 }
314 323
315 /** 324 /**
(...skipping 23 matching lines...) Expand all
339 const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits 348 const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits
340 const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max. 349 const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max.
341 350
342 // UTF-16 constants. 351 // UTF-16 constants.
343 const int _SURROGATE_MASK = 0xF800; 352 const int _SURROGATE_MASK = 0xF800;
344 const int _SURROGATE_TAG_MASK = 0xFC00; 353 const int _SURROGATE_TAG_MASK = 0xFC00;
345 const int _SURROGATE_VALUE_MASK = 0x3FF; 354 const int _SURROGATE_VALUE_MASK = 0x3FF;
346 const int _LEAD_SURROGATE_MIN = 0xD800; 355 const int _LEAD_SURROGATE_MIN = 0xD800;
347 const int _TAIL_SURROGATE_MIN = 0xDC00; 356 const int _TAIL_SURROGATE_MIN = 0xDC00;
348 357
349 const int _REPLACEMENT_CHARACTER = 0xFFFD;
350 const int _BOM_CHARACTER = 0xFEFF;
351
352 bool _isSurrogate(int codeUnit) => 358 bool _isSurrogate(int codeUnit) =>
353 (codeUnit & _SURROGATE_MASK) == _LEAD_SURROGATE_MIN; 359 (codeUnit & _SURROGATE_MASK) == _LEAD_SURROGATE_MIN;
354 bool _isLeadSurrogate(int codeUnit) => 360 bool _isLeadSurrogate(int codeUnit) =>
355 (codeUnit & _SURROGATE_TAG_MASK) == _LEAD_SURROGATE_MIN; 361 (codeUnit & _SURROGATE_TAG_MASK) == _LEAD_SURROGATE_MIN;
356 bool _isTailSurrogate(int codeUnit) => 362 bool _isTailSurrogate(int codeUnit) =>
357 (codeUnit & _SURROGATE_TAG_MASK) == _TAIL_SURROGATE_MIN; 363 (codeUnit & _SURROGATE_TAG_MASK) == _TAIL_SURROGATE_MIN;
358 int _combineSurrogatePair(int lead, int tail) => 364 int _combineSurrogatePair(int lead, int tail) =>
359 0x10000 | ((lead & _SURROGATE_VALUE_MASK) << 10) 365 0x10000 + ((lead & _SURROGATE_VALUE_MASK) << 10)
360 | (tail & _SURROGATE_VALUE_MASK); 366 | (tail & _SURROGATE_VALUE_MASK);
361 367
362 368
363 /** 369 /**
364 * Decodes UTF-8. 370 * Decodes UTF-8.
365 * 371 *
366 * The decoder handles chunked input. 372 * The decoder handles chunked input.
367 */ 373 */
368 // TODO(floitsch): make this class public. 374 // TODO(floitsch): make this class public.
369 class _Utf8Decoder { 375 class _Utf8Decoder {
(...skipping 23 matching lines...) Expand all
393 * Flushes this decoder as if closed. 399 * Flushes this decoder as if closed.
394 * 400 *
395 * This method throws if the input was partial and the decoder was 401 * This method throws if the input was partial and the decoder was
396 * constructed with `allowMalformed` set to `false`. 402 * constructed with `allowMalformed` set to `false`.
397 */ 403 */
398 void flush() { 404 void flush() {
399 if (hasPartialInput) { 405 if (hasPartialInput) {
400 if (!_allowMalformed) { 406 if (!_allowMalformed) {
401 throw new FormatException("Unfinished UTF-8 octet sequence"); 407 throw new FormatException("Unfinished UTF-8 octet sequence");
402 } 408 }
403 _stringSink.writeCharCode(_REPLACEMENT_CHARACTER); 409 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE);
404 _value = 0; 410 _value = 0;
405 _expectedUnits = 0; 411 _expectedUnits = 0;
406 _extraUnits = 0; 412 _extraUnits = 0;
407 } 413 }
408 } 414 }
409 415
410 void convert(List<int> codeUnits, int startIndex, int endIndex) { 416 void convert(List<int> codeUnits, int startIndex, int endIndex) {
411 int value = _value; 417 int value = _value;
412 int expectedUnits = _expectedUnits; 418 int expectedUnits = _expectedUnits;
413 int extraUnits = _extraUnits; 419 int extraUnits = _extraUnits;
414 _value = 0; 420 _value = 0;
415 _expectedUnits = 0; 421 _expectedUnits = 0;
416 _extraUnits = 0; 422 _extraUnits = 0;
417 423
418 int i = startIndex; 424 int i = startIndex;
419 loop: while (true) { 425 loop: while (true) {
420 multibyte: if (expectedUnits > 0) { 426 multibyte: if (expectedUnits > 0) {
421 do { 427 do {
422 if (i == endIndex) { 428 if (i == endIndex) {
423 break loop; 429 break loop;
424 } 430 }
425 int unit = codeUnits[i]; 431 int unit = codeUnits[i];
426 if ((unit & 0xC0) != 0x80) { 432 if ((unit & 0xC0) != 0x80) {
427 expectedUnits = 0; 433 expectedUnits = 0;
428 if (!_allowMalformed) { 434 if (!_allowMalformed) {
429 throw new FormatException( 435 throw new FormatException(
430 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}"); 436 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
431 } 437 }
432 _isFirstCharacter = false; 438 _isFirstCharacter = false;
433 _stringSink.writeCharCode(_REPLACEMENT_CHARACTER); 439 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE);
434 break multibyte; 440 break multibyte;
435 } else { 441 } else {
436 value = (value << 6) | (unit & 0x3f); 442 value = (value << 6) | (unit & 0x3f);
437 expectedUnits--; 443 expectedUnits--;
438 i++; 444 i++;
439 } 445 }
440 } while (expectedUnits > 0); 446 } while (expectedUnits > 0);
441 if (value <= _LIMITS[extraUnits - 1]) { 447 if (value <= _LIMITS[extraUnits - 1]) {
442 // Overly long encoding. The value could be encoded with a shorter 448 // Overly long encoding. The value could be encoded with a shorter
443 // encoding. 449 // encoding.
444 if (!_allowMalformed) { 450 if (!_allowMalformed) {
445 throw new FormatException( 451 throw new FormatException(
446 "Overlong encoding of 0x${value.toRadixString(16)}"); 452 "Overlong encoding of 0x${value.toRadixString(16)}");
447 } 453 }
448 expectedUnits = extraUnits = 0; 454 expectedUnits = extraUnits = 0;
449 value = _REPLACEMENT_CHARACTER; 455 value = UNICODE_REPLACEMENT_CHARACTER_RUNE;
450 } 456 }
451 if (value > _FOUR_BYTE_LIMIT) { 457 if (value > _FOUR_BYTE_LIMIT) {
452 if (!_allowMalformed) { 458 if (!_allowMalformed) {
453 throw new FormatException("Character outside valid Unicode range: " 459 throw new FormatException("Character outside valid Unicode range: "
454 "0x${value.toRadixString(16)}"); 460 "0x${value.toRadixString(16)}");
455 } 461 }
456 value = _REPLACEMENT_CHARACTER; 462 value = UNICODE_REPLACEMENT_CHARACTER_RUNE;
457 } 463 }
458 if (!_isFirstCharacter || value != _BOM_CHARACTER) { 464 if (!_isFirstCharacter || value != UNICODE_BOM_CHARACTER_RUNE) {
459 _stringSink.writeCharCode(value); 465 _stringSink.writeCharCode(value);
460 } 466 }
461 _isFirstCharacter = false; 467 _isFirstCharacter = false;
462 } 468 }
463 469
464 while (i < endIndex) { 470 while (i < endIndex) {
465 int unit = codeUnits[i++]; 471 int unit = codeUnits[i++];
466 // TODO(floitsch): the way we test we could potentially allow 472 // TODO(floitsch): the way we test we could potentially allow
467 // units that are too large, if they happen to have the 473 // units that are too large, if they happen to have the
468 // right bit-pattern. (Same is true for the multibyte loop above). 474 // right bit-pattern. (Same is true for the multibyte loop above).
469 // TODO(floitsch): optimize this loop. See: 475 // TODO(floitsch): optimize this loop. See:
470 // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.dart?column_width=80 476 // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.dart?column_width=80
471 if (unit < 0) { 477 if (unit < 0) {
472 // TODO(floitsch): should this be unit <= 0 ? 478 // TODO(floitsch): should this be unit <= 0 ?
473 if (!_allowMalformed) { 479 if (!_allowMalformed) {
474 throw new FormatException( 480 throw new FormatException(
475 "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}"); 481 "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}");
476 } 482 }
477 _stringSink.writeCharCode(_REPLACEMENT_CHARACTER); 483 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE);
478 } else if (unit <= _ONE_BYTE_LIMIT) { 484 } else if (unit <= _ONE_BYTE_LIMIT) {
479 _isFirstCharacter = false; 485 _isFirstCharacter = false;
480 _stringSink.writeCharCode(unit); 486 _stringSink.writeCharCode(unit);
481 } else { 487 } else {
482 if ((unit & 0xE0) == 0xC0) { 488 if ((unit & 0xE0) == 0xC0) {
483 value = unit & 0x1F; 489 value = unit & 0x1F;
484 expectedUnits = extraUnits = 1; 490 expectedUnits = extraUnits = 1;
485 continue loop; 491 continue loop;
486 } 492 }
487 if ((unit & 0xF0) == 0xE0) { 493 if ((unit & 0xF0) == 0xE0) {
488 value = unit & 0x0F; 494 value = unit & 0x0F;
489 expectedUnits = extraUnits = 2; 495 expectedUnits = extraUnits = 2;
490 continue loop; 496 continue loop;
491 } 497 }
492 // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences. 498 // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences.
493 if ((unit & 0xF8) == 0xF0 && unit < 0xF5) { 499 if ((unit & 0xF8) == 0xF0 && unit < 0xF5) {
494 value = unit & 0x07; 500 value = unit & 0x07;
495 expectedUnits = extraUnits = 3; 501 expectedUnits = extraUnits = 3;
496 continue loop; 502 continue loop;
497 } 503 }
498 if (!_allowMalformed) { 504 if (!_allowMalformed) {
499 throw new FormatException( 505 throw new FormatException(
500 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}"); 506 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
501 } 507 }
502 value = _REPLACEMENT_CHARACTER; 508 value = UNICODE_REPLACEMENT_CHARACTER_RUNE;
503 expectedUnits = extraUnits = 0; 509 expectedUnits = extraUnits = 0;
504 _isFirstCharacter = false; 510 _isFirstCharacter = false;
505 _stringSink.writeCharCode(value); 511 _stringSink.writeCharCode(value);
506 } 512 }
507 } 513 }
508 break loop; 514 break loop;
509 } 515 }
510 if (expectedUnits > 0) { 516 if (expectedUnits > 0) {
511 _value = value; 517 _value = value;
512 _expectedUnits = expectedUnits; 518 _expectedUnits = expectedUnits;
513 _extraUnits = extraUnits; 519 _extraUnits = extraUnits;
514 } 520 }
515 } 521 }
516 } 522 }
OLDNEW
« no previous file with comments | « no previous file | tests/lib/convert/unicode_tests.dart » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698