sdk/lib/convert/utf.dart - Issue 25463003: Fix UTF8 encoder for Unicode runes > 0xFFFF.

Side by Side Diff: sdk/lib/convert/utf.dart

Issue 25463003: Fix UTF8 encoder for Unicode runes > 0xFFFF. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Reupload due to error. Created 7 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file	1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a	2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.	3 // BSD-style license that can be found in the LICENSE file.

4	4

5 part of dart.convert;	5 part of dart.convert;

6	6

7 /** The Unicode Replacement character `U+FFFD` (�). */	7 /** The Unicode Replacement character `U+FFFD` (�). */

8 const UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD;	8 const UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD;

9	9

	10 /** The Unicode Byte Order Marker (BOM) character `U+FEFF`. */

	11 const UNICODE_BOM_CHARACTER_RUNE = 0xFEFF;

	12

10 /**	13 /**

11 * An instance of the default implementation of the [Utf8Codec].	14 * An instance of the default implementation of the [Utf8Codec].

12 *	15 *

13 * This instance provides a convenient access to the most common UTF-8	16 * This instance provides a convenient access to the most common UTF-8

14 * use cases.	17 * use cases.

15 *	18 *

16 * Examples:	19 * Examples:

17 *	20 *

18 * var encoded = UTF8.encode("Îñţérñåţîöñåļîžåţîờñ");	21 * var encoded = UTF8.encode("Îñţérñåţîöñåļîžåţîờñ");

19 * var decoded = UTF8.decode([0x62, 0x6c, 0xc3, 0xa5, 0x62, 0xc3, 0xa6,	22 * var decoded = UTF8.decode([0x62, 0x6c, 0xc3, 0xa5, 0x62, 0xc3, 0xa6,

(...skipping 21 matching lines...) Expand all Loading...
41 */	44 */

42 const Utf8Codec({ bool allowMalformed: false })	45 const Utf8Codec({ bool allowMalformed: false })

43 : _allowMalformed = allowMalformed;	46 : _allowMalformed = allowMalformed;

44	47

45 String get name => "utf-8";	48 String get name => "utf-8";

46	49

47 /**	50 /**

48 * Decodes the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the	51 * Decodes the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the

49 * corresponding string.	52 * corresponding string.

50 *	53 *

	54 * If the [codeUnits] start with a leading [UNICODE_BOM_CHARACTER_RUNE] this

	55 * character is discarded.

	56 *

51 * If [allowMalformed] is `true` the decoder replaces invalid (or	57 * If [allowMalformed] is `true` the decoder replaces invalid (or

52 * unterminated) character sequences with the Unicode Replacement character	58 * unterminated) character sequences with the Unicode Replacement character

53 * `U+FFFD` (�). Otherwise it throws a [FormatException].	59 * `U+FFFD` (�). Otherwise it throws a [FormatException].

54 *	60 *

55 * If [allowMalformed] is not given, it defaults to the `allowMalformed` that	61 * If [allowMalformed] is not given, it defaults to the `allowMalformed` that

56 * was used to instantiate `this`.	62 * was used to instantiate `this`.

57 */	63 */

58 String decode(List<int> codeUnits, { bool allowMalformed }) {	64 String decode(List<int> codeUnits, { bool allowMalformed }) {

59 if (allowMalformed == null) allowMalformed = _allowMalformed;	65 if (allowMalformed == null) allowMalformed = _allowMalformed;

60 return new Utf8Decoder(allowMalformed: allowMalformed).convert(codeUnits);	66 return new Utf8Decoder(allowMalformed: allowMalformed).convert(codeUnits);

(...skipping 235 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
296 * If it is `true` [convert] replaces invalid (or unterminated) character	302 * If it is `true` [convert] replaces invalid (or unterminated) character

297 * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise	303 * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise

298 * it throws a [FormatException].	304 * it throws a [FormatException].

299 */	305 */

300 const Utf8Decoder({ bool allowMalformed: false })	306 const Utf8Decoder({ bool allowMalformed: false })

301 : this._allowMalformed = allowMalformed;	307 : this._allowMalformed = allowMalformed;

302	308

303 /**	309 /**

304 * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the	310 * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the

305 * corresponding string.	311 * corresponding string.

	312 *

	313 * If the [codeUnits] start with a leading [UNICODE_BOM_CHARACTER_RUNE] this

	314 * character is discarded.

306 */	315 */

307 String convert(List<int> codeUnits) {	316 String convert(List<int> codeUnits) {

308 StringBuffer buffer = new StringBuffer();	317 StringBuffer buffer = new StringBuffer();

309 _Utf8Decoder decoder = new _Utf8Decoder(buffer, _allowMalformed);	318 _Utf8Decoder decoder = new _Utf8Decoder(buffer, _allowMalformed);

310 decoder.convert(codeUnits, 0, codeUnits.length);	319 decoder.convert(codeUnits, 0, codeUnits.length);

311 decoder.close();	320 decoder.close();

312 return buffer.toString();	321 return buffer.toString();

313 }	322 }

314	323

315 /**	324 /**

(...skipping 23 matching lines...) Expand all Loading...
339 const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits	348 const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits

340 const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max.	349 const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max.

341	350

342 // UTF-16 constants.	351 // UTF-16 constants.

343 const int _SURROGATE_MASK = 0xF800;	352 const int _SURROGATE_MASK = 0xF800;

344 const int _SURROGATE_TAG_MASK = 0xFC00;	353 const int _SURROGATE_TAG_MASK = 0xFC00;

345 const int _SURROGATE_VALUE_MASK = 0x3FF;	354 const int _SURROGATE_VALUE_MASK = 0x3FF;

346 const int _LEAD_SURROGATE_MIN = 0xD800;	355 const int _LEAD_SURROGATE_MIN = 0xD800;

347 const int _TAIL_SURROGATE_MIN = 0xDC00;	356 const int _TAIL_SURROGATE_MIN = 0xDC00;

348	357

349 const int _REPLACEMENT_CHARACTER = 0xFFFD;

350 const int _BOM_CHARACTER = 0xFEFF;

351

352 bool _isSurrogate(int codeUnit) =>	358 bool _isSurrogate(int codeUnit) =>

353 (codeUnit & _SURROGATE_MASK) == _LEAD_SURROGATE_MIN;	359 (codeUnit & _SURROGATE_MASK) == _LEAD_SURROGATE_MIN;

354 bool _isLeadSurrogate(int codeUnit) =>	360 bool _isLeadSurrogate(int codeUnit) =>

355 (codeUnit & _SURROGATE_TAG_MASK) == _LEAD_SURROGATE_MIN;	361 (codeUnit & _SURROGATE_TAG_MASK) == _LEAD_SURROGATE_MIN;

356 bool _isTailSurrogate(int codeUnit) =>	362 bool _isTailSurrogate(int codeUnit) =>

357 (codeUnit & _SURROGATE_TAG_MASK) == _TAIL_SURROGATE_MIN;	363 (codeUnit & _SURROGATE_TAG_MASK) == _TAIL_SURROGATE_MIN;

358 int _combineSurrogatePair(int lead, int tail) =>	364 int _combineSurrogatePair(int lead, int tail) =>

359 0x10000 | ((lead & _SURROGATE_VALUE_MASK) << 10)	365 0x10000 + ((lead & _SURROGATE_VALUE_MASK) << 10)

360 | (tail & _SURROGATE_VALUE_MASK);	366 | (tail & _SURROGATE_VALUE_MASK);

361	367

362	368

363 /**	369 /**

364 * Decodes UTF-8.	370 * Decodes UTF-8.

365 *	371 *

366 * The decoder handles chunked input.	372 * The decoder handles chunked input.

367 */	373 */

368 // TODO(floitsch): make this class public.	374 // TODO(floitsch): make this class public.

369 class _Utf8Decoder {	375 class _Utf8Decoder {

(...skipping 23 matching lines...) Expand all Loading...
393 * Flushes this decoder as if closed.	399 * Flushes this decoder as if closed.

394 *	400 *

395 * This method throws if the input was partial and the decoder was	401 * This method throws if the input was partial and the decoder was

396 * constructed with `allowMalformed` set to `false`.	402 * constructed with `allowMalformed` set to `false`.

397 */	403 */

398 void flush() {	404 void flush() {

399 if (hasPartialInput) {	405 if (hasPartialInput) {

400 if (!_allowMalformed) {	406 if (!_allowMalformed) {

401 throw new FormatException("Unfinished UTF-8 octet sequence");	407 throw new FormatException("Unfinished UTF-8 octet sequence");

402 }	408 }

403 _stringSink.writeCharCode(_REPLACEMENT_CHARACTER);	409 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE);

404 _value = 0;	410 _value = 0;

405 _expectedUnits = 0;	411 _expectedUnits = 0;

406 _extraUnits = 0;	412 _extraUnits = 0;

407 }	413 }

408 }	414 }

409	415

410 void convert(List<int> codeUnits, int startIndex, int endIndex) {	416 void convert(List<int> codeUnits, int startIndex, int endIndex) {

411 int value = _value;	417 int value = _value;

412 int expectedUnits = _expectedUnits;	418 int expectedUnits = _expectedUnits;

413 int extraUnits = _extraUnits;	419 int extraUnits = _extraUnits;

414 _value = 0;	420 _value = 0;

415 _expectedUnits = 0;	421 _expectedUnits = 0;

416 _extraUnits = 0;	422 _extraUnits = 0;

417	423

418 int i = startIndex;	424 int i = startIndex;

419 loop: while (true) {	425 loop: while (true) {

420 multibyte: if (expectedUnits > 0) {	426 multibyte: if (expectedUnits > 0) {

421 do {	427 do {

422 if (i == endIndex) {	428 if (i == endIndex) {

423 break loop;	429 break loop;

424 }	430 }

425 int unit = codeUnits[i];	431 int unit = codeUnits[i];

426 if ((unit & 0xC0) != 0x80) {	432 if ((unit & 0xC0) != 0x80) {

427 expectedUnits = 0;	433 expectedUnits = 0;

428 if (!_allowMalformed) {	434 if (!_allowMalformed) {

429 throw new FormatException(	435 throw new FormatException(

430 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");	436 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");

431 }	437 }

432 _isFirstCharacter = false;	438 _isFirstCharacter = false;

433 _stringSink.writeCharCode(_REPLACEMENT_CHARACTER);	439 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE);

434 break multibyte;	440 break multibyte;

435 } else {	441 } else {

436 value = (value << 6) | (unit & 0x3f);	442 value = (value << 6) | (unit & 0x3f);

437 expectedUnits--;	443 expectedUnits--;

438 i++;	444 i++;

439 }	445 }

440 } while (expectedUnits > 0);	446 } while (expectedUnits > 0);

441 if (value <= _LIMITS[extraUnits - 1]) {	447 if (value <= _LIMITS[extraUnits - 1]) {

442 // Overly long encoding. The value could be encoded with a shorter	448 // Overly long encoding. The value could be encoded with a shorter

443 // encoding.	449 // encoding.

444 if (!_allowMalformed) {	450 if (!_allowMalformed) {

445 throw new FormatException(	451 throw new FormatException(

446 "Overlong encoding of 0x${value.toRadixString(16)}");	452 "Overlong encoding of 0x${value.toRadixString(16)}");

447 }	453 }

448 expectedUnits = extraUnits = 0;	454 expectedUnits = extraUnits = 0;

449 value = _REPLACEMENT_CHARACTER;	455 value = UNICODE_REPLACEMENT_CHARACTER_RUNE;

450 }	456 }

451 if (value > _FOUR_BYTE_LIMIT) {	457 if (value > _FOUR_BYTE_LIMIT) {

452 if (!_allowMalformed) {	458 if (!_allowMalformed) {

453 throw new FormatException("Character outside valid Unicode range: "	459 throw new FormatException("Character outside valid Unicode range: "

454 "0x${value.toRadixString(16)}");	460 "0x${value.toRadixString(16)}");

455 }	461 }

456 value = _REPLACEMENT_CHARACTER;	462 value = UNICODE_REPLACEMENT_CHARACTER_RUNE;

457 }	463 }

458 if (!_isFirstCharacter || value != _BOM_CHARACTER) {	464 if (!_isFirstCharacter || value != UNICODE_BOM_CHARACTER_RUNE) {

459 _stringSink.writeCharCode(value);	465 _stringSink.writeCharCode(value);

460 }	466 }

461 _isFirstCharacter = false;	467 _isFirstCharacter = false;

462 }	468 }

463	469

464 while (i < endIndex) {	470 while (i < endIndex) {

465 int unit = codeUnits[i++];	471 int unit = codeUnits[i++];

466 // TODO(floitsch): the way we test we could potentially allow	472 // TODO(floitsch): the way we test we could potentially allow

467 // units that are too large, if they happen to have the	473 // units that are too large, if they happen to have the

468 // right bit-pattern. (Same is true for the multibyte loop above).	474 // right bit-pattern. (Same is true for the multibyte loop above).

469 // TODO(floitsch): optimize this loop. See:	475 // TODO(floitsch): optimize this loop. See:

470 // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.dart?column_width=80	476 // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.dart?column_width=80

471 if (unit < 0) {	477 if (unit < 0) {

472 // TODO(floitsch): should this be unit <= 0 ?	478 // TODO(floitsch): should this be unit <= 0 ?

473 if (!_allowMalformed) {	479 if (!_allowMalformed) {

474 throw new FormatException(	480 throw new FormatException(

475 "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}");	481 "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}");

476 }	482 }

477 _stringSink.writeCharCode(_REPLACEMENT_CHARACTER);	483 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE);

478 } else if (unit <= _ONE_BYTE_LIMIT) {	484 } else if (unit <= _ONE_BYTE_LIMIT) {

479 _isFirstCharacter = false;	485 _isFirstCharacter = false;

480 _stringSink.writeCharCode(unit);	486 _stringSink.writeCharCode(unit);

481 } else {	487 } else {

482 if ((unit & 0xE0) == 0xC0) {	488 if ((unit & 0xE0) == 0xC0) {

483 value = unit & 0x1F;	489 value = unit & 0x1F;

484 expectedUnits = extraUnits = 1;	490 expectedUnits = extraUnits = 1;

485 continue loop;	491 continue loop;

486 }	492 }

487 if ((unit & 0xF0) == 0xE0) {	493 if ((unit & 0xF0) == 0xE0) {

488 value = unit & 0x0F;	494 value = unit & 0x0F;

489 expectedUnits = extraUnits = 2;	495 expectedUnits = extraUnits = 2;

490 continue loop;	496 continue loop;

491 }	497 }

492 // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences.	498 // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences.

493 if ((unit & 0xF8) == 0xF0 && unit < 0xF5) {	499 if ((unit & 0xF8) == 0xF0 && unit < 0xF5) {

494 value = unit & 0x07;	500 value = unit & 0x07;

495 expectedUnits = extraUnits = 3;	501 expectedUnits = extraUnits = 3;

496 continue loop;	502 continue loop;

497 }	503 }

498 if (!_allowMalformed) {	504 if (!_allowMalformed) {

499 throw new FormatException(	505 throw new FormatException(

500 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");	506 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");

501 }	507 }

502 value = _REPLACEMENT_CHARACTER;	508 value = UNICODE_REPLACEMENT_CHARACTER_RUNE;

503 expectedUnits = extraUnits = 0;	509 expectedUnits = extraUnits = 0;

504 _isFirstCharacter = false;	510 _isFirstCharacter = false;

505 _stringSink.writeCharCode(value);	511 _stringSink.writeCharCode(value);

506 }	512 }

507 }	513 }

508 break loop;	514 break loop;

509 }	515 }

510 if (expectedUnits > 0) {	516 if (expectedUnits > 0) {

511 _value = value;	517 _value = value;

512 _expectedUnits = expectedUnits;	518 _expectedUnits = expectedUnits;

513 _extraUnits = extraUnits;	519 _extraUnits = extraUnits;

514 }	520 }

515 }	521 }

516 }	522 }

OLD	NEW

« no previous file with comments | « no previous file | tests/lib/convert/unicode_tests.dart » ('j') | no next file with comments »