lib/inputstream.dart - Issue 10916294: switch html5lib to new pkg layout

Unified Diff: lib/inputstream.dart

Issue 10916294: switch html5lib to new pkg layout (Closed) Base URL: https://github.com/dart-lang/html5lib.git@master

Patch Set: Created 8 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: lib/inputstream.dart

diff --git a/lib/inputstream.dart b/lib/inputstream.dart

deleted file mode 100644

index 2cf76175038430fdb234da9cd37c77f75057d23b..0000000000000000000000000000000000000000

--- a/lib/inputstream.dart

+++ /dev/null

@@ -1,433 +0,0 @@

-#library('inputstream');

-#import('dart:io');

-#import('dart:utf');

-#import('char_encodings.dart');

-#import('constants.dart');

-#import('utils.dart');

-#import('encoding_parser.dart');

-#import('../treebuilders/simpletree.dart', prefix: 'tree'); // for Span

-/**

- * Provides a unicode stream of characters to the HTMLTokenizer.

- *

- * This class takes care of character encoding and removing or replacing

- * incorrect byte-sequences and also provides column and line tracking.

- */

-class HTMLInputStream {

- const int _defaultChunkSize = 10240;

- /** List of where new lines occur. */

- List newLines;

- /**

- * Number of bytes to use when looking for a meta element with

- * encoding information.

- */

- const int numBytesMeta = 512;

- /** Encoding to use if no other information can be found. */

- const String defaultEncoding = "windows-1252";

- /** The name of the character encoding. */

- String charEncodingName;

- /** True if we are certain about [charEncodingName], false for tentative. */

- bool charEncodingCertain = true;

- List<int> rawBytes;

- Iterator<int> dataStream;

- /** Cache for charsUntil() */

- Map<Pair, RegExp> charsUntilRegEx;

- List<String> errors;

- String chunk;

- int chunkOffset;

- /** number of (complete) lines in previous chunks */

- int prevNumLines;

- /** number of columns in the last line of the previous chunk */

- int prevNumCols;

- /** Deals with CR LF and surrogates split over chunk boundaries */

- String _bufferedCharacter;

- /**

- * Initialises the HTMLInputStream.

- *

- * HTMLInputStream(source, [encoding]) -> Normalized stream from source

- * for use by html5lib.

- *

- * [source] can be either a [RandomAccessFile], a [String], or a [List<int>]

- * containing the raw bytes.

- *

- * The optional encoding parameter must be a string that indicates

- * the encoding. If specified, that encoding will be used,

- * regardless of any BOM or later declaration (such as in a meta

- * element)

- *

- * [parseMeta] - Look for a <meta> element containing encoding information

- */

- HTMLInputStream(source, [String encoding, bool parseMeta = true])

- : newLines = [0],

- charEncodingName = codecName(encoding),

- charsUntilRegEx = new Map() {

- if (source is String) {

- // TODO(jmesserly): if the data is already a string, we should just use

- // the source.charCodes() instead of wasting time encoding/decoding.

- rawBytes = encodeUtf8(source);

- charEncodingName = 'utf-8';

- charEncodingCertain = true;

- } else if (source is RandomAccessFile) {

- // TODO(jmesserly): it's unfortunate we need to read all bytes in advance,

- // but it's necessary because of how the UTF decoders work.

- rawBytes = readAllBytesFromFile(source);

- } else if (source is List<int>) {

- rawBytes = source;

- } else {

- // TODO(jmesserly): we should accept some kind of stream API too.

- // Unfortunately dart:io InputStream is async only, which won't work.

- throw new IllegalArgumentException(

- 'source must be a String, RandomAccessFile, or List<int>');

- }

- // Detect encoding iff no explicit "transport level" encoding is supplied

- if (charEncodingName == null) {

- detectEncoding(parseMeta);

- }

- reset();

- }

- void reset() {

- dataStream = null;

- chunk = "";

- chunkOffset = 0;

- errors = [];

- prevNumLines = 0;

- prevNumCols = 0;

- _bufferedCharacter = null;

- }

- void detectEncoding([bool parseMeta = true]) {

- // First look for a BOM

- // This will also read past the BOM if present

- charEncodingName = detectBOM();

- charEncodingCertain = true;

- // If there is no BOM need to look for meta elements with encoding

- // information

- if (charEncodingName === null && parseMeta) {

- charEncodingName = detectEncodingMeta();

- charEncodingCertain = false;

- }

- // If all else fails use the default encoding

- if (charEncodingName === null) {

- charEncodingCertain = false;

- charEncodingName = defaultEncoding;

- }

- // Substitute for equivalent encodings:

- if (charEncodingName.toLowerCase() == "iso-8859-1") {

- charEncodingName = "windows-1252";

- }

- void changeEncoding(String newEncoding) {

- newEncoding = codecName(newEncoding);

- if (const ["utf-16", "utf-16-be", "utf-16-le"].indexOf(newEncoding) >= 0) {

- newEncoding = "utf-8";

- }

- if (newEncoding === null) {

- return;

- } else if (newEncoding == charEncodingName) {

- charEncodingCertain = true;

- } else {

- reset();

- charEncodingName = newEncoding;

- charEncodingCertain = true;

- throw new ReparseException(

- "Encoding changed from $charEncodingName to $newEncoding");

- }

- /**

- * Attempts to detect a BOM at the start of the stream. If

- * an encoding can be determined from the BOM return the name of the

- * encoding otherwise return null.

- */

- String detectBOM() {

- // Try detecting the BOM using bytes from the string

- if (hasUtf8Bom(rawBytes)) {

- return 'utf-8';

- }

- // Note: we don't need to remember whether it was big or little endian

- // because the decoder will do that later. It will also eat the BOM for us.

- if (hasUtf16Bom(rawBytes)) {

- return 'utf-16';

- }

- if (hasUtf32Bom(rawBytes)) {

- return 'utf-32';

- }

- return null;

- }

- /** Report the encoding declared by the meta element. */

- String detectEncodingMeta() {

- var parser = new EncodingParser(slice(rawBytes, 0, numBytesMeta));

- var encoding = parser.getEncoding();

- if (const ["utf-16", "utf-16-be", "utf-16-le"].indexOf(encoding) >= 0) {

- encoding = "utf-8";

- }

- return encoding;

- }

- tree.Span _position(int offset) {

- var nLines = 1;

- var lastLinePos = -1;

- for (int i = 0; i < offset; i++) {

- if (chunk.charCodeAt(i) == NEWLINE) {

- lastLinePos = i;

- nLines++;

- }

- var positionLine = prevNumLines + nLines;

- var positionColumn;

- if (lastLinePos == -1) {

- positionColumn = prevNumCols + offset;

- } else {

- positionColumn = offset - (lastLinePos + 1);

- }

- return new tree.Span(positionLine, positionColumn);

- }

- /** Returns (line, col) of the current position in the stream. */

- tree.Span position() => _position(chunkOffset);

- /**

- * Read one character from the stream or queue if available. Return

- * EOF when EOF is reached.

- */

- String char() {

- // Read a new chunk from the input stream if necessary

- if (chunkOffset >= chunk.length) {

- if (!readChunk()) {

- return EOF;

- }

- return chunk[chunkOffset++];

- }

- // TODO(jmesserly): fix the performance of this method. Lots of things would

- // be better dealt with in the tokenizer. At the very least we should try to

- // avoid so many allocations...

- bool readChunk([int readSize]) {

- if (readSize === null) {

- readSize = _defaultChunkSize;

- }

- var pos = _position(chunk.length);

- prevNumLines = pos.line - 1; // make it 0-based

- prevNumCols = pos.column;

- chunk = "";

- chunkOffset = 0;

- if (dataStream == null) {

- // perform the initial decode

- dataStream = decodeBytes(charEncodingName, rawBytes).iterator();

- }

- var charCodes = [];

- for (int i = 0; i < readSize && dataStream.hasNext(); i++) {

- charCodes.add(dataStream.next());

- }

- var data = codepointsToString(charCodes);

- // Deal with CR LF and surrogates broken across chunks

- if (_bufferedCharacter != null) {

- data = '${_bufferedCharacter}${data}';

- _bufferedCharacter = null;

- } else if (data.length == 0) {

- // We have no more data, bye-bye stream

- return false;

- }

- if (data.length > 1) {

- var lastv = data.charCodeAt(data.length - 1);

- if (lastv == 0x0D || 0xD800 <= lastv && lastv <= 0xDBFF) {

- _bufferedCharacter = data[data.length - 1];

- data = data.substring(0, data.length - 1);

- }

- // Replace invalid characters

- // Note U+0000 is dealt with in the tokenizer

- chunk = replaceCharacters(data);

- return true;

- }

- /**

- * Returns a string of characters from the stream up to but not

- * including any character in 'characters' or EOF.

- */

- String charsUntil(String characters, [bool opposite = false]) {

- // Use a cache of regexps to find the required characters

- var regexpKey = new Pair(characters, opposite ? 'opposite' : '');

- var chars = charsUntilRegEx[regexpKey];

- if (chars == null) {

- escapeChar(c) {

- assert(c < 128);

- var hex = c.toRadixString(16);

- hex = (hex.length == 1) ? "0$hex" : hex;

- return "\\u00$hex";

- }

- var regex = joinStr(characters.charCodes().map(escapeChar));

- if (!opposite) {

- regex = "^${regex}";

- }

- chars = charsUntilRegEx[regexpKey] = new RegExp("^[${regex}]+");

- }

- var rv = [];

- while (true) {

- // Find the longest matching prefix

- // TODO(jmesserly): RegExp does not seem to offer a start offset?

- var searchChunk = chunk.substring(chunkOffset);

- var m = chars.firstMatch(searchChunk);

- if (m === null) {

- // If nothing matched, and it wasn't because we ran out of chunk,

- // then stop

- if (chunkOffset != chunk.length) {

- break;

- }

- } else {

- assert(m.start() == 0);

- var end = m.end();

- // If not the whole chunk matched, return everything

- // up to the part that didn't match

- if (end != chunk.length - chunkOffset) {

- rv.add(searchChunk.substring(0, end));

- chunkOffset += end;

- break;

- }

- // If the whole remainder of the chunk matched,

- // use it all and read the next chunk

- rv.add(searchChunk);

- if (!readChunk()) {

- // Reached EOF

- break;

- }

- return joinStr(rv);

- }

- void unget(String ch) {

- // Only one character is allowed to be ungotten at once - it must

- // be consumed again before any further call to unget

- if (ch != null) {

- if (chunkOffset == 0) {

- // unget is called quite rarely, so it's a good idea to do

- // more work here if it saves a bit of work in the frequently

- // called char and charsUntil.

- // So, just prepend the ungotten character onto the current

- // chunk:

- chunk = '${ch}${chunk}';

- } else {

- chunkOffset -= 1;

- assert(chunk[chunkOffset] == ch);

- }

- String replaceCharacters(String str) {

- // TODO(jmesserly): it'd be nice not to create the array until we know we

- // are replacing something. Also it'd be nice to set the initial capacity.

- var result = <int>[];

- for (int i = 0; i < str.length; i++) {

- var c = str.charCodeAt(i);

- if (invalidUnicode(c)) errors.add("invalid-codepoint");

- if (0xD800 <= c && c <= 0xDFFF) {

- c = 0xFFFD;

- } else if (c == RETURN) {

- int j = i + 1;

- if (j < str.length && str.charCodeAt(j) == NEWLINE) {

- i = j; // \r\n becomes \n

- }

- c = NEWLINE;

- }

- result.add(c);

- }

- return codepointsToString(result);

- }

-// TODO(jmesserly): the Python code used a regex to check for this. But

-// Dart doesn't let you create a regexp with invalid characters.

-bool invalidUnicode(int c) {

- if (0x0001 <= c && c <= 0x0008) return true;

- if (0x000E <= c && c <= 0x001F) return true;

- if (0x007F <= c && c <= 0x009F) return true;

- if (0xD800 <= c && c <= 0xDFFF) return true;

- if (0xFDD0 <= c && c <= 0xFDEF) return true;

- switch (c) {

- case 0x000B: case 0xFFFE: case 0xFFFF: case 0x01FFFE: case 0x01FFFF:

- case 0x02FFFE: case 0x02FFFF: case 0x03FFFE: case 0x03FFFF:

- case 0x04FFFE: case 0x04FFFF: case 0x05FFFE: case 0x05FFFF:

- case 0x06FFFE: case 0x06FFFF: case 0x07FFFE: case 0x07FFFF:

- case 0x08FFFE: case 0x08FFFF: case 0x09FFFE: case 0x09FFFF:

- case 0x0AFFFE: case 0x0AFFFF: case 0x0BFFFE: case 0x0BFFFF:

- case 0x0CFFFE: case 0x0CFFFF: case 0x0DFFFE: case 0x0DFFFF:

- case 0x0EFFFE: case 0x0EFFFF: case 0x0FFFFE: case 0x0FFFFF:

- case 0x10FFFE: case 0x10FFFF:

- return true;

- }

- return false;

-List<int> readAllBytesFromFile(RandomAccessFile file) {

- int length = file.lengthSync();

- var bytes = new List<int>(length);

- int bytesRead = 0;

- while (bytesRead < length) {

- int read = file.readListSync(bytes, bytesRead, length - bytesRead);

- if (read <= 0) {

- // This could happen if, for example, the file was resized while

- // we're reading. Just shrink the bytes array and move on.

- bytes = bytes.getRange(0, bytesRead);

- break;

- }

- bytesRead += read;

- }

- return bytes;

-/**

- * Return the python codec name corresponding to an encoding or null if the

- * string doesn't correspond to a valid encoding.

- */

-String codecName(String encoding) {

- final asciiPunctuation = const RegExp(

- "[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]");

- if (encoding == null) return null;

- var canonicalName = encoding.replaceAll(asciiPunctuation, '').toLowerCase();

- return encodings[canonicalName];

« no previous file with comments | « lib/html5parser.dart ('k') | lib/list_proxy.dart » ('j') | no next file with comments »