Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(439)

Unified Diff: lib/inputstream.dart

Issue 10916294: switch html5lib to new pkg layout (Closed) Base URL: https://github.com/dart-lang/html5lib.git@master
Patch Set: Created 8 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « lib/html5parser.dart ('k') | lib/list_proxy.dart » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: lib/inputstream.dart
diff --git a/lib/inputstream.dart b/lib/inputstream.dart
deleted file mode 100644
index 2cf76175038430fdb234da9cd37c77f75057d23b..0000000000000000000000000000000000000000
--- a/lib/inputstream.dart
+++ /dev/null
@@ -1,433 +0,0 @@
-#library('inputstream');
-
-#import('dart:io');
-#import('dart:utf');
-#import('char_encodings.dart');
-#import('constants.dart');
-#import('utils.dart');
-#import('encoding_parser.dart');
-#import('../treebuilders/simpletree.dart', prefix: 'tree'); // for Span
-
-/**
- * Provides a unicode stream of characters to the HTMLTokenizer.
- *
- * This class takes care of character encoding and removing or replacing
- * incorrect byte-sequences and also provides column and line tracking.
- */
-class HTMLInputStream {
-
- const int _defaultChunkSize = 10240;
-
- /** List of where new lines occur. */
- List newLines;
-
- /**
- * Number of bytes to use when looking for a meta element with
- * encoding information.
- */
- const int numBytesMeta = 512;
-
- /** Encoding to use if no other information can be found. */
- const String defaultEncoding = "windows-1252";
-
- /** The name of the character encoding. */
- String charEncodingName;
-
- /** True if we are certain about [charEncodingName], false for tenative. */
- bool charEncodingCertain = true;
-
- List<int> rawBytes;
-
- Iterator<int> dataStream;
-
- /** Cache for charsUntil() */
- Map<Pair, RegExp> charsUntilRegEx;
-
- List<String> errors;
-
- String chunk;
-
- int chunkOffset;
-
- /** number of (complete) lines in previous chunks */
- int prevNumLines;
-
- /** number of columns in the last line of the previous chunk */
- int prevNumCols;
-
- /** Deals with CR LF and surrogates split over chunk boundaries */
- String _bufferedCharacter;
-
- /**
- * Initialises the HTMLInputStream.
- *
- * HTMLInputStream(source, [encoding]) -> Normalized stream from source
- * for use by html5lib.
- *
- * [source] can be either a [RandomAccessFile], a [String], or a [List<int>]
- * containing the raw bytes.
- *
- * The optional encoding parameter must be a string that indicates
- * the encoding. If specified, that encoding will be used,
- * regardless of any BOM or later declaration (such as in a meta
- * element)
- *
- * [parseMeta] - Look for a <meta> element containing encoding information
- */
- HTMLInputStream(source, [String encoding, bool parseMeta = true])
- : newLines = [0],
- charEncodingName = codecName(encoding),
- charsUntilRegEx = new Map() {
-
- if (source is String) {
- // TODO(jmesserly): if the data is already a string, we should just use
- // the source.charCodes() instead of wasting time encoding/decoding.
- rawBytes = encodeUtf8(source);
- charEncodingName = 'utf-8';
- charEncodingCertain = true;
- } else if (source is RandomAccessFile) {
- // TODO(jmesserly): it's unfortunate we need to read all bytes in advance,
- // but it's necessary because of how the UTF decoders work.
- rawBytes = readAllBytesFromFile(source);
- } else if (source is List<int>) {
- rawBytes = source;
- } else {
- // TODO(jmesserly): we should accept some kind of stream API too.
- // Unfortunately dart:io InputStream is async only, which won't work.
- throw new IllegalArgumentException(
- 'source must be a String, RandomAccessFile, or List<int>');
- }
-
- // Detect encoding iff no explicit "transport level" encoding is supplied
- if (charEncodingName == null) {
- detectEncoding(parseMeta);
- }
-
- reset();
- }
-
- void reset() {
- dataStream = null;
- chunk = "";
- chunkOffset = 0;
- errors = [];
- prevNumLines = 0;
- prevNumCols = 0;
- _bufferedCharacter = null;
- }
-
-
- void detectEncoding([bool parseMeta = true]) {
- // First look for a BOM
- // This will also read past the BOM if present
- charEncodingName = detectBOM();
- charEncodingCertain = true;
-
- // If there is no BOM need to look for meta elements with encoding
- // information
- if (charEncodingName === null && parseMeta) {
- charEncodingName = detectEncodingMeta();
- charEncodingCertain = false;
- }
- // If all else fails use the default encoding
- if (charEncodingName === null) {
- charEncodingCertain = false;
- charEncodingName = defaultEncoding;
- }
-
- // Substitute for equivalent encodings:
- if (charEncodingName.toLowerCase() == "iso-8859-1") {
- charEncodingName = "windows-1252";
- }
- }
-
- void changeEncoding(String newEncoding) {
- newEncoding = codecName(newEncoding);
- if (const ["utf-16", "utf-16-be", "utf-16-le"].indexOf(newEncoding) >= 0) {
- newEncoding = "utf-8";
- }
- if (newEncoding === null) {
- return;
- } else if (newEncoding == charEncodingName) {
- charEncodingCertain = true;
- } else {
- reset();
- charEncodingName = newEncoding;
- charEncodingCertain = true;
- throw new ReparseException(
- "Encoding changed from $charEncodingName to $newEncoding");
- }
- }
-
- /**
- * Attempts to detect at BOM at the start of the stream. If
- * an encoding can be determined from the BOM return the name of the
- * encoding otherwise return null.
- */
- String detectBOM() {
- // Try detecting the BOM using bytes from the string
- if (hasUtf8Bom(rawBytes)) {
- return 'utf-8';
- }
- // Note: we don't need to remember whether it was big or little endian
- // because the decoder will do that later. It will also eat the BOM for us.
- if (hasUtf16Bom(rawBytes)) {
- return 'utf-16';
- }
- if (hasUtf32Bom(rawBytes)) {
- return 'utf-32';
- }
- return null;
- }
-
- /** Report the encoding declared by the meta element. */
- String detectEncodingMeta() {
- var parser = new EncodingParser(slice(rawBytes, 0, numBytesMeta));
- var encoding = parser.getEncoding();
-
- if (const ["utf-16", "utf-16-be", "utf-16-le"].indexOf(encoding) >= 0) {
- encoding = "utf-8";
- }
-
- return encoding;
- }
-
- tree.Span _position(int offset) {
- var nLines = 1;
- var lastLinePos = -1;
- for (int i = 0; i < offset; i++) {
- if (chunk.charCodeAt(i) == NEWLINE) {
- lastLinePos = i;
- nLines++;
- }
- }
- var positionLine = prevNumLines + nLines;
- var positionColumn;
- if (lastLinePos == -1) {
- positionColumn = prevNumCols + offset;
- } else {
- positionColumn = offset - (lastLinePos + 1);
- }
- return new tree.Span(positionLine, positionColumn);
- }
-
- /** Returns (line, col) of the current position in the stream. */
- tree.Span position() => _position(chunkOffset);
-
- /**
- * Read one character from the stream or queue if available. Return
- * EOF when EOF is reached.
- */
- String char() {
- // Read a new chunk from the input stream if necessary
- if (chunkOffset >= chunk.length) {
- if (!readChunk()) {
- return EOF;
- }
- }
-
- return chunk[chunkOffset++];
- }
-
-
- // TODO(jmesserly): fix the performance of this method. Lots of things would
- // be better dealt with in the tokenizer. At the very least we should try to
- // avoid so many allocations...
- bool readChunk([int readSize]) {
- if (readSize === null) {
- readSize = _defaultChunkSize;
- }
-
- var pos = _position(chunk.length);
- prevNumLines = pos.line - 1; // make it 0-based
- prevNumCols = pos.column;
-
- chunk = "";
- chunkOffset = 0;
-
- if (dataStream == null) {
- // perform the initial decode
- dataStream = decodeBytes(charEncodingName, rawBytes).iterator();
- }
- var charCodes = [];
- for (int i = 0; i < readSize && dataStream.hasNext(); i++) {
- charCodes.add(dataStream.next());
- }
- var data = codepointsToString(charCodes);
-
- // Deal with CR LF and surrogates broken across chunks
- if (_bufferedCharacter != null) {
- data = '${_bufferedCharacter}${data}';
- _bufferedCharacter = null;
- } else if (data.length == 0) {
- // We have no more data, bye-bye stream
- return false;
- }
-
- if (data.length > 1) {
- var lastv = data.charCodeAt(data.length - 1);
- if (lastv == 0x0D || 0xD800 <= lastv && lastv <= 0xDBFF) {
- _bufferedCharacter = data[data.length - 1];
- data = data.substring(0, data.length - 1);
- }
- }
-
- // Replace invalid characters
- // Note U+0000 is dealt with in the tokenizer
- chunk = replaceCharacters(data);
-
- return true;
- }
-
- /**
- * Returns a string of characters from the stream up to but not
- * including any character in 'characters' or EOF.
- */
- String charsUntil(String characters, [bool opposite = false]) {
- // Use a cache of regexps to find the required characters
- var regexpKey = new Pair(characters, opposite ? 'opposite' : '');
- var chars = charsUntilRegEx[regexpKey];
-
- if (chars == null) {
- escapeChar(c) {
- assert(c < 128);
- var hex = c.toRadixString(16);
- hex = (hex.length == 1) ? "0$hex" : hex;
- return "\\u00$hex";
- }
- var regex = joinStr(characters.charCodes().map(escapeChar));
- if (!opposite) {
- regex = "^${regex}";
- }
- chars = charsUntilRegEx[regexpKey] = new RegExp("^[${regex}]+");
- }
-
- var rv = [];
- while (true) {
- // Find the longest matching prefix
- // TODO(jmesserly): RegExp does not seem to offer a start offset?
- var searchChunk = chunk.substring(chunkOffset);
- var m = chars.firstMatch(searchChunk);
- if (m === null) {
- // If nothing matched, and it wasn't because we ran out of chunk,
- // then stop
- if (chunkOffset != chunk.length) {
- break;
- }
- } else {
- assert(m.start() == 0);
- var end = m.end();
- // If not the whole chunk matched, return everything
- // up to the part that didn't match
- if (end != chunk.length - chunkOffset) {
- rv.add(searchChunk.substring(0, end));
- chunkOffset += end;
- break;
- }
- }
- // If the whole remainder of the chunk matched,
- // use it all and read the next chunk
- rv.add(searchChunk);
- if (!readChunk()) {
- // Reached EOF
- break;
- }
- }
- return joinStr(rv);
- }
-
- void unget(String ch) {
- // Only one character is allowed to be ungotten at once - it must
- // be consumed again before any further call to unget
- if (ch != null) {
- if (chunkOffset == 0) {
- // unget is called quite rarely, so it's a good idea to do
- // more work here if it saves a bit of work in the frequently
- // called char and charsUntil.
- // So, just prepend the ungotten character onto the current
- // chunk:
- chunk = '${ch}${chunk}';
- } else {
- chunkOffset -= 1;
- assert(chunk[chunkOffset] == ch);
- }
- }
- }
-
- String replaceCharacters(String str) {
- // TODO(jmesserly): it'd be nice not to create the array until we know we
- // are replacing something. Also it'd be nice to set the initial capacity.
- var result = <int>[];
- for (int i = 0; i < str.length; i++) {
- var c = str.charCodeAt(i);
- if (invalidUnicode(c)) errors.add("invalid-codepoint");
-
- if (0xD800 <= c && c <= 0xDFFF) {
- c = 0xFFFD;
- } else if (c == RETURN) {
- int j = i + 1;
- if (j < str.length && str.charCodeAt(j) == NEWLINE) {
- i = j; // \r\n becomes \n
- }
- c = NEWLINE;
- }
- result.add(c);
- }
- return codepointsToString(result);
- }
-}
-
-
-// TODO(jmesserly): the Python code used a regex to check for this. But
-// Dart doesn't let you create a regexp with invalid characters.
-bool invalidUnicode(int c) {
- if (0x0001 <= c && c <= 0x0008) return true;
- if (0x000E <= c && c <= 0x001F) return true;
- if (0x007F <= c && c <= 0x009F) return true;
- if (0xD800 <= c && c <= 0xDFFF) return true;
- if (0xFDD0 <= c && c <= 0xFDEF) return true;
- switch (c) {
- case 0x000B: case 0xFFFE: case 0xFFFF: case 0x01FFFE: case 0x01FFFF:
- case 0x02FFFE: case 0x02FFFF: case 0x03FFFE: case 0x03FFFF:
- case 0x04FFFE: case 0x04FFFF: case 0x05FFFE: case 0x05FFFF:
- case 0x06FFFE: case 0x06FFFF: case 0x07FFFE: case 0x07FFFF:
- case 0x08FFFE: case 0x08FFFF: case 0x09FFFE: case 0x09FFFF:
- case 0x0AFFFE: case 0x0AFFFF: case 0x0BFFFE: case 0x0BFFFF:
- case 0x0CFFFE: case 0x0CFFFF: case 0x0DFFFE: case 0x0DFFFF:
- case 0x0EFFFE: case 0x0EFFFF: case 0x0FFFFE: case 0x0FFFFF:
- case 0x10FFFE: case 0x10FFFF:
- return true;
- }
- return false;
-}
-
-List<int> readAllBytesFromFile(RandomAccessFile file) {
- int length = file.lengthSync();
- var bytes = new List<int>(length);
-
- int bytesRead = 0;
- while (bytesRead < length) {
- int read = file.readListSync(bytes, bytesRead, length - bytesRead);
- if (read <= 0) {
- // This could happen if, for example, the file was resized while
- // we're reading. Just shrink the bytes array and move on.
- bytes = bytes.getRange(0, bytesRead);
- break;
- }
- bytesRead += read;
- }
- return bytes;
-}
-
-/**
- * Return the python codec name corresponding to an encoding or null if the
- * string doesn't correspond to a valid encoding.
- */
-String codecName(String encoding) {
- final asciiPunctuation = const RegExp(
- "[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]");
-
- if (encoding == null) return null;
- var canonicalName = encoding.replaceAll(asciiPunctuation, '').toLowerCase();
- return encodings[canonicalName];
-}
« no previous file with comments | « lib/html5parser.dart ('k') | lib/list_proxy.dart » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698