OLD | NEW |
1 library inputstream; | 1 library inputstream; |
2 | 2 |
3 import 'dart:utf'; | 3 import 'dart:utf'; |
4 import 'package:html5lib/dom_parsing.dart' show SourceFileInfo; | 4 import 'package:html5lib/dom_parsing.dart' show SourceFileInfo; |
5 import 'char_encodings.dart'; | 5 import 'char_encodings.dart'; |
6 import 'constants.dart'; | 6 import 'constants.dart'; |
7 import 'utils.dart'; | 7 import 'utils.dart'; |
8 import 'encoding_parser.dart'; | 8 import 'encoding_parser.dart'; |
9 | 9 |
10 /** Hooks to call into dart:io without directly referencing it. */ | 10 /** Hooks to call into dart:io without directly referencing it. */ |
(...skipping 75 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
86 * element) | 86 * element) |
87 * | 87 * |
88 * [parseMeta] - Look for a <meta> element containing encoding information | 88 * [parseMeta] - Look for a <meta> element containing encoding information |
89 */ | 89 */ |
90 HtmlInputStream(source, [String encoding, bool parseMeta = true, | 90 HtmlInputStream(source, [String encoding, bool parseMeta = true, |
91 this.generateSpans = false]) | 91 this.generateSpans = false]) |
92 : charEncodingName = codecName(encoding) { | 92 : charEncodingName = codecName(encoding) { |
93 | 93 |
94 if (source is String) { | 94 if (source is String) { |
95 // TODO(jmesserly): if the data is already a string, we should just use | 95 // TODO(jmesserly): if the data is already a string, we should just use |
96 // the source.charCodes() instead of wasting time encoding/decoding. | 96 // the source.charCodes instead of wasting time encoding/decoding. |
97 rawBytes = encodeUtf8(source); | 97 rawBytes = encodeUtf8(source); |
98 charEncodingName = 'utf-8'; | 98 charEncodingName = 'utf-8'; |
99 charEncodingCertain = true; | 99 charEncodingCertain = true; |
100 } else if (source is List<int>) { | 100 } else if (source is List<int>) { |
101 rawBytes = source; | 101 rawBytes = source; |
102 } else { | 102 } else { |
103 // TODO(jmesserly): it's unfortunate we need to read all bytes in advance, | 103 // TODO(jmesserly): it's unfortunate we need to read all bytes in advance, |
104 // but it's necessary because of how the UTF decoders work. | 104 // but it's necessary because of how the UTF decoders work. |
105 rawBytes = consoleSupport.bytesFromFile(source); | 105 rawBytes = consoleSupport.bytesFromFile(source); |
106 | 106 |
(...skipping 192 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
299 var regexpKey = new Pair(characters, opposite ? 'opposite' : ''); | 299 var regexpKey = new Pair(characters, opposite ? 'opposite' : ''); |
300 var chars = charsUntilRegEx[regexpKey]; | 300 var chars = charsUntilRegEx[regexpKey]; |
301 | 301 |
302 if (chars == null) { | 302 if (chars == null) { |
303 escapeChar(c) { | 303 escapeChar(c) { |
304 assert(c < 128); | 304 assert(c < 128); |
305 var hex = c.toRadixString(16); | 305 var hex = c.toRadixString(16); |
306 hex = (hex.length == 1) ? "0$hex" : hex; | 306 hex = (hex.length == 1) ? "0$hex" : hex; |
307 return "\\u00$hex"; | 307 return "\\u00$hex"; |
308 } | 308 } |
309 var regex = joinStr(characters.charCodes().map(escapeChar)); | 309 var regex = joinStr(characters.charCodes.map(escapeChar)); |
310 if (!opposite) { | 310 if (!opposite) { |
311 regex = "^${regex}"; | 311 regex = "^${regex}"; |
312 } | 312 } |
313 chars = charsUntilRegEx[regexpKey] = new RegExp("^[${regex}]+"); | 313 chars = charsUntilRegEx[regexpKey] = new RegExp("^[${regex}]+"); |
314 } | 314 } |
315 | 315 |
316 var rv = []; | 316 var rv = []; |
317 while (true) { | 317 while (true) { |
318 // Find the longest matching prefix | 318 // Find the longest matching prefix |
319 // TODO(jmesserly): RegExp does not seem to offer a start offset? | 319 // TODO(jmesserly): RegExp does not seem to offer a start offset? |
320 var searchChunk = chunk.substring(chunkOffset); | 320 var searchChunk = chunk.substring(chunkOffset); |
321 var m = chars.firstMatch(searchChunk); | 321 var m = chars.firstMatch(searchChunk); |
322 if (m === null) { | 322 if (m === null) { |
323 // If nothing matched, and it wasn't because we ran out of chunk, | 323 // If nothing matched, and it wasn't because we ran out of chunk, |
324 // then stop | 324 // then stop |
325 if (chunkOffset != chunk.length) { | 325 if (chunkOffset != chunk.length) { |
326 break; | 326 break; |
327 } | 327 } |
328 } else { | 328 } else { |
329 assert(m.start() == 0); | 329 assert(m.start == 0); |
330 var end = m.end(); | 330 var end = m.end; |
331 // If not the whole chunk matched, return everything | 331 // If not the whole chunk matched, return everything |
332 // up to the part that didn't match | 332 // up to the part that didn't match |
333 if (end != chunk.length - chunkOffset) { | 333 if (end != chunk.length - chunkOffset) { |
334 rv.add(searchChunk.substring(0, end)); | 334 rv.add(searchChunk.substring(0, end)); |
335 chunkOffset += end; | 335 chunkOffset += end; |
336 break; | 336 break; |
337 } | 337 } |
338 } | 338 } |
339 // If the whole remainder of the chunk matched, | 339 // If the whole remainder of the chunk matched, |
340 // use it all and read the next chunk | 340 // use it all and read the next chunk |
(...skipping 76 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
417 * string doesn't correspond to a valid encoding. | 417 * string doesn't correspond to a valid encoding. |
418 */ | 418 */ |
String codecName(String encoding) {
  // No label supplied: there is nothing to canonicalize or look up.
  if (encoding == null) return null;

  // Matches ASCII whitespace and punctuation (tab..CR, space../, :..@,
  // [..`, {..~). These characters are dropped before the lookup.
  final punctuation = const RegExp(
      "[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]");

  // Build the lookup key in two explicit steps: strip, then lower-case.
  var stripped = encoding.replaceAll(punctuation, '');
  var lookupKey = stripped.toLowerCase();

  // `encodings` is defined outside this function; the lookup result is
  // returned unchanged.
  return encodings[lookupKey];
}
OLD | NEW |