Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(614)

Side by Side Diff: lib/src/tokenizer.dart

Issue 11260039: Advance html5lib to newest breaking changes in core: getKeys -> keys, etc (Closed) Base URL: git@github.com:dart-lang/html5lib.git@master
Patch Set: Created 8 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « lib/src/list_proxy.dart ('k') | lib/src/treebuilder.dart » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 library tokenizer; 1 library tokenizer;
2 2
3 import 'dart:math'; 3 import 'dart:math';
4 import 'package:html5lib/dom_parsing.dart' show SourceSpan; 4 import 'package:html5lib/dom_parsing.dart' show SourceSpan;
5 import 'package:html5lib/parser.dart' show HtmlParser; 5 import 'package:html5lib/parser.dart' show HtmlParser;
6 import 'constants.dart'; 6 import 'constants.dart';
7 import 'inputstream.dart'; 7 import 'inputstream.dart';
8 import 'token.dart'; 8 import 'token.dart';
9 import 'utils.dart'; 9 import 'utils.dart';
10 10
11 // Group entities by their first character, for faster lookups 11 // Group entities by their first character, for faster lookups
12 12
13 // TODO(jmesserly): we could use a better data structure here like a trie, if 13 // TODO(jmesserly): we could use a better data structure here like a trie, if
14 // we had it implemented in Dart. 14 // we had it implemented in Dart.
15 Map<String, List<String>> entitiesByFirstChar = (() { 15 Map<String, List<String>> entitiesByFirstChar = (() {
16 var result = {}; 16 var result = {};
17 for (var k in entities.getKeys()) { 17 for (var k in entities.keys) {
18 result.putIfAbsent(k[0], () => []).add(k); 18 result.putIfAbsent(k[0], () => []).add(k);
19 } 19 }
20 return result; 20 return result;
21 })(); 21 })();
22 22
23 // TODO(jmesserly): lots of ways to make this faster: 23 // TODO(jmesserly): lots of ways to make this faster:
24 // - use char codes everywhere instead of 1-char strings 24 // - use char codes everywhere instead of 1-char strings
25 // - use switch instead of contains, indexOf 25 // - use switch instead of contains, indexOf
26 // - use switch instead of the sequential if tests 26 // - use switch instead of the sequential if tests
27 // - avoid string concat 27 // - avoid string concat
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after
64 HtmlTokenizer(doc, 64 HtmlTokenizer(doc,
65 [String encoding, bool parseMeta = true, 65 [String encoding, bool parseMeta = true,
66 this.lowercaseElementName = true, this.lowercaseAttrName = true, 66 this.lowercaseElementName = true, this.lowercaseAttrName = true,
67 bool generateSpans = false]) 67 bool generateSpans = false])
68 : stream = new HtmlInputStream(doc, encoding, parseMeta, generateSpans), 68 : stream = new HtmlInputStream(doc, encoding, parseMeta, generateSpans),
69 tokenQueue = new Queue(), 69 tokenQueue = new Queue(),
70 generateSpans = generateSpans { 70 generateSpans = generateSpans {
71 reset(); 71 reset();
72 } 72 }
73 73
74 get lastData => currentToken.data.last(); 74 get lastData => currentToken.data.last;
75 75
76 TagToken get currentTagToken => currentToken; 76 TagToken get currentTagToken => currentToken;
77 DoctypeToken get currentDoctypeToken => currentToken; 77 DoctypeToken get currentDoctypeToken => currentToken;
78 78
79 bool get hasNext { 79 bool get hasNext {
80 if (stream.errors.length > 0) return true; 80 if (stream.errors.length > 0) return true;
81 if (tokenQueue.length > 0) return true; 81 if (tokenQueue.length > 0) return true;
82 // Start processing. When EOF is reached, state will return false 82 // Start processing. When EOF is reached, state will return false
83 // instead of true and the loop will terminate. 83 // instead of true and the loop will terminate.
84 do { 84 do {
(...skipping 119 matching lines...) Expand 10 before | Expand all | Expand 10 after
204 var output = "&"; 204 var output = "&";
205 205
206 var charStack = [stream.char()]; 206 var charStack = [stream.char()];
207 if (isWhitespace(charStack[0]) || charStack[0] == '<' || charStack[0] == '&' 207 if (isWhitespace(charStack[0]) || charStack[0] == '<' || charStack[0] == '&'
208 || charStack[0] == EOF || allowedChar == charStack[0]) { 208 || charStack[0] == EOF || allowedChar == charStack[0]) {
209 stream.unget(charStack[0]); 209 stream.unget(charStack[0]);
210 } else if (charStack[0] == "#") { 210 } else if (charStack[0] == "#") {
211 // Read the next character to see if it's hex or decimal 211 // Read the next character to see if it's hex or decimal
212 bool hex = false; 212 bool hex = false;
213 charStack.add(stream.char()); 213 charStack.add(stream.char());
214 if (charStack.last() == 'x' || charStack.last() == 'X') { 214 if (charStack.last == 'x' || charStack.last == 'X') {
215 hex = true; 215 hex = true;
216 charStack.add(stream.char()); 216 charStack.add(stream.char());
217 } 217 }
218 218
219 // charStack.last() should be the first digit 219 // charStack.last should be the first digit
220 if (hex && isHexDigit(charStack.last()) || 220 if (hex && isHexDigit(charStack.last) ||
221 (!hex && isDigit(charStack.last()))) { 221 (!hex && isDigit(charStack.last))) {
222 // At least one digit found, so consume the whole number 222 // At least one digit found, so consume the whole number
223 stream.unget(charStack.last()); 223 stream.unget(charStack.last);
224 output = consumeNumberEntity(hex); 224 output = consumeNumberEntity(hex);
225 } else { 225 } else {
226 // No digits found 226 // No digits found
227 _addToken(new ParseErrorToken("expected-numeric-entity")); 227 _addToken(new ParseErrorToken("expected-numeric-entity"));
228 stream.unget(charStack.removeLast()); 228 stream.unget(charStack.removeLast());
229 output = "&${joinStr(charStack)}"; 229 output = "&${joinStr(charStack)}";
230 } 230 }
231 } else { 231 } else {
232 // At this point in the process we might have a named entity. Entities 232 // At this point in the process we might have a named entity. Entities
233 // are stored in the global variable "entities". 233 // are stored in the global variable "entities".
234 // 234 //
235 // Consume characters and compare these to a substring of the 235 // Consume characters and compare these to a substring of the
236 // entity names in the list until the substring no longer matches. 236 // entity names in the list until the substring no longer matches.
237 var filteredEntityList = entitiesByFirstChar[charStack[0]]; 237 var filteredEntityList = entitiesByFirstChar[charStack[0]];
238 if (filteredEntityList == null) filteredEntityList = const []; 238 if (filteredEntityList == null) filteredEntityList = const [];
239 239
240 while (charStack.last() !== EOF) { 240 while (charStack.last !== EOF) {
241 var name = joinStr(charStack); 241 var name = joinStr(charStack);
242 filteredEntityList = filteredEntityList.filter( 242 filteredEntityList = filteredEntityList.filter(
243 (e) => e.startsWith(name)); 243 (e) => e.startsWith(name));
244 244
245 if (filteredEntityList.length == 0) { 245 if (filteredEntityList.length == 0) {
246 break; 246 break;
247 } 247 }
248 charStack.add(stream.char()); 248 charStack.add(stream.char());
249 } 249 }
250 250
(...skipping 953 matching lines...) Expand 10 before | Expand all | Expand 10 after
1204 1204
1205 // Eat the character directly after the bogus comment which is either a 1205 // Eat the character directly after the bogus comment which is either a
1206 // ">" or an EOF. 1206 // ">" or an EOF.
1207 stream.char(); 1207 stream.char();
1208 state = dataState; 1208 state = dataState;
1209 return true; 1209 return true;
1210 } 1210 }
1211 1211
1212 bool markupDeclarationOpenState() { 1212 bool markupDeclarationOpenState() {
1213 var charStack = [stream.char()]; 1213 var charStack = [stream.char()];
1214 if (charStack.last() == "-") { 1214 if (charStack.last == "-") {
1215 charStack.add(stream.char()); 1215 charStack.add(stream.char());
1216 if (charStack.last() == "-") { 1216 if (charStack.last == "-") {
1217 currentToken = new CommentToken(""); 1217 currentToken = new CommentToken("");
1218 state = commentStartState; 1218 state = commentStartState;
1219 return true; 1219 return true;
1220 } 1220 }
1221 } else if (charStack.last() == 'd' || charStack.last() == 'D') { 1221 } else if (charStack.last == 'd' || charStack.last == 'D') {
1222 var matched = true; 1222 var matched = true;
1223 for (var expected in const ['oO', 'cC', 'tT', 'yY', 'pP', 'eE']) { 1223 for (var expected in const ['oO', 'cC', 'tT', 'yY', 'pP', 'eE']) {
1224 var char = stream.char(); 1224 var char = stream.char();
1225 charStack.add(char); 1225 charStack.add(char);
1226 if (char == EOF || !expected.contains(char)) { 1226 if (char == EOF || !expected.contains(char)) {
1227 matched = false; 1227 matched = false;
1228 break; 1228 break;
1229 } 1229 }
1230 } 1230 }
1231 if (matched) { 1231 if (matched) {
1232 currentToken = new DoctypeToken(correct: true); 1232 currentToken = new DoctypeToken(correct: true);
1233 state = doctypeState; 1233 state = doctypeState;
1234 return true; 1234 return true;
1235 } 1235 }
1236 } else if (charStack.last() == "[" && 1236 } else if (charStack.last == "[" &&
1237 parser !== null && parser.tree.openElements.length > 0 && 1237 parser !== null && parser.tree.openElements.length > 0 &&
1238 parser.tree.openElements.last().namespace 1238 parser.tree.openElements.last.namespace
1239 != parser.tree.defaultNamespace) { 1239 != parser.tree.defaultNamespace) {
1240 var matched = true; 1240 var matched = true;
1241 for (var expected in const ["C", "D", "A", "T", "A", "["]) { 1241 for (var expected in const ["C", "D", "A", "T", "A", "["]) {
1242 charStack.add(stream.char()); 1242 charStack.add(stream.char());
1243 if (charStack.last() != expected) { 1243 if (charStack.last != expected) {
1244 matched = false; 1244 matched = false;
1245 break; 1245 break;
1246 } 1246 }
1247 } 1247 }
1248 if (matched) { 1248 if (matched) {
1249 state = cdataSectionState; 1249 state = cdataSectionState;
1250 return true; 1250 return true;
1251 } 1251 }
1252 } 1252 }
1253 1253
(...skipping 572 matching lines...) Expand 10 before | Expand all | Expand 10 after
1826 } 1826 }
1827 1827
1828 if (data.length > 0) { 1828 if (data.length > 0) {
1829 _addToken(new CharactersToken(joinStr(data))); 1829 _addToken(new CharactersToken(joinStr(data)));
1830 } 1830 }
1831 state = dataState; 1831 state = dataState;
1832 return true; 1832 return true;
1833 } 1833 }
1834 } 1834 }
1835 1835
OLDNEW
« no previous file with comments | « lib/src/list_proxy.dart ('k') | lib/src/treebuilder.dart » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698