OLD | NEW |
1 library tokenizer; | 1 library tokenizer; |
2 | 2 |
3 import 'dart:math'; | 3 import 'dart:math'; |
4 import 'package:html5lib/dom_parsing.dart' show SourceSpan; | 4 import 'package:html5lib/dom_parsing.dart' show SourceSpan; |
5 import 'package:html5lib/parser.dart' show HtmlParser; | 5 import 'package:html5lib/parser.dart' show HtmlParser; |
6 import 'constants.dart'; | 6 import 'constants.dart'; |
7 import 'inputstream.dart'; | 7 import 'inputstream.dart'; |
8 import 'token.dart'; | 8 import 'token.dart'; |
9 import 'utils.dart'; | 9 import 'utils.dart'; |
10 | 10 |
11 // Group entities by their first character, for faster lookups | 11 // Group entities by their first character, for faster lookups |
12 | 12 |
13 // TODO(jmesserly): we could use a better data structure here like a trie, if | 13 // TODO(jmesserly): we could use a better data structure here like a trie, if |
14 // we had it implemented in Dart. | 14 // we had it implemented in Dart. |
15 Map<String, List<String>> entitiesByFirstChar = (() { | 15 Map<String, List<String>> entitiesByFirstChar = (() { |
16 var result = {}; | 16 var result = {}; |
17 for (var k in entities.getKeys()) { | 17 for (var k in entities.keys) { |
18 result.putIfAbsent(k[0], () => []).add(k); | 18 result.putIfAbsent(k[0], () => []).add(k); |
19 } | 19 } |
20 return result; | 20 return result; |
21 })(); | 21 })(); |
22 | 22 |
23 // TODO(jmesserly): lots of ways to make this faster: | 23 // TODO(jmesserly): lots of ways to make this faster: |
24 // - use char codes everywhere instead of 1-char strings | 24 // - use char codes everywhere instead of 1-char strings |
25 // - use switch instead of contains, indexOf | 25 // - use switch instead of contains, indexOf |
26 // - use switch instead of the sequential if tests | 26 // - use switch instead of the sequential if tests |
27 // - avoid string concat | 27 // - avoid string concat |
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
64 HtmlTokenizer(doc, | 64 HtmlTokenizer(doc, |
65 [String encoding, bool parseMeta = true, | 65 [String encoding, bool parseMeta = true, |
66 this.lowercaseElementName = true, this.lowercaseAttrName = true, | 66 this.lowercaseElementName = true, this.lowercaseAttrName = true, |
67 bool generateSpans = false]) | 67 bool generateSpans = false]) |
68 : stream = new HtmlInputStream(doc, encoding, parseMeta, generateSpans), | 68 : stream = new HtmlInputStream(doc, encoding, parseMeta, generateSpans), |
69 tokenQueue = new Queue(), | 69 tokenQueue = new Queue(), |
70 generateSpans = generateSpans { | 70 generateSpans = generateSpans { |
71 reset(); | 71 reset(); |
72 } | 72 } |
73 | 73 |
74 get lastData => currentToken.data.last(); | 74 get lastData => currentToken.data.last; |
75 | 75 |
76 TagToken get currentTagToken => currentToken; | 76 TagToken get currentTagToken => currentToken; |
77 DoctypeToken get currentDoctypeToken => currentToken; | 77 DoctypeToken get currentDoctypeToken => currentToken; |
78 | 78 |
79 bool get hasNext { | 79 bool get hasNext { |
80 if (stream.errors.length > 0) return true; | 80 if (stream.errors.length > 0) return true; |
81 if (tokenQueue.length > 0) return true; | 81 if (tokenQueue.length > 0) return true; |
82 // Start processing. When EOF is reached state will return false; | 82 // Start processing. When EOF is reached state will return false; |
83 // instead of true and the loop will terminate. | 83 // instead of true and the loop will terminate. |
84 do { | 84 do { |
(...skipping 119 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
204 var output = "&"; | 204 var output = "&"; |
205 | 205 |
206 var charStack = [stream.char()]; | 206 var charStack = [stream.char()]; |
207 if (isWhitespace(charStack[0]) || charStack[0] == '<' || charStack[0] == '&' | 207 if (isWhitespace(charStack[0]) || charStack[0] == '<' || charStack[0] == '&' |
208 || charStack[0] == EOF || allowedChar == charStack[0]) { | 208 || charStack[0] == EOF || allowedChar == charStack[0]) { |
209 stream.unget(charStack[0]); | 209 stream.unget(charStack[0]); |
210 } else if (charStack[0] == "#") { | 210 } else if (charStack[0] == "#") { |
211 // Read the next character to see if it's hex or decimal | 211 // Read the next character to see if it's hex or decimal |
212 bool hex = false; | 212 bool hex = false; |
213 charStack.add(stream.char()); | 213 charStack.add(stream.char()); |
214 if (charStack.last() == 'x' || charStack.last() == 'X') { | 214 if (charStack.last == 'x' || charStack.last == 'X') { |
215 hex = true; | 215 hex = true; |
216 charStack.add(stream.char()); | 216 charStack.add(stream.char()); |
217 } | 217 } |
218 | 218 |
219 // charStack.last() should be the first digit | 219 // charStack.last should be the first digit |
220 if (hex && isHexDigit(charStack.last()) || | 220 if (hex && isHexDigit(charStack.last) || |
221 (!hex && isDigit(charStack.last()))) { | 221 (!hex && isDigit(charStack.last))) { |
222 // At least one digit found, so consume the whole number | 222 // At least one digit found, so consume the whole number |
223 stream.unget(charStack.last()); | 223 stream.unget(charStack.last); |
224 output = consumeNumberEntity(hex); | 224 output = consumeNumberEntity(hex); |
225 } else { | 225 } else { |
226 // No digits found | 226 // No digits found |
227 _addToken(new ParseErrorToken("expected-numeric-entity")); | 227 _addToken(new ParseErrorToken("expected-numeric-entity")); |
228 stream.unget(charStack.removeLast()); | 228 stream.unget(charStack.removeLast()); |
229 output = "&${joinStr(charStack)}"; | 229 output = "&${joinStr(charStack)}"; |
230 } | 230 } |
231 } else { | 231 } else { |
232 // At this point in the process might have named entity. Entities | 232 // At this point in the process might have named entity. Entities |
233 // are stored in the global variable "entities". | 233 // are stored in the global variable "entities". |
234 // | 234 // |
235 // Consume characters and compare to these to a substring of the | 235 // Consume characters and compare to these to a substring of the |
236 // entity names in the list until the substring no longer matches. | 236 // entity names in the list until the substring no longer matches. |
237 var filteredEntityList = entitiesByFirstChar[charStack[0]]; | 237 var filteredEntityList = entitiesByFirstChar[charStack[0]]; |
238 if (filteredEntityList == null) filteredEntityList = const []; | 238 if (filteredEntityList == null) filteredEntityList = const []; |
239 | 239 |
240 while (charStack.last() !== EOF) { | 240 while (charStack.last !== EOF) { |
241 var name = joinStr(charStack); | 241 var name = joinStr(charStack); |
242 filteredEntityList = filteredEntityList.filter( | 242 filteredEntityList = filteredEntityList.filter( |
243 (e) => e.startsWith(name)); | 243 (e) => e.startsWith(name)); |
244 | 244 |
245 if (filteredEntityList.length == 0) { | 245 if (filteredEntityList.length == 0) { |
246 break; | 246 break; |
247 } | 247 } |
248 charStack.add(stream.char()); | 248 charStack.add(stream.char()); |
249 } | 249 } |
250 | 250 |
(...skipping 953 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1204 | 1204 |
1205 // Eat the character directly after the bogus comment which is either a | 1205 // Eat the character directly after the bogus comment which is either a |
1206 // ">" or an EOF. | 1206 // ">" or an EOF. |
1207 stream.char(); | 1207 stream.char(); |
1208 state = dataState; | 1208 state = dataState; |
1209 return true; | 1209 return true; |
1210 } | 1210 } |
1211 | 1211 |
1212 bool markupDeclarationOpenState() { | 1212 bool markupDeclarationOpenState() { |
1213 var charStack = [stream.char()]; | 1213 var charStack = [stream.char()]; |
1214 if (charStack.last() == "-") { | 1214 if (charStack.last == "-") { |
1215 charStack.add(stream.char()); | 1215 charStack.add(stream.char()); |
1216 if (charStack.last() == "-") { | 1216 if (charStack.last == "-") { |
1217 currentToken = new CommentToken(""); | 1217 currentToken = new CommentToken(""); |
1218 state = commentStartState; | 1218 state = commentStartState; |
1219 return true; | 1219 return true; |
1220 } | 1220 } |
1221 } else if (charStack.last() == 'd' || charStack.last() == 'D') { | 1221 } else if (charStack.last == 'd' || charStack.last == 'D') { |
1222 var matched = true; | 1222 var matched = true; |
1223 for (var expected in const ['oO', 'cC', 'tT', 'yY', 'pP', 'eE']) { | 1223 for (var expected in const ['oO', 'cC', 'tT', 'yY', 'pP', 'eE']) { |
1224 var char = stream.char(); | 1224 var char = stream.char(); |
1225 charStack.add(char); | 1225 charStack.add(char); |
1226 if (char == EOF || !expected.contains(char)) { | 1226 if (char == EOF || !expected.contains(char)) { |
1227 matched = false; | 1227 matched = false; |
1228 break; | 1228 break; |
1229 } | 1229 } |
1230 } | 1230 } |
1231 if (matched) { | 1231 if (matched) { |
1232 currentToken = new DoctypeToken(correct: true); | 1232 currentToken = new DoctypeToken(correct: true); |
1233 state = doctypeState; | 1233 state = doctypeState; |
1234 return true; | 1234 return true; |
1235 } | 1235 } |
1236 } else if (charStack.last() == "[" && | 1236 } else if (charStack.last == "[" && |
1237 parser !== null && parser.tree.openElements.length > 0 && | 1237 parser !== null && parser.tree.openElements.length > 0 && |
1238 parser.tree.openElements.last().namespace | 1238 parser.tree.openElements.last.namespace |
1239 != parser.tree.defaultNamespace) { | 1239 != parser.tree.defaultNamespace) { |
1240 var matched = true; | 1240 var matched = true; |
1241 for (var expected in const ["C", "D", "A", "T", "A", "["]) { | 1241 for (var expected in const ["C", "D", "A", "T", "A", "["]) { |
1242 charStack.add(stream.char()); | 1242 charStack.add(stream.char()); |
1243 if (charStack.last() != expected) { | 1243 if (charStack.last != expected) { |
1244 matched = false; | 1244 matched = false; |
1245 break; | 1245 break; |
1246 } | 1246 } |
1247 } | 1247 } |
1248 if (matched) { | 1248 if (matched) { |
1249 state = cdataSectionState; | 1249 state = cdataSectionState; |
1250 return true; | 1250 return true; |
1251 } | 1251 } |
1252 } | 1252 } |
1253 | 1253 |
(...skipping 572 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1826 } | 1826 } |
1827 | 1827 |
1828 if (data.length > 0) { | 1828 if (data.length > 0) { |
1829 _addToken(new CharactersToken(joinStr(data))); | 1829 _addToken(new CharactersToken(joinStr(data))); |
1830 } | 1830 } |
1831 state = dataState; | 1831 state = dataState; |
1832 return true; | 1832 return true; |
1833 } | 1833 } |
1834 } | 1834 } |
1835 | 1835 |
OLD | NEW |