Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(631)

Side by Side Diff: lib/encoding_parser.dart

Issue 10916294: switch html5lib to new pkg layout (Closed) Base URL: https://github.com/dart-lang/html5lib.git@master
Patch Set: Created 8 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « lib/dom.dart ('k') | lib/html5parser.dart » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 #library('encoding_parser');
2
3 #import('constants.dart');
4 #import('inputstream.dart');
5 #import('utils.dart');
6
7 // TODO(jmesserly): I converted StopIteration to NoMoreElementsException. Seems
8 // strange to throw this from outside of an iterator though.
9 /**
10 * String-like object with an associated position and various extra methods
11 * If the position is ever greater than the string length then an exception is
12 * raised.
13 */
/**
 * String-like object with an associated position and various extra methods.
 * Once the position reaches or passes the end of the string, the
 * position-checking members throw [NoMoreElementsException].
 */
class EncodingBytes implements Iterable<String> {
  final String _bytes;
  int _position;

  /** Wraps [bytes]; the position starts at -1, before the first character. */
  EncodingBytes(String bytes) : _bytes = bytes, _position = -1;

  /**
   * Iterates over the characters of the underlying string. This walks a
   * separate list of characters and does not move the position.
   */
  Iterator<String> iterator() => _bytes.splitChars().iterator();

  /** Number of characters in the underlying string. */
  int get length => _bytes.length;

  /**
   * Advances the position by one and returns the character there. Throws
   * [NoMoreElementsException] when the new position is past the end and
   * [IndexOutOfRangeException] when it is negative.
   */
  String next() {
    var p = _position = _position + 1;
    if (p >= length) {
      throw const NoMoreElementsException();
    } else if (p < 0) {
      throw new IndexOutOfRangeException(p);
    }
    return _bytes[p];
  }

  /**
   * Validates the current position, then moves it back by one and returns
   * the character at the new position.
   */
  String previous() {
    var p = _position;
    if (p >= length) {
      throw const NoMoreElementsException();
    } else if (p < 0) {
      throw new IndexOutOfRangeException(p);
    }
    _position = p = p - 1;
    return _bytes[p];
  }

  /**
   * Sets the position. The check is made against the position held *before*
   * the assignment, so the position may be set to [length], but once it is
   * at or past the end it can no longer be changed through this setter.
   */
  set position(int value) {
    if (_position >= length) {
      throw const NoMoreElementsException();
    }
    _position = value;
  }

  /**
   * The current position, reported as 0 while it is still negative. Throws
   * [NoMoreElementsException] when the position is at or past the end.
   */
  int get position {
    if (_position >= length) {
      throw const NoMoreElementsException();
    }
    if (_position >= 0) {
      return _position;
    } else {
      return 0;
    }
  }

  /** The character at [position]. */
  String get currentByte => _bytes[position];

  /**
   * Skip past a list of characters. Defaults to skipping [isWhitespace].
   * Returns the first character that is not skipped and leaves the position
   * on it, or returns null with the position at the end of the data.
   */
  String skip([CharPreciate skipChars]) {
    if (skipChars == null) skipChars = isWhitespace;
    var p = position; // use property for the error-checking
    while (p < length) {
      var c = _bytes[p];
      if (!skipChars(c)) {
        _position = p;
        return c;
      }
      p += 1;
    }
    _position = p;
    return null;
  }

  /**
   * Advance until a character accepted by [untilChars] is found. Returns that
   * character and leaves the position on it, or returns null with the
   * position at the end of the data.
   */
  String skipUntil(CharPreciate untilChars) {
    var p = position; // use property for the error-checking
    while (p < length) {
      var c = _bytes[p];
      if (untilChars(c)) {
        _position = p;
        return c;
      }
      p += 1;
    }
    // Record that everything was consumed, exactly like [skip] does, so
    // that callers do not re-read input that has already been scanned.
    _position = p;
    return null;
  }

  /**
   * Look for a sequence of bytes at the start of a string. If the bytes
   * are found return true and advance the position to the byte after the
   * match. Otherwise return false and leave the position alone.
   */
  bool matchBytes(String bytes) {
    var p = position;
    if (_bytes.length < p + bytes.length) {
      return false;
    }
    var data = _bytes.substring(p, p + bytes.length);
    if (data == bytes) {
      position += bytes.length;
      return true;
    }
    return false;
  }

  /**
   * Look for the next sequence of bytes matching a given sequence. If
   * a match is found advance the position to the last byte of the match
   * and return true. Throws [NoMoreElementsException] if there is no match.
   */
  bool jumpTo(String bytes) {
    var newPosition = _bytes.indexOf(bytes, position);
    if (newPosition >= 0) {
      _position = newPosition + bytes.length - 1;
      return true;
    } else {
      throw const NoMoreElementsException();
    }
  }

  /**
   * Returns the characters from [start] (inclusive) to [end] (exclusive).
   * [end] defaults to [length]; a negative [end] counts from the end.
   */
  String slice(int start, [int end]) {
    if (end == null) end = length;
    if (end < 0) end += length;
    // String.substring takes an end *index*, not a length.
    return _bytes.substring(start, end);
  }
}
131
132 /** Mini parser for detecting character encoding from meta elements. */
/** Mini parser for detecting character encoding from meta elements. */
class EncodingParser {
  final EncodingBytes data;
  String encoding;

  /** [bytes] - the data to work on for encoding detection. */
  EncodingParser(List<int> bytes)
      // Note: this is intentionally interpreting bytes as codepoints.
      : data = new EncodingBytes(new String.fromCharCodes(bytes).toLowerCase());

  /**
   * Scans [data] for markup and returns the encoding found in a meta
   * element, or null if the scan finishes without finding one.
   */
  String getEncoding() {
    // Order matters: longer/more specific prefixes must be tried first.
    final methodDispatch = [
        ["<!--", handleComment],
        ["<meta", handleMeta],
        ["</", handlePossibleEndTag],
        ["<!", handleOther],
        ["<?", handleOther],
        ["<", handlePossibleStartTag]];

    try {
      while (true) {
        // Step the shared cursor forward. Iterating `data` itself would walk
        // a detached list of characters and leave the cursor where it is, so
        // the scan would never get past a byte that starts no pattern.
        // next() throws NoMoreElementsException at the end of the data.
        data.next();

        var keepParsing = true;
        for (var dispatch in methodDispatch) {
          if (data.matchBytes(dispatch[0])) {
            try {
              keepParsing = dispatch[1]();
            } on NoMoreElementsException catch (e) {
              // The handler ran off the end of the data: stop scanning.
              keepParsing = false;
            }
            break;
          }
        }
        if (!keepParsing) {
          break;
        }
      }
    } on NoMoreElementsException catch (e) {
      // Catch this here to match behavior of Python's StopIteration
    }
    return encoding;
  }

  /** Skip over comments. */
  bool handleComment() => data.jumpTo("-->");

  /**
   * Handles the bytes following `<meta`. Returns false (stop parsing) once
   * an attribute yields a codec name, true otherwise.
   */
  bool handleMeta() {
    if (!isWhitespace(data.currentByte)) {
      // if we have <meta not followed by a space so just keep going
      return true;
    }
    // We have a valid meta element we want to search for attributes
    while (true) {
      // Try to find the next attribute after the current position
      var attr = getAttribute();
      if (attr == null) return true;

      if (attr[0] == "charset") {
        var tentativeEncoding = attr[1];
        var codec = codecName(tentativeEncoding);
        if (codec != null) {
          encoding = codec;
          return false;
        }
      } else if (attr[0] == "content") {
        var contentParser = new ContentAttrParser(new EncodingBytes(attr[1]));
        var tentativeEncoding = contentParser.parse();
        var codec = codecName(tentativeEncoding);
        if (codec != null) {
          encoding = codec;
          return false;
        }
      }
    }
  }

  /** Handles the bytes following `<`. */
  bool handlePossibleStartTag() => handlePossibleTag(false);

  /** Handles the bytes following `</`. */
  bool handlePossibleEndTag() {
    data.next();
    return handlePossibleTag(true);
  }

  /**
   * Skips over a possible tag and all of its attributes. Always returns
   * true (keep parsing).
   */
  bool handlePossibleTag(bool endTag) {
    if (!isLetter(data.currentByte)) {
      // If the next byte is not an ascii letter either ignore this
      // fragment (possible start tag case) or treat it according to
      // handleOther
      if (endTag) {
        data.previous();
        handleOther();
      }
      return true;
    }

    var c = data.skipUntil(isSpaceOrAngleBracket);
    if (c == "<") {
      // return to the first step in the overall "two step" algorithm
      // reprocessing the < byte
      data.previous();
    } else {
      // Read all attributes
      var attr = getAttribute();
      while (attr != null) {
        attr = getAttribute();
      }
    }
    return true;
  }

  /** Skips to the next `>`. */
  bool handleOther() => data.jumpTo(">");

  /**
   * Return a name,value pair for the next attribute in the stream,
   * if one is found, or null
   */
  List<String> getAttribute() {
    // Step 1 (skip chars)
    var c = data.skip((x) => x == "/" || isWhitespace(x));
    // Step 2
    if (c == ">" || c == null) {
      return null;
    }
    // Step 3
    var attrName = [];
    var attrValue = [];
    // Step 4 attribute name
    while (true) {
      if (c == null) {
        return null;
      } else if (c == "=" && attrName.length > 0) {
        break;
      } else if (isWhitespace(c)) {
        // Step 6! skip() already leaves the position on the first
        // non-space byte and returns it; stepping once more would drop the
        // "=" of `name = value` and lose the attribute value.
        c = data.skip();
        break;
      } else if (c == "/" || c == ">") {
        return [joinStr(attrName), ""];
      } else if (isLetter(c)) {
        attrName.add(c.toLowerCase());
      } else {
        attrName.add(c);
      }
      // Step 5
      c = data.next();
    }
    // Step 7
    if (c != "=") {
      data.previous();
      return [joinStr(attrName), ""];
    }
    // Step 8
    data.next();
    // Step 9
    c = data.skip();
    // Step 10
    if (c == "'" || c == '"') {
      // 10.1
      var quoteChar = c;
      while (true) {
        // 10.2
        c = data.next();
        if (c == quoteChar) {
          // 10.3
          data.next();
          return [joinStr(attrName), joinStr(attrValue)];
        } else if (isLetter(c)) {
          // 10.4
          attrValue.add(c.toLowerCase());
        } else {
          // 10.5
          attrValue.add(c);
        }
      }
    } else if (c == ">") {
      return [joinStr(attrName), ""];
    } else if (c == null) {
      return null;
    } else if (isLetter(c)) {
      attrValue.add(c.toLowerCase());
    } else {
      attrValue.add(c);
    }
    // Step 11
    while (true) {
      c = data.next();
      if (isSpaceOrAngleBracket(c)) {
        return [joinStr(attrName), joinStr(attrValue)];
      } else if (c == null) {
        return null;
      } else if (isLetter(c)) {
        attrValue.add(c.toLowerCase());
      } else {
        attrValue.add(c);
      }
    }
  }
}
331
332
/**
 * Extracts the value of a `charset=` parameter from the text of a meta
 * element's content attribute.
 */
class ContentAttrParser {
  final EncodingBytes data;

  ContentAttrParser(this.data);

  /**
   * Returns the charset value found in [data], or null when there is no
   * `charset` parameter, no `=` after it, or the data ends prematurely.
   */
  String parse() {
    try {
      // Check if the attr name is charset
      // otherwise return
      data.jumpTo("charset");
      data.position += 1;
      data.skip();
      if (data.currentByte != "=") {
        // If there is no = sign keep looking for attrs
        return null;
      }
      data.position += 1;
      data.skip();
      // Look for an encoding between matching quote marks
      if (data.currentByte == '"' || data.currentByte == "'") {
        var quoteMark = data.currentByte;
        data.position += 1;
        var oldPosition = data.position;
        if (data.jumpTo(quoteMark)) {
          return data.slice(oldPosition, data.position);
        } else {
          return null;
        }
      } else {
        // Unquoted value
        var oldPosition = data.position;
        try {
          if (data.skipUntil(isWhitespace) == null) {
            // No whitespace terminates the value, so it runs to the end of
            // the data. Do not rely on an exception to detect this: a null
            // result is the signal that nothing was found.
            return data.slice(oldPosition);
          }
          return data.slice(oldPosition, data.position);
        } on NoMoreElementsException catch (e) {
          // Return the whole remaining value
          return data.slice(oldPosition);
        }
      }
    } on NoMoreElementsException catch (e) {
      return null;
    }
  }
}
377
378
/**
 * True when [char] is `>` or `<`, or is accepted by [isWhitespace].
 */
bool isSpaceOrAngleBracket(String char) =>
    char == ">" || char == "<" || isWhitespace(char);
382
/**
 * Signature of a single-character test: takes one character as a [String]
 * and returns whether it matches. Used as the parameter type of
 * [EncodingBytes.skip] and [EncodingBytes.skipUntil].
 */
383 typedef bool CharPreciate(String char);
OLDNEW
« no previous file with comments | « lib/dom.dart ('k') | lib/html5parser.dart » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698