| OLD | NEW |
| (Empty) |
| 1 #library('encoding_parser'); | |
| 2 | |
| 3 #import('constants.dart'); | |
| 4 #import('inputstream.dart'); | |
| 5 #import('utils.dart'); | |
| 6 | |
| 7 // TODO(jmesserly): I converted StopIteration to NoMoreElementsException. Seems | |
| 8 // strange to throw this from outside of an iterator though. | |
| 9 /** | |
| 10 * String-like object with an associated position and various extra methods. | |
| 11 * If the position is ever greater than the string length then an exception is | |
| 12 * raised. | |
| 13 */ | |
| 14 class EncodingBytes implements Iterable<String> { | |
| 15 final String _bytes; // the data being scanned | |
| 16 int _position; // index of the current byte; starts at -1, before the first byte | |
| 17 | |
| 18 EncodingBytes(String bytes) : _bytes = bytes, _position = -1; | |
| 19 | |
| 20 Iterator<String> iterator() => _bytes.splitChars().iterator(); // does not move [position] | |
| 21 int get length => _bytes.length; | |
| 22 | |
| 23 String next() { // advance the position by one and return the byte there | |
| 24 var p = _position = _position + 1; | |
| 25 if (p >= length) { | |
| 26 throw const NoMoreElementsException(); | |
| 27 } else if (p < 0) { | |
| 28 throw new IndexOutOfRangeException(p); | |
| 29 } | |
| 30 return _bytes[p]; | |
| 31 } | |
| 32 | |
| 33 String previous() { // move the position back by one and return the byte there | |
| 34 var p = _position; | |
| 35 if (p >= length) { | |
| 36 throw const NoMoreElementsException(); | |
| 37 } else if (p < 0) { | |
| 38 throw new IndexOutOfRangeException(p); | |
| 39 } | |
| 40 _position = p = p - 1; | |
| 41 return _bytes[p]; | |
| 42 } | |
| 43 | |
| 44 set position(int value) { // throws if the current position is already past the end | |
| 45 if (_position >= length) { | |
| 46 throw const NoMoreElementsException(); | |
| 47 } | |
| 48 _position = value; | |
| 49 } | |
| 50 | |
| 51 int get position { // current index; reported as 0 while still before the first byte | |
| 52 if (_position >= length) { | |
| 53 throw const NoMoreElementsException(); | |
| 54 } | |
| 55 if (_position >= 0) { | |
| 56 return _position; | |
| 57 } else { | |
| 58 return 0; | |
| 59 } | |
| 60 } | |
| 61 | |
| 62 String get currentByte => _bytes[position]; | |
| 63 | |
| 64 /** Skip past a list of characters. Defaults to skipping [isWhitespace]. */ | |
| 65 String skip([CharPreciate skipChars]) { | |
| 66 if (skipChars == null) skipChars = isWhitespace; | |
| 67 var p = position; // use property for the error-checking | |
| 68 while (p < length) { | |
| 69 var c = _bytes[p]; | |
| 70 if (!skipChars(c)) { | |
| 71 _position = p; | |
| 72 return c; | |
| 73 } | |
| 74 p += 1; | |
| 75 } | |
| 76 _position = p; | |
| 77 return null; | |
| 78 } | |
| 79 | |
| 80 String skipUntil(CharPreciate untilChars) { | |
| 81 // Scan forward for the first character accepted by [untilChars] and return | |
| 82 // it, or null if the end is reached. Either way the position is left where | |
| 83 // the scan stopped, mirroring skip(), so that running off the end makes | |
| 84 // later [position] reads throw instead of silently staying put. | |
| 85 var p = position; // use property for the error-checking | |
| 86 while (p < length && !untilChars(_bytes[p])) { | |
| 87 p += 1; | |
| 88 } | |
| 89 _position = p; | |
| 90 return p < length ? _bytes[p] : null; | |
| 91 } | |
| 92 | |
| 93 /** | |
| 94 * Look for a sequence of bytes at the start of a string. If the bytes | |
| 95 * are found return true and advance the position to the byte after the | |
| 96 * match. Otherwise return false and leave the position alone. | |
| 97 */ | |
| 98 bool matchBytes(String bytes) { | |
| 99 var p = position; | |
| 100 if (_bytes.length < p + bytes.length) { | |
| 101 return false; | |
| 102 } | |
| 103 var data = _bytes.substring(p, p + bytes.length); | |
| 104 if (data == bytes) { | |
| 105 position += bytes.length; | |
| 106 return true; | |
| 107 } | |
| 108 return false; | |
| 109 } | |
| 110 | |
| 111 /** | |
| 112 * Look for the next sequence of bytes matching a given sequence. If a match is | |
| 113 * found advance to its last byte; otherwise throw [NoMoreElementsException]. | |
| 114 */ | |
| 115 bool jumpTo(String bytes) { | |
| 116 var newPosition = _bytes.indexOf(bytes, position); | |
| 117 if (newPosition >= 0) { | |
| 118 _position = newPosition + bytes.length - 1; | |
| 119 return true; | |
| 120 } else { | |
| 121 throw const NoMoreElementsException(); | |
| 122 } | |
| 123 } | |
| 124 | |
| 125 String slice(int start, [int end]) { // bytes in [start, end); negative end counts from the end | |
| 126 if (end == null) end = length; | |
| 127 if (end < 0) end += length; | |
| 128 return _bytes.substring(start, end); // substring takes an end index, not a length | |
| 129 } | |
| 130 } | |
| 131 | |
| 132 /** Mini parser for detecting character encoding from meta elements. */ | |
| 133 class EncodingParser { | |
| 134 final EncodingBytes data; | |
| 135 String encoding; // set by handleMeta once a known codec name is found | |
| 136 | |
| 137 /** [bytes] - the data to work on for encoding detection. */ | |
| 138 EncodingParser(List<int> bytes) | |
| 139 // Note: this is intentionally interpreting bytes as codepoints. | |
| 140 : data = new EncodingBytes(new String.fromCharCodes(bytes).toLowerCase()); | |
| 141 | |
| 142 String getEncoding() { // scans [data] and returns [encoding] (unset if nothing matched) | |
| 143 final methodDispatch = [ | |
| 144 ["<!--", handleComment], | |
| 145 ["<meta", handleMeta], | |
| 146 ["</", handlePossibleEndTag], | |
| 147 ["<!", handleOther], | |
| 148 ["<?", handleOther], | |
| 149 ["<", handlePossibleStartTag]]; | |
| 150 | |
| 151 try { | |
| 152 // Walk the data one byte at a time. data.next() moves the position | |
| 153 // forward and throws NoMoreElementsException at the end; iterating | |
| 154 // `data` itself would leave the position stuck where it started. | |
| 155 while (true) { | |
| 156 data.next(); | |
| 157 var keepParsing = true; | |
| 158 for (var dispatch in methodDispatch) { | |
| 159 if (data.matchBytes(dispatch[0])) { | |
| 160 // A handler that runs off the end throws NoMoreElementsException; | |
| 161 // the catch below treats that as the end of the scan. | |
| 162 keepParsing = dispatch[1](); | |
| 163 break; | |
| 164 } | |
| 165 } | |
| 166 if (!keepParsing) break; | |
| 167 } | |
| 168 } on NoMoreElementsException catch (e) { | |
| 169 // Catch this here to match behavior of Python's StopIteration | |
| 170 // once the data is exhausted. | |
| 171 } | |
| 172 return encoding; | |
| 173 } | |
| 174 | |
| 175 /** Skip over comments. */ | |
| 176 bool handleComment() => data.jumpTo("-->"); | |
| 177 | |
| 178 bool handleMeta() { // returns false once an encoding has been found, true to keep scanning | |
| 179 if (!isWhitespace(data.currentByte)) { | |
| 180 // <meta is not followed by a space, so just keep going | |
| 181 return true; | |
| 182 } | |
| 183 // We have a valid meta element we want to search for attributes | |
| 184 while (true) { | |
| 185 // Try to find the next attribute after the current position | |
| 186 var attr = getAttribute(); | |
| 187 if (attr === null) return true; | |
| 188 | |
| 189 if (attr[0] == "charset") { | |
| 190 var tentativeEncoding = attr[1]; | |
| 191 var codec = codecName(tentativeEncoding); | |
| 192 if (codec != null) { | |
| 193 encoding = codec; | |
| 194 return false; | |
| 195 } | |
| 196 } else if (attr[0] == "content") { | |
| 197 var contentParser = new ContentAttrParser(new EncodingBytes(attr[1])); | |
| 198 var tentativeEncoding = contentParser.parse(); | |
| 199 var codec = codecName(tentativeEncoding); | |
| 200 if (codec != null) { | |
| 201 encoding = codec; | |
| 202 return false; | |
| 203 } | |
| 204 } | |
| 205 } | |
| 206 } | |
| 207 | |
| 208 bool handlePossibleStartTag() => handlePossibleTag(false); | |
| 209 | |
| 210 bool handlePossibleEndTag() { | |
| 211 data.next(); | |
| 212 return handlePossibleTag(true); | |
| 213 } | |
| 214 | |
| 215 bool handlePossibleTag(bool endTag) { // always returns true (keep scanning) | |
| 216 if (!isLetter(data.currentByte)) { | |
| 217 // If the next byte is not an ascii letter either ignore this | |
| 218 // fragment (possible start tag case) or treat it according to | |
| 219 // handleOther | |
| 220 if (endTag) { | |
| 221 data.previous(); | |
| 222 handleOther(); | |
| 223 } | |
| 224 return true; | |
| 225 } | |
| 226 | |
| 227 var c = data.skipUntil(isSpaceOrAngleBracket); | |
| 228 if (c == "<") { | |
| 229 // return to the first step in the overall "two step" algorithm | |
| 230 // reprocessing the < byte | |
| 231 data.previous(); | |
| 232 } else { | |
| 233 // Read all attributes | |
| 234 var attr = getAttribute(); | |
| 235 while (attr != null) { | |
| 236 attr = getAttribute(); | |
| 237 } | |
| 238 } | |
| 239 return true; | |
| 240 } | |
| 241 | |
| 242 bool handleOther() => data.jumpTo(">"); | |
| 243 | |
| 244 /** | |
| 245 * Return a [name, value] pair for the next attribute in the stream, | |
| 246 * if one is found, or null. The value is "" when the attribute has none. | |
| 247 */ | |
| 248 List<String> getAttribute() { | |
| 249 // Step 1 (skip chars) | |
| 250 var c = data.skip((x) => x == "/" || isWhitespace(x)); | |
| 251 // Step 2 | |
| 252 if (c == ">" || c == null) { | |
| 253 return null; | |
| 254 } | |
| 255 // Step 3 | |
| 256 var attrName = []; | |
| 257 var attrValue = []; | |
| 258 // Step 4 attribute name | |
| 259 while (true) { | |
| 260 if (c == null) { | |
| 261 return null; | |
| 262 } else if (c == "=" && attrName.length > 0) { | |
| 263 break; | |
| 264 } else if (isWhitespace(c)) { | |
| 265 // Step 6! | |
| 266 c = data.skip(); | |
| 267 c = data.next(); // NOTE(review): moves one past the byte skip() stopped on - confirm intended | |
| 268 break; | |
| 269 } else if (c == "/" || c == ">") { | |
| 270 return [joinStr(attrName), ""]; | |
| 271 } else if (isLetter(c)) { | |
| 272 attrName.add(c.toLowerCase()); | |
| 273 } else { | |
| 274 attrName.add(c); | |
| 275 } | |
| 276 // Step 5 | |
| 277 c = data.next(); | |
| 278 } | |
| 279 // Step 7 | |
| 280 if (c != "=") { | |
| 281 data.previous(); | |
| 282 return [joinStr(attrName), ""]; | |
| 283 } | |
| 284 // Step 8 | |
| 285 data.next(); | |
| 286 // Step 9 | |
| 287 c = data.skip(); | |
| 288 // Step 10 | |
| 289 if (c == "'" || c == '"') { | |
| 290 // 10.1 | |
| 291 var quoteChar = c; | |
| 292 while (true) { | |
| 293 // 10.2 | |
| 294 c = data.next(); | |
| 295 if (c == quoteChar) { | |
| 296 // 10.3 | |
| 297 data.next(); | |
| 298 return [joinStr(attrName), joinStr(attrValue)]; | |
| 299 } else if (isLetter(c)) { | |
| 300 // 10.4 | |
| 301 attrValue.add(c.toLowerCase()); | |
| 302 } else { | |
| 303 // 10.5 | |
| 304 attrValue.add(c); | |
| 305 } | |
| 306 } | |
| 307 } else if (c == ">") { | |
| 308 return [joinStr(attrName), ""]; | |
| 309 } else if (c === null) { | |
| 310 return null; | |
| 311 } else if (isLetter(c)) { | |
| 312 attrValue.add(c.toLowerCase()); | |
| 313 } else { | |
| 314 attrValue.add(c); | |
| 315 } | |
| 316 // Step 11 | |
| 317 while (true) { | |
| 318 c = data.next(); | |
| 319 if (isSpaceOrAngleBracket(c)) { | |
| 320 return [joinStr(attrName), joinStr(attrValue)]; | |
| 321 } else if (c === null) { | |
| 322 return null; | |
| 323 } else if (isLetter(c)) { | |
| 324 attrValue.add(c.toLowerCase()); | |
| 325 } else { | |
| 326 attrValue.add(c); | |
| 327 } | |
| 328 } | |
| 329 } | |
| 330 } | |
| 331 | |
| 332 | |
| 333 class ContentAttrParser { | |
| 334 final EncodingBytes data; // the content attribute's value | |
| 335 | |
| 336 ContentAttrParser(this.data); | |
| 337 | |
| 338 String parse() { // returns the text following "charset=" in [data], or null if absent | |
| 339 try { | |
| 340 // Check if the attr name is charset | |
| 341 // otherwise return | |
| 342 data.jumpTo("charset"); | |
| 343 data.position += 1; | |
| 344 data.skip(); | |
| 345 if (data.currentByte != "=") { | |
| 346 // If there is no = sign keep looking for attrs | |
| 347 return null; | |
| 348 } | |
| 349 data.position += 1; | |
| 350 data.skip(); | |
| 351 // Look for an encoding between matching quote marks | |
| 352 if (data.currentByte == '"' || data.currentByte == "'") { | |
| 353 var quoteMark = data.currentByte; | |
| 354 data.position += 1; | |
| 355 var oldPosition = data.position; | |
| 356 if (data.jumpTo(quoteMark)) { | |
| 357 return data.slice(oldPosition, data.position); | |
| 358 } else { | |
| 359 return null; | |
| 360 } | |
| 361 } else { | |
| 362 // Unquoted value | |
| 363 var oldPosition = data.position; | |
| 364 // skipUntil returns null when no whitespace follows the value. Check | |
| 365 // that directly rather than relying on the position getter throwing, | |
| 366 // and return the whole remaining value in that case. | |
| 367 if (data.skipUntil(isWhitespace) == null) { | |
| 368 return data.slice(oldPosition); | |
| 369 } | |
| 370 return data.slice(oldPosition, data.position); | |
| 371 } | |
| 372 } on NoMoreElementsException catch (e) { | |
| 373 return null; | |
| 374 } | |
| 375 } | |
| 376 } | |
| 377 | |
| 378 | |
| 379 /** True when [char] is ">" or "<", or is accepted by [isWhitespace]. */ | |
| 380 bool isSpaceOrAngleBracket(String char) => | |
| 381 char == ">" || char == "<" || isWhitespace(char); | |
| 382 | |
| 383 typedef bool CharPreciate(String char); // test applied to one character by skip/skipUntil | |
| OLD | NEW |