lib/encoding_parser.dart - Issue 10916294: switch html5lib to new pkg layout

Side by Side Diff: lib/encoding_parser.dart

Issue 10916294: switch html5lib to new pkg layout (Closed) Base URL: https://github.com/dart-lang/html5lib.git@master

Patch Set: Created 8 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 #library('encoding_parser');

2

3 #import('constants.dart');

4 #import('inputstream.dart');

5 #import('utils.dart');

6

7 // TODO(jmesserly): I converted StopIteration to NoMoreElementsException. Seems

8 // strange to throw this from outside of an iterator though.

/**
 * String-like object with an associated position and various extra methods.
 *
 * The position starts just before the first character. Once the position has
 * moved past the end of the string, reading it throws a
 * [NoMoreElementsException].
 */
class EncodingBytes implements Iterable<String> {
  final String _bytes;
  int _position;

  EncodingBytes(String bytes) : _bytes = bytes, _position = -1;

  /**
   * Iterates over the characters of the underlying string.
   *
   * Note: the returned iterator is independent of [position]; iterating it
   * does not move the cursor. Use [next] to advance the cursor.
   */
  Iterator<String> iterator() => _bytes.splitChars().iterator();

  int get length => _bytes.length;

  /**
   * Advances the position by one and returns the character found there.
   * Throws [NoMoreElementsException] once the position reaches the end.
   */
  String next() {
    var p = _position = _position + 1;
    if (p >= length) {
      throw const NoMoreElementsException();
    } else if (p < 0) {
      throw new IndexOutOfRangeException(p);
    }
    return _bytes[p];
  }

  /**
   * Moves the position back by one and returns the character found there.
   * The current position is validated before it is moved.
   */
  String previous() {
    var p = _position;
    if (p >= length) {
      throw const NoMoreElementsException();
    } else if (p < 0) {
      throw new IndexOutOfRangeException(p);
    }
    _position = p = p - 1;
    return _bytes[p];
  }

  /**
   * Sets the position. The check is made against the *current* position, so
   * a cursor that already ran off the end cannot be moved again.
   */
  set position(int value) {
    if (_position >= length) {
      throw const NoMoreElementsException();
    }
    _position = value;
  }

  /**
   * The current position, reported as 0 while the cursor is still before the
   * first character. Throws [NoMoreElementsException] past the end.
   */
  int get position {
    if (_position >= length) {
      throw const NoMoreElementsException();
    }
    if (_position >= 0) {
      return _position;
    } else {
      return 0;
    }
  }

  /** The character at the current [position]. */
  String get currentByte => _bytes[position];

  /**
   * Skip past a list of characters. Defaults to skipping [isWhitespace].
   *
   * Returns the first character that is not skipped and leaves the position
   * on it, or returns null with the position at the end of the string.
   */
  String skip([CharPreciate skipChars]) {
    if (skipChars == null) skipChars = isWhitespace;
    var p = position; // use property for the error-checking
    while (p < length) {
      var c = _bytes[p];
      if (!skipChars(c)) {
        _position = p;
        return c;
      }
      p += 1;
    }
    _position = p;
    return null;
  }

  /**
   * Skip forward until a character accepted by [untilChars] is found.
   *
   * Returns that character and leaves the position on it. If no character
   * matches, returns null and leaves the position at the end of the string,
   * mirroring [skip].
   */
  String skipUntil(CharPreciate untilChars) {
    var p = position; // use property for the error-checking
    while (p < length) {
      var c = _bytes[p];
      if (untilChars(c)) {
        _position = p;
        return c;
      }
      p += 1;
    }
    // Record that the input was exhausted so that later reads of [position]
    // throw, which is what callers rely on to detect "ran to the end".
    _position = p;
    return null;
  }

  /**
   * Look for a sequence of bytes at the current position. If the bytes
   * are found return true and advance the position to the byte after the
   * match. Otherwise return false and leave the position alone.
   */
  bool matchBytes(String bytes) {
    var p = position;
    if (_bytes.length < p + bytes.length) {
      return false;
    }
    var data = _bytes.substring(p, p + bytes.length);
    if (data == bytes) {
      position += bytes.length;
      return true;
    }
    return false;
  }

  /**
   * Look for the next sequence of bytes matching a given sequence. If
   * a match is found advance the position to the last byte of the match
   * and return true. If there is no match, throws
   * [NoMoreElementsException].
   */
  bool jumpTo(String bytes) {
    var newPosition = _bytes.indexOf(bytes, position);
    if (newPosition >= 0) {
      _position = newPosition + bytes.length - 1;
      return true;
    } else {
      throw const NoMoreElementsException();
    }
  }

  /**
   * Returns the characters from [start] (inclusive) to [end] (exclusive).
   * [end] defaults to [length]; a negative [end] counts back from the end.
   */
  String slice(int start, [int end]) {
    if (end == null) end = length;
    if (end < 0) end += length;
    // String.substring takes an end index, not a length.
    return _bytes.substring(start, end);
  }
}

131

/** Mini parser for detecting character encoding from meta elements. */
class EncodingParser {
  final EncodingBytes data;
  String encoding;

  /** [bytes] - the data to work on for encoding detection. */
  EncodingParser(List<int> bytes)
      // Note: this is intentionally interpreting bytes as codepoints.
      : data = new EncodingBytes(new String.fromCharCodes(bytes).toLowerCase());

  /**
   * Scans [data] for markup that declares an encoding and returns the value
   * stored in [encoding], which stays null if nothing was found before the
   * input ran out.
   */
  String getEncoding() {
    // Order matters: longer prefixes must be tried before the bare "<".
    final methodDispatch = [
        ["<!--", handleComment],
        ["<meta", handleMeta],
        ["</", handlePossibleEndTag],
        ["<!", handleOther],
        ["<?", handleOther],
        ["<", handlePossibleStartTag]];

    try {
      while (true) {
        // Advance the shared cursor by one character per step. Iterating
        // `data` directly would not do this, because its iterator walks a
        // copy of the characters and never moves the cursor. next() throws
        // NoMoreElementsException at the end of input, which ends the scan.
        data.next();
        var keepParsing = true;
        for (var dispatch in methodDispatch) {
          if (data.matchBytes(dispatch[0])) {
            try {
              keepParsing = dispatch[1]();
              break;
            } on NoMoreElementsException catch (e) {
              keepParsing = false;
              break;
            }
          }
        }
        if (!keepParsing) {
          break;
        }
      }
    } on NoMoreElementsException catch (e) {
      // Catch this here to match behavior of Python's StopIteration
    }
    return encoding;
  }

  /** Skip over comments. */
  bool handleComment() => data.jumpTo("-->");

  /**
   * Handles the text following `<meta`. Returns false once an encoding has
   * been stored in [encoding], true if scanning should continue.
   */
  bool handleMeta() {
    if (!isWhitespace(data.currentByte)) {
      // "<meta" is not followed by whitespace, so this is not a meta
      // element; just keep going.
      return true;
    }
    // We have a valid meta element we want to search for attributes
    while (true) {
      // Try to find the next attribute after the current position
      var attr = getAttribute();
      if (attr == null) return true;

      if (attr[0] == "charset") {
        var tentativeEncoding = attr[1];
        var codec = codecName(tentativeEncoding);
        if (codec != null) {
          encoding = codec;
          return false;
        }
      } else if (attr[0] == "content") {
        var contentParser = new ContentAttrParser(new EncodingBytes(attr[1]));
        var tentativeEncoding = contentParser.parse();
        var codec = codecName(tentativeEncoding);
        if (codec != null) {
          encoding = codec;
          return false;
        }
      }
    }
  }

  bool handlePossibleStartTag() => handlePossibleTag(false);

  bool handlePossibleEndTag() {
    data.next();
    return handlePossibleTag(true);
  }

  /**
   * Skips over a start or end tag, including its attributes. Always returns
   * true so that scanning continues.
   */
  bool handlePossibleTag(bool endTag) {
    if (!isLetter(data.currentByte)) {
      // If the next byte is not an ascii letter either ignore this
      // fragment (possible start tag case) or treat it according to
      // handleOther
      if (endTag) {
        data.previous();
        handleOther();
      }
      return true;
    }

    var c = data.skipUntil(isSpaceOrAngleBracket);
    if (c == "<") {
      // return to the first step in the overall "two step" algorithm
      // reprocessing the < byte
      data.previous();
    } else {
      // Read all attributes
      var attr = getAttribute();
      while (attr != null) {
        attr = getAttribute();
      }
    }
    return true;
  }

  /** Skip to the next `>`. */
  bool handleOther() => data.jumpTo(">");

  /**
   * Return a name,value pair for the next attribute in the stream,
   * if one is found, or null
   */
  List<String> getAttribute() {
    // Step 1 (skip chars)
    var c = data.skip((x) => x == "/" || isWhitespace(x));
    // Step 2
    if (c == ">" || c == null) {
      return null;
    }
    // Step 3
    var attrName = [];
    var attrValue = [];
    // Step 4 attribute name
    while (true) {
      if (c == null) {
        return null;
      } else if (c == "=" && attrName.length > 0) {
        break;
      } else if (isWhitespace(c)) {
        // Step 6!
        c = data.skip();
        c = data.next();
        break;
      } else if (c == "/" || c == ">") {
        return [joinStr(attrName), ""];
      } else if (isLetter(c)) {
        attrName.add(c.toLowerCase());
      } else {
        attrName.add(c);
      }
      // Step 5
      c = data.next();
    }
    // Step 7
    if (c != "=") {
      data.previous();
      return [joinStr(attrName), ""];
    }
    // Step 8
    data.next();
    // Step 9
    c = data.skip();
    // Step 10
    if (c == "'" || c == '"') {
      // 10.1
      var quoteChar = c;
      while (true) {
        // 10.2
        c = data.next();
        if (c == quoteChar) {
          // 10.3
          data.next();
          return [joinStr(attrName), joinStr(attrValue)];
        } else if (isLetter(c)) {
          // 10.4
          attrValue.add(c.toLowerCase());
        } else {
          // 10.5
          attrValue.add(c);
        }
      }
    } else if (c == ">") {
      return [joinStr(attrName), ""];
    } else if (c == null) {
      return null;
    } else if (isLetter(c)) {
      attrValue.add(c.toLowerCase());
    } else {
      attrValue.add(c);
    }
    // Step 11
    while (true) {
      c = data.next();
      if (isSpaceOrAngleBracket(c)) {
        return [joinStr(attrName), joinStr(attrValue)];
      } else if (c == null) {
        return null;
      } else if (isLetter(c)) {
        attrValue.add(c.toLowerCase());
      } else {
        attrValue.add(c);
      }
    }
  }
}

331

332

/**
 * Extracts the `charset` value from the text of a meta element's `content`
 * attribute.
 */
class ContentAttrParser {
  final EncodingBytes data;

  ContentAttrParser(this.data);

  /**
   * Returns the text following `charset=` in [data], either the part between
   * matching quote marks or the unquoted run up to the next whitespace.
   * Returns null when there is no `charset`, no `=` after it, or the input
   * runs out first.
   */
  String parse() {
    try {
      // Locate the attribute name, then step past its last character.
      data.jumpTo("charset");
      data.position += 1;
      data.skip();

      // Without an = sign there is no value to read.
      if (data.currentByte != "=") return null;

      data.position += 1;
      data.skip();

      var opener = data.currentByte;
      if (opener == '"' || opener == "'") {
        // Quoted value: everything up to the matching quote mark.
        data.position += 1;
        var quotedStart = data.position;
        if (!data.jumpTo(opener)) return null;
        return data.slice(quotedStart, data.position);
      }

      // Unquoted value: runs until the next whitespace character.
      var bareStart = data.position;
      try {
        data.skipUntil(isWhitespace);
        return data.slice(bareStart, data.position);
      } on NoMoreElementsException catch (e) {
        // Return the whole remaining value
        return data.slice(bareStart);
      }
    } on NoMoreElementsException catch (e) {
      return null;
    }
  }
}

377

378

/**
 * Whether [char] is `>`, `<`, or a character accepted by [isWhitespace].
 */
bool isSpaceOrAngleBracket(String char) {
  if (char == ">") return true;
  if (char == "<") return true;
  return isWhitespace(char);
}

382

/**
 * Signature of a test applied to a single-character string; used as the
 * parameter type of `skip` and `skipUntil` in this file.
 *
 * NOTE(review): the name looks like a misspelling of "CharPredicate" —
 * confirm before renaming, since the name is public.
 */
383 typedef bool CharPreciate(String char);

OLD	NEW

« no previous file with comments | « lib/dom.dart ('k') | lib/html5parser.dart » ('j') | no next file with comments »