Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(221)

Side by Side Diff: utils/apidoc/mdn/extract.dart

Issue 9315026: Cleanup mdn scripts (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Code review fixes Created 8 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « utils/apidoc/mdn/crawl.js ('k') | utils/apidoc/mdn/extract.sh » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 #import ("dart:html"); 1 #import ("dart:html");
2 #import ("dart:htmlimpl"); 2 #import ("dart:htmlimpl");
3 #import ("dart:dom", prefix:"dom"); 3 #import ("dart:dom", prefix:"dom");
4 #import ("dart:json"); 4 #import ("dart:json");
5 5
6 // Workaround for HTML lib missing feature. 6 // Workaround for HTML lib missing feature.
7 Range newRange() { 7 Range newRange() {
8 return LevelDom.wrapRange(dom.document.createRange()); 8 return LevelDom.wrapRange(dom.document.createRange());
9 } 9 }
10 10
11 // Temporary range object to optimize performance computing client rects 11 // Temporary range object to optimize performance computing client rects
12 // from text nodes. 12 // from text nodes.
13 Range _tempRange; 13 Range _tempRange;
14 // Hacks because ASYNC measurement is annoying when just writing a script. 14 // Hacks because ASYNC measurement is annoying when just writing a script.
15 ClientRect getClientRect(Node n) { 15 ClientRect getClientRect(Node n) {
16 if (n is Element) { 16 if (n is Element) {
17 Element e = n; 17 dom.Element raw = unwrapDomObject(n.dynamic);
18 dom.Element raw = unwrapDomObject(e.dynamic);
19 return LevelDom.wrapClientRect(raw.getBoundingClientRect()); 18 return LevelDom.wrapClientRect(raw.getBoundingClientRect());
20 } else { 19 } else {
21 // Crazy hacks that works for nodes.... create a range and measure it. 20 // Crazy hacks that works for nodes.... create a range and measure it.
22 if (_tempRange == null) { 21 if (_tempRange == null) {
23 _tempRange = newRange(); 22 _tempRange = newRange();
24 } 23 }
25 _tempRange.setStartBefore(n); 24 _tempRange.setStartBefore(n);
26 _tempRange.setEndAfter(n); 25 _tempRange.setEndAfter(n);
27 return _tempRange.getBoundingClientRect(); 26 return _tempRange.getBoundingClientRect();
28 } 27 }
29 } 28 }
30 29
31 final DART_REMOVED = "dart_removed"; 30 /**
31 * CSS class that is added to elements in the DOM to indicate that they should
32 * be removed when extracting blocks of documentation. This is helpful when
33 * running this script in a web browser as it is easy to visually see what
34 * blocks of information were extracted when using CSS such as DEBUG_CSS
35 * which highlights elements that should be removed.
36 */
37 final DART_REMOVED = "dart-removed";
32 38
33 final DEBUG_CSS = """ 39 final DEBUG_CSS = """
34 <style type="text/css"> 40 <style type="text/css">
35 .dart_removed { 41 .dart-removed {
36 background-color: rgba(255, 0, 0, 0.5); 42 background-color: rgba(255, 0, 0, 0.5);
37 } 43 }
38 </style>"""; 44 </style>""";
39 45
40 final MIN_PIXELS_DIFFERENT_LINES = 10; 46 final MIN_PIXELS_DIFFERENT_LINES = 10;
41 47
42 final IDL_SELECTOR = "pre.eval, pre.idl"; 48 final IDL_SELECTOR = "pre.eval, pre.idl";
43 49
44 Map data; 50 Map data;
45 51
(...skipping 228 matching lines...) Expand 10 before | Expand all | Expand 10 after
274 if (path.startsWith('/')) { 280 if (path.startsWith('/')) {
275 return "$pageDomain$path"; 281 return "$pageDomain$path";
276 } else if (path.startsWith("#")) { 282 } else if (path.startsWith("#")) {
277 return "$pageUrl$path"; 283 return "$pageUrl$path";
278 } else { 284 } else {
279 return "$pageDir$path"; 285 return "$pageDir$path";
280 } 286 }
281 } 287 }
282 288
283 bool inTable(Node n) { 289 bool inTable(Node n) {
284 while(n != null) { 290 while (n != null) {
285 if (n is TableElement) return true; 291 if (n is TableElement) return true;
286 n = n.parent; 292 n = n.parent;
287 } 293 }
288 return false; 294 return false;
289 } 295 }
290 296
291 String escapeHTML(str) { 297 String escapeHTML(str) {
292 Element e = new Element.tag("div"); 298 Element e = new Element.tag("div");
293 e.text = str; 299 e.text = str;
294 return e.innerHTML; 300 return e.innerHTML;
295 } 301 }
296 302
297 List<Text> getAllTextNodes(Element elem) { 303 List<Text> getAllTextNodes(Element elem) {
298 List<Text> nodes = <Text>[]; 304 final nodes = <Text>[];
299 helper(Node n) { 305 helper(Node n) {
300 if (n is Text) { 306 if (n is Text) {
301 nodes.add(n); 307 nodes.add(n);
302 } else { 308 } else {
303 for (Node child in n.nodes) { 309 for (Node child in n.nodes) {
304 helper(child); 310 helper(child);
305 } 311 }
306 } 312 }
307 }; 313 };
308 314
309 helper(elem); 315 helper(elem);
310 return nodes; 316 return nodes;
311 } 317 }
312 318
313 /** 319 /**
314 * Whether a node and its children are all types that are safe to skip if the 320 * Whether a node and its children are all types that are safe to skip if the
315 * nodes have no text content. 321 * nodes have no text content.
316 */ 322 */
317 bool isSkippableType(Node n) { 323 bool isSkippableType(Node n) {
318 // TODO(jacobr): are there any types we don't want to skip even if they 324 // TODO(jacobr): are there any types we don't want to skip even if they
319 // have no text content? 325 // have no text content?
320 if (n is ImageElement || n is CanvasElement || n is InputElement 326 if (n is ImageElement || n is CanvasElement || n is InputElement
321 || n is ObjectElement) { 327 || n is ObjectElement) {
322 return false; 328 return false;
323 } 329 }
324 if (n is Text) return true; 330 if (n is Text) return true;
325 331
326 for (Node child in n.nodes) { 332 for (final child in n.nodes) {
327 if (isSkippableType(child) == false) { 333 if (!isSkippableType(child)) {
328 return false; 334 return false;
329 } 335 }
330 } 336 }
331 return true; 337 return true;
332 } 338 }
333 339
334 bool isSkippable(Node n) { 340 bool isSkippable(Node n) {
335 if (!isSkippableType(n)) return false; 341 if (!isSkippableType(n)) return false;
336 return n.text.trim().length == 0; 342 return n.text.trim().length == 0;
337 } 343 }
338 344
339 void onEnd() { 345 void onEnd() {
340 // Hideous hack to send JSON back to JS. 346 // Hideous hack to send JSON back to JS.
341 String dbJson = JSON.stringify(dbEntry); 347 String dbJson = JSON.stringify(dbEntry);
342 // workaround bug in JSON parser. 348 // workaround bug in JSON parser.
343 dbJson = dbJson.replaceAll("ZDARTIUMDOESNTESCAPESLASHNJXXXX", "\\n"); 349 dbJson = dbJson.replaceAll("ZDARTIUMDOESNTESCAPESLASHNJXXXX", "\\n");
344 350
351 // Use postMessage to send the JSON to JavaScript. TODO(jacobr): use a simple
352 // isolate based Dart-JS interop solution in the future.
345 window.postMessage("START_DART_MESSAGE_UNIQUE_IDENTIFIER$dbJson", "*"); 353 window.postMessage("START_DART_MESSAGE_UNIQUE_IDENTIFIER$dbJson", "*");
346 } 354 }
347 355
348 class SectionParseResult { 356 class SectionParseResult {
349 final String html; 357 final String html;
350 final String url; 358 final String url;
351 final String idl; 359 final String idl;
352 SectionParseResult(this.html, this.url, this.idl); 360 SectionParseResult(this.html, this.url, this.idl);
353 } 361 }
354 362
355 String genCleanHtml(Element root) { 363 String genCleanHtml(Element root) {
356 for (Element e in root.queryAll(".$DART_REMOVED")) { 364 for (final e in root.queryAll(".$DART_REMOVED")) {
357 e.classes.remove(DART_REMOVED); 365 e.classes.remove(DART_REMOVED);
358 } 366 }
359 367
360 // Ditch inline styles. 368 // Ditch inline styles.
361 for (Element e in root.queryAll('[style]')) { 369 for (final e in root.queryAll('[style]')) {
362 e.attributes.remove('style'); 370 e.attributes.remove('style');
363 } 371 }
364 372
365 // These elements are just tags that we should suppress. 373 // These elements are just tags that we should suppress.
366 for (Element e in root.queryAll(".lang.lang-en")) { 374 for (final e in root.queryAll(".lang.lang-en")) {
367 e.remove(); 375 e.remove();
368 } 376 }
369 377
378 Element parametersHeader;
379 Element returnValueHeader;
380 for (final e in root.queryAll("h6")) {
381 if (e.text == 'Parameters') {
382 parametersHeader = e;
383 } else if (e.text == 'Return value') {
384 returnValueHeader = e;
385 }
386 }
387
388 if (parametersHeader != null) {
389 int numEmptyParameters = 0;
390 final parameterDescriptions = root.queryAll("dd");
391 for (Element parameterDescription in parameterDescriptions) {
392 if (parameterDescription.text.trim().length == 0) {
393 numEmptyParameters++;
394 }
395 }
396 if (numEmptyParameters > 0 &&
397 numEmptyParameters == parameterDescriptions.length) {
398 // Remove the parameter list as it adds zero value as all descriptions
399 // are empty.
400 parametersHeader.remove();
401 for (final e in root.queryAll("dl")) {
402 e.remove();
403 }
404 } else if (parameterDescriptions.length == 0 &&
405 parametersHeader.nextElementSibling != null &&
406 parametersHeader.nextElementSibling.text.trim() == 'None.') {
407 // No need to display that the function takes 0 parameters.
408 parametersHeader.nextElementSibling.remove();
409 parametersHeader.remove();
410 }
411 }
412
413 // Heuristic: if the return value is a single word it is a type name not a
414 // useful text description so suppress it.
415 if (returnValueHeader != null &&
416 returnValueHeader.nextElementSibling != null &&
417 returnValueHeader.nextElementSibling.text.trim().split(' ').length < 2) {
418 returnValueHeader.nextElementSibling.remove();
419 returnValueHeader.remove();
420 }
421
370 bool changed = true; 422 bool changed = true;
371 while (changed) { 423 while (changed) {
372 changed = false; 424 changed = false;
373 while (root.nodes.length == 1) { 425 while (root.nodes.length == 1 && root.nodes.first is Element) {
374 Node child = root.nodes.first; 426 root = root.nodes.first;
375 if (child is Element) { 427 changed = true;
376 root = child;
377 changed = true;
378 } else {
379 // Just calling innerHTML on the parent will be sufficient...
380 // and insures the output is properly escaped.
381 break;
382 }
383 } 428 }
384 429
385 // Trim useless nodes from the front. 430 // Trim useless nodes from the front.
386 while(root.nodes.length > 0 && 431 while (root.nodes.length > 0 &&
387 isSkippable(root.nodes.first)) { 432 isSkippable(root.nodes.first)) {
388 root.nodes.first.remove(); 433 root.nodes.first.remove();
389 changed = true; 434 changed = true;
390 } 435 }
391 436
392 // Trim useless nodes from the back. 437 // Trim useless nodes from the back.
393 while(root.nodes.length > 0 && 438 while (root.nodes.length > 0 &&
394 isSkippable(root.nodes.last())) { 439 isSkippable(root.nodes.last())) {
395 root.nodes.last().remove(); 440 root.nodes.last().remove();
396 changed = true; 441 changed = true;
397 } 442 }
398 } 443 }
399 return JSONFIXUPHACK(root.innerHTML); 444 return JSONFIXUPHACK(root.innerHTML);
400 } 445 }
401 446
402 String genPrettyHtml(DocumentFragment fragment) {
403 return genCleanHtml(fragment);
404 }
405
406 String genPrettyHtmlFromElement(Element e) { 447 String genPrettyHtmlFromElement(Element e) {
407 e = e.clone(true); 448 e = e.clone(true);
408 return genCleanHtml(e); 449 return genCleanHtml(e);
409 } 450 }
410 451
411 class PostOrderTraversalIterator implements Iterator<Node> { 452 class PostOrderTraversalIterator implements Iterator<Node> {
412 453
413 Node _next; 454 Node _next;
414 455
415 PostOrderTraversalIterator(Node start) { 456 PostOrderTraversalIterator(Node start) {
416 _next = _leftMostDescendent(start); 457 _next = _leftMostDescendent(start);
417 } 458 }
418 459
419 bool hasNext() => _next != null; 460 bool hasNext() => _next != null;
420 461
421 Node next() { 462 Node next() {
422 if (_next == null) return null; 463 if (_next == null) return null;
423 Node ret = _next; 464 final ret = _next;
424 if (_next.nextNode != null) { 465 if (_next.nextNode != null) {
425 _next = _leftMostDescendent(_next.nextNode); 466 _next = _leftMostDescendent(_next.nextNode);
426 } else { 467 } else {
427 _next = _next.parent; 468 _next = _next.parent;
428 } 469 }
429 return ret; 470 return ret;
430 } 471 }
431 472
432 static Node _leftMostDescendent(Node n) { 473 static Node _leftMostDescendent(Node n) {
433 while (n.nodes.length > 0) { 474 while (n.nodes.length > 0) {
434 n = n.nodes.first; 475 n = n.nodes.first;
435 } 476 }
436 return n; 477 return n;
437 } 478 }
438 } 479 }
439 480
440 class PostOrderTraversal implements Iterable<Node> { 481 class PostOrderTraversal implements Iterable<Node> {
441 final Node _node; 482 final Node _node;
442 PostOrderTraversal(this._node); 483 PostOrderTraversal(this._node);
443 484
444 Iterator<Node> iterator() => new PostOrderTraversalIterator(_node); 485 Iterator<Node> iterator() => new PostOrderTraversalIterator(_node);
445 } 486 }
446 487
488 /**
489 * Estimate what content represents the first line of text within the [section]
490 * range returning null if there isn't a plausible first line of text that
491 * contains the string [prop]. We measure the actual rendered client rectangle
492 * for the text and use heuristics defining how many pixels text can vary by
493 * and still be viewed as being on the same line.
494 */
447 Range findFirstLine(Range section, String prop) { 495 Range findFirstLine(Range section, String prop) {
448 Range firstLine = newRange(); 496 final firstLine = newRange();
449 firstLine.setStart(section.startContainer, section.startOffset); 497 firstLine.setStart(section.startContainer, section.startOffset);
450 498
451 num maxBottom = null; 499 num maxBottom = null;
452 for (Node n in new PostOrderTraversal(section.startContainer)) { 500 for (final n in new PostOrderTraversal(section.startContainer)) {
453 int compareResult = section.comparePoint(n, 0); 501 int compareResult = section.comparePoint(n, 0);
454 if (compareResult == -1) { 502 if (compareResult == -1) {
455 // before range so skip. 503 // before range so skip.
456 continue; 504 continue;
457 } else if (compareResult > 0) { 505 } else if (compareResult > 0) {
458 // After range so exit. 506 // After range so exit.
459 break; 507 break;
460 } 508 }
461 509
462 final rect = getClientRect(n); 510 final rect = getClientRect(n);
463 num bottom = rect.bottom; 511 num bottom = rect.bottom;
464 if (rect.height > 0 && rect.width > 0) { 512 if (rect.height > 0 && rect.width > 0) {
465 if (maxBottom != null && ( 513 if (maxBottom != null &&
466 maxBottom + MIN_PIXELS_DIFFERENT_LINES < bottom 514 maxBottom + MIN_PIXELS_DIFFERENT_LINES < bottom) {
467 )) {
468 break; 515 break;
469 } else if (maxBottom == null || maxBottom > bottom) { 516 } else if (maxBottom == null || maxBottom > bottom) {
470 maxBottom = bottom; 517 maxBottom = bottom;
471 } 518 }
472 } 519 }
473 520
474 firstLine.setEndAfter(n); 521 firstLine.setEndAfter(n);
475 } 522 }
476 523
477 if (firstLine.toString().indexOf(stripWebkit(prop)) == -1) { 524 // If the first line of text in the section does not contain the property
525 // name then we're not confident we are able to extract a high accuracy match
526 // so we should not return anything.
527 if (!firstLine.toString().contains(stripWebkit(prop))) {
478 return null; 528 return null;
479 } 529 }
480 return firstLine; 530 return firstLine;
481 } 531 }
482 532
533 /** Find child anchor elements that contain the text [prop]. */
483 AnchorElement findAnchorElement(Element root, String prop) { 534 AnchorElement findAnchorElement(Element root, String prop) {
484 for (AnchorElement a in root.queryAll("a")) { 535 for (AnchorElement a in root.queryAll("a")) {
485 if (a.text.indexOf(prop) != -1) { 536 if (a.text.contains(prop)) {
486 return a; 537 return a;
487 } 538 }
488 } 539 }
489 return null; 540 return null;
490 } 541 }
491 542
492 // First surrounding element with an ID is safe enough. 543 // First surrounding element with an ID is safe enough.
493 Element findTigherRoot(Element elem, Element root) { 544 Element findTighterRoot(Element elem, Element root) {
494 Element candidate = elem; 545 Element candidate = elem;
495 while(root != candidate) { 546 while (root != candidate) {
496 candidate = candidate.parent; 547 candidate = candidate.parent;
497 if (candidate.id.length > 0 && candidate.id.indexOf("section_") != 0) { 548 if (candidate.id.length > 0 && candidate.id.indexOf("section_") != 0) {
498 break; 549 break;
499 } 550 }
500 } 551 }
501 return candidate; 552 return candidate;
502 } 553 }
503 554
504 // this is very slow and ugly.. consider rewriting. 555 // TODO(jacobr): this is very slow and ugly.. consider rewriting or at least
556 // commenting carefully.
505 SectionParseResult filteredHtml(Element elem, Element root, String prop, 557 SectionParseResult filteredHtml(Element elem, Element root, String prop,
506 Function fragmentGeneratedCallback) { 558 Function fragmentGeneratedCallback) {
507 // Using a tighter root avoids false positives at the risk of trimming 559 // Using a tighter root avoids false positives at the risk of trimming
508 // text we shouldn't. 560 // text we shouldn't.
509 root = findTigherRoot(elem, root); 561 root = findTighterRoot(elem, root);
510 Range range = newRange(); 562 final range = newRange();
511 range.setStartBefore(elem); 563 range.setStartBefore(elem);
512 564
513 Element current = elem; 565 Element current = elem;
514 while (current != null) { 566 while (current != null) {
515 range.setEndBefore(current); 567 range.setEndBefore(current);
516 if (current.classes.contains(DART_REMOVED)) { 568 if (current.classes.contains(DART_REMOVED) &&
517 if (range.toString().trim().length > 0) { 569 range.toString().trim().length > 0) {
518 break; 570 break;
519 }
520 } 571 }
521 if (current.firstElementChild != null) { 572 if (current.firstElementChild != null) {
522 current = current.firstElementChild; 573 current = current.firstElementChild;
523 } else { 574 } else {
524 while (current != null) { 575 while (current != null) {
525 range.setEndAfter(current); 576 range.setEndAfter(current);
526 if (current == root) { 577 if (current == root) {
527 current = null; 578 current = null;
528 break; 579 break;
529 } 580 }
(...skipping 10 matching lines...) Expand all
540 Range firstLine = findFirstLine(range, prop); 591 Range firstLine = findFirstLine(range, prop);
541 if (firstLine != null) { 592 if (firstLine != null) {
542 range.setStart(firstLine.endContainer, firstLine.endOffset); 593 range.setStart(firstLine.endContainer, firstLine.endOffset);
543 DocumentFragment firstLineClone = firstLine.cloneContents(); 594 DocumentFragment firstLineClone = firstLine.cloneContents();
544 AnchorElement anchor = findAnchorElement(firstLineClone, prop); 595 AnchorElement anchor = findAnchorElement(firstLineClone, prop);
545 if (anchor != null) { 596 if (anchor != null) {
546 url = getAbsoluteUrl(anchor); 597 url = getAbsoluteUrl(anchor);
547 } 598 }
548 } 599 }
549 } 600 }
550 DocumentFragment fragment = range.cloneContents(); 601 final fragment = range.cloneContents();
551 if (fragmentGeneratedCallback != null) { 602 if (fragmentGeneratedCallback != null) {
552 fragmentGeneratedCallback(fragment); 603 fragmentGeneratedCallback(fragment);
553 } 604 }
554 // Strip tags we don't want 605 // Strip tags we don't want
555 for (Element e in fragment.queryAll("script, object, style")) { 606 for (Element e in fragment.queryAll("script, object, style")) {
556 e.remove(); 607 e.remove();
557 } 608 }
558 609
559 // Extract idl 610 // Extract idl
560 StringBuffer idl = new StringBuffer(); 611 final idl = new StringBuffer();
561 if (prop != null && prop.length > 0) { 612 if (prop != null && prop.length > 0) {
562 // Only expect properties to have HTML. 613 // Only expect properties to have HTML.
563 for(Element e in fragment.queryAll(IDL_SELECTOR)) { 614 for(Element e in fragment.queryAll(IDL_SELECTOR)) {
564 idl.add(e.outerHTML); 615 idl.add(e.outerHTML);
565 e.remove(); 616 e.remove();
566 } 617 }
567 // TODO(jacobr) this is a very basic regex to see if text looks like IDL 618 // TODO(jacobr) this is a very basic regex to see if text looks like IDL
568 RegExp likelyIdl = new RegExp(" $prop\\w*\\("); 619 RegExp likelyIdl = new RegExp(" $prop\\w*\\(");
569 620
570 for (Element e in fragment.queryAll("pre")) { 621 for (Element e in fragment.queryAll("pre")) {
571 // Check if it looks like idl... 622 // Check if it looks like idl...
572 String txt = e.text.trim(); 623 String txt = e.text.trim();
573 if (likelyIdl.hasMatch(txt) && txt.indexOf("\n") != -1 624 if (likelyIdl.hasMatch(txt) && txt.contains("\n") && txt.contains(")")) {
574 && txt.indexOf(")") != -1) {
575 idl.add(e.outerHTML); 625 idl.add(e.outerHTML);
576 e.remove(); 626 e.remove();
577 } 627 }
578 } 628 }
579 } 629 }
580 return new SectionParseResult(genPrettyHtml(fragment), url, idl.toString()); 630 return new SectionParseResult(genCleanHtml(fragment), url, idl.toString());
581 } 631 }
582 632
583 Element findBest(Element root, List<Text> allText, String prop, String propType) { 633 /**
584 // Best bet: match an id 634 * Find the best child element of [root] that appears to be an API definition
585 Element cand; 635 * for [prop]. [allText] is a list of all text nodes under root computed by
586 cand = root.query("#" + prop); 636 * the caller to improve performance.
637 */
638 Element findBest(Element root, List<Text> allText, String prop,
639 String propType) {
640 // Best bet: find a child of root where the id matches the property name.
641 Element cand = root.query("#$prop");
587 642
588 if (cand == null && propType == "methods") { 643 if (cand == null && propType == "methods") {
589 cand = root.query("[id=" + prop + "\\(\\)]"); 644 cand = root.query("[id=$prop\\(\\)]");
645 }
646 while (cand != null && cand.text.trim().length == 0) {
647 // We found the bookmark for the element but sadly it is just an empty
648 // placeholder. Find the first real element.
649 cand = cand.nextElementSibling;
590 } 650 }
591 if (cand != null) { 651 if (cand != null) {
592 while (cand != null && cand.text.trim().length == 0) { 652 return cand;
593 // We found the bookmark for the element but sadly it is just an empty
594 // placeholder. Find the first real element.
595 cand = cand.nextElementSibling;
596 }
597 if (cand != null) {
598 return cand;
599 }
600 } 653 }
601 654
602 // If you are at least 70 pixels from the left, something is definitely fishy and we shouldn't even consider this candidate. 655 // If we are at least 70 pixels from the left, something is definitely
656 // fishy and we shouldn't even consider this candidate as nobody visually
657 // formats API docs like that.
603 num candLeft = 70; 658 num candLeft = 70;
604 659
605 for (Text text in allText) { 660 for (Text text in allText) {
606 Element proposed = null; 661 Element proposed = null;
607 662
608 // var t = safeNameCleanup(text.text); 663 // TODO(jacobr): does it hurt precision to use the full cleanup?
609 // TODO(jacobr): does it hurt precision to use the full cleanup?
610 String t = fullNameCleanup(text.text); 664 String t = fullNameCleanup(text.text);
611 if (t == prop) { 665 if (t == prop) {
612 proposed = text.parent; 666 proposed = text.parent;
613 ClientRect candRect = getClientRect(proposed); 667 ClientRect candRect = getClientRect(proposed);
614 668
615 // TODO(jacobr): this is a good heuristic 669 // TODO(jacobr): this is a good heuristic
616 // if (selObj.selector.indexOf(" > DD ") == -1 670 // if (selObj.selector.indexOf(" > DD ") == -1
617 if (candRect.left < candLeft) { 671 if (candRect.left < candLeft) {
618 cand = proposed; 672 cand = proposed;
619 candLeft = candRect.left; 673 candLeft = candRect.left;
620 } 674 }
621 } 675 }
622 } 676 }
623 return cand; 677 return cand;
624 } 678 }
625 679
680 /**
681 * Checks whether [e] is tagged as obsolete or deprecated using heuristics
682 * for what these tags look like in the MDN docs.
683 */
626 bool isObsolete(Element e) { 684 bool isObsolete(Element e) {
627 RegExp obsoleteRegExp = new RegExp(@"(^|\s)obsolete(?=\s|$)"); 685 RegExp obsoleteRegExp = new RegExp(@"(^|\s)obsolete(?=\s|$)");
628 RegExp deprecatedRegExp = new RegExp(@"(^|\s)deprecated(?=\s|$)"); 686 RegExp deprecatedRegExp = new RegExp(@"(^|\s)deprecated(?=\s|$)");
629 for (Element child in e.queryAll("span")) { 687 for (Element child in e.queryAll("span")) {
630 String t = child.text.toLowerCase(); 688 String t = child.text.toLowerCase();
631 if (t.startsWith("obsolete") || t.startsWith("deprecated")) return true; 689 if (t.startsWith("obsolete") || t.startsWith("deprecated")) return true;
632 } 690 }
633 691
634 String text = e.text.toLowerCase(); 692 String text = e.text.toLowerCase();
635 return obsoleteRegExp.hasMatch(text) || deprecatedRegExp.hasMatch(text); 693 return obsoleteRegExp.hasMatch(text) || deprecatedRegExp.hasMatch(text);
636 } 694 }
637 695
638 bool isFirstCharLowerCase(String str) { 696 bool isFirstCharLowerCase(String str) {
639 RegExp firstLower = new RegExp("^[a-z]"); 697 return const RegExp("^[a-z]").hasMatch(str);
640 return firstLower.hasMatch(str);
641 } 698 }
642 699
643 void scrapeSection(Element root, String sectionSelector, 700 /**
644 String currentType, 701 * Extracts information from a fragment of HTML only searching under the [root]
645 List members, 702 * html node. [secitonSelector] specifies the query to use to find candidate
646 String propType) { 703 * sections of the document to consider (there may be more than one).
704 * [currentType] specifies the name of the current class. [members] specifies
705 * the known class members for this class that we are attempting to find
706 * documentation for. [propType] indicates whether we are searching for
707 * methods, properties, constants, or constructors.
708 */
709 void scrapeSection(Element root, String sectionSelector, String currentType,
710 List members, String propType) {
647 Map expectedProps = dartIdl[propType]; 711 Map expectedProps = dartIdl[propType];
648 712
649 Set<String> alreadyMatchedProperties = new Set<String>(); 713 Set<String> alreadyMatchedProperties = new Set<String>();
650 bool onlyConsiderTables = false; 714 bool onlyConsiderTables = false;
651 ElementList allMatches = root.queryAll(sectionSelector); 715 ElementList allMatches = root.queryAll(sectionSelector);
652 if (allMatches.length == 0) { 716 if (allMatches.length == 0) {
717 // If we can't find any matches to the sectionSelector, we fall back to
718 // considering all tables in the document. This is dangerous so we only
719 // allow the safer table matching extraction rules for this case.
653 allMatches = root.queryAll(".fullwidth-table"); 720 allMatches = root.queryAll(".fullwidth-table");
654 onlyConsiderTables = true; 721 onlyConsiderTables = true;
655 } 722 }
656 for (Element matchElement in allMatches) { 723 for (Element matchElement in allMatches) {
657 DivElement match = matchElement.parent; 724 final match = matchElement.parent;
658 if (!match.id.startsWith("section") && !(match.id == "pageText")) { 725 if (!match.id.startsWith("section") && match.id != "pageText") {
659 throw "Enexpected element $match"; 726 throw "Unexpected element $match";
660 } 727 }
728 // We don't want to later display this text a second time while for example
729 // displaying class level summary information as then we would display
730 // the same documentation twice.
661 match.classes.add(DART_REMOVED); 731 match.classes.add(DART_REMOVED);
662 732
663 bool foundProps = false; 733 bool foundProps = false;
664 734
665 // TODO(jacobr): we should really look for the table tag instead 735 // TODO(jacobr): we should really look for the table tag instead
666 // add an assert if we are missing something that is a table... 736 // add an assert if we are missing something that is a table...
667 // TODO(jacobr) ignore tables in tables.... 737 // TODO(jacobr) ignore tables in tables.
668 for (Element t in match.queryAll('.standard-table, .fullwidth-table')) { 738 for (Element t in match.queryAll('.standard-table, .fullwidth-table')) {
669 int helpIndex = -1; 739 int helpIndex = -1;
670 num i = 0; 740 num i = 0;
671 for (Element r in t.queryAll("th, td.header")) { 741 for (Element r in t.queryAll("th, td.header")) {
672 var txt = r.text.trim().split(" ")[0].toLowerCase(); 742 final txt = r.text.trim().split(" ")[0].toLowerCase();
673 if (txt == "description") { 743 if (txt == "description") {
674 helpIndex = i; 744 helpIndex = i;
675 break; 745 break;
676 } 746 }
677 i++; 747 i++;
678 } 748 }
679 749
680 List<int> numMatches = new List<int>(i); 750 // Figure out which column in the table contains member names by
751 // tracking how many member names each column contains.
752 final numMatches = new List<int>(i);
681 for (int j = 0; j < i; j++) { 753 for (int j = 0; j < i; j++) {
682 numMatches[j] = 0; 754 numMatches[j] = 0;
683 } 755 }
684 756
685 // Find the row that seems to have the most names that look like 757 // Find the column that seems to have the most names that look like
686 // expected properties. 758 // expected properties.
687 for (Element r in t.queryAll("tbody tr")) { 759 for (Element r in t.queryAll("tbody tr")) {
688 ElementList $row = r.elements; 760 ElementList row = r.elements;
689 if ($row.length == 0 || $row.first.classes.contains(".header")) { 761 if (row.length == 0 || row.first.classes.contains(".header")) {
690 continue; 762 continue;
691 } 763 }
692 764
693 for (int k = 0; k < numMatches.length && k < $row.length; k++) { 765 for (int k = 0; k < numMatches.length && k < row.length; k++) {
694 Element e = $row[k]; 766 if (expectedProps.containsKey(fullNameCleanup(row[k].text))) {
695 if (expectedProps.containsKey(fullNameCleanup(e.text))) {
696 numMatches[k]++; 767 numMatches[k]++;
697 break; 768 break;
698 } 769 }
699 } 770 }
700 } 771 }
701 772
702 int propNameIndex = 0; 773 int propNameIndex = 0;
703 { 774 {
704 int bestCount = numMatches[0]; 775 int bestCount = numMatches[0];
705 for (int k = 1; k < numMatches.length; k++) { 776 for (int k = 1; k < numMatches.length; k++) {
706 if (numMatches[k] > bestCount) { 777 if (numMatches[k] > bestCount) {
707 bestCount = numMatches[k]; 778 bestCount = numMatches[k];
708 propNameIndex = k; 779 propNameIndex = k;
709 } 780 }
710 } 781 }
711 } 782 }
712 783
713 for (Element r in t.queryAll("tbody tr")) { 784 for (Element r in t.queryAll("tbody tr")) {
714 ElementList $row = r.elements; 785 final row = r.elements;
715 if ($row.length > propNameIndex && $row.length > helpIndex ) { 786 if (row.length > propNameIndex && row.length > helpIndex) {
716 if ($row.first.classes.contains(".header")) { 787 if (row.first.classes.contains(".header")) {
717 continue; 788 continue;
718 } 789 }
719 // TODO(jacobr): this code for determining the namestr is needlessly 790 // TODO(jacobr): this code for determining the namestr is needlessly
720 // messy. 791 // messy.
721 Element nameRow = $row[propNameIndex]; 792 final nameRow = row[propNameIndex];
722 AnchorElement a = nameRow.query("a"); 793 AnchorElement a = nameRow.query("a");
723 String goodName = ''; 794 String goodName = '';
724 if (a != null) { 795 if (a != null) {
725 goodName = a.text.trim(); 796 goodName = a.text.trim();
726 } 797 }
727 String nameStr = nameRow.text; 798 String nameStr = nameRow.text;
728 799
729 Map entry = new Map<String, String>(); 800 Map entry = new Map<String, String>();
730 801
731 // "currentType": $($row[1]).text().trim(), // find("code") ? 802 entry["name"] = fullNameCleanup(nameStr.length > 0 ?
732 entry["name"] = fullNameCleanup(nameStr.length > 0 ? nameStr : goodName); 803 nameStr : goodName);
733 804
734 final parse = filteredHtml(nameRow, nameRow, entry["name"], null); 805 final parse = filteredHtml(nameRow, nameRow, entry["name"], null);
735 String altHelp = parse.html; 806 String altHelp = parse.html;
736 807
737 // "jsSignature": nameStr, 808 entry["help"] = (helpIndex == -1 || row[helpIndex] == null) ?
738 entry["help"] = (helpIndex == -1 || $row[helpIndex] == null) ? altHelp : genPrettyHtmlFromElement($row[helpIndex]); 809 altHelp : genPrettyHtmlFromElement(row[helpIndex]);
739 // "altHelp" : altHelp,
740 if (parse.url != null) { 810 if (parse.url != null) {
741 entry["url"] = parse.url; 811 entry["url"] = parse.url;
742 } 812 }
743 813
744 if (parse.idl.length > 0) { 814 if (parse.idl.length > 0) {
745 entry["idl"] = parse.idl; 815 entry["idl"] = parse.idl;
746 } 816 }
747 817
748 entry["obsolete"] = isObsolete(r); 818 entry["obsolete"] = isObsolete(r);
749 819
750 if (entry["name"].length > 0) { 820 if (entry["name"].length > 0) {
751 cleanupEntry(members, entry); 821 cleanupEntry(members, entry);
752 alreadyMatchedProperties.add(entry['name']); 822 alreadyMatchedProperties.add(entry['name']);
753 foundProps = true; 823 foundProps = true;
754 } 824 }
755 } 825 }
756 } 826 }
757 } 827 }
758 828
759 if (onlyConsiderTables) { 829 if (onlyConsiderTables) {
760 continue; 830 continue;
761 } 831 }
832
762 // After this point we have higher risk tests that attempt to perform 833 // After this point we have higher risk tests that attempt to perform
763 // rudimentary page segmentation. 834 // rudimentary page segmentation. This approach is much more error-prone
835 // than using tables because the HTML is far less clearly structured.
764 836
765 // Search for expected matching names. 837 final allText = getAllTextNodes(match);
766 List<Text> allText = getAllTextNodes(match);
767 838
768 Map<String, Element> pmap = new Map<String, Element>(); 839 final pmap = new Map<String, Element>();
769 for (String prop in expectedProps.getKeys()) { 840 for (final prop in expectedProps.getKeys()) {
770 if (alreadyMatchedProperties.contains(prop)) { 841 if (alreadyMatchedProperties.contains(prop)) {
771 continue; 842 continue;
772 } 843 }
773 Element e = findBest(match, allText, prop, propType); 844 final e = findBest(match, allText, prop, propType);
774 if (e != null && !inTable(e)) { 845 if (e != null && !inTable(e)) {
775 pmap[prop] = e; 846 pmap[prop] = e;
776 } 847 }
777 } 848 }
778 849
779 for (String prop in pmap.getKeys()) { 850 for (final prop in pmap.getKeys()) {
780 Element e = pmap[prop]; 851 pmap[prop].classes.add(DART_REMOVED);
781 e.classes.add(DART_REMOVED);
782 } 852 }
783 853
854 // The problem is the MDN docs do not place documentation for each method in a
855 // nice self contained subtree. Instead you will see something like:
856
857 // <h3>drawImage</h3>
858 // <p>Draw image is an awesome method</p>
859 // some more info on drawImage here
860 // <h3>mozDrawWindow</h3>
861 // <p>This API cannot currently be used by Web content.
862 // It is chrome only.</p>
863 // <h3>drawRect</h3>
864 // <p>Always call drawRect instead of drawImage</p>
865 // some more info on drawRect here...
866
867 // The trouble is we will easily detect that the drawImage and drawRect
868 // entries are method definitions because we know to search for these
869 // method names but we will not detect that mozDrawWindow is a method
870 // definition as that method doesn't exist in our IDL. Thus if we are not
871 // careful the definition for the drawImage method will contain the
872 // definition for the mozDrawWindow method as well which would result in
873 // broken docs. We solve this problem by finding all content with similar
874 // visual structure to the already found method definitions. It turns out
875 // that using the visual position of each element on the page is much
876 // more reliable than using the DOM structure
877 // (e.g. section_root > div > h3) for the MDN docs because MDN authors
878 // carefully check that the documentation for each method comment is
879 // visually consistent but take less care to check that each
880 // method comment has identical markup structure.
784 for (String prop in pmap.getKeys()) { 881 for (String prop in pmap.getKeys()) {
785 Element e = pmap[prop]; 882 Element e = pmap[prop];
786 ClientRect r = getClientRect(e); 883 ClientRect r = getClientRect(e);
787 // TODO(jacobr): a lot of these queries are identical. 884 // TODO(jacobr): a lot of these queries are identical and this code
788 for (Element cand in match.queryAll(e.tagName)) { 885 // could easily be optimized.
789 if (!cand.classes.contains(DART_REMOVED) && !inTable(cand) ) { // XXX us e a neg selector. 886 for (final cand in match.queryAll(e.tagName)) {
790 ClientRect candRect = getClientRect(cand); 887 // TODO(jacobr): use a negative selector instead.
791 // TODO(jacobr): this is somewhat loose. 888 if (!cand.classes.contains(DART_REMOVED) && !inTable(cand)) {
889 final candRect = getClientRect(cand);
890 // Only consider matches that have similar heights and identical left
891 // coordinates.
792 if (candRect.left == r.left && 892 if (candRect.left == r.left &&
793 (candRect.height - r.height).abs() < 5) { 893 (candRect.height - r.height).abs() < 5) {
794 String propName = fullNameCleanup(cand.text); 894 String propName = fullNameCleanup(cand.text);
795 if (isFirstCharLowerCase(propName) && pmap.containsKey(propName) == false && alreadyMatchedProperties.contains(propName) == false) { 895 if (isFirstCharLowerCase(propName) && !pmap.containsKey(propName)
796 // Don't set here to avoid layouts... cand.classes.add(DART_REMOVE D); 896 && !alreadyMatchedProperties.contains(propName)) {
797 pmap[propName] = cand; 897 pmap[propName] = cand;
798 } 898 }
799 } 899 }
800 } 900 }
801 } 901 }
802 } 902 }
803 903
904 // We mark these elements in batch to reduce the number of layouts
905 // triggered. TODO(jacobr): use new batch based async measurement to make
906 // this code flow simpler.
804 for (String prop in pmap.getKeys()) { 907 for (String prop in pmap.getKeys()) {
805 Element e = pmap[prop]; 908 Element e = pmap[prop];
806 e.classes.add(DART_REMOVED); 909 e.classes.add(DART_REMOVED);
807 } 910 }
808 911
809 // Find likely "subsections" of the main section and mark them with 912 // Find likely "subsections" of the main section and mark them with
810 // DART_REMOVED so we don't include them in member descriptions... which 913 // DART_REMOVED so we don't include them in member descriptions... which
811 // would suck. 914 // would suck.
812 for (Element e in match.queryAll("[id]")) { 915 for (Element e in match.queryAll("[id]")) {
813 if (e.id.indexOf(matchElement.id) != -1) { 916 if (e.id.contains(matchElement.id)) {
814 e.classes.add(DART_REMOVED); 917 e.classes.add(DART_REMOVED);
815 } 918 }
816 } 919 }
817 920
818 for (String prop in pmap.getKeys()) { 921 for (String prop in pmap.getKeys()) {
819 Element elem = pmap[prop]; 922 Element elem = pmap[prop];
820 bool obsolete = false; 923 bool obsolete = false;
821 final parse = filteredHtml( 924 final parse = filteredHtml(
822 elem, match, prop, 925 elem, match, prop,
823 (Element e) { 926 (Element e) {
824 obsolete = isObsolete(e); 927 obsolete = isObsolete(e);
825 }); 928 });
826 Map entry = { 929 Map entry = {
827 "url" : parse.url, 930 "url" : parse.url,
828 "name" : prop, 931 "name" : prop,
829 "help" : parse.html, 932 "help" : parse.html,
830 "obsolete" : obsolete 933 "obsolete" : obsolete
831 //"jsSignature" : nameStr
832 }; 934 };
833 if (parse.idl.length > 0) { 935 if (parse.idl.length > 0) {
834 entry["idl"] = parse.idl; 936 entry["idl"] = parse.idl;
835 } 937 }
836 cleanupEntry(members, entry); 938 cleanupEntry(members, entry);
837 } 939 }
838 } 940 }
839 } 941 }
840 942
841 String trimHtml(String html) { 943 String trimHtml(String html) {
842 // TODO(jacobr): impl. 944 // TODO(jacobr): implement this. Remove spurious enclosing HTML tags, etc.
843 return html; 945 return html;
844 } 946 }
845 947
846 bool maybeName(String name) { 948 bool maybeName(String name) {
847 RegExp nameRegExp = new RegExp("^[a-z][a-z0-9A-Z]+\$"); 949 return const RegExp("^[a-z][a-z0-9A-Z]+\$").hasMatch(name) ||
848 if (nameRegExp.hasMatch(name)) return true; 950 const RegExp("^[A-Z][A-Z_]*\$").hasMatch(name);
849 RegExp constRegExp = new RegExp("^[A-Z][A-Z_]*\$");
850 if (constRegExp.hasMatch(name)) return true;
851 } 951 }
852 952
953 // TODO(jacobr): this function is ugly at the moment but will become easier to
954 // read once ElementList supports most of the Element functionality.
853 void markRemoved(var e) { 955 void markRemoved(var e) {
854 if (e != null) { 956 if (e != null) {
855 // TODO( remove)
856 if (e is Element) { 957 if (e is Element) {
857 e.classes.add(DART_REMOVED); 958 e.classes.add(DART_REMOVED);
858 } else { 959 } else {
859 for (Element el in e) { 960 for (Element el in e) {
860 el.classes.add(DART_REMOVED); 961 el.classes.add(DART_REMOVED);
861 } 962 }
862 } 963 }
863 } 964 }
864 } 965 }
865 966
967 // TODO(jacobr): remove this when the dartium JSON parser handles \n correctly.
866 String JSONFIXUPHACK(String value) { 968 String JSONFIXUPHACK(String value) {
867 return value.replaceAll("\n", "ZDARTIUMDOESNTESCAPESLASHNJXXXX"); 969 return value.replaceAll("\n", "ZDARTIUMDOESNTESCAPESLASHNJXXXX");
868 } 970 }
869 971
870 String mozToWebkit(String name) { 972 String mozToWebkit(String name) {
871 RegExp regExp = new RegExp("^moz"); 973 return name.replaceFirst(const RegExp("^moz"), "webkit");
872 name = name.replaceFirst(regExp, "webkit");
873 return name;
874 } 974 }
875 975
876 String stripWebkit(String name) { 976 String stripWebkit(String name) {
877 return trimPrefix(name, "webkit"); 977 return trimPrefix(name, "webkit");
878 } 978 }
879 979
980 // TODO(jacobr): be more principled about this.
880 String fullNameCleanup(String name) { 981 String fullNameCleanup(String name) {
881 int parenIndex = name.indexOf('('); 982 int parenIndex = name.indexOf('(');
882 if (parenIndex != -1) { 983 if (parenIndex != -1) {
883 // TODO(jacobr): workaround bug in:
884 // name = name.split("(")[0];
885 name = name.substring(0, parenIndex); 984 name = name.substring(0, parenIndex);
886 } 985 }
887 name = name.split(" ")[0]; 986 name = name.split(" ")[0];
888 name = name.split("\n")[0]; 987 name = name.split("\n")[0];
889 name = name.split("\t")[0]; 988 name = name.split("\t")[0];
890 name = name.split("*")[0]; 989 name = name.split("*")[0];
891 name = name.trim(); 990 name = name.trim();
892 name = safeNameCleanup(name); 991 name = safeNameCleanup(name);
893 return name; 992 return name;
894 } 993 }
895 994
896 // Less aggressive than the full cleanup to avoid overeager matching of 995 // Less aggressive than the full name cleanup to avoid overeager matching.
897 // everything 996 // TODO(jacobr): be more principled about this.
898 String safeNameCleanup(String name) { 997 String safeNameCleanup(String name) {
899 int parenIndex = name.indexOf('('); 998 int parenIndex = name.indexOf('(');
900 if (parenIndex != -1 && name.indexOf(")") != -1) { 999 if (parenIndex != -1 && name.indexOf(")") != -1) {
901 // TODO(jacobr): workaround bug in: 1000 // TODO(jacobr): workaround bug in:
902 // name = name.split("(")[0]; 1001 // name = name.split("(")[0];
903 name = name.substring(0, parenIndex); 1002 name = name.substring(0, parenIndex);
904 } 1003 }
905 name = name.trim(); 1004 name = name.trim();
906 name = trimPrefix(name, currentType + "."); 1005 name = trimPrefix(name, currentType + ".");
907 name = trimPrefix(name, currentType.toLowerCase() + "."); 1006 name = trimPrefix(name, currentType.toLowerCase() + ".");
908 name = trimPrefix(name, currentTypeShort + "."); 1007 name = trimPrefix(name, currentTypeShort + ".");
909 name = trimPrefix(name, currentTypeShort.toLowerCase() + "."); 1008 name = trimPrefix(name, currentTypeShort.toLowerCase() + ".");
910 name = trimPrefix(name, currentTypeTiny + "."); 1009 name = trimPrefix(name, currentTypeTiny + ".");
911 name = trimPrefix(name, currentTypeTiny.toLowerCase() + "."); 1010 name = trimPrefix(name, currentTypeTiny.toLowerCase() + ".");
912 name = name.trim(); 1011 name = name.trim();
913 name = mozToWebkit(name); 1012 name = mozToWebkit(name);
914 return name; 1013 return name;
915 } 1014 }
916 1015
1016 /**
1017 * Remove h1, h2, and h3 headers.
1018 */
917 void removeHeaders(DocumentFragment fragment) { 1019 void removeHeaders(DocumentFragment fragment) {
918 for (Element e in fragment.queryAll("h1, h2, h3")) { 1020 for (Element e in fragment.queryAll("h1, h2, h3")) {
919 e.remove(); 1021 e.remove();
920 } 1022 }
921 } 1023 }
922 1024
1025 /**
1026 * Given an [entry] representing a single method or property cleanup the
1027 * values performing some simple normalization and only adding the entry to
1028 * [members] if it has a valid name.
1029 */
923 void cleanupEntry(List members, Map entry) { 1030 void cleanupEntry(List members, Map entry) {
924 if (entry.containsKey('help')) { 1031 if (entry.containsKey('help')) {
925 entry['help'] = trimHtml(entry['help']); 1032 entry['help'] = trimHtml(entry['help']);
926 } 1033 }
927 String name = fullNameCleanup(entry['name']); 1034 String name = fullNameCleanup(entry['name']);
928 entry['name'] = name; 1035 entry['name'] = name;
929 if (maybeName(name)) { 1036 if (maybeName(name)) {
930 for (String key in entry.getKeys()) { 1037 for (String key in entry.getKeys()) {
931 var value = entry[key]; 1038 var value = entry[key];
932 if (value == null) { 1039 if (value == null) {
(...skipping 10 matching lines...) Expand all
943 1050
944 // TODO(jacobr) dup with trim start.... 1051 // TODO(jacobr) dup with trim start....
945 String trimPrefix(String str, String prefix) { 1052 String trimPrefix(String str, String prefix) {
946 if (str.indexOf(prefix) == 0) { 1053 if (str.indexOf(prefix) == 0) {
947 return str.substring(prefix.length); 1054 return str.substring(prefix.length);
948 } else { 1055 } else {
949 return str; 1056 return str;
950 } 1057 }
951 } 1058 }
952 1059
953 void resourceLoaded() {
954 if (data != null) run();
955 }
956
957 String trimStart(String str, String start) { 1060 String trimStart(String str, String start) {
958 if (str.startsWith(start) && str.length > start.length) { 1061 if (str.startsWith(start) && str.length > start.length) {
959 return str.substring(start.length); 1062 return str.substring(start.length);
960 } 1063 }
961 return str; 1064 return str;
962 } 1065 }
963 1066
964 String trimEnd(String str, String end) { 1067 String trimEnd(String str, String end) {
965 if (str.endsWith(end) && str.length > end.length) { 1068 if (str.endsWith(end) && str.length > end.length) {
966 return str.substring(0, str.length - end.length); 1069 return str.substring(0, str.length - end.length);
967 } 1070 }
968 return str; 1071 return str;
969 } 1072 }
970 1073
1074 /**
1075 * Extract a section with name [key] using [selector] to find start points for
1076 * the section in the document.
1077 */
971 void extractSection(String selector, String key) { 1078 void extractSection(String selector, String key) {
972 for (Element e in document.queryAll(selector)) { 1079 for (Element e in document.queryAll(selector)) {
973 e = e.parent; 1080 e = e.parent;
974 for (Element skip in e.queryAll("h1, h2, $IDL_SELECTOR")) { 1081 for (Element skip in e.queryAll("h1, h2, $IDL_SELECTOR")) {
975 skip.remove(); 1082 skip.remove();
976 } 1083 }
977 String html = filteredHtml(e, e, null, removeHeaders).html; 1084 String html = filteredHtml(e, e, null, removeHeaders).html;
978 if (html.length > 0) { 1085 if (html.length > 0) {
979 if (dbEntry.containsKey(key)) { 1086 if (dbEntry.containsKey(key)) {
980 dbEntry[key] += html; 1087 dbEntry[key] += html;
981 } else { 1088 } else {
982 dbEntry[key] = html; 1089 dbEntry[key] = html;
983 } 1090 }
984 } 1091 }
985 e.classes.add(DART_REMOVED); 1092 e.classes.add(DART_REMOVED);
986 } 1093 }
987 } 1094 }
988 1095
989 void run() { 1096 void run() {
990 // Inject CSS to insure lines don't wrap unless it was intentional. 1097 // Inject CSS to ensure lines don't wrap unless they were intended to.
1098 // This is needed to make the logic to determine what is a single line
1099 // behave consistently even for very long method names.
991 document.head.nodes.add(new Element.html(""" 1100 document.head.nodes.add(new Element.html("""
992 <style type="text/css"> 1101 <style type="text/css">
993 body { 1102 body {
994 width: 10000px; 1103 width: 10000px;
995 } 1104 }
996 </style>""")); 1105 </style>"""));
997 1106
998 String title = trimEnd(window.document.title.trim(), " - MDN"); 1107 String title = trimEnd(window.document.title.trim(), " - MDN");
999 dbEntry['title'] = title; 1108 dbEntry['title'] = title;
1000 1109
1001 // TODO(rnystrom): Clean up the page a bunch. Not sure if this is the best 1110 // TODO(rnystrom): Clean up the page a bunch. Not sure if this is the best
1002 // place to do this... 1111 // place to do this...
1112 // TODO(jacobr): move this to right before we extract HTML.
1003 1113
1004 // Remove the "Introduced in HTML <version>" boxes. 1114 // Remove the "Introduced in HTML <version>" boxes.
1005 for (Element e in document.queryAll('.htmlVersionHeaderTemplate')) { 1115 for (Element e in document.queryAll('.htmlVersionHeaderTemplate')) {
1006 e.remove(); 1116 e.remove();
1007 } 1117 }
1008 1118
1009 // Flatten the list of known DOM types into a faster and case-insensitive map. 1119 // Flatten the list of known DOM types into a faster and case-insensitive
1120 // map.
1010 domTypes = {}; 1121 domTypes = {};
1011 for (final domType in domTypesRaw) { 1122 for (final domType in domTypesRaw) {
1012 domTypes[domType.toLowerCase()] = domType; 1123 domTypes[domType.toLowerCase()] = domType;
1013 } 1124 }
1014 1125
1015 // Fix up links. 1126 // Fix up links.
1016 final SHORT_LINK = const RegExp(@'^[\w/]+$'); 1127 final SHORT_LINK = const RegExp(@'^[\w/]+$');
1017 final INNER_LINK = const RegExp(@'[Ee]n/(?:[\w/]+/|)([\w#.]+)(?:\(\))?$'); 1128 final INNER_LINK = const RegExp(@'[Ee]n/(?:[\w/]+/|)([\w#.]+)(?:\(\))?$');
1018 final MEMBER_LINK = const RegExp(@'(\w+)[.#](\w+)'); 1129 final MEMBER_LINK = const RegExp(@'(\w+)[.#](\w+)');
1019 final RELATIVE_LINK = const RegExp(@'^(?:../)*/?[Ee][Nn]/(.+)'); 1130 final RELATIVE_LINK = const RegExp(@'^(?:../)*/?[Ee][Nn]/(.+)');
1020 1131
1021 // - Make relative links absolute. 1132 // - Make relative links absolute.
1022 // - If we can, take links that point to other MDN pages and retarget them 1133 // - If we can, take links that point to other MDN pages and retarget them
1023 // to appropriate pages in our docs. 1134 // to appropriate pages in our docs.
1024 // TODO(rnystrom): Add rel external to links we didn't fix. 1135 // TODO(rnystrom): Add rel external to links we didn't fix.
1025 for (AnchorElement a in document.queryAll('a')) { 1136 for (AnchorElement a in document.queryAll('a')) {
1026 // Get the raw attribute because we *don't* want the browser to fully- 1137 // Get the raw attribute because we *don't* want the browser to fully-
1027 // qualify the name for us since it has the wrong base address for the page. 1138 // qualify the name for us since it has the wrong base address for the
1139 // page.
1028 var href = a.attributes['href']; 1140 var href = a.attributes['href'];
1029 1141
1030 // Ignore busted links. 1142 // Ignore busted links.
1031 if (href == null) continue; 1143 if (href == null) continue;
1032 1144
1033 // If we can recognize what it's pointing to, point it to our page instead. 1145 // If we can recognize what it's pointing to, point it to our page instead.
1034 tryToLinkToRealType(maybeType) { 1146 tryToLinkToRealType(maybeType) {
1035 // See if we know a type with that name. 1147 // See if we know a type with that name.
1036 final realType = domTypes[maybeType.toLowerCase()]; 1148 final realType = domTypes[maybeType.toLowerCase()];
1037 if (realType != null) { 1149 if (realType != null) {
(...skipping 25 matching lines...) Expand all
1063 tryToLinkToRealType(member[1]); 1175 tryToLinkToRealType(member[1]);
1064 } else { 1176 } else {
1065 tryToLinkToRealType(match[1]); 1177 tryToLinkToRealType(match[1]);
1066 } 1178 }
1067 } 1179 }
1068 1180
1069 // Put it back into the element. 1181 // Put it back into the element.
1070 a.attributes['href'] = href; 1182 a.attributes['href'] = href;
1071 } 1183 }
1072 1184
1073 if (title.toLowerCase().indexOf(currentTypeTiny.toLowerCase()) == -1) { 1185 if (!title.toLowerCase().contains(currentTypeTiny.toLowerCase())) {
1074 bool foundMatch = false; 1186 bool foundMatch = false;
1075 // Test out if the title is really an HTML tag that matches the 1187 // Test out if the title is really an HTML tag that matches the
1076 // current class name. 1188 // current class name.
1077 for (String tag in [title.split(" ")[0], title.split(".").last()]) { 1189 for (String tag in [title.split(" ")[0], title.split(".").last()]) {
1078 try { 1190 try {
1079 dom.Element element = dom.document.createElement(tag); 1191 dom.Element element = dom.document.createElement(tag);
1192 // TODO(jacobr): this is a really ugly way of doing this that will
1193 // stop working at some point soon.
1080 if (element.typeName == currentType) { 1194 if (element.typeName == currentType) {
1081 foundMatch = true; 1195 foundMatch = true;
1082 break; 1196 break;
1083 } 1197 }
1084 } catch(e) {} 1198 } catch(e) {}
1085 } 1199 }
1086 if (foundMatch == false) { 1200 if (!foundMatch) {
1087 dbEntry['skipped'] = true; 1201 dbEntry['skipped'] = true;
1088 dbEntry['cause'] = "Suspect title"; 1202 dbEntry['cause'] = "Suspect title";
1089 onEnd(); 1203 onEnd();
1090 return; 1204 return;
1091 } 1205 }
1092 } 1206 }
1093 1207
1094 Element root = document.query(".pageText"); 1208 Element root = document.query(".pageText");
1095 if (root == null) { 1209 if (root == null) {
1096 dbEntry['cause'] = '.pageText not found'; 1210 dbEntry['cause'] = '.pageText not found';
1097 onEnd(); 1211 onEnd();
1098 return; 1212 return;
1099 } 1213 }
1100 1214
1101 markRemoved(root.query("#Notes")); 1215 markRemoved(root.query("#Notes"));
1102 List members = dbEntry['members']; 1216 List members = dbEntry['members'];
1103 1217
1218 // This is a laundry list of CSS selectors for boilerplate content on the
1219 // MDN pages that we should ignore for the purposes of extracting
1220 // documentation.
1104 markRemoved(document.queryAll(".pageToc, footer, header, #nav-toolbar")); 1221 markRemoved(document.queryAll(".pageToc, footer, header, #nav-toolbar"));
1105 markRemoved(document.queryAll("#article-nav")); 1222 markRemoved(document.queryAll("#article-nav"));
1106 markRemoved(document.queryAll(".hideforedit")); 1223 markRemoved(document.queryAll(".hideforedit"));
1107 markRemoved(document.queryAll(".navbox")); 1224 markRemoved(document.queryAll(".navbox"));
1108 markRemoved(document.query("#Method_overview")); 1225 markRemoved(document.query("#Method_overview"));
1109 markRemoved(document.queryAll("h1, h2")); 1226 markRemoved(document.queryAll("h1, h2"));
1110 1227
1111 scrapeSection(root, "#Methods", currentType, members, 'methods'); 1228 scrapeSection(root, "#Methods", currentType, members, 'methods');
1112 scrapeSection(root, "#Constants, #Error_codes, #State_constants", currentType, members, 'constants'); 1229 scrapeSection(root, "#Constants, #Error_codes, #State_constants",
1230 currentType, members, 'constants');
1113 // TODO(jacobr): infer tables based on multiple matches rather than 1231 // TODO(jacobr): infer tables based on multiple matches rather than
1114 // using a hard coded list of section ids. 1232 // using a hard coded list of section ids.
1115 scrapeSection(root, 1233 scrapeSection(root,
1116 "[id^=Properties], #Notes, [id^=Other_properties], #Attributes, #DOM_prope rties, #Event_handlers, #Event_Handlers", 1234 "[id^=Properties], #Notes, [id^=Other_properties], #Attributes, " +
1235 "#DOM_properties, #Event_handlers, #Event_Handlers",
1117 currentType, members, 'properties'); 1236 currentType, members, 'properties');
1118 1237
1119 // Avoid doing this till now to avoid messing up the section scrape. 1238 // Avoid doing this till now to avoid messing up the section scrape.
1120 markRemoved(document.queryAll("h3")); 1239 markRemoved(document.queryAll("h3"));
1121 1240
1122 ElementList $examples = root.queryAll("span[id^=example], span[id^=Example]"); 1241 ElementList examples = root.queryAll("span[id^=example], span[id^=Example]");
1123 1242
1124 extractSection("#See_also", 'seeAlso'); 1243 extractSection("#See_also", 'seeAlso');
1125 extractSection("#Specification, #Specifications", "specification"); 1244 extractSection("#Specification, #Specifications", "specification");
1126 // $("#Methods").parent().remove(); // not safe (e.g. Document)
1127 1245
1128 // TODO(jacobr): actually extract the constructor(s) 1246 // TODO(jacobr): actually extract the constructor(s)
1129 extractSection("#Constructor, #Constructors", 'constructor'); 1247 extractSection("#Constructor, #Constructors", 'constructor');
1130 extractSection("#Browser_compatibility, #Compatibility", 'compatibility'); 1248 extractSection("#Browser_compatibility, #Compatibility", 'compatibility');
1131 1249
1250 // Extract examples.
1132 List<String> exampleHtml = []; 1251 List<String> exampleHtml = [];
1133 for (Element e in $examples) { 1252 for (Element e in examples) {
1134 e.classes.add(DART_REMOVED); 1253 e.classes.add(DART_REMOVED);
1135 } 1254 }
1136 for (Element e in $examples) { 1255 for (Element e in examples) {
1137 String html = filteredHtml(e, root, null, 1256 String html = filteredHtml(e, root, null,
1138 (DocumentFragment fragment) { 1257 (DocumentFragment fragment) {
1139 removeHeaders(fragment); 1258 removeHeaders(fragment);
1140 if (fragment.text.trim().toLowerCase() == "example") { 1259 if (fragment.text.trim().toLowerCase() == "example") {
1141 // Degenerate example. 1260 // Degenerate example.
1142 fragment.nodes.clear(); 1261 fragment.nodes.clear();
1143 } 1262 }
1144 }).html; 1263 }).html;
1145 if (html.length > 0) { 1264 if (html.length > 0) {
1146 exampleHtml.add(html); 1265 exampleHtml.add(html);
1147 } 1266 }
1148 } 1267 }
1149 if (exampleHtml.length > 0) { 1268 if (exampleHtml.length > 0) {
1150 dbEntry['examples'] = exampleHtml; 1269 dbEntry['examples'] = exampleHtml;
1151 } 1270 }
1152 1271
1272 // Extract the class summary.
1273 // Basically everything left over after the #Summary or #Description tag is
1274 // safe to include in the summary.
1153 StringBuffer summary = new StringBuffer(); 1275 StringBuffer summary = new StringBuffer();
1154
1155 for (Element e in root.queryAll("#Summary, #Description")) { 1276 for (Element e in root.queryAll("#Summary, #Description")) {
1156 summary.add(filteredHtml(root, e, null, removeHeaders).html); 1277 summary.add(filteredHtml(root, e, null, removeHeaders).html);
1157 } 1278 }
1158 1279
1159 if (summary.length == 0) { 1280 if (summary.length == 0) {
1160 // Remove the "Gecko DOM Reference" text 1281 // Remove the "Gecko DOM Reference" text
1161 Element ref = root.query(".lang.lang-en"); 1282 Element ref = root.query(".lang.lang-en");
1162 if (ref != null) { 1283 if (ref != null) {
1163 ref = ref.parent; 1284 ref = ref.parent;
1164 String refText = ref.text.trim(); 1285 String refText = ref.text.trim();
1165 if (refText == "Gecko DOM Reference" || 1286 if (refText == "Gecko DOM Reference" ||
1166 refText == "« Gecko DOM Reference") { 1287 refText == "« Gecko DOM Reference") {
1167 ref.remove(); 1288 ref.remove();
1168 } 1289 }
1169 } 1290 }
1170 // Risky... this might add stuff we shouldn't. 1291 // Risky... this might add stuff we shouldn't.
1171 summary.add(filteredHtml(root, root, null, removeHeaders).html); 1292 summary.add(filteredHtml(root, root, null, removeHeaders).html);
1172 } 1293 }
1173 1294
1174 if (summary.length > 0) { 1295 if (summary.length > 0) {
1175 dbEntry['summary'] = summary.toString(); 1296 dbEntry['summary'] = summary.toString();
1176 } 1297 }
1177 1298
1178 // Inject CSS to aid debugging in the browser. 1299 // Inject CSS to aid debugging in the browser.
1300 // We could avoid doing this if we know we are not running in a browser.
1179 document.head.nodes.add(new Element.html(DEBUG_CSS)); 1301 document.head.nodes.add(new Element.html(DEBUG_CSS));
1180 1302
1181 onEnd(); 1303 onEnd();
1182 } 1304 }
1183 1305
1184 void main() { 1306 void main() {
1185 window.on.load.add(documentLoaded); 1307 window.on.load.add(documentLoaded);
1186 } 1308 }
1187 1309
1188 void documentLoaded(event) { 1310 void documentLoaded(event) {
1311 // Load the database of expected methods and properties with an
1312 // XMLHttpRequest.
1189 new XMLHttpRequest.getTEMPNAME('${window.location}.json', (req) { 1313 new XMLHttpRequest.getTEMPNAME('${window.location}.json', (req) {
1190 data = JSON.parse(req.responseText); 1314 data = JSON.parse(req.responseText);
1191 dbEntry = {'members': [], 'srcUrl': pageUrl}; 1315 dbEntry = {'members': [], 'srcUrl': pageUrl};
1192 resourceLoaded(); 1316 run();
1193 }); 1317 });
1194 } 1318 }
OLDNEW
« no previous file with comments | « utils/apidoc/mdn/crawl.js ('k') | utils/apidoc/mdn/extract.sh » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698