utils/apidoc/mdn/extract.dart - Issue 9315026: Cleanup mdn scripts

Unified Diff: utils/apidoc/mdn/extract.dart

Issue 9315026: Cleanup mdn scripts (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Code review comment fixes Created 8 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: utils/apidoc/mdn/extract.dart

diff --git a/utils/apidoc/mdn/extract.dart b/utils/apidoc/mdn/extract.dart

index 17627054da495a2b004b937e84d0593364b0fe29..58be76f7220027e04b514240b434763730e96bdb 100644

--- a/utils/apidoc/mdn/extract.dart

+++ b/utils/apidoc/mdn/extract.dart

@@ -14,8 +14,7 @@ Range _tempRange;

// Hacks because ASYNC measurement is annoying when just writing a script.

ClientRect getClientRect(Node n) {

if (n is Element) {

- Element e = n;

- dom.Element raw = unwrapDomObject(e.dynamic);

+ dom.Element raw = unwrapDomObject(n.dynamic);

return LevelDom.wrapClientRect(raw.getBoundingClientRect());

} else {

// Crazy hacks that works for nodes.... create a range and measure it.

@@ -28,11 +27,18 @@ ClientRect getClientRect(Node n) {

}

-final DART_REMOVED = "dart_removed";

+/**

+ * CSS class that is added to elements in the DOM to indicate that they should

+ * be removed when extracting blocks of documentation. This is helpful when

+ * running this script in a web browser as it is easy to visually see what

+ * blocks of information were extracted when using CSS such as DEBUG_CSS

+ * which highlights elements that should be removed.

+ */

+final DART_REMOVED = "dart-removed";

final DEBUG_CSS = """

<style>

- .dart_removed {

+ .dart-removed {

background-color: rgba(255, 0, 0, 0.5);

}

</style>""";

@@ -281,7 +287,7 @@ String getAbsoluteUrl(AnchorElement anchor) {

}

bool inTable(Node n) {

- while(n != null) {

+ while (n != null) {

if (n is TableElement) return true;

n = n.parent;

}

@@ -295,7 +301,7 @@ String escapeHTML(str) {

}

List<Text> getAllTextNodes(Element elem) {

- List<Text> nodes = <Text>[];

+ final nodes = <Text>[];

helper(Node n) {

if (n is Text) {

nodes.add(n);

@@ -323,8 +329,8 @@ bool isSkippableType(Node n) {

}

if (n is Text) return true;

- for (Node child in n.nodes) {

- if (isSkippableType(child) == false) {

+ for (final child in n.nodes) {

+ if (!isSkippableType(child)) {

return false;

}

@@ -353,44 +359,81 @@ class SectionParseResult {

}

String genCleanHtml(Element root) {

- for (Element e in root.queryAll(".$DART_REMOVED")) {

+ for (final e in root.queryAll(".$DART_REMOVED")) {

e.classes.remove(DART_REMOVED);

}

// Ditch inline styles.

- for (Element e in root.queryAll('[style]')) {

+ for (final e in root.queryAll('[style]')) {

e.attributes.remove('style');

}

// These elements are just tags that we should suppress.

- for (Element e in root.queryAll(".lang.lang-en")) {

+ for (final e in root.queryAll(".lang.lang-en")) {

e.remove();

}

+ Element parametersList;

+ Element returnValue;

nweiz 2012/02/02 00:16:19 Maybe "parametersHeader" and "returnHeader"? Curre

+ for (final e in root.queryAll("h6")) {

+ if (e.text == 'Parameters') {

+ parametersList = e;

+ } else if (e.text == 'Return value') {

+ returnValue = e;

+ }

+ if (parametersList != null) {

+ int numEmptyParameters = 0;

+ final parameterDescriptions = root.queryAll("dd");

+ for (Element parameterDescription in parameterDescriptions) {

+ if (parameterDescription.text.trim().length == 0) {

+ numEmptyParameters++;

nweiz 2012/02/02 00:16:19 "numEmptyParameters = parameterDescriptions.filter

+ }

+ if (numEmptyParameters > 0 &&

+ numEmptyParameters == parameterDescriptions.length) {

+ // Remove the parameter list as it adds zero value as all descriptions

+ // are empty.

+ parametersList.remove();

+ for (final e in root.queryAll("dl")) {

+ e.remove();

+ }

+ } else if (parameterDescriptions.length == 0 &&

+ parametersList.nextElementSibling != null &&

+ parametersList.nextElementSibling.text.trim() == 'None.') {

+ // No need to display that the function takes 0 parameters.

+ parametersList.nextElementSibling.remove();

+ parametersList.remove();

+ }

+ // Heuristic: if the return value is a single word it is a type name not a

+ // useful text description so suppress it.

+ if (returnValue != null &&

+ returnValue.nextElementSibling != null &&

+ returnValue.nextElementSibling.text.trim().split(' ').length <= 1) {

+ returnValue.nextElementSibling.remove();

+ returnValue.remove();

+ }

bool changed = true;

while (changed) {

changed = false;

- while (root.nodes.length == 1) {

- Node child = root.nodes.first;

- if (child is Element) {

- root = child;

- changed = true;

- } else {

- // Just calling innerHTML on the parent will be sufficient...

- // and insures the output is properly escaped.

- break;

- }

+ while (root.nodes.length == 1 && root.nodes.first is Element) {

+ root = root.nodes.first;

+ changed = true;

}

// Trim useless nodes from the front.

- while(root.nodes.length > 0 &&

+ while (root.nodes.length > 0 &&

isSkippable(root.nodes.first)) {

root.nodes.first.remove();

changed = true;

}

// Trim useless nodes from the back.

- while(root.nodes.length > 0 &&

+ while (root.nodes.length > 0 &&

isSkippable(root.nodes.last())) {

root.nodes.last().remove();

changed = true;

@@ -399,10 +442,6 @@ String genCleanHtml(Element root) {

return JSONFIXUPHACK(root.innerHTML);

}

-String genPrettyHtml(DocumentFragment fragment) {

- return genCleanHtml(fragment);

String genPrettyHtmlFromElement(Element e) {

e = e.clone(true);

return genCleanHtml(e);

@@ -420,7 +459,7 @@ class PostOrderTraversalIterator implements Iterator<Node> {

Node next() {

if (_next == null) return null;

- Node ret = _next;

+ final ret = _next;

if (_next.nextNode != null) {

_next = _leftMostDescendent(_next.nextNode);

} else {

@@ -445,11 +484,11 @@ class PostOrderTraversal implements Iterable<Node> {

}

Range findFirstLine(Range section, String prop) {

- Range firstLine = newRange();

+ final firstLine = newRange();

firstLine.setStart(section.startContainer, section.startOffset);

num maxBottom = null;

- for (Node n in new PostOrderTraversal(section.startContainer)) {

+ for (final n in new PostOrderTraversal(section.startContainer)) {

int compareResult = section.comparePoint(n, 0);

if (compareResult == -1) {

// before range so skip.

@@ -462,9 +501,8 @@ Range findFirstLine(Range section, String prop) {

final rect = getClientRect(n);

num bottom = rect.bottom;

if (rect.height > 0 && rect.width > 0) {

- if (maxBottom != null && (

- maxBottom + MIN_PIXELS_DIFFERENT_LINES < bottom

- )) {

+ if (maxBottom != null &&

+ maxBottom + MIN_PIXELS_DIFFERENT_LINES < bottom) {

break;

} else if (maxBottom == null || maxBottom > bottom) {

maxBottom = bottom;

@@ -474,7 +512,10 @@ Range findFirstLine(Range section, String prop) {

firstLine.setEndAfter(n);

}

- if (firstLine.toString().indexOf(stripWebkit(prop)) == -1) {

+ // If the first line of text in the section does not contain the property

+ // name then we're not confident we are able to extract a high accuracy match

+ // so we should not return anything.

+ if (!firstLine.toString().contains(stripWebkit(prop))) {

return null;

}

return firstLine;

@@ -482,7 +523,7 @@ Range findFirstLine(Range section, String prop) {

AnchorElement findAnchorElement(Element root, String prop) {

for (AnchorElement a in root.queryAll("a")) {

- if (a.text.indexOf(prop) != -1) {

+ if (a.text.contains(prop)) {

return a;

}

@@ -490,9 +531,9 @@ AnchorElement findAnchorElement(Element root, String prop) {

}

// First surrounding element with an ID is safe enough.

-Element findTigherRoot(Element elem, Element root) {

+Element findTighterRoot(Element elem, Element root) {

Element candidate = elem;

- while(root != candidate) {

+ while (root != candidate) {

candidate = candidate.parent;

if (candidate.id.length > 0 && candidate.id.indexOf("section_") != 0) {

break;

@@ -501,22 +542,22 @@ Element findTigherRoot(Element elem, Element root) {

return candidate;

}

-// this is very slow and ugly.. consider rewriting.

+// TODO(jacobr): this is very slow and ugly.. consider rewriting or at least

+// commenting carefully.

SectionParseResult filteredHtml(Element elem, Element root, String prop,

Function fragmentGeneratedCallback) {

// Using a tighter root avoids false positives at the risk of trimming

// text we shouldn't.

- root = findTigherRoot(elem, root);

- Range range = newRange();

+ root = findTighterRoot(elem, root);

+ final range = newRange();

range.setStartBefore(elem);

Element current = elem;

while (current != null) {

range.setEndBefore(current);

- if (current.classes.contains(DART_REMOVED)) {

- if (range.toString().trim().length > 0) {

- break;

- }

+ if (current.classes.contains(DART_REMOVED) &&

+ range.toString().trim().length > 0) {

+ break;

}

if (current.firstElementChild != null) {

current = current.firstElementChild;

@@ -547,7 +588,7 @@ SectionParseResult filteredHtml(Element elem, Element root, String prop,

}

- DocumentFragment fragment = range.cloneContents();

+ final fragment = range.cloneContents();

if (fragmentGeneratedCallback != null) {

fragmentGeneratedCallback(fragment);

}

@@ -557,7 +598,7 @@ SectionParseResult filteredHtml(Element elem, Element root, String prop,

}

// Extract idl

- StringBuffer idl = new StringBuffer();

+ final idl = new StringBuffer();

if (prop != null && prop.length > 0) {

// Only expect properties to have HTML.

for(Element e in fragment.queryAll(IDL_SELECTOR)) {

@@ -570,43 +611,41 @@ SectionParseResult filteredHtml(Element elem, Element root, String prop,

for (Element e in fragment.queryAll("pre")) {

// Check if it looks like idl...

String txt = e.text.trim();

- if (likelyIdl.hasMatch(txt) && txt.indexOf("\n") != -1

- && txt.indexOf(")") != -1) {

+ if (likelyIdl.hasMatch(txt) && txt.contains("\n") && txt.contains(")")) {

idl.add(e.outerHTML);

e.remove();

}

- return new SectionParseResult(genPrettyHtml(fragment), url, idl.toString());

+ return new SectionParseResult(genCleanHtml(fragment), url, idl.toString());

}

-Element findBest(Element root, List<Text> allText, String prop, String propType) {

+Element findBest(Element root, List<Text> allText, String prop,

+ String propType) {

// Best bet: match an id

Element cand;

- cand = root.query("#" + prop);

+ cand = root.query("#$prop");

if (cand == null && propType == "methods") {

- cand = root.query("[id=" + prop + "\$\$]");

+ cand = root.query("[id=$prop\$\$]");

+ }

+ while (cand != null && cand.text.trim().length == 0) {

+ // We found the bookmark for the element but sadly it is just an empty

+ // placeholder. Find the first real element.

+ cand = cand.nextElementSibling;

}

if (cand != null) {

- while (cand != null && cand.text.trim().length == 0) {

- // We found the bookmark for the element but sadly it is just an empty

- // placeholder. Find the first real element.

- cand = cand.nextElementSibling;

- }

- if (cand != null) {

- return cand;

- }

+ return cand;

}

- // If you are at least 70 pixels from the left, something is definitely fishy and we shouldn't even consider this candidate.

+ // If you are at least 70 pixels from the left, something is definitely

+ // fishy and we shouldn't even consider this candidate.

num candLeft = 70;

for (Text text in allText) {

Element proposed = null;

-// var t = safeNameCleanup(text.text);

-// TODO(jacobr): does it hurt precision to use the full cleanup?

+ // TODO(jacobr): does it hurt precision to use the full cleanup?

String t = fullNameCleanup(text.text);

if (t == prop) {

proposed = text.parent;

@@ -636,14 +675,12 @@ bool isObsolete(Element e) {

}

bool isFirstCharLowerCase(String str) {

- RegExp firstLower = new RegExp("^[a-z]");

- return firstLower.hasMatch(str);

+ return const RegExp("^[a-z]").hasMatch(str);

}

-void scrapeSection(Element root, String sectionSelector,

- String currentType,

- List members,

- String propType) {

+// TODO(jacobr): document this method.

+void scrapeSection(Element root, String sectionSelector, String currentType,

+ List members, String propType) {

Map expectedProps = dartIdl[propType];

Set<String> alreadyMatchedProperties = new Set<String>();

@@ -655,8 +692,8 @@ void scrapeSection(Element root, String sectionSelector,

}

for (Element matchElement in allMatches) {

DivElement match = matchElement.parent;

- if (!match.id.startsWith("section") && !(match.id == "pageText")) {

- throw "Enexpected element $match";

+ if (!match.id.startsWith("section") && match.id != "pageText") {

+ throw "Unexpected element $match";

}

match.classes.add(DART_REMOVED);

@@ -669,7 +706,7 @@ void scrapeSection(Element root, String sectionSelector,

int helpIndex = -1;

num i = 0;

for (Element r in t.queryAll("th, td.header")) {

- var txt = r.text.trim().split(" ")[0].toLowerCase();

+ final txt = r.text.trim().split(" ")[0].toLowerCase();

if (txt == "description") {

helpIndex = i;

break;

@@ -685,14 +722,13 @@ void scrapeSection(Element root, String sectionSelector,

// Find the row that seems to have the most names that look like

// expected properties.

for (Element r in t.queryAll("tbody tr")) {

- ElementList $row = r.elements;

- if ($row.length == 0 || $row.first.classes.contains(".header")) {

+ ElementList row = r.elements;

+ if (row.length == 0 || row.first.classes.contains(".header")) {

continue;

}

- for (int k = 0; k < numMatches.length && k < $row.length; k++) {

- Element e = $row[k];

- if (expectedProps.containsKey(fullNameCleanup(e.text))) {

+ for (int k = 0; k < numMatches.length && k < row.length; k++) {

+ if (expectedProps.containsKey(fullNameCleanup(row[k].text))) {

numMatches[k]++;

break;

}

@@ -711,14 +747,14 @@ void scrapeSection(Element root, String sectionSelector,

}

for (Element r in t.queryAll("tbody tr")) {

- ElementList $row = r.elements;

- if ($row.length > propNameIndex && $row.length > helpIndex ) {

- if ($row.first.classes.contains(".header")) {

+ ElementList row = r.elements;

+ if (row.length > propNameIndex && row.length > helpIndex) {

+ if (row.first.classes.contains(".header")) {

continue;

}

// TODO(jacobr): this code for determining the namestr is needlessly

// messy.

- Element nameRow = $row[propNameIndex];

+ Element nameRow = row[propNameIndex];

AnchorElement a = nameRow.query("a");

String goodName = '';

if (a != null) {

@@ -728,15 +764,14 @@ void scrapeSection(Element root, String sectionSelector,

Map entry = new Map<String, String>();

- // "currentType": $($row[1]).text().trim(), // find("code") ?

- entry["name"] = fullNameCleanup(nameStr.length > 0 ? nameStr : goodName);

+ entry["name"] = fullNameCleanup(nameStr.length > 0 ?

+ nameStr : goodName);

final parse = filteredHtml(nameRow, nameRow, entry["name"], null);

String altHelp = parse.html;

- // "jsSignature": nameStr,

- entry["help"] = (helpIndex == -1 || $row[helpIndex] == null) ? altHelp : genPrettyHtmlFromElement($row[helpIndex]);

- // "altHelp" : altHelp,

+ entry["help"] = (helpIndex == -1 || row[helpIndex] == null) ?

+ altHelp : genPrettyHtmlFromElement(row[helpIndex]);

if (parse.url != null) {

entry["url"] = parse.url;

}

@@ -777,8 +812,7 @@ void scrapeSection(Element root, String sectionSelector,

}

for (String prop in pmap.getKeys()) {

- Element e = pmap[prop];

- e.classes.add(DART_REMOVED);

+ pmap[prop].classes.add(DART_REMOVED);

}

for (String prop in pmap.getKeys()) {

@@ -786,14 +820,15 @@ void scrapeSection(Element root, String sectionSelector,

ClientRect r = getClientRect(e);

// TODO(jacobr): a lot of these queries are identical.

for (Element cand in match.queryAll(e.tagName)) {

- if (!cand.classes.contains(DART_REMOVED) && !inTable(cand) ) { // XXX use a neg selector.

+ // TODO(jacobr): use a negative selector instead.

+ if (!cand.classes.contains(DART_REMOVED) && !inTable(cand)) {

ClientRect candRect = getClientRect(cand);

// TODO(jacobr): this is somewhat loose.

if (candRect.left == r.left &&

(candRect.height - r.height).abs() < 5) {

String propName = fullNameCleanup(cand.text);

- if (isFirstCharLowerCase(propName) && pmap.containsKey(propName) == false && alreadyMatchedProperties.contains(propName) == false) {

- // Don't set here to avoid layouts... cand.classes.add(DART_REMOVED);

+ if (isFirstCharLowerCase(propName) && !pmap.containsKey(propName)

+ && !alreadyMatchedProperties.contains(propName)) {

pmap[propName] = cand;

}

@@ -810,7 +845,7 @@ void scrapeSection(Element root, String sectionSelector,

// DART_REMOVED so we don't include them in member descriptions... which

// would suck.

for (Element e in match.queryAll("[id]")) {

- if (e.id.indexOf(matchElement.id) != -1) {

+ if (e.id.contains(matchElement.id)) {

e.classes.add(DART_REMOVED);

}

@@ -828,7 +863,6 @@ void scrapeSection(Element root, String sectionSelector,

"name" : prop,

"help" : parse.html,

"obsolete" : obsolete

- //"jsSignature" : nameStr

};

if (parse.idl.length > 0) {

entry["idl"] = parse.idl;

@@ -844,10 +878,8 @@ String trimHtml(String html) {

}

bool maybeName(String name) {

- RegExp nameRegExp = new RegExp("^[a-z][a-z0-9A-Z]+\$");

- if (nameRegExp.hasMatch(name)) return true;

- RegExp constRegExp = new RegExp("^[A-Z][A-Z_]*\$");

- if (constRegExp.hasMatch(name)) return true;

+ return const RegExp("^[a-z][a-z0-9A-Z]+\$").hasMatch(name) ||

+ const RegExp("^[A-Z][A-Z_]*\$").hasMatch(name);

}

void markRemoved(var e) {

@@ -868,9 +900,7 @@ String JSONFIXUPHACK(String value) {

}

String mozToWebkit(String name) {

- RegExp regExp = new RegExp("^moz");

- name = name.replaceFirst(regExp, "webkit");

- return name;

+ return name.replaceFirst(const RegExp("^moz"), "webkit");

}

String stripWebkit(String name) {

@@ -950,10 +980,6 @@ String trimPrefix(String str, String prefix) {

}

-void resourceLoaded() {

- if (data != null) run();

String trimStart(String str, String start) {

if (str.startsWith(start) && str.length > start.length) {

return str.substring(start.length);

@@ -987,7 +1013,7 @@ void extractSection(String selector, String key) {

}

void run() {

- // Inject CSS to insure lines don't wrap unless it was intentional.

+ // Inject CSS to ensure lines don't wrap unless it was intentional.

document.head.nodes.add(new Element.html("""

<style>

body {

@@ -1006,7 +1032,8 @@ void run() {

e.remove();

}

- // Flatten the list of known DOM types into a faster and case-insensitive map.

+ // Flatten the list of known DOM types into a faster and case-insensitive

+ // map.

domTypes = {};

for (final domType in domTypesRaw) {

domTypes[domType.toLowerCase()] = domType;

@@ -1024,7 +1051,8 @@ void run() {

// TODO(rnystrom): Add rel external to links we didn't fix.

for (AnchorElement a in document.queryAll('a')) {

// Get the raw attribute because we *don't* want the browser to fully-

- // qualify the name for us since it has the wrong base address for the page.

+ // qualify the name for us since it has the wrong base address for the

+ // page.

var href = a.attributes['href'];

// Ignore busted links.

@@ -1070,7 +1098,7 @@ void run() {

a.attributes['href'] = href;

}

- if (title.toLowerCase().indexOf(currentTypeTiny.toLowerCase()) == -1) {

+ if (!title.toLowerCase().contains(currentTypeTiny.toLowerCase())) {

bool foundMatch = false;

// Test out if the title is really an HTML tag that matches the

// current class name.

@@ -1083,7 +1111,7 @@ void run() {

}

} catch(e) {}

}

- if (foundMatch == false) {

+ if (!foundMatch) {

dbEntry['skipped'] = true;

dbEntry['cause'] = "Suspect title";

onEnd();

@@ -1109,31 +1137,32 @@ void run() {

markRemoved(document.queryAll("h1, h2"));

scrapeSection(root, "#Methods", currentType, members, 'methods');

- scrapeSection(root, "#Constants, #Error_codes, #State_constants", currentType, members, 'constants');

+ scrapeSection(root, "#Constants, #Error_codes, #State_constants",

+ currentType, members, 'constants');

// TODO(jacobr): infer tables based on multiple matches rather than

// using a hard coded list of section ids.

scrapeSection(root,

- "[id^=Properties], #Notes, [id^=Other_properties], #Attributes, #DOM_properties, #Event_handlers, #Event_Handlers",

+ "[id^=Properties], #Notes, [id^=Other_properties], #Attributes, " +

+ "#DOM_properties, #Event_handlers, #Event_Handlers",

currentType, members, 'properties');

// Avoid doing this till now to avoid messing up the section scrape.

markRemoved(document.queryAll("h3"));

- ElementList $examples = root.queryAll("span[id^=example], span[id^=Example]");

+ ElementList examples = root.queryAll("span[id^=example], span[id^=Example]");

extractSection("#See_also", 'seeAlso');

extractSection("#Specification, #Specifications", "specification");

- // $("#Methods").parent().remove(); // not safe (e.g. Document)

// TODO(jacobr): actually extract the constructor(s)

extractSection("#Constructor, #Constructors", 'constructor');

extractSection("#Browser_compatibility, #Compatibility", 'compatibility');

List<String> exampleHtml = [];

- for (Element e in $examples) {

+ for (Element e in examples) {

e.classes.add(DART_REMOVED);

}

- for (Element e in $examples) {

+ for (Element e in examples) {

String html = filteredHtml(e, root, null,

(DocumentFragment fragment) {

removeHeaders(fragment);

@@ -1189,6 +1218,6 @@ void documentLoaded(event) {

new XMLHttpRequest.getTEMPNAME('${window.location}.json', (req) {

data = JSON.parse(req.responseText);

dbEntry = {'members': [], 'srcUrl': pageUrl};

- resourceLoaded();

+ run();

});

}

« no previous file with comments | « utils/apidoc/mdn/crawl.js ('k') | utils/apidoc/mdn/extract.sh » ('j') | utils/apidoc/mdn/extractRunner.js » ('J')