Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(274)

Side by Side Diff: utils/apidoc/mdn/crawl.js

Issue 9225039: Integrate MDN content into API documentation. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Respond to review. Created 8 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « utils/apidoc/mdn/README.txt ('k') | utils/apidoc/mdn/data/dartIdl.json » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 var http = require('http');
2 var fs = require('fs');
3
// Create the directory that crawl results are written into.
try {
  fs.mkdirSync('output/crawl');
} catch (e) {
  // It doesn't matter if the directory already exists. Any other failure
  // (for example a missing 'output' parent directory) is rethrown so that it
  // is reported here rather than hidden.
  if (!e || e.code !== 'EEXIST') {
    throw e;
  }
}

// Parsed contents of data/domTypes.json. The crawl loop below iterates it by
// index and uses each element as a type name, so it is expected to be an array
// of strings.
var domTypes = JSON.parse(fs.readFileSync('data/domTypes.json', 'utf8'));

// Maps each type name to the list of {index, link, title} entries that were
// scheduled for scraping; serialized to output/crawl/cache.json at the end of
// the script.
var cacheData = {};
13
/**
 * Requests Google's cached copy of an MDN page and saves the response body to
 * output/crawl/<filename>.html.
 *
 * The request is asynchronous: this function returns once the request has been
 * sent, and the file is written later from the response event callbacks.
 *
 * @param {string} filename Base name (without the .html extension) of the file
 *     to write inside output/crawl/.
 * @param {string} link Url of the page to fetch. Must start with
 *     'https://developer.mozilla.org/'.
 * @throws {Error} Synchronously, if link does not start with the expected
 *     prefix. A request error is also turned into an Error, but it is thrown
 *     from inside the 'error' event callback, not to the caller of scrape.
 */
function scrape(filename, link) {
  console.log(link);
  var prefix = 'https://developer.mozilla.org/';
  if (link.indexOf(prefix) !== 0) {
    throw new Error("Unexpected url:" + link);
  }

  // We crawl content from googleusercontent.com so we don't have to worry about
  // crawler politeness like we would have to if scraping developer.mozilla.org
  // directly.
  var options = {
    host: 'webcache.googleusercontent.com',
    path: "/search?q=cache:" + link,
    port: 80,
    method: 'GET'
  };

  var req = http.request(options, function(res) {
    res.setEncoding('utf8');
    var data = '';
    var written = false;

    res.on('data', function(d) {
      data += d;
    });

    // Both 'end' and 'close' can be emitted for the same response, so guard
    // against writing (and logging) the same result twice.
    var writeOnce = function() {
      if (written) {
        return;
      }
      written = true;
      console.log("Writing crawl result for " + link);
      fs.writeFileSync("output/crawl/" + filename + ".html", data, 'utf8');
    };
    res.on('close', writeOnce);
    res.on('end', writeOnce);
  });

  // Register the error handler before sending the request.
  req.on('error', function(e) {
    throw new Error("Error " + e + " scraping " + link);
  });
  req.end();
}
53
// Hardcode the correct matching url for a few types where the search engine
// gets the wrong answer.
var hardcodedLinks = {
  'Screen': 'https://developer.mozilla.org/en/DOM/window.screen',
  'Text': 'https://developer.mozilla.org/en/DOM/Text',
  'Touch': 'https://developer.mozilla.org/en/DOM/Touch',
  'TouchEvent': 'https://developer.mozilla.org/en/DOM/TouchEvent',
  'webkitTouchEvent': 'https://developer.mozilla.org/en/DOM/TouchEvent',
  'WebkitTouchEvent': 'https://developer.mozilla.org/en/DOM/TouchEvent',
  'WebKitTouchEvent': 'https://developer.mozilla.org/en/DOM/TouchEvent',
  'HTMLSpanElement': 'https://developer.mozilla.org/en/HTML/Element/span',
  'HTMLPreElement': 'https://developer.mozilla.org/en/HTML/Element/pre',
  'HTMLFrameElement': 'https://developer.mozilla.org/en/HTML/Element/frame',
  'HTMLFrameSetElement': 'https://developer.mozilla.org/en/HTML/Element/frameset',
  'Geolocation': 'https://developer.mozilla.org/en/nsIDOMGeolocation',
  'Notification': 'https://developer.mozilla.org/en/DOM/notification',
  'IDBDatabase': 'https://developer.mozilla.org/en/IndexedDB/IDBDatabase'
};

// Url prefixes used to rewrite "article not found" search results.
var prefix = 'https://developer.mozilla.org/';
var notFoundPrefix = 'https://developer.mozilla.org/Article_not_found?uri=';

// For every DOM type, schedule a scrape of each candidate documentation page
// and record what was scheduled in cacheData.
for (var i = 0; i < domTypes.length; i++) {
  var type = domTypes[i];

  // Json containing the search results for the current type.
  var data = fs.readFileSync("output/search/" + type + ".json", 'utf8');
  var json = JSON.parse(data);
  if (!('items' in json)) {
    console.warn("No search results for " + type);
    continue;
  }
  var items = json['items'];

  var entry = [];
  cacheData[type] = entry;

  // An own-property check keeps type names such as 'constructor' from matching
  // properties inherited from Object.prototype.
  var link = null;
  if (Object.prototype.hasOwnProperty.call(hardcodedLinks, type)) {
    link = hardcodedLinks[type];
  }
  if (link != null) {
    // A hardcoded url replaces all search results for the type.
    entry.push({index: 0, link: link, title: type});
    scrape(type + 0, link);
    continue;
  }

  for (var j = 0; j < items.length; j++) {
    var item = items[j];
    // Be optimistic and replace article not found links with links to where the
    // article should be.
    link = item['link'];
    if (link.indexOf(notFoundPrefix) === 0) {
      link = prefix + link.substring(notFoundPrefix.length);
    }

    entry.push({index: j, link: link, title: item['title']});
    scrape(type + j, link);
  }
}

// The scrapes above complete asynchronously, but cacheData is fully populated
// by the time the loop finishes, so the index can be written right away.
fs.writeFileSync('output/crawl/cache.json', JSON.stringify(cacheData, null, ' '), 'utf8');
OLDNEW
« no previous file with comments | « utils/apidoc/mdn/README.txt ('k') | utils/apidoc/mdn/data/dartIdl.json » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698