OLD | NEW |
---|---|
(Empty) | |
1 var http = require('http'); | |
nweiz
2012/02/01 00:10:39
Why is this file in JS?
Jacob
2012/02/01 07:48:26
Because the dart server libraries currently lack t
nweiz
2012/02/01 21:23:02
It would be good to document exactly what's keepin
Jacob
2012/02/02 05:26:38
that seems low value. Just look at each call to a
nweiz
2012/02/02 19:54:34
It was non-obvious to me. It's important that some
Jacob
2012/02/02 22:03:14
The trouble is if I made a list of which methods s
| |
2 var fs = require('fs'); | |
3 | |
// Make sure the directory that crawl results are written into exists.
try {
  fs.mkdirSync('output/crawl');
} catch (e) {
  // It doesn't matter if the directory already exists; any other failure
  // (missing parent, permissions, ...) would break every later write, so
  // surface it right away instead of hiding it.
  if (e.code !== 'EEXIST') {
    throw e;
  }
}

// Array of DOM type names; each one is crawled by the loop further down.
var domTypes = JSON.parse(fs.readFileSync('data/domTypes.json', 'utf8'));

// Maps each type name to its list of {index, link, title} crawl entries.
// Serialized to output/crawl/cache.json once every type has been processed.
var cacheData = {};
13 | |
/**
 * Fetches Google's cached copy of an MDN page and saves the response body to
 * output/crawl/<filename>.html.
 *
 * The request is issued asynchronously; this function returns as soon as the
 * request has been sent, and the file is written once the response finishes.
 *
 * @param {string} filename Base name (without extension) of the output file.
 * @param {string} link Url of the page to fetch. Must start with
 *     'https://developer.mozilla.org/'.
 * @throws {Error} If link does not start with the expected prefix. A request
 *     error is also thrown as an Error, but from the request's 'error' event
 *     handler rather than from this call.
 */
function scrape(filename, link) {
  console.log(link);
  var prefix = 'https://developer.mozilla.org/';
  if (link.indexOf(prefix) != 0) {
    throw new Error("Unexpected url: " + link);
  }
  // We crawl content from googleusercontent.com so we don't have to worry about
  // crawler politeness like we would have to if scraping developer.mozilla.org
  // directly.
  var options = {
    host: 'webcache.googleusercontent.com',
    path: "/search?q=cache:" + link,
    port: 80,
    method: 'GET'
  };

  var req = http.request(options, function(res) {
    res.setEncoding('utf8');
    var data = '';
    var written = false;

    res.on('data', function(d) {
      data += d;
    });

    // The handler is attached to both 'end' and 'close' so the result is saved
    // however the response finishes. Both events can fire for one response, so
    // guard against writing (and logging) the same file twice.
    var onClose = function() {
      if (written) {
        return;
      }
      written = true;
      console.log("Writing crawl result for " + link);
      fs.writeFileSync("output/crawl/" + filename + ".html", data, 'utf8');
    };
    res.on('close', onClose);
    res.on('end', onClose);
  });

  // Attach the error handler before sending so no failure can be missed.
  req.on('error', function(e) {
    throw new Error("Error " + e + " scraping " + link);
  });
  req.end();
}
53 | |
// Hardcode the correct matching url for a few types where the search engine
// gets the wrong answer.
var hardcodedLinks = {
  'Screen': 'https://developer.mozilla.org/en/DOM/window.screen',
  'Text': 'https://developer.mozilla.org/en/DOM/Text',
  'Touch': 'https://developer.mozilla.org/en/DOM/Touch',
  'TouchEvent': 'https://developer.mozilla.org/en/DOM/TouchEvent',
  'webkitTouchEvent': 'https://developer.mozilla.org/en/DOM/TouchEvent',
  'WebkitTouchEvent': 'https://developer.mozilla.org/en/DOM/TouchEvent',
  'WebKitTouchEvent': 'https://developer.mozilla.org/en/DOM/TouchEvent',
  'HTMLSpanElement': 'https://developer.mozilla.org/en/HTML/Element/span',
  'HTMLPreElement': 'https://developer.mozilla.org/en/HTML/Element/pre',
  'HTMLFrameElement': 'https://developer.mozilla.org/en/HTML/Element/frame',
  'HTMLFrameSetElement':
      'https://developer.mozilla.org/en/HTML/Element/frameset',
  'Geolocation': 'https://developer.mozilla.org/en/nsIDOMGeolocation',
  'Notification': 'https://developer.mozilla.org/en/DOM/notification',
  'IDBDatabase': 'https://developer.mozilla.org/en/IndexedDB/IDBDatabase'
};

var prefix = 'https://developer.mozilla.org/';
var notFoundPrefix = 'https://developer.mozilla.org/Article_not_found?uri=';

// For every DOM type, record the candidate documentation links in cacheData
// and start a scrape of each one.
for (var i = 0; i < domTypes.length; i++) {
  var type = domTypes[i];

  // Json containing the search results for the current type.
  var data = fs.readFileSync("output/search/" + type + ".json", 'utf8');
  var json = JSON.parse(data);
  if (!('items' in json)) {
    console.warn("No search results for " + type);
    continue;
  }
  var items = json['items'];

  var entry = [];
  cacheData[type] = entry;

  // hasOwnProperty keeps type names such as 'constructor' from matching
  // properties inherited from Object.prototype.
  if (Object.prototype.hasOwnProperty.call(hardcodedLinks, type)) {
    var hardcodedLink = hardcodedLinks[type];
    entry.push({index: 0, link: hardcodedLink, title: type});
    scrape(type + 0, hardcodedLink);
    continue;
  }

  for (var j = 0; j < items.length; j++) {
    var item = items[j];
    // Be optimistic and replace article not found links with links to where the
    // article should be.
    var link = item['link'];
    if (link.indexOf(notFoundPrefix) == 0) {
      link = prefix + link.substr(notFoundPrefix.length);
    }

    entry.push({index: j, link: link, title: item['title']});
    scrape(type + j, link);
  }
}

// Save the index of everything that was requested. The scrapes themselves
// complete asynchronously after this file has been written.
fs.writeFileSync('output/crawl/cache.json', JSON.stringify(cacheData, null, ' '), 'utf8');
OLD | NEW |