OLD | NEW |
(Empty) | |
| 1 var http = require('http'); |
| 2 var fs = require('fs'); |
| 3 |
// Make sure the directory that receives the crawl output exists.
try {
  fs.mkdirSync('output/crawl');
} catch (e) {
  // It doesn't matter if the directory already exists. Any other failure
  // (e.g. a missing parent directory or a permission problem) is a real error
  // and must not be hidden, otherwise it only resurfaces later as a confusing
  // write failure.
  if (e.code !== 'EEXIST') {
    throw e;
  }
}

// Parsed contents of data/domTypes.json. The rest of this script iterates it
// by index and uses each element as a type name when building file names.
var domTypes = JSON.parse(fs.readFileSync('data/domTypes.json', 'utf8'));

// Maps each type name to the list of {index, link, title} records that were
// queued for crawling; serialized to output/crawl/cache.json at the end.
var cacheData = {};
| 13 |
/**
 * Fetches Google's cached copy of an MDN page and saves it as
 * output/crawl/<filename>.html.
 *
 * The request is asynchronous: this function returns once the request has been
 * sent, and the file is written when the response has finished. A request
 * error is rethrown from the 'error' event handler, so it cannot be caught by
 * the caller and terminates the script.
 *
 * @param {string} filename Base name (without extension) of the output file.
 * @param {string} link Absolute url of the page to fetch. Must start with
 *     https://developer.mozilla.org/.
 * @throws {Error} If link does not start with the expected MDN prefix.
 */
function scrape(filename, link) {
  console.log(link);
  var prefix = 'https://developer.mozilla.org/';
  if (link.indexOf(prefix) !== 0) {
    throw new Error("Unexpected url:" + link);
  }
  // We crawl content from googleusercontent.com so we don't have to worry about
  // crawler politeness like we would have to if scraping developer.mozilla.org
  // directly.
  var options = {
    host: 'webcache.googleusercontent.com',
    path: "/search?q=cache:" + link,
    port: 80,
    method: 'GET'
  };

  var req = http.request(options, function(res) {
    res.setEncoding('utf8');
    var data = '';
    var written = false;

    res.on('data', function(d) {
      data += d;
    });

    // Both 'end' and 'close' may fire for the same response. Listen to both so
    // a connection that closes early still produces a file, but only write the
    // result once.
    var onClose = function() {
      if (written) {
        return;
      }
      written = true;
      console.log("Writing crawl result for " + link);
      fs.writeFileSync("output/crawl/" + filename + ".html", data, 'utf8');
    };
    res.on('close', onClose);
    res.on('end', onClose);
  });

  // Register the error handler before the request is flushed.
  req.on('error', function(e) {
    throw new Error("Error " + e + " scraping " + link);
  });
  req.end();
}
| 53 |
// Hardcoded MDN urls for the few types where the search engine gets the wrong
// answer. A type listed here is crawled from this url only; its search results
// are ignored.
var linkOverrides = {
  'Screen': 'https://developer.mozilla.org/en/DOM/window.screen',
  'Text': 'https://developer.mozilla.org/en/DOM/Text',
  'Touch': 'https://developer.mozilla.org/en/DOM/Touch',
  'TouchEvent': 'https://developer.mozilla.org/en/DOM/TouchEvent',
  'webkitTouchEvent': 'https://developer.mozilla.org/en/DOM/TouchEvent',
  'WebkitTouchEvent': 'https://developer.mozilla.org/en/DOM/TouchEvent',
  'WebKitTouchEvent': 'https://developer.mozilla.org/en/DOM/TouchEvent',
  'HTMLSpanElement': 'https://developer.mozilla.org/en/HTML/Element/span',
  'HTMLPreElement': 'https://developer.mozilla.org/en/HTML/Element/pre',
  'HTMLFrameElement': 'https://developer.mozilla.org/en/HTML/Element/frame',
  'HTMLFrameSetElement': 'https://developer.mozilla.org/en/HTML/Element/frameset',
  'Geolocation': 'https://developer.mozilla.org/en/nsIDOMGeolocation',
  'Notification': 'https://developer.mozilla.org/en/DOM/notification',
  'IDBDatabase': 'https://developer.mozilla.org/en/IndexedDB/IDBDatabase'
};

var prefix = 'https://developer.mozilla.org/';
var notFoundPrefix = 'https://developer.mozilla.org/Article_not_found?uri=';

for (var i = 0; i < domTypes.length; i++) {
  var type = domTypes[i];

  // Json containing the search results for the current type.
  var data = fs.readFileSync("output/search/" + type + ".json", 'utf8');
  var json = JSON.parse(data);
  if (!('items' in json)) {
    console.warn("No search results for " + type);
    continue;
  }
  var items = json['items'];

  // Every link queued for this type is recorded here so that cache.json
  // describes which output/crawl/<type><index>.html file came from which url.
  var entry = [];
  cacheData[type] = entry;

  // Use an own-property check so type names that collide with
  // Object.prototype members (e.g. "constructor") are not treated as overrides.
  var link = Object.prototype.hasOwnProperty.call(linkOverrides, type) ?
      linkOverrides[type] : null;
  if (link !== null) {
    entry.push({index: 0, link: link, title: type});
    scrape(type + 0, link);
    continue;
  }

  for (var j = 0; j < items.length; j++) {
    var item = items[j];
    // Be optimistic and replace article not found links with links to where the
    // article should be.
    link = item['link'];
    if (link.indexOf(notFoundPrefix) === 0) {
      link = prefix + link.substr(notFoundPrefix.length);
    }

    entry.push({index: j, link: link, title: item['title']});
    scrape(type + j, link);
  }
}

// cacheData is filled synchronously by the loop above, so the index can be
// written right away even though the scrape requests are still in flight.
fs.writeFileSync('output/crawl/cache.json', JSON.stringify(cacheData, null, ' '), 'utf8');
OLD | NEW |