OLD | NEW |
1 var http = require('http'); | 1 var http = require('http'); |
2 var fs = require('fs'); | 2 var fs = require('fs'); |
3 | 3 |
4 try { | 4 try { |
5 fs.mkdirSync('output/crawl'); | 5 fs.mkdirSync('output/crawl'); |
6 } catch (e) { | 6 } catch (e) { |
7 // It doesn't matter if the directories already exist. | 7 // It doesn't matter if the directories already exist. |
8 } | 8 } |
9 | 9 |
10 var domTypes = JSON.parse(fs.readFileSync('data/domTypes.json', 'utf8')); | 10 var domTypes = JSON.parse(fs.readFileSync('data/domTypes.json', 'utf8')); |
11 | 11 |
12 var cacheData = {}; | 12 var cacheData = {}; |
13 | 13 |
14 function scrape(filename, link) { | 14 function scrape(filename, link) { |
15 console.log(link); | 15 console.log(link); |
16 var httpsPrefix = "https://"; | 16 var httpsPrefix = "https://"; |
17 var prefix = 'https://developer.mozilla.org/'; | 17 var prefix = 'https://developer.mozilla.org/'; |
18 var notFoundPrefix = 'https://developer.mozilla.org/Article_not_found?uri='; | 18 var notFoundPrefix = 'https://developer.mozilla.org/Article_not_found?uri='; |
19 if (link.indexOf(prefix) != 0 ) { | 19 if (link.indexOf(prefix) != 0 ) { |
20 throw "Unexpected url:" + link; | 20 throw "Unexpected url: " + link; |
21 } | 21 } |
22 var scrapePath = "/search?q=cache:" + link; | 22 var scrapePath = "/search?q=cache:" + link; |
23 // We crawl content from googleusercontent.com so we don't have to worry about | 23 // We crawl content from googleusercontent.com so we don't have to worry about |
24 // crawler politeness like we would have to if scraping developer.mozilla.org | 24 // crawler politeness like we would have to if scraping developer.mozilla.org |
25 // directly. | 25 // directly. |
26 var options = { | 26 var options = { |
27 host: 'webcache.googleusercontent.com', | 27 host: 'webcache.googleusercontent.com', |
28 path: scrapePath, | 28 path: scrapePath, |
29 port: 80, | 29 port: 80, |
30 method: 'GET' | 30 method: 'GET' |
(...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
108 if (link.indexOf(notFoundPrefix) == 0) { | 108 if (link.indexOf(notFoundPrefix) == 0) { |
109 link = prefix + link.substr(notFoundPrefix.length); | 109 link = prefix + link.substr(notFoundPrefix.length); |
110 } | 110 } |
111 | 111 |
112 entry.push({index: j, link: link, title: item['title']}); | 112 entry.push({index: j, link: link, title: item['title']}); |
113 scrape(type + j, link); | 113 scrape(type + j, link); |
114 } | 114 } |
115 } | 115 } |
116 | 116 |
117 fs.writeFileSync('output/crawl/cache.json', JSON.stringify(cacheData, null, ' ')
, 'utf8'); | 117 fs.writeFileSync('output/crawl/cache.json', JSON.stringify(cacheData, null, ' ')
, 'utf8'); |
OLD | NEW |