OLD | NEW |
| 1 // TODO(jacobr): convert this file to Dart once Dart supports all of the |
| 2 // nodejs functionality used here. For example, search for all occurrences of |
| 3 // "http." and "fs." |
1 var http = require('http'); | 4 var http = require('http'); |
2 var fs = require('fs'); | 5 var fs = require('fs'); |
3 | 6 |
4 try { | 7 try { |
5 fs.mkdirSync('output/crawl'); | 8 fs.mkdirSync('output/crawl'); |
6 } catch (e) { | 9 } catch (e) { |
7 // It doesn't matter if the directories already exist. | 10 // It doesn't matter if the directories already exist. |
8 } | 11 } |
9 | 12 |
10 var domTypes = JSON.parse(fs.readFileSync('data/domTypes.json', 'utf8')); | 13 var domTypes = JSON.parse(fs.readFileSync('data/domTypes.json', 'utf8')); |
11 | 14 |
12 var cacheData = {}; | 15 var cacheData = {}; |
13 | 16 |
14 function scrape(filename, link) { | 17 function scrape(filename, link) { |
15 console.log(link); | 18 console.log(link); |
16 var httpsPrefix = "https://"; | 19 var httpsPrefix = "https://"; |
17 var prefix = 'https://developer.mozilla.org/'; | 20 var prefix = 'https://developer.mozilla.org/'; |
18 var notFoundPrefix = 'https://developer.mozilla.org/Article_not_found?uri='; | 21 var notFoundPrefix = 'https://developer.mozilla.org/Article_not_found?uri='; |
19 if (link.indexOf(prefix) != 0 ) { | 22 if (link.indexOf(prefix) != 0 ) { |
20 throw "Unexpected url:" + link; | 23 throw "Unexpected url: " + link; |
21 } | 24 } |
22 var scrapePath = "/search?q=cache:" + link; | 25 var scrapePath = "/search?q=cache:" + link; |
23 // We crawl content from googleusercontent.com so we don't have to worry about | 26 // We crawl content from googleusercontent.com so we don't have to worry about |
24 // crawler politeness like we would have to if scraping developer.mozilla.org | 27 // crawler politeness like we would have to if scraping developer.mozilla.org |
25 // directly. | 28 // directly. |
26 var options = { | 29 var options = { |
27 host: 'webcache.googleusercontent.com', | 30 host: 'webcache.googleusercontent.com', |
28 path: scrapePath, | 31 path: scrapePath, |
29 port: 80, | 32 port: 80, |
30 method: 'GET' | 33 method: 'GET' |
(...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
108 if (link.indexOf(notFoundPrefix) == 0) { | 111 if (link.indexOf(notFoundPrefix) == 0) { |
109 link = prefix + link.substr(notFoundPrefix.length); | 112 link = prefix + link.substr(notFoundPrefix.length); |
110 } | 113 } |
111 | 114 |
112 entry.push({index: j, link: link, title: item['title']}); | 115 entry.push({index: j, link: link, title: item['title']}); |
113 scrape(type + j, link); | 116 scrape(type + j, link); |
114 } | 117 } |
115 } | 118 } |
116 | 119 |
117 fs.writeFileSync('output/crawl/cache.json', JSON.stringify(cacheData, null, ' ')
, 'utf8'); | 120 fs.writeFileSync('output/crawl/cache.json', JSON.stringify(cacheData, null, ' ')
, 'utf8'); |
OLD | NEW |