Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(396)

Side by Side Diff: utils/apidoc/mdn/crawl.js

Issue 9315026: Cleanup mdn scripts (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Code review fixes Created 8 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « utils/apidoc/mdn/README.txt ('k') | utils/apidoc/mdn/extract.dart » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // TODO(jacobr): convert this file to Dart once Dart supports all of the
2 // nodejs functionality used here. For example, search for all occurrences of
3 // "http." and "fs."
1 var http = require('http'); 4 var http = require('http');
2 var fs = require('fs'); 5 var fs = require('fs');
3 6
4 try { 7 try {
5 fs.mkdirSync('output/crawl'); 8 fs.mkdirSync('output/crawl');
6 } catch (e) { 9 } catch (e) {
7 // It doesn't matter if the directories already exist. 10 // It doesn't matter if the directories already exist.
8 } 11 }
9 12
10 var domTypes = JSON.parse(fs.readFileSync('data/domTypes.json', 'utf8')); 13 var domTypes = JSON.parse(fs.readFileSync('data/domTypes.json', 'utf8'));
11 14
12 var cacheData = {}; 15 var cacheData = {};
13 16
14 function scrape(filename, link) { 17 function scrape(filename, link) {
15 console.log(link); 18 console.log(link);
16 var httpsPrefix = "https://"; 19 var httpsPrefix = "https://";
17 var prefix = 'https://developer.mozilla.org/'; 20 var prefix = 'https://developer.mozilla.org/';
18 var notFoundPrefix = 'https://developer.mozilla.org/Article_not_found?uri='; 21 var notFoundPrefix = 'https://developer.mozilla.org/Article_not_found?uri=';
19 if (link.indexOf(prefix) != 0 ) { 22 if (link.indexOf(prefix) != 0 ) {
20 throw "Unexpected url:" + link; 23 throw "Unexpected url: " + link;
21 } 24 }
22 var scrapePath = "/search?q=cache:" + link; 25 var scrapePath = "/search?q=cache:" + link;
23 // We crawl content from googleusercontent.com so we don't have to worry about 26 // We crawl content from googleusercontent.com so we don't have to worry about
24 // crawler politeness like we would have to if scraping developer.mozilla.org 27 // crawler politeness like we would have to if scraping developer.mozilla.org
25 // directly. 28 // directly.
26 var options = { 29 var options = {
27 host: 'webcache.googleusercontent.com', 30 host: 'webcache.googleusercontent.com',
28 path: scrapePath, 31 path: scrapePath,
29 port: 80, 32 port: 80,
30 method: 'GET' 33 method: 'GET'
(...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after
108 if (link.indexOf(notFoundPrefix) == 0) { 111 if (link.indexOf(notFoundPrefix) == 0) {
109 link = prefix + link.substr(notFoundPrefix.length); 112 link = prefix + link.substr(notFoundPrefix.length);
110 } 113 }
111 114
112 entry.push({index: j, link: link, title: item['title']}); 115 entry.push({index: j, link: link, title: item['title']});
113 scrape(type + j, link); 116 scrape(type + j, link);
114 } 117 }
115 } 118 }
116 119
117 fs.writeFileSync('output/crawl/cache.json', JSON.stringify(cacheData, null, ' ') , 'utf8'); 120 fs.writeFileSync('output/crawl/cache.json', JSON.stringify(cacheData, null, ' ') , 'utf8');
OLDNEW
« no previous file with comments | « utils/apidoc/mdn/README.txt ('k') | utils/apidoc/mdn/extract.dart » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698