utils/apidoc/mdn/crawl.js - Issue 9315026: Cleanup mdn scripts - Code Review

Chromium Code Reviews

chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out

(396)

My Issues | Starred Open | Closed | All

Side by Side Diff: utils/apidoc/mdn/crawl.js

Issue 9315026: Cleanup mdn scripts (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Code review fixes Created 8 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« no previous file with comments | « utils/apidoc/mdn/README.txt ('k') | utils/apidoc/mdn/extract.dart » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
	1 // TODO(jacobr): convert this file to Dart once Dart supports all of the

	2 // nodejs functionality used here. For example, search for all occurrences of

	3 // "http." and "fs."

1 var http = require('http');	4 var http = require('http');

2 var fs = require('fs');	5 var fs = require('fs');

3	6

4 try {	7 try {

5 fs.mkdirSync('output/crawl');	8 fs.mkdirSync('output/crawl');

6 } catch (e) {	9 } catch (e) {

7 // It doesn't matter if the directories already exist.	10 // It doesn't matter if the directories already exist.

8 }	11 }

9	12

10 var domTypes = JSON.parse(fs.readFileSync('data/domTypes.json', 'utf8'));	13 var domTypes = JSON.parse(fs.readFileSync('data/domTypes.json', 'utf8'));

11	14

12 var cacheData = {};	15 var cacheData = {};

13	16

14 function scrape(filename, link) {	17 function scrape(filename, link) {

15 console.log(link);	18 console.log(link);

16 var httpsPrefix = "https://";	19 var httpsPrefix = "https://";

17 var prefix = 'https://developer.mozilla.org/';	20 var prefix = 'https://developer.mozilla.org/';

18 var notFoundPrefix = 'https://developer.mozilla.org/Article_not_found?uri=';	21 var notFoundPrefix = 'https://developer.mozilla.org/Article_not_found?uri=';

19 if (link.indexOf(prefix) != 0 ) {	22 if (link.indexOf(prefix) != 0 ) {

20 throw "Unexpected url:" + link;	23 throw "Unexpected url: " + link;

21 }	24 }

22 var scrapePath = "/search?q=cache:" + link;	25 var scrapePath = "/search?q=cache:" + link;

23 // We crawl content from googleusercontent.com so we don't have to worry about	26 // We crawl content from googleusercontent.com so we don't have to worry about

24 // crawler politeness like we would have to if scraping developer.mozilla.org	27 // crawler politeness like we would have to if scraping developer.mozilla.org

25 // directly.	28 // directly.

26 var options = {	29 var options = {

27 host: 'webcache.googleusercontent.com',	30 host: 'webcache.googleusercontent.com',

28 path: scrapePath,	31 path: scrapePath,

29 port: 80,	32 port: 80,

30 method: 'GET'	33 method: 'GET'

(...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
108 if (link.indexOf(notFoundPrefix) == 0) {	111 if (link.indexOf(notFoundPrefix) == 0) {

109 link = prefix + link.substr(notFoundPrefix.length);	112 link = prefix + link.substr(notFoundPrefix.length);

110 }	113 }

111	114

112 entry.push({index: j, link: link, title: item['title']});	115 entry.push({index: j, link: link, title: item['title']});

113 scrape(type + j, link);	116 scrape(type + j, link);

114 }	117 }

115 }	118 }

116	119

117 fs.writeFileSync('output/crawl/cache.json', JSON.stringify(cacheData, null, ' ') , 'utf8');	120 fs.writeFileSync('output/crawl/cache.json', JSON.stringify(cacheData, null, ' ') , 'utf8');

OLD	NEW

« no previous file with comments | « utils/apidoc/mdn/README.txt ('k') | utils/apidoc/mdn/extract.dart » ('j') | no next file with comments »

Powered by Google App Engine

This is Rietveld 408576698