OLD | NEW |
---|---|
(Empty) | |
1 var fs = require('fs'); | |
nweiz
2012/02/01 00:10:39
It's not clear why this is in JS either. The reaso
Jacob
2012/02/01 07:48:26
Same reason as other js script. This should be re
| |
2 var util = require('util'); | |
3 var exec = require('child_process').exec; | |
4 var path = require('path'); | |
5 | |
6 var db = {}; | |
7 var metadata = {}; | |
8 var USE_VM = false; | |
9 | |
10 // Warning: START_DART_MESSAGE must match the value hardcoded in extract.dart | |
11 // TODO(jacobr): figure out a cleaner way to parse this data. | |
12 var START_DART_MESSAGE = "START_DART_MESSAGE_UNIQUE_IDENTIFIER"; | |
13 var END_DART_MESSAGE = "END_DART_MESSAGE_UNIQUE_IDENTIFIER"; | |
14 | |
15 var domTypes = JSON.parse(fs.readFileSync('data/domTypes.json', 'utf8').toString()); | |
16 var cacheData = JSON.parse(fs.readFileSync('output/crawl/cache.json', 'utf8').toString()); | |
17 var dartIdl = JSON.parse(fs.readFileSync('data/dartIdl.json', 'utf8').toString()); | |
nweiz
2012/02/01 00:10:39
Line lengths.
Jacob
2012/02/01 07:48:26
Done.
| |
18 | |
19 try { | |
20 fs.mkdirSync('output/extract'); | |
21 } catch (e) { | |
22 // It doesn't matter if the directories already exist. | |
23 } | |
24 | |
25 var errorFiles = []; | |
26 // TODO(jacobr): blacklist these types as we can't get good docs for them. | |
27 // ["Performance"] | |
28 | |
29 function parseFile(type, onDone, entry, file, searchResultIndex) { | |
nweiz
2012/02/01 00:10:39
Why is this a separate function? It's only called
Jacob
2012/02/01 07:48:26
Seems like a reasonable function name to me.
nweiz
2012/02/01 21:23:02
The name is reasonable, but "// parse the HTML fil
Jacob
2012/02/02 05:26:38
here's a different take on why this is a good func
nweiz
2012/02/02 19:54:34
This function also includes reading the input file
| |
30 var inputFile; | |
31 try { | |
32 inputFile = fs.readFileSync("output/crawl/" + file, 'utf8').toString(); | |
33 } catch (e) { | |
34 console.warn("Couldn't read: " + file); | |
35 onDone(); | |
36 return; | |
37 } | |
38 | |
39 var inputFileRaw = inputFile; | |
40 // Cached pages have multiple DOCTYPE tags. Strip off the first one so that | |
41 // we have valid HTML. | |
42 if (inputFile.indexOf("<!DOCTYPE") == 0) { | |
43 inputFile = inputFile.substr(1); | |
nweiz
2012/02/01 00:10:39
It would be much clearer to just do "var matchInde
Jacob
2012/02/01 07:48:26
Can't do that now that a toLowerCase is also neede
| |
44 var matchIndex = inputFile.indexOf("<!DOCTYPE"); | |
45 if (matchIndex == -1) { | |
46 // not a cached page. | |
47 inputFile = inputFileRaw; | |
48 } else { | |
49 inputFile = inputFile.substr(matchIndex); | |
50 } | |
51 } | |
52 | |
53 // Disable all existing javascript in the input file to speedup parsing and | |
nweiz
2012/02/01 00:10:39
Grammar nit: "speed up"
Jacob
2012/02/01 07:48:26
speedup seems like it is valid.
http://en.wikipedi
nweiz
2012/02/01 21:23:02
"Speedup" is a noun; "speed up" is the verb form.
Jacob
2012/02/02 05:26:38
Done.
| |
54 // avoid conflicts between our JS and the JS in the file. | |
55 inputFile = inputFile.replace(/<script type="text\/javascript"/g, | |
56 '<script type="text/ignored"'); | |
57 | |
58 var endBodyIndex = inputFile.lastIndexOf("</body>"); | |
59 if (endBodyIndex == -1) { | |
60 // Some files are missing a closing body tag. | |
61 endBodyIndex = inputFile.lastIndexOf("</html>"); | |
62 } | |
63 if (endBodyIndex == -1) { | |
64 if (inputFile.indexOf("Error 404 (Not Found)") != -1) { | |
65 console.warn("Skipping 404 file"); | |
nweiz
2012/02/01 00:10:39
List the filename.
Jacob
2012/02/01 07:48:26
Done.
| |
66 onDone(); | |
67 return; | |
68 } | |
69 throw "Unexpected file format for " + file; | |
nweiz
2012/02/01 00:10:39
Why are we throwing here instead of warning?
Jacob
2012/02/01 07:48:26
because that indicates a serious bug
| |
70 } | |
71 | |
72 // Remove all easy to remove script tags to speed page load. | |
nweiz
2012/02/01 00:10:39
That's not what this code is doing.
Jacob
2012/02/01 07:48:26
yep. i used to and then stopped. removed comment.
| |
73 inputFile = inputFile.substring(0, endBodyIndex) + | |
74 '<script type="text/javascript">\n' + | |
75 ' if (window.layoutTestController) {\n' + | |
nweiz
2012/02/01 00:10:39
Why are we feature-detecting here? Are we planning
Jacob
2012/02/01 07:48:26
added comment explaining this.
// We feature
| |
76 ' var controller = window.layoutTestController;\n' + | |
77 ' controller.dumpAsText();\n' + | |
78 ' controller.waitUntilDone();\n' + | |
79 ' }\n' + | |
80 'window.addEventListener("message", receiveMessage, false);\n' + | |
81 'function receiveMessage(event) {\n' + | |
82 ' if (event.data.indexOf("' + START_DART_MESSAGE + '") != 0) return;\n' + | |
83 ' console.log(event.data + "' + END_DART_MESSAGE + '");\n' + | |
84 ' if (window.layoutTestController) {\n' + | |
85 ' document.documentElement.textContent = "";\n' + | |
86 ' window.layoutTestController.notifyDone();\n' + | |
87 ' }\n' + | |
88 '}\n' + | |
89 '</script>\n' + | |
90 (USE_VM ? | |
91 '<script type="application/dart" src="../../extract.dart"></script>' : | |
92 '<script type="text/javascript" src="../../output/extract.dart.js"></script>') + | |
nweiz
2012/02/01 00:10:39
Line length
Jacob
2012/02/01 07:48:26
Done.
| |
93 '\n' + inputFile.substring(endBodyIndex); | |
94 | |
95 console.log("Processing: " + file); | |
96 var dumpFileName = "output/extract/" + file; | |
nweiz
2012/02/01 00:10:39
Style nit: unnecessary variable.
Jacob
2012/02/01 07:48:26
Done.
| |
97 var absoluteDumpFileName = path.resolve(dumpFileName); | |
98 fs.writeFileSync(absoluteDumpFileName, inputFile, 'utf8'); | |
99 var parseArgs = { | |
100 type: type, | |
101 searchResult: entry, | |
102 dartIdl: dartIdl[type] | |
103 }; | |
104 fs.writeFileSync(absoluteDumpFileName + ".json", JSON.stringify(parseArgs), | |
105 'utf8'); | |
106 | |
107 var cmd = '../../../client/tests/drt/DumpRenderTree.app/Contents/MacOS/' + | |
108 'DumpRenderTree ' + absoluteDumpFileName; | |
nweiz
2012/02/01 00:10:39
TODO: Make this run on platforms other than OS X.
Jacob
2012/02/01 07:48:26
Done.
| |
109 console.log(cmd); | |
110 var child = exec(cmd, | |
nweiz
2012/02/01 00:10:39
Unused variable.
Jacob
2012/02/01 07:48:26
Done.
| |
111 function (error, stdout, stderr) { | |
112 var msgIndex = stdout.indexOf(START_DART_MESSAGE); | |
113 var msg = stdout.substring(msgIndex + START_DART_MESSAGE.length); | |
114 var msg = msg.substring(0, msg.indexOf(END_DART_MESSAGE)); | |
nweiz
2012/02/01 00:10:39
Shouldn't have "var". Actually, this stuff should
Jacob
2012/02/01 07:48:26
Done.
| |
115 console.log('all: ' + stdout); | |
116 console.log('stderr: ' + stderr); | |
117 if (error !== null) { | |
118 console.log('exec error: ' + error); | |
119 } | |
120 | |
121 if (!(type in db)) { | |
122 db[type] = []; | |
123 } | |
124 try { | |
125 db[type][searchResultIndex] = JSON.parse(msg); | |
126 } catch(e) { | |
127 console.warn("error parsing result for " + type + " file= "+ file); | |
128 errorFiles.push(file); | |
129 fs.writeFileSync("output/errors.json", | |
130 JSON.stringify(errorFiles, null, ' '), 'utf8'); | |
nweiz
2012/02/01 00:10:39
Why is this written again for every error?
Jacob
2012/02/01 07:48:26
So that if you press control-c you always have an
nweiz
2012/02/01 21:23:02
I see. Useful inline comment?
Jacob
2012/02/02 05:26:38
Done.
| |
131 } | |
132 onDone(); | |
133 }); | |
134 } | |
135 var tasks = []; | |
136 | |
137 var numProcesses = 8; | |
138 // Have numProcesses extraction tasks running simultaneously to improve | |
139 // performance. If your machine is slow, you may need to dial back the | |
140 // parallelism. | |
nweiz
2012/02/01 00:10:39
This comment should probably be attached to numPro
Jacob
2012/02/01 07:48:26
Done. Also moved to the top of the file.
| |
141 var numPending = numProcesses; | |
142 | |
143 function processNextTask() { | |
nweiz
2012/02/01 00:10:39
If you're trying to do stuff in parallel, this wou
Jacob
2012/02/01 07:48:26
I strongly disagree because the bottleneck is in t
| |
144 numPending--; | |
145 if (tasks.length > 0) { | |
146 numPending++; | |
147 var task = tasks.pop(); | |
148 task(); | |
149 } else { | |
150 if (numPending <= 0) { | |
151 console.log("Successfully completed all tasks"); | |
152 fs.writeFileSync("output/database.json", | |
153 JSON.stringify(db, null, ' '), 'utf8'); | |
154 } | |
155 } | |
156 } | |
157 | |
158 function createTask(type, entry, index) { | |
nweiz
2012/02/01 00:10:39
This also doesn't seem worth a function.
Jacob
2012/02/01 07:48:26
I disagree
nweiz
2012/02/01 21:23:02
Why? I don't think it adds any clarity over just p
Jacob
2012/02/02 05:26:38
Keep in mind this is JavaScript not Dart so you'd
nweiz
2012/02/02 19:54:34
Good point, I had forgotten about Javascript's cra
| |
159 return function () { | |
160 var file = type + index + '.html'; | |
161 parseFile(type, processNextTask, entry, file, index); | |
162 }; | |
163 } | |
164 | |
165 for (var i = 0; i < domTypes.length; i++) { | |
166 var type = domTypes[i]; | |
167 var entries = cacheData[type]; | |
168 if (entries != null) { | |
nweiz
2012/02/01 00:10:39
Style nit: if (!entries)
Jacob
2012/02/01 07:48:26
I disagree. I prefer != null as it more clearly sp
nweiz
2012/02/01 21:23:02
I'm not sure I like the idea of writing Javascript
Jacob
2012/02/02 05:26:38
writing JavaScript as if it were dart is the right
| |
169 for (var j = 0; j < entries.length; j++) { | |
170 tasks.push(createTask(type, entries[j], j)); | |
171 } | |
172 } else { | |
173 console.warn("No crawled files for " + type); | |
174 } | |
175 } | |
176 | |
177 for (var p = 0; p < numProcesses; p++) { | |
178 processNextTask(); | |
179 } | |
OLD | NEW |