Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(167)

Side by Side Diff: third_party/readability/js/readability.js

Issue 146843010: Add support for multipage distillation. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: rebase address comments. Created 6 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 1
cjhopman 2014/02/03 23:56:53 Since this is modified, it needs to contain a noti
shashi 2014/02/04 01:39:37 Done.
2 var dbg = (typeof console !== 'undefined') ? function(s) { 2 var dbg = (typeof console !== 'undefined') ? function(s) {
3 console.log("Readability: " + s); 3 console.log("Readability: " + s);
4 } : function() {}; 4 } : function() {};
5 5
6 /* 6 /*
7 * Readability. An Arc90 Lab Experiment. 7 * Readability. An Arc90 Lab Experiment.
8 * Website: http://lab.arc90.com/experiments/readability 8 * Website: http://lab.arc90.com/experiments/readability
9 * Source: http://code.google.com/p/arc90labs-readability 9 * Source: http://code.google.com/p/arc90labs-readability
10 * 10 *
11 * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission. 11 * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission.
12 * 12 *
13 * Copyright (c) 2010 Arc90 Inc 13 * Copyright (c) 2010 Arc90 Inc
14 * Readability is licensed under the Apache License, Version 2.0. 14 * Readability is licensed under the Apache License, Version 2.0.
15 **/ 15 **/
16 var readability = { 16 var readability = {
17 readStyle: "style-newspaper", 17 readStyle: "style-newspaper",
18 readSize: "size-medium", 18 readSize: "size-medium",
19 readMargin: "margin-wide", 19 readMargin: "margin-wide",
20 20
21 distilledHTML: '', 21 distilledHTML: '',
22 distilledArticleContent: null, 22 distilledArticleContent: null,
23 nextPageLink: '',
23 24
24 version: '1.7.1', 25 version: '1.7.1',
25 iframeLoads: 0, 26 iframeLoads: 0,
26 convertLinksToFootnotes: false, 27 convertLinksToFootnotes: false,
27 reversePageScroll: false, /* If they hold shift and hit space, scroll up */ 28 reversePageScroll: false, /* If they hold shift and hit space, scroll up */
28 frameHack: false, /** 29 frameHack: false, /**
29 * The frame hack is to workaround a firefox bug where if you 30 * The frame hack is to workaround a firefox bug where if you
30 * pull content out of a frame and stick it into the parent element, the scrollbar won't appear. 31 * pull content out of a frame and stick it into the parent element, the scrollbar won't appear.
31 * So we fake a scrollbar in the wrapping div. 32 * So we fake a scrollbar in the wrapping div.
32 **/ 33 **/
33 biggestFrame: false, 34 biggestFrame: false,
34 flags: 0x1 | 0x2 | 0x4, /* Start with all flags set. */ 35 flags: 0x1 | 0x2 | 0x4, /* Start with all flags set. */
35 36
36 /* constants */ 37 /* constants */
37 FLAG_STRIP_UNLIKELYS: 0x1, 38 FLAG_STRIP_UNLIKELYS: 0x1,
38 FLAG_WEIGHT_CLASSES: 0x2, 39 FLAG_WEIGHT_CLASSES: 0x2,
39 FLAG_CLEAN_CONDITIONALLY: 0x4, 40 FLAG_CLEAN_CONDITIONALLY: 0x4,
40 41
41 maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */ 42 maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */
42 parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */ 43 parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */
43 pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */ 44 pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */
44 45
45 /** 46 /**
46 * All of the regular expressions in use within readability. 47 * All of the regular expressions in use within readability.
47 * Defined up here so we don't instantiate them repeatedly in loops. 48 * Defined up here so we don't instantiate them repeatedly in loops.
48 **/ 49 **/
49 regexps: { 50 regexps: {
50 unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header |menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popu p|tweet|twitter/i, 51 unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header |menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popu p|tweet|twitter/i,
51 okMaybeItsACandidate: /and|article|body|column|main|shadow/i, 52 okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
52 positive: /article|body|content|entry|hentry|main|page|pagi nation|post|text|blog|story/i, 53 positive: /article|body|content|entry|hentry|main|page|pagi nation|post|text|blog|story/i,
53 negative: /combx|comment|com-|contact|foot|footer|footnote| masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopp ing|tags|tool|widget/i, 54 negative: /combx|comment|com-|contact|foot|footer|footnote| masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopp ing|tags|tool|widget/i,
54 extraneous: /print|archive|comment|discuss|e[\-]?mail|share|r eply|all|login|sign|single/i, 55 extraneous: /print|archive|comment|discuss|e[\-]?mail|share|r eply|all|login|sign|single/i,
55 divToPElements: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, 56 divToPElements: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
56 replaceBrs: /(<br[^>]*>[ \n\r\t]*){2,}/gi, 57 replaceBrs: /(<br[^>]*>[ \n\r\t]*){2,}/gi,
57 replaceFonts: /<(\/?)font[^>]*>/gi, 58 replaceFonts: /<(\/?)font[^>]*>/gi,
58 trim: /^\s+|\s+$/g, 59 trim: /^\s+|\s+$/g,
59 normalize: /\s{2,}/g, 60 normalize: /\s{2,}/g,
60 killBreaks: /(<br\s*\/?>(\s|&nbsp;?)*){1,}/g, 61 killBreaks: /(<br\s*\/?>(\s|&nbsp;?)*){1,}/g,
61 videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i, 62 videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i,
62 skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed) \s*$/i, 63 skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed) \s*$/i,
63 nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last. 64 nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last.
64 prevLink: /(prev|earl|old|new|<|«)/i 65 prevLink: /(prev|earl|old|new|<|«)/i
65 }, 66 },
66 67
67 /** 68 /**
68 * Runs readability. 69 * Runs readability.
69 * 70 *
70 * Workflow: 71 * Workflow:
71 * 1. Prep the document by removing script tags, css, etc. 72 * 1. Prep the document by removing script tags, css, etc.
72 * 2. Build readability's DOM tree. 73 * 2. Build readability's DOM tree.
73 * 3. Grab the article content from the current dom tree. 74 * 3. Grab the article content from the current dom tree.
74 * 4. Replace the current DOM tree with the new one. 75 * 4. Replace the current DOM tree with the new one.
75 * 5. Read peacefully. 76 * 5. Read peacefully.
76 * 77 *
77 * @return void 78 * @return void
78 **/ 79 **/
79 init: function() { 80 init: function() {
80 /* Before we do anything, remove all scripts that are not readability. */ 81 /* Before we do anything, remove all scripts that are not readability. */
81 window.onload = window.onunload = function() {}; 82 window.onload = window.onunload = function() {};
82 83
83 readability.removeScripts(document); 84 readability.removeScripts(document);
84 85
85 /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */ 86 /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */
86 readability.parsedPages[window.location.href.replace(/\/$/, '')] = true; 87 readability.parsedPages[window.location.href.replace(/\/$/, '')] = true;
87 88
88 /* Pull out any possible next page link first */ 89 /* Pull out any possible next page link first */
89 var nextPageLink = readability.findNextPageLink(document.body); 90 readability.nextPageLink = readability.findNextPageLink(document.body);
90 91
92 /* Next-page processing is handled from C++, so set nextPageLink to null. */
93 var nextPageLink = null;
94
91 readability.prepDocument(); 95 readability.prepDocument();
92 96
93 /* Build readability's DOM tree */ 97 /* Build readability's DOM tree */
94 var overlay = document.createElement("DIV"); 98 var overlay = document.createElement("DIV");
95 var innerDiv = document.createElement("DIV"); 99 var innerDiv = document.createElement("DIV");
96 var articleTools = readability.getArticleTools(); 100 var articleTools = readability.getArticleTools();
97 var articleTitleText = readability.getArticleTitle(); 101 var articleTitleText = readability.getArticleTitle();
98 var articleContent = readability.grabArticle(); 102 var articleContent = readability.grabArticle();
99 103
100 if(!articleContent) { 104 if(!articleContent) {
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after
145 rootWarning.innerHTML = "<em>Readability</em> was intended for u se on individual articles and not home pages. " + 149 rootWarning.innerHTML = "<em>Readability</em> was intended for u se on individual articles and not home pages. " +
146 "If you'd like to try rendering this page anyway, <a onClick='ja vascript:document.getElementById(\"readability-warning\").style.display=\"none\" ;document.getElementById(\"readability-content\").style.display=\"block\";'>clic k here</a> to continue."; 150 "If you'd like to try rendering this page anyway, <a onClick='ja vascript:document.getElementById(\"readability-warning\").style.display=\"none\" ;document.getElementById(\"readability-content\").style.display=\"block\";'>clic k here</a> to continue.";
147 151
148 innerDiv.insertBefore( rootWarning, articleContent ); 152 innerDiv.insertBefore( rootWarning, articleContent );
149 } 153 }
150 154
151 readability.postProcessContent(articleContent); 155 readability.postProcessContent(articleContent);
152 156
153 window.scrollTo(0, 0); 157 window.scrollTo(0, 0);
154 158
155 // TODO(bengr): Remove this assignment of null to nextPageLink when
156 // the processing of the next page link is safe.
157 nextPageLink = null;
158
159 if (nextPageLink) { 159 if (nextPageLink) {
160 /** 160 /**
161 * Append any additional pages after a small timeout so that people 161 * Append any additional pages after a small timeout so that people
162 * can start reading without having to wait for this to finish proce ssing. 162 * can start reading without having to wait for this to finish proce ssing.
163 **/ 163 **/
164 window.setTimeout(function() { 164 window.setTimeout(function() {
165 readability.appendNextPage(nextPageLink); 165 readability.appendNextPage(nextPageLink);
166 }, 500); 166 }, 500);
167 } 167 }
168 168
169 /** Smooth scrolling **/ 169 /** Smooth scrolling **/
170 document.onkeydown = function(e) { 170 document.onkeydown = function(e) {
171 var code = (window.event) ? event.keyCode : e.keyCode; 171 var code = (window.event) ? event.keyCode : e.keyCode;
172 if (code === 16) { 172 if (code === 16) {
173 readability.reversePageScroll = true; 173 readability.reversePageScroll = true;
174 return; 174 return;
175 } 175 }
176 176
177 if (code === 32) { 177 if (code === 32) {
178 readability.curScrollStep = 0; 178 readability.curScrollStep = 0;
179 var windowHeight = window.innerHeight ? window.innerHeight : (do cument.documentElement.clientHeight ? document.documentElement.clientHeight : do cument.body.clientHeight); 179 var windowHeight = window.innerHeight ? window.innerHeight : (do cument.documentElement.clientHeight ? document.documentElement.clientHeight : do cument.body.clientHeight);
180 180
181 if(readability.reversePageScroll) { 181 if(readability.reversePageScroll) {
182 readability.scrollTo(readability.scrollTop(), readability.sc rollTop() - (windowHeight - 50), 20, 10); 182 readability.scrollTo(readability.scrollTop(), readability.sc rollTop() - (windowHeight - 50), 20, 10);
183 } 183 }
184 else { 184 else {
185 readability.scrollTo(readability.scrollTop(), readability.sc rollTop() + (windowHeight - 50), 20, 10); 185 readability.scrollTo(readability.scrollTop(), readability.sc rollTop() + (windowHeight - 50), 20, 10);
186 } 186 }
187 187
188 return false; 188 return false;
189 } 189 }
190 }; 190 };
191 191
192 document.onkeyup = function(e) { 192 document.onkeyup = function(e) {
193 var code = (window.event) ? event.keyCode : e.keyCode; 193 var code = (window.event) ? event.keyCode : e.keyCode;
194 if (code === 16) { 194 if (code === 16) {
195 readability.reversePageScroll = false; 195 readability.reversePageScroll = false;
196 return; 196 return;
197 } 197 }
198 }; 198 };
199 }, 199 },
200 200
201 /** 201 /**
202 * Run any post-process modifications to article content as necessary. 202 * Run any post-process modifications to article content as necessary.
203 * 203 *
204 * @param Element 204 * @param Element
205 * @return void 205 * @return void
206 **/ 206 **/
207 postProcessContent: function(articleContent) { 207 postProcessContent: function(articleContent) {
208 if(readability.convertLinksToFootnotes && !window.location.href.match(/w ikipedia\.org/g)) { 208 if(readability.convertLinksToFootnotes && !window.location.href.match(/w ikipedia\.org/g)) {
209 readability.addFootnotes(articleContent); 209 readability.addFootnotes(articleContent);
210 } 210 }
211 211
212 readability.fixImageFloats(articleContent); 212 readability.fixImageFloats(articleContent);
213 }, 213 },
214 214
215 /** 215 /**
216 * Some content ends up looking ugly if the image is too large to be floated. 216 * Some content ends up looking ugly if the image is too large to be floated.
217 * If the image is wider than a threshold (currently 55%), no longer float it, 217 * If the image is wider than a threshold (currently 55%), no longer float it,
218 * center it instead. 218 * center it instead.
219 * 219 *
220 * @param Element 220 * @param Element
221 * @return void 221 * @return void
222 **/ 222 **/
223 fixImageFloats: function (articleContent) { 223 fixImageFloats: function (articleContent) {
224 var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0. 55, 224 var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0. 55,
225 images = articleContent.getElementsByTagName('img'); 225 images = articleContent.getElementsByTagName('img');
226 226
227 for(var i=0, il = images.length; i < il; i+=1) { 227 for(var i=0, il = images.length; i < il; i+=1) {
228 var image = images[i]; 228 var image = images[i];
229 229
230 if(image.offsetWidth > imageWidthThreshold) { 230 if(image.offsetWidth > imageWidthThreshold) {
231 image.className += " blockImage"; 231 image.className += " blockImage";
232 } 232 }
233 } 233 }
234 }, 234 },
235 235
236 /** 236 /**
237 * Get the article tools Element that has buttons like reload, print. 237 * Get the article tools Element that has buttons like reload, print.
238 * 238 *
239 * @return void 239 * @return void
240 **/ 240 **/
241 getArticleTools: function () { 241 getArticleTools: function () {
242 var articleTools = document.createElement("DIV"); 242 var articleTools = document.createElement("DIV");
243 243
244 articleTools.id = "readTools"; 244 articleTools.id = "readTools";
245 articleTools.innerHTML = 245 articleTools.innerHTML =
246 "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" + 246 "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" +
247 "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" + 247 "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" +
248 "<a href='#' onclick='readability.emailBox(); return false;' title=' Email page' id='email-page'>Email Page</a>"; 248 "<a href='#' onclick='readability.emailBox(); return false;' title=' Email page' id='email-page'>Email Page</a>";
249 249
250 return articleTools; 250 return articleTools;
251 }, 251 },
252 252
253 /** 253 /**
254 * returns the suggested direction of the string 254 * returns the suggested direction of the string
255 * 255 *
256 * @return "rtl" || "ltr" 256 * @return "rtl" || "ltr"
257 **/ 257 **/
258 getSuggestedDirection: function(text) { 258 getSuggestedDirection: function(text) {
259 function sanitizeText() { 259 function sanitizeText() {
260 return text.replace(/@\w+/, ""); 260 return text.replace(/@\w+/, "");
261 } 261 }
262 262
263 function countMatches(match) { 263 function countMatches(match) {
264 var matches = text.match(new RegExp(match, "g")); 264 var matches = text.match(new RegExp(match, "g"));
265 return matches !== null ? matches.length : 0; 265 return matches !== null ? matches.length : 0;
266 } 266 }
267 267
268 function isRTL() { 268 function isRTL() {
269 var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]"); 269 var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]");
270 var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]"); 270 var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]");
271 271
272 // if more than 20% of chars are Hebrew or Arabic then direction is rtl 272 // if more than 20% of chars are Hebrew or Arabic then direction is rtl
273 return (count_heb + count_arb) * 100 / text.length > 20; 273 return (count_heb + count_arb) * 100 / text.length > 20;
274 } 274 }
275 275
276 text = sanitizeText(text); 276 text = sanitizeText(text);
277 return isRTL() ? "rtl" : "ltr"; 277 return isRTL() ? "rtl" : "ltr";
278 }, 278 },
279 279
280 /** 280 /**
281 * Get the article title as an H1. 281 * Get the article title as an H1.
282 * 282 *
283 * @return void 283 * @return void
284 **/ 284 **/
285 getArticleTitle: function () { 285 getArticleTitle: function () {
286 var curTitle = "", 286 var curTitle = "",
287 origTitle = ""; 287 origTitle = "";
288 288
289 try { 289 try {
290 curTitle = origTitle = document.title; 290 curTitle = origTitle = document.title;
291 if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */ 291 if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */
292 curTitle = origTitle = readability.getInnerText(document.getElem entsByTagName('title')[0]); 292 curTitle = origTitle = readability.getInnerText(document.getElem entsByTagName('title')[0]);
293 } 293 }
294 } 294 }
295 catch(e) {} 295 catch(e) {}
296 296
297 if(curTitle.match(/ [\|\-] /)) 297 if(curTitle.match(/ [\|\-] /))
298 { 298 {
299 curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1'); 299 curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1');
300 300
301 if(curTitle.split(' ').length < 3) { 301 if(curTitle.split(' ').length < 3) {
302 curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1'); 302 curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1');
303 } 303 }
304 } 304 }
305 else if(curTitle.indexOf(': ') !== -1) 305 else if(curTitle.indexOf(': ') !== -1)
306 { 306 {
307 curTitle = origTitle.replace(/.*:(.*)/gi, '$1'); 307 curTitle = origTitle.replace(/.*:(.*)/gi, '$1');
308 308
309 if(curTitle.split(' ').length < 3) { 309 if(curTitle.split(' ').length < 3) {
310 curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1'); 310 curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1');
(...skipping 12 matching lines...) Expand all
323 323
324 if(curTitle.split(' ').length <= 4) { 324 if(curTitle.split(' ').length <= 4) {
325 curTitle = origTitle; 325 curTitle = origTitle;
326 } 326 }
327 return curTitle; 327 return curTitle;
328 }, 328 },
329 329
330 /** 330 /**
331 * Prepare the HTML document for readability to scrape it. 331 * Prepare the HTML document for readability to scrape it.
332 * This includes things like stripping javascript, CSS, and handling terrible markup. 332 * This includes things like stripping javascript, CSS, and handling terrible markup.
333 * 333 *
334 * @return void 334 * @return void
335 **/ 335 **/
336 prepDocument: function () { 336 prepDocument: function () {
337 /** 337 /**
338 * In some cases a body element can't be found (if the HTML is totally hosed for example) 338 * In some cases a body element can't be found (if the HTML is totally hosed for example)
339 * so we create a new body node and append it to the document. 339 * so we create a new body node and append it to the document.
340 */ 340 */
341 if(document.body === null) 341 if(document.body === null)
342 { 342 {
343 var body = document.createElement("body"); 343 var body = document.createElement("body");
344 try { 344 try {
345 document.body = body; 345 document.body = body;
346 } 346 }
347 catch(e) { 347 catch(e) {
348 document.documentElement.appendChild(body); 348 document.documentElement.appendChild(body);
349 dbg(e); 349 dbg(e);
350 } 350 }
351 } 351 }
352 352
353 document.body.id = "readabilityBody"; 353 document.body.id = "readabilityBody";
354 354
355 var frames = document.getElementsByTagName('frame'); 355 var frames = document.getElementsByTagName('frame');
(...skipping 11 matching lines...) Expand all
367 canAccessFrame = true; 367 canAccessFrame = true;
368 } 368 }
369 catch(eFrames) { 369 catch(eFrames) {
370 dbg(eFrames); 370 dbg(eFrames);
371 } 371 }
372 372
373 if(frameSize > biggestFrameSize) { 373 if(frameSize > biggestFrameSize) {
374 biggestFrameSize = frameSize; 374 biggestFrameSize = frameSize;
375 readability.biggestFrame = frames[frameIndex]; 375 readability.biggestFrame = frames[frameIndex];
376 } 376 }
377 377
378 if(canAccessFrame && frameSize > bestFrameSize) 378 if(canAccessFrame && frameSize > bestFrameSize)
379 { 379 {
380 readability.frameHack = true; 380 readability.frameHack = true;
381 381
382 bestFrame = frames[frameIndex]; 382 bestFrame = frames[frameIndex];
383 bestFrameSize = frameSize; 383 bestFrameSize = frameSize;
384 } 384 }
385 } 385 }
386 386
387 if(bestFrame) 387 if(bestFrame)
388 { 388 {
389 var newBody = document.createElement('body'); 389 var newBody = document.createElement('body');
390 readability.moveNodeInnards(bestFrame.contentWindow.document.bod y, newBody); 390 readability.moveNodeInnards(bestFrame.contentWindow.document.bod y, newBody);
391 newBody.style.overflow = 'scroll'; 391 newBody.style.overflow = 'scroll';
392 document.body = newBody; 392 document.body = newBody;
393 393
394 var frameset = document.getElementsByTagName('frameset')[0]; 394 var frameset = document.getElementsByTagName('frameset')[0];
395 if(frameset) { 395 if(frameset) {
396 frameset.parentNode.removeChild(frameset); } 396 frameset.parentNode.removeChild(frameset); }
397 } 397 }
398 } 398 }
399 399
400 /* Remove all stylesheets */ 400 /* Remove all stylesheets */
401 for (var k=0;k < document.styleSheets.length; k+=1) { 401 for (var k=0;k < document.styleSheets.length; k+=1) {
402 if (document.styleSheets[k].href !== null && document.styleSheets[k] .href.lastIndexOf("readability") === -1) { 402 if (document.styleSheets[k].href !== null && document.styleSheets[k] .href.lastIndexOf("readability") === -1) {
403 document.styleSheets[k].disabled = true; 403 document.styleSheets[k].disabled = true;
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after
448 readability.cleanConditionally(articleContent, "table"); 448 readability.cleanConditionally(articleContent, "table");
449 readability.cleanConditionally(articleContent, "ul"); 449 readability.cleanConditionally(articleContent, "ul");
450 readability.cleanConditionally(articleContent, "div"); 450 readability.cleanConditionally(articleContent, "div");
451 451
452 /* Remove extra paragraphs */ 452 /* Remove extra paragraphs */
453 var articleParagraphs = articleContent.getElementsByTagName('p'); 453 var articleParagraphs = articleContent.getElementsByTagName('p');
454 for(var i = articleParagraphs.length-1; i >= 0; i-=1) { 454 for(var i = articleParagraphs.length-1; i >= 0; i-=1) {
455 var imgCount = articleParagraphs[i].getElementsByTagName('img').l ength; 455 var imgCount = articleParagraphs[i].getElementsByTagName('img').l ength;
456 var embedCount = articleParagraphs[i].getElementsByTagName('embed') .length; 456 var embedCount = articleParagraphs[i].getElementsByTagName('embed') .length;
457 var objectCount = articleParagraphs[i].getElementsByTagName('object' ).length; 457 var objectCount = articleParagraphs[i].getElementsByTagName('object' ).length;
458 458
459 if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readab ility.getInnerText(articleParagraphs[i], false) === '') { 459 if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readab ility.getInnerText(articleParagraphs[i], false) === '') {
460 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i] ); 460 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i] );
461 } 461 }
462 } 462 }
463 463
464 try { 464 try {
465 readability.replaceBrsWithPs(articleContent); 465 readability.replaceBrsWithPs(articleContent);
466 } 466 }
467 catch (e) { 467 catch (e) {
468 dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block -elements bug. Ignoring.: " + e); 468 dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block -elements bug. Ignoring.: " + e);
469 } 469 }
470 }, 470 },
471 471
472 /** 472 /**
473 * Initialize a node with the readability object. Also checks the 473 * Initialize a node with the readability object. Also checks the
474 * className/id for special names to add to its score. 474 * className/id for special names to add to its score.
475 * 475 *
476 * @param Element 476 * @param Element
477 * @return void 477 * @return void
478 **/ 478 **/
479 initializeNode: function (node) { 479 initializeNode: function (node) {
480 node.readability = {"contentScore": 0}; 480 node.readability = {"contentScore": 0};
481 481
482 switch(node.tagName) { 482 switch(node.tagName) {
483 case 'DIV': 483 case 'DIV':
484 node.readability.contentScore += 5; 484 node.readability.contentScore += 5;
485 break; 485 break;
486 486
487 case 'PRE': 487 case 'PRE':
488 case 'TD': 488 case 'TD':
489 case 'BLOCKQUOTE': 489 case 'BLOCKQUOTE':
490 node.readability.contentScore += 3; 490 node.readability.contentScore += 3;
491 break; 491 break;
492 492
493 case 'ADDRESS': 493 case 'ADDRESS':
494 case 'OL': 494 case 'OL':
495 case 'UL': 495 case 'UL':
496 case 'DL': 496 case 'DL':
497 case 'DD': 497 case 'DD':
498 case 'DT': 498 case 'DT':
499 case 'LI': 499 case 'LI':
500 case 'FORM': 500 case 'FORM':
501 node.readability.contentScore -= 3; 501 node.readability.contentScore -= 3;
502 break; 502 break;
503 503
504 case 'H1': 504 case 'H1':
505 case 'H2': 505 case 'H2':
506 case 'H3': 506 case 'H3':
507 case 'H4': 507 case 'H4':
508 case 'H5': 508 case 'H5':
509 case 'H6': 509 case 'H6':
510 case 'TH': 510 case 'TH':
511 node.readability.contentScore -= 5; 511 node.readability.contentScore -= 5;
512 break; 512 break;
513 } 513 }
514 514
515 node.readability.contentScore += readability.getClassWeight(node); 515 node.readability.contentScore += readability.getClassWeight(node);
516 }, 516 },
517 517
518 /*** 518 /***
519 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is 519 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
520 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. 520 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
521 * 521 *
522 * @param page a document to run upon. Needs to be a full document, complete with body. 522 * @param page a document to run upon. Needs to be a full document, complete with body.
523 * @return Element 523 * @return Element
524 **/ 524 **/
525 grabArticle: function (pageToClone) { 525 grabArticle: function (pageToClone) {
526 var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_ STRIP_UNLIKELYS), 526 var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_ STRIP_UNLIKELYS),
527 isPaging = (page !== null) ? true: false; 527 isPaging = (page !== null) ? true: false;
528 528
529 var page = null; 529 var page = null;
530 // Never work on the actual page. 530 // Never work on the actual page.
531 if (isPaging) { 531 if (isPaging) {
532 page = document.body.cloneNode(true); 532 page = document.body.cloneNode(true);
533 } else { 533 } else {
534 page = pageToClone.cloneNode(true); 534 page = pageToClone.cloneNode(true);
535 } 535 }
536 536
537 var allElements = page.getElementsByTagName('*'); 537 var allElements = page.getElementsByTagName('*');
538 538
539 /** 539 /**
540 * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs 540 * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
541 * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.) 541 * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
542 * 542 *
543 * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 543 * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
544 * TODO: Shouldn't this be a reverse traversal? 544 * TODO: Shouldn't this be a reverse traversal?
545 **/ 545 **/
546 var node = null; 546 var node = null;
547 var nodesToScore = []; 547 var nodesToScore = [];
548 for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) { 548 for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) {
549 /* Remove unlikely candidates */ 549 /* Remove unlikely candidates */
550 if (stripUnlikelyCandidates) { 550 if (stripUnlikelyCandidates) {
551 var unlikelyMatchString = node.className + node.id; 551 var unlikelyMatchString = node.className + node.id;
552 if ( 552 if (
553 ( 553 (
554 unlikelyMatchString.search(readability.regexps.unlikelyC andidates) !== -1 && 554 unlikelyMatchString.search(readability.regexps.unlikelyC andidates) !== -1 &&
555 unlikelyMatchString.search(readability.regexps.okMaybeIt sACandidate) === -1 && 555 unlikelyMatchString.search(readability.regexps.okMaybeIt sACandidate) === -1 &&
556 node.tagName !== "BODY" 556 node.tagName !== "BODY"
557 ) 557 )
558 ) 558 )
559 { 559 {
560 dbg("Removing unlikely candidate - " + unlikelyMatchString); 560 dbg("Removing unlikely candidate - " + unlikelyMatchString);
561 node.parentNode.removeChild(node); 561 node.parentNode.removeChild(node);
562 nodeIndex-=1; 562 nodeIndex-=1;
563 continue; 563 continue;
564 } 564 }
565 } 565 }
566 566
567 if (node.tagName === "P" || node.tagName === "TD" || node.tagName == = "PRE") { 567 if (node.tagName === "P" || node.tagName === "TD" || node.tagName == = "PRE") {
568 nodesToScore[nodesToScore.length] = node; 568 nodesToScore[nodesToScore.length] = node;
569 } 569 }
570 570
571 /* Turn all divs that don't have children block level elements into p's */ 571 /* Turn all divs that don't have children block level elements into p's */
572 if (node.tagName === "DIV") { 572 if (node.tagName === "DIV") {
573 if (node.innerHTML.search(readability.regexps.divToPElements) == = -1) { 573 if (node.innerHTML.search(readability.regexps.divToPElements) == = -1) {
574 var newNode = document.createElement('p'); 574 var newNode = document.createElement('p');
(...skipping 16 matching lines...) Expand all
591 if(childNode.nodeType === 3) { // Node.TEXT_NODE 591 if(childNode.nodeType === 3) { // Node.TEXT_NODE
592 var p = document.createElement('p'); 592 var p = document.createElement('p');
593 var t = document.createTextNode(childNode.nodeValue) ; 593 var t = document.createTextNode(childNode.nodeValue) ;
594 p.appendChild(t); 594 p.appendChild(t);
595 p.style.display = 'inline'; 595 p.style.display = 'inline';
596 p.className = 'readability-styled'; 596 p.className = 'readability-styled';
597 childNode.parentNode.replaceChild(p, childNode); 597 childNode.parentNode.replaceChild(p, childNode);
598 } 598 }
599 } 599 }
600 } 600 }
601 } 601 }
602 } 602 }
603 603
604 /** 604 /**
605 * Loop through all paragraphs, and assign a score to them based on how content-y they look. 605 * Loop through all paragraphs, and assign a score to them based on how content-y they look.
606 * Then add their score to their parent node. 606 * Then add their score to their parent node.
607 * 607 *
608 * A score is determined by things like number of commas, class names, e tc. Maybe eventually link density. 608 * A score is determined by things like number of commas, class names, e tc. Maybe eventually link density.
609 **/ 609 **/
610 var candidates = []; 610 var candidates = [];
611 for (var pt=0; pt < nodesToScore.length; pt+=1) { 611 for (var pt=0; pt < nodesToScore.length; pt+=1) {
(...skipping 21 matching lines...) Expand all
633 candidates.push(grandParentNode); 633 candidates.push(grandParentNode);
634 } 634 }
635 635
636 var contentScore = 0; 636 var contentScore = 0;
637 637
638 /* Add a point for the paragraph itself as a base. */ 638 /* Add a point for the paragraph itself as a base. */
639 contentScore+=1; 639 contentScore+=1;
640 640
641 /* Add points for any commas within this paragraph */ 641 /* Add points for any commas within this paragraph */
642 contentScore += innerText.split(',').length; 642 contentScore += innerText.split(',').length;
643 643
644 /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ 644 /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
645 contentScore += Math.min(Math.floor(innerText.length / 100), 3); 645 contentScore += Math.min(Math.floor(innerText.length / 100), 3);
646 646
647 /* Add the score to the parent. The grandparent gets half. */ 647 /* Add the score to the parent. The grandparent gets half. */
648 parentNode.readability.contentScore += contentScore; 648 parentNode.readability.contentScore += contentScore;
649 649
650 if(grandParentNode) { 650 if(grandParentNode) {
651 grandParentNode.readability.contentScore += contentScore/2; 651 grandParentNode.readability.contentScore += contentScore/2;
652 } 652 }
653 } 653 }
654 654
655 /** 655 /**
656 * After we've calculated scores, loop through all of the possible candi date nodes we found 656 * After we've calculated scores, loop through all of the possible candi date nodes we found
657 * and find the one with the highest score. 657 * and find the one with the highest score.
658 **/ 658 **/
659 var topCandidate = null; 659 var topCandidate = null;
660 for(var c=0, cl=candidates.length; c < cl; c+=1) 660 for(var c=0, cl=candidates.length; c < cl; c+=1)
661 { 661 {
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after
718 var contentBonus = 0; 718 var contentBonus = 0;
719 /* Give a bonus if sibling nodes and top candidates have the exact same classname */ 719 /* Give a bonus if sibling nodes and top candidates have the exact same classname */
720 if(siblingNode.className === topCandidate.className && topCandidate. className !== "") { 720 if(siblingNode.className === topCandidate.className && topCandidate. className !== "") {
721 contentBonus += topCandidate.readability.contentScore * 0.2; 721 contentBonus += topCandidate.readability.contentScore * 0.2;
722 } 722 }
723 723
724 if(typeof siblingNode.readability !== 'undefined' && (siblingNode.re adability.contentScore+contentBonus) >= siblingScoreThreshold) 724 if(typeof siblingNode.readability !== 'undefined' && (siblingNode.re adability.contentScore+contentBonus) >= siblingScoreThreshold)
725 { 725 {
726 append = true; 726 append = true;
727 } 727 }
728 728
729 if(siblingNode.nodeName === "P") { 729 if(siblingNode.nodeName === "P") {
730 var linkDensity = readability.getLinkDensity(siblingNode); 730 var linkDensity = readability.getLinkDensity(siblingNode);
731 var nodeContent = readability.getInnerText(siblingNode); 731 var nodeContent = readability.getInnerText(siblingNode);
732 var nodeLength = nodeContent.length; 732 var nodeLength = nodeContent.length;
733 733
734 if(nodeLength > 80 && linkDensity < 0.25) 734 if(nodeLength > 80 && linkDensity < 0.25)
735 { 735 {
736 append = true; 736 append = true;
737 } 737 }
738 else if(nodeLength < 80 && linkDensity === 0 && nodeContent.sear ch(/\.( |$)/) !== -1) 738 else if(nodeLength < 80 && linkDensity === 0 && nodeContent.sear ch(/\.( |$)/) !== -1)
739 { 739 {
740 append = true; 740 append = true;
741 } 741 }
742 } 742 }
743 743
744 if(append) { 744 if(append) {
745 dbg("Appending node: " + siblingNode); 745 dbg("Appending node: " + siblingNode);
746 746
747 var nodeToAppend = null; 747 var nodeToAppend = null;
748 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P ") { 748 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P ") {
749 /* We have a node that isn't a common block level element, l ike a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ 749 /* We have a node that isn't a common block level element, l ike a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
750 750
751 dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.'); 751 dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');
752 nodeToAppend = document.createElement("DIV"); 752 nodeToAppend = document.createElement("DIV");
753 try { 753 try {
754 nodeToAppend.id = siblingNode.id; 754 nodeToAppend.id = siblingNode.id;
755 readability.moveNodeInnards(siblingNode, nodeToAppend); 755 readability.moveNodeInnards(siblingNode, nodeToAppend);
756 } 756 }
757 catch(er) { 757 catch(er) {
758 dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original."); 758 dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");
759 nodeToAppend = siblingNode; 759 nodeToAppend = siblingNode;
760 s-=1; 760 s-=1;
761 sl-=1; 761 sl-=1;
762 } 762 }
763 } else { 763 } else {
764 nodeToAppend = siblingNode; 764 nodeToAppend = siblingNode;
765 s-=1; 765 s-=1;
766 sl-=1; 766 sl-=1;
767 } 767 }
768 768
769 /* To ensure a node does not interfere with readability styles, remove its classnames */ 769 /* To ensure a node does not interfere with readability styles, remove its classnames */
770 nodeToAppend.className = ""; 770 nodeToAppend.className = "";
771 771
772 /* Append sibling and subtract from our list because it removes the node when you append to another node */ 772 /* Append sibling and subtract from our list because it removes the node when you append to another node */
773 articleContent.appendChild(nodeToAppend); 773 articleContent.appendChild(nodeToAppend);
774 } 774 }
775 } 775 }
776 776
777 /** 777 /**
778 * So we have all of the content that we need. Now we clean it up for pr esentation. 778 * So we have all of the content that we need. Now we clean it up for pr esentation.
779 **/ 779 **/
780 readability.distilledArticleContent = articleContent.cloneNode(true); 780 readability.distilledArticleContent = articleContent.cloneNode(true);
781 //readability.prepArticle(articleContent); 781 //readability.prepArticle(articleContent);
782 782
783 if (readability.curPageNum === 1) { 783 if (readability.curPageNum === 1) {
784 var newNode = document.createElement('div'); 784 var newNode = document.createElement('div');
785 newNode.id = "readability-page-1"; 785 newNode.id = "readability-page-1";
786 newNode.setAttribute("class", "page"); 786 newNode.setAttribute("class", "page");
787 readability.moveNodeInnards(articleContent, newNode); 787 readability.moveNodeInnards(articleContent, newNode);
788 articleContent.appendChild(newNode); 788 articleContent.appendChild(newNode);
789 } 789 }
790 790
791 /** 791 /**
792 * Now that we've gone through the full algorithm, check to see if we go t any meaningful content. 792 * Now that we've gone through the full algorithm, check to see if we go t any meaningful content.
793 * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher 793 * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
794 * likelihood of finding the content, and the sieve approach gives us a higher likelihood of 794 * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
795 * finding the -right- content. 795 * finding the -right- content.
796 **/ 796 **/
797 if(readability.getInnerText(articleContent, false).length < 250) { 797 if(readability.getInnerText(articleContent, false).length < 250) {
798 if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) { 798 if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) {
799 readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS); 799 readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS);
800 return readability.grabArticle(document.body); 800 return readability.grabArticle(document.body);
801 } 801 }
802 else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { 802 else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
803 readability.removeFlag(readability.FLAG_WEIGHT_CLASSES); 803 readability.removeFlag(readability.FLAG_WEIGHT_CLASSES);
804 return readability.grabArticle(document.body); 804 return readability.grabArticle(document.body);
805 } 805 }
806 else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONAL LY)) { 806 else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONAL LY)) {
807 readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY); 807 readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY);
808 return readability.grabArticle(document.body); 808 return readability.grabArticle(document.body);
809 } else { 809 } else {
810 return null; 810 return null;
811 } 811 }
812 } 812 }
813 813
814 return articleContent; 814 return articleContent;
815 }, 815 },
816 816
817 /** 817 /**
818 * Removes script tags from the document. 818 * Removes script tags from the document.
819 * 819 *
820 * @param Element 820 * @param Element
821 **/ 821 **/
822 removeScripts: function (doc) { 822 removeScripts: function (doc) {
823 var scripts = doc.getElementsByTagName('script'); 823 var scripts = doc.getElementsByTagName('script');
824 for(var i = scripts.length-1; i >= 0; i-=1) 824 for(var i = scripts.length-1; i >= 0; i-=1)
825 { 825 {
826 if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf ('readability') === -1 && scripts[i].src.indexOf('typekit') === -1)) 826 if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf ('readability') === -1 && scripts[i].src.indexOf('typekit') === -1))
827 { 827 {
828 scripts[i].nodeValue=""; 828 scripts[i].nodeValue="";
829 scripts[i].removeAttribute('src'); 829 scripts[i].removeAttribute('src');
830 if (scripts[i].parentNode) { 830 if (scripts[i].parentNode) {
831 scripts[i].parentNode.removeChild(scripts[i]); 831 scripts[i].parentNode.removeChild(scripts[i]);
832 } 832 }
833 } 833 }
834 } 834 }
835 }, 835 },
836 836
837 /** 837 /**
838 * Get the inner text of a node - cross browser compatibly. 838 * Get the inner text of a node - cross browser compatibly.
839 * This also strips out any excess whitespace to be found. 839 * This also strips out any excess whitespace to be found.
840 * 840 *
841 * @param Element 841 * @param Element
842 * @return string 842 * @return string
843 **/ 843 **/
844 getInnerText: function (e, normalizeSpaces) { 844 getInnerText: function (e, normalizeSpaces) {
845 var textContent = ""; 845 var textContent = "";
846 846
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after
889 889
890 // Remove any root styles, if we're able. 890 // Remove any root styles, if we're able.
891 if(typeof e.removeAttribute === 'function' && e.className !== 'readabili ty-styled') { 891 if(typeof e.removeAttribute === 'function' && e.className !== 'readabili ty-styled') {
892 e.removeAttribute('style'); } 892 e.removeAttribute('style'); }
893 893
894 // Go until there are no more child nodes 894 // Go until there are no more child nodes
895 while ( cur !== null ) { 895 while ( cur !== null ) {
896 if ( cur.nodeType === 1 ) { 896 if ( cur.nodeType === 1 ) {
897 // Remove style attribute(s) : 897 // Remove style attribute(s) :
898 if(cur.className !== "readability-styled") { 898 if(cur.className !== "readability-styled") {
899 cur.removeAttribute("style"); 899 cur.removeAttribute("style");
900 } 900 }
901 readability.cleanStyles( cur ); 901 readability.cleanStyles( cur );
902 } 902 }
903 cur = cur.nextSibling; 903 cur = cur.nextSibling;
904 } 904 }
905 }, 905 },
906 906
907 /** 907 /**
908 * Get the density of links as a percentage of the content 908 * Get the density of links as a percentage of the content
909 * This is the amount of text that is inside a link divided by the total tex t in the node. 909 * This is the amount of text that is inside a link divided by the total tex t in the node.
910 * 910 *
911 * @param Element 911 * @param Element
912 * @return number (float) 912 * @return number (float)
913 **/ 913 **/
914 getLinkDensity: function (e) { 914 getLinkDensity: function (e) {
915 var links = e.getElementsByTagName("a"); 915 var links = e.getElementsByTagName("a");
916 var textLength = readability.getInnerText(e).length; 916 var textLength = readability.getInnerText(e).length;
917 var linkLength = 0; 917 var linkLength = 0;
918 for(var i=0, il=links.length; i<il;i+=1) 918 for(var i=0, il=links.length; i<il;i+=1)
919 { 919 {
920 linkLength += readability.getInnerText(links[i]).length; 920 linkLength += readability.getInnerText(links[i]).length;
921 } 921 }
922 922
923 return linkLength / textLength; 923 return linkLength / textLength;
924 }, 924 },
925 925
926 /** 926 /**
927 * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness. 927 * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.
928 * 928 *
929 * @author Dan Lacy 929 * @author Dan Lacy
930 * @return string the base url 930 * @return string the base url
931 **/ 931 **/
932 findBaseUrl: function () { 932 findBaseUrl: function () {
933 var noUrlParams = window.location.pathname.split("?")[0], 933 var noUrlParams = window.location.pathname.split("?")[0],
934 urlSlashes = noUrlParams.split("/").reverse(), 934 urlSlashes = noUrlParams.split("/").reverse(),
935 cleanedSegments = [], 935 cleanedSegments = [],
936 possibleType = ""; 936 possibleType = "";
937 937
938 for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) { 938 for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) {
939 var segment = urlSlashes[i]; 939 var segment = urlSlashes[i];
940 940
941 // Split off and save anything that looks like a file type. 941 // Split off and save anything that looks like a file type.
942 if (segment.indexOf(".") !== -1) { 942 if (segment.indexOf(".") !== -1) {
943 possibleType = segment.split(".")[1]; 943 possibleType = segment.split(".")[1];
944 944
945 /* If the type isn't alpha-only, it's probably not actually a fi le extension. */ 945 /* If the type isn't alpha-only, it's probably not actually a fi le extension. */
946 if(!possibleType.match(/[^a-zA-Z]/)) { 946 if(!possibleType.match(/[^a-zA-Z]/)) {
947 segment = segment.split(".")[0]; 947 segment = segment.split(".")[0];
948 } 948 }
949 } 949 }
950 950
951 /** 951 /**
952 * EW-CMS specific segment replacement. Ugly. 952 * EW-CMS specific segment replacement. Ugly.
953 * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.htm l 953 * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.htm l
954 **/ 954 **/
955 if(segment.indexOf(',00') !== -1) { 955 if(segment.indexOf(',00') !== -1) {
956 segment = segment.replace(',00', ''); 956 segment = segment.replace(',00', '');
957 } 957 }
958 958
959 // If our first or second segment has anything looking like a page n umber, remove it. 959 // If our first or second segment has anything looking like a page n umber, remove it.
960 if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) { 960 if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) {
961 segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, " "); 961 segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, " ");
962 } 962 }
963 963
964 964
965 var del = false; 965 var del = false;
966 966
967 /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */ 967 /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */
968 if (i < 2 && segment.match(/^\d{1,2}$/)) { 968 if (i < 2 && segment.match(/^\d{1,2}$/)) {
969 del = true; 969 del = true;
970 } 970 }
971 971
972 /* If this is the first segment and it's just "index", remove it. */ 972 /* If this is the first segment and it's just "index", remove it. */
973 if(i === 0 && segment.toLowerCase() === "index") { 973 if(i === 0 && segment.toLowerCase() === "index") {
974 del = true; 974 del = true;
975 } 975 }
976 976
977 977
978 /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */ 978 /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */
979 if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) { 979 if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) {
980 del = true; 980 del = true;
981 } 981 }
982 982
983 /* If it's not marked for deletion, push it to cleanedSegments. */ 983 /* If it's not marked for deletion, push it to cleanedSegments. */
984 if (!del) { 984 if (!del) {
985 cleanedSegments.push(segment); 985 cleanedSegments.push(segment);
986 } 986 }
987 } 987 }
988 988
989 // This is our final, cleaned, base article URL. 989 // This is our final, cleaned, base article URL.
990 return window.location.protocol + "//" + window.location.host + cleanedS egments.reverse().join("/"); 990 return window.location.protocol + "//" + window.location.host + cleanedS egments.reverse().join("/");
991 }, 991 },
992 992
993 /** 993 /**
994 * Look for any paging links that may occur within the document. 994 * Look for any paging links that may occur within the document.
995 * 995 *
996 * @param body 996 * @param body
997 * @return object (array) 997 * @return object (array)
998 **/ 998 **/
999 findNextPageLink: function (elem) { 999 findNextPageLink: function (elem) {
1000 var possiblePages = {}, 1000 var possiblePages = {},
1001 allLinks = elem.getElementsByTagName('a'), 1001 allLinks = elem.getElementsByTagName('a'),
1002 articleBaseUrl = readability.findBaseUrl(); 1002 articleBaseUrl = readability.findBaseUrl();
1003 1003
1004 /** 1004 /**
1005 * Loop through all links, looking for hints that they may be next-page links. 1005 * Loop through all links, looking for hints that they may be next-page links.
1006 * Things like having "page" in their textContent, className or id, or b eing a child 1006 * Things like having "page" in their textContent, className or id, or b eing a child
1007 * of a node with a page-y className or id. 1007 * of a node with a page-y className or id.
1008 * 1008 *
1009 * Also possible: levenshtein distance? longest common subsequence? 1009 * Also possible: levenshtein distance? longest common subsequence?
1010 * 1010 *
1011 * After we do that, assign each page a score, and 1011 * After we do that, assign each page a score, and
1012 **/ 1012 **/
1013 for(var i = 0, il = allLinks.length; i < il; i+=1) { 1013 for(var i = 0, il = allLinks.length; i < il; i+=1) {
1014 var link = allLinks[i], 1014 var link = allLinks[i],
1015 linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, ' '); 1015 linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, ' ');
1016 1016
1017 /* If we've already seen this page, ignore it */ 1017 /* If we've already seen this page, ignore it */
1018 if(linkHref === "" || linkHref === articleBaseUrl || linkHref === wi ndow.location.href || linkHref in readability.parsedPages) { 1018 if(linkHref === "" || linkHref === articleBaseUrl || linkHref === wi ndow.location.href || linkHref in readability.parsedPages) {
1019 continue; 1019 continue;
1020 } 1020 }
1021 1021
1022 /* If it's on a different domain, skip it. */ 1022 /* If it's on a different domain, skip it. */
1023 if(window.location.host !== linkHref.split(/\/+/g)[1]) { 1023 if(window.location.host !== linkHref.split(/\/+/g)[1]) {
1024 continue; 1024 continue;
1025 } 1025 }
1026 1026
1027 var linkText = readability.getInnerText(link); 1027 var linkText = readability.getInnerText(link);
1028 1028
1029 /* If the linkText looks like it's not the next page, skip it. */ 1029 /* If the linkText looks like it's not the next page, skip it. */
1030 if(linkText.match(readability.regexps.extraneous) || linkText.length > 25) { 1030 if(linkText.match(readability.regexps.extraneous) || linkText.length > 25) {
1031 continue; 1031 continue;
1032 } 1032 }
1033 1033
1034 /* If the leftovers of the URL after removing the base URL don't con tain any digits, it's certainly not a next page link. */ 1034 /* If the leftovers of the URL after removing the base URL don't con tain any digits, it's certainly not a next page link. */
1035 var linkHrefLeftover = linkHref.replace(articleBaseUrl, ''); 1035 var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');
1036 if(!linkHrefLeftover.match(/\d/)) { 1036 if(!linkHrefLeftover.match(/\d/)) {
1037 continue; 1037 continue;
1038 } 1038 }
1039 1039
1040 if(!(linkHref in possiblePages)) { 1040 if(!(linkHref in possiblePages)) {
1041 possiblePages[linkHref] = {"score": 0, "linkText": linkText, "hr ef": linkHref}; 1041 possiblePages[linkHref] = {"score": 0, "linkText": linkText, "hr ef": linkHref};
1042 } else { 1042 } else {
1043 possiblePages[linkHref].linkText += ' | ' + linkText; 1043 possiblePages[linkHref].linkText += ' | ' + linkText;
1044 } 1044 }
1045 1045
1046 var linkObj = possiblePages[linkHref]; 1046 var linkObj = possiblePages[linkHref];
1047 1047
1048 /** 1048 /**
1049 * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower. 1049 * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower.
1050 * Example: http://www.actionscript.org/resources/articles/745/1/Jav aScript-and-VBScript-Injection-in-ActionScript-3/Page1.html 1050 * Example: http://www.actionscript.org/resources/articles/745/1/Jav aScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
1051 **/ 1051 **/
1052 if(linkHref.indexOf(articleBaseUrl) !== 0) { 1052 if(linkHref.indexOf(articleBaseUrl) !== 0) {
1053 linkObj.score -= 25; 1053 linkObj.score -= 25;
1054 } 1054 }
1055 1055
1056 var linkData = linkText + ' ' + link.className + ' ' + link.id; 1056 var linkData = linkText + ' ' + link.className + ' ' + link.id;
1057 if(linkData.match(readability.regexps.nextLink)) { 1057 if(linkData.match(readability.regexps.nextLink)) {
1058 linkObj.score += 50; 1058 linkObj.score += 50;
1059 } 1059 }
1060 if(linkData.match(/pag(e|ing|inat)/i)) { 1060 if(linkData.match(/pag(e|ing|inat)/i)) {
1061 linkObj.score += 25; 1061 linkObj.score += 25;
1062 } 1062 }
1063 if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text, 1063 if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text,
1064 /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */ 1064 /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */
1065 if(!linkObj.linkText.match(readability.regexps.nextLink)) { 1065 if(!linkObj.linkText.match(readability.regexps.nextLink)) {
1066 linkObj.score -= 65; 1066 linkObj.score -= 65;
1067 } 1067 }
1068 } 1068 }
1069 if(linkData.match(readability.regexps.negative) || linkData.match(re adability.regexps.extraneous)) { 1069 if(linkData.match(readability.regexps.negative) || linkData.match(re adability.regexps.extraneous)) {
1070 linkObj.score -= 50; 1070 linkObj.score -= 50;
1071 } 1071 }
1072 if(linkData.match(readability.regexps.prevLink)) { 1072 if(linkData.match(readability.regexps.prevLink)) {
1073 linkObj.score -= 200; 1073 linkObj.score -= 200;
1074 } 1074 }
1075 1075
1076 /* If a parentNode contains page or paging or paginat */ 1076 /* If a parentNode contains page or paging or paginat */
1077 var parentNode = link.parentNode, 1077 var parentNode = link.parentNode,
1078 positiveNodeMatch = false, 1078 positiveNodeMatch = false,
1079 negativeNodeMatch = false; 1079 negativeNodeMatch = false;
1080 while(parentNode) { 1080 while(parentNode) {
1081 var parentNodeClassAndId = parentNode.className + ' ' + parentNo de.id; 1081 var parentNodeClassAndId = parentNode.className + ' ' + parentNo de.id;
1082 if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(/pag(e|ing|inat)/i)) { 1082 if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(/pag(e|ing|inat)/i)) {
1083 positiveNodeMatch = true; 1083 positiveNodeMatch = true;
1084 linkObj.score += 25; 1084 linkObj.score += 25;
1085 } 1085 }
1086 if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(readability.regexps.negative)) { 1086 if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(readability.regexps.negative)) {
1087 /* If this is just something like "footer", give it a negati ve. If it's something like "body-and-footer", leave it be. */ 1087 /* If this is just something like "footer", give it a negati ve. If it's something like "body-and-footer", leave it be. */
1088 if(!parentNodeClassAndId.match(readability.regexps.positive) ) { 1088 if(!parentNodeClassAndId.match(readability.regexps.positive) ) {
1089 linkObj.score -= 25; 1089 linkObj.score -= 25;
1090 negativeNodeMatch = true; 1090 negativeNodeMatch = true;
1091 } 1091 }
1092 } 1092 }
1093 1093
1094 parentNode = parentNode.parentNode; 1094 parentNode = parentNode.parentNode;
1095 } 1095 }
1096 1096
1097 /** 1097 /**
1098 * If the URL looks like it has paging in it, add to the score. 1098 * If the URL looks like it has paging in it, add to the score.
1099 * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34 1099 * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34
1100 **/ 1100 **/
1101 if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) { 1101 if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) {
1102 linkObj.score += 25; 1102 linkObj.score += 25;
1103 } 1103 }
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
1145 topPage = possiblePages[page]; 1145 topPage = possiblePages[page];
1146 } 1146 }
1147 } 1147 }
1148 } 1148 }
1149 1149
1150 if(topPage) { 1150 if(topPage) {
1151 var nextHref = topPage.href.replace(/\/$/,''); 1151 var nextHref = topPage.href.replace(/\/$/,'');
1152 1152
1153 dbg('NEXT PAGE IS ' + nextHref); 1153 dbg('NEXT PAGE IS ' + nextHref);
1154 readability.parsedPages[nextHref] = true; 1154 readability.parsedPages[nextHref] = true;
1155 return nextHref; 1155 return nextHref;
1156 } 1156 }
1157 else { 1157 else {
1158 return null; 1158 return null;
1159 } 1159 }
1160 }, 1160 },
1161 1161
1162 createLinkDiv: function(link) { 1162 createLinkDiv: function(link) {
1163 var divNode = document.createElement('div'); 1163 var divNode = document.createElement('div');
1164 var aNode = document.createElement('a'); 1164 var aNode = document.createElement('a');
1165 var tNode = document.createTextNode('View Next Page'); 1165 var tNode = document.createTextNode('View Next Page');
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after
1197 } 1197 }
1198 else { 1198 else {
1199 if (options.error) { options.error(request); } 1199 if (options.error) { options.error(request); }
1200 } 1200 }
1201 } 1201 }
1202 } 1202 }
1203 1203
1204 if (typeof options === 'undefined') { options = {}; } 1204 if (typeof options === 'undefined') { options = {}; }
1205 1205
1206 request.onreadystatechange = respondToReadyState; 1206 request.onreadystatechange = respondToReadyState;
1207 1207
1208 request.open('get', url, true); 1208 request.open('get', url, true);
1209 request.setRequestHeader('Accept', 'text/html'); 1209 request.setRequestHeader('Accept', 'text/html');
1210 1210
1211 try { 1211 try {
1212 request.send(options.postBody); 1212 request.send(options.postBody);
1213 } 1213 }
1214 catch (e) { 1214 catch (e) {
1215 if (options.error) { options.error(); } 1215 if (options.error) { options.error(); }
1216 } 1216 }
1217 1217
(...skipping 14 matching lines...) Expand all
1232 articlePage.innerHTML = '<p class="page-separator" title="Page ' + reada bility.curPageNum + '">&sect;</p>'; 1232 articlePage.innerHTML = '<p class="page-separator" title="Page ' + reada bility.curPageNum + '">&sect;</p>';
1233 1233
1234 document.getElementById("readability-content").appendChild(articlePage); 1234 document.getElementById("readability-content").appendChild(articlePage);
1235 1235
1236 if(readability.curPageNum > readability.maxPages) { 1236 if(readability.curPageNum > readability.maxPages) {
1237 var linkDiv = readability.createLinkDiv(nextPageLink); 1237 var linkDiv = readability.createLinkDiv(nextPageLink);
1238 1238
1239 articlePage.appendChild(linkDiv); 1239 articlePage.appendChild(linkDiv);
1240 return; 1240 return;
1241 } 1241 }
1242 1242
1243 /** 1243 /**
1244 * Now that we've built the article page DOM element, get the page conte nt 1244 * Now that we've built the article page DOM element, get the page conte nt
1245 * asynchronously and load the cleaned content into the div we created f or it. 1245 * asynchronously and load the cleaned content into the div we created f or it.
1246 **/ 1246 **/
1247 (function(pageUrl, thisPage) { 1247 (function(pageUrl, thisPage) {
1248 readability.ajax(pageUrl, { 1248 readability.ajax(pageUrl, {
1249 success: function(r) { 1249 success: function(r) {
1250 1250
1251 /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */ 1251 /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */
1252 var eTag = r.getResponseHeader('ETag'); 1252 var eTag = r.getResponseHeader('ETag');
1253 if(eTag) { 1253 if(eTag) {
1254 if(eTag in readability.pageETags) { 1254 if(eTag in readability.pageETags) {
1255 dbg("Exact duplicate page found via ETag. Aborting." ); 1255 dbg("Exact duplicate page found via ETag. Aborting." );
1256 articlePage.style.display = 'none'; 1256 articlePage.style.display = 'none';
1257 return; 1257 return;
1258 } else { 1258 } else {
1259 readability.pageETags[eTag] = 1; 1259 readability.pageETags[eTag] = 1;
1260 } 1260 }
1261 } 1261 }
1262 1262
1263 // TODO: this ends up doubling up page numbers on NYTimes ar ticles. Need to generically parse those away. 1263 // TODO: this ends up doubling up page numbers on NYTimes ar ticles. Need to generically parse those away.
1264 var page = document.createElement("DIV"); 1264 var page = document.createElement("DIV");
1265 1265
1266 /** 1266 /**
1267 * Do some preprocessing to our HTML to make it ready for ap pending. 1267 * Do some preprocessing to our HTML to make it ready for ap pending.
1268 * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript. 1268 * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript.
1269 * • Turn any noscript tags into divs so that we can parse t hem. This allows us to find any next page links hidden via javascript. 1269 * • Turn any noscript tags into divs so that we can parse t hem. This allows us to find any next page links hidden via javascript.
1270 * • Turn all double br's into p's - was handled by prepDocu ment in the original view. 1270 * • Turn all double br's into p's - was handled by prepDocu ment in the original view.
(...skipping 30 matching lines...) Expand all
1301 for(var i=1; i <= readability.curPageNum; i+=1) { 1301 for(var i=1; i <= readability.curPageNum; i+=1) {
1302 var rPage = document.getElementById('readability-pag e-' + i); 1302 var rPage = document.getElementById('readability-pag e-' + i);
1303 if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML ) !== -1) { 1303 if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML ) !== -1) {
1304 dbg('Duplicate of page ' + i + ' - skipping.'); 1304 dbg('Duplicate of page ' + i + ' - skipping.');
1305 articlePage.style.display = 'none'; 1305 articlePage.style.display = 'none';
1306 readability.parsedPages[pageUrl] = true; 1306 readability.parsedPages[pageUrl] = true;
1307 return; 1307 return;
1308 } 1308 }
1309 } 1309 }
1310 } 1310 }
1311 1311
1312 readability.removeScripts(content); 1312 readability.removeScripts(content);
1313 1313
1314 readability.moveNodeInnards(content, thisPage); 1314 readability.moveNodeInnards(content, thisPage);
1315 1315
1316 /** 1316 /**
1317 * After the page has rendered, post process the content. Th is delay is necessary because, 1317 * After the page has rendered, post process the content. Th is delay is necessary because,
1318 * in webkit at least, offsetWidth is not set in time to det ermine image width. We have to 1318 * in webkit at least, offsetWidth is not set in time to det ermine image width. We have to
1319 * wait a little bit for reflow to finish before we can fix floating images. 1319 * wait a little bit for reflow to finish before we can fix floating images.
1320 **/ 1320 **/
1321 window.setTimeout( 1321 window.setTimeout(
1322 function() { readability.postProcessContent(thisPage); } , 1322 function() { readability.postProcessContent(thisPage); } ,
1323 500 1323 500
1324 ); 1324 );
1325 1325
1326 if(nextPageLink) { 1326 if(nextPageLink) {
1327 readability.appendNextPage(nextPageLink); 1327 readability.appendNextPage(nextPageLink);
1328 } 1328 }
1329 } 1329 }
1330 }); 1330 });
1331 }(nextPageLink, articlePage)); 1331 }(nextPageLink, articlePage));
1332 }, 1332 },
1333 1333
1334 /** 1334 /**
1335 * Get an elements class/id weight. Uses regular expressions to tell if this 1335 * Get an elements class/id weight. Uses regular expressions to tell if this
1336 * element looks good or bad. 1336 * element looks good or bad.
1337 * 1337 *
1338 * @param Element 1338 * @param Element
1339 * @return number (Integer) 1339 * @return number (Integer)
1340 **/ 1340 **/
1341 getClassWeight: function (e) { 1341 getClassWeight: function (e) {
1342 if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { 1342 if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
1343 return 0; 1343 return 0;
1344 } 1344 }
1345 1345
(...skipping 29 matching lines...) Expand all
1375 /** 1375 /**
1376 * Remove extraneous break tags from a node. 1376 * Remove extraneous break tags from a node.
1377 * 1377 *
1378 * @param Element 1378 * @param Element
1379 * @return void 1379 * @return void
1380 **/ 1380 **/
1381 killBreaks: function (e) { 1381 killBreaks: function (e) {
1382 var allElements = e.getElementsByTagName('*'); 1382 var allElements = e.getElementsByTagName('*');
1383 while (i < allElements.length) { 1383 while (i < allElements.length) {
1384 readability.deleteExtraBreaks(allElements[i]); 1384 readability.deleteExtraBreaks(allElements[i]);
1385 i++; 1385 i++;
1386 } 1386 }
1387 }, 1387 },
1388 1388
1389 /** 1389 /**
1390 * Clean a node of all elements of type "tag". 1390 * Clean a node of all elements of type "tag".
1391 * (Unless it's a youtube/vimeo video. People love movies.) 1391 * (Unless it's a youtube/vimeo video. People love movies.)
1392 * 1392 *
1393 * @param Element 1393 * @param Element
1394 * @param string tag to clean 1394 * @param string tag to clean
1395 * @return void 1395 * @return void
1396 **/ 1396 **/
1397 clean: function (e, tag) { 1397 clean: function (e, tag) {
1398 var targetList = e.getElementsByTagName( tag ); 1398 var targetList = e.getElementsByTagName( tag );
1399 var isEmbed = (tag === 'object' || tag === 'embed'); 1399 var isEmbed = (tag === 'object' || tag === 'embed');
1400 1400
1401 for (var y=targetList.length-1; y >= 0; y-=1) { 1401 for (var y=targetList.length-1; y >= 0; y-=1) {
1402 /* Allow youtube and vimeo videos through as people usually want to see those. */ 1402 /* Allow youtube and vimeo videos through as people usually want to see those. */
1403 if(isEmbed) { 1403 if(isEmbed) {
1404 var attributeValues = ""; 1404 var attributeValues = "";
1405 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) { 1405 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) {
1406 attributeValues += targetList[y].attributes[i].value + '|'; 1406 attributeValues += targetList[y].attributes[i].value + '|';
1407 } 1407 }
1408 1408
1409 /* First, check the elements attributes to see if any of them co ntain youtube or vimeo */ 1409 /* First, check the elements attributes to see if any of them co ntain youtube or vimeo */
1410 if (attributeValues.search(readability.regexps.videos) !== -1) { 1410 if (attributeValues.search(readability.regexps.videos) !== -1) {
1411 continue; 1411 continue;
1412 } 1412 }
1413 1413
1414 /* Then check the elements inside this element for the same. */ 1414 /* Then check the elements inside this element for the same. */
1415 if (targetList[y].innerHTML.search(readability.regexps.videos) ! == -1) { 1415 if (targetList[y].innerHTML.search(readability.regexps.videos) ! == -1) {
1416 continue; 1416 continue;
1417 } 1417 }
1418 1418
1419 } 1419 }
1420 1420
1421 targetList[y].parentNode.removeChild(targetList[y]); 1421 targetList[y].parentNode.removeChild(targetList[y]);
1422 } 1422 }
1423 }, 1423 },
1424 1424
1425 /** 1425 /**
1426 * Clean an element of all tags of type "tag" if they look fishy. 1426 * Clean an element of all tags of type "tag" if they look fishy.
1427 * "Fishy" is an algorithm based on content length, classnames, link density , number of images & embeds, etc. 1427 * "Fishy" is an algorithm based on content length, classnames, link density , number of images & embeds, etc.
1428 * 1428 *
1429 * @return void 1429 * @return void
1430 **/ 1430 **/
1431 cleanConditionally: function (e, tag) { 1431 cleanConditionally: function (e, tag) {
1432 1432
1433 if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) { 1433 if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {
1434 return; 1434 return;
1435 } 1435 }
1436 1436
1437 var tagsList = e.getElementsByTagName(tag); 1437 var tagsList = e.getElementsByTagName(tag);
1438 var curTagsLength = tagsList.length; 1438 var curTagsLength = tagsList.length;
1439 1439
1440 /** 1440 /**
1441 * Gather counts for other typical elements embedded within. 1441 * Gather counts for other typical elements embedded within.
1442 * Traverse backwards so we can remove nodes at the same time without ef fecting the traversal. 1442 * Traverse backwards so we can remove nodes at the same time without ef fecting the traversal.
1443 * 1443 *
1444 * TODO: Consider taking into account original contentScore here. 1444 * TODO: Consider taking into account original contentScore here.
1445 **/ 1445 **/
1446 for (var i=curTagsLength-1; i >= 0; i-=1) { 1446 for (var i=curTagsLength-1; i >= 0; i-=1) {
1447 var weight = readability.getClassWeight(tagsList[i]); 1447 var weight = readability.getClassWeight(tagsList[i]);
1448 var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0; 1448 var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0;
1449 1449
1450 dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].cla ssName + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'unde fined') ? (" with score " + tagsList[i].readability.contentScore) : '')); 1450 dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].cla ssName + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'unde fined') ? (" with score " + tagsList[i].readability.contentScore) : ''));
1451 1451
1452 if(weight+contentScore < 0) 1452 if(weight+contentScore < 0)
1453 { 1453 {
1454 tagsList[i].parentNode.removeChild(tagsList[i]); 1454 tagsList[i].parentNode.removeChild(tagsList[i]);
1455 } 1455 }
1456 else if ( readability.getCharCount(tagsList[i],',') < 10) { 1456 else if ( readability.getCharCount(tagsList[i],',') < 10) {
1457 /** 1457 /**
1458 * If there are not very many commas, and the number of 1458 * If there are not very many commas, and the number of
1459 * non-paragraph elements is more than paragraphs or other omino us signs, remove the element. 1459 * non-paragraph elements is more than paragraphs or other omino us signs, remove the element.
1460 **/ 1460 **/
1461 var p = tagsList[i].getElementsByTagName("p").length; 1461 var p = tagsList[i].getElementsByTagName("p").length;
1462 var img = tagsList[i].getElementsByTagName("img").length; 1462 var img = tagsList[i].getElementsByTagName("img").length;
1463 var li = tagsList[i].getElementsByTagName("li").length-100; 1463 var li = tagsList[i].getElementsByTagName("li").length-100;
1464 var input = tagsList[i].getElementsByTagName("input").length; 1464 var input = tagsList[i].getElementsByTagName("input").length;
1465 1465
1466 var embedCount = 0; 1466 var embedCount = 0;
1467 var embeds = tagsList[i].getElementsByTagName("embed"); 1467 var embeds = tagsList[i].getElementsByTagName("embed");
1468 for(var ei=0,il=embeds.length; ei < il; ei+=1) { 1468 for(var ei=0,il=embeds.length; ei < il; ei+=1) {
1469 if (embeds[ei].src.search(readability.regexps.videos) === -1 ) { 1469 if (embeds[ei].src.search(readability.regexps.videos) === -1 ) {
1470 embedCount+=1; 1470 embedCount+=1;
1471 } 1471 }
1472 } 1472 }
1473 1473
1474 var linkDensity = readability.getLinkDensity(tagsList[i]); 1474 var linkDensity = readability.getLinkDensity(tagsList[i]);
1475 var contentLength = readability.getInnerText(tagsList[i]).length ; 1475 var contentLength = readability.getInnerText(tagsList[i]).length ;
1476 var toRemove = false; 1476 var toRemove = false;
1477 1477
1478 if ( img > p ) { 1478 if ( img > p ) {
1479 toRemove = true; 1479 toRemove = true;
1480 } else if(li > p && tag !== "ul" && tag !== "ol") { 1480 } else if(li > p && tag !== "ul" && tag !== "ol") {
1481 toRemove = true; 1481 toRemove = true;
1482 } else if( input > Math.floor(p/3) ) { 1482 } else if( input > Math.floor(p/3) ) {
1483 toRemove = true; 1483 toRemove = true;
1484 } else if(contentLength < 25 && (img === 0 || img > 2) ) { 1484 } else if(contentLength < 25 && (img === 0 || img > 2) ) {
1485 toRemove = true; 1485 toRemove = true;
1486 } else if(weight < 25 && linkDensity > 0.2) { 1486 } else if(weight < 25 && linkDensity > 0.2) {
1487 toRemove = true; 1487 toRemove = true;
1488 } else if(weight >= 25 && linkDensity > 0.5) { 1488 } else if(weight >= 25 && linkDensity > 0.5) {
1489 toRemove = true; 1489 toRemove = true;
1490 } else if((embedCount === 1 && contentLength < 75) || embedCount > 1) { 1490 } else if((embedCount === 1 && contentLength < 75) || embedCount > 1) {
1491 toRemove = true; 1491 toRemove = true;
1492 } 1492 }
1493 1493
(...skipping 21 matching lines...) Expand all
1515 } 1515 }
1516 }, 1516 },
1517 1517
1518 flagIsActive: function(flag) { 1518 flagIsActive: function(flag) {
1519 return (readability.flags & flag) > 0; 1519 return (readability.flags & flag) > 0;
1520 }, 1520 },
1521 1521
1522 addFlag: function(flag) { 1522 addFlag: function(flag) {
1523 readability.flags = readability.flags | flag; 1523 readability.flags = readability.flags | flag;
1524 }, 1524 },
1525 1525
1526 removeFlag: function(flag) { 1526 removeFlag: function(flag) {
1527 readability.flags = readability.flags & ~flag; 1527 readability.flags = readability.flags & ~flag;
1528 }, 1528 },
1529 1529
1530 // Removes the children of |src| and appends them to |dest|. 1530 // Removes the children of |src| and appends them to |dest|.
1531 moveNodeInnards: function(src, dest) { 1531 moveNodeInnards: function(src, dest) {
1532 try { 1532 try {
1533 while (src.firstChild) { 1533 while (src.firstChild) {
1534 dest.appendChild(src.removeChild(src.firstChild)); 1534 dest.appendChild(src.removeChild(src.firstChild));
1535 } 1535 }
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after
1584 var lastBr = readability.isMultipleBr(node, false); 1584 var lastBr = readability.isMultipleBr(node, false);
1585 var ret = false; 1585 var ret = false;
1586 while (lastBr && lastBr != node) { 1586 while (lastBr && lastBr != node) {
1587 var toRemove = lastBr; 1587 var toRemove = lastBr;
1588 lastBr = lastBr.previousSibling; 1588 lastBr = lastBr.previousSibling;
1589 toRemove.parentNode.removeChild(toRemove); 1589 toRemove.parentNode.removeChild(toRemove);
1590 ret = true; 1590 ret = true;
1591 } 1591 }
1592 return ret; 1592 return ret;
1593 }, 1593 },
1594 1594
1595 // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a 1595 // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a
1596 // <P> node, and makes all next siblings of that pair children of <P>, up 1596 // <P> node, and makes all next siblings of that pair children of <P>, up
1597 // until the next pair of <BR> nodes is reached. 1597 // until the next pair of <BR> nodes is reached.
1598 replaceDoubleBrWithP: function(node) { 1598 replaceDoubleBrWithP: function(node) {
1599 // Check that we are starting with a BR. 1599 // Check that we are starting with a BR.
1600 var second = readability.isMultipleBr(node, true); 1600 var second = readability.isMultipleBr(node, true);
1601 if (!second) { 1601 if (!second) {
1602 return; 1602 return;
1603 } 1603 }
1604 // Make all next siblings of the second BR into children of a P. 1604 // Make all next siblings of the second BR into children of a P.
1605 var p = document.createElement('p'); 1605 var p = document.createElement('p');
1606 var curr = second.nextSibling; 1606 var curr = second.nextSibling;
1607 while (curr) { 1607 while (curr) {
1608 if (readability.isMultipleBr(curr, true)) { 1608 if (readability.isMultipleBr(curr, true)) {
1609 break; 1609 break;
1610 } 1610 }
1611 var next = curr.nextSibling; 1611 var next = curr.nextSibling;
1612 p.appendChild(curr.parentNode.removeChild(curr)); 1612 p.appendChild(curr.parentNode.removeChild(curr));
1613 curr = next; 1613 curr = next;
1614 } 1614 }
1615 var ret = curr; 1615 var ret = curr;
1616 1616
1617 // Remove all nodes between the first and second BR. 1617 // Remove all nodes between the first and second BR.
1618 curr = node.nextSibling; 1618 curr = node.nextSibling;
1619 while (curr && curr != second) { 1619 while (curr && curr != second) {
1620 var next = curr.nextSibling; 1620 var next = curr.nextSibling;
1621 curr.parentNode.removeChild(curr); 1621 curr.parentNode.removeChild(curr);
1622 curr = next; 1622 curr = next;
1623 } 1623 }
1624 // Remove the second BR. 1624 // Remove the second BR.
1625 second.parentNode.removeChild(second); 1625 second.parentNode.removeChild(second);
1626 // Replace the first BR with the P. 1626 // Replace the first BR with the P.
1627 node.parentNode.replaceChild(p, node); 1627 node.parentNode.replaceChild(p, node);
1628 1628
1629 return ret; 1629 return ret;
1630 }, 1630 },
1631 1631
1632 // Returns true if the NodeList contains a double <BR>. 1632 // Returns true if the NodeList contains a double <BR>.
1633 hasDoubleBr: function(nodeList) { 1633 hasDoubleBr: function(nodeList) {
1634 for (var i = 0; i < nodeList.length; nodeList++) { 1634 for (var i = 0; i < nodeList.length; nodeList++) {
1635 if (readability.isMultipleBr(nodeList[i], true)) { 1635 if (readability.isMultipleBr(nodeList[i], true)) {
1636 return true; 1636 return true;
1637 } 1637 }
1638 } 1638 }
1639 return false; 1639 return false;
1640 }, 1640 },
1641 1641
1642 // Replaces double <BR> tags with <P> tags. 1642 // Replaces double <BR> tags with <P> tags.
1643 replaceDoubleBrsWithPs: function(node) { 1643 replaceDoubleBrsWithPs: function(node) {
1644 var allElements = node.getElementsByTagName('BR'); 1644 var allElements = node.getElementsByTagName('BR');
1645 var node = null; 1645 var node = null;
1646 while (allElements && allElements.length > 0 && 1646 while (allElements && allElements.length > 0 &&
1647 readability.hasDoubleBr(allElements)) { 1647 readability.hasDoubleBr(allElements)) {
1648 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) { 1648 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) {
1649 var next = node; 1649 var next = node;
1650 while (next = readability.replaceDoubleBrWithP(next)); 1650 while (next = readability.replaceDoubleBrWithP(next));
1651 } 1651 }
1652 allElements = document.body.getElementsByTagName('BR'); 1652 allElements = document.body.getElementsByTagName('BR');
1653 } 1653 }
1654 }, 1654 },
1655 1655
1656 1656
1657 // Replaces a BR and the whitespace that follows it with a P. 1657 // Replaces a BR and the whitespace that follows it with a P.
1658 replaceBrWithP: function(node) { 1658 replaceBrWithP: function(node) {
1659 if (!readability.isBrNode(node)) { 1659 if (!readability.isBrNode(node)) {
1660 return; 1660 return;
1661 } 1661 }
1662 var p = document.createElement('p'); 1662 var p = document.createElement('p');
1663 var curr = node.nextSibling; 1663 var curr = node.nextSibling;
1664 while (curr && !isBrNode(curr)) { 1664 while (curr && !isBrNode(curr)) {
1665 var next = curr.nextSibling; 1665 var next = curr.nextSibling;
1666 if (readability.isWhitespaceNode(curr)) { 1666 if (readability.isWhitespaceNode(curr)) {
1667 curr.parentNode.removeChild(curr); 1667 curr.parentNode.removeChild(curr);
1668 } else { 1668 } else {
1669 p.appendChild(curr.parentNode.removeChild(curr)); 1669 p.appendChild(curr.parentNode.removeChild(curr));
1670 } 1670 }
1671 curr = next; 1671 curr = next;
1672 } 1672 }
1673 node.parentNode.replaceChild(p, node); 1673 node.parentNode.replaceChild(p, node);
1674 return curr; 1674 return curr;
1675 }, 1675 },
1676 1676
1677 // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> t ag 1677 // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> t ag
1678 // children of the <P>. 1678 // children of the <P>.
1679 replaceBrsWithPs: function(node) { 1679 replaceBrsWithPs: function(node) {
1680 var allElements = node.getElementsByTagName('BR'); 1680 var allElements = node.getElementsByTagName('BR');
1681 var node = null; 1681 var node = null;
1682 while (allElements && allElements.length > 0) { 1682 while (allElements && allElements.length > 0) {
1683 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) { 1683 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) {
1684 var next = node; 1684 var next = node;
1685 while (next = readability.replaceBrWithP(next)); 1685 while (next = readability.replaceBrWithP(next));
1686 } 1686 }
1687 allElements = document.body.getElementsByTagName('BR'); 1687 allElements = document.body.getElementsByTagName('BR');
1688 } 1688 }
1689 }, 1689 },
1690 1690
1691 // Replaces any tag with any other tag. 1691 // Replaces any tag with any other tag.
1692 replaceTagsWithTags: function(node, srcTag, destTag) { 1692 replaceTagsWithTags: function(node, srcTag, destTag) {
1693 var allElements = node.getElementsByTagName(srcTag); 1693 var allElements = node.getElementsByTagName(srcTag);
1694 for (var i = 0; i < allElements.length; i++) { 1694 for (var i = 0; i < allElements.length; i++) {
1695 var dest = document.createElement(destTag); 1695 var dest = document.createElement(destTag);
1696 readability.moveNodeInnards(allElements[i], dest); 1696 readability.moveNodeInnards(allElements[i], dest);
1697 node.replaceNode(dest, allElements[i]); 1697 allElements[i].parentNode.replaceChild(dest, allElements[i]);
1698 } 1698 }
1699 }, 1699 },
1700 1700
1701 // Replaces all <noscript> tags with <p> tags. 1701 // Replaces all <noscript> tags with <p> tags.
1702 replaceNoscriptsWithPs: function(node) { 1702 replaceNoscriptsWithPs: function(node) {
1703 readability.replaceTagsWithTags(node, 'noscript', 'p'); 1703 readability.replaceTagsWithTags(node, 'noscript', 'p');
1704 }, 1704 },
1705 1705
1706 // Replaces all <font> tags with <span> tags. 1706 // Replaces all <font> tags with <span> tags.
1707 replaceFontsWithSpans: function(node) { 1707 replaceFontsWithSpans: function(node) {
1708 readability.replaceTagsWithTags(node, 'font', 'span'); 1708 readability.replaceTagsWithTags(node, 'font', 'span');
1709 }, 1709 },
1710 1710
1711 // Returns a list of image URLs in the distilled article. 1711 // Returns a list of image URLs in the distilled article.
1712 getImages : function() { 1712 getImages : function() {
1713 var images = document.getElementsByTagName('img'); 1713 var images = document.getElementsByTagName('img');
1714 var result = new Array(images.length); 1714 var result = new Array(images.length);
1715 dbg("Number of images: " + images.length); 1715 dbg("Number of images: " + images.length);
1716 for(i = 0; i < images.length; i++) { 1716 for(i = 0; i < images.length; i++) {
1717 result[i] = images[i].src; 1717 result[i] = images[i].src;
1718 dbg("Image: " + result[i]); 1718 dbg("Image: " + result[i]);
1719 } 1719 }
1720 return result; 1720 return result;
1721 }, 1721 },
1722 1722
1723 // Returns the distilled article HTML from the page(s). 1723 // Returns the distilled article HTML from the page(s).
1724 getDistilledArticleHTML : function() { 1724 getDistilledArticleHTML : function() {
1725 return readability.distilledHTML; 1725 return readability.distilledHTML;
1726 },
1727
1728 // Returns the next page of this article.
1729 getNextPageLink : function() {
1730 return readability.nextPageLink;
1726 } 1731 }
1727 }; 1732 };
1728 1733
// Extracts long-form content from a page and returns an array laid out as:
//   [0]   the article title,
//   [1]   HTML containing the long-form content,
//   [2]   the next page link from readability.getNextPageLink(),
//   [3..] URLs of the images returned by readability.getImages().
// NOTE(review): this comment previously said each <img> id is k - 2 for the
// URL at index k. Image URLs now start at index 3, so confirm whether the
// ids or the consumers of this array need the offset adjusted.
(function () {
    readability.init();
    var result = [
        readability.getArticleTitle(),
        readability.getDistilledArticleHTML(),
        readability.getNextPageLink()
    ];
    return result.concat(readability.getImages());
}())
1741 1747
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698