Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(217)

Side by Side Diff: third_party/readability/js/readability.js

Issue 146843010: Add support for multipage distillation. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Rebase + change Viewer to use DomDistillerArticleProto Created 6 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 1
2 var dbg = (typeof console !== 'undefined') ? function(s) { 2 var dbg = (typeof console !== 'undefined') ? function(s) {
3 console.log("Readability: " + s); 3 console.log("Readability: " + s);
4 } : function() {}; 4 } : function() {};
5 5
6 /* 6 /*
7 * Readability. An Arc90 Lab Experiment. 7 * Readability. An Arc90 Lab Experiment.
8 * Website: http://lab.arc90.com/experiments/readability 8 * Website: http://lab.arc90.com/experiments/readability
9 * Source: http://code.google.com/p/arc90labs-readability 9 * Source: http://code.google.com/p/arc90labs-readability
10 * 10 *
11 * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission. 11 * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission.
12 * 12 *
13 * Copyright (c) 2010 Arc90 Inc 13 * Copyright (c) 2010 Arc90 Inc
14 * Readability is licensed under the Apache License, Version 2.0. 14 * Readability is licensed under the Apache License, Version 2.0.
15 **/ 15 **/
16 var readability = { 16 var readability = {
17 readStyle: "style-newspaper", 17 readStyle: "style-newspaper",
18 readSize: "size-medium", 18 readSize: "size-medium",
19 readMargin: "margin-wide", 19 readMargin: "margin-wide",
20 20
21 distilledHTML: '', 21 distilledHTML: '',
22 distilledArticleContent: null, 22 distilledArticleContent: null,
23 nextPageLink: '',
23 24
24 version: '1.7.1', 25 version: '1.7.1',
25 iframeLoads: 0, 26 iframeLoads: 0,
26 convertLinksToFootnotes: false, 27 convertLinksToFootnotes: false,
27 reversePageScroll: false, /* If they hold shift and hit space, scroll up */ 28 reversePageScroll: false, /* If they hold shift and hit space, scroll up */
28 frameHack: false, /** 29 frameHack: false, /**
29 * The frame hack is to workaround a firefox bug where if you 30 * The frame hack is to workaround a firefox bug where if you
30 * pull content out of a frame and stick it into the parent element, the scrollbar won't appear. 31 * pull content out of a frame and stick it into the parent element, the scrollbar won't appear.
31 * So we fake a scrollbar in the wrapping div. 32 * So we fake a scrollbar in the wrapping div.
32 **/ 33 **/
33 biggestFrame: false, 34 biggestFrame: false,
34 flags: 0x1 | 0x2 | 0x4, /* Start with all flags set. */ 35 flags: 0x1 | 0x2 | 0x4, /* Start with all flags set. */
35 36
36 /* constants */ 37 /* constants */
37 FLAG_STRIP_UNLIKELYS: 0x1, 38 FLAG_STRIP_UNLIKELYS: 0x1,
38 FLAG_WEIGHT_CLASSES: 0x2, 39 FLAG_WEIGHT_CLASSES: 0x2,
39 FLAG_CLEAN_CONDITIONALLY: 0x4, 40 FLAG_CLEAN_CONDITIONALLY: 0x4,
40 41
41 maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */ 42 maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */
42 parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */ 43 parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */
43 pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */ 44 pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */
44 45
45 /** 46 /**
46 * All of the regular expressions in use within readability. 47 * All of the regular expressions in use within readability.
47 * Defined up here so we don't instantiate them repeatedly in loops. 48 * Defined up here so we don't instantiate them repeatedly in loops.
48 **/ 49 **/
49 regexps: { 50 regexps: {
50 unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header |menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popu p|tweet|twitter/i, 51 unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header |menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popu p|tweet|twitter/i,
51 okMaybeItsACandidate: /and|article|body|column|main|shadow/i, 52 okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
52 positive: /article|body|content|entry|hentry|main|page|pagi nation|post|text|blog|story/i, 53 positive: /article|body|content|entry|hentry|main|page|pagi nation|post|text|blog|story/i,
53 negative: /combx|comment|com-|contact|foot|footer|footnote| masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopp ing|tags|tool|widget/i, 54 negative: /combx|comment|com-|contact|foot|footer|footnote| masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopp ing|tags|tool|widget/i,
54 extraneous: /print|archive|comment|discuss|e[\-]?mail|share|r eply|all|login|sign|single/i, 55 extraneous: /print|archive|comment|discuss|e[\-]?mail|share|r eply|all|login|sign|single/i,
55 divToPElements: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, 56 divToPElements: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
56 replaceBrs: /(<br[^>]*>[ \n\r\t]*){2,}/gi, 57 replaceBrs: /(<br[^>]*>[ \n\r\t]*){2,}/gi,
57 replaceFonts: /<(\/?)font[^>]*>/gi, 58 replaceFonts: /<(\/?)font[^>]*>/gi,
58 trim: /^\s+|\s+$/g, 59 trim: /^\s+|\s+$/g,
59 normalize: /\s{2,}/g, 60 normalize: /\s{2,}/g,
60 killBreaks: /(<br\s*\/?>(\s|&nbsp;?)*){1,}/g, 61 killBreaks: /(<br\s*\/?>(\s|&nbsp;?)*){1,}/g,
61 videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i, 62 videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i,
62 skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed) \s*$/i, 63 skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed) \s*$/i,
63 nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last. 64 nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last.
64 prevLink: /(prev|earl|old|new|<|«)/i 65 prevLink: /(prev|earl|old|new|<|«)/i
65 }, 66 },
66 67
67 /** 68 /**
68 * Runs readability. 69 * Runs readability.
69 * 70 *
70 * Workflow: 71 * Workflow:
71 * 1. Prep the document by removing script tags, css, etc. 72 * 1. Prep the document by removing script tags, css, etc.
72 * 2. Build readability's DOM tree. 73 * 2. Build readability's DOM tree.
73 * 3. Grab the article content from the current dom tree. 74 * 3. Grab the article content from the current dom tree.
74 * 4. Replace the current DOM tree with the new one. 75 * 4. Replace the current DOM tree with the new one.
75 * 5. Read peacefully. 76 * 5. Read peacefully.
76 * 77 *
77 * @return void 78 * @return void
78 **/ 79 **/
79 init: function() { 80 init: function() {
80 /* Before we do anything, remove all scripts that are not readability. */ 81 /* Before we do anything, remove all scripts that are not readability. */
81 window.onload = window.onunload = function() {}; 82 window.onload = window.onunload = function() {};
82 83
83 readability.removeScripts(document); 84 readability.removeScripts(document);
84 85
85 /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */ 86 /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */
86 readability.parsedPages[window.location.href.replace(/\/$/, '')] = true; 87 readability.parsedPages[window.location.href.replace(/\/$/, '')] = true;
87 88
88 /* Pull out any possible next page link first */ 89 /* Pull out any possible next page link first */
89 var nextPageLink = readability.findNextPageLink(document.body); 90 readability.nextPageLink = readability.findNextPageLink(document.body);
90 91
92 /* We handle processing of nextPage from C++ set nextPageLink to null */
93 var nextPageLink = null;
94
91 readability.prepDocument(); 95 readability.prepDocument();
92 96
93 /* Build readability's DOM tree */ 97 /* Build readability's DOM tree */
94 var overlay = document.createElement("DIV"); 98 var overlay = document.createElement("DIV");
95 var innerDiv = document.createElement("DIV"); 99 var innerDiv = document.createElement("DIV");
96 var articleTools = readability.getArticleTools(); 100 var articleTools = readability.getArticleTools();
97 var articleTitleText = readability.getArticleTitle(); 101 var articleTitleText = readability.getArticleTitle();
98 var articleContent = readability.grabArticle(); 102 var articleContent = readability.grabArticle();
99 103
100 if(!articleContent) { 104 if(!articleContent) {
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after
150 154
151 readability.postProcessContent(articleContent); 155 readability.postProcessContent(articleContent);
152 156
153 window.scrollTo(0, 0); 157 window.scrollTo(0, 0);
154 158
155 // TODO(bengr): Remove this assignment of null to nextPageLink when 159 // TODO(bengr): Remove this assignment of null to nextPageLink when
156 // the processing of the next page link is safe. 160 // the processing of the next page link is safe.
157 nextPageLink = null; 161 nextPageLink = null;
158 162
159 if (nextPageLink) { 163 if (nextPageLink) {
160 /** 164 /**
161 * Append any additional pages after a small timeout so that people 165 * Append any additional pages after a small timeout so that people
162 * can start reading without having to wait for this to finish processing. 166 * can start reading without having to wait for this to finish processing.
163 **/ 167 **/
164 window.setTimeout(function() { 168 window.setTimeout(function() {
165 readability.appendNextPage(nextPageLink); 169 readability.appendNextPage(nextPageLink);
166 }, 500); 170 }, 500);
167 } 171 }
168 172
169 /** Smooth scrolling **/ 173 /** Smooth scrolling **/
170 document.onkeydown = function(e) { 174 document.onkeydown = function(e) {
171 var code = (window.event) ? event.keyCode : e.keyCode; 175 var code = (window.event) ? event.keyCode : e.keyCode;
172 if (code === 16) { 176 if (code === 16) {
173 readability.reversePageScroll = true; 177 readability.reversePageScroll = true;
174 return; 178 return;
175 } 179 }
176 180
177 if (code === 32) { 181 if (code === 32) {
178 readability.curScrollStep = 0; 182 readability.curScrollStep = 0;
179 var windowHeight = window.innerHeight ? window.innerHeight : (do cument.documentElement.clientHeight ? document.documentElement.clientHeight : do cument.body.clientHeight); 183 var windowHeight = window.innerHeight ? window.innerHeight : (do cument.documentElement.clientHeight ? document.documentElement.clientHeight : do cument.body.clientHeight);
180 184
181 if(readability.reversePageScroll) { 185 if(readability.reversePageScroll) {
182 readability.scrollTo(readability.scrollTop(), readability.sc rollTop() - (windowHeight - 50), 20, 10); 186 readability.scrollTo(readability.scrollTop(), readability.sc rollTop() - (windowHeight - 50), 20, 10);
183 } 187 }
184 else { 188 else {
185 readability.scrollTo(readability.scrollTop(), readability.sc rollTop() + (windowHeight - 50), 20, 10); 189 readability.scrollTo(readability.scrollTop(), readability.sc rollTop() + (windowHeight - 50), 20, 10);
186 } 190 }
187 191
188 return false; 192 return false;
189 } 193 }
190 }; 194 };
191 195
192 document.onkeyup = function(e) { 196 document.onkeyup = function(e) {
193 var code = (window.event) ? event.keyCode : e.keyCode; 197 var code = (window.event) ? event.keyCode : e.keyCode;
194 if (code === 16) { 198 if (code === 16) {
195 readability.reversePageScroll = false; 199 readability.reversePageScroll = false;
196 return; 200 return;
197 } 201 }
198 }; 202 };
199 }, 203 },
200 204
201 /** 205 /**
202 * Run any post-process modifications to article content as necessary. 206 * Run any post-process modifications to article content as necessary.
203 * 207 *
204 * @param Element 208 * @param Element
205 * @return void 209 * @return void
206 **/ 210 **/
207 postProcessContent: function(articleContent) { 211 postProcessContent: function(articleContent) {
208 if(readability.convertLinksToFootnotes && !window.location.href.match(/w ikipedia\.org/g)) { 212 if(readability.convertLinksToFootnotes && !window.location.href.match(/w ikipedia\.org/g)) {
209 readability.addFootnotes(articleContent); 213 readability.addFootnotes(articleContent);
210 } 214 }
211 215
212 readability.fixImageFloats(articleContent); 216 readability.fixImageFloats(articleContent);
213 }, 217 },
214 218
215 /** 219 /**
216 * Some content ends up looking ugly if the image is too large to be floated. 220 * Some content ends up looking ugly if the image is too large to be floated.
217 * If the image is wider than a threshold (currently 55%), no longer float it, 221 * If the image is wider than a threshold (currently 55%), no longer float it,
218 * center it instead. 222 * center it instead.
219 * 223 *
220 * @param Element 224 * @param Element
221 * @return void 225 * @return void
222 **/ 226 **/
223 fixImageFloats: function (articleContent) { 227 fixImageFloats: function (articleContent) {
224 var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0. 55, 228 var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0. 55,
225 images = articleContent.getElementsByTagName('img'); 229 images = articleContent.getElementsByTagName('img');
226 230
227 for(var i=0, il = images.length; i < il; i+=1) { 231 for(var i=0, il = images.length; i < il; i+=1) {
228 var image = images[i]; 232 var image = images[i];
229 233
230 if(image.offsetWidth > imageWidthThreshold) { 234 if(image.offsetWidth > imageWidthThreshold) {
231 image.className += " blockImage"; 235 image.className += " blockImage";
232 } 236 }
233 } 237 }
234 }, 238 },
235 239
236 /** 240 /**
237 * Get the article tools Element that has buttons like reload, print. 241 * Get the article tools Element that has buttons like reload, print.
238 * 242 *
239 * @return void 243 * @return void
240 **/ 244 **/
241 getArticleTools: function () { 245 getArticleTools: function () {
242 var articleTools = document.createElement("DIV"); 246 var articleTools = document.createElement("DIV");
243 247
244 articleTools.id = "readTools"; 248 articleTools.id = "readTools";
245 articleTools.innerHTML = 249 articleTools.innerHTML =
246 "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" + 250 "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" +
247 "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" + 251 "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" +
248 "<a href='#' onclick='readability.emailBox(); return false;' title=' Email page' id='email-page'>Email Page</a>"; 252 "<a href='#' onclick='readability.emailBox(); return false;' title=' Email page' id='email-page'>Email Page</a>";
249 253
250 return articleTools; 254 return articleTools;
251 }, 255 },
252 256
253 /** 257 /**
254 * returns the suggested direction of the string 258 * returns the suggested direction of the string
255 * 259 *
256 * @return "rtl" || "ltr" 260 * @return "rtl" || "ltr"
257 **/ 261 **/
258 getSuggestedDirection: function(text) { 262 getSuggestedDirection: function(text) {
259 function sanitizeText() { 263 function sanitizeText() {
260 return text.replace(/@\w+/, ""); 264 return text.replace(/@\w+/, "");
261 } 265 }
262 266
263 function countMatches(match) { 267 function countMatches(match) {
264 var matches = text.match(new RegExp(match, "g")); 268 var matches = text.match(new RegExp(match, "g"));
265 return matches !== null ? matches.length : 0; 269 return matches !== null ? matches.length : 0;
266 } 270 }
267 271
268 function isRTL() { 272 function isRTL() {
269 var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]"); 273 var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]");
270 var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]"); 274 var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]");
271 275
272 // if more than 20% of chars are Hebrew or Arabic then direction is rtl 276 // if more than 20% of chars are Hebrew or Arabic then direction is rtl
273 return (count_heb + count_arb) * 100 / text.length > 20; 277 return (count_heb + count_arb) * 100 / text.length > 20;
274 } 278 }
275 279
276 text = sanitizeText(text); 280 text = sanitizeText(text);
277 return isRTL() ? "rtl" : "ltr"; 281 return isRTL() ? "rtl" : "ltr";
278 }, 282 },
279 283
280 /** 284 /**
281 * Get the article title as an H1. 285 * Get the article title as an H1.
282 * 286 *
283 * @return void 287 * @return void
284 **/ 288 **/
285 getArticleTitle: function () { 289 getArticleTitle: function () {
286 var curTitle = "", 290 var curTitle = "",
287 origTitle = ""; 291 origTitle = "";
288 292
289 try { 293 try {
290 curTitle = origTitle = document.title; 294 curTitle = origTitle = document.title;
291 if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */ 295 if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */
292 curTitle = origTitle = readability.getInnerText(document.getElem entsByTagName('title')[0]); 296 curTitle = origTitle = readability.getInnerText(document.getElem entsByTagName('title')[0]);
293 } 297 }
294 } 298 }
295 catch(e) {} 299 catch(e) {}
296 300
297 if(curTitle.match(/ [\|\-] /)) 301 if(curTitle.match(/ [\|\-] /))
298 { 302 {
299 curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1'); 303 curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1');
300 304
301 if(curTitle.split(' ').length < 3) { 305 if(curTitle.split(' ').length < 3) {
302 curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1'); 306 curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1');
303 } 307 }
304 } 308 }
305 else if(curTitle.indexOf(': ') !== -1) 309 else if(curTitle.indexOf(': ') !== -1)
306 { 310 {
307 curTitle = origTitle.replace(/.*:(.*)/gi, '$1'); 311 curTitle = origTitle.replace(/.*:(.*)/gi, '$1');
308 312
309 if(curTitle.split(' ').length < 3) { 313 if(curTitle.split(' ').length < 3) {
310 curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1'); 314 curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1');
(...skipping 12 matching lines...) Expand all
323 327
324 if(curTitle.split(' ').length <= 4) { 328 if(curTitle.split(' ').length <= 4) {
325 curTitle = origTitle; 329 curTitle = origTitle;
326 } 330 }
327 return curTitle; 331 return curTitle;
328 }, 332 },
329 333
330 /** 334 /**
331 * Prepare the HTML document for readability to scrape it. 335 * Prepare the HTML document for readability to scrape it.
332 * This includes things like stripping javascript, CSS, and handling terrible markup. 336 * This includes things like stripping javascript, CSS, and handling terrible markup.
333 * 337 *
334 * @return void 338 * @return void
335 **/ 339 **/
336 prepDocument: function () { 340 prepDocument: function () {
337 /** 341 /**
338 * In some cases a body element can't be found (if the HTML is totally hosed for example) 342 * In some cases a body element can't be found (if the HTML is totally hosed for example)
339 * so we create a new body node and append it to the document. 343 * so we create a new body node and append it to the document.
340 */ 344 */
341 if(document.body === null) 345 if(document.body === null)
342 { 346 {
343 var body = document.createElement("body"); 347 var body = document.createElement("body");
344 try { 348 try {
345 document.body = body; 349 document.body = body;
346 } 350 }
347 catch(e) { 351 catch(e) {
348 document.documentElement.appendChild(body); 352 document.documentElement.appendChild(body);
349 dbg(e); 353 dbg(e);
350 } 354 }
351 } 355 }
352 356
353 document.body.id = "readabilityBody"; 357 document.body.id = "readabilityBody";
354 358
355 var frames = document.getElementsByTagName('frame'); 359 var frames = document.getElementsByTagName('frame');
(...skipping 11 matching lines...) Expand all
367 canAccessFrame = true; 371 canAccessFrame = true;
368 } 372 }
369 catch(eFrames) { 373 catch(eFrames) {
370 dbg(eFrames); 374 dbg(eFrames);
371 } 375 }
372 376
373 if(frameSize > biggestFrameSize) { 377 if(frameSize > biggestFrameSize) {
374 biggestFrameSize = frameSize; 378 biggestFrameSize = frameSize;
375 readability.biggestFrame = frames[frameIndex]; 379 readability.biggestFrame = frames[frameIndex];
376 } 380 }
377 381
378 if(canAccessFrame && frameSize > bestFrameSize) 382 if(canAccessFrame && frameSize > bestFrameSize)
379 { 383 {
380 readability.frameHack = true; 384 readability.frameHack = true;
381 385
382 bestFrame = frames[frameIndex]; 386 bestFrame = frames[frameIndex];
383 bestFrameSize = frameSize; 387 bestFrameSize = frameSize;
384 } 388 }
385 } 389 }
386 390
387 if(bestFrame) 391 if(bestFrame)
388 { 392 {
389 var newBody = document.createElement('body'); 393 var newBody = document.createElement('body');
390 readability.moveNodeInnards(bestFrame.contentWindow.document.bod y, newBody); 394 readability.moveNodeInnards(bestFrame.contentWindow.document.bod y, newBody);
391 newBody.style.overflow = 'scroll'; 395 newBody.style.overflow = 'scroll';
392 document.body = newBody; 396 document.body = newBody;
393 397
394 var frameset = document.getElementsByTagName('frameset')[0]; 398 var frameset = document.getElementsByTagName('frameset')[0];
395 if(frameset) { 399 if(frameset) {
396 frameset.parentNode.removeChild(frameset); } 400 frameset.parentNode.removeChild(frameset); }
397 } 401 }
398 } 402 }
399 403
400 /* Remove all stylesheets */ 404 /* Remove all stylesheets */
401 for (var k=0;k < document.styleSheets.length; k+=1) { 405 for (var k=0;k < document.styleSheets.length; k+=1) {
402 if (document.styleSheets[k].href !== null && document.styleSheets[k] .href.lastIndexOf("readability") === -1) { 406 if (document.styleSheets[k].href !== null && document.styleSheets[k] .href.lastIndexOf("readability") === -1) {
403 document.styleSheets[k].disabled = true; 407 document.styleSheets[k].disabled = true;
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after
448 readability.cleanConditionally(articleContent, "table"); 452 readability.cleanConditionally(articleContent, "table");
449 readability.cleanConditionally(articleContent, "ul"); 453 readability.cleanConditionally(articleContent, "ul");
450 readability.cleanConditionally(articleContent, "div"); 454 readability.cleanConditionally(articleContent, "div");
451 455
452 /* Remove extra paragraphs */ 456 /* Remove extra paragraphs */
453 var articleParagraphs = articleContent.getElementsByTagName('p'); 457 var articleParagraphs = articleContent.getElementsByTagName('p');
454 for(var i = articleParagraphs.length-1; i >= 0; i-=1) { 458 for(var i = articleParagraphs.length-1; i >= 0; i-=1) {
455 var imgCount = articleParagraphs[i].getElementsByTagName('img').l ength; 459 var imgCount = articleParagraphs[i].getElementsByTagName('img').l ength;
456 var embedCount = articleParagraphs[i].getElementsByTagName('embed') .length; 460 var embedCount = articleParagraphs[i].getElementsByTagName('embed') .length;
457 var objectCount = articleParagraphs[i].getElementsByTagName('object' ).length; 461 var objectCount = articleParagraphs[i].getElementsByTagName('object' ).length;
458 462
459 if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readab ility.getInnerText(articleParagraphs[i], false) === '') { 463 if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readab ility.getInnerText(articleParagraphs[i], false) === '') {
460 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i] ); 464 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i] );
461 } 465 }
462 } 466 }
463 467
464 try { 468 try {
465 readability.replaceBrsWithPs(articleContent); 469 readability.replaceBrsWithPs(articleContent);
466 } 470 }
467 catch (e) { 471 catch (e) {
468 dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block -elements bug. Ignoring.: " + e); 472 dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block -elements bug. Ignoring.: " + e);
469 } 473 }
470 }, 474 },
471 475
472 /** 476 /**
473 * Initialize a node with the readability object. Also checks the 477 * Initialize a node with the readability object. Also checks the
474 * className/id for special names to add to its score. 478 * className/id for special names to add to its score.
475 * 479 *
476 * @param Element 480 * @param Element
477 * @return void 481 * @return void
478 **/ 482 **/
479 initializeNode: function (node) { 483 initializeNode: function (node) {
480 node.readability = {"contentScore": 0}; 484 node.readability = {"contentScore": 0};
481 485
482 switch(node.tagName) { 486 switch(node.tagName) {
483 case 'DIV': 487 case 'DIV':
484 node.readability.contentScore += 5; 488 node.readability.contentScore += 5;
485 break; 489 break;
486 490
487 case 'PRE': 491 case 'PRE':
488 case 'TD': 492 case 'TD':
489 case 'BLOCKQUOTE': 493 case 'BLOCKQUOTE':
490 node.readability.contentScore += 3; 494 node.readability.contentScore += 3;
491 break; 495 break;
492 496
493 case 'ADDRESS': 497 case 'ADDRESS':
494 case 'OL': 498 case 'OL':
495 case 'UL': 499 case 'UL':
496 case 'DL': 500 case 'DL':
497 case 'DD': 501 case 'DD':
498 case 'DT': 502 case 'DT':
499 case 'LI': 503 case 'LI':
500 case 'FORM': 504 case 'FORM':
501 node.readability.contentScore -= 3; 505 node.readability.contentScore -= 3;
502 break; 506 break;
503 507
504 case 'H1': 508 case 'H1':
505 case 'H2': 509 case 'H2':
506 case 'H3': 510 case 'H3':
507 case 'H4': 511 case 'H4':
508 case 'H5': 512 case 'H5':
509 case 'H6': 513 case 'H6':
510 case 'TH': 514 case 'TH':
511 node.readability.contentScore -= 5; 515 node.readability.contentScore -= 5;
512 break; 516 break;
513 } 517 }
514 518
515 node.readability.contentScore += readability.getClassWeight(node); 519 node.readability.contentScore += readability.getClassWeight(node);
516 }, 520 },
517 521
518 /*** 522 /***
519 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is 523 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
520 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. 524 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
521 * 525 *
522 * @param page a document to run upon. Needs to be a full document, complete with body. 526 * @param page a document to run upon. Needs to be a full document, complete with body.
523 * @return Element 527 * @return Element
524 **/ 528 **/
525 grabArticle: function (pageToClone) { 529 grabArticle: function (pageToClone) {
526 var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_ STRIP_UNLIKELYS), 530 var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_ STRIP_UNLIKELYS),
527 isPaging = (page !== null) ? true: false; 531 isPaging = (page !== null) ? true: false;
528 532
529 var page = null; 533 var page = null;
530 // Never work on the actual page. 534 // Never work on the actual page.
531 if (isPaging) { 535 if (isPaging) {
532 page = document.body.cloneNode(true); 536 page = document.body.cloneNode(true);
533 } else { 537 } else {
534 page = pageToClone.cloneNode(true); 538 page = pageToClone.cloneNode(true);
535 } 539 }
536 540
537 var allElements = page.getElementsByTagName('*'); 541 var allElements = page.getElementsByTagName('*');
538 542
539 /** 543 /**
540 * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs 544 * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
541 * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.) 545 * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
542 * 546 *
543 * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 547 * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
544 * TODO: Shouldn't this be a reverse traversal? 548 * TODO: Shouldn't this be a reverse traversal?
545 **/ 549 **/
546 var node = null; 550 var node = null;
547 var nodesToScore = []; 551 var nodesToScore = [];
548 for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) { 552 for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) {
549 /* Remove unlikely candidates */ 553 /* Remove unlikely candidates */
550 if (stripUnlikelyCandidates) { 554 if (stripUnlikelyCandidates) {
551 var unlikelyMatchString = node.className + node.id; 555 var unlikelyMatchString = node.className + node.id;
552 if ( 556 if (
553 ( 557 (
554 unlikelyMatchString.search(readability.regexps.unlikelyC andidates) !== -1 && 558 unlikelyMatchString.search(readability.regexps.unlikelyC andidates) !== -1 &&
555 unlikelyMatchString.search(readability.regexps.okMaybeIt sACandidate) === -1 && 559 unlikelyMatchString.search(readability.regexps.okMaybeIt sACandidate) === -1 &&
556 node.tagName !== "BODY" 560 node.tagName !== "BODY"
557 ) 561 )
558 ) 562 )
559 { 563 {
560 dbg("Removing unlikely candidate - " + unlikelyMatchString); 564 dbg("Removing unlikely candidate - " + unlikelyMatchString);
561 node.parentNode.removeChild(node); 565 node.parentNode.removeChild(node);
562 nodeIndex-=1; 566 nodeIndex-=1;
563 continue; 567 continue;
564 } 568 }
565 } 569 }
566 570
567 if (node.tagName === "P" || node.tagName === "TD" || node.tagName == = "PRE") { 571 if (node.tagName === "P" || node.tagName === "TD" || node.tagName == = "PRE") {
568 nodesToScore[nodesToScore.length] = node; 572 nodesToScore[nodesToScore.length] = node;
569 } 573 }
570 574
571 /* Turn all divs that don't have children block level elements into p's */ 575 /* Turn all divs that don't have children block level elements into p's */
572 if (node.tagName === "DIV") { 576 if (node.tagName === "DIV") {
573 if (node.innerHTML.search(readability.regexps.divToPElements) == = -1) { 577 if (node.innerHTML.search(readability.regexps.divToPElements) == = -1) {
574 var newNode = document.createElement('p'); 578 var newNode = document.createElement('p');
(...skipping 16 matching lines...) Expand all
591 if(childNode.nodeType === 3) { // Node.TEXT_NODE 595 if(childNode.nodeType === 3) { // Node.TEXT_NODE
592 var p = document.createElement('p'); 596 var p = document.createElement('p');
593 var t = document.createTextNode(childNode.nodeValue) ; 597 var t = document.createTextNode(childNode.nodeValue) ;
594 p.appendChild(t); 598 p.appendChild(t);
595 p.style.display = 'inline'; 599 p.style.display = 'inline';
596 p.className = 'readability-styled'; 600 p.className = 'readability-styled';
597 childNode.parentNode.replaceChild(p, childNode); 601 childNode.parentNode.replaceChild(p, childNode);
598 } 602 }
599 } 603 }
600 } 604 }
601 } 605 }
602 } 606 }
603 607
604 /** 608 /**
605 * Loop through all paragraphs, and assign a score to them based on how content-y they look. 609 * Loop through all paragraphs, and assign a score to them based on how content-y they look.
606 * Then add their score to their parent node. 610 * Then add their score to their parent node.
607 * 611 *
608 * A score is determined by things like number of commas, class names, e tc. Maybe eventually link density. 612 * A score is determined by things like number of commas, class names, e tc. Maybe eventually link density.
609 **/ 613 **/
610 var candidates = []; 614 var candidates = [];
611 for (var pt=0; pt < nodesToScore.length; pt+=1) { 615 for (var pt=0; pt < nodesToScore.length; pt+=1) {
(...skipping 21 matching lines...) Expand all
633 candidates.push(grandParentNode); 637 candidates.push(grandParentNode);
634 } 638 }
635 639
636 var contentScore = 0; 640 var contentScore = 0;
637 641
638 /* Add a point for the paragraph itself as a base. */ 642 /* Add a point for the paragraph itself as a base. */
639 contentScore+=1; 643 contentScore+=1;
640 644
641 /* Add points for any commas within this paragraph */ 645 /* Add points for any commas within this paragraph */
642 contentScore += innerText.split(',').length; 646 contentScore += innerText.split(',').length;
643 647
644 /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ 648 /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
645 contentScore += Math.min(Math.floor(innerText.length / 100), 3); 649 contentScore += Math.min(Math.floor(innerText.length / 100), 3);
646 650
647 /* Add the score to the parent. The grandparent gets half. */ 651 /* Add the score to the parent. The grandparent gets half. */
648 parentNode.readability.contentScore += contentScore; 652 parentNode.readability.contentScore += contentScore;
649 653
650 if(grandParentNode) { 654 if(grandParentNode) {
651 grandParentNode.readability.contentScore += contentScore/2; 655 grandParentNode.readability.contentScore += contentScore/2;
652 } 656 }
653 } 657 }
654 658
655 /** 659 /**
656 * After we've calculated scores, loop through all of the possible candi date nodes we found 660 * After we've calculated scores, loop through all of the possible candi date nodes we found
657 * and find the one with the highest score. 661 * and find the one with the highest score.
658 **/ 662 **/
659 var topCandidate = null; 663 var topCandidate = null;
660 for(var c=0, cl=candidates.length; c < cl; c+=1) 664 for(var c=0, cl=candidates.length; c < cl; c+=1)
661 { 665 {
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after
718 var contentBonus = 0; 722 var contentBonus = 0;
719 /* Give a bonus if sibling nodes and top candidates have the example same classname */ 723 /* Give a bonus if sibling nodes and top candidates have the example same classname */
720 if(siblingNode.className === topCandidate.className && topCandidate. className !== "") { 724 if(siblingNode.className === topCandidate.className && topCandidate. className !== "") {
721 contentBonus += topCandidate.readability.contentScore * 0.2; 725 contentBonus += topCandidate.readability.contentScore * 0.2;
722 } 726 }
723 727
724 if(typeof siblingNode.readability !== 'undefined' && (siblingNode.re adability.contentScore+contentBonus) >= siblingScoreThreshold) 728 if(typeof siblingNode.readability !== 'undefined' && (siblingNode.re adability.contentScore+contentBonus) >= siblingScoreThreshold)
725 { 729 {
726 append = true; 730 append = true;
727 } 731 }
728 732
729 if(siblingNode.nodeName === "P") { 733 if(siblingNode.nodeName === "P") {
730 var linkDensity = readability.getLinkDensity(siblingNode); 734 var linkDensity = readability.getLinkDensity(siblingNode);
731 var nodeContent = readability.getInnerText(siblingNode); 735 var nodeContent = readability.getInnerText(siblingNode);
732 var nodeLength = nodeContent.length; 736 var nodeLength = nodeContent.length;
733 737
734 if(nodeLength > 80 && linkDensity < 0.25) 738 if(nodeLength > 80 && linkDensity < 0.25)
735 { 739 {
736 append = true; 740 append = true;
737 } 741 }
738 else if(nodeLength < 80 && linkDensity === 0 && nodeContent.sear ch(/\.( |$)/) !== -1) 742 else if(nodeLength < 80 && linkDensity === 0 && nodeContent.sear ch(/\.( |$)/) !== -1)
739 { 743 {
740 append = true; 744 append = true;
741 } 745 }
742 } 746 }
743 747
744 if(append) { 748 if(append) {
745 dbg("Appending node: " + siblingNode); 749 dbg("Appending node: " + siblingNode);
746 750
747 var nodeToAppend = null; 751 var nodeToAppend = null;
748 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P ") { 752 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P ") {
749 /* We have a node that isn't a common block level element, l ike a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ 753 /* We have a node that isn't a common block level element, l ike a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
750 754
751 dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.'); 755 dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');
752 nodeToAppend = document.createElement("DIV"); 756 nodeToAppend = document.createElement("DIV");
753 try { 757 try {
754 nodeToAppend.id = siblingNode.id; 758 nodeToAppend.id = siblingNode.id;
755 readability.moveNodeInnards(siblingNode, nodeToAppend); 759 readability.moveNodeInnards(siblingNode, nodeToAppend);
756 } 760 }
757 catch(er) { 761 catch(er) {
758 dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original."); 762 dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");
759 nodeToAppend = siblingNode; 763 nodeToAppend = siblingNode;
760 s-=1; 764 s-=1;
761 sl-=1; 765 sl-=1;
762 } 766 }
763 } else { 767 } else {
764 nodeToAppend = siblingNode; 768 nodeToAppend = siblingNode;
765 s-=1; 769 s-=1;
766 sl-=1; 770 sl-=1;
767 } 771 }
768 772
769 /* To ensure a node does not interfere with readability styles, remove its classnames */ 773 /* To ensure a node does not interfere with readability styles, remove its classnames */
770 nodeToAppend.className = ""; 774 nodeToAppend.className = "";
771 775
772 /* Append sibling and subtract from our list because it removes the node when you append to another node */ 776 /* Append sibling and subtract from our list because it removes the node when you append to another node */
773 articleContent.appendChild(nodeToAppend); 777 articleContent.appendChild(nodeToAppend);
774 } 778 }
775 } 779 }
776 780
777 /** 781 /**
778 * So we have all of the content that we need. Now we clean it up for pr esentation. 782 * So we have all of the content that we need. Now we clean it up for pr esentation.
779 **/ 783 **/
780 readability.distilledArticleContent = articleContent.cloneNode(true); 784 readability.distilledArticleContent = articleContent.cloneNode(true);
781 //readability.prepArticle(articleContent); 785 //readability.prepArticle(articleContent);
782 786
783 if (readability.curPageNum === 1) { 787 if (readability.curPageNum === 1) {
784 var newNode = document.createElement('div'); 788 var newNode = document.createElement('div');
785 newNode.id = "readability-page-1"; 789 newNode.id = "readability-page-1";
786 newNode.setAttribute("class", "page"); 790 newNode.setAttribute("class", "page");
787 readability.moveNodeInnards(articleContent, newNode); 791 readability.moveNodeInnards(articleContent, newNode);
788 articleContent.appendChild(newNode); 792 articleContent.appendChild(newNode);
789 } 793 }
790 794
791 /** 795 /**
792 * Now that we've gone through the full algorithm, check to see if we go t any meaningful content. 796 * Now that we've gone through the full algorithm, check to see if we go t any meaningful content.
793 * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher 797 * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
794 * likelihood of finding the content, and the sieve approach gives us a higher likelihood of 798 * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
795 * finding the -right- content. 799 * finding the -right- content.
796 **/ 800 **/
797 if(readability.getInnerText(articleContent, false).length < 250) { 801 if(readability.getInnerText(articleContent, false).length < 250) {
798 if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) { 802 if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) {
799 readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS); 803 readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS);
800 return readability.grabArticle(document.body); 804 return readability.grabArticle(document.body);
801 } 805 }
802 else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { 806 else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
803 readability.removeFlag(readability.FLAG_WEIGHT_CLASSES); 807 readability.removeFlag(readability.FLAG_WEIGHT_CLASSES);
804 return readability.grabArticle(document.body); 808 return readability.grabArticle(document.body);
805 } 809 }
806 else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONAL LY)) { 810 else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONAL LY)) {
807 readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY); 811 readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY);
808 return readability.grabArticle(document.body); 812 return readability.grabArticle(document.body);
809 } else { 813 } else {
810 return null; 814 return null;
811 } 815 }
812 } 816 }
813 817
814 return articleContent; 818 return articleContent;
815 }, 819 },
816 820
817 /** 821 /**
818 * Removes script tags from the document. 822 * Removes script tags from the document.
819 * 823 *
820 * @param Element 824 * @param Element
821 **/ 825 **/
822 removeScripts: function (doc) { 826 removeScripts: function (doc) {
823 var scripts = doc.getElementsByTagName('script'); 827 var scripts = doc.getElementsByTagName('script');
824 for(var i = scripts.length-1; i >= 0; i-=1) 828 for(var i = scripts.length-1; i >= 0; i-=1)
825 { 829 {
826 if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf ('readability') === -1 && scripts[i].src.indexOf('typekit') === -1)) 830 if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf ('readability') === -1 && scripts[i].src.indexOf('typekit') === -1))
827 { 831 {
828 scripts[i].nodeValue=""; 832 scripts[i].nodeValue="";
829 scripts[i].removeAttribute('src'); 833 scripts[i].removeAttribute('src');
830 if (scripts[i].parentNode) { 834 if (scripts[i].parentNode) {
831 scripts[i].parentNode.removeChild(scripts[i]); 835 scripts[i].parentNode.removeChild(scripts[i]);
832 } 836 }
833 } 837 }
834 } 838 }
835 }, 839 },
836 840
837 /** 841 /**
838 * Get the inner text of a node - cross browser compatibly. 842 * Get the inner text of a node - cross browser compatibly.
839 * This also strips out any excess whitespace to be found. 843 * This also strips out any excess whitespace to be found.
840 * 844 *
841 * @param Element 845 * @param Element
842 * @return string 846 * @return string
843 **/ 847 **/
844 getInnerText: function (e, normalizeSpaces) { 848 getInnerText: function (e, normalizeSpaces) {
845 var textContent = ""; 849 var textContent = "";
846 850
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after
889 893
890 // Remove any root styles, if we're able. 894 // Remove any root styles, if we're able.
891 if(typeof e.removeAttribute === 'function' && e.className !== 'readabili ty-styled') { 895 if(typeof e.removeAttribute === 'function' && e.className !== 'readabili ty-styled') {
892 e.removeAttribute('style'); } 896 e.removeAttribute('style'); }
893 897
894 // Go until there are no more child nodes 898 // Go until there are no more child nodes
895 while ( cur !== null ) { 899 while ( cur !== null ) {
896 if ( cur.nodeType === 1 ) { 900 if ( cur.nodeType === 1 ) {
897 // Remove style attribute(s) : 901 // Remove style attribute(s) :
898 if(cur.className !== "readability-styled") { 902 if(cur.className !== "readability-styled") {
899 cur.removeAttribute("style"); 903 cur.removeAttribute("style");
900 } 904 }
901 readability.cleanStyles( cur ); 905 readability.cleanStyles( cur );
902 } 906 }
903 cur = cur.nextSibling; 907 cur = cur.nextSibling;
904 } 908 }
905 }, 909 },
906 910
907 /** 911 /**
908 * Get the density of links as a percentage of the content 912 * Get the density of links as a percentage of the content
909 * This is the amount of text that is inside a link divided by the total text in the node. 913 * This is the amount of text that is inside a link divided by the total text in the node.
910 * 914 *
911 * @param Element 915 * @param Element
912 * @return number (float) 916 * @return number (float)
913 **/ 917 **/
914 getLinkDensity: function (e) { 918 getLinkDensity: function (e) {
915 var links = e.getElementsByTagName("a"); 919 var links = e.getElementsByTagName("a");
916 var textLength = readability.getInnerText(e).length; 920 var textLength = readability.getInnerText(e).length;
917 var linkLength = 0; 921 var linkLength = 0;
918 for(var i=0, il=links.length; i<il;i+=1) 922 for(var i=0, il=links.length; i<il;i+=1)
919 { 923 {
920 linkLength += readability.getInnerText(links[i]).length; 924 linkLength += readability.getInnerText(links[i]).length;
921 } 925 }
922 926
923 return linkLength / textLength; 927 return linkLength / textLength;
924 }, 928 },
925 929
926 /** 930 /**
927 * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness. 931 * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.
928 * 932 *
929 * @author Dan Lacy 933 * @author Dan Lacy
930 * @return string the base url 934 * @return string the base url
931 **/ 935 **/
932 findBaseUrl: function () { 936 findBaseUrl: function () {
933 var noUrlParams = window.location.pathname.split("?")[0], 937 var noUrlParams = window.location.pathname.split("?")[0],
934 urlSlashes = noUrlParams.split("/").reverse(), 938 urlSlashes = noUrlParams.split("/").reverse(),
935 cleanedSegments = [], 939 cleanedSegments = [],
936 possibleType = ""; 940 possibleType = "";
937 941
938 for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) { 942 for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) {
939 var segment = urlSlashes[i]; 943 var segment = urlSlashes[i];
940 944
941 // Split off and save anything that looks like a file type. 945 // Split off and save anything that looks like a file type.
942 if (segment.indexOf(".") !== -1) { 946 if (segment.indexOf(".") !== -1) {
943 possibleType = segment.split(".")[1]; 947 possibleType = segment.split(".")[1];
944 948
945 /* If the type isn't alpha-only, it's probably not actually a fi le extension. */ 949 /* If the type isn't alpha-only, it's probably not actually a fi le extension. */
946 if(!possibleType.match(/[^a-zA-Z]/)) { 950 if(!possibleType.match(/[^a-zA-Z]/)) {
947 segment = segment.split(".")[0]; 951 segment = segment.split(".")[0];
948 } 952 }
949 } 953 }
950 954
951 /** 955 /**
952 * EW-CMS specific segment replacement. Ugly. 956 * EW-CMS specific segment replacement. Ugly.
953 * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.htm l 957 * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.htm l
954 **/ 958 **/
955 if(segment.indexOf(',00') !== -1) { 959 if(segment.indexOf(',00') !== -1) {
956 segment = segment.replace(',00', ''); 960 segment = segment.replace(',00', '');
957 } 961 }
958 962
959 // If our first or second segment has anything looking like a page n umber, remove it. 963 // If our first or second segment has anything looking like a page n umber, remove it.
960 if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) { 964 if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) {
961 segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, " "); 965 segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, " ");
962 } 966 }
963 967
964 968
965 var del = false; 969 var del = false;
966 970
967 /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */ 971 /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */
968 if (i < 2 && segment.match(/^\d{1,2}$/)) { 972 if (i < 2 && segment.match(/^\d{1,2}$/)) {
969 del = true; 973 del = true;
970 } 974 }
971 975
972 /* If this is the first segment and it's just "index", remove it. */ 976 /* If this is the first segment and it's just "index", remove it. */
973 if(i === 0 && segment.toLowerCase() === "index") { 977 if(i === 0 && segment.toLowerCase() === "index") {
974 del = true; 978 del = true;
975 } 979 }
976 980
977 981
978 /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */ 982 /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */
979 if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) { 983 if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) {
980 del = true; 984 del = true;
981 } 985 }
982 986
983 /* If it's not marked for deletion, push it to cleanedSegments. */ 987 /* If it's not marked for deletion, push it to cleanedSegments. */
984 if (!del) { 988 if (!del) {
985 cleanedSegments.push(segment); 989 cleanedSegments.push(segment);
986 } 990 }
987 } 991 }
988 992
989 // This is our final, cleaned, base article URL. 993 // This is our final, cleaned, base article URL.
990 return window.location.protocol + "//" + window.location.host + cleanedS egments.reverse().join("/"); 994 return window.location.protocol + "//" + window.location.host + cleanedS egments.reverse().join("/");
991 }, 995 },
992 996
993 /** 997 /**
994 * Look for any paging links that may occur within the document. 998 * Look for any paging links that may occur within the document.
995 * 999 *
996 * @param body 1000 * @param body
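997 * @return string|null the href of the most likely next page, or null if none was found 1001 * @return string|null the href of the most likely next page, or null if none was found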
998 **/ 1002 **/
999 findNextPageLink: function (elem) { 1003 findNextPageLink: function (elem) {
1000 var possiblePages = {}, 1004 var possiblePages = {},
1001 allLinks = elem.getElementsByTagName('a'), 1005 allLinks = elem.getElementsByTagName('a'),
1002 articleBaseUrl = readability.findBaseUrl(); 1006 articleBaseUrl = readability.findBaseUrl();
1003 1007
1004 /** 1008 /**
1005 * Loop through all links, looking for hints that they may be next-page links. 1009 * Loop through all links, looking for hints that they may be next-page links.
1006 * Things like having "page" in their textContent, className or id, or b eing a child 1010 * Things like having "page" in their textContent, className or id, or b eing a child
1007 * of a node with a page-y className or id. 1011 * of a node with a page-y className or id.
1008 * 1012 *
1009 * Also possible: levenshtein distance? longest common subsequence? 1013 * Also possible: levenshtein distance? longest common subsequence?
1010 * 1014 *
1011 * After we do that, assign each page a score, and 1015 * After we do that, assign each page a score, and
1012 **/ 1016 **/
1013 for(var i = 0, il = allLinks.length; i < il; i+=1) { 1017 for(var i = 0, il = allLinks.length; i < il; i+=1) {
1014 var link = allLinks[i], 1018 var link = allLinks[i],
1015 linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, ' '); 1019 linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, ' ');
1016 1020
1017 /* If we've already seen this page, ignore it */ 1021 /* If we've already seen this page, ignore it */
1018 if(linkHref === "" || linkHref === articleBaseUrl || linkHref === wi ndow.location.href || linkHref in readability.parsedPages) { 1022 if(linkHref === "" || linkHref === articleBaseUrl || linkHref === wi ndow.location.href || linkHref in readability.parsedPages) {
1019 continue; 1023 continue;
1020 } 1024 }
1021 1025
1022 /* If it's on a different domain, skip it. */ 1026 /* If it's on a different domain, skip it. */
1023 if(window.location.host !== linkHref.split(/\/+/g)[1]) { 1027 if(window.location.host !== linkHref.split(/\/+/g)[1]) {
1024 continue; 1028 continue;
1025 } 1029 }
1026 1030
1027 var linkText = readability.getInnerText(link); 1031 var linkText = readability.getInnerText(link);
1028 1032
1029 /* If the linkText looks like it's not the next page, skip it. */ 1033 /* If the linkText looks like it's not the next page, skip it. */
1030 if(linkText.match(readability.regexps.extraneous) || linkText.length > 25) { 1034 if(linkText.match(readability.regexps.extraneous) || linkText.length > 25) {
1031 continue; 1035 continue;
1032 } 1036 }
1033 1037
1034 /* If the leftovers of the URL after removing the base URL don't con tain any digits, it's certainly not a next page link. */ 1038 /* If the leftovers of the URL after removing the base URL don't con tain any digits, it's certainly not a next page link. */
1035 var linkHrefLeftover = linkHref.replace(articleBaseUrl, ''); 1039 var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');
1036 if(!linkHrefLeftover.match(/\d/)) { 1040 if(!linkHrefLeftover.match(/\d/)) {
1037 continue; 1041 continue;
1038 } 1042 }
1039 1043
1040 if(!(linkHref in possiblePages)) { 1044 if(!(linkHref in possiblePages)) {
1041 possiblePages[linkHref] = {"score": 0, "linkText": linkText, "hr ef": linkHref}; 1045 possiblePages[linkHref] = {"score": 0, "linkText": linkText, "hr ef": linkHref};
1042 } else { 1046 } else {
1043 possiblePages[linkHref].linkText += ' | ' + linkText; 1047 possiblePages[linkHref].linkText += ' | ' + linkText;
1044 } 1048 }
1045 1049
1046 var linkObj = possiblePages[linkHref]; 1050 var linkObj = possiblePages[linkHref];
1047 1051
1048 /** 1052 /**
1049 * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower. 1053 * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower.
1050 * Example: http://www.actionscript.org/resources/articles/745/1/Jav aScript-and-VBScript-Injection-in-ActionScript-3/Page1.html 1054 * Example: http://www.actionscript.org/resources/articles/745/1/Jav aScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
1051 **/ 1055 **/
1052 if(linkHref.indexOf(articleBaseUrl) !== 0) { 1056 if(linkHref.indexOf(articleBaseUrl) !== 0) {
1053 linkObj.score -= 25; 1057 linkObj.score -= 25;
1054 } 1058 }
1055 1059
1056 var linkData = linkText + ' ' + link.className + ' ' + link.id; 1060 var linkData = linkText + ' ' + link.className + ' ' + link.id;
1057 if(linkData.match(readability.regexps.nextLink)) { 1061 if(linkData.match(readability.regexps.nextLink)) {
1058 linkObj.score += 50; 1062 linkObj.score += 50;
1059 } 1063 }
1060 if(linkData.match(/pag(e|ing|inat)/i)) { 1064 if(linkData.match(/pag(e|ing|inat)/i)) {
1061 linkObj.score += 25; 1065 linkObj.score += 25;
1062 } 1066 }
1063 if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text, 1067 if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text,
1064 /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */ 1068 /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */
1065 if(!linkObj.linkText.match(readability.regexps.nextLink)) { 1069 if(!linkObj.linkText.match(readability.regexps.nextLink)) {
1066 linkObj.score -= 65; 1070 linkObj.score -= 65;
1067 } 1071 }
1068 } 1072 }
1069 if(linkData.match(readability.regexps.negative) || linkData.match(re adability.regexps.extraneous)) { 1073 if(linkData.match(readability.regexps.negative) || linkData.match(re adability.regexps.extraneous)) {
1070 linkObj.score -= 50; 1074 linkObj.score -= 50;
1071 } 1075 }
1072 if(linkData.match(readability.regexps.prevLink)) { 1076 if(linkData.match(readability.regexps.prevLink)) {
1073 linkObj.score -= 200; 1077 linkObj.score -= 200;
1074 } 1078 }
1075 1079
1076 /* If a parentNode contains page or paging or paginat */ 1080 /* If a parentNode contains page or paging or paginat */
1077 var parentNode = link.parentNode, 1081 var parentNode = link.parentNode,
1078 positiveNodeMatch = false, 1082 positiveNodeMatch = false,
1079 negativeNodeMatch = false; 1083 negativeNodeMatch = false;
1080 while(parentNode) { 1084 while(parentNode) {
1081 var parentNodeClassAndId = parentNode.className + ' ' + parentNo de.id; 1085 var parentNodeClassAndId = parentNode.className + ' ' + parentNo de.id;
1082 if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(/pag(e|ing|inat)/i)) { 1086 if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(/pag(e|ing|inat)/i)) {
1083 positiveNodeMatch = true; 1087 positiveNodeMatch = true;
1084 linkObj.score += 25; 1088 linkObj.score += 25;
1085 } 1089 }
1086 if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(readability.regexps.negative)) { 1090 if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(readability.regexps.negative)) {
1087 /* If this is just something like "footer", give it a negati ve. If it's something like "body-and-footer", leave it be. */ 1091 /* If this is just something like "footer", give it a negati ve. If it's something like "body-and-footer", leave it be. */
1088 if(!parentNodeClassAndId.match(readability.regexps.positive) ) { 1092 if(!parentNodeClassAndId.match(readability.regexps.positive) ) {
1089 linkObj.score -= 25; 1093 linkObj.score -= 25;
1090 negativeNodeMatch = true; 1094 negativeNodeMatch = true;
1091 } 1095 }
1092 } 1096 }
1093 1097
1094 parentNode = parentNode.parentNode; 1098 parentNode = parentNode.parentNode;
1095 } 1099 }
1096 1100
1097 /** 1101 /**
1098 * If the URL looks like it has paging in it, add to the score. 1102 * If the URL looks like it has paging in it, add to the score.
1099 * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34 1103 * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34
1100 **/ 1104 **/
1101 if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) { 1105 if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) {
1102 linkObj.score += 25; 1106 linkObj.score += 25;
1103 } 1107 }
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
1145 topPage = possiblePages[page]; 1149 topPage = possiblePages[page];
1146 } 1150 }
1147 } 1151 }
1148 } 1152 }
1149 1153
1150 if(topPage) { 1154 if(topPage) {
1151 var nextHref = topPage.href.replace(/\/$/,''); 1155 var nextHref = topPage.href.replace(/\/$/,'');
1152 1156
1153 dbg('NEXT PAGE IS ' + nextHref); 1157 dbg('NEXT PAGE IS ' + nextHref);
1154 readability.parsedPages[nextHref] = true; 1158 readability.parsedPages[nextHref] = true;
1155 return nextHref; 1159 return nextHref;
1156 } 1160 }
1157 else { 1161 else {
1158 return null; 1162 return null;
1159 } 1163 }
1160 }, 1164 },
1161 1165
1162 createLinkDiv: function(link) { 1166 createLinkDiv: function(link) {
1163 var divNode = document.createElement('div'); 1167 var divNode = document.createElement('div');
1164 var aNode = document.createElement('a'); 1168 var aNode = document.createElement('a');
1165 var tNode = document.createTextNode('View Next Page'); 1169 var tNode = document.createTextNode('View Next Page');
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after
1197 } 1201 }
1198 else { 1202 else {
1199 if (options.error) { options.error(request); } 1203 if (options.error) { options.error(request); }
1200 } 1204 }
1201 } 1205 }
1202 } 1206 }
1203 1207
1204 if (typeof options === 'undefined') { options = {}; } 1208 if (typeof options === 'undefined') { options = {}; }
1205 1209
1206 request.onreadystatechange = respondToReadyState; 1210 request.onreadystatechange = respondToReadyState;
1207 1211
1208 request.open('get', url, true); 1212 request.open('get', url, true);
1209 request.setRequestHeader('Accept', 'text/html'); 1213 request.setRequestHeader('Accept', 'text/html');
1210 1214
1211 try { 1215 try {
1212 request.send(options.postBody); 1216 request.send(options.postBody);
1213 } 1217 }
1214 catch (e) { 1218 catch (e) {
1215 if (options.error) { options.error(); } 1219 if (options.error) { options.error(); }
1216 } 1220 }
1217 1221
(...skipping 14 matching lines...) Expand all
1232 articlePage.innerHTML = '<p class="page-separator" title="Page ' + reada bility.curPageNum + '">&sect;</p>'; 1236 articlePage.innerHTML = '<p class="page-separator" title="Page ' + reada bility.curPageNum + '">&sect;</p>';
1233 1237
1234 document.getElementById("readability-content").appendChild(articlePage); 1238 document.getElementById("readability-content").appendChild(articlePage);
1235 1239
1236 if(readability.curPageNum > readability.maxPages) { 1240 if(readability.curPageNum > readability.maxPages) {
1237 var linkDiv = readability.createLinkDiv(nextPageLink); 1241 var linkDiv = readability.createLinkDiv(nextPageLink);
1238 1242
1239 articlePage.appendChild(linkDiv); 1243 articlePage.appendChild(linkDiv);
1240 return; 1244 return;
1241 } 1245 }
1242 1246
1243 /** 1247 /**
1244 * Now that we've built the article page DOM element, get the page conte nt 1248 * Now that we've built the article page DOM element, get the page conte nt
1245 * asynchronously and load the cleaned content into the div we created f or it. 1249 * asynchronously and load the cleaned content into the div we created f or it.
1246 **/ 1250 **/
1247 (function(pageUrl, thisPage) { 1251 (function(pageUrl, thisPage) {
1248 readability.ajax(pageUrl, { 1252 readability.ajax(pageUrl, {
1249 success: function(r) { 1253 success: function(r) {
1250 1254
1251 /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */ 1255 /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */
1252 var eTag = r.getResponseHeader('ETag'); 1256 var eTag = r.getResponseHeader('ETag');
1253 if(eTag) { 1257 if(eTag) {
1254 if(eTag in readability.pageETags) { 1258 if(eTag in readability.pageETags) {
1255 dbg("Exact duplicate page found via ETag. Aborting." ); 1259 dbg("Exact duplicate page found via ETag. Aborting." );
1256 articlePage.style.display = 'none'; 1260 articlePage.style.display = 'none';
1257 return; 1261 return;
1258 } else { 1262 } else {
1259 readability.pageETags[eTag] = 1; 1263 readability.pageETags[eTag] = 1;
1260 } 1264 }
1261 } 1265 }
1262 1266
1263 // TODO: this ends up doubling up page numbers on NYTimes ar ticles. Need to generically parse those away. 1267 // TODO: this ends up doubling up page numbers on NYTimes ar ticles. Need to generically parse those away.
1264 var page = document.createElement("DIV"); 1268 var page = document.createElement("DIV");
1265 1269
1266 /** 1270 /**
1267 * Do some preprocessing to our HTML to make it ready for ap pending. 1271 * Do some preprocessing to our HTML to make it ready for ap pending.
1268 * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript. 1272 * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript.
1269 * • Turn any noscript tags into divs so that we can parse t hem. This allows us to find any next page links hidden via javascript. 1273 * • Turn any noscript tags into divs so that we can parse t hem. This allows us to find any next page links hidden via javascript.
1270 * • Turn all double br's into p's - was handled by prepDocu ment in the original view. 1274 * • Turn all double br's into p's - was handled by prepDocu ment in the original view.
(...skipping 30 matching lines...) Expand all
1301 for(var i=1; i <= readability.curPageNum; i+=1) { 1305 for(var i=1; i <= readability.curPageNum; i+=1) {
1302 var rPage = document.getElementById('readability-pag e-' + i); 1306 var rPage = document.getElementById('readability-pag e-' + i);
1303 if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML ) !== -1) { 1307 if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML ) !== -1) {
1304 dbg('Duplicate of page ' + i + ' - skipping.'); 1308 dbg('Duplicate of page ' + i + ' - skipping.');
1305 articlePage.style.display = 'none'; 1309 articlePage.style.display = 'none';
1306 readability.parsedPages[pageUrl] = true; 1310 readability.parsedPages[pageUrl] = true;
1307 return; 1311 return;
1308 } 1312 }
1309 } 1313 }
1310 } 1314 }
1311 1315
1312 readability.removeScripts(content); 1316 readability.removeScripts(content);
1313 1317
1314 readability.moveNodeInnards(content, thisPage); 1318 readability.moveNodeInnards(content, thisPage);
1315 1319
1316 /** 1320 /**
1317 * After the page has rendered, post process the content. Th is delay is necessary because, 1321 * After the page has rendered, post process the content. Th is delay is necessary because,
1318 * in webkit at least, offsetWidth is not set in time to det ermine image width. We have to 1322 * in webkit at least, offsetWidth is not set in time to det ermine image width. We have to
1319 * wait a little bit for reflow to finish before we can fix floating images. 1323 * wait a little bit for reflow to finish before we can fix floating images.
1320 **/ 1324 **/
1321 window.setTimeout( 1325 window.setTimeout(
1322 function() { readability.postProcessContent(thisPage); } , 1326 function() { readability.postProcessContent(thisPage); } ,
1323 500 1327 500
1324 ); 1328 );
1325 1329
1326 if(nextPageLink) { 1330 if(nextPageLink) {
1327 readability.appendNextPage(nextPageLink); 1331 readability.appendNextPage(nextPageLink);
1328 } 1332 }
1329 } 1333 }
1330 }); 1334 });
1331 }(nextPageLink, articlePage)); 1335 }(nextPageLink, articlePage));
1332 }, 1336 },
1333 1337
1334 /** 1338 /**
1335 * Get an elements class/id weight. Uses regular expressions to tell if this 1339 * Get an elements class/id weight. Uses regular expressions to tell if this
1336 * element looks good or bad. 1340 * element looks good or bad.
1337 * 1341 *
1338 * @param Element 1342 * @param Element
1339 * @return number (Integer) 1343 * @return number (Integer)
1340 **/ 1344 **/
1341 getClassWeight: function (e) { 1345 getClassWeight: function (e) {
1342 if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { 1346 if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
1343 return 0; 1347 return 0;
1344 } 1348 }
1345 1349
(...skipping 29 matching lines...) Expand all
1375 /** 1379 /**
1376 * Remove extraneous break tags from a node. 1380 * Remove extraneous break tags from a node.
1377 * 1381 *
1378 * @param Element 1382 * @param Element
1379 * @return void 1383 * @return void
1380 **/ 1384 **/
1381 killBreaks: function (e) { 1385 killBreaks: function (e) {
1382 var allElements = e.getElementsByTagName('*'); 1386 var allElements = e.getElementsByTagName('*');
1383 while (i < allElements.length) { 1387 while (i < allElements.length) {
1384 readability.deleteExtraBreaks(allElements[i]); 1388 readability.deleteExtraBreaks(allElements[i]);
1385 i++; 1389 i++;
1386 } 1390 }
1387 }, 1391 },
1388 1392
1389 /** 1393 /**
1390 * Clean a node of all elements of type "tag". 1394 * Clean a node of all elements of type "tag".
1391 * (Unless it's a youtube/vimeo video. People love movies.) 1395 * (Unless it's a youtube/vimeo video. People love movies.)
1392 * 1396 *
1393 * @param Element 1397 * @param Element
1394 * @param string tag to clean 1398 * @param string tag to clean
1395 * @return void 1399 * @return void
1396 **/ 1400 **/
1397 clean: function (e, tag) { 1401 clean: function (e, tag) {
1398 var targetList = e.getElementsByTagName( tag ); 1402 var targetList = e.getElementsByTagName( tag );
1399 var isEmbed = (tag === 'object' || tag === 'embed'); 1403 var isEmbed = (tag === 'object' || tag === 'embed');
1400 1404
1401 for (var y=targetList.length-1; y >= 0; y-=1) { 1405 for (var y=targetList.length-1; y >= 0; y-=1) {
1402 /* Allow youtube and vimeo videos through as people usually want to see those. */ 1406 /* Allow youtube and vimeo videos through as people usually want to see those. */
1403 if(isEmbed) { 1407 if(isEmbed) {
1404 var attributeValues = ""; 1408 var attributeValues = "";
1405 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) { 1409 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) {
1406 attributeValues += targetList[y].attributes[i].value + '|'; 1410 attributeValues += targetList[y].attributes[i].value + '|';
1407 } 1411 }
1408 1412
1409 /* First, check the elements attributes to see if any of them co ntain youtube or vimeo */ 1413 /* First, check the elements attributes to see if any of them co ntain youtube or vimeo */
1410 if (attributeValues.search(readability.regexps.videos) !== -1) { 1414 if (attributeValues.search(readability.regexps.videos) !== -1) {
1411 continue; 1415 continue;
1412 } 1416 }
1413 1417
1414 /* Then check the elements inside this element for the same. */ 1418 /* Then check the elements inside this element for the same. */
1415 if (targetList[y].innerHTML.search(readability.regexps.videos) ! == -1) { 1419 if (targetList[y].innerHTML.search(readability.regexps.videos) ! == -1) {
1416 continue; 1420 continue;
1417 } 1421 }
1418 1422
1419 } 1423 }
1420 1424
1421 targetList[y].parentNode.removeChild(targetList[y]); 1425 targetList[y].parentNode.removeChild(targetList[y]);
1422 } 1426 }
1423 }, 1427 },
1424 1428
1425 /** 1429 /**
1426 * Clean an element of all tags of type "tag" if they look fishy. 1430 * Clean an element of all tags of type "tag" if they look fishy.
1427 * "Fishy" is an algorithm based on content length, classnames, link density , number of images & embeds, etc. 1431 * "Fishy" is an algorithm based on content length, classnames, link density , number of images & embeds, etc.
1428 * 1432 *
1429 * @return void 1433 * @return void
1430 **/ 1434 **/
1431 cleanConditionally: function (e, tag) { 1435 cleanConditionally: function (e, tag) {
1432 1436
1433 if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) { 1437 if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {
1434 return; 1438 return;
1435 } 1439 }
1436 1440
1437 var tagsList = e.getElementsByTagName(tag); 1441 var tagsList = e.getElementsByTagName(tag);
1438 var curTagsLength = tagsList.length; 1442 var curTagsLength = tagsList.length;
1439 1443
1440 /** 1444 /**
1441 * Gather counts for other typical elements embedded within. 1445 * Gather counts for other typical elements embedded within.
1442 * Traverse backwards so we can remove nodes at the same time without ef fecting the traversal. 1446 * Traverse backwards so we can remove nodes at the same time without ef fecting the traversal.
1443 * 1447 *
1444 * TODO: Consider taking into account original contentScore here. 1448 * TODO: Consider taking into account original contentScore here.
1445 **/ 1449 **/
1446 for (var i=curTagsLength-1; i >= 0; i-=1) { 1450 for (var i=curTagsLength-1; i >= 0; i-=1) {
1447 var weight = readability.getClassWeight(tagsList[i]); 1451 var weight = readability.getClassWeight(tagsList[i]);
1448 var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0; 1452 var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0;
1449 1453
1450 dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].cla ssName + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'unde fined') ? (" with score " + tagsList[i].readability.contentScore) : '')); 1454 dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].cla ssName + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'unde fined') ? (" with score " + tagsList[i].readability.contentScore) : ''));
1451 1455
1452 if(weight+contentScore < 0) 1456 if(weight+contentScore < 0)
1453 { 1457 {
1454 tagsList[i].parentNode.removeChild(tagsList[i]); 1458 tagsList[i].parentNode.removeChild(tagsList[i]);
1455 } 1459 }
1456 else if ( readability.getCharCount(tagsList[i],',') < 10) { 1460 else if ( readability.getCharCount(tagsList[i],',') < 10) {
1457 /** 1461 /**
1458 * If there are not very many commas, and the number of 1462 * If there are not very many commas, and the number of
1459 * non-paragraph elements is more than paragraphs or other omino us signs, remove the element. 1463 * non-paragraph elements is more than paragraphs or other omino us signs, remove the element.
1460 **/ 1464 **/
1461 var p = tagsList[i].getElementsByTagName("p").length; 1465 var p = tagsList[i].getElementsByTagName("p").length;
1462 var img = tagsList[i].getElementsByTagName("img").length; 1466 var img = tagsList[i].getElementsByTagName("img").length;
1463 var li = tagsList[i].getElementsByTagName("li").length-100; 1467 var li = tagsList[i].getElementsByTagName("li").length-100;
1464 var input = tagsList[i].getElementsByTagName("input").length; 1468 var input = tagsList[i].getElementsByTagName("input").length;
1465 1469
1466 var embedCount = 0; 1470 var embedCount = 0;
1467 var embeds = tagsList[i].getElementsByTagName("embed"); 1471 var embeds = tagsList[i].getElementsByTagName("embed");
1468 for(var ei=0,il=embeds.length; ei < il; ei+=1) { 1472 for(var ei=0,il=embeds.length; ei < il; ei+=1) {
1469 if (embeds[ei].src.search(readability.regexps.videos) === -1 ) { 1473 if (embeds[ei].src.search(readability.regexps.videos) === -1 ) {
1470 embedCount+=1; 1474 embedCount+=1;
1471 } 1475 }
1472 } 1476 }
1473 1477
1474 var linkDensity = readability.getLinkDensity(tagsList[i]); 1478 var linkDensity = readability.getLinkDensity(tagsList[i]);
1475 var contentLength = readability.getInnerText(tagsList[i]).length ; 1479 var contentLength = readability.getInnerText(tagsList[i]).length ;
1476 var toRemove = false; 1480 var toRemove = false;
1477 1481
1478 if ( img > p ) { 1482 if ( img > p ) {
1479 toRemove = true; 1483 toRemove = true;
1480 } else if(li > p && tag !== "ul" && tag !== "ol") { 1484 } else if(li > p && tag !== "ul" && tag !== "ol") {
1481 toRemove = true; 1485 toRemove = true;
1482 } else if( input > Math.floor(p/3) ) { 1486 } else if( input > Math.floor(p/3) ) {
1483 toRemove = true; 1487 toRemove = true;
1484 } else if(contentLength < 25 && (img === 0 || img > 2) ) { 1488 } else if(contentLength < 25 && (img === 0 || img > 2) ) {
1485 toRemove = true; 1489 toRemove = true;
1486 } else if(weight < 25 && linkDensity > 0.2) { 1490 } else if(weight < 25 && linkDensity > 0.2) {
1487 toRemove = true; 1491 toRemove = true;
1488 } else if(weight >= 25 && linkDensity > 0.5) { 1492 } else if(weight >= 25 && linkDensity > 0.5) {
1489 toRemove = true; 1493 toRemove = true;
1490 } else if((embedCount === 1 && contentLength < 75) || embedCount > 1) { 1494 } else if((embedCount === 1 && contentLength < 75) || embedCount > 1) {
1491 toRemove = true; 1495 toRemove = true;
1492 } 1496 }
1493 1497
(...skipping 21 matching lines...) Expand all
1515 } 1519 }
1516 }, 1520 },
1517 1521
1518 flagIsActive: function(flag) { 1522 flagIsActive: function(flag) {
1519 return (readability.flags & flag) > 0; 1523 return (readability.flags & flag) > 0;
1520 }, 1524 },
1521 1525
1522 addFlag: function(flag) { 1526 addFlag: function(flag) {
1523 readability.flags = readability.flags | flag; 1527 readability.flags = readability.flags | flag;
1524 }, 1528 },
1525 1529
1526 removeFlag: function(flag) { 1530 removeFlag: function(flag) {
1527 readability.flags = readability.flags & ~flag; 1531 readability.flags = readability.flags & ~flag;
1528 }, 1532 },
1529 1533
1530 // Removes the children of |src| and appends them to |dest|. 1534 // Removes the children of |src| and appends them to |dest|.
1531 moveNodeInnards: function(src, dest) { 1535 moveNodeInnards: function(src, dest) {
1532 try { 1536 try {
1533 while (src.firstChild) { 1537 while (src.firstChild) {
1534 dest.appendChild(src.removeChild(src.firstChild)); 1538 dest.appendChild(src.removeChild(src.firstChild));
1535 } 1539 }
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after
1584 var lastBr = readability.isMultipleBr(node, false); 1588 var lastBr = readability.isMultipleBr(node, false);
1585 var ret = false; 1589 var ret = false;
1586 while (lastBr && lastBr != node) { 1590 while (lastBr && lastBr != node) {
1587 var toRemove = lastBr; 1591 var toRemove = lastBr;
1588 lastBr = lastBr.previousSibling; 1592 lastBr = lastBr.previousSibling;
1589 toRemove.parentNode.removeChild(toRemove); 1593 toRemove.parentNode.removeChild(toRemove);
1590 ret = true; 1594 ret = true;
1591 } 1595 }
1592 return ret; 1596 return ret;
1593 }, 1597 },
1594 1598
1595 // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a 1599 // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a
1596 // <P> node, and makes all next siblings of that pair children of <P>, up 1600 // <P> node, and makes all next siblings of that pair children of <P>, up
1597 // until the next pair of <BR> nodes is reached. 1601 // until the next pair of <BR> nodes is reached.
1598 replaceDoubleBrWithP: function(node) { 1602 replaceDoubleBrWithP: function(node) {
1599 // Check that we are starting with a BR. 1603 // Check that we are starting with a BR.
1600 var second = readability.isMultipleBr(node, true); 1604 var second = readability.isMultipleBr(node, true);
1601 if (!second) { 1605 if (!second) {
1602 return; 1606 return;
1603 } 1607 }
1604 // Make all next siblings of the second BR into children of a P. 1608 // Make all next siblings of the second BR into children of a P.
1605 var p = document.createElement('p'); 1609 var p = document.createElement('p');
1606 var curr = second.nextSibling; 1610 var curr = second.nextSibling;
1607 while (curr) { 1611 while (curr) {
1608 if (readability.isMultipleBr(curr, true)) { 1612 if (readability.isMultipleBr(curr, true)) {
1609 break; 1613 break;
1610 } 1614 }
1611 var next = curr.nextSibling; 1615 var next = curr.nextSibling;
1612 p.appendChild(curr.parentNode.removeChild(curr)); 1616 p.appendChild(curr.parentNode.removeChild(curr));
1613 curr = next; 1617 curr = next;
1614 } 1618 }
1615 var ret = curr; 1619 var ret = curr;
1616 1620
1617 // Remove all nodes between the first and second BR. 1621 // Remove all nodes between the first and second BR.
1618 curr = node.nextSibling; 1622 curr = node.nextSibling;
1619 while (curr && curr != second) { 1623 while (curr && curr != second) {
1620 var next = curr.nextSibling; 1624 var next = curr.nextSibling;
1621 curr.parentNode.removeChild(curr); 1625 curr.parentNode.removeChild(curr);
1622 curr = next; 1626 curr = next;
1623 } 1627 }
1624 // Remove the second BR. 1628 // Remove the second BR.
1625 second.parentNode.removeChild(second); 1629 second.parentNode.removeChild(second);
1626 // Replace the first BR with the P. 1630 // Replace the first BR with the P.
1627 node.parentNode.replaceChild(p, node); 1631 node.parentNode.replaceChild(p, node);
1628 1632
1629 return ret; 1633 return ret;
1630 }, 1634 },
1631 1635
1632 // Returns true if the NodeList contains a double <BR>. 1636 // Returns true if the NodeList contains a double <BR>.
1633 hasDoubleBr: function(nodeList) { 1637 hasDoubleBr: function(nodeList) {
1634 for (var i = 0; i < nodeList.length; nodeList++) { 1638 for (var i = 0; i < nodeList.length; nodeList++) {
1635 if (readability.isMultipleBr(nodeList[i], true)) { 1639 if (readability.isMultipleBr(nodeList[i], true)) {
1636 return true; 1640 return true;
1637 } 1641 }
1638 } 1642 }
1639 return false; 1643 return false;
1640 }, 1644 },
1641 1645
1642 // Replaces double <BR> tags with <P> tags. 1646 // Replaces double <BR> tags with <P> tags.
1643 replaceDoubleBrsWithPs: function(node) { 1647 replaceDoubleBrsWithPs: function(node) {
1644 var allElements = node.getElementsByTagName('BR'); 1648 var allElements = node.getElementsByTagName('BR');
1645 var node = null; 1649 var node = null;
1646 while (allElements && allElements.length > 0 && 1650 while (allElements && allElements.length > 0 &&
1647 readability.hasDoubleBr(allElements)) { 1651 readability.hasDoubleBr(allElements)) {
1648 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) { 1652 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) {
1649 var next = node; 1653 var next = node;
1650 while (next = readability.replaceDoubleBrWithP(next)); 1654 while (next = readability.replaceDoubleBrWithP(next));
1651 } 1655 }
1652 allElements = document.body.getElementsByTagName('BR'); 1656 allElements = document.body.getElementsByTagName('BR');
1653 } 1657 }
1654 }, 1658 },
1655 1659
1656 1660
1657 // Replaces a BR and the whitespace that follows it with a P. 1661 // Replaces a BR and the whitespace that follows it with a P.
1658 replaceBrWithP: function(node) { 1662 replaceBrWithP: function(node) {
1659 if (!readability.isBrNode(node)) { 1663 if (!readability.isBrNode(node)) {
1660 return; 1664 return;
1661 } 1665 }
1662 var p = document.createElement('p'); 1666 var p = document.createElement('p');
1663 var curr = node.nextSibling; 1667 var curr = node.nextSibling;
1664 while (curr && !isBrNode(curr)) { 1668 while (curr && !isBrNode(curr)) {
1665 var next = curr.nextSibling; 1669 var next = curr.nextSibling;
1666 if (readability.isWhitespaceNode(curr)) { 1670 if (readability.isWhitespaceNode(curr)) {
1667 curr.parentNode.removeChild(curr); 1671 curr.parentNode.removeChild(curr);
1668 } else { 1672 } else {
1669 p.appendChild(curr.parentNode.removeChild(curr)); 1673 p.appendChild(curr.parentNode.removeChild(curr));
1670 } 1674 }
1671 curr = next; 1675 curr = next;
1672 } 1676 }
1673 node.parentNode.replaceChild(p, node); 1677 node.parentNode.replaceChild(p, node);
1674 return curr; 1678 return curr;
1675 }, 1679 },
1676 1680
1677 // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> t ag 1681 // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> t ag
1678 // children of the <P>. 1682 // children of the <P>.
1679 replaceBrsWithPs: function(node) { 1683 replaceBrsWithPs: function(node) {
1680 var allElements = node.getElementsByTagName('BR'); 1684 var allElements = node.getElementsByTagName('BR');
1681 var node = null; 1685 var node = null;
1682 while (allElements && allElements.length > 0) { 1686 while (allElements && allElements.length > 0) {
1683 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) { 1687 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) {
1684 var next = node; 1688 var next = node;
1685 while (next = readability.replaceBrWithP(next)); 1689 while (next = readability.replaceBrWithP(next));
1686 } 1690 }
1687 allElements = document.body.getElementsByTagName('BR'); 1691 allElements = document.body.getElementsByTagName('BR');
1688 } 1692 }
1689 }, 1693 },
1690 1694
1691 // Replaces any tag with any other tag. 1695 // Replaces any tag with any other tag.
1692 replaceTagsWithTags: function(node, srcTag, destTag) { 1696 replaceTagsWithTags: function(node, srcTag, destTag) {
1693 var allElements = node.getElementsByTagName(srcTag); 1697 var allElements = node.getElementsByTagName(srcTag);
1694 for (var i = 0; i < allElements.length; i++) { 1698 for (var i = 0; i < allElements.length; i++) {
1695 var dest = document.createElement(destTag); 1699 var dest = document.createElement(destTag);
1696 readability.moveNodeInnards(allElements[i], dest); 1700 readability.moveNodeInnards(allElements[i], dest);
1697 node.replaceNode(dest, allElements[i]); 1701 allElements[i].parentNode.replaceChild(dest, allElements[i]);
1698 } 1702 }
1699 }, 1703 },
1700 1704
1701 // Replaces all <noscript> tags with <p> tags. 1705 // Replaces all <noscript> tags with <p> tags.
1702 replaceNoscriptsWithPs: function(node) { 1706 replaceNoscriptsWithPs: function(node) {
1703 readability.replaceTagsWithTags(node, 'noscript', 'p'); 1707 readability.replaceTagsWithTags(node, 'noscript', 'p');
1704 }, 1708 },
1705 1709
1706 // Replaces all <font> tags with <span> tags. 1710 // Replaces all <font> tags with <span> tags.
1707 replaceFontsWithSpans: function(node) { 1711 replaceFontsWithSpans: function(node) {
1708 readability.replaceTagsWithTags(node, 'font', 'span'); 1712 readability.replaceTagsWithTags(node, 'font', 'span');
1709 }, 1713 },
1710 1714
1711 // Returns a list of image URLs in the distilled article. 1715 // Returns a list of image URLs in the distilled article.
1712 getImages : function() { 1716 getImages : function() {
1713 var images = document.getElementsByTagName('img'); 1717 var images = document.getElementsByTagName('img');
1714 var result = new Array(images.length); 1718 var result = new Array(images.length);
1715 dbg("Number of images: " + images.length); 1719 dbg("Number of images: " + images.length);
1716 for(i = 0; i < images.length; i++) { 1720 for(i = 0; i < images.length; i++) {
1717 result[i] = images[i].src; 1721 result[i] = images[i].src;
1718 dbg("Image: " + result[i]); 1722 dbg("Image: " + result[i]);
1719 } 1723 }
1720 return result; 1724 return result;
1721 }, 1725 },
1722 1726
1723 // Returns the distilled article HTML from the page(s). 1727 // Returns the distilled article HTML from the page(s).
1724 getDistilledArticleHTML : function() { 1728 getDistilledArticleHTML : function() {
1725 return readability.distilledHTML; 1729 return readability.distilledHTML;
1730 },
1731
1732 // Returns the next page of this article.
1733 getNextPageLink : function() {
1734 return readability.nextPageLink;
1726 } 1735 }
1727 }; 1736 };
1728 1737
// Extracts long-form content from the page and evaluates to an array:
//   [0] the article title,
//   [1] HTML containing the long-form content,
//   [2] the next page link as returned by readability.getNextPageLink(),
//   [3..] URLs of the images returned by readability.getImages().
// NOTE(review): an earlier version of this comment said each <img> id is
// k - 2 for the URL at index k; that was written before the next page link
// was inserted at index 2 - confirm the offset against the code that assigns
// the ids.
// Review note (cjhopman / shashi, 2014/01/29): a dictionary was suggested in
// place of this positional array; shashi replied that he had tried that (the
// rest of the reply is cut off in the review page).
(function () {
    readability.init();
    // Array literals evaluate left to right, so the getters still run in
    // title, HTML, next-page order.
    var header = [
        readability.getArticleTitle(),
        readability.getDistilledArticleHTML(),
        readability.getNextPageLink()
    ];
    return header.concat(readability.getImages());
}())
1741 1751
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698