Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(632)

Side by Side Diff: third_party/readability/js/readability.js

Issue 146843010: Add support for multipage distillation. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Created 6 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « third_party/readability/README.chromium ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Local modifications to this file are described in the README.chromium
6 // file.
1 7
2 var dbg = (typeof console !== 'undefined') ? function(s) { 8 var dbg = (typeof console !== 'undefined') ? function(s) {
3 console.log("Readability: " + s); 9 console.log("Readability: " + s);
4 } : function() {}; 10 } : function() {};
5 11
6 /* 12 /*
7 * Readability. An Arc90 Lab Experiment. 13 * Readability. An Arc90 Lab Experiment.
8 * Website: http://lab.arc90.com/experiments/readability 14 * Website: http://lab.arc90.com/experiments/readability
9 * Source: http://code.google.com/p/arc90labs-readability 15 * Source: http://code.google.com/p/arc90labs-readability
10 * 16 *
11 * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission. 17 * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission.
12 * 18 *
13 * Copyright (c) 2010 Arc90 Inc 19 * Copyright (c) 2010 Arc90 Inc
14 * Readability is licensed under the Apache License, Version 2.0. 20 * Readability is licensed under the Apache License, Version 2.0.
15 **/ 21 **/
16 var readability = { 22 var readability = {
17 readStyle: "style-newspaper", 23 readStyle: "style-newspaper",
18 readSize: "size-medium", 24 readSize: "size-medium",
19 readMargin: "margin-wide", 25 readMargin: "margin-wide",
20 26
21 distilledHTML: '', 27 distilledHTML: '',
22 distilledArticleContent: null, 28 distilledArticleContent: null,
29 nextPageLink: '',
23 30
24 version: '1.7.1', 31 version: '1.7.1',
25 iframeLoads: 0, 32 iframeLoads: 0,
26 convertLinksToFootnotes: false, 33 convertLinksToFootnotes: false,
27 reversePageScroll: false, /* If they hold shift and hit space, scroll up */ 34 reversePageScroll: false, /* If they hold shift and hit space, scroll up */
28 frameHack: false, /** 35 frameHack: false, /**
29 * The frame hack is to workaround a firefox bug where if you 36 * The frame hack is to workaround a firefox bug where if you
30 * pull content out of a frame and stick it into the parent element, the scrollbar won't appear. 37 * pull content out of a frame and stick it into the parent element, the scrollbar won't appear.
31 * So we fake a scrollbar in the wrapping div. 38 * So we fake a scrollbar in the wrapping div.
32 **/ 39 **/
33 biggestFrame: false, 40 biggestFrame: false,
34 flags: 0x1 | 0x2 | 0x4, /* Start with all flags set. */ 41 flags: 0x1 | 0x2 | 0x4, /* Start with all flags set. */
35 42
36 /* constants */ 43 /* constants */
37 FLAG_STRIP_UNLIKELYS: 0x1, 44 FLAG_STRIP_UNLIKELYS: 0x1,
38 FLAG_WEIGHT_CLASSES: 0x2, 45 FLAG_WEIGHT_CLASSES: 0x2,
39 FLAG_CLEAN_CONDITIONALLY: 0x4, 46 FLAG_CLEAN_CONDITIONALLY: 0x4,
40 47
41 maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */ 48 maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */
42 parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */ 49 parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */
43 pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */ 50 pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */
44 51
45 /** 52 /**
46 * All of the regular expressions in use within readability. 53 * All of the regular expressions in use within readability.
47 * Defined up here so we don't instantiate them repeatedly in loops. 54 * Defined up here so we don't instantiate them repeatedly in loops.
48 **/ 55 **/
49 regexps: { 56 regexps: {
50 unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header |menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popu p|tweet|twitter/i, 57 unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header |menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popu p|tweet|twitter/i,
51 okMaybeItsACandidate: /and|article|body|column|main|shadow/i, 58 okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
52 positive: /article|body|content|entry|hentry|main|page|pagi nation|post|text|blog|story/i, 59 positive: /article|body|content|entry|hentry|main|page|pagi nation|post|text|blog|story/i,
53 negative: /combx|comment|com-|contact|foot|footer|footnote| masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopp ing|tags|tool|widget/i, 60 negative: /combx|comment|com-|contact|foot|footer|footnote| masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopp ing|tags|tool|widget/i,
54 extraneous: /print|archive|comment|discuss|e[\-]?mail|share|r eply|all|login|sign|single/i, 61 extraneous: /print|archive|comment|discuss|e[\-]?mail|share|r eply|all|login|sign|single/i,
55 divToPElements: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, 62 divToPElements: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
56 replaceBrs: /(<br[^>]*>[ \n\r\t]*){2,}/gi, 63 replaceBrs: /(<br[^>]*>[ \n\r\t]*){2,}/gi,
57 replaceFonts: /<(\/?)font[^>]*>/gi, 64 replaceFonts: /<(\/?)font[^>]*>/gi,
58 trim: /^\s+|\s+$/g, 65 trim: /^\s+|\s+$/g,
59 normalize: /\s{2,}/g, 66 normalize: /\s{2,}/g,
60 killBreaks: /(<br\s*\/?>(\s|&nbsp;?)*){1,}/g, 67 killBreaks: /(<br\s*\/?>(\s|&nbsp;?)*){1,}/g,
61 videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i, 68 videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i,
62 skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed) \s*$/i, 69 skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed) \s*$/i,
63 nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last. 70 nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last.
64 prevLink: /(prev|earl|old|new|<|«)/i 71 prevLink: /(prev|earl|old|new|<|«)/i
65 }, 72 },
66 73
67 /** 74 /**
68 * Runs readability. 75 * Runs readability.
69 * 76 *
70 * Workflow: 77 * Workflow:
71 * 1. Prep the document by removing script tags, css, etc. 78 * 1. Prep the document by removing script tags, css, etc.
72 * 2. Build readability's DOM tree. 79 * 2. Build readability's DOM tree.
73 * 3. Grab the article content from the current dom tree. 80 * 3. Grab the article content from the current dom tree.
74 * 4. Replace the current DOM tree with the new one. 81 * 4. Replace the current DOM tree with the new one.
75 * 5. Read peacefully. 82 * 5. Read peacefully.
76 * 83 *
77 * @return void 84 * @return void
78 **/ 85 **/
79 init: function() { 86 init: function() {
80 /* Before we do anything, remove all scripts that are not readability. */ 87 /* Before we do anything, remove all scripts that are not readability. */
81 window.onload = window.onunload = function() {}; 88 window.onload = window.onunload = function() {};
82 89
83 readability.removeScripts(document); 90 readability.removeScripts(document);
84 91
85 /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */ 92 /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */
86 readability.parsedPages[window.location.href.replace(/\/$/, '')] = true; 93 readability.parsedPages[window.location.href.replace(/\/$/, '')] = true;
87 94
88 /* Pull out any possible next page link first */ 95 /* Pull out any possible next page link first */
89 var nextPageLink = readability.findNextPageLink(document.body); 96 readability.nextPageLink = readability.findNextPageLink(document.body);
90 97
98 /* We handle processing of nextPage from C++ set nextPageLink to null */
99 var nextPageLink = null;
100
91 readability.prepDocument(); 101 readability.prepDocument();
92 102
93 /* Build readability's DOM tree */ 103 /* Build readability's DOM tree */
94 var overlay = document.createElement("DIV"); 104 var overlay = document.createElement("DIV");
95 var innerDiv = document.createElement("DIV"); 105 var innerDiv = document.createElement("DIV");
96 var articleTools = readability.getArticleTools(); 106 var articleTools = readability.getArticleTools();
97 var articleTitleText = readability.getArticleTitle(); 107 var articleTitleText = readability.getArticleTitle();
98 var articleContent = readability.grabArticle(); 108 var articleContent = readability.grabArticle();
99 109
100 if(!articleContent) { 110 if(!articleContent) {
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after
145 rootWarning.innerHTML = "<em>Readability</em> was intended for u se on individual articles and not home pages. " + 155 rootWarning.innerHTML = "<em>Readability</em> was intended for u se on individual articles and not home pages. " +
146 "If you'd like to try rendering this page anyway, <a onClick='ja vascript:document.getElementById(\"readability-warning\").style.display=\"none\" ;document.getElementById(\"readability-content\").style.display=\"block\";'>clic k here</a> to continue."; 156 "If you'd like to try rendering this page anyway, <a onClick='ja vascript:document.getElementById(\"readability-warning\").style.display=\"none\" ;document.getElementById(\"readability-content\").style.display=\"block\";'>clic k here</a> to continue.";
147 157
148 innerDiv.insertBefore( rootWarning, articleContent ); 158 innerDiv.insertBefore( rootWarning, articleContent );
149 } 159 }
150 160
151 readability.postProcessContent(articleContent); 161 readability.postProcessContent(articleContent);
152 162
153 window.scrollTo(0, 0); 163 window.scrollTo(0, 0);
154 164
155 // TODO(bengr): Remove this assignment of null to nextPageLink when
156 // the processing of the next page link is safe.
157 nextPageLink = null;
158
159 if (nextPageLink) { 165 if (nextPageLink) {
160 /** 166 /**
161 * Append any additional pages after a small timeout so that people 167 * Append any additional pages after a small timeout so that people
162 * can start reading without having to wait for this to finish processing. 168 * can start reading without having to wait for this to finish processing.
163 **/ 169 **/
164 window.setTimeout(function() { 170 window.setTimeout(function() {
165 readability.appendNextPage(nextPageLink); 171 readability.appendNextPage(nextPageLink);
166 }, 500); 172 }, 500);
167 } 173 }
168 174
169 /** Smooth scrolling **/ 175 /** Smooth scrolling **/
170 document.onkeydown = function(e) { 176 document.onkeydown = function(e) {
171 var code = (window.event) ? event.keyCode : e.keyCode; 177 var code = (window.event) ? event.keyCode : e.keyCode;
172 if (code === 16) { 178 if (code === 16) {
173 readability.reversePageScroll = true; 179 readability.reversePageScroll = true;
174 return; 180 return;
175 } 181 }
176 182
177 if (code === 32) { 183 if (code === 32) {
178 readability.curScrollStep = 0; 184 readability.curScrollStep = 0;
179 var windowHeight = window.innerHeight ? window.innerHeight : (do cument.documentElement.clientHeight ? document.documentElement.clientHeight : do cument.body.clientHeight); 185 var windowHeight = window.innerHeight ? window.innerHeight : (do cument.documentElement.clientHeight ? document.documentElement.clientHeight : do cument.body.clientHeight);
180 186
181 if(readability.reversePageScroll) { 187 if(readability.reversePageScroll) {
182 readability.scrollTo(readability.scrollTop(), readability.sc rollTop() - (windowHeight - 50), 20, 10); 188 readability.scrollTo(readability.scrollTop(), readability.sc rollTop() - (windowHeight - 50), 20, 10);
183 } 189 }
184 else { 190 else {
185 readability.scrollTo(readability.scrollTop(), readability.sc rollTop() + (windowHeight - 50), 20, 10); 191 readability.scrollTo(readability.scrollTop(), readability.sc rollTop() + (windowHeight - 50), 20, 10);
186 } 192 }
187 193
188 return false; 194 return false;
189 } 195 }
190 }; 196 };
191 197
192 document.onkeyup = function(e) { 198 document.onkeyup = function(e) {
193 var code = (window.event) ? event.keyCode : e.keyCode; 199 var code = (window.event) ? event.keyCode : e.keyCode;
194 if (code === 16) { 200 if (code === 16) {
195 readability.reversePageScroll = false; 201 readability.reversePageScroll = false;
196 return; 202 return;
197 } 203 }
198 }; 204 };
199 }, 205 },
200 206
201 /** 207 /**
202 * Run any post-process modifications to article content as necessary. 208 * Run any post-process modifications to article content as necessary.
203 * 209 *
204 * @param Element 210 * @param Element
205 * @return void 211 * @return void
206 **/ 212 **/
207 postProcessContent: function(articleContent) { 213 postProcessContent: function(articleContent) {
208 if(readability.convertLinksToFootnotes && !window.location.href.match(/w ikipedia\.org/g)) { 214 if(readability.convertLinksToFootnotes && !window.location.href.match(/w ikipedia\.org/g)) {
209 readability.addFootnotes(articleContent); 215 readability.addFootnotes(articleContent);
210 } 216 }
211 217
212 readability.fixImageFloats(articleContent); 218 readability.fixImageFloats(articleContent);
213 }, 219 },
214 220
215 /** 221 /**
216 * Some content ends up looking ugly if the image is too large to be floated. 222 * Some content ends up looking ugly if the image is too large to be floated.
217 * If the image is wider than a threshold (currently 55%), no longer float it, 223 * If the image is wider than a threshold (currently 55%), no longer float it,
218 * center it instead. 224 * center it instead.
219 * 225 *
220 * @param Element 226 * @param Element
221 * @return void 227 * @return void
222 **/ 228 **/
223 fixImageFloats: function (articleContent) { 229 fixImageFloats: function (articleContent) {
224 var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0. 55, 230 var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0. 55,
225 images = articleContent.getElementsByTagName('img'); 231 images = articleContent.getElementsByTagName('img');
226 232
227 for(var i=0, il = images.length; i < il; i+=1) { 233 for(var i=0, il = images.length; i < il; i+=1) {
228 var image = images[i]; 234 var image = images[i];
229 235
230 if(image.offsetWidth > imageWidthThreshold) { 236 if(image.offsetWidth > imageWidthThreshold) {
231 image.className += " blockImage"; 237 image.className += " blockImage";
232 } 238 }
233 } 239 }
234 }, 240 },
235 241
236 /** 242 /**
237 * Get the article tools Element that has buttons like reload, print. 243 * Get the article tools Element that has buttons like reload, print.
238 * 244 *
239 * @return void 245 * @return void
240 **/ 246 **/
241 getArticleTools: function () { 247 getArticleTools: function () {
242 var articleTools = document.createElement("DIV"); 248 var articleTools = document.createElement("DIV");
243 249
244 articleTools.id = "readTools"; 250 articleTools.id = "readTools";
245 articleTools.innerHTML = 251 articleTools.innerHTML =
246 "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" + 252 "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" +
247 "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" + 253 "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" +
248 "<a href='#' onclick='readability.emailBox(); return false;' title=' Email page' id='email-page'>Email Page</a>"; 254 "<a href='#' onclick='readability.emailBox(); return false;' title=' Email page' id='email-page'>Email Page</a>";
249 255
250 return articleTools; 256 return articleTools;
251 }, 257 },
252 258
253 /** 259 /**
254 * returns the suggested direction of the string 260 * returns the suggested direction of the string
255 * 261 *
256 * @return "rtl" || "ltr" 262 * @return "rtl" || "ltr"
257 **/ 263 **/
258 getSuggestedDirection: function(text) { 264 getSuggestedDirection: function(text) {
259 function sanitizeText() { 265 function sanitizeText() {
260 return text.replace(/@\w+/, ""); 266 return text.replace(/@\w+/, "");
261 } 267 }
262 268
263 function countMatches(match) { 269 function countMatches(match) {
264 var matches = text.match(new RegExp(match, "g")); 270 var matches = text.match(new RegExp(match, "g"));
265 return matches !== null ? matches.length : 0; 271 return matches !== null ? matches.length : 0;
266 } 272 }
267 273
268 function isRTL() { 274 function isRTL() {
269 var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]"); 275 var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]");
270 var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]"); 276 var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]");
271 277
272 // if 20% of chars are Hebrew or Arabic then direction is rtl 278 // if 20% of chars are Hebrew or Arabic then direction is rtl
273 return (count_heb + count_arb) * 100 / text.length > 20; 279 return (count_heb + count_arb) * 100 / text.length > 20;
274 } 280 }
275 281
276 text = sanitizeText(text); 282 text = sanitizeText(text);
277 return isRTL() ? "rtl" : "ltr"; 283 return isRTL() ? "rtl" : "ltr";
278 }, 284 },
279 285
280 /** 286 /**
281 * Get the article title as an H1. 287 * Get the article title as an H1.
282 * 288 *
283 * @return void 289 * @return void
284 **/ 290 **/
285 getArticleTitle: function () { 291 getArticleTitle: function () {
286 var curTitle = "", 292 var curTitle = "",
287 origTitle = ""; 293 origTitle = "";
288 294
289 try { 295 try {
290 curTitle = origTitle = document.title; 296 curTitle = origTitle = document.title;
291 if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */ 297 if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */
292 curTitle = origTitle = readability.getInnerText(document.getElem entsByTagName('title')[0]); 298 curTitle = origTitle = readability.getInnerText(document.getElem entsByTagName('title')[0]);
293 } 299 }
294 } 300 }
295 catch(e) {} 301 catch(e) {}
296 302
297 if(curTitle.match(/ [\|\-] /)) 303 if(curTitle.match(/ [\|\-] /))
298 { 304 {
299 curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1'); 305 curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1');
300 306
301 if(curTitle.split(' ').length < 3) { 307 if(curTitle.split(' ').length < 3) {
302 curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1'); 308 curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1');
303 } 309 }
304 } 310 }
305 else if(curTitle.indexOf(': ') !== -1) 311 else if(curTitle.indexOf(': ') !== -1)
306 { 312 {
307 curTitle = origTitle.replace(/.*:(.*)/gi, '$1'); 313 curTitle = origTitle.replace(/.*:(.*)/gi, '$1');
308 314
309 if(curTitle.split(' ').length < 3) { 315 if(curTitle.split(' ').length < 3) {
310 curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1'); 316 curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1');
(...skipping 12 matching lines...) Expand all
323 329
324 if(curTitle.split(' ').length <= 4) { 330 if(curTitle.split(' ').length <= 4) {
325 curTitle = origTitle; 331 curTitle = origTitle;
326 } 332 }
327 return curTitle; 333 return curTitle;
328 }, 334 },
329 335
330 /** 336 /**
331 * Prepare the HTML document for readability to scrape it. 337 * Prepare the HTML document for readability to scrape it.
332 * This includes things like stripping javascript, CSS, and handling terrible markup. 338 * This includes things like stripping javascript, CSS, and handling terrible markup.
333 * 339 *
334 * @return void 340 * @return void
335 **/ 341 **/
336 prepDocument: function () { 342 prepDocument: function () {
337 /** 343 /**
338 * In some cases a body element can't be found (if the HTML is totally hosed for example) 344 * In some cases a body element can't be found (if the HTML is totally hosed for example)
339 * so we create a new body node and append it to the document. 345 * so we create a new body node and append it to the document.
340 */ 346 */
341 if(document.body === null) 347 if(document.body === null)
342 { 348 {
343 var body = document.createElement("body"); 349 var body = document.createElement("body");
344 try { 350 try {
345 document.body = body; 351 document.body = body;
346 } 352 }
347 catch(e) { 353 catch(e) {
348 document.documentElement.appendChild(body); 354 document.documentElement.appendChild(body);
349 dbg(e); 355 dbg(e);
350 } 356 }
351 } 357 }
352 358
353 document.body.id = "readabilityBody"; 359 document.body.id = "readabilityBody";
354 360
355 var frames = document.getElementsByTagName('frame'); 361 var frames = document.getElementsByTagName('frame');
(...skipping 11 matching lines...) Expand all
367 canAccessFrame = true; 373 canAccessFrame = true;
368 } 374 }
369 catch(eFrames) { 375 catch(eFrames) {
370 dbg(eFrames); 376 dbg(eFrames);
371 } 377 }
372 378
373 if(frameSize > biggestFrameSize) { 379 if(frameSize > biggestFrameSize) {
374 biggestFrameSize = frameSize; 380 biggestFrameSize = frameSize;
375 readability.biggestFrame = frames[frameIndex]; 381 readability.biggestFrame = frames[frameIndex];
376 } 382 }
377 383
378 if(canAccessFrame && frameSize > bestFrameSize) 384 if(canAccessFrame && frameSize > bestFrameSize)
379 { 385 {
380 readability.frameHack = true; 386 readability.frameHack = true;
381 387
382 bestFrame = frames[frameIndex]; 388 bestFrame = frames[frameIndex];
383 bestFrameSize = frameSize; 389 bestFrameSize = frameSize;
384 } 390 }
385 } 391 }
386 392
387 if(bestFrame) 393 if(bestFrame)
388 { 394 {
389 var newBody = document.createElement('body'); 395 var newBody = document.createElement('body');
390 readability.moveNodeInnards(bestFrame.contentWindow.document.bod y, newBody); 396 readability.moveNodeInnards(bestFrame.contentWindow.document.bod y, newBody);
391 newBody.style.overflow = 'scroll'; 397 newBody.style.overflow = 'scroll';
392 document.body = newBody; 398 document.body = newBody;
393 399
394 var frameset = document.getElementsByTagName('frameset')[0]; 400 var frameset = document.getElementsByTagName('frameset')[0];
395 if(frameset) { 401 if(frameset) {
396 frameset.parentNode.removeChild(frameset); } 402 frameset.parentNode.removeChild(frameset); }
397 } 403 }
398 } 404 }
399 405
400 /* Remove all stylesheets */ 406 /* Remove all stylesheets */
401 for (var k=0;k < document.styleSheets.length; k+=1) { 407 for (var k=0;k < document.styleSheets.length; k+=1) {
402 if (document.styleSheets[k].href !== null && document.styleSheets[k] .href.lastIndexOf("readability") === -1) { 408 if (document.styleSheets[k].href !== null && document.styleSheets[k] .href.lastIndexOf("readability") === -1) {
403 document.styleSheets[k].disabled = true; 409 document.styleSheets[k].disabled = true;
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after
448 readability.cleanConditionally(articleContent, "table"); 454 readability.cleanConditionally(articleContent, "table");
449 readability.cleanConditionally(articleContent, "ul"); 455 readability.cleanConditionally(articleContent, "ul");
450 readability.cleanConditionally(articleContent, "div"); 456 readability.cleanConditionally(articleContent, "div");
451 457
452 /* Remove extra paragraphs */ 458 /* Remove extra paragraphs */
453 var articleParagraphs = articleContent.getElementsByTagName('p'); 459 var articleParagraphs = articleContent.getElementsByTagName('p');
454 for(var i = articleParagraphs.length-1; i >= 0; i-=1) { 460 for(var i = articleParagraphs.length-1; i >= 0; i-=1) {
455 var imgCount = articleParagraphs[i].getElementsByTagName('img').l ength; 461 var imgCount = articleParagraphs[i].getElementsByTagName('img').l ength;
456 var embedCount = articleParagraphs[i].getElementsByTagName('embed') .length; 462 var embedCount = articleParagraphs[i].getElementsByTagName('embed') .length;
457 var objectCount = articleParagraphs[i].getElementsByTagName('object' ).length; 463 var objectCount = articleParagraphs[i].getElementsByTagName('object' ).length;
458 464
459 if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readab ility.getInnerText(articleParagraphs[i], false) === '') { 465 if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readab ility.getInnerText(articleParagraphs[i], false) === '') {
460 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i] ); 466 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i] );
461 } 467 }
462 } 468 }
463 469
464 try { 470 try {
465 readability.replaceBrsWithPs(articleContent); 471 readability.replaceBrsWithPs(articleContent);
466 } 472 }
467 catch (e) { 473 catch (e) {
468 dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block -elements bug. Ignoring.: " + e); 474 dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block -elements bug. Ignoring.: " + e);
469 } 475 }
470 }, 476 },
471 477
472 /** 478 /**
473 * Initialize a node with the readability object. Also checks the 479 * Initialize a node with the readability object. Also checks the
474 * className/id for special names to add to its score. 480 * className/id for special names to add to its score.
475 * 481 *
476 * @param Element 482 * @param Element
477 * @return void 483 * @return void
478 **/ 484 **/
479 initializeNode: function (node) { 485 initializeNode: function (node) {
480 node.readability = {"contentScore": 0}; 486 node.readability = {"contentScore": 0};
481 487
482 switch(node.tagName) { 488 switch(node.tagName) {
483 case 'DIV': 489 case 'DIV':
484 node.readability.contentScore += 5; 490 node.readability.contentScore += 5;
485 break; 491 break;
486 492
487 case 'PRE': 493 case 'PRE':
488 case 'TD': 494 case 'TD':
489 case 'BLOCKQUOTE': 495 case 'BLOCKQUOTE':
490 node.readability.contentScore += 3; 496 node.readability.contentScore += 3;
491 break; 497 break;
492 498
493 case 'ADDRESS': 499 case 'ADDRESS':
494 case 'OL': 500 case 'OL':
495 case 'UL': 501 case 'UL':
496 case 'DL': 502 case 'DL':
497 case 'DD': 503 case 'DD':
498 case 'DT': 504 case 'DT':
499 case 'LI': 505 case 'LI':
500 case 'FORM': 506 case 'FORM':
501 node.readability.contentScore -= 3; 507 node.readability.contentScore -= 3;
502 break; 508 break;
503 509
504 case 'H1': 510 case 'H1':
505 case 'H2': 511 case 'H2':
506 case 'H3': 512 case 'H3':
507 case 'H4': 513 case 'H4':
508 case 'H5': 514 case 'H5':
509 case 'H6': 515 case 'H6':
510 case 'TH': 516 case 'TH':
511 node.readability.contentScore -= 5; 517 node.readability.contentScore -= 5;
512 break; 518 break;
513 } 519 }
514 520
515 node.readability.contentScore += readability.getClassWeight(node); 521 node.readability.contentScore += readability.getClassWeight(node);
516 }, 522 },
517 523
518 /*** 524 /***
519 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is 525 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
520 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. 526 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
521 * 527 *
522 * @param page a document to run upon. Needs to be a full document, complete with body. 528 * @param page a document to run upon. Needs to be a full document, complete with body.
523 * @return Element 529 * @return Element
524 **/ 530 **/
525 grabArticle: function (pageToClone) { 531 grabArticle: function (pageToClone) {
526 var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_ STRIP_UNLIKELYS), 532 var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_ STRIP_UNLIKELYS),
527 isPaging = (page !== null) ? true: false; 533 isPaging = (page !== null) ? true: false;
528 534
529 var page = null; 535 var page = null;
530 // Never work on the actual page. 536 // Never work on the actual page.
531 if (isPaging) { 537 if (isPaging) {
532 page = document.body.cloneNode(true); 538 page = document.body.cloneNode(true);
533 } else { 539 } else {
534 page = pageToClone.cloneNode(true); 540 page = pageToClone.cloneNode(true);
535 } 541 }
536 542
537 var allElements = page.getElementsByTagName('*'); 543 var allElements = page.getElementsByTagName('*');
538 544
539 /** 545 /**
540 * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs 546 * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
541 * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.) 547 * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
542 * 548 *
543 * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 549 * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
544 * TODO: Shouldn't this be a reverse traversal? 550 * TODO: Shouldn't this be a reverse traversal?
545 **/ 551 **/
546 var node = null; 552 var node = null;
547 var nodesToScore = []; 553 var nodesToScore = [];
548 for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) { 554 for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) {
549 /* Remove unlikely candidates */ 555 /* Remove unlikely candidates */
550 if (stripUnlikelyCandidates) { 556 if (stripUnlikelyCandidates) {
551 var unlikelyMatchString = node.className + node.id; 557 var unlikelyMatchString = node.className + node.id;
552 if ( 558 if (
553 ( 559 (
554 unlikelyMatchString.search(readability.regexps.unlikelyC andidates) !== -1 && 560 unlikelyMatchString.search(readability.regexps.unlikelyC andidates) !== -1 &&
555 unlikelyMatchString.search(readability.regexps.okMaybeIt sACandidate) === -1 && 561 unlikelyMatchString.search(readability.regexps.okMaybeIt sACandidate) === -1 &&
556 node.tagName !== "BODY" 562 node.tagName !== "BODY"
557 ) 563 )
558 ) 564 )
559 { 565 {
560 dbg("Removing unlikely candidate - " + unlikelyMatchString); 566 dbg("Removing unlikely candidate - " + unlikelyMatchString);
561 node.parentNode.removeChild(node); 567 node.parentNode.removeChild(node);
562 nodeIndex-=1; 568 nodeIndex-=1;
563 continue; 569 continue;
564 } 570 }
565 } 571 }
566 572
567 if (node.tagName === "P" || node.tagName === "TD" || node.tagName == = "PRE") { 573 if (node.tagName === "P" || node.tagName === "TD" || node.tagName == = "PRE") {
568 nodesToScore[nodesToScore.length] = node; 574 nodesToScore[nodesToScore.length] = node;
569 } 575 }
570 576
571 /* Turn all divs that don't have children block level elements into p's */ 577 /* Turn all divs that don't have children block level elements into p's */
572 if (node.tagName === "DIV") { 578 if (node.tagName === "DIV") {
573 if (node.innerHTML.search(readability.regexps.divToPElements) == = -1) { 579 if (node.innerHTML.search(readability.regexps.divToPElements) == = -1) {
574 var newNode = document.createElement('p'); 580 var newNode = document.createElement('p');
(...skipping 16 matching lines...) Expand all
591 if(childNode.nodeType === 3) { // Node.TEXT_NODE 597 if(childNode.nodeType === 3) { // Node.TEXT_NODE
592 var p = document.createElement('p'); 598 var p = document.createElement('p');
593 var t = document.createTextNode(childNode.nodeValue) ; 599 var t = document.createTextNode(childNode.nodeValue) ;
594 p.appendChild(t); 600 p.appendChild(t);
595 p.style.display = 'inline'; 601 p.style.display = 'inline';
596 p.className = 'readability-styled'; 602 p.className = 'readability-styled';
597 childNode.parentNode.replaceChild(p, childNode); 603 childNode.parentNode.replaceChild(p, childNode);
598 } 604 }
599 } 605 }
600 } 606 }
601 } 607 }
602 } 608 }
603 609
604 /** 610 /**
605 * Loop through all paragraphs, and assign a score to them based on how content-y they look. 611 * Loop through all paragraphs, and assign a score to them based on how content-y they look.
606 * Then add their score to their parent node. 612 * Then add their score to their parent node.
607 * 613 *
608 * A score is determined by things like number of commas, class names, e tc. Maybe eventually link density. 614 * A score is determined by things like number of commas, class names, e tc. Maybe eventually link density.
609 **/ 615 **/
610 var candidates = []; 616 var candidates = [];
611 for (var pt=0; pt < nodesToScore.length; pt+=1) { 617 for (var pt=0; pt < nodesToScore.length; pt+=1) {
(...skipping 21 matching lines...) Expand all
633 candidates.push(grandParentNode); 639 candidates.push(grandParentNode);
634 } 640 }
635 641
636 var contentScore = 0; 642 var contentScore = 0;
637 643
638 /* Add a point for the paragraph itself as a base. */ 644 /* Add a point for the paragraph itself as a base. */
639 contentScore+=1; 645 contentScore+=1;
640 646
641 /* Add points for any commas within this paragraph */ 647 /* Add points for any commas within this paragraph */
642 contentScore += innerText.split(',').length; 648 contentScore += innerText.split(',').length;
643 649
644 /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ 650 /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
645 contentScore += Math.min(Math.floor(innerText.length / 100), 3); 651 contentScore += Math.min(Math.floor(innerText.length / 100), 3);
646 652
647 /* Add the score to the parent. The grandparent gets half. */ 653 /* Add the score to the parent. The grandparent gets half. */
648 parentNode.readability.contentScore += contentScore; 654 parentNode.readability.contentScore += contentScore;
649 655
650 if(grandParentNode) { 656 if(grandParentNode) {
651 grandParentNode.readability.contentScore += contentScore/2; 657 grandParentNode.readability.contentScore += contentScore/2;
652 } 658 }
653 } 659 }
654 660
655 /** 661 /**
656 * After we've calculated scores, loop through all of the possible candi date nodes we found 662 * After we've calculated scores, loop through all of the possible candi date nodes we found
657 * and find the one with the highest score. 663 * and find the one with the highest score.
658 **/ 664 **/
659 var topCandidate = null; 665 var topCandidate = null;
660 for(var c=0, cl=candidates.length; c < cl; c+=1) 666 for(var c=0, cl=candidates.length; c < cl; c+=1)
661 { 667 {
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after
718 var contentBonus = 0; 724 var contentBonus = 0;
719 /* Give a bonus if sibling nodes and top candidates have the example same classname */ 725 /* Give a bonus if sibling nodes and top candidates have the example same classname */
720 if(siblingNode.className === topCandidate.className && topCandidate. className !== "") { 726 if(siblingNode.className === topCandidate.className && topCandidate. className !== "") {
721 contentBonus += topCandidate.readability.contentScore * 0.2; 727 contentBonus += topCandidate.readability.contentScore * 0.2;
722 } 728 }
723 729
724 if(typeof siblingNode.readability !== 'undefined' && (siblingNode.re adability.contentScore+contentBonus) >= siblingScoreThreshold) 730 if(typeof siblingNode.readability !== 'undefined' && (siblingNode.re adability.contentScore+contentBonus) >= siblingScoreThreshold)
725 { 731 {
726 append = true; 732 append = true;
727 } 733 }
728 734
729 if(siblingNode.nodeName === "P") { 735 if(siblingNode.nodeName === "P") {
730 var linkDensity = readability.getLinkDensity(siblingNode); 736 var linkDensity = readability.getLinkDensity(siblingNode);
731 var nodeContent = readability.getInnerText(siblingNode); 737 var nodeContent = readability.getInnerText(siblingNode);
732 var nodeLength = nodeContent.length; 738 var nodeLength = nodeContent.length;
733 739
734 if(nodeLength > 80 && linkDensity < 0.25) 740 if(nodeLength > 80 && linkDensity < 0.25)
735 { 741 {
736 append = true; 742 append = true;
737 } 743 }
738 else if(nodeLength < 80 && linkDensity === 0 && nodeContent.sear ch(/\.( |$)/) !== -1) 744 else if(nodeLength < 80 && linkDensity === 0 && nodeContent.sear ch(/\.( |$)/) !== -1)
739 { 745 {
740 append = true; 746 append = true;
741 } 747 }
742 } 748 }
743 749
744 if(append) { 750 if(append) {
745 dbg("Appending node: " + siblingNode); 751 dbg("Appending node: " + siblingNode);
746 752
747 var nodeToAppend = null; 753 var nodeToAppend = null;
748 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P ") { 754 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P ") {
749 /* We have a node that isn't a common block level element, l ike a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ 755 /* We have a node that isn't a common block level element, l ike a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
750 756
751 dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.'); 757 dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');
752 nodeToAppend = document.createElement("DIV"); 758 nodeToAppend = document.createElement("DIV");
753 try { 759 try {
754 nodeToAppend.id = siblingNode.id; 760 nodeToAppend.id = siblingNode.id;
755 readability.moveNodeInnards(siblingNode, nodeToAppend); 761 readability.moveNodeInnards(siblingNode, nodeToAppend);
756 } 762 }
757 catch(er) { 763 catch(er) {
758 dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original."); 764 dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");
759 nodeToAppend = siblingNode; 765 nodeToAppend = siblingNode;
760 s-=1; 766 s-=1;
761 sl-=1; 767 sl-=1;
762 } 768 }
763 } else { 769 } else {
764 nodeToAppend = siblingNode; 770 nodeToAppend = siblingNode;
765 s-=1; 771 s-=1;
766 sl-=1; 772 sl-=1;
767 } 773 }
768 774
769 /* To ensure a node does not interfere with readability styles, remove its classnames */ 775 /* To ensure a node does not interfere with readability styles, remove its classnames */
770 nodeToAppend.className = ""; 776 nodeToAppend.className = "";
771 777
772 /* Append sibling and subtract from our list because it removes the node when you append to another node */ 778 /* Append sibling and subtract from our list because it removes the node when you append to another node */
773 articleContent.appendChild(nodeToAppend); 779 articleContent.appendChild(nodeToAppend);
774 } 780 }
775 } 781 }
776 782
777 /** 783 /**
778 * So we have all of the content that we need. Now we clean it up for pr esentation. 784 * So we have all of the content that we need. Now we clean it up for pr esentation.
779 **/ 785 **/
780 readability.distilledArticleContent = articleContent.cloneNode(true); 786 readability.distilledArticleContent = articleContent.cloneNode(true);
781 //readability.prepArticle(articleContent); 787 //readability.prepArticle(articleContent);
782 788
783 if (readability.curPageNum === 1) { 789 if (readability.curPageNum === 1) {
784 var newNode = document.createElement('div'); 790 var newNode = document.createElement('div');
785 newNode.id = "readability-page-1"; 791 newNode.id = "readability-page-1";
786 newNode.setAttribute("class", "page"); 792 newNode.setAttribute("class", "page");
787 readability.moveNodeInnards(articleContent, newNode); 793 readability.moveNodeInnards(articleContent, newNode);
788 articleContent.appendChild(newNode); 794 articleContent.appendChild(newNode);
789 } 795 }
790 796
791 /** 797 /**
792 * Now that we've gone through the full algorithm, check to see if we go t any meaningful content. 798 * Now that we've gone through the full algorithm, check to see if we go t any meaningful content.
793 * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher 799 * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
794 * likelihood of finding the content, and the sieve approach gives us a higher likelihood of 800 * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
795 * finding the -right- content. 801 * finding the -right- content.
796 **/ 802 **/
797 if(readability.getInnerText(articleContent, false).length < 250) { 803 if(readability.getInnerText(articleContent, false).length < 250) {
798 if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) { 804 if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) {
799 readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS); 805 readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS);
800 return readability.grabArticle(document.body); 806 return readability.grabArticle(document.body);
801 } 807 }
802 else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { 808 else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
803 readability.removeFlag(readability.FLAG_WEIGHT_CLASSES); 809 readability.removeFlag(readability.FLAG_WEIGHT_CLASSES);
804 return readability.grabArticle(document.body); 810 return readability.grabArticle(document.body);
805 } 811 }
806 else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONAL LY)) { 812 else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONAL LY)) {
807 readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY); 813 readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY);
808 return readability.grabArticle(document.body); 814 return readability.grabArticle(document.body);
809 } else { 815 } else {
810 return null; 816 return null;
811 } 817 }
812 } 818 }
813 819
814 return articleContent; 820 return articleContent;
815 }, 821 },
816 822
817 /** 823 /**
818 * Removes script tags from the document. 824 * Removes script tags from the document.
819 * 825 *
820 * @param Element 826 * @param Element
821 **/ 827 **/
822 removeScripts: function (doc) { 828 removeScripts: function (doc) {
823 var scripts = doc.getElementsByTagName('script'); 829 var scripts = doc.getElementsByTagName('script');
824 for(var i = scripts.length-1; i >= 0; i-=1) 830 for(var i = scripts.length-1; i >= 0; i-=1)
825 { 831 {
826 if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf ('readability') === -1 && scripts[i].src.indexOf('typekit') === -1)) 832 if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf ('readability') === -1 && scripts[i].src.indexOf('typekit') === -1))
827 { 833 {
828 scripts[i].nodeValue=""; 834 scripts[i].nodeValue="";
829 scripts[i].removeAttribute('src'); 835 scripts[i].removeAttribute('src');
830 if (scripts[i].parentNode) { 836 if (scripts[i].parentNode) {
831 scripts[i].parentNode.removeChild(scripts[i]); 837 scripts[i].parentNode.removeChild(scripts[i]);
832 } 838 }
833 } 839 }
834 } 840 }
835 }, 841 },
836 842
837 /** 843 /**
838 * Get the inner text of a node - cross browser compatibly. 844 * Get the inner text of a node - cross browser compatibly.
839 * This also strips out any excess whitespace to be found. 845 * This also strips out any excess whitespace to be found.
840 * 846 *
841 * @param Element 847 * @param Element
842 * @return string 848 * @return string
843 **/ 849 **/
844 getInnerText: function (e, normalizeSpaces) { 850 getInnerText: function (e, normalizeSpaces) {
845 var textContent = ""; 851 var textContent = "";
846 852
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after
889 895
890 // Remove any root styles, if we're able. 896 // Remove any root styles, if we're able.
891 if(typeof e.removeAttribute === 'function' && e.className !== 'readabili ty-styled') { 897 if(typeof e.removeAttribute === 'function' && e.className !== 'readabili ty-styled') {
892 e.removeAttribute('style'); } 898 e.removeAttribute('style'); }
893 899
894 // Go until there are no more child nodes 900 // Go until there are no more child nodes
895 while ( cur !== null ) { 901 while ( cur !== null ) {
896 if ( cur.nodeType === 1 ) { 902 if ( cur.nodeType === 1 ) {
897 // Remove style attribute(s) : 903 // Remove style attribute(s) :
898 if(cur.className !== "readability-styled") { 904 if(cur.className !== "readability-styled") {
899 cur.removeAttribute("style"); 905 cur.removeAttribute("style");
900 } 906 }
901 readability.cleanStyles( cur ); 907 readability.cleanStyles( cur );
902 } 908 }
903 cur = cur.nextSibling; 909 cur = cur.nextSibling;
904 } 910 }
905 }, 911 },
906 912
907 /** 913 /**
908 * Get the density of links as a percentage of the content 914 * Get the density of links as a percentage of the content
909 * This is the amount of text that is inside a link divided by the total tex t in the node. 915 * This is the amount of text that is inside a link divided by the total tex t in the node.
910 * 916 *
911 * @param Element 917 * @param Element
912 * @return number (float) 918 * @return number (float)
913 **/ 919 **/
914 getLinkDensity: function (e) { 920 getLinkDensity: function (e) {
915 var links = e.getElementsByTagName("a"); 921 var links = e.getElementsByTagName("a");
916 var textLength = readability.getInnerText(e).length; 922 var textLength = readability.getInnerText(e).length;
917 var linkLength = 0; 923 var linkLength = 0;
918 for(var i=0, il=links.length; i<il;i+=1) 924 for(var i=0, il=links.length; i<il;i+=1)
919 { 925 {
920 linkLength += readability.getInnerText(links[i]).length; 926 linkLength += readability.getInnerText(links[i]).length;
921 } 927 }
922 928
923 return linkLength / textLength; 929 return linkLength / textLength;
924 }, 930 },
925 931
926 /** 932 /**
927 * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness. 933 * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.
928 * 934 *
929 * @author Dan Lacy 935 * @author Dan Lacy
930 * @return string the base url 936 * @return string the base url
931 **/ 937 **/
932 findBaseUrl: function () { 938 findBaseUrl: function () {
933 var noUrlParams = window.location.pathname.split("?")[0], 939 var noUrlParams = window.location.pathname.split("?")[0],
934 urlSlashes = noUrlParams.split("/").reverse(), 940 urlSlashes = noUrlParams.split("/").reverse(),
935 cleanedSegments = [], 941 cleanedSegments = [],
936 possibleType = ""; 942 possibleType = "";
937 943
938 for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) { 944 for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) {
939 var segment = urlSlashes[i]; 945 var segment = urlSlashes[i];
940 946
941 // Split off and save anything that looks like a file type. 947 // Split off and save anything that looks like a file type.
942 if (segment.indexOf(".") !== -1) { 948 if (segment.indexOf(".") !== -1) {
943 possibleType = segment.split(".")[1]; 949 possibleType = segment.split(".")[1];
944 950
945 /* If the type isn't alpha-only, it's probably not actually a fi le extension. */ 951 /* If the type isn't alpha-only, it's probably not actually a fi le extension. */
946 if(!possibleType.match(/[^a-zA-Z]/)) { 952 if(!possibleType.match(/[^a-zA-Z]/)) {
947 segment = segment.split(".")[0]; 953 segment = segment.split(".")[0];
948 } 954 }
949 } 955 }
950 956
951 /** 957 /**
952 * EW-CMS specific segment replacement. Ugly. 958 * EW-CMS specific segment replacement. Ugly.
953 * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.htm l 959 * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.htm l
954 **/ 960 **/
955 if(segment.indexOf(',00') !== -1) { 961 if(segment.indexOf(',00') !== -1) {
956 segment = segment.replace(',00', ''); 962 segment = segment.replace(',00', '');
957 } 963 }
958 964
959 // If our first or second segment has anything looking like a page n umber, remove it. 965 // If our first or second segment has anything looking like a page n umber, remove it.
960 if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) { 966 if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) {
961 segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, " "); 967 segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, " ");
962 } 968 }
963 969
964 970
965 var del = false; 971 var del = false;
966 972
967 /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */ 973 /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */
968 if (i < 2 && segment.match(/^\d{1,2}$/)) { 974 if (i < 2 && segment.match(/^\d{1,2}$/)) {
969 del = true; 975 del = true;
970 } 976 }
971 977
972 /* If this is the first segment and it's just "index", remove it. */ 978 /* If this is the first segment and it's just "index", remove it. */
973 if(i === 0 && segment.toLowerCase() === "index") { 979 if(i === 0 && segment.toLowerCase() === "index") {
974 del = true; 980 del = true;
975 } 981 }
976 982
977 983
978 /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */ 984 /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */
979 if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) { 985 if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) {
980 del = true; 986 del = true;
981 } 987 }
982 988
983 /* If it's not marked for deletion, push it to cleanedSegments. */ 989 /* If it's not marked for deletion, push it to cleanedSegments. */
984 if (!del) { 990 if (!del) {
985 cleanedSegments.push(segment); 991 cleanedSegments.push(segment);
986 } 992 }
987 } 993 }
988 994
989 // This is our final, cleaned, base article URL. 995 // This is our final, cleaned, base article URL.
990 return window.location.protocol + "//" + window.location.host + cleanedS egments.reverse().join("/"); 996 return window.location.protocol + "//" + window.location.host + cleanedS egments.reverse().join("/");
991 }, 997 },
992 998
993 /** 999 /**
994 * Look for any paging links that may occur within the document. 1000 * Look for any paging links that may occur within the document.
995 * 1001 *
996 * @param body 1002 * @param body
997 * @return object (array) 1003 * @return object (array)
998 **/ 1004 **/
999 findNextPageLink: function (elem) { 1005 findNextPageLink: function (elem) {
1000 var possiblePages = {}, 1006 var possiblePages = {},
1001 allLinks = elem.getElementsByTagName('a'), 1007 allLinks = elem.getElementsByTagName('a'),
1002 articleBaseUrl = readability.findBaseUrl(); 1008 articleBaseUrl = readability.findBaseUrl();
1003 1009
1004 /** 1010 /**
1005 * Loop through all links, looking for hints that they may be next-page links. 1011 * Loop through all links, looking for hints that they may be next-page links.
1006 * Things like having "page" in their textContent, className or id, or b eing a child 1012 * Things like having "page" in their textContent, className or id, or b eing a child
1007 * of a node with a page-y className or id. 1013 * of a node with a page-y className or id.
1008 * 1014 *
1009 * Also possible: levenshtein distance? longest common subsequence? 1015 * Also possible: levenshtein distance? longest common subsequence?
1010 * 1016 *
1011 * After we do that, assign each page a score, and 1017 * After we do that, assign each page a score, and
1012 **/ 1018 **/
1013 for(var i = 0, il = allLinks.length; i < il; i+=1) { 1019 for(var i = 0, il = allLinks.length; i < il; i+=1) {
1014 var link = allLinks[i], 1020 var link = allLinks[i],
1015 linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, ' '); 1021 linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, ' ');
1016 1022
1017 /* If we've already seen this page, ignore it */ 1023 /* If we've already seen this page, ignore it */
1018 if(linkHref === "" || linkHref === articleBaseUrl || linkHref === wi ndow.location.href || linkHref in readability.parsedPages) { 1024 if(linkHref === "" || linkHref === articleBaseUrl || linkHref === wi ndow.location.href || linkHref in readability.parsedPages) {
1019 continue; 1025 continue;
1020 } 1026 }
1021 1027
1022 /* If it's on a different domain, skip it. */ 1028 /* If it's on a different domain, skip it. */
1023 if(window.location.host !== linkHref.split(/\/+/g)[1]) { 1029 if(window.location.host !== linkHref.split(/\/+/g)[1]) {
1024 continue; 1030 continue;
1025 } 1031 }
1026 1032
1027 var linkText = readability.getInnerText(link); 1033 var linkText = readability.getInnerText(link);
1028 1034
1029 /* If the linkText looks like it's not the next page, skip it. */ 1035 /* If the linkText looks like it's not the next page, skip it. */
1030 if(linkText.match(readability.regexps.extraneous) || linkText.length > 25) { 1036 if(linkText.match(readability.regexps.extraneous) || linkText.length > 25) {
1031 continue; 1037 continue;
1032 } 1038 }
1033 1039
1034 /* If the leftovers of the URL after removing the base URL don't con tain any digits, it's certainly not a next page link. */ 1040 /* If the leftovers of the URL after removing the base URL don't con tain any digits, it's certainly not a next page link. */
1035 var linkHrefLeftover = linkHref.replace(articleBaseUrl, ''); 1041 var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');
1036 if(!linkHrefLeftover.match(/\d/)) { 1042 if(!linkHrefLeftover.match(/\d/)) {
1037 continue; 1043 continue;
1038 } 1044 }
1039 1045
1040 if(!(linkHref in possiblePages)) { 1046 if(!(linkHref in possiblePages)) {
1041 possiblePages[linkHref] = {"score": 0, "linkText": linkText, "hr ef": linkHref}; 1047 possiblePages[linkHref] = {"score": 0, "linkText": linkText, "hr ef": linkHref};
1042 } else { 1048 } else {
1043 possiblePages[linkHref].linkText += ' | ' + linkText; 1049 possiblePages[linkHref].linkText += ' | ' + linkText;
1044 } 1050 }
1045 1051
1046 var linkObj = possiblePages[linkHref]; 1052 var linkObj = possiblePages[linkHref];
1047 1053
1048 /** 1054 /**
1049 * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower. 1055 * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower.
1050 * Example: http://www.actionscript.org/resources/articles/745/1/Jav aScript-and-VBScript-Injection-in-ActionScript-3/Page1.html 1056 * Example: http://www.actionscript.org/resources/articles/745/1/Jav aScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
1051 **/ 1057 **/
1052 if(linkHref.indexOf(articleBaseUrl) !== 0) { 1058 if(linkHref.indexOf(articleBaseUrl) !== 0) {
1053 linkObj.score -= 25; 1059 linkObj.score -= 25;
1054 } 1060 }
1055 1061
1056 var linkData = linkText + ' ' + link.className + ' ' + link.id; 1062 var linkData = linkText + ' ' + link.className + ' ' + link.id;
1057 if(linkData.match(readability.regexps.nextLink)) { 1063 if(linkData.match(readability.regexps.nextLink)) {
1058 linkObj.score += 50; 1064 linkObj.score += 50;
1059 } 1065 }
1060 if(linkData.match(/pag(e|ing|inat)/i)) { 1066 if(linkData.match(/pag(e|ing|inat)/i)) {
1061 linkObj.score += 25; 1067 linkObj.score += 25;
1062 } 1068 }
1063 if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text, 1069 if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text,
1064 /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */ 1070 /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */
1065 if(!linkObj.linkText.match(readability.regexps.nextLink)) { 1071 if(!linkObj.linkText.match(readability.regexps.nextLink)) {
1066 linkObj.score -= 65; 1072 linkObj.score -= 65;
1067 } 1073 }
1068 } 1074 }
1069 if(linkData.match(readability.regexps.negative) || linkData.match(re adability.regexps.extraneous)) { 1075 if(linkData.match(readability.regexps.negative) || linkData.match(re adability.regexps.extraneous)) {
1070 linkObj.score -= 50; 1076 linkObj.score -= 50;
1071 } 1077 }
1072 if(linkData.match(readability.regexps.prevLink)) { 1078 if(linkData.match(readability.regexps.prevLink)) {
1073 linkObj.score -= 200; 1079 linkObj.score -= 200;
1074 } 1080 }
1075 1081
1076 /* If a parentNode contains page or paging or paginat */ 1082 /* If a parentNode contains page or paging or paginat */
1077 var parentNode = link.parentNode, 1083 var parentNode = link.parentNode,
1078 positiveNodeMatch = false, 1084 positiveNodeMatch = false,
1079 negativeNodeMatch = false; 1085 negativeNodeMatch = false;
1080 while(parentNode) { 1086 while(parentNode) {
1081 var parentNodeClassAndId = parentNode.className + ' ' + parentNo de.id; 1087 var parentNodeClassAndId = parentNode.className + ' ' + parentNo de.id;
1082 if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(/pag(e|ing|inat)/i)) { 1088 if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(/pag(e|ing|inat)/i)) {
1083 positiveNodeMatch = true; 1089 positiveNodeMatch = true;
1084 linkObj.score += 25; 1090 linkObj.score += 25;
1085 } 1091 }
1086 if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(readability.regexps.negative)) { 1092 if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(readability.regexps.negative)) {
1087 /* If this is just something like "footer", give it a negati ve. If it's something like "body-and-footer", leave it be. */ 1093 /* If this is just something like "footer", give it a negati ve. If it's something like "body-and-footer", leave it be. */
1088 if(!parentNodeClassAndId.match(readability.regexps.positive) ) { 1094 if(!parentNodeClassAndId.match(readability.regexps.positive) ) {
1089 linkObj.score -= 25; 1095 linkObj.score -= 25;
1090 negativeNodeMatch = true; 1096 negativeNodeMatch = true;
1091 } 1097 }
1092 } 1098 }
1093 1099
1094 parentNode = parentNode.parentNode; 1100 parentNode = parentNode.parentNode;
1095 } 1101 }
1096 1102
1097 /** 1103 /**
1098 * If the URL looks like it has paging in it, add to the score. 1104 * If the URL looks like it has paging in it, add to the score.
1099 * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34 1105 * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34
1100 **/ 1106 **/
1101 if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) { 1107 if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) {
1102 linkObj.score += 25; 1108 linkObj.score += 25;
1103 } 1109 }
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
1145 topPage = possiblePages[page]; 1151 topPage = possiblePages[page];
1146 } 1152 }
1147 } 1153 }
1148 } 1154 }
1149 1155
1150 if(topPage) { 1156 if(topPage) {
1151 var nextHref = topPage.href.replace(/\/$/,''); 1157 var nextHref = topPage.href.replace(/\/$/,'');
1152 1158
1153 dbg('NEXT PAGE IS ' + nextHref); 1159 dbg('NEXT PAGE IS ' + nextHref);
1154 readability.parsedPages[nextHref] = true; 1160 readability.parsedPages[nextHref] = true;
1155 return nextHref; 1161 return nextHref;
1156 } 1162 }
1157 else { 1163 else {
1158 return null; 1164 return null;
1159 } 1165 }
1160 }, 1166 },
1161 1167
1162 createLinkDiv: function(link) { 1168 createLinkDiv: function(link) {
1163 var divNode = document.createElement('div'); 1169 var divNode = document.createElement('div');
1164 var aNode = document.createElement('a'); 1170 var aNode = document.createElement('a');
1165 var tNode = document.createTextNode('View Next Page'); 1171 var tNode = document.createTextNode('View Next Page');
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after
1197 } 1203 }
1198 else { 1204 else {
1199 if (options.error) { options.error(request); } 1205 if (options.error) { options.error(request); }
1200 } 1206 }
1201 } 1207 }
1202 } 1208 }
1203 1209
1204 if (typeof options === 'undefined') { options = {}; } 1210 if (typeof options === 'undefined') { options = {}; }
1205 1211
1206 request.onreadystatechange = respondToReadyState; 1212 request.onreadystatechange = respondToReadyState;
1207 1213
1208 request.open('get', url, true); 1214 request.open('get', url, true);
1209 request.setRequestHeader('Accept', 'text/html'); 1215 request.setRequestHeader('Accept', 'text/html');
1210 1216
1211 try { 1217 try {
1212 request.send(options.postBody); 1218 request.send(options.postBody);
1213 } 1219 }
1214 catch (e) { 1220 catch (e) {
1215 if (options.error) { options.error(); } 1221 if (options.error) { options.error(); }
1216 } 1222 }
1217 1223
(...skipping 14 matching lines...) Expand all
1232 articlePage.innerHTML = '<p class="page-separator" title="Page ' + reada bility.curPageNum + '">&sect;</p>'; 1238 articlePage.innerHTML = '<p class="page-separator" title="Page ' + reada bility.curPageNum + '">&sect;</p>';
1233 1239
1234 document.getElementById("readability-content").appendChild(articlePage); 1240 document.getElementById("readability-content").appendChild(articlePage);
1235 1241
1236 if(readability.curPageNum > readability.maxPages) { 1242 if(readability.curPageNum > readability.maxPages) {
1237 var linkDiv = readability.createLinkDiv(nextPageLink); 1243 var linkDiv = readability.createLinkDiv(nextPageLink);
1238 1244
1239 articlePage.appendChild(linkDiv); 1245 articlePage.appendChild(linkDiv);
1240 return; 1246 return;
1241 } 1247 }
1242 1248
1243 /** 1249 /**
1244 * Now that we've built the article page DOM element, get the page conte nt 1250 * Now that we've built the article page DOM element, get the page conte nt
1245 * asynchronously and load the cleaned content into the div we created f or it. 1251 * asynchronously and load the cleaned content into the div we created f or it.
1246 **/ 1252 **/
1247 (function(pageUrl, thisPage) { 1253 (function(pageUrl, thisPage) {
1248 readability.ajax(pageUrl, { 1254 readability.ajax(pageUrl, {
1249 success: function(r) { 1255 success: function(r) {
1250 1256
1251 /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */ 1257 /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */
1252 var eTag = r.getResponseHeader('ETag'); 1258 var eTag = r.getResponseHeader('ETag');
1253 if(eTag) { 1259 if(eTag) {
1254 if(eTag in readability.pageETags) { 1260 if(eTag in readability.pageETags) {
1255 dbg("Exact duplicate page found via ETag. Aborting." ); 1261 dbg("Exact duplicate page found via ETag. Aborting." );
1256 articlePage.style.display = 'none'; 1262 articlePage.style.display = 'none';
1257 return; 1263 return;
1258 } else { 1264 } else {
1259 readability.pageETags[eTag] = 1; 1265 readability.pageETags[eTag] = 1;
1260 } 1266 }
1261 } 1267 }
1262 1268
1263 // TODO: this ends up doubling up page numbers on NYTimes ar ticles. Need to generically parse those away. 1269 // TODO: this ends up doubling up page numbers on NYTimes ar ticles. Need to generically parse those away.
1264 var page = document.createElement("DIV"); 1270 var page = document.createElement("DIV");
1265 1271
1266 /** 1272 /**
1267 * Do some preprocessing to our HTML to make it ready for ap pending. 1273 * Do some preprocessing to our HTML to make it ready for ap pending.
1268 * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript. 1274 * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript.
1269 * • Turn any noscript tags into divs so that we can parse t hem. This allows us to find any next page links hidden via javascript. 1275 * • Turn any noscript tags into divs so that we can parse t hem. This allows us to find any next page links hidden via javascript.
1270 * • Turn all double br's into p's - was handled by prepDocu ment in the original view. 1276 * • Turn all double br's into p's - was handled by prepDocu ment in the original view.
(...skipping 30 matching lines...) Expand all
1301 for(var i=1; i <= readability.curPageNum; i+=1) { 1307 for(var i=1; i <= readability.curPageNum; i+=1) {
1302 var rPage = document.getElementById('readability-pag e-' + i); 1308 var rPage = document.getElementById('readability-pag e-' + i);
1303 if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML ) !== -1) { 1309 if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML ) !== -1) {
1304 dbg('Duplicate of page ' + i + ' - skipping.'); 1310 dbg('Duplicate of page ' + i + ' - skipping.');
1305 articlePage.style.display = 'none'; 1311 articlePage.style.display = 'none';
1306 readability.parsedPages[pageUrl] = true; 1312 readability.parsedPages[pageUrl] = true;
1307 return; 1313 return;
1308 } 1314 }
1309 } 1315 }
1310 } 1316 }
1311 1317
1312 readability.removeScripts(content); 1318 readability.removeScripts(content);
1313 1319
1314 readability.moveNodeInnards(content, thisPage); 1320 readability.moveNodeInnards(content, thisPage);
1315 1321
1316 /** 1322 /**
1317 * After the page has rendered, post process the content. Th is delay is necessary because, 1323 * After the page has rendered, post process the content. Th is delay is necessary because,
1318 * in webkit at least, offsetWidth is not set in time to det ermine image width. We have to 1324 * in webkit at least, offsetWidth is not set in time to det ermine image width. We have to
1319 * wait a little bit for reflow to finish before we can fix floating images. 1325 * wait a little bit for reflow to finish before we can fix floating images.
1320 **/ 1326 **/
1321 window.setTimeout( 1327 window.setTimeout(
1322 function() { readability.postProcessContent(thisPage); } , 1328 function() { readability.postProcessContent(thisPage); } ,
1323 500 1329 500
1324 ); 1330 );
1325 1331
1326 if(nextPageLink) { 1332 if(nextPageLink) {
1327 readability.appendNextPage(nextPageLink); 1333 readability.appendNextPage(nextPageLink);
1328 } 1334 }
1329 } 1335 }
1330 }); 1336 });
1331 }(nextPageLink, articlePage)); 1337 }(nextPageLink, articlePage));
1332 }, 1338 },
1333 1339
1334 /** 1340 /**
1335 * Get an elements class/id weight. Uses regular expressions to tell if this 1341 * Get an elements class/id weight. Uses regular expressions to tell if this
1336 * element looks good or bad. 1342 * element looks good or bad.
1337 * 1343 *
1338 * @param Element 1344 * @param Element
1339 * @return number (Integer) 1345 * @return number (Integer)
1340 **/ 1346 **/
1341 getClassWeight: function (e) { 1347 getClassWeight: function (e) {
1342 if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { 1348 if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
1343 return 0; 1349 return 0;
1344 } 1350 }
1345 1351
(...skipping 29 matching lines...) Expand all
1375 /** 1381 /**
1376 * Remove extraneous break tags from a node. 1382 * Remove extraneous break tags from a node.
1377 * 1383 *
1378 * @param Element 1384 * @param Element
1379 * @return void 1385 * @return void
1380 **/ 1386 **/
1381 killBreaks: function (e) { 1387 killBreaks: function (e) {
1382 var allElements = e.getElementsByTagName('*'); 1388 var allElements = e.getElementsByTagName('*');
1383 while (i < allElements.length) { 1389 while (i < allElements.length) {
1384 readability.deleteExtraBreaks(allElements[i]); 1390 readability.deleteExtraBreaks(allElements[i]);
1385 i++; 1391 i++;
1386 } 1392 }
1387 }, 1393 },
1388 1394
1389 /** 1395 /**
1390 * Clean a node of all elements of type "tag". 1396 * Clean a node of all elements of type "tag".
1391 * (Unless it's a youtube/vimeo video. People love movies.) 1397 * (Unless it's a youtube/vimeo video. People love movies.)
1392 * 1398 *
1393 * @param Element 1399 * @param Element
1394 * @param string tag to clean 1400 * @param string tag to clean
1395 * @return void 1401 * @return void
1396 **/ 1402 **/
1397 clean: function (e, tag) { 1403 clean: function (e, tag) {
1398 var targetList = e.getElementsByTagName( tag ); 1404 var targetList = e.getElementsByTagName( tag );
1399 var isEmbed = (tag === 'object' || tag === 'embed'); 1405 var isEmbed = (tag === 'object' || tag === 'embed');
1400 1406
1401 for (var y=targetList.length-1; y >= 0; y-=1) { 1407 for (var y=targetList.length-1; y >= 0; y-=1) {
1402 /* Allow youtube and vimeo videos through as people usually want to see those. */ 1408 /* Allow youtube and vimeo videos through as people usually want to see those. */
1403 if(isEmbed) { 1409 if(isEmbed) {
1404 var attributeValues = ""; 1410 var attributeValues = "";
1405 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) { 1411 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) {
1406 attributeValues += targetList[y].attributes[i].value + '|'; 1412 attributeValues += targetList[y].attributes[i].value + '|';
1407 } 1413 }
1408 1414
1409 /* First, check the elements attributes to see if any of them co ntain youtube or vimeo */ 1415 /* First, check the elements attributes to see if any of them co ntain youtube or vimeo */
1410 if (attributeValues.search(readability.regexps.videos) !== -1) { 1416 if (attributeValues.search(readability.regexps.videos) !== -1) {
1411 continue; 1417 continue;
1412 } 1418 }
1413 1419
1414 /* Then check the elements inside this element for the same. */ 1420 /* Then check the elements inside this element for the same. */
1415 if (targetList[y].innerHTML.search(readability.regexps.videos) ! == -1) { 1421 if (targetList[y].innerHTML.search(readability.regexps.videos) ! == -1) {
1416 continue; 1422 continue;
1417 } 1423 }
1418 1424
1419 } 1425 }
1420 1426
1421 targetList[y].parentNode.removeChild(targetList[y]); 1427 targetList[y].parentNode.removeChild(targetList[y]);
1422 } 1428 }
1423 }, 1429 },
1424 1430
1425 /** 1431 /**
1426 * Clean an element of all tags of type "tag" if they look fishy. 1432 * Clean an element of all tags of type "tag" if they look fishy.
1427 * "Fishy" is an algorithm based on content length, classnames, link density , number of images & embeds, etc. 1433 * "Fishy" is an algorithm based on content length, classnames, link density , number of images & embeds, etc.
1428 * 1434 *
1429 * @return void 1435 * @return void
1430 **/ 1436 **/
1431 cleanConditionally: function (e, tag) { 1437 cleanConditionally: function (e, tag) {
1432 1438
1433 if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) { 1439 if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {
1434 return; 1440 return;
1435 } 1441 }
1436 1442
1437 var tagsList = e.getElementsByTagName(tag); 1443 var tagsList = e.getElementsByTagName(tag);
1438 var curTagsLength = tagsList.length; 1444 var curTagsLength = tagsList.length;
1439 1445
1440 /** 1446 /**
1441 * Gather counts for other typical elements embedded within. 1447 * Gather counts for other typical elements embedded within.
1442 * Traverse backwards so we can remove nodes at the same time without ef fecting the traversal. 1448 * Traverse backwards so we can remove nodes at the same time without ef fecting the traversal.
1443 * 1449 *
1444 * TODO: Consider taking into account original contentScore here. 1450 * TODO: Consider taking into account original contentScore here.
1445 **/ 1451 **/
1446 for (var i=curTagsLength-1; i >= 0; i-=1) { 1452 for (var i=curTagsLength-1; i >= 0; i-=1) {
1447 var weight = readability.getClassWeight(tagsList[i]); 1453 var weight = readability.getClassWeight(tagsList[i]);
1448 var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0; 1454 var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0;
1449 1455
1450 dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].cla ssName + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'unde fined') ? (" with score " + tagsList[i].readability.contentScore) : '')); 1456 dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].cla ssName + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'unde fined') ? (" with score " + tagsList[i].readability.contentScore) : ''));
1451 1457
1452 if(weight+contentScore < 0) 1458 if(weight+contentScore < 0)
1453 { 1459 {
1454 tagsList[i].parentNode.removeChild(tagsList[i]); 1460 tagsList[i].parentNode.removeChild(tagsList[i]);
1455 } 1461 }
1456 else if ( readability.getCharCount(tagsList[i],',') < 10) { 1462 else if ( readability.getCharCount(tagsList[i],',') < 10) {
1457 /** 1463 /**
1458 * If there are not very many commas, and the number of 1464 * If there are not very many commas, and the number of
1459 * non-paragraph elements is more than paragraphs or other omino us signs, remove the element. 1465 * non-paragraph elements is more than paragraphs or other omino us signs, remove the element.
1460 **/ 1466 **/
1461 var p = tagsList[i].getElementsByTagName("p").length; 1467 var p = tagsList[i].getElementsByTagName("p").length;
1462 var img = tagsList[i].getElementsByTagName("img").length; 1468 var img = tagsList[i].getElementsByTagName("img").length;
1463 var li = tagsList[i].getElementsByTagName("li").length-100; 1469 var li = tagsList[i].getElementsByTagName("li").length-100;
1464 var input = tagsList[i].getElementsByTagName("input").length; 1470 var input = tagsList[i].getElementsByTagName("input").length;
1465 1471
1466 var embedCount = 0; 1472 var embedCount = 0;
1467 var embeds = tagsList[i].getElementsByTagName("embed"); 1473 var embeds = tagsList[i].getElementsByTagName("embed");
1468 for(var ei=0,il=embeds.length; ei < il; ei+=1) { 1474 for(var ei=0,il=embeds.length; ei < il; ei+=1) {
1469 if (embeds[ei].src.search(readability.regexps.videos) === -1 ) { 1475 if (embeds[ei].src.search(readability.regexps.videos) === -1 ) {
1470 embedCount+=1; 1476 embedCount+=1;
1471 } 1477 }
1472 } 1478 }
1473 1479
1474 var linkDensity = readability.getLinkDensity(tagsList[i]); 1480 var linkDensity = readability.getLinkDensity(tagsList[i]);
1475 var contentLength = readability.getInnerText(tagsList[i]).length ; 1481 var contentLength = readability.getInnerText(tagsList[i]).length ;
1476 var toRemove = false; 1482 var toRemove = false;
1477 1483
1478 if ( img > p ) { 1484 if ( img > p ) {
1479 toRemove = true; 1485 toRemove = true;
1480 } else if(li > p && tag !== "ul" && tag !== "ol") { 1486 } else if(li > p && tag !== "ul" && tag !== "ol") {
1481 toRemove = true; 1487 toRemove = true;
1482 } else if( input > Math.floor(p/3) ) { 1488 } else if( input > Math.floor(p/3) ) {
1483 toRemove = true; 1489 toRemove = true;
1484 } else if(contentLength < 25 && (img === 0 || img > 2) ) { 1490 } else if(contentLength < 25 && (img === 0 || img > 2) ) {
1485 toRemove = true; 1491 toRemove = true;
1486 } else if(weight < 25 && linkDensity > 0.2) { 1492 } else if(weight < 25 && linkDensity > 0.2) {
1487 toRemove = true; 1493 toRemove = true;
1488 } else if(weight >= 25 && linkDensity > 0.5) { 1494 } else if(weight >= 25 && linkDensity > 0.5) {
1489 toRemove = true; 1495 toRemove = true;
1490 } else if((embedCount === 1 && contentLength < 75) || embedCount > 1) { 1496 } else if((embedCount === 1 && contentLength < 75) || embedCount > 1) {
1491 toRemove = true; 1497 toRemove = true;
1492 } 1498 }
1493 1499
(...skipping 21 matching lines...) Expand all
1515 } 1521 }
1516 }, 1522 },
1517 1523
1518 flagIsActive: function(flag) { 1524 flagIsActive: function(flag) {
1519 return (readability.flags & flag) > 0; 1525 return (readability.flags & flag) > 0;
1520 }, 1526 },
1521 1527
1522 addFlag: function(flag) { 1528 addFlag: function(flag) {
1523 readability.flags = readability.flags | flag; 1529 readability.flags = readability.flags | flag;
1524 }, 1530 },
1525 1531
1526 removeFlag: function(flag) { 1532 removeFlag: function(flag) {
1527 readability.flags = readability.flags & ~flag; 1533 readability.flags = readability.flags & ~flag;
1528 }, 1534 },
1529 1535
1530 // Removes the children of |src| and appends them to |dest|. 1536 // Removes the children of |src| and appends them to |dest|.
1531 moveNodeInnards: function(src, dest) { 1537 moveNodeInnards: function(src, dest) {
1532 try { 1538 try {
1533 while (src.firstChild) { 1539 while (src.firstChild) {
1534 dest.appendChild(src.removeChild(src.firstChild)); 1540 dest.appendChild(src.removeChild(src.firstChild));
1535 } 1541 }
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after
1584 var lastBr = readability.isMultipleBr(node, false); 1590 var lastBr = readability.isMultipleBr(node, false);
1585 var ret = false; 1591 var ret = false;
1586 while (lastBr && lastBr != node) { 1592 while (lastBr && lastBr != node) {
1587 var toRemove = lastBr; 1593 var toRemove = lastBr;
1588 lastBr = lastBr.previousSibling; 1594 lastBr = lastBr.previousSibling;
1589 toRemove.parentNode.removeChild(toRemove); 1595 toRemove.parentNode.removeChild(toRemove);
1590 ret = true; 1596 ret = true;
1591 } 1597 }
1592 return ret; 1598 return ret;
1593 }, 1599 },
1594 1600
1595 // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a 1601 // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a
1596 // <P> node, and makes all next siblings of that pair children of <P>, up 1602 // <P> node, and makes all next siblings of that pair children of <P>, up
1597 // until the next pair of <BR> nodes is reached. 1603 // until the next pair of <BR> nodes is reached.
1598 replaceDoubleBrWithP: function(node) { 1604 replaceDoubleBrWithP: function(node) {
1599 // Check that we are starting with a BR. 1605 // Check that we are starting with a BR.
1600 var second = readability.isMultipleBr(node, true); 1606 var second = readability.isMultipleBr(node, true);
1601 if (!second) { 1607 if (!second) {
1602 return; 1608 return;
1603 } 1609 }
1604 // Make all next siblings of the second BR into children of a P. 1610 // Make all next siblings of the second BR into children of a P.
1605 var p = document.createElement('p'); 1611 var p = document.createElement('p');
1606 var curr = second.nextSibling; 1612 var curr = second.nextSibling;
1607 while (curr) { 1613 while (curr) {
1608 if (readability.isMultipleBr(curr, true)) { 1614 if (readability.isMultipleBr(curr, true)) {
1609 break; 1615 break;
1610 } 1616 }
1611 var next = curr.nextSibling; 1617 var next = curr.nextSibling;
1612 p.appendChild(curr.parentNode.removeChild(curr)); 1618 p.appendChild(curr.parentNode.removeChild(curr));
1613 curr = next; 1619 curr = next;
1614 } 1620 }
1615 var ret = curr; 1621 var ret = curr;
1616 1622
1617 // Remove all nodes between the first and second BR. 1623 // Remove all nodes between the first and second BR.
1618 curr = node.nextSibling; 1624 curr = node.nextSibling;
1619 while (curr && curr != second) { 1625 while (curr && curr != second) {
1620 var next = curr.nextSibling; 1626 var next = curr.nextSibling;
1621 curr.parentNode.removeChild(curr); 1627 curr.parentNode.removeChild(curr);
1622 curr = next; 1628 curr = next;
1623 } 1629 }
1624 // Remove the second BR. 1630 // Remove the second BR.
1625 second.parentNode.removeChild(second); 1631 second.parentNode.removeChild(second);
1626 // Replace the first BR with the P. 1632 // Replace the first BR with the P.
1627 node.parentNode.replaceChild(p, node); 1633 node.parentNode.replaceChild(p, node);
1628 1634
1629 return ret; 1635 return ret;
1630 }, 1636 },
1631 1637
1632 // Returns true if the NodeList contains a double <BR>. 1638 // Returns true if the NodeList contains a double <BR>.
1633 hasDoubleBr: function(nodeList) { 1639 hasDoubleBr: function(nodeList) {
1634 for (var i = 0; i < nodeList.length; nodeList++) { 1640 for (var i = 0; i < nodeList.length; nodeList++) {
1635 if (readability.isMultipleBr(nodeList[i], true)) { 1641 if (readability.isMultipleBr(nodeList[i], true)) {
1636 return true; 1642 return true;
1637 } 1643 }
1638 } 1644 }
1639 return false; 1645 return false;
1640 }, 1646 },
1641 1647
1642 // Replaces double <BR> tags with <P> tags. 1648 // Replaces double <BR> tags with <P> tags.
1643 replaceDoubleBrsWithPs: function(node) { 1649 replaceDoubleBrsWithPs: function(node) {
1644 var allElements = node.getElementsByTagName('BR'); 1650 var allElements = node.getElementsByTagName('BR');
1645 var node = null; 1651 var node = null;
1646 while (allElements && allElements.length > 0 && 1652 while (allElements && allElements.length > 0 &&
1647 readability.hasDoubleBr(allElements)) { 1653 readability.hasDoubleBr(allElements)) {
1648 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) { 1654 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) {
1649 var next = node; 1655 var next = node;
1650 while (next = readability.replaceDoubleBrWithP(next)); 1656 while (next = readability.replaceDoubleBrWithP(next));
1651 } 1657 }
1652 allElements = document.body.getElementsByTagName('BR'); 1658 allElements = document.body.getElementsByTagName('BR');
1653 } 1659 }
1654 }, 1660 },
1655 1661
1656 1662
1657 // Replaces a BR and the whitespace that follows it with a P. 1663 // Replaces a BR and the whitespace that follows it with a P.
1658 replaceBrWithP: function(node) { 1664 replaceBrWithP: function(node) {
1659 if (!readability.isBrNode(node)) { 1665 if (!readability.isBrNode(node)) {
1660 return; 1666 return;
1661 } 1667 }
1662 var p = document.createElement('p'); 1668 var p = document.createElement('p');
1663 var curr = node.nextSibling; 1669 var curr = node.nextSibling;
1664 while (curr && !isBrNode(curr)) { 1670 while (curr && !isBrNode(curr)) {
1665 var next = curr.nextSibling; 1671 var next = curr.nextSibling;
1666 if (readability.isWhitespaceNode(curr)) { 1672 if (readability.isWhitespaceNode(curr)) {
1667 curr.parentNode.removeChild(curr); 1673 curr.parentNode.removeChild(curr);
1668 } else { 1674 } else {
1669 p.appendChild(curr.parentNode.removeChild(curr)); 1675 p.appendChild(curr.parentNode.removeChild(curr));
1670 } 1676 }
1671 curr = next; 1677 curr = next;
1672 } 1678 }
1673 node.parentNode.replaceChild(p, node); 1679 node.parentNode.replaceChild(p, node);
1674 return curr; 1680 return curr;
1675 }, 1681 },
1676 1682
1677 // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> t ag 1683 // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> t ag
1678 // children of the <P>. 1684 // children of the <P>.
1679 replaceBrsWithPs: function(node) { 1685 replaceBrsWithPs: function(node) {
1680 var allElements = node.getElementsByTagName('BR'); 1686 var allElements = node.getElementsByTagName('BR');
1681 var node = null; 1687 var node = null;
1682 while (allElements && allElements.length > 0) { 1688 while (allElements && allElements.length > 0) {
1683 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) { 1689 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) {
1684 var next = node; 1690 var next = node;
1685 while (next = readability.replaceBrWithP(next)); 1691 while (next = readability.replaceBrWithP(next));
1686 } 1692 }
1687 allElements = document.body.getElementsByTagName('BR'); 1693 allElements = document.body.getElementsByTagName('BR');
1688 } 1694 }
1689 }, 1695 },
1690 1696
1691 // Replaces any tag with any other tag. 1697 // Replaces any tag with any other tag.
1692 replaceTagsWithTags: function(node, srcTag, destTag) { 1698 replaceTagsWithTags: function(node, srcTag, destTag) {
1693 var allElements = node.getElementsByTagName(srcTag); 1699 var allElements = node.getElementsByTagName(srcTag);
1694 for (var i = 0; i < allElements.length; i++) { 1700 for (var i = 0; i < allElements.length; i++) {
1695 var dest = document.createElement(destTag); 1701 var dest = document.createElement(destTag);
1696 readability.moveNodeInnards(allElements[i], dest); 1702 readability.moveNodeInnards(allElements[i], dest);
1697 node.replaceNode(dest, allElements[i]); 1703 allElements[i].parentNode.replaceChild(dest, allElements[i]);
1698 } 1704 }
1699 }, 1705 },
1700 1706
1701 // Replaces all <noscript> tags with <p> tags. 1707 // Replaces all <noscript> tags with <p> tags.
1702 replaceNoscriptsWithPs: function(node) { 1708 replaceNoscriptsWithPs: function(node) {
1703 readability.replaceTagsWithTags(node, 'noscript', 'p'); 1709 readability.replaceTagsWithTags(node, 'noscript', 'p');
1704 }, 1710 },
1705 1711
1706 // Replaces all <font> tags with <span> tags. 1712 // Replaces all <font> tags with <span> tags.
1707 replaceFontsWithSpans: function(node) { 1713 replaceFontsWithSpans: function(node) {
1708 readability.replaceTagsWithTags(node, 'font', 'span'); 1714 readability.replaceTagsWithTags(node, 'font', 'span');
1709 }, 1715 },
1710 1716
1711 // Returns a list of image URLs in the distilled article. 1717 // Returns a list of image URLs in the distilled article.
1712 getImages : function() { 1718 getImages : function() {
1713 var images = document.getElementsByTagName('img'); 1719 var images = document.getElementsByTagName('img');
1714 var result = new Array(images.length); 1720 var result = new Array(images.length);
1715 dbg("Number of images: " + images.length); 1721 dbg("Number of images: " + images.length);
1716 for(i = 0; i < images.length; i++) { 1722 for(i = 0; i < images.length; i++) {
1717 result[i] = images[i].src; 1723 result[i] = images[i].src;
1718 dbg("Image: " + result[i]); 1724 dbg("Image: " + result[i]);
1719 } 1725 }
1720 return result; 1726 return result;
1721 }, 1727 },
1722 1728
1723 // Returns the distilled article HTML from the page(s). 1729 // Returns the distilled article HTML from the page(s).
1724 getDistilledArticleHTML : function() { 1730 getDistilledArticleHTML : function() {
1725 return readability.distilledHTML; 1731 return readability.distilledHTML;
1732 },
1733
1734 // Returns the next page of this article.
1735 getNextPageLink : function() {
1736 return readability.nextPageLink;
1726 } 1737 }
1727 }; 1738 };
1728 1739
// Extracts long-form content from a page. The script evaluates to an array
// laid out as follows:
//   [0]   the article title
//   [1]   HTML containing the long-form content
//   [2]   the next page link, as reported by readability.getNextPageLink()
//   [3..] URLs for the images returned by readability.getImages()
// NOTE(review): this comment used to say each <img> id is k - 2 for the URL
// at index k; that was written before the next-page slot was added, so
// confirm whether the offset is now k - 3.
(function () {
    readability.init();
    var title = readability.getArticleTitle();
    var html = readability.getDistilledArticleHTML();
    var nextPage = readability.getNextPageLink();
    return [title, html, nextPage].concat(readability.getImages());
}())
1741 1753
OLDNEW
« no previous file with comments | « third_party/readability/README.chromium ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698