OLD | NEW |
| 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 // Local modifications to this file are described in the README.chromium |
| 6 // file. |
1 | 7 |
2 var dbg = (typeof console !== 'undefined') ? function(s) { | 8 var dbg = (typeof console !== 'undefined') ? function(s) { |
3 console.log("Readability: " + s); | 9 console.log("Readability: " + s); |
4 } : function() {}; | 10 } : function() {}; |
5 | 11 |
6 /* | 12 /* |
7 * Readability. An Arc90 Lab Experiment. | 13 * Readability. An Arc90 Lab Experiment. |
8 * Website: http://lab.arc90.com/experiments/readability | 14 * Website: http://lab.arc90.com/experiments/readability |
9 * Source: http://code.google.com/p/arc90labs-readability | 15 * Source: http://code.google.com/p/arc90labs-readability |
10 * | 16 * |
11 * "Readability" is a trademark of Arc90 Inc and may not be used without explici
t permission. | 17 * "Readability" is a trademark of Arc90 Inc and may not be used without explici
t permission. |
12 * | 18 * |
13 * Copyright (c) 2010 Arc90 Inc | 19 * Copyright (c) 2010 Arc90 Inc |
14 * Readability is licensed under the Apache License, Version 2.0. | 20 * Readability is licensed under the Apache License, Version 2.0. |
15 **/ | 21 **/ |
16 var readability = { | 22 var readability = { |
17 readStyle: "style-newspaper", | 23 readStyle: "style-newspaper", |
18 readSize: "size-medium", | 24 readSize: "size-medium", |
19 readMargin: "margin-wide", | 25 readMargin: "margin-wide", |
20 | 26 |
21 distilledHTML: '', | 27 distilledHTML: '', |
22 distilledArticleContent: null, | 28 distilledArticleContent: null, |
| 29 nextPageLink: '', |
23 | 30 |
24 version: '1.7.1', | 31 version: '1.7.1', |
25 iframeLoads: 0, | 32 iframeLoads: 0, |
26 convertLinksToFootnotes: false, | 33 convertLinksToFootnotes: false, |
27 reversePageScroll: false, /* If they hold shift and hit space, scroll
up */ | 34 reversePageScroll: false, /* If they hold shift and hit space, scroll
up */ |
28 frameHack: false, /** | 35 frameHack: false, /** |
29 * The frame hack is to workaround a firefo
x bug where if you | 36 * The frame hack is to workaround a firefo
x bug where if you |
30 * pull content out of a frame and stick it
into the parent element, the scrollbar won't appear. | 37 * pull content out of a frame and stick it
into the parent element, the scrollbar won't appear. |
31 * So we fake a scrollbar in the wrapping d
iv. | 38 * So we fake a scrollbar in the wrapping d
iv. |
32 **/ | 39 **/ |
33 biggestFrame: false, | 40 biggestFrame: false, |
34 flags: 0x1 | 0x2 | 0x4, /* Start with all flags set. */ | 41 flags: 0x1 | 0x2 | 0x4, /* Start with all flags set. */ |
35 | 42 |
36 /* constants */ | 43 /* constants */ |
37 FLAG_STRIP_UNLIKELYS: 0x1, | 44 FLAG_STRIP_UNLIKELYS: 0x1, |
38 FLAG_WEIGHT_CLASSES: 0x2, | 45 FLAG_WEIGHT_CLASSES: 0x2, |
39 FLAG_CLEAN_CONDITIONALLY: 0x4, | 46 FLAG_CLEAN_CONDITIONALLY: 0x4, |
40 | 47 |
41 maxPages: 30, /* The maximum number of pages to loop through before we ca
ll it quits and just show a link. */ | 48 maxPages: 30, /* The maximum number of pages to loop through before we ca
ll it quits and just show a link. */ |
42 parsedPages: {}, /* The list of pages we've parsed in this call of readabili
ty, for autopaging. As a key store for easier searching. */ | 49 parsedPages: {}, /* The list of pages we've parsed in this call of readabili
ty, for autopaging. As a key store for easier searching. */ |
43 pageETags: {}, /* A list of the ETag headers of pages we've parsed, in cas
e they happen to match, we'll know it's a duplicate. */ | 50 pageETags: {}, /* A list of the ETag headers of pages we've parsed, in cas
e they happen to match, we'll know it's a duplicate. */ |
44 | 51 |
45 /** | 52 /** |
46 * All of the regular expressions in use within readability. | 53 * All of the regular expressions in use within readability. |
47 * Defined up here so we don't instantiate them repeatedly in loops. | 54 * Defined up here so we don't instantiate them repeatedly in loops. |
48 **/ | 55 **/ |
49 regexps: { | 56 regexps: { |
50 unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header
|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popu
p|tweet|twitter/i, | 57 unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header
|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popu
p|tweet|twitter/i, |
51 okMaybeItsACandidate: /and|article|body|column|main|shadow/i, | 58 okMaybeItsACandidate: /and|article|body|column|main|shadow/i, |
52 positive: /article|body|content|entry|hentry|main|page|pagi
nation|post|text|blog|story/i, | 59 positive: /article|body|content|entry|hentry|main|page|pagi
nation|post|text|blog|story/i, |
53 negative: /combx|comment|com-|contact|foot|footer|footnote|
masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopp
ing|tags|tool|widget/i, | 60 negative: /combx|comment|com-|contact|foot|footer|footnote|
masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopp
ing|tags|tool|widget/i, |
54 extraneous: /print|archive|comment|discuss|e[\-]?mail|share|r
eply|all|login|sign|single/i, | 61 extraneous: /print|archive|comment|discuss|e[\-]?mail|share|r
eply|all|login|sign|single/i, |
55 divToPElements: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, | 62 divToPElements: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, |
56 replaceBrs: /(<br[^>]*>[ \n\r\t]*){2,}/gi, | 63 replaceBrs: /(<br[^>]*>[ \n\r\t]*){2,}/gi, |
57 replaceFonts: /<(\/?)font[^>]*>/gi, | 64 replaceFonts: /<(\/?)font[^>]*>/gi, |
58 trim: /^\s+|\s+$/g, | 65 trim: /^\s+|\s+$/g, |
59 normalize: /\s{2,}/g, | 66 normalize: /\s{2,}/g, |
60 killBreaks: /(<br\s*\/?>(\s|&nbsp;?)*){1,}/g, | 67 killBreaks: /(<br\s*\/?>(\s|&nbsp;?)*){1,}/g, |
61 videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i, | 68 videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i, |
62 skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)
\s*$/i, | 69 skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)
\s*$/i, |
63 nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
// Match: next, continue, >, >>, » but not >|, »| as those usually mean last. | 70 nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
// Match: next, continue, >, >>, » but not >|, »| as those usually mean last. |
64 prevLink: /(prev|earl|old|new|<|«)/i | 71 prevLink: /(prev|earl|old|new|<|«)/i |
65 }, | 72 }, |
66 | 73 |
67 /** | 74 /** |
68 * Runs readability. | 75 * Runs readability. |
69 * | 76 * |
70 * Workflow: | 77 * Workflow: |
71 * 1. Prep the document by removing script tags, css, etc. | 78 * 1. Prep the document by removing script tags, css, etc. |
72 * 2. Build readability's DOM tree. | 79 * 2. Build readability's DOM tree. |
73 * 3. Grab the article content from the current dom tree. | 80 * 3. Grab the article content from the current dom tree. |
74 * 4. Replace the current DOM tree with the new one. | 81 * 4. Replace the current DOM tree with the new one. |
75 * 5. Read peacefully. | 82 * 5. Read peacefully. |
76 * | 83 * |
77 * @return void | 84 * @return void |
78 **/ | 85 **/ |
79 init: function() { | 86 init: function() { |
80 /* Before we do anything, remove all scripts that are not readability. *
/ | 87 /* Before we do anything, remove all scripts that are not readability. *
/ |
81 window.onload = window.onunload = function() {}; | 88 window.onload = window.onunload = function() {}; |
82 | 89 |
83 readability.removeScripts(document); | 90 readability.removeScripts(document); |
84 | 91 |
85 /* Make sure this document is added to the list of parsed pages first, s
o we don't double up on the first page */ | 92 /* Make sure this document is added to the list of parsed pages first, s
o we don't double up on the first page */ |
86 readability.parsedPages[window.location.href.replace(/\/$/, '')] = true; | 93 readability.parsedPages[window.location.href.replace(/\/$/, '')] = true; |
87 | 94 |
88 /* Pull out any possible next page link first */ | 95 /* Pull out any possible next page link first */ |
89 var nextPageLink = readability.findNextPageLink(document.body); | 96 readability.nextPageLink = readability.findNextPageLink(document.body); |
90 | 97 |
| 98 /* Next-page processing is handled from C++, so set nextPageLink to null. */ |
| 99 var nextPageLink = null; |
| 100 |
91 readability.prepDocument(); | 101 readability.prepDocument(); |
92 | 102 |
93 /* Build readability's DOM tree */ | 103 /* Build readability's DOM tree */ |
94 var overlay = document.createElement("DIV"); | 104 var overlay = document.createElement("DIV"); |
95 var innerDiv = document.createElement("DIV"); | 105 var innerDiv = document.createElement("DIV"); |
96 var articleTools = readability.getArticleTools(); | 106 var articleTools = readability.getArticleTools(); |
97 var articleTitleText = readability.getArticleTitle(); | 107 var articleTitleText = readability.getArticleTitle(); |
98 var articleContent = readability.grabArticle(); | 108 var articleContent = readability.grabArticle(); |
99 | 109 |
100 if(!articleContent) { | 110 if(!articleContent) { |
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
145 rootWarning.innerHTML = "<em>Readability</em> was intended for u
se on individual articles and not home pages. " + | 155 rootWarning.innerHTML = "<em>Readability</em> was intended for u
se on individual articles and not home pages. " + |
146 "If you'd like to try rendering this page anyway, <a onClick='ja
vascript:document.getElementById(\"readability-warning\").style.display=\"none\"
;document.getElementById(\"readability-content\").style.display=\"block\";'>clic
k here</a> to continue."; | 156 "If you'd like to try rendering this page anyway, <a onClick='ja
vascript:document.getElementById(\"readability-warning\").style.display=\"none\"
;document.getElementById(\"readability-content\").style.display=\"block\";'>clic
k here</a> to continue."; |
147 | 157 |
148 innerDiv.insertBefore( rootWarning, articleContent ); | 158 innerDiv.insertBefore( rootWarning, articleContent ); |
149 } | 159 } |
150 | 160 |
151 readability.postProcessContent(articleContent); | 161 readability.postProcessContent(articleContent); |
152 | 162 |
153 window.scrollTo(0, 0); | 163 window.scrollTo(0, 0); |
154 | 164 |
155 // TODO(bengr): Remove this assignment of null to nextPageLink when | |
156 // the processing of the next page link is safe. | |
157 nextPageLink = null; | |
158 | |
159 if (nextPageLink) { | 165 if (nextPageLink) { |
160 /** | 166 /** |
161 * Append any additional pages after a small timeout so that people | 167 * Append any additional pages after a small timeout so that people |
162 * can start reading without having to wait for this to finish proce
ssing. | 168 * can start reading without having to wait for this to finish proce
ssing. |
163 **/ | 169 **/ |
164 window.setTimeout(function() { | 170 window.setTimeout(function() { |
165 readability.appendNextPage(nextPageLink); | 171 readability.appendNextPage(nextPageLink); |
166 }, 500); | 172 }, 500); |
167 } | 173 } |
168 | 174 |
169 /** Smooth scrolling **/ | 175 /** Smooth scrolling **/ |
170 document.onkeydown = function(e) { | 176 document.onkeydown = function(e) { |
171 var code = (window.event) ? event.keyCode : e.keyCode; | 177 var code = (window.event) ? event.keyCode : e.keyCode; |
172 if (code === 16) { | 178 if (code === 16) { |
173 readability.reversePageScroll = true; | 179 readability.reversePageScroll = true; |
174 return; | 180 return; |
175 } | 181 } |
176 | 182 |
177 if (code === 32) { | 183 if (code === 32) { |
178 readability.curScrollStep = 0; | 184 readability.curScrollStep = 0; |
179 var windowHeight = window.innerHeight ? window.innerHeight : (do
cument.documentElement.clientHeight ? document.documentElement.clientHeight : do
cument.body.clientHeight); | 185 var windowHeight = window.innerHeight ? window.innerHeight : (do
cument.documentElement.clientHeight ? document.documentElement.clientHeight : do
cument.body.clientHeight); |
180 | 186 |
181 if(readability.reversePageScroll) { | 187 if(readability.reversePageScroll) { |
182 readability.scrollTo(readability.scrollTop(), readability.sc
rollTop() - (windowHeight - 50), 20, 10); | 188 readability.scrollTo(readability.scrollTop(), readability.sc
rollTop() - (windowHeight - 50), 20, 10); |
183 } | 189 } |
184 else { | 190 else { |
185 readability.scrollTo(readability.scrollTop(), readability.sc
rollTop() + (windowHeight - 50), 20, 10); | 191 readability.scrollTo(readability.scrollTop(), readability.sc
rollTop() + (windowHeight - 50), 20, 10); |
186 } | 192 } |
187 | 193 |
188 return false; | 194 return false; |
189 } | 195 } |
190 }; | 196 }; |
191 | 197 |
192 document.onkeyup = function(e) { | 198 document.onkeyup = function(e) { |
193 var code = (window.event) ? event.keyCode : e.keyCode; | 199 var code = (window.event) ? event.keyCode : e.keyCode; |
194 if (code === 16) { | 200 if (code === 16) { |
195 readability.reversePageScroll = false; | 201 readability.reversePageScroll = false; |
196 return; | 202 return; |
197 } | 203 } |
198 }; | 204 }; |
199 }, | 205 }, |
200 | 206 |
201 /** | 207 /** |
202 * Run any post-process modifications to article content as necessary. | 208 * Run any post-process modifications to article content as necessary. |
203 * | 209 * |
204 * @param Element | 210 * @param Element |
205 * @return void | 211 * @return void |
206 **/ | 212 **/ |
207 postProcessContent: function(articleContent) { | 213 postProcessContent: function(articleContent) { |
208 if(readability.convertLinksToFootnotes && !window.location.href.match(/w
ikipedia\.org/g)) { | 214 if(readability.convertLinksToFootnotes && !window.location.href.match(/w
ikipedia\.org/g)) { |
209 readability.addFootnotes(articleContent); | 215 readability.addFootnotes(articleContent); |
210 } | 216 } |
211 | 217 |
212 readability.fixImageFloats(articleContent); | 218 readability.fixImageFloats(articleContent); |
213 }, | 219 }, |
214 | 220 |
215 /** | 221 /** |
216 * Some content ends up looking ugly if the image is too large to be floated
. | 222 * Some content ends up looking ugly if the image is too large to be floated
. |
217 * If the image is wider than a threshold (currently 55%), no longer float i
t, | 223 * If the image is wider than a threshold (currently 55%), no longer float i
t, |
218 * center it instead. | 224 * center it instead. |
219 * | 225 * |
220 * @param Element | 226 * @param Element |
221 * @return void | 227 * @return void |
222 **/ | 228 **/ |
223 fixImageFloats: function (articleContent) { | 229 fixImageFloats: function (articleContent) { |
224 var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0.
55, | 230 var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0.
55, |
225 images = articleContent.getElementsByTagName('img'); | 231 images = articleContent.getElementsByTagName('img'); |
226 | 232 |
227 for(var i=0, il = images.length; i < il; i+=1) { | 233 for(var i=0, il = images.length; i < il; i+=1) { |
228 var image = images[i]; | 234 var image = images[i]; |
229 | 235 |
230 if(image.offsetWidth > imageWidthThreshold) { | 236 if(image.offsetWidth > imageWidthThreshold) { |
231 image.className += " blockImage"; | 237 image.className += " blockImage"; |
232 } | 238 } |
233 } | 239 } |
234 }, | 240 }, |
235 | 241 |
236 /** | 242 /** |
237 * Get the article tools Element that has buttons like reload, print. | 243 * Get the article tools Element that has buttons like reload, print. |
238 * | 244 * |
239 * @return Element | 245 * @return Element |
240 **/ | 246 **/ |
241 getArticleTools: function () { | 247 getArticleTools: function () { |
242 var articleTools = document.createElement("DIV"); | 248 var articleTools = document.createElement("DIV"); |
243 | 249 |
244 articleTools.id = "readTools"; | 250 articleTools.id = "readTools"; |
245 articleTools.innerHTML = | 251 articleTools.innerHTML = |
246 "<a href='#' onclick='return window.location.reload()' title='Reload
original page' id='reload-page'>Reload Original Page</a>" + | 252 "<a href='#' onclick='return window.location.reload()' title='Reload
original page' id='reload-page'>Reload Original Page</a>" + |
247 "<a href='#' onclick='javascript:window.print();' title='Print page'
id='print-page'>Print Page</a>" + | 253 "<a href='#' onclick='javascript:window.print();' title='Print page'
id='print-page'>Print Page</a>" + |
248 "<a href='#' onclick='readability.emailBox(); return false;' title='
Email page' id='email-page'>Email Page</a>"; | 254 "<a href='#' onclick='readability.emailBox(); return false;' title='
Email page' id='email-page'>Email Page</a>"; |
249 | 255 |
250 return articleTools; | 256 return articleTools; |
251 }, | 257 }, |
252 | 258 |
253 /** | 259 /** |
254 * returns the suggested direction of the string | 260 * returns the suggested direction of the string |
255 * | 261 * |
256 * @return "rtl" || "ltr" | 262 * @return "rtl" || "ltr" |
257 **/ | 263 **/ |
258 getSuggestedDirection: function(text) { | 264 getSuggestedDirection: function(text) { |
259 function sanitizeText() { | 265 function sanitizeText() { |
260 return text.replace(/@\w+/, ""); | 266 return text.replace(/@\w+/, ""); |
261 } | 267 } |
262 | 268 |
263 function countMatches(match) { | 269 function countMatches(match) { |
264 var matches = text.match(new RegExp(match, "g")); | 270 var matches = text.match(new RegExp(match, "g")); |
265 return matches !== null ? matches.length : 0; | 271 return matches !== null ? matches.length : 0; |
266 } | 272 } |
267 | 273 |
268 function isRTL() { | 274 function isRTL() { |
269 var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]"); | 275 var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]"); |
270 var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]"); | 276 var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]"); |
271 | 277 |
272 // if 20% of chars are Hebrew or Arabic then direction is rtl | 278 // if 20% of chars are Hebrew or Arabic then direction is rtl |
273 return (count_heb + count_arb) * 100 / text.length > 20; | 279 return (count_heb + count_arb) * 100 / text.length > 20; |
274 } | 280 } |
275 | 281 |
276 text = sanitizeText(text); | 282 text = sanitizeText(text); |
277 return isRTL() ? "rtl" : "ltr"; | 283 return isRTL() ? "rtl" : "ltr"; |
278 }, | 284 }, |
279 | 285 |
280 /** | 286 /** |
281 * Get the article title as a string. | 287 * Get the article title as a string. |
282 * | 288 * |
283 * @return string | 289 * @return string |
284 **/ | 290 **/ |
285 getArticleTitle: function () { | 291 getArticleTitle: function () { |
286 var curTitle = "", | 292 var curTitle = "", |
287 origTitle = ""; | 293 origTitle = ""; |
288 | 294 |
289 try { | 295 try { |
290 curTitle = origTitle = document.title; | 296 curTitle = origTitle = document.title; |
291 if(typeof curTitle !== "string") { /* If they had an element with id
"title" in their HTML */ | 297 if(typeof curTitle !== "string") { /* If they had an element with id
"title" in their HTML */ |
292 curTitle = origTitle = readability.getInnerText(document.getElem
entsByTagName('title')[0]); | 298 curTitle = origTitle = readability.getInnerText(document.getElem
entsByTagName('title')[0]); |
293 } | 299 } |
294 } | 300 } |
295 catch(e) {} | 301 catch(e) {} |
296 | 302 |
297 if(curTitle.match(/ [\|\-] /)) | 303 if(curTitle.match(/ [\|\-] /)) |
298 { | 304 { |
299 curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1'); | 305 curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1'); |
300 | 306 |
301 if(curTitle.split(' ').length < 3) { | 307 if(curTitle.split(' ').length < 3) { |
302 curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1'); | 308 curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1'); |
303 } | 309 } |
304 } | 310 } |
305 else if(curTitle.indexOf(': ') !== -1) | 311 else if(curTitle.indexOf(': ') !== -1) |
306 { | 312 { |
307 curTitle = origTitle.replace(/.*:(.*)/gi, '$1'); | 313 curTitle = origTitle.replace(/.*:(.*)/gi, '$1'); |
308 | 314 |
309 if(curTitle.split(' ').length < 3) { | 315 if(curTitle.split(' ').length < 3) { |
310 curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1'); | 316 curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1'); |
(...skipping 12 matching lines...) Expand all Loading... |
323 | 329 |
324 if(curTitle.split(' ').length <= 4) { | 330 if(curTitle.split(' ').length <= 4) { |
325 curTitle = origTitle; | 331 curTitle = origTitle; |
326 } | 332 } |
327 return curTitle; | 333 return curTitle; |
328 }, | 334 }, |
329 | 335 |
330 /** | 336 /** |
331 * Prepare the HTML document for readability to scrape it. | 337 * Prepare the HTML document for readability to scrape it. |
332 * This includes things like stripping javascript, CSS, and handling terribl
e markup. | 338 * This includes things like stripping javascript, CSS, and handling terribl
e markup. |
333 * | 339 * |
334 * @return void | 340 * @return void |
335 **/ | 341 **/ |
336 prepDocument: function () { | 342 prepDocument: function () { |
337 /** | 343 /** |
338 * In some cases a body element can't be found (if the HTML is totally h
osed for example) | 344 * In some cases a body element can't be found (if the HTML is totally h
osed for example) |
339 * so we create a new body node and append it to the document. | 345 * so we create a new body node and append it to the document. |
340 */ | 346 */ |
341 if(document.body === null) | 347 if(document.body === null) |
342 { | 348 { |
343 var body = document.createElement("body"); | 349 var body = document.createElement("body"); |
344 try { | 350 try { |
345 document.body = body; | 351 document.body = body; |
346 } | 352 } |
347 catch(e) { | 353 catch(e) { |
348 document.documentElement.appendChild(body); | 354 document.documentElement.appendChild(body); |
349 dbg(e); | 355 dbg(e); |
350 } | 356 } |
351 } | 357 } |
352 | 358 |
353 document.body.id = "readabilityBody"; | 359 document.body.id = "readabilityBody"; |
354 | 360 |
355 var frames = document.getElementsByTagName('frame'); | 361 var frames = document.getElementsByTagName('frame'); |
(...skipping 11 matching lines...) Expand all Loading... |
367 canAccessFrame = true; | 373 canAccessFrame = true; |
368 } | 374 } |
369 catch(eFrames) { | 375 catch(eFrames) { |
370 dbg(eFrames); | 376 dbg(eFrames); |
371 } | 377 } |
372 | 378 |
373 if(frameSize > biggestFrameSize) { | 379 if(frameSize > biggestFrameSize) { |
374 biggestFrameSize = frameSize; | 380 biggestFrameSize = frameSize; |
375 readability.biggestFrame = frames[frameIndex]; | 381 readability.biggestFrame = frames[frameIndex]; |
376 } | 382 } |
377 | 383 |
378 if(canAccessFrame && frameSize > bestFrameSize) | 384 if(canAccessFrame && frameSize > bestFrameSize) |
379 { | 385 { |
380 readability.frameHack = true; | 386 readability.frameHack = true; |
381 | 387 |
382 bestFrame = frames[frameIndex]; | 388 bestFrame = frames[frameIndex]; |
383 bestFrameSize = frameSize; | 389 bestFrameSize = frameSize; |
384 } | 390 } |
385 } | 391 } |
386 | 392 |
387 if(bestFrame) | 393 if(bestFrame) |
388 { | 394 { |
389 var newBody = document.createElement('body'); | 395 var newBody = document.createElement('body'); |
390 readability.moveNodeInnards(bestFrame.contentWindow.document.bod
y, newBody); | 396 readability.moveNodeInnards(bestFrame.contentWindow.document.bod
y, newBody); |
391 newBody.style.overflow = 'scroll'; | 397 newBody.style.overflow = 'scroll'; |
392 document.body = newBody; | 398 document.body = newBody; |
393 | 399 |
394 var frameset = document.getElementsByTagName('frameset')[0]; | 400 var frameset = document.getElementsByTagName('frameset')[0]; |
395 if(frameset) { | 401 if(frameset) { |
396 frameset.parentNode.removeChild(frameset); } | 402 frameset.parentNode.removeChild(frameset); } |
397 } | 403 } |
398 } | 404 } |
399 | 405 |
400 /* Remove all stylesheets */ | 406 /* Remove all stylesheets */ |
401 for (var k=0;k < document.styleSheets.length; k+=1) { | 407 for (var k=0;k < document.styleSheets.length; k+=1) { |
402 if (document.styleSheets[k].href !== null && document.styleSheets[k]
.href.lastIndexOf("readability") === -1) { | 408 if (document.styleSheets[k].href !== null && document.styleSheets[k]
.href.lastIndexOf("readability") === -1) { |
403 document.styleSheets[k].disabled = true; | 409 document.styleSheets[k].disabled = true; |
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
448 readability.cleanConditionally(articleContent, "table"); | 454 readability.cleanConditionally(articleContent, "table"); |
449 readability.cleanConditionally(articleContent, "ul"); | 455 readability.cleanConditionally(articleContent, "ul"); |
450 readability.cleanConditionally(articleContent, "div"); | 456 readability.cleanConditionally(articleContent, "div"); |
451 | 457 |
452 /* Remove extra paragraphs */ | 458 /* Remove extra paragraphs */ |
453 var articleParagraphs = articleContent.getElementsByTagName('p'); | 459 var articleParagraphs = articleContent.getElementsByTagName('p'); |
454 for(var i = articleParagraphs.length-1; i >= 0; i-=1) { | 460 for(var i = articleParagraphs.length-1; i >= 0; i-=1) { |
455 var imgCount = articleParagraphs[i].getElementsByTagName('img').l
ength; | 461 var imgCount = articleParagraphs[i].getElementsByTagName('img').l
ength; |
456 var embedCount = articleParagraphs[i].getElementsByTagName('embed')
.length; | 462 var embedCount = articleParagraphs[i].getElementsByTagName('embed')
.length; |
457 var objectCount = articleParagraphs[i].getElementsByTagName('object'
).length; | 463 var objectCount = articleParagraphs[i].getElementsByTagName('object'
).length; |
458 | 464 |
459 if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readab
ility.getInnerText(articleParagraphs[i], false) === '') { | 465 if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readab
ility.getInnerText(articleParagraphs[i], false) === '') { |
460 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]
); | 466 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]
); |
461 } | 467 } |
462 } | 468 } |
463 | 469 |
464 try { | 470 try { |
465 readability.replaceBrsWithPs(articleContent); | 471 readability.replaceBrsWithPs(articleContent); |
466 } | 472 } |
467 catch (e) { | 473 catch (e) { |
468 dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block
-elements bug. Ignoring.: " + e); | 474 dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block
-elements bug. Ignoring.: " + e); |
469 } | 475 } |
470 }, | 476 }, |
471 | 477 |
472 /** | 478 /** |
473 * Initialize a node with the readability object. Also checks the | 479 * Initialize a node with the readability object. Also checks the |
474 * className/id for special names to add to its score. | 480 * className/id for special names to add to its score. |
475 * | 481 * |
476 * @param Element | 482 * @param Element |
477 * @return void | 483 * @return void |
478 **/ | 484 **/ |
479 initializeNode: function (node) { | 485 initializeNode: function (node) { |
480 node.readability = {"contentScore": 0}; | 486 node.readability = {"contentScore": 0}; |
481 | 487 |
482 switch(node.tagName) { | 488 switch(node.tagName) { |
483 case 'DIV': | 489 case 'DIV': |
484 node.readability.contentScore += 5; | 490 node.readability.contentScore += 5; |
485 break; | 491 break; |
486 | 492 |
487 case 'PRE': | 493 case 'PRE': |
488 case 'TD': | 494 case 'TD': |
489 case 'BLOCKQUOTE': | 495 case 'BLOCKQUOTE': |
490 node.readability.contentScore += 3; | 496 node.readability.contentScore += 3; |
491 break; | 497 break; |
492 | 498 |
493 case 'ADDRESS': | 499 case 'ADDRESS': |
494 case 'OL': | 500 case 'OL': |
495 case 'UL': | 501 case 'UL': |
496 case 'DL': | 502 case 'DL': |
497 case 'DD': | 503 case 'DD': |
498 case 'DT': | 504 case 'DT': |
499 case 'LI': | 505 case 'LI': |
500 case 'FORM': | 506 case 'FORM': |
501 node.readability.contentScore -= 3; | 507 node.readability.contentScore -= 3; |
502 break; | 508 break; |
503 | 509 |
504 case 'H1': | 510 case 'H1': |
505 case 'H2': | 511 case 'H2': |
506 case 'H3': | 512 case 'H3': |
507 case 'H4': | 513 case 'H4': |
508 case 'H5': | 514 case 'H5': |
509 case 'H6': | 515 case 'H6': |
510 case 'TH': | 516 case 'TH': |
511 node.readability.contentScore -= 5; | 517 node.readability.contentScore -= 5; |
512 break; | 518 break; |
513 } | 519 } |
514 | 520 |
515 node.readability.contentScore += readability.getClassWeight(node); | 521 node.readability.contentScore += readability.getClassWeight(node); |
516 }, | 522 }, |
517 | 523 |
518 /*** | 524 /*** |
519 * grabArticle - Using a variety of metrics (content score, classname, eleme
nt types), find the content that is | 525 * grabArticle - Using a variety of metrics (content score, classname, eleme
nt types), find the content that is |
520 * most likely to be the stuff a user wants to read. Then retu
rn it wrapped up in a div. | 526 * most likely to be the stuff a user wants to read. Then retu
rn it wrapped up in a div. |
521 * | 527 * |
522 * @param page a document to run upon. Needs to be a full document, complete
with body. | 528 * @param page a document to run upon. Needs to be a full document, complete
with body. |
523 * @return Element | 529 * @return Element |
524 **/ | 530 **/ |
525 grabArticle: function (pageToClone) { | 531 grabArticle: function (pageToClone) { |
526 var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_
STRIP_UNLIKELYS), | 532 var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_
STRIP_UNLIKELYS), |
527 isPaging = (page !== null) ? true: false; | 533 isPaging = (page !== null) ? true: false; |
528 | 534 |
529 var page = null; | 535 var page = null; |
530 // Never work on the actual page. | 536 // Never work on the actual page. |
531 if (isPaging) { | 537 if (isPaging) { |
532 page = document.body.cloneNode(true); | 538 page = document.body.cloneNode(true); |
533 } else { | 539 } else { |
534 page = pageToClone.cloneNode(true); | 540 page = pageToClone.cloneNode(true); |
535 } | 541 } |
536 | 542 |
537 var allElements = page.getElementsByTagName('*'); | 543 var allElements = page.getElementsByTagName('*'); |
538 | 544 |
539 /** | 545 /** |
540 * First, node prepping. Trash nodes that look cruddy (like ones with th
e class name "comment", etc), and turn divs | 546 * First, node prepping. Trash nodes that look cruddy (like ones with th
e class name "comment", etc), and turn divs |
541 * into P tags where they have been used inappropriately (as in, where t
hey contain no other block level elements.) | 547 * into P tags where they have been used inappropriately (as in, where t
hey contain no other block level elements.) |
542 * | 548 * |
543 * Note: Assignment from index for performance. See http://www.peachpit.
com/articles/article.aspx?p=31567&seqNum=5 | 549 * Note: Assignment from index for performance. See http://www.peachpit.
com/articles/article.aspx?p=31567&seqNum=5 |
544 * TODO: Shouldn't this be a reverse traversal? | 550 * TODO: Shouldn't this be a reverse traversal? |
545 **/ | 551 **/ |
546 var node = null; | 552 var node = null; |
547 var nodesToScore = []; | 553 var nodesToScore = []; |
548 for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) { | 554 for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) { |
549 /* Remove unlikely candidates */ | 555 /* Remove unlikely candidates */ |
550 if (stripUnlikelyCandidates) { | 556 if (stripUnlikelyCandidates) { |
551 var unlikelyMatchString = node.className + node.id; | 557 var unlikelyMatchString = node.className + node.id; |
552 if ( | 558 if ( |
553 ( | 559 ( |
554 unlikelyMatchString.search(readability.regexps.unlikelyC
andidates) !== -1 && | 560 unlikelyMatchString.search(readability.regexps.unlikelyC
andidates) !== -1 && |
555 unlikelyMatchString.search(readability.regexps.okMaybeIt
sACandidate) === -1 && | 561 unlikelyMatchString.search(readability.regexps.okMaybeIt
sACandidate) === -1 && |
556 node.tagName !== "BODY" | 562 node.tagName !== "BODY" |
557 ) | 563 ) |
558 ) | 564 ) |
559 { | 565 { |
560 dbg("Removing unlikely candidate - " + unlikelyMatchString); | 566 dbg("Removing unlikely candidate - " + unlikelyMatchString); |
561 node.parentNode.removeChild(node); | 567 node.parentNode.removeChild(node); |
562 nodeIndex-=1; | 568 nodeIndex-=1; |
563 continue; | 569 continue; |
564 } | 570 } |
565 } | 571 } |
566 | 572 |
567 if (node.tagName === "P" || node.tagName === "TD" || node.tagName ==
= "PRE") { | 573 if (node.tagName === "P" || node.tagName === "TD" || node.tagName ==
= "PRE") { |
568 nodesToScore[nodesToScore.length] = node; | 574 nodesToScore[nodesToScore.length] = node; |
569 } | 575 } |
570 | 576 |
571 /* Turn all divs that don't have children block level elements into
p's */ | 577 /* Turn all divs that don't have children block level elements into
p's */ |
572 if (node.tagName === "DIV") { | 578 if (node.tagName === "DIV") { |
573 if (node.innerHTML.search(readability.regexps.divToPElements) ==
= -1) { | 579 if (node.innerHTML.search(readability.regexps.divToPElements) ==
= -1) { |
574 var newNode = document.createElement('p'); | 580 var newNode = document.createElement('p'); |
(...skipping 16 matching lines...) Expand all Loading... |
591 if(childNode.nodeType === 3) { // Node.TEXT_NODE | 597 if(childNode.nodeType === 3) { // Node.TEXT_NODE |
592 var p = document.createElement('p'); | 598 var p = document.createElement('p'); |
593 var t = document.createTextNode(childNode.nodeValue)
; | 599 var t = document.createTextNode(childNode.nodeValue)
; |
594 p.appendChild(t); | 600 p.appendChild(t); |
595 p.style.display = 'inline'; | 601 p.style.display = 'inline'; |
596 p.className = 'readability-styled'; | 602 p.className = 'readability-styled'; |
597 childNode.parentNode.replaceChild(p, childNode); | 603 childNode.parentNode.replaceChild(p, childNode); |
598 } | 604 } |
599 } | 605 } |
600 } | 606 } |
601 } | 607 } |
602 } | 608 } |
603 | 609 |
604 /** | 610 /** |
605 * Loop through all paragraphs, and assign a score to them based on how
content-y they look. | 611 * Loop through all paragraphs, and assign a score to them based on how
content-y they look. |
606 * Then add their score to their parent node. | 612 * Then add their score to their parent node. |
607 * | 613 * |
608 * A score is determined by things like number of commas, class names, e
tc. Maybe eventually link density. | 614 * A score is determined by things like number of commas, class names, e
tc. Maybe eventually link density. |
609 **/ | 615 **/ |
610 var candidates = []; | 616 var candidates = []; |
611 for (var pt=0; pt < nodesToScore.length; pt+=1) { | 617 for (var pt=0; pt < nodesToScore.length; pt+=1) { |
(...skipping 21 matching lines...) Expand all Loading... |
633 candidates.push(grandParentNode); | 639 candidates.push(grandParentNode); |
634 } | 640 } |
635 | 641 |
636 var contentScore = 0; | 642 var contentScore = 0; |
637 | 643 |
638 /* Add a point for the paragraph itself as a base. */ | 644 /* Add a point for the paragraph itself as a base. */ |
639 contentScore+=1; | 645 contentScore+=1; |
640 | 646 |
641 /* Add points for any commas within this paragraph */ | 647 /* Add points for any commas within this paragraph */ |
642 contentScore += innerText.split(',').length; | 648 contentScore += innerText.split(',').length; |
643 | 649 |
644 /* For every 100 characters in this paragraph, add another point. Up
to 3 points. */ | 650 /* For every 100 characters in this paragraph, add another point. Up
to 3 points. */ |
645 contentScore += Math.min(Math.floor(innerText.length / 100), 3); | 651 contentScore += Math.min(Math.floor(innerText.length / 100), 3); |
646 | 652 |
647 /* Add the score to the parent. The grandparent gets half. */ | 653 /* Add the score to the parent. The grandparent gets half. */ |
648 parentNode.readability.contentScore += contentScore; | 654 parentNode.readability.contentScore += contentScore; |
649 | 655 |
650 if(grandParentNode) { | 656 if(grandParentNode) { |
651 grandParentNode.readability.contentScore += contentScore/2;
| 657 grandParentNode.readability.contentScore += contentScore/2; |
652 } | 658 } |
653 } | 659 } |
654 | 660 |
655 /** | 661 /** |
656 * After we've calculated scores, loop through all of the possible candi
date nodes we found | 662 * After we've calculated scores, loop through all of the possible candi
date nodes we found |
657 * and find the one with the highest score. | 663 * and find the one with the highest score. |
658 **/ | 664 **/ |
659 var topCandidate = null; | 665 var topCandidate = null; |
660 for(var c=0, cl=candidates.length; c < cl; c+=1) | 666 for(var c=0, cl=candidates.length; c < cl; c+=1) |
661 { | 667 { |
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
718 var contentBonus = 0; | 724 var contentBonus = 0; |
719 /* Give a bonus if sibling nodes and top candidates have the exact
same classname */ | 725 /* Give a bonus if sibling nodes and top candidates have the exact
same classname */ |
720 if(siblingNode.className === topCandidate.className && topCandidate.
className !== "") { | 726 if(siblingNode.className === topCandidate.className && topCandidate.
className !== "") { |
721 contentBonus += topCandidate.readability.contentScore * 0.2; | 727 contentBonus += topCandidate.readability.contentScore * 0.2; |
722 } | 728 } |
723 | 729 |
724 if(typeof siblingNode.readability !== 'undefined' && (siblingNode.re
adability.contentScore+contentBonus) >= siblingScoreThreshold) | 730 if(typeof siblingNode.readability !== 'undefined' && (siblingNode.re
adability.contentScore+contentBonus) >= siblingScoreThreshold) |
725 { | 731 { |
726 append = true; | 732 append = true; |
727 } | 733 } |
728 | 734 |
729 if(siblingNode.nodeName === "P") { | 735 if(siblingNode.nodeName === "P") { |
730 var linkDensity = readability.getLinkDensity(siblingNode); | 736 var linkDensity = readability.getLinkDensity(siblingNode); |
731 var nodeContent = readability.getInnerText(siblingNode); | 737 var nodeContent = readability.getInnerText(siblingNode); |
732 var nodeLength = nodeContent.length; | 738 var nodeLength = nodeContent.length; |
733 | 739 |
734 if(nodeLength > 80 && linkDensity < 0.25) | 740 if(nodeLength > 80 && linkDensity < 0.25) |
735 { | 741 { |
736 append = true; | 742 append = true; |
737 } | 743 } |
738 else if(nodeLength < 80 && linkDensity === 0 && nodeContent.sear
ch(/\.( |$)/) !== -1) | 744 else if(nodeLength < 80 && linkDensity === 0 && nodeContent.sear
ch(/\.( |$)/) !== -1) |
739 { | 745 { |
740 append = true; | 746 append = true; |
741 } | 747 } |
742 } | 748 } |
743 | 749 |
744 if(append) { | 750 if(append) { |
745 dbg("Appending node: " + siblingNode); | 751 dbg("Appending node: " + siblingNode); |
746 | 752 |
747 var nodeToAppend = null; | 753 var nodeToAppend = null; |
748 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P
") { | 754 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P
") { |
749 /* We have a node that isn't a common block level element, l
ike a form or td tag. Turn it into a div so it doesn't get filtered out later by
accident. */ | 755 /* We have a node that isn't a common block level element, l
ike a form or td tag. Turn it into a div so it doesn't get filtered out later by
accident. */ |
750 | 756 |
751 dbg("Altering siblingNode of " + siblingNode.nodeName + ' to
div.'); | 757 dbg("Altering siblingNode of " + siblingNode.nodeName + ' to
div.'); |
752 nodeToAppend = document.createElement("DIV"); | 758 nodeToAppend = document.createElement("DIV"); |
753 try { | 759 try { |
754 nodeToAppend.id = siblingNode.id; | 760 nodeToAppend.id = siblingNode.id; |
755 readability.moveNodeInnards(siblingNode, nodeToAppend); | 761 readability.moveNodeInnards(siblingNode, nodeToAppend); |
756 } | 762 } |
757 catch(er) { | 763 catch(er) { |
758 dbg("Could not alter siblingNode to div, probably an IE
restriction, reverting back to original."); | 764 dbg("Could not alter siblingNode to div, probably an IE
restriction, reverting back to original."); |
759 nodeToAppend = siblingNode; | 765 nodeToAppend = siblingNode; |
760 s-=1; | 766 s-=1; |
761 sl-=1; | 767 sl-=1; |
762 } | 768 } |
763 } else { | 769 } else { |
764 nodeToAppend = siblingNode; | 770 nodeToAppend = siblingNode; |
765 s-=1; | 771 s-=1; |
766 sl-=1; | 772 sl-=1; |
767 } | 773 } |
768 | 774 |
769 /* To ensure a node does not interfere with readability styles,
remove its classnames */ | 775 /* To ensure a node does not interfere with readability styles,
remove its classnames */ |
770 nodeToAppend.className = ""; | 776 nodeToAppend.className = ""; |
771 | 777 |
772 /* Append sibling and subtract from our list because it removes
the node when you append to another node */ | 778 /* Append sibling and subtract from our list because it removes
the node when you append to another node */ |
773 articleContent.appendChild(nodeToAppend); | 779 articleContent.appendChild(nodeToAppend); |
774 } | 780 } |
775 } | 781 } |
776 | 782 |
777 /** | 783 /** |
778 * So we have all of the content that we need. Now we clean it up for pr
esentation. | 784 * So we have all of the content that we need. Now we clean it up for pr
esentation. |
779 **/ | 785 **/ |
780 readability.distilledArticleContent = articleContent.cloneNode(true); | 786 readability.distilledArticleContent = articleContent.cloneNode(true); |
781 //readability.prepArticle(articleContent); | 787 //readability.prepArticle(articleContent); |
782 | 788 |
783 if (readability.curPageNum === 1) { | 789 if (readability.curPageNum === 1) { |
784 var newNode = document.createElement('div'); | 790 var newNode = document.createElement('div'); |
785 newNode.id = "readability-page-1"; | 791 newNode.id = "readability-page-1"; |
786 newNode.setAttribute("class", "page"); | 792 newNode.setAttribute("class", "page"); |
787 readability.moveNodeInnards(articleContent, newNode); | 793 readability.moveNodeInnards(articleContent, newNode); |
788 articleContent.appendChild(newNode); | 794 articleContent.appendChild(newNode); |
789 } | 795 } |
790 | 796 |
791 /** | 797 /** |
792 * Now that we've gone through the full algorithm, check to see if we go
t any meaningful content. | 798 * Now that we've gone through the full algorithm, check to see if we go
t any meaningful content. |
793 * If we didn't, we may need to re-run grabArticle with different flags
set. This gives us a higher | 799 * If we didn't, we may need to re-run grabArticle with different flags
set. This gives us a higher |
794 * likelihood of finding the content, and the sieve approach gives us a
higher likelihood of | 800 * likelihood of finding the content, and the sieve approach gives us a
higher likelihood of |
795 * finding the -right- content. | 801 * finding the -right- content. |
796 **/ | 802 **/ |
797 if(readability.getInnerText(articleContent, false).length < 250) { | 803 if(readability.getInnerText(articleContent, false).length < 250) { |
798 if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) { | 804 if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) { |
799 readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS); | 805 readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS); |
800 return readability.grabArticle(document.body); | 806 return readability.grabArticle(document.body); |
801 } | 807 } |
802 else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES))
{ | 808 else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES))
{ |
803 readability.removeFlag(readability.FLAG_WEIGHT_CLASSES); | 809 readability.removeFlag(readability.FLAG_WEIGHT_CLASSES); |
804 return readability.grabArticle(document.body); | 810 return readability.grabArticle(document.body); |
805 } | 811 } |
806 else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONAL
LY)) { | 812 else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONAL
LY)) { |
807 readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY); | 813 readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY); |
808 return readability.grabArticle(document.body); | 814 return readability.grabArticle(document.body); |
809 } else { | 815 } else { |
810 return null; | 816 return null; |
811 } | 817 } |
812 } | 818 } |
813 | 819 |
814 return articleContent; | 820 return articleContent; |
815 }, | 821 }, |
816 | 822 |
817 /** | 823 /** |
818 * Removes script tags from the document. | 824 * Removes script tags from the document. |
819 * | 825 * |
820 * @param Element | 826 * @param Element |
821 **/ | 827 **/ |
822 removeScripts: function (doc) { | 828 removeScripts: function (doc) { |
823 var scripts = doc.getElementsByTagName('script'); | 829 var scripts = doc.getElementsByTagName('script'); |
824 for(var i = scripts.length-1; i >= 0; i-=1) | 830 for(var i = scripts.length-1; i >= 0; i-=1) |
825 { | 831 { |
826 if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf
('readability') === -1 && scripts[i].src.indexOf('typekit') === -1)) | 832 if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf
('readability') === -1 && scripts[i].src.indexOf('typekit') === -1)) |
827 { | 833 { |
828 scripts[i].nodeValue=""; | 834 scripts[i].nodeValue=""; |
829 scripts[i].removeAttribute('src'); | 835 scripts[i].removeAttribute('src'); |
830 if (scripts[i].parentNode) { | 836 if (scripts[i].parentNode) { |
831 scripts[i].parentNode.removeChild(scripts[i]); | 837 scripts[i].parentNode.removeChild(scripts[i]); |
832 } | 838 } |
833 } | 839 } |
834 } | 840 } |
835 }, | 841 }, |
836 | 842 |
837 /** | 843 /** |
838 * Get the inner text of a node - cross browser compatibly. | 844 * Get the inner text of a node - cross browser compatibly. |
839 * This also strips out any excess whitespace to be found. | 845 * This also strips out any excess whitespace to be found. |
840 * | 846 * |
841 * @param Element | 847 * @param Element |
842 * @return string | 848 * @return string |
843 **/ | 849 **/ |
844 getInnerText: function (e, normalizeSpaces) { | 850 getInnerText: function (e, normalizeSpaces) { |
845 var textContent = ""; | 851 var textContent = ""; |
846 | 852 |
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
889 | 895 |
890 // Remove any root styles, if we're able. | 896 // Remove any root styles, if we're able. |
891 if(typeof e.removeAttribute === 'function' && e.className !== 'readabili
ty-styled') { | 897 if(typeof e.removeAttribute === 'function' && e.className !== 'readabili
ty-styled') { |
892 e.removeAttribute('style'); } | 898 e.removeAttribute('style'); } |
893 | 899 |
894 // Go until there are no more child nodes | 900 // Go until there are no more child nodes |
895 while ( cur !== null ) { | 901 while ( cur !== null ) { |
896 if ( cur.nodeType === 1 ) { | 902 if ( cur.nodeType === 1 ) { |
897 // Remove style attribute(s) : | 903 // Remove style attribute(s) : |
898 if(cur.className !== "readability-styled") { | 904 if(cur.className !== "readability-styled") { |
899 cur.removeAttribute("style"); | 905 cur.removeAttribute("style"); |
900 } | 906 } |
901 readability.cleanStyles( cur ); | 907 readability.cleanStyles( cur ); |
902 } | 908 } |
903 cur = cur.nextSibling; | 909 cur = cur.nextSibling; |
904 } | 910 } |
905 }, | 911 }, |
906 | 912 |
907 /** | 913 /** |
908 * Get the density of links as a percentage of the content | 914 * Get the density of links as a percentage of the content |
909 * This is the amount of text that is inside a link divided by the total tex
t in the node. | 915 * This is the amount of text that is inside a link divided by the total tex
t in the node. |
910 * | 916 * |
911 * @param Element | 917 * @param Element |
912 * @return number (float) | 918 * @return number (float) |
913 **/ | 919 **/ |
914 getLinkDensity: function (e) { | 920 getLinkDensity: function (e) { |
915 var links = e.getElementsByTagName("a"); | 921 var links = e.getElementsByTagName("a"); |
916 var textLength = readability.getInnerText(e).length; | 922 var textLength = readability.getInnerText(e).length; |
917 var linkLength = 0; | 923 var linkLength = 0; |
918 for(var i=0, il=links.length; i<il;i+=1) | 924 for(var i=0, il=links.length; i<il;i+=1) |
919 { | 925 { |
920 linkLength += readability.getInnerText(links[i]).length; | 926 linkLength += readability.getInnerText(links[i]).length; |
921 } | 927 } |
922 | 928 |
923 return linkLength / textLength; | 929 return linkLength / textLength; |
924 }, | 930 }, |
925 | 931 |
926 /** | 932 /** |
927 * Find a cleaned up version of the current URL, to use for comparing links
for possible next-pageyness. | 933 * Find a cleaned up version of the current URL, to use for comparing links
for possible next-pageyness. |
928 * | 934 * |
929 * @author Dan Lacy | 935 * @author Dan Lacy |
930 * @return string the base url | 936 * @return string the base url |
931 **/ | 937 **/ |
932 findBaseUrl: function () { | 938 findBaseUrl: function () { |
933 var noUrlParams = window.location.pathname.split("?")[0], | 939 var noUrlParams = window.location.pathname.split("?")[0], |
934 urlSlashes = noUrlParams.split("/").reverse(), | 940 urlSlashes = noUrlParams.split("/").reverse(), |
935 cleanedSegments = [], | 941 cleanedSegments = [], |
936 possibleType = ""; | 942 possibleType = ""; |
937 | 943 |
938 for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) { | 944 for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) { |
939 var segment = urlSlashes[i]; | 945 var segment = urlSlashes[i]; |
940 | 946 |
941 // Split off and save anything that looks like a file type. | 947 // Split off and save anything that looks like a file type. |
942 if (segment.indexOf(".") !== -1) { | 948 if (segment.indexOf(".") !== -1) { |
943 possibleType = segment.split(".")[1]; | 949 possibleType = segment.split(".")[1]; |
944 | 950 |
945 /* If the type isn't alpha-only, it's probably not actually a fi
le extension. */ | 951 /* If the type isn't alpha-only, it's probably not actually a fi
le extension. */ |
946 if(!possibleType.match(/[^a-zA-Z]/)) { | 952 if(!possibleType.match(/[^a-zA-Z]/)) { |
947 segment = segment.split(".")[0]; | 953 segment = segment.split(".")[0]; |
948 } | 954 } |
949 } | 955 } |
950 | 956 |
951 /** | 957 /** |
952 * EW-CMS specific segment replacement. Ugly. | 958 * EW-CMS specific segment replacement. Ugly. |
953 * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.htm
l | 959 * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.htm
l |
954 **/ | 960 **/ |
955 if(segment.indexOf(',00') !== -1) { | 961 if(segment.indexOf(',00') !== -1) { |
956 segment = segment.replace(',00', ''); | 962 segment = segment.replace(',00', ''); |
957 } | 963 } |
958 | 964 |
959 // If our first or second segment has anything looking like a page n
umber, remove it. | 965 // If our first or second segment has anything looking like a page n
umber, remove it. |
960 if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1)
|| (i === 0))) { | 966 if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1)
|| (i === 0))) { |
961 segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, "
"); | 967 segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, "
"); |
962 } | 968 } |
963 | 969 |
964 | 970 |
965 var del = false; | 971 var del = false; |
966 | 972 |
967 /* If this is purely a number, and it's the first or second segment,
it's probably a page number. Remove it. */ | 973 /* If this is purely a number, and it's the first or second segment,
it's probably a page number. Remove it. */ |
968 if (i < 2 && segment.match(/^\d{1,2}$/)) { | 974 if (i < 2 && segment.match(/^\d{1,2}$/)) { |
969 del = true; | 975 del = true; |
970 } | 976 } |
971 | 977 |
972 /* If this is the first segment and it's just "index", remove it. */ | 978 /* If this is the first segment and it's just "index", remove it. */ |
973 if(i === 0 && segment.toLowerCase() === "index") { | 979 if(i === 0 && segment.toLowerCase() === "index") { |
974 del = true; | 980 del = true; |
975 } | 981 } |
976 | 982 |
977 | 983 |
978 /* If our first or second segment is smaller than 3 characters, and
the first segment was purely alphas, remove it. */ | 984 /* If our first or second segment is smaller than 3 characters, and
the first segment was purely alphas, remove it. */ |
979 if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) { | 985 if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) { |
980 del = true; | 986 del = true; |
981 } | 987 } |
982 | 988 |
983 /* If it's not marked for deletion, push it to cleanedSegments. */ | 989 /* If it's not marked for deletion, push it to cleanedSegments. */ |
984 if (!del) { | 990 if (!del) { |
985 cleanedSegments.push(segment); | 991 cleanedSegments.push(segment); |
986 } | 992 } |
987 } | 993 } |
988 | 994 |
989 // This is our final, cleaned, base article URL. | 995 // This is our final, cleaned, base article URL. |
990 return window.location.protocol + "//" + window.location.host + cleanedS
egments.reverse().join("/"); | 996 return window.location.protocol + "//" + window.location.host + cleanedS
egments.reverse().join("/"); |
991 }, | 997 }, |
992 | 998 |
993 /** | 999 /** |
994 * Look for any paging links that may occur within the document. | 1000 * Look for any paging links that may occur within the document. |
995 * | 1001 * |
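996 * @param elem element whose descendant links are examined | 1002 * @param elem element whose descendant links are examined |
997 * @return string href of the chosen next-page link (trailing slash removed), or null if none is found | 1003 * @return string href of the chosen next-page link (trailing slash removed), or null if none is found |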
998 **/ | 1004 **/ |
999 findNextPageLink: function (elem) { | 1005 findNextPageLink: function (elem) { |
1000 var possiblePages = {}, | 1006 var possiblePages = {}, |
1001 allLinks = elem.getElementsByTagName('a'), | 1007 allLinks = elem.getElementsByTagName('a'), |
1002 articleBaseUrl = readability.findBaseUrl(); | 1008 articleBaseUrl = readability.findBaseUrl(); |
1003 | 1009 |
1004 /** | 1010 /** |
1005 * Loop through all links, looking for hints that they may be next-page
links. | 1011 * Loop through all links, looking for hints that they may be next-page
links. |
1006 * Things like having "page" in their textContent, className or id, or b
eing a child | 1012 * Things like having "page" in their textContent, className or id, or b
eing a child |
1007 * of a node with a page-y className or id. | 1013 * of a node with a page-y className or id. |
1008 * | 1014 * |
1009 * Also possible: levenshtein distance? longest common subsequence? | 1015 * Also possible: levenshtein distance? longest common subsequence? |
1010 * | 1016 * |
1011 * After we do that, assign each page a score, and | 1017 * After we do that, assign each page a score, and |
1012 **/ | 1018 **/ |
1013 for(var i = 0, il = allLinks.length; i < il; i+=1) { | 1019 for(var i = 0, il = allLinks.length; i < il; i+=1) { |
1014 var link = allLinks[i], | 1020 var link = allLinks[i], |
1015 linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, '
'); | 1021 linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, '
'); |
1016 | 1022 |
1017 /* If we've already seen this page, ignore it */ | 1023 /* If we've already seen this page, ignore it */ |
1018 if(linkHref === "" || linkHref === articleBaseUrl || linkHref === wi
ndow.location.href || linkHref in readability.parsedPages) { | 1024 if(linkHref === "" || linkHref === articleBaseUrl || linkHref === wi
ndow.location.href || linkHref in readability.parsedPages) { |
1019 continue; | 1025 continue; |
1020 } | 1026 } |
1021 | 1027 |
1022 /* If it's on a different domain, skip it. */ | 1028 /* If it's on a different domain, skip it. */ |
1023 if(window.location.host !== linkHref.split(/\/+/g)[1]) { | 1029 if(window.location.host !== linkHref.split(/\/+/g)[1]) { |
1024 continue; | 1030 continue; |
1025 } | 1031 } |
1026 | 1032 |
1027 var linkText = readability.getInnerText(link); | 1033 var linkText = readability.getInnerText(link); |
1028 | 1034 |
1029 /* If the linkText looks like it's not the next page, skip it. */ | 1035 /* If the linkText looks like it's not the next page, skip it. */ |
1030 if(linkText.match(readability.regexps.extraneous) || linkText.length
> 25) { | 1036 if(linkText.match(readability.regexps.extraneous) || linkText.length
> 25) { |
1031 continue; | 1037 continue; |
1032 } | 1038 } |
1033 | 1039 |
1034 /* If the leftovers of the URL after removing the base URL don't con
tain any digits, it's certainly not a next page link. */ | 1040 /* If the leftovers of the URL after removing the base URL don't con
tain any digits, it's certainly not a next page link. */ |
1035 var linkHrefLeftover = linkHref.replace(articleBaseUrl, ''); | 1041 var linkHrefLeftover = linkHref.replace(articleBaseUrl, ''); |
1036 if(!linkHrefLeftover.match(/\d/)) { | 1042 if(!linkHrefLeftover.match(/\d/)) { |
1037 continue; | 1043 continue; |
1038 } | 1044 } |
1039 | 1045 |
1040 if(!(linkHref in possiblePages)) { | 1046 if(!(linkHref in possiblePages)) { |
1041 possiblePages[linkHref] = {"score": 0, "linkText": linkText, "hr
ef": linkHref}; | 1047 possiblePages[linkHref] = {"score": 0, "linkText": linkText, "hr
ef": linkHref}; |
1042 } else { | 1048 } else { |
1043 possiblePages[linkHref].linkText += ' | ' + linkText; | 1049 possiblePages[linkHref].linkText += ' | ' + linkText; |
1044 } | 1050 } |
1045 | 1051 |
1046 var linkObj = possiblePages[linkHref]; | 1052 var linkObj = possiblePages[linkHref]; |
1047 | 1053 |
1048 /** | 1054 /** |
1049 * If the articleBaseUrl isn't part of this URL, penalize this link.
It could still be the link, but the odds are lower. | 1055 * If the articleBaseUrl isn't part of this URL, penalize this link.
It could still be the link, but the odds are lower. |
1050 * Example: http://www.actionscript.org/resources/articles/745/1/Jav
aScript-and-VBScript-Injection-in-ActionScript-3/Page1.html | 1056 * Example: http://www.actionscript.org/resources/articles/745/1/Jav
aScript-and-VBScript-Injection-in-ActionScript-3/Page1.html |
1051 **/ | 1057 **/ |
1052 if(linkHref.indexOf(articleBaseUrl) !== 0) { | 1058 if(linkHref.indexOf(articleBaseUrl) !== 0) { |
1053 linkObj.score -= 25; | 1059 linkObj.score -= 25; |
1054 } | 1060 } |
1055 | 1061 |
1056 var linkData = linkText + ' ' + link.className + ' ' + link.id; | 1062 var linkData = linkText + ' ' + link.className + ' ' + link.id; |
1057 if(linkData.match(readability.regexps.nextLink)) { | 1063 if(linkData.match(readability.regexps.nextLink)) { |
1058 linkObj.score += 50; | 1064 linkObj.score += 50; |
1059 } | 1065 } |
1060 if(linkData.match(/pag(e|ing|inat)/i)) { | 1066 if(linkData.match(/pag(e|ing|inat)/i)) { |
1061 linkObj.score += 25; | 1067 linkObj.score += 25; |
1062 } | 1068 } |
1063 if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any
bonuses gotten from a > or » in the text, | 1069 if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any
bonuses gotten from a > or » in the text, |
1064 /* If we already matched on "next", last is probably fine. If we
didn't, then it's bad. Penalize. */ | 1070 /* If we already matched on "next", last is probably fine. If we
didn't, then it's bad. Penalize. */ |
1065 if(!linkObj.linkText.match(readability.regexps.nextLink)) { | 1071 if(!linkObj.linkText.match(readability.regexps.nextLink)) { |
1066 linkObj.score -= 65; | 1072 linkObj.score -= 65; |
1067 } | 1073 } |
1068 } | 1074 } |
1069 if(linkData.match(readability.regexps.negative) || linkData.match(re
adability.regexps.extraneous)) { | 1075 if(linkData.match(readability.regexps.negative) || linkData.match(re
adability.regexps.extraneous)) { |
1070 linkObj.score -= 50; | 1076 linkObj.score -= 50; |
1071 } | 1077 } |
1072 if(linkData.match(readability.regexps.prevLink)) { | 1078 if(linkData.match(readability.regexps.prevLink)) { |
1073 linkObj.score -= 200; | 1079 linkObj.score -= 200; |
1074 } | 1080 } |
1075 | 1081 |
1076 /* If a parentNode contains page or paging or paginat */ | 1082 /* If a parentNode contains page or paging or paginat */ |
1077 var parentNode = link.parentNode, | 1083 var parentNode = link.parentNode, |
1078 positiveNodeMatch = false, | 1084 positiveNodeMatch = false, |
1079 negativeNodeMatch = false; | 1085 negativeNodeMatch = false; |
1080 while(parentNode) { | 1086 while(parentNode) { |
1081 var parentNodeClassAndId = parentNode.className + ' ' + parentNo
de.id; | 1087 var parentNodeClassAndId = parentNode.className + ' ' + parentNo
de.id; |
1082 if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClass
AndId.match(/pag(e|ing|inat)/i)) { | 1088 if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClass
AndId.match(/pag(e|ing|inat)/i)) { |
1083 positiveNodeMatch = true; | 1089 positiveNodeMatch = true; |
1084 linkObj.score += 25; | 1090 linkObj.score += 25; |
1085 } | 1091 } |
1086 if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClass
AndId.match(readability.regexps.negative)) { | 1092 if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClass
AndId.match(readability.regexps.negative)) { |
1087 /* If this is just something like "footer", give it a negati
ve. If it's something like "body-and-footer", leave it be. */ | 1093 /* If this is just something like "footer", give it a negati
ve. If it's something like "body-and-footer", leave it be. */ |
1088 if(!parentNodeClassAndId.match(readability.regexps.positive)
) { | 1094 if(!parentNodeClassAndId.match(readability.regexps.positive)
) { |
1089 linkObj.score -= 25; | 1095 linkObj.score -= 25; |
1090 negativeNodeMatch = true; | 1096 negativeNodeMatch = true; |
1091 } | 1097 } |
1092 } | 1098 } |
1093 | 1099 |
1094 parentNode = parentNode.parentNode; | 1100 parentNode = parentNode.parentNode; |
1095 } | 1101 } |
1096 | 1102 |
1097 /** | 1103 /** |
1098 * If the URL looks like it has paging in it, add to the score. | 1104 * If the URL looks like it has paging in it, add to the score. |
1099 * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34 | 1105 * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34 |
1100 **/ | 1106 **/ |
1101 if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) ||
linkHref.match(/(page|paging)/i)) { | 1107 if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) ||
linkHref.match(/(page|paging)/i)) { |
1102 linkObj.score += 25; | 1108 linkObj.score += 25; |
1103 } | 1109 } |
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1145 topPage = possiblePages[page]; | 1151 topPage = possiblePages[page]; |
1146 } | 1152 } |
1147 } | 1153 } |
1148 } | 1154 } |
1149 | 1155 |
1150 if(topPage) { | 1156 if(topPage) { |
1151 var nextHref = topPage.href.replace(/\/$/,''); | 1157 var nextHref = topPage.href.replace(/\/$/,''); |
1152 | 1158 |
1153 dbg('NEXT PAGE IS ' + nextHref); | 1159 dbg('NEXT PAGE IS ' + nextHref); |
1154 readability.parsedPages[nextHref] = true; | 1160 readability.parsedPages[nextHref] = true; |
1155 return nextHref; | 1161 return nextHref; |
1156 } | 1162 } |
1157 else { | 1163 else { |
1158 return null; | 1164 return null; |
1159 } | 1165 } |
1160 }, | 1166 }, |
1161 | 1167 |
1162 createLinkDiv: function(link) { | 1168 createLinkDiv: function(link) { |
1163 var divNode = document.createElement('div'); | 1169 var divNode = document.createElement('div'); |
1164 var aNode = document.createElement('a'); | 1170 var aNode = document.createElement('a'); |
1165 var tNode = document.createTextNode('View Next Page'); | 1171 var tNode = document.createTextNode('View Next Page'); |
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1197 } | 1203 } |
1198 else { | 1204 else { |
1199 if (options.error) { options.error(request); } | 1205 if (options.error) { options.error(request); } |
1200 } | 1206 } |
1201 } | 1207 } |
1202 } | 1208 } |
1203 | 1209 |
1204 if (typeof options === 'undefined') { options = {}; } | 1210 if (typeof options === 'undefined') { options = {}; } |
1205 | 1211 |
1206 request.onreadystatechange = respondToReadyState; | 1212 request.onreadystatechange = respondToReadyState; |
1207 | 1213 |
1208 request.open('get', url, true); | 1214 request.open('get', url, true); |
1209 request.setRequestHeader('Accept', 'text/html'); | 1215 request.setRequestHeader('Accept', 'text/html'); |
1210 | 1216 |
1211 try { | 1217 try { |
1212 request.send(options.postBody); | 1218 request.send(options.postBody); |
1213 } | 1219 } |
1214 catch (e) { | 1220 catch (e) { |
1215 if (options.error) { options.error(); } | 1221 if (options.error) { options.error(); } |
1216 } | 1222 } |
1217 | 1223 |
(...skipping 14 matching lines...) Expand all Loading... |
1232 articlePage.innerHTML = '<p class="page-separator" title="Page ' + reada
bility.curPageNum + '">§</p>'; | 1238 articlePage.innerHTML = '<p class="page-separator" title="Page ' + reada
bility.curPageNum + '">§</p>'; |
1233 | 1239 |
1234 document.getElementById("readability-content").appendChild(articlePage); | 1240 document.getElementById("readability-content").appendChild(articlePage); |
1235 | 1241 |
1236 if(readability.curPageNum > readability.maxPages) { | 1242 if(readability.curPageNum > readability.maxPages) { |
1237 var linkDiv = readability.createLinkDiv(nextPageLink); | 1243 var linkDiv = readability.createLinkDiv(nextPageLink); |
1238 | 1244 |
1239 articlePage.appendChild(linkDiv); | 1245 articlePage.appendChild(linkDiv); |
1240 return; | 1246 return; |
1241 } | 1247 } |
1242 | 1248 |
1243 /** | 1249 /** |
1244 * Now that we've built the article page DOM element, get the page conte
nt | 1250 * Now that we've built the article page DOM element, get the page conte
nt |
1245 * asynchronously and load the cleaned content into the div we created f
or it. | 1251 * asynchronously and load the cleaned content into the div we created f
or it. |
1246 **/ | 1252 **/ |
1247 (function(pageUrl, thisPage) { | 1253 (function(pageUrl, thisPage) { |
1248 readability.ajax(pageUrl, { | 1254 readability.ajax(pageUrl, { |
1249 success: function(r) { | 1255 success: function(r) { |
1250 | 1256 |
1251 /* First, check to see if we have a matching ETag in headers
- if we do, this is a duplicate page. */ | 1257 /* First, check to see if we have a matching ETag in headers
- if we do, this is a duplicate page. */ |
1252 var eTag = r.getResponseHeader('ETag'); | 1258 var eTag = r.getResponseHeader('ETag'); |
1253 if(eTag) { | 1259 if(eTag) { |
1254 if(eTag in readability.pageETags) { | 1260 if(eTag in readability.pageETags) { |
1255 dbg("Exact duplicate page found via ETag. Aborting."
); | 1261 dbg("Exact duplicate page found via ETag. Aborting."
); |
1256 articlePage.style.display = 'none'; | 1262 articlePage.style.display = 'none'; |
1257 return; | 1263 return; |
1258 } else { | 1264 } else { |
1259 readability.pageETags[eTag] = 1; | 1265 readability.pageETags[eTag] = 1; |
1260 } | 1266 } |
1261 } | 1267 } |
1262 | 1268 |
1263 // TODO: this ends up doubling up page numbers on NYTimes ar
ticles. Need to generically parse those away. | 1269 // TODO: this ends up doubling up page numbers on NYTimes ar
ticles. Need to generically parse those away. |
1264 var page = document.createElement("DIV"); | 1270 var page = document.createElement("DIV"); |
1265 | 1271 |
1266 /** | 1272 /** |
1267 * Do some preprocessing to our HTML to make it ready for ap
pending. | 1273 * Do some preprocessing to our HTML to make it ready for ap
pending. |
1268 * • Remove any script tags. Swap and reswap newlines with a
unicode character because multiline regex doesn't work in javascript. | 1274 * • Remove any script tags. Swap and reswap newlines with a
unicode character because multiline regex doesn't work in javascript. |
1269 * • Turn any noscript tags into divs so that we can parse t
hem. This allows us to find any next page links hidden via javascript. | 1275 * • Turn any noscript tags into divs so that we can parse t
hem. This allows us to find any next page links hidden via javascript. |
1270 * • Turn all double br's into p's - was handled by prepDocu
ment in the original view. | 1276 * • Turn all double br's into p's - was handled by prepDocu
ment in the original view. |
(...skipping 30 matching lines...) Expand all Loading... |
1301 for(var i=1; i <= readability.curPageNum; i+=1) { | 1307 for(var i=1; i <= readability.curPageNum; i+=1) { |
1302 var rPage = document.getElementById('readability-pag
e-' + i); | 1308 var rPage = document.getElementById('readability-pag
e-' + i); |
1303 if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML
) !== -1) { | 1309 if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML
) !== -1) { |
1304 dbg('Duplicate of page ' + i + ' - skipping.'); | 1310 dbg('Duplicate of page ' + i + ' - skipping.'); |
1305 articlePage.style.display = 'none'; | 1311 articlePage.style.display = 'none'; |
1306 readability.parsedPages[pageUrl] = true; | 1312 readability.parsedPages[pageUrl] = true; |
1307 return; | 1313 return; |
1308 } | 1314 } |
1309 } | 1315 } |
1310 } | 1316 } |
1311 | 1317 |
1312 readability.removeScripts(content); | 1318 readability.removeScripts(content); |
1313 | 1319 |
1314 readability.moveNodeInnards(content, thisPage); | 1320 readability.moveNodeInnards(content, thisPage); |
1315 | 1321 |
1316 /** | 1322 /** |
1317 * After the page has rendered, post process the content. Th
is delay is necessary because, | 1323 * After the page has rendered, post process the content. Th
is delay is necessary because, |
1318 * in webkit at least, offsetWidth is not set in time to det
ermine image width. We have to | 1324 * in webkit at least, offsetWidth is not set in time to det
ermine image width. We have to |
1319 * wait a little bit for reflow to finish before we can fix
floating images. | 1325 * wait a little bit for reflow to finish before we can fix
floating images. |
1320 **/ | 1326 **/ |
1321 window.setTimeout( | 1327 window.setTimeout( |
1322 function() { readability.postProcessContent(thisPage); }
, | 1328 function() { readability.postProcessContent(thisPage); }
, |
1323 500 | 1329 500 |
1324 ); | 1330 ); |
1325 | 1331 |
1326 if(nextPageLink) { | 1332 if(nextPageLink) { |
1327 readability.appendNextPage(nextPageLink); | 1333 readability.appendNextPage(nextPageLink); |
1328 } | 1334 } |
1329 } | 1335 } |
1330 }); | 1336 }); |
1331 }(nextPageLink, articlePage)); | 1337 }(nextPageLink, articlePage)); |
1332 }, | 1338 }, |
1333 | 1339 |
1334 /** | 1340 /** |
1335 * Get an elements class/id weight. Uses regular expressions to tell if this
| 1341 * Get an elements class/id weight. Uses regular expressions to tell if this |
1336 * element looks good or bad. | 1342 * element looks good or bad. |
1337 * | 1343 * |
1338 * @param Element | 1344 * @param Element |
1339 * @return number (Integer) | 1345 * @return number (Integer) |
1340 **/ | 1346 **/ |
1341 getClassWeight: function (e) { | 1347 getClassWeight: function (e) { |
1342 if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { | 1348 if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { |
1343 return 0; | 1349 return 0; |
1344 } | 1350 } |
1345 | 1351 |
(...skipping 29 matching lines...) Expand all Loading... |
1375 /** | 1381 /** |
1376 * Remove extraneous break tags from a node. | 1382 * Remove extraneous break tags from a node. |
1377 * | 1383 * |
1378 * @param Element | 1384 * @param Element |
1379 * @return void | 1385 * @return void |
1380 **/ | 1386 **/ |
1381 killBreaks: function (e) { | 1387 killBreaks: function (e) { |
1382 var allElements = e.getElementsByTagName('*'); | 1388 var allElements = e.getElementsByTagName('*'); |
1383 while (i < allElements.length) { | 1389 while (i < allElements.length) { |
1384 readability.deleteExtraBreaks(allElements[i]); | 1390 readability.deleteExtraBreaks(allElements[i]); |
1385 i++; | 1391 i++; |
1386 } | 1392 } |
1387 }, | 1393 }, |
1388 | 1394 |
1389 /** | 1395 /** |
1390 * Clean a node of all elements of type "tag". | 1396 * Clean a node of all elements of type "tag". |
1391 * (Unless it's a youtube/vimeo video. People love movies.) | 1397 * (Unless it's a youtube/vimeo video. People love movies.) |
1392 * | 1398 * |
1393 * @param Element | 1399 * @param Element |
1394 * @param string tag to clean | 1400 * @param string tag to clean |
1395 * @return void | 1401 * @return void |
1396 **/ | 1402 **/ |
1397 clean: function (e, tag) { | 1403 clean: function (e, tag) { |
1398 var targetList = e.getElementsByTagName( tag ); | 1404 var targetList = e.getElementsByTagName( tag ); |
1399 var isEmbed = (tag === 'object' || tag === 'embed'); | 1405 var isEmbed = (tag === 'object' || tag === 'embed'); |
1400 | 1406 |
1401 for (var y=targetList.length-1; y >= 0; y-=1) { | 1407 for (var y=targetList.length-1; y >= 0; y-=1) { |
1402 /* Allow youtube and vimeo videos through as people usually want to
see those. */ | 1408 /* Allow youtube and vimeo videos through as people usually want to
see those. */ |
1403 if(isEmbed) { | 1409 if(isEmbed) { |
1404 var attributeValues = ""; | 1410 var attributeValues = ""; |
1405 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1)
{ | 1411 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1)
{ |
1406 attributeValues += targetList[y].attributes[i].value + '|'; | 1412 attributeValues += targetList[y].attributes[i].value + '|'; |
1407 } | 1413 } |
1408 | 1414 |
1409 /* First, check the elements attributes to see if any of them co
ntain youtube or vimeo */ | 1415 /* First, check the elements attributes to see if any of them co
ntain youtube or vimeo */ |
1410 if (attributeValues.search(readability.regexps.videos) !== -1) { | 1416 if (attributeValues.search(readability.regexps.videos) !== -1) { |
1411 continue; | 1417 continue; |
1412 } | 1418 } |
1413 | 1419 |
1414 /* Then check the elements inside this element for the same. */ | 1420 /* Then check the elements inside this element for the same. */ |
1415 if (targetList[y].innerHTML.search(readability.regexps.videos) !
== -1) { | 1421 if (targetList[y].innerHTML.search(readability.regexps.videos) !
== -1) { |
1416 continue; | 1422 continue; |
1417 } | 1423 } |
1418 | 1424 |
1419 } | 1425 } |
1420 | 1426 |
1421 targetList[y].parentNode.removeChild(targetList[y]); | 1427 targetList[y].parentNode.removeChild(targetList[y]); |
1422 } | 1428 } |
1423 }, | 1429 }, |
1424 | 1430 |
1425 /** | 1431 /** |
1426 * Clean an element of all tags of type "tag" if they look fishy. | 1432 * Clean an element of all tags of type "tag" if they look fishy. |
1427 * "Fishy" is an algorithm based on content length, classnames, link density
, number of images & embeds, etc. | 1433 * "Fishy" is an algorithm based on content length, classnames, link density
, number of images & embeds, etc. |
1428 * | 1434 * |
1429 * @return void | 1435 * @return void |
1430 **/ | 1436 **/ |
1431 cleanConditionally: function (e, tag) { | 1437 cleanConditionally: function (e, tag) { |
1432 | 1438 |
1433 if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) { | 1439 if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) { |
1434 return; | 1440 return; |
1435 } | 1441 } |
1436 | 1442 |
1437 var tagsList = e.getElementsByTagName(tag); | 1443 var tagsList = e.getElementsByTagName(tag); |
1438 var curTagsLength = tagsList.length; | 1444 var curTagsLength = tagsList.length; |
1439 | 1445 |
1440 /** | 1446 /** |
1441 * Gather counts for other typical elements embedded within. | 1447 * Gather counts for other typical elements embedded within. |
1442 * Traverse backwards so we can remove nodes at the same time without ef
fecting the traversal. | 1448 * Traverse backwards so we can remove nodes at the same time without ef
fecting the traversal. |
1443 * | 1449 * |
1444 * TODO: Consider taking into account original contentScore here. | 1450 * TODO: Consider taking into account original contentScore here. |
1445 **/ | 1451 **/ |
1446 for (var i=curTagsLength-1; i >= 0; i-=1) { | 1452 for (var i=curTagsLength-1; i >= 0; i-=1) { |
1447 var weight = readability.getClassWeight(tagsList[i]); | 1453 var weight = readability.getClassWeight(tagsList[i]); |
1448 var contentScore = (typeof tagsList[i].readability !== 'undefined')
? tagsList[i].readability.contentScore : 0; | 1454 var contentScore = (typeof tagsList[i].readability !== 'undefined')
? tagsList[i].readability.contentScore : 0; |
1449 | 1455 |
1450 dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].cla
ssName + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'unde
fined') ? (" with score " + tagsList[i].readability.contentScore) : '')); | 1456 dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].cla
ssName + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'unde
fined') ? (" with score " + tagsList[i].readability.contentScore) : '')); |
1451 | 1457 |
1452 if(weight+contentScore < 0) | 1458 if(weight+contentScore < 0) |
1453 { | 1459 { |
1454 tagsList[i].parentNode.removeChild(tagsList[i]); | 1460 tagsList[i].parentNode.removeChild(tagsList[i]); |
1455 } | 1461 } |
1456 else if ( readability.getCharCount(tagsList[i],',') < 10) { | 1462 else if ( readability.getCharCount(tagsList[i],',') < 10) { |
1457 /** | 1463 /** |
1458 * If there are not very many commas, and the number of | 1464 * If there are not very many commas, and the number of |
1459 * non-paragraph elements is more than paragraphs or other omino
us signs, remove the element. | 1465 * non-paragraph elements is more than paragraphs or other omino
us signs, remove the element. |
1460 **/ | 1466 **/ |
1461 var p = tagsList[i].getElementsByTagName("p").length; | 1467 var p = tagsList[i].getElementsByTagName("p").length; |
1462 var img = tagsList[i].getElementsByTagName("img").length; | 1468 var img = tagsList[i].getElementsByTagName("img").length; |
1463 var li = tagsList[i].getElementsByTagName("li").length-100; | 1469 var li = tagsList[i].getElementsByTagName("li").length-100; |
1464 var input = tagsList[i].getElementsByTagName("input").length; | 1470 var input = tagsList[i].getElementsByTagName("input").length; |
1465 | 1471 |
1466 var embedCount = 0; | 1472 var embedCount = 0; |
1467 var embeds = tagsList[i].getElementsByTagName("embed"); | 1473 var embeds = tagsList[i].getElementsByTagName("embed"); |
1468 for(var ei=0,il=embeds.length; ei < il; ei+=1) { | 1474 for(var ei=0,il=embeds.length; ei < il; ei+=1) { |
1469 if (embeds[ei].src.search(readability.regexps.videos) === -1
) { | 1475 if (embeds[ei].src.search(readability.regexps.videos) === -1
) { |
1470 embedCount+=1; | 1476 embedCount+=1; |
1471 } | 1477 } |
1472 } | 1478 } |
1473 | 1479 |
1474 var linkDensity = readability.getLinkDensity(tagsList[i]); | 1480 var linkDensity = readability.getLinkDensity(tagsList[i]); |
1475 var contentLength = readability.getInnerText(tagsList[i]).length
; | 1481 var contentLength = readability.getInnerText(tagsList[i]).length
; |
1476 var toRemove = false; | 1482 var toRemove = false; |
1477 | 1483 |
1478 if ( img > p ) { | 1484 if ( img > p ) { |
1479 toRemove = true; | 1485 toRemove = true; |
1480 } else if(li > p && tag !== "ul" && tag !== "ol") { | 1486 } else if(li > p && tag !== "ul" && tag !== "ol") { |
1481 toRemove = true; | 1487 toRemove = true; |
1482 } else if( input > Math.floor(p/3) ) { | 1488 } else if( input > Math.floor(p/3) ) { |
1483 toRemove = true; | 1489 toRemove = true; |
1484 } else if(contentLength < 25 && (img === 0 || img > 2) ) { | 1490 } else if(contentLength < 25 && (img === 0 || img > 2) ) { |
1485 toRemove = true; | 1491 toRemove = true; |
1486 } else if(weight < 25 && linkDensity > 0.2) { | 1492 } else if(weight < 25 && linkDensity > 0.2) { |
1487 toRemove = true; | 1493 toRemove = true; |
1488 } else if(weight >= 25 && linkDensity > 0.5) { | 1494 } else if(weight >= 25 && linkDensity > 0.5) { |
1489 toRemove = true; | 1495 toRemove = true; |
1490 } else if((embedCount === 1 && contentLength < 75) || embedCount
> 1) { | 1496 } else if((embedCount === 1 && contentLength < 75) || embedCount
> 1) { |
1491 toRemove = true; | 1497 toRemove = true; |
1492 } | 1498 } |
1493 | 1499 |
(...skipping 21 matching lines...) Expand all Loading... |
1515 } | 1521 } |
1516 }, | 1522 }, |
1517 | 1523 |
1518 flagIsActive: function(flag) { | 1524 flagIsActive: function(flag) { |
1519 return (readability.flags & flag) > 0; | 1525 return (readability.flags & flag) > 0; |
1520 }, | 1526 }, |
1521 | 1527 |
1522 addFlag: function(flag) { | 1528 addFlag: function(flag) { |
1523 readability.flags = readability.flags | flag; | 1529 readability.flags = readability.flags | flag; |
1524 }, | 1530 }, |
1525 | 1531 |
1526 removeFlag: function(flag) { | 1532 removeFlag: function(flag) { |
1527 readability.flags = readability.flags & ~flag; | 1533 readability.flags = readability.flags & ~flag; |
1528 }, | 1534 }, |
1529 | 1535 |
1530 // Removes the children of |src| and appends them to |dest|. | 1536 // Removes the children of |src| and appends them to |dest|. |
1531 moveNodeInnards: function(src, dest) { | 1537 moveNodeInnards: function(src, dest) { |
1532 try { | 1538 try { |
1533 while (src.firstChild) { | 1539 while (src.firstChild) { |
1534 dest.appendChild(src.removeChild(src.firstChild)); | 1540 dest.appendChild(src.removeChild(src.firstChild)); |
1535 } | 1541 } |
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1584 var lastBr = readability.isMultipleBr(node, false); | 1590 var lastBr = readability.isMultipleBr(node, false); |
1585 var ret = false; | 1591 var ret = false; |
1586 while (lastBr && lastBr != node) { | 1592 while (lastBr && lastBr != node) { |
1587 var toRemove = lastBr; | 1593 var toRemove = lastBr; |
1588 lastBr = lastBr.previousSibling; | 1594 lastBr = lastBr.previousSibling; |
1589 toRemove.parentNode.removeChild(toRemove); | 1595 toRemove.parentNode.removeChild(toRemove); |
1590 ret = true; | 1596 ret = true; |
1591 } | 1597 } |
1592 return ret; | 1598 return ret; |
1593 }, | 1599 }, |
1594 | 1600 |
1595 // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a | 1601 // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a |
1596 // <P> node, and makes all next siblings of that pair children of <P>, up | 1602 // <P> node, and makes all next siblings of that pair children of <P>, up |
1597 // until the next pair of <BR> nodes is reached. | 1603 // until the next pair of <BR> nodes is reached. |
1598 replaceDoubleBrWithP: function(node) { | 1604 replaceDoubleBrWithP: function(node) { |
1599 // Check that we are starting with a BR. | 1605 // Check that we are starting with a BR. |
1600 var second = readability.isMultipleBr(node, true); | 1606 var second = readability.isMultipleBr(node, true); |
1601 if (!second) { | 1607 if (!second) { |
1602 return; | 1608 return; |
1603 } | 1609 } |
1604 // Make all next siblings of the second BR into children of a P. | 1610 // Make all next siblings of the second BR into children of a P. |
1605 var p = document.createElement('p'); | 1611 var p = document.createElement('p'); |
1606 var curr = second.nextSibling; | 1612 var curr = second.nextSibling; |
1607 while (curr) { | 1613 while (curr) { |
1608 if (readability.isMultipleBr(curr, true)) { | 1614 if (readability.isMultipleBr(curr, true)) { |
1609 break; | 1615 break; |
1610 } | 1616 } |
1611 var next = curr.nextSibling; | 1617 var next = curr.nextSibling; |
1612 p.appendChild(curr.parentNode.removeChild(curr)); | 1618 p.appendChild(curr.parentNode.removeChild(curr)); |
1613 curr = next; | 1619 curr = next; |
1614 } | 1620 } |
1615 var ret = curr; | 1621 var ret = curr; |
1616 | 1622 |
1617 // Remove all nodes between the first and second BR. | 1623 // Remove all nodes between the first and second BR. |
1618 curr = node.nextSibling; | 1624 curr = node.nextSibling; |
1619 while (curr && curr != second) { | 1625 while (curr && curr != second) { |
1620 var next = curr.nextSibling; | 1626 var next = curr.nextSibling; |
1621 curr.parentNode.removeChild(curr); | 1627 curr.parentNode.removeChild(curr); |
1622 curr = next; | 1628 curr = next; |
1623 } | 1629 } |
1624 // Remove the second BR. | 1630 // Remove the second BR. |
1625 second.parentNode.removeChild(second); | 1631 second.parentNode.removeChild(second); |
1626 // Replace the first BR with the P. | 1632 // Replace the first BR with the P. |
1627 node.parentNode.replaceChild(p, node); | 1633 node.parentNode.replaceChild(p, node); |
1628 | 1634 |
1629 return ret; | 1635 return ret; |
1630 }, | 1636 }, |
1631 | 1637 |
1632 // Returns true if the NodeList contains a double <BR>. | 1638 // Returns true if the NodeList contains a double <BR>. |
1633 hasDoubleBr: function(nodeList) { | 1639 hasDoubleBr: function(nodeList) { |
1634 for (var i = 0; i < nodeList.length; nodeList++) { | 1640 for (var i = 0; i < nodeList.length; nodeList++) { |
1635 if (readability.isMultipleBr(nodeList[i], true)) { | 1641 if (readability.isMultipleBr(nodeList[i], true)) { |
1636 return true; | 1642 return true; |
1637 } | 1643 } |
1638 } | 1644 } |
1639 return false; | 1645 return false; |
1640 }, | 1646 }, |
1641 | 1647 |
1642 // Replaces double <BR> tags with <P> tags. | 1648 // Replaces double <BR> tags with <P> tags. |
1643 replaceDoubleBrsWithPs: function(node) { | 1649 replaceDoubleBrsWithPs: function(node) { |
1644 var allElements = node.getElementsByTagName('BR'); | 1650 var allElements = node.getElementsByTagName('BR'); |
1645 var node = null; | 1651 var node = null; |
1646 while (allElements && allElements.length > 0 && | 1652 while (allElements && allElements.length > 0 && |
1647 readability.hasDoubleBr(allElements)) { | 1653 readability.hasDoubleBr(allElements)) { |
1648 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex +
= 1) { | 1654 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex +
= 1) { |
1649 var next = node; | 1655 var next = node; |
1650 while (next = readability.replaceDoubleBrWithP(next)); | 1656 while (next = readability.replaceDoubleBrWithP(next)); |
1651 } | 1657 } |
1652 allElements = document.body.getElementsByTagName('BR'); | 1658 allElements = document.body.getElementsByTagName('BR'); |
1653 } | 1659 } |
1654 }, | 1660 }, |
1655 | 1661 |
1656 | 1662 |
1657 // Replaces a BR and the whitespace that follows it with a P. | 1663 // Replaces a BR and the whitespace that follows it with a P. |
1658 replaceBrWithP: function(node) { | 1664 replaceBrWithP: function(node) { |
1659 if (!readability.isBrNode(node)) { | 1665 if (!readability.isBrNode(node)) { |
1660 return; | 1666 return; |
1661 } | 1667 } |
1662 var p = document.createElement('p'); | 1668 var p = document.createElement('p'); |
1663 var curr = node.nextSibling; | 1669 var curr = node.nextSibling; |
1664 while (curr && !isBrNode(curr)) { | 1670 while (curr && !isBrNode(curr)) { |
1665 var next = curr.nextSibling; | 1671 var next = curr.nextSibling; |
1666 if (readability.isWhitespaceNode(curr)) { | 1672 if (readability.isWhitespaceNode(curr)) { |
1667 curr.parentNode.removeChild(curr); | 1673 curr.parentNode.removeChild(curr); |
1668 } else { | 1674 } else { |
1669 p.appendChild(curr.parentNode.removeChild(curr)); | 1675 p.appendChild(curr.parentNode.removeChild(curr)); |
1670 } | 1676 } |
1671 curr = next; | 1677 curr = next; |
1672 } | 1678 } |
1673 node.parentNode.replaceChild(p, node); | 1679 node.parentNode.replaceChild(p, node); |
1674 return curr; | 1680 return curr; |
1675 }, | 1681 }, |
1676 | 1682 |
1677 // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> t
ag | 1683 // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> t
ag |
1678 // children of the <P>. | 1684 // children of the <P>. |
1679 replaceBrsWithPs: function(node) { | 1685 replaceBrsWithPs: function(node) { |
1680 var allElements = node.getElementsByTagName('BR'); | 1686 var allElements = node.getElementsByTagName('BR'); |
1681 var node = null; | 1687 var node = null; |
1682 while (allElements && allElements.length > 0) { | 1688 while (allElements && allElements.length > 0) { |
1683 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex +
= 1) { | 1689 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex +
= 1) { |
1684 var next = node; | 1690 var next = node; |
1685 while (next = readability.replaceBrWithP(next)); | 1691 while (next = readability.replaceBrWithP(next)); |
1686 } | 1692 } |
1687 allElements = document.body.getElementsByTagName('BR'); | 1693 allElements = document.body.getElementsByTagName('BR'); |
1688 } | 1694 } |
1689 }, | 1695 }, |
1690 | 1696 |
1691 // Replaces any tag with any other tag. | 1697 // Replaces any tag with any other tag. |
1692 replaceTagsWithTags: function(node, srcTag, destTag) { | 1698 replaceTagsWithTags: function(node, srcTag, destTag) { |
1693 var allElements = node.getElementsByTagName(srcTag); | 1699 var allElements = node.getElementsByTagName(srcTag); |
1694 for (var i = 0; i < allElements.length; i++) { | 1700 for (var i = 0; i < allElements.length; i++) { |
1695 var dest = document.createElement(destTag); | 1701 var dest = document.createElement(destTag); |
1696 readability.moveNodeInnards(allElements[i], dest); | 1702 readability.moveNodeInnards(allElements[i], dest); |
1697 node.replaceNode(dest, allElements[i]); | 1703 allElements[i].parentNode.replaceChild(dest, allElements[i]); |
1698 } | 1704 } |
1699 }, | 1705 }, |
1700 | 1706 |
1701 // Replaces all <noscript> tags with <p> tags. | 1707 // Replaces all <noscript> tags with <p> tags. |
1702 replaceNoscriptsWithPs: function(node) { | 1708 replaceNoscriptsWithPs: function(node) { |
1703 readability.replaceTagsWithTags(node, 'noscript', 'p'); | 1709 readability.replaceTagsWithTags(node, 'noscript', 'p'); |
1704 }, | 1710 }, |
1705 | 1711 |
1706 // Replaces all <font> tags with <span> tags. | 1712 // Replaces all <font> tags with <span> tags. |
1707 replaceFontsWithSpans: function(node) { | 1713 replaceFontsWithSpans: function(node) { |
1708 readability.replaceTagsWithTags(node, 'font', 'span'); | 1714 readability.replaceTagsWithTags(node, 'font', 'span'); |
1709 }, | 1715 }, |
1710 | 1716 |
1711 // Returns a list of image URLs in the distilled article. | 1717 // Returns a list of image URLs in the distilled article. |
1712 getImages : function() { | 1718 getImages : function() { |
1713 var images = document.getElementsByTagName('img'); | 1719 var images = document.getElementsByTagName('img'); |
1714 var result = new Array(images.length); | 1720 var result = new Array(images.length); |
1715 dbg("Number of images: " + images.length); | 1721 dbg("Number of images: " + images.length); |
1716 for(i = 0; i < images.length; i++) { | 1722 for(i = 0; i < images.length; i++) { |
1717 result[i] = images[i].src; | 1723 result[i] = images[i].src; |
1718 dbg("Image: " + result[i]); | 1724 dbg("Image: " + result[i]); |
1719 } | 1725 } |
1720 return result; | 1726 return result; |
1721 }, | 1727 }, |
1722 | 1728 |
1723 // Returns the distilled article HTML from the page(s). | 1729 // Returns the distilled article HTML from the page(s). |
1724 getDistilledArticleHTML : function() { | 1730 getDistilledArticleHTML : function() { |
1725 return readability.distilledHTML; | 1731 return readability.distilledHTML; |
| 1732 }, |
| 1733 |
| 1734 // Returns the next page of this article. |
| 1735 getNextPageLink : function() { |
| 1736 return readability.nextPageLink; |
1726 } | 1737 } |
1727 }; | 1738 }; |
1728 | 1739 |
1729 // Extracts long-form content from a page and returns and array where the first | 1740 // Extracts long-form content from a page and returns and array where the first |
1730 // element is the article title, the second element is HTML containing the | 1741 // element is the article title, the second element is HTML containing the |
1731 // long-form content, and remaining elements are URLs for images referenced by | 1742 // long-form content, and remaining elements are URLs for images referenced by |
1732 // that HTML. Each <img> tag in the HTML has an id field set to k - 2, which | 1743 // that HTML. Each <img> tag in the HTML has an id field set to k - 2, which |
1733 // corresponds to a URL listed at index k in the array returned. | 1744 // corresponds to a URL listed at index k in the array returned. |
1734 (function () { | 1745 (function () { |
1735 readability.init(); | 1746 readability.init(); |
1736 var result = new Array(2); | 1747 var result = new Array(3); |
1737 result[0] = readability.getArticleTitle(); | 1748 result[0] = readability.getArticleTitle(); |
1738 result[1] = readability.getDistilledArticleHTML(); | 1749 result[1] = readability.getDistilledArticleHTML(); |
| 1750 result[2] = readability.getNextPageLink(); |
1739 return result.concat(readability.getImages()); | 1751 return result.concat(readability.getImages()); |
1740 }()) | 1752 }()) |
1741 | 1753 |
OLD | NEW |