OLD | NEW |
---|---|
1 | 1 |
2 var dbg = (typeof console !== 'undefined') ? function(s) { | 2 var dbg = (typeof console !== 'undefined') ? function(s) { |
3 console.log("Readability: " + s); | 3 console.log("Readability: " + s); |
4 } : function() {}; | 4 } : function() {}; |
5 | 5 |
6 /* | 6 /* |
7 * Readability. An Arc90 Lab Experiment. | 7 * Readability. An Arc90 Lab Experiment. |
8 * Website: http://lab.arc90.com/experiments/readability | 8 * Website: http://lab.arc90.com/experiments/readability |
9 * Source: http://code.google.com/p/arc90labs-readability | 9 * Source: http://code.google.com/p/arc90labs-readability |
10 * | 10 * |
11 * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission. | 11 * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission. |
12 * | 12 * |
13 * Copyright (c) 2010 Arc90 Inc | 13 * Copyright (c) 2010 Arc90 Inc |
14 * Readability is licensed under the Apache License, Version 2.0. | 14 * Readability is licensed under the Apache License, Version 2.0. |
15 **/ | 15 **/ |
16 var readability = { | 16 var readability = { |
17 readStyle: "style-newspaper", | 17 readStyle: "style-newspaper", |
18 readSize: "size-medium", | 18 readSize: "size-medium", |
19 readMargin: "margin-wide", | 19 readMargin: "margin-wide", |
20 | 20 |
21 distilledHTML: '', | 21 distilledHTML: '', |
22 distilledArticleContent: null, | 22 distilledArticleContent: null, |
23 nextPageLink: '', | |
23 | 24 |
24 version: '1.7.1', | 25 version: '1.7.1', |
25 iframeLoads: 0, | 26 iframeLoads: 0, |
26 convertLinksToFootnotes: false, | 27 convertLinksToFootnotes: false, |
27 reversePageScroll: false, /* If they hold shift and hit space, scroll up */ | 28 reversePageScroll: false, /* If they hold shift and hit space, scroll up */ |
28 frameHack: false, /** | 29 frameHack: false, /** |
29 * The frame hack is to workaround a firefox bug where if you | 30 * The frame hack is to workaround a firefox bug where if you |
30 * pull content out of a frame and stick it into the parent element, the scrollbar won't appear. | 31 * pull content out of a frame and stick it into the parent element, the scrollbar won't appear. |
31 * So we fake a scrollbar in the wrapping div. | 32 * So we fake a scrollbar in the wrapping div. |
32 **/ | 33 **/ |
33 biggestFrame: false, | 34 biggestFrame: false, |
34 flags: 0x1 | 0x2 | 0x4, /* Start with all flags set. */ | 35 flags: 0x1 | 0x2 | 0x4, /* Start with all flags set. */ |
35 | 36 |
36 /* constants */ | 37 /* constants */ |
37 FLAG_STRIP_UNLIKELYS: 0x1, | 38 FLAG_STRIP_UNLIKELYS: 0x1, |
38 FLAG_WEIGHT_CLASSES: 0x2, | 39 FLAG_WEIGHT_CLASSES: 0x2, |
39 FLAG_CLEAN_CONDITIONALLY: 0x4, | 40 FLAG_CLEAN_CONDITIONALLY: 0x4, |
40 | 41 |
41 maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */ | 42 maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */ |
42 parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */ | 43 parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */ |
43 pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */ | 44 pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */ |
44 | 45 |
45 /** | 46 /** |
46 * All of the regular expressions in use within readability. | 47 * All of the regular expressions in use within readability. |
47 * Defined up here so we don't instantiate them repeatedly in loops. | 48 * Defined up here so we don't instantiate them repeatedly in loops. |
48 **/ | 49 **/ |
49 regexps: { | 50 regexps: { |
50 unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i, | 51 unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i, |
51 okMaybeItsACandidate: /and|article|body|column|main|shadow/i, | 52 okMaybeItsACandidate: /and|article|body|column|main|shadow/i, |
52 positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i, | 53 positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i, |
53 negative: /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i, | 54 negative: /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i, |
54 extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single/i, | 55 extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single/i, |
55 divToPElements: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, | 56 divToPElements: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, |
56 replaceBrs: /(<br[^>]*>[ \n\r\t]*){2,}/gi, | 57 replaceBrs: /(<br[^>]*>[ \n\r\t]*){2,}/gi, |
57 replaceFonts: /<(\/?)font[^>]*>/gi, | 58 replaceFonts: /<(\/?)font[^>]*>/gi, |
58 trim: /^\s+|\s+$/g, | 59 trim: /^\s+|\s+$/g, |
59 normalize: /\s{2,}/g, | 60 normalize: /\s{2,}/g, |
60 killBreaks: /(<br\s*\/?>(\s|&nbsp;?)*){1,}/g, | 61 killBreaks: /(<br\s*\/?>(\s|&nbsp;?)*){1,}/g, |
61 videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i, | 62 videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i, |
62 skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i, | 63 skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i, |
63 nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last. | 64 nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last. |
64 prevLink: /(prev|earl|old|new|<|«)/i | 65 prevLink: /(prev|earl|old|new|<|«)/i |
65 }, | 66 }, |
66 | 67 |
67 /** | 68 /** |
68 * Runs readability. | 69 * Runs readability. |
69 * | 70 * |
70 * Workflow: | 71 * Workflow: |
71 * 1. Prep the document by removing script tags, css, etc. | 72 * 1. Prep the document by removing script tags, css, etc. |
72 * 2. Build readability's DOM tree. | 73 * 2. Build readability's DOM tree. |
73 * 3. Grab the article content from the current dom tree. | 74 * 3. Grab the article content from the current dom tree. |
74 * 4. Replace the current DOM tree with the new one. | 75 * 4. Replace the current DOM tree with the new one. |
75 * 5. Read peacefully. | 76 * 5. Read peacefully. |
76 * | 77 * |
77 * @return void | 78 * @return void |
78 **/ | 79 **/ |
79 init: function() { | 80 init: function() { |
80 /* Before we do anything, remove all scripts that are not readability. */ | 81 /* Before we do anything, remove all scripts that are not readability. */ |
81 window.onload = window.onunload = function() {}; | 82 window.onload = window.onunload = function() {}; |
82 | 83 |
83 readability.removeScripts(document); | 84 readability.removeScripts(document); |
84 | 85 |
85 /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */ | 86 /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */ |
86 readability.parsedPages[window.location.href.replace(/\/$/, '')] = true; | 87 readability.parsedPages[window.location.href.replace(/\/$/, '')] = true; |
87 | 88 |
88 /* Pull out any possible next page link first */ | 89 /* Pull out any possible next page link first */ |
89 var nextPageLink = readability.findNextPageLink(document.body); | 90 readability.nextPageLink = readability.findNextPageLink(document.body); |
90 | 91 |
92 /* We handle processing of nextPage from C++ set nextPageLink to null */ | |
93 var nextPageLink = null; | |
94 | |
91 readability.prepDocument(); | 95 readability.prepDocument(); |
92 | 96 |
93 /* Build readability's DOM tree */ | 97 /* Build readability's DOM tree */ |
94 var overlay = document.createElement("DIV"); | 98 var overlay = document.createElement("DIV"); |
95 var innerDiv = document.createElement("DIV"); | 99 var innerDiv = document.createElement("DIV"); |
96 var articleTools = readability.getArticleTools(); | 100 var articleTools = readability.getArticleTools(); |
97 var articleTitleText = readability.getArticleTitle(); | 101 var articleTitleText = readability.getArticleTitle(); |
98 var articleContent = readability.grabArticle(); | 102 var articleContent = readability.grabArticle(); |
99 | 103 |
100 if(!articleContent) { | 104 if(!articleContent) { |
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
145 rootWarning.innerHTML = "<em>Readability</em> was intended for use on individual articles and not home pages. " + | 149 rootWarning.innerHTML = "<em>Readability</em> was intended for use on individual articles and not home pages. " + |
146 "If you'd like to try rendering this page anyway, <a onClick='javascript:document.getElementById(\"readability-warning\").style.display=\"none\";document.getElementById(\"readability-content\").style.display=\"block\";'>click here</a> to continue."; | 150 "If you'd like to try rendering this page anyway, <a onClick='javascript:document.getElementById(\"readability-warning\").style.display=\"none\";document.getElementById(\"readability-content\").style.display=\"block\";'>click here</a> to continue."; |
147 | 151 |
148 innerDiv.insertBefore( rootWarning, articleContent ); | 152 innerDiv.insertBefore( rootWarning, articleContent ); |
149 } | 153 } |
150 | 154 |
151 readability.postProcessContent(articleContent); | 155 readability.postProcessContent(articleContent); |
152 | 156 |
153 window.scrollTo(0, 0); | 157 window.scrollTo(0, 0); |
154 | 158 |
155 // TODO(bengr): Remove this assignment of null to nextPageLink when | 159 // TODO(bengr): Remove this assignment of null to nextPageLink when |
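Yaron (2014/01/29 20:03:41): Please remove this to-do and subsequent null-ing of nextPageLink. [end of sentence reconstructed; the comment preview is truncated after "null-ing o"]
shashi (2014/01/29 22:51:37): Done.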
| |
156 // the processing of the next page link is safe. | 160 // the processing of the next page link is safe. |
157 nextPageLink = null; | 161 nextPageLink = null; |
158 | 162 |
159 if (nextPageLink) { | 163 if (nextPageLink) { |
160 /** | 164 /** |
161 * Append any additional pages after a small timeout so that people | 165 * Append any additional pages after a small timeout so that people |
162 * can start reading without having to wait for this to finish processing. | 166 * can start reading without having to wait for this to finish processing. |
163 **/ | 167 **/ |
164 window.setTimeout(function() { | 168 window.setTimeout(function() { |
165 readability.appendNextPage(nextPageLink); | 169 readability.appendNextPage(nextPageLink); |
166 }, 500); | 170 }, 500); |
167 } | 171 } |
168 | 172 |
169 /** Smooth scrolling **/ | 173 /** Smooth scrolling **/ |
170 document.onkeydown = function(e) { | 174 document.onkeydown = function(e) { |
171 var code = (window.event) ? event.keyCode : e.keyCode; | 175 var code = (window.event) ? event.keyCode : e.keyCode; |
172 if (code === 16) { | 176 if (code === 16) { |
173 readability.reversePageScroll = true; | 177 readability.reversePageScroll = true; |
174 return; | 178 return; |
175 } | 179 } |
176 | 180 |
177 if (code === 32) { | 181 if (code === 32) { |
178 readability.curScrollStep = 0; | 182 readability.curScrollStep = 0; |
179 var windowHeight = window.innerHeight ? window.innerHeight : (document.documentElement.clientHeight ? document.documentElement.clientHeight : document.body.clientHeight); | 183 var windowHeight = window.innerHeight ? window.innerHeight : (document.documentElement.clientHeight ? document.documentElement.clientHeight : document.body.clientHeight); |
180 | 184 |
181 if(readability.reversePageScroll) { | 185 if(readability.reversePageScroll) { |
182 readability.scrollTo(readability.scrollTop(), readability.scrollTop() - (windowHeight - 50), 20, 10); | 186 readability.scrollTo(readability.scrollTop(), readability.scrollTop() - (windowHeight - 50), 20, 10); |
183 } | 187 } |
184 else { | 188 else { |
185 readability.scrollTo(readability.scrollTop(), readability.scrollTop() + (windowHeight - 50), 20, 10); | 189 readability.scrollTo(readability.scrollTop(), readability.scrollTop() + (windowHeight - 50), 20, 10); |
186 } | 190 } |
187 | 191 |
188 return false; | 192 return false; |
189 } | 193 } |
190 }; | 194 }; |
191 | 195 |
192 document.onkeyup = function(e) { | 196 document.onkeyup = function(e) { |
193 var code = (window.event) ? event.keyCode : e.keyCode; | 197 var code = (window.event) ? event.keyCode : e.keyCode; |
194 if (code === 16) { | 198 if (code === 16) { |
195 readability.reversePageScroll = false; | 199 readability.reversePageScroll = false; |
196 return; | 200 return; |
197 } | 201 } |
198 }; | 202 }; |
199 }, | 203 }, |
200 | 204 |
201 /** | 205 /** |
202 * Run any post-process modifications to article content as necessary. | 206 * Run any post-process modifications to article content as necessary. |
203 * | 207 * |
204 * @param Element | 208 * @param Element |
205 * @return void | 209 * @return void |
206 **/ | 210 **/ |
207 postProcessContent: function(articleContent) { | 211 postProcessContent: function(articleContent) { |
208 if(readability.convertLinksToFootnotes && !window.location.href.match(/wikipedia\.org/g)) { | 212 if(readability.convertLinksToFootnotes && !window.location.href.match(/wikipedia\.org/g)) { |
209 readability.addFootnotes(articleContent); | 213 readability.addFootnotes(articleContent); |
210 } | 214 } |
211 | 215 |
212 readability.fixImageFloats(articleContent); | 216 readability.fixImageFloats(articleContent); |
213 }, | 217 }, |
214 | 218 |
215 /** | 219 /** |
216 * Some content ends up looking ugly if the image is too large to be floated. | 220 * Some content ends up looking ugly if the image is too large to be floated. |
217 * If the image is wider than a threshold (currently 55%), no longer float it, | 221 * If the image is wider than a threshold (currently 55%), no longer float it, |
218 * center it instead. | 222 * center it instead. |
219 * | 223 * |
220 * @param Element | 224 * @param Element |
221 * @return void | 225 * @return void |
222 **/ | 226 **/ |
223 fixImageFloats: function (articleContent) { | 227 fixImageFloats: function (articleContent) { |
224 var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0.55, | 228 var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0.55, |
225 images = articleContent.getElementsByTagName('img'); | 229 images = articleContent.getElementsByTagName('img'); |
226 | 230 |
227 for(var i=0, il = images.length; i < il; i+=1) { | 231 for(var i=0, il = images.length; i < il; i+=1) { |
228 var image = images[i]; | 232 var image = images[i]; |
229 | 233 |
230 if(image.offsetWidth > imageWidthThreshold) { | 234 if(image.offsetWidth > imageWidthThreshold) { |
231 image.className += " blockImage"; | 235 image.className += " blockImage"; |
232 } | 236 } |
233 } | 237 } |
234 }, | 238 }, |
235 | 239 |
236 /** | 240 /** |
237 * Get the article tools Element that has buttons like reload, print. | 241 * Get the article tools Element that has buttons like reload, print. |
238 * | 242 * |
239 * @return void | 243 * @return void |
240 **/ | 244 **/ |
241 getArticleTools: function () { | 245 getArticleTools: function () { |
242 var articleTools = document.createElement("DIV"); | 246 var articleTools = document.createElement("DIV"); |
243 | 247 |
244 articleTools.id = "readTools"; | 248 articleTools.id = "readTools"; |
245 articleTools.innerHTML = | 249 articleTools.innerHTML = |
246 "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" + | 250 "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" + |
247 "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" + | 251 "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" + |
248 "<a href='#' onclick='readability.emailBox(); return false;' title='Email page' id='email-page'>Email Page</a>"; | 252 "<a href='#' onclick='readability.emailBox(); return false;' title='Email page' id='email-page'>Email Page</a>"; |
249 | 253 |
250 return articleTools; | 254 return articleTools; |
251 }, | 255 }, |
252 | 256 |
253 /** | 257 /** |
254 * retuns the suggested direction of the string | 258 * retuns the suggested direction of the string |
255 * | 259 * |
256 * @return "rtl" || "ltr" | 260 * @return "rtl" || "ltr" |
257 **/ | 261 **/ |
258 getSuggestedDirection: function(text) { | 262 getSuggestedDirection: function(text) { |
259 function sanitizeText() { | 263 function sanitizeText() { |
260 return text.replace(/@\w+/, ""); | 264 return text.replace(/@\w+/, ""); |
261 } | 265 } |
262 | 266 |
263 function countMatches(match) { | 267 function countMatches(match) { |
264 var matches = text.match(new RegExp(match, "g")); | 268 var matches = text.match(new RegExp(match, "g")); |
265 return matches !== null ? matches.length : 0; | 269 return matches !== null ? matches.length : 0; |
266 } | 270 } |
267 | 271 |
268 function isRTL() { | 272 function isRTL() { |
269 var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]"); | 273 var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]"); |
270 var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]"); | 274 var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]"); |
271 | 275 |
272 // if 20% of chars are Hebrew or Arbic then direction is rtl | 276 // if 20% of chars are Hebrew or Arbic then direction is rtl |
273 return (count_heb + count_arb) * 100 / text.length > 20; | 277 return (count_heb + count_arb) * 100 / text.length > 20; |
274 } | 278 } |
275 | 279 |
276 text = sanitizeText(text); | 280 text = sanitizeText(text); |
277 return isRTL() ? "rtl" : "ltr"; | 281 return isRTL() ? "rtl" : "ltr"; |
278 }, | 282 }, |
279 | 283 |
280 /** | 284 /** |
281 * Get the article title as an H1. | 285 * Get the article title as an H1. |
282 * | 286 * |
283 * @return void | 287 * @return void |
284 **/ | 288 **/ |
285 getArticleTitle: function () { | 289 getArticleTitle: function () { |
286 var curTitle = "", | 290 var curTitle = "", |
287 origTitle = ""; | 291 origTitle = ""; |
288 | 292 |
289 try { | 293 try { |
290 curTitle = origTitle = document.title; | 294 curTitle = origTitle = document.title; |
291 if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */ | 295 if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */ |
292 curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]); | 296 curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]); |
293 } | 297 } |
294 } | 298 } |
295 catch(e) {} | 299 catch(e) {} |
296 | 300 |
297 if(curTitle.match(/ [\|\-] /)) | 301 if(curTitle.match(/ [\|\-] /)) |
298 { | 302 { |
299 curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1'); | 303 curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1'); |
300 | 304 |
301 if(curTitle.split(' ').length < 3) { | 305 if(curTitle.split(' ').length < 3) { |
302 curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1'); | 306 curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1'); |
303 } | 307 } |
304 } | 308 } |
305 else if(curTitle.indexOf(': ') !== -1) | 309 else if(curTitle.indexOf(': ') !== -1) |
306 { | 310 { |
307 curTitle = origTitle.replace(/.*:(.*)/gi, '$1'); | 311 curTitle = origTitle.replace(/.*:(.*)/gi, '$1'); |
308 | 312 |
309 if(curTitle.split(' ').length < 3) { | 313 if(curTitle.split(' ').length < 3) { |
310 curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1'); | 314 curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1'); |
(...skipping 12 matching lines...) Expand all Loading... | |
323 | 327 |
324 if(curTitle.split(' ').length <= 4) { | 328 if(curTitle.split(' ').length <= 4) { |
325 curTitle = origTitle; | 329 curTitle = origTitle; |
326 } | 330 } |
327 return curTitle; | 331 return curTitle; |
328 }, | 332 }, |
329 | 333 |
330 /** | 334 /** |
331 * Prepare the HTML document for readability to scrape it. | 335 * Prepare the HTML document for readability to scrape it. |
332 * This includes things like stripping javascript, CSS, and handling terrible markup. | 336 * This includes things like stripping javascript, CSS, and handling terrible markup. |
333 * | 337 * |
334 * @return void | 338 * @return void |
335 **/ | 339 **/ |
336 prepDocument: function () { | 340 prepDocument: function () { |
337 /** | 341 /** |
338 * In some cases a body element can't be found (if the HTML is totally hosed for example) | 342 * In some cases a body element can't be found (if the HTML is totally hosed for example) |
339 * so we create a new body node and append it to the document. | 343 * so we create a new body node and append it to the document. |
340 */ | 344 */ |
341 if(document.body === null) | 345 if(document.body === null) |
342 { | 346 { |
343 var body = document.createElement("body"); | 347 var body = document.createElement("body"); |
344 try { | 348 try { |
345 document.body = body; | 349 document.body = body; |
346 } | 350 } |
347 catch(e) { | 351 catch(e) { |
348 document.documentElement.appendChild(body); | 352 document.documentElement.appendChild(body); |
349 dbg(e); | 353 dbg(e); |
350 } | 354 } |
351 } | 355 } |
352 | 356 |
353 document.body.id = "readabilityBody"; | 357 document.body.id = "readabilityBody"; |
354 | 358 |
355 var frames = document.getElementsByTagName('frame'); | 359 var frames = document.getElementsByTagName('frame'); |
(...skipping 11 matching lines...) Expand all Loading... | |
367 canAccessFrame = true; | 371 canAccessFrame = true; |
368 } | 372 } |
369 catch(eFrames) { | 373 catch(eFrames) { |
370 dbg(eFrames); | 374 dbg(eFrames); |
371 } | 375 } |
372 | 376 |
373 if(frameSize > biggestFrameSize) { | 377 if(frameSize > biggestFrameSize) { |
374 biggestFrameSize = frameSize; | 378 biggestFrameSize = frameSize; |
375 readability.biggestFrame = frames[frameIndex]; | 379 readability.biggestFrame = frames[frameIndex]; |
376 } | 380 } |
377 | 381 |
378 if(canAccessFrame && frameSize > bestFrameSize) | 382 if(canAccessFrame && frameSize > bestFrameSize) |
379 { | 383 { |
380 readability.frameHack = true; | 384 readability.frameHack = true; |
381 | 385 |
382 bestFrame = frames[frameIndex]; | 386 bestFrame = frames[frameIndex]; |
383 bestFrameSize = frameSize; | 387 bestFrameSize = frameSize; |
384 } | 388 } |
385 } | 389 } |
386 | 390 |
387 if(bestFrame) | 391 if(bestFrame) |
388 { | 392 { |
389 var newBody = document.createElement('body'); | 393 var newBody = document.createElement('body'); |
390 readability.moveNodeInnards(bestFrame.contentWindow.document.body, newBody); | 394 readability.moveNodeInnards(bestFrame.contentWindow.document.body, newBody); |
391 newBody.style.overflow = 'scroll'; | 395 newBody.style.overflow = 'scroll'; |
392 document.body = newBody; | 396 document.body = newBody; |
393 | 397 |
394 var frameset = document.getElementsByTagName('frameset')[0]; | 398 var frameset = document.getElementsByTagName('frameset')[0]; |
395 if(frameset) { | 399 if(frameset) { |
396 frameset.parentNode.removeChild(frameset); } | 400 frameset.parentNode.removeChild(frameset); } |
397 } | 401 } |
398 } | 402 } |
399 | 403 |
400 /* Remove all stylesheets */ | 404 /* Remove all stylesheets */ |
401 for (var k=0;k < document.styleSheets.length; k+=1) { | 405 for (var k=0;k < document.styleSheets.length; k+=1) { |
402 if (document.styleSheets[k].href !== null && document.styleSheets[k].href.lastIndexOf("readability") === -1) { | 406 if (document.styleSheets[k].href !== null && document.styleSheets[k].href.lastIndexOf("readability") === -1) { |
403 document.styleSheets[k].disabled = true; | 407 document.styleSheets[k].disabled = true; |
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
448 readability.cleanConditionally(articleContent, "table"); | 452 readability.cleanConditionally(articleContent, "table"); |
449 readability.cleanConditionally(articleContent, "ul"); | 453 readability.cleanConditionally(articleContent, "ul"); |
450 readability.cleanConditionally(articleContent, "div"); | 454 readability.cleanConditionally(articleContent, "div"); |
451 | 455 |
452 /* Remove extra paragraphs */ | 456 /* Remove extra paragraphs */ |
453 var articleParagraphs = articleContent.getElementsByTagName('p'); | 457 var articleParagraphs = articleContent.getElementsByTagName('p'); |
454 for(var i = articleParagraphs.length-1; i >= 0; i-=1) { | 458 for(var i = articleParagraphs.length-1; i >= 0; i-=1) { |
455 var imgCount = articleParagraphs[i].getElementsByTagName('img').length; | 459 var imgCount = articleParagraphs[i].getElementsByTagName('img').length; |
456 var embedCount = articleParagraphs[i].getElementsByTagName('embed').length; | 460 var embedCount = articleParagraphs[i].getElementsByTagName('embed').length; |
457 var objectCount = articleParagraphs[i].getElementsByTagName('object').length; | 461 var objectCount = articleParagraphs[i].getElementsByTagName('object').length; |
458 | 462 |
459 if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readability.getInnerText(articleParagraphs[i], false) === '') { | 463 if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readability.getInnerText(articleParagraphs[i], false) === '') { |
460 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]); | 464 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]); |
461 } | 465 } |
462 } | 466 } |
463 | 467 |
464 try { | 468 try { |
465 readability.replaceBrsWithPs(articleContent); | 469 readability.replaceBrsWithPs(articleContent); |
466 } | 470 } |
467 catch (e) { | 471 catch (e) { |
468 dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " + e); | 472 dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " + e); |
469 } | 473 } |
470 }, | 474 }, |
471 | 475 |
472 /** | 476 /** |
473 * Initialize a node with the readability object. Also checks the | 477 * Initialize a node with the readability object. Also checks the |
474 * className/id for special names to add to its score. | 478 * className/id for special names to add to its score. |
475 * | 479 * |
476 * @param Element | 480 * @param Element |
477 * @return void | 481 * @return void |
478 **/ | 482 **/ |
479 initializeNode: function (node) { | 483 initializeNode: function (node) { |
480 node.readability = {"contentScore": 0}; | 484 node.readability = {"contentScore": 0}; |
481 | 485 |
482 switch(node.tagName) { | 486 switch(node.tagName) { |
483 case 'DIV': | 487 case 'DIV': |
484 node.readability.contentScore += 5; | 488 node.readability.contentScore += 5; |
485 break; | 489 break; |
486 | 490 |
487 case 'PRE': | 491 case 'PRE': |
488 case 'TD': | 492 case 'TD': |
489 case 'BLOCKQUOTE': | 493 case 'BLOCKQUOTE': |
490 node.readability.contentScore += 3; | 494 node.readability.contentScore += 3; |
491 break; | 495 break; |
492 | 496 |
493 case 'ADDRESS': | 497 case 'ADDRESS': |
494 case 'OL': | 498 case 'OL': |
495 case 'UL': | 499 case 'UL': |
496 case 'DL': | 500 case 'DL': |
497 case 'DD': | 501 case 'DD': |
498 case 'DT': | 502 case 'DT': |
499 case 'LI': | 503 case 'LI': |
500 case 'FORM': | 504 case 'FORM': |
501 node.readability.contentScore -= 3; | 505 node.readability.contentScore -= 3; |
502 break; | 506 break; |
503 | 507 |
504 case 'H1': | 508 case 'H1': |
505 case 'H2': | 509 case 'H2': |
506 case 'H3': | 510 case 'H3': |
507 case 'H4': | 511 case 'H4': |
508 case 'H5': | 512 case 'H5': |
509 case 'H6': | 513 case 'H6': |
510 case 'TH': | 514 case 'TH': |
511 node.readability.contentScore -= 5; | 515 node.readability.contentScore -= 5; |
512 break; | 516 break; |
513 } | 517 } |
514 | 518 |
515 node.readability.contentScore += readability.getClassWeight(node); | 519 node.readability.contentScore += readability.getClassWeight(node); |
516 }, | 520 }, |
517 | 521 |
518 /*** | 522 /*** |
519 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is | 523 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is |
520 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. | 524 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. |
521 * | 525 * |
522 * @param page a document to run upon. Needs to be a full document, complete with body. | 526 * @param page a document to run upon. Needs to be a full document, complete with body. |
523 * @return Element | 527 * @return Element |
524 **/ | 528 **/ |
525 grabArticle: function (pageToClone) { | 529 grabArticle: function (pageToClone) { |
526 var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS), | 530 var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS), |
527 isPaging = (page !== null) ? true: false; | 531 isPaging = (page !== null) ? true: false; |
528 | 532 |
529 var page = null; | 533 var page = null; |
530 // Never work on the actual page. | 534 // Never work on the actual page. |
531 if (isPaging) { | 535 if (isPaging) { |
532 page = document.body.cloneNode(true); | 536 page = document.body.cloneNode(true); |
533 } else { | 537 } else { |
534 page = pageToClone.cloneNode(true); | 538 page = pageToClone.cloneNode(true); |
535 } | 539 } |
536 | 540 |
537 var allElements = page.getElementsByTagName('*'); | 541 var allElements = page.getElementsByTagName('*'); |
538 | 542 |
539 /** | 543 /** |
540 * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs | 544 * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs |
541 * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.) | 545 * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.) |
542 * | 546 * |
543 * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 | 547 * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 |
544 * TODO: Shouldn't this be a reverse traversal? | 548 * TODO: Shouldn't this be a reverse traversal? |
545 **/ | 549 **/ |
546 var node = null; | 550 var node = null; |
547 var nodesToScore = []; | 551 var nodesToScore = []; |
548 for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) { | 552 for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) { |
549 /* Remove unlikely candidates */ | 553 /* Remove unlikely candidates */ |
550 if (stripUnlikelyCandidates) { | 554 if (stripUnlikelyCandidates) { |
551 var unlikelyMatchString = node.className + node.id; | 555 var unlikelyMatchString = node.className + node.id; |
552 if ( | 556 if ( |
553 ( | 557 ( |
554 unlikelyMatchString.search(readability.regexps.unlikelyCandidates) !== -1 && | 558 unlikelyMatchString.search(readability.regexps.unlikelyCandidates) !== -1 && |
555 unlikelyMatchString.search(readability.regexps.okMaybeItsACandidate) === -1 && | 559 unlikelyMatchString.search(readability.regexps.okMaybeItsACandidate) === -1 && |
556 node.tagName !== "BODY" | 560 node.tagName !== "BODY" |
557 ) | 561 ) |
558 ) | 562 ) |
559 { | 563 { |
560 dbg("Removing unlikely candidate - " + unlikelyMatchString); | 564 dbg("Removing unlikely candidate - " + unlikelyMatchString); |
561 node.parentNode.removeChild(node); | 565 node.parentNode.removeChild(node); |
562 nodeIndex-=1; | 566 nodeIndex-=1; |
563 continue; | 567 continue; |
564 } | 568 } |
565 } | 569 } |
566 | 570 |
567 if (node.tagName === "P" || node.tagName === "TD" || node.tagName == = "PRE") { | 571 if (node.tagName === "P" || node.tagName === "TD" || node.tagName == = "PRE") { |
568 nodesToScore[nodesToScore.length] = node; | 572 nodesToScore[nodesToScore.length] = node; |
569 } | 573 } |
570 | 574 |
571 /* Turn all divs that don't have children block level elements into p's */ | 575 /* Turn all divs that don't have children block level elements into p's */ |
572 if (node.tagName === "DIV") { | 576 if (node.tagName === "DIV") { |
573 if (node.innerHTML.search(readability.regexps.divToPElements) == = -1) { | 577 if (node.innerHTML.search(readability.regexps.divToPElements) == = -1) { |
574 var newNode = document.createElement('p'); | 578 var newNode = document.createElement('p'); |
(...skipping 16 matching lines...) Expand all Loading... | |
591 if(childNode.nodeType === 3) { // Node.TEXT_NODE | 595 if(childNode.nodeType === 3) { // Node.TEXT_NODE |
592 var p = document.createElement('p'); | 596 var p = document.createElement('p'); |
593 var t = document.createTextNode(childNode.nodeValue) ; | 597 var t = document.createTextNode(childNode.nodeValue) ; |
594 p.appendChild(t); | 598 p.appendChild(t); |
595 p.style.display = 'inline'; | 599 p.style.display = 'inline'; |
596 p.className = 'readability-styled'; | 600 p.className = 'readability-styled'; |
597 childNode.parentNode.replaceChild(p, childNode); | 601 childNode.parentNode.replaceChild(p, childNode); |
598 } | 602 } |
599 } | 603 } |
600 } | 604 } |
601 } | 605 } |
602 } | 606 } |
603 | 607 |
604 /** | 608 /** |
605 * Loop through all paragraphs, and assign a score to them based on how content-y they look. | 609 * Loop through all paragraphs, and assign a score to them based on how content-y they look. |
606 * Then add their score to their parent node. | 610 * Then add their score to their parent node. |
607 * | 611 * |
608 * A score is determined by things like number of commas, class names, e tc. Maybe eventually link density. | 612 * A score is determined by things like number of commas, class names, e tc. Maybe eventually link density. |
609 **/ | 613 **/ |
610 var candidates = []; | 614 var candidates = []; |
611 for (var pt=0; pt < nodesToScore.length; pt+=1) { | 615 for (var pt=0; pt < nodesToScore.length; pt+=1) { |
(...skipping 21 matching lines...) Expand all Loading... | |
633 candidates.push(grandParentNode); | 637 candidates.push(grandParentNode); |
634 } | 638 } |
635 | 639 |
636 var contentScore = 0; | 640 var contentScore = 0; |
637 | 641 |
638 /* Add a point for the paragraph itself as a base. */ | 642 /* Add a point for the paragraph itself as a base. */ |
639 contentScore+=1; | 643 contentScore+=1; |
640 | 644 |
641 /* Add points for any commas within this paragraph */ | 645 /* Add points for any commas within this paragraph */ |
642 contentScore += innerText.split(',').length; | 646 contentScore += innerText.split(',').length; |
643 | 647 |
644 /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ | 648 /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ |
645 contentScore += Math.min(Math.floor(innerText.length / 100), 3); | 649 contentScore += Math.min(Math.floor(innerText.length / 100), 3); |
646 | 650 |
647 /* Add the score to the parent. The grandparent gets half. */ | 651 /* Add the score to the parent. The grandparent gets half. */ |
648 parentNode.readability.contentScore += contentScore; | 652 parentNode.readability.contentScore += contentScore; |
649 | 653 |
650 if(grandParentNode) { | 654 if(grandParentNode) { |
651 grandParentNode.readability.contentScore += contentScore/2; | 655 grandParentNode.readability.contentScore += contentScore/2; |
652 } | 656 } |
653 } | 657 } |
654 | 658 |
655 /** | 659 /** |
656 * After we've calculated scores, loop through all of the possible candi date nodes we found | 660 * After we've calculated scores, loop through all of the possible candi date nodes we found |
657 * and find the one with the highest score. | 661 * and find the one with the highest score. |
658 **/ | 662 **/ |
659 var topCandidate = null; | 663 var topCandidate = null; |
660 for(var c=0, cl=candidates.length; c < cl; c+=1) | 664 for(var c=0, cl=candidates.length; c < cl; c+=1) |
661 { | 665 { |
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
718 var contentBonus = 0; | 722 var contentBonus = 0; |
719 /* Give a bonus if sibling nodes and top candidates have the example same classname */ | 723 /* Give a bonus if sibling nodes and top candidates have the example same classname */ |
720 if(siblingNode.className === topCandidate.className && topCandidate. className !== "") { | 724 if(siblingNode.className === topCandidate.className && topCandidate. className !== "") { |
721 contentBonus += topCandidate.readability.contentScore * 0.2; | 725 contentBonus += topCandidate.readability.contentScore * 0.2; |
722 } | 726 } |
723 | 727 |
724 if(typeof siblingNode.readability !== 'undefined' && (siblingNode.re adability.contentScore+contentBonus) >= siblingScoreThreshold) | 728 if(typeof siblingNode.readability !== 'undefined' && (siblingNode.re adability.contentScore+contentBonus) >= siblingScoreThreshold) |
725 { | 729 { |
726 append = true; | 730 append = true; |
727 } | 731 } |
728 | 732 |
729 if(siblingNode.nodeName === "P") { | 733 if(siblingNode.nodeName === "P") { |
730 var linkDensity = readability.getLinkDensity(siblingNode); | 734 var linkDensity = readability.getLinkDensity(siblingNode); |
731 var nodeContent = readability.getInnerText(siblingNode); | 735 var nodeContent = readability.getInnerText(siblingNode); |
732 var nodeLength = nodeContent.length; | 736 var nodeLength = nodeContent.length; |
733 | 737 |
734 if(nodeLength > 80 && linkDensity < 0.25) | 738 if(nodeLength > 80 && linkDensity < 0.25) |
735 { | 739 { |
736 append = true; | 740 append = true; |
737 } | 741 } |
738 else if(nodeLength < 80 && linkDensity === 0 && nodeContent.sear ch(/\.( |$)/) !== -1) | 742 else if(nodeLength < 80 && linkDensity === 0 && nodeContent.sear ch(/\.( |$)/) !== -1) |
739 { | 743 { |
740 append = true; | 744 append = true; |
741 } | 745 } |
742 } | 746 } |
743 | 747 |
744 if(append) { | 748 if(append) { |
745 dbg("Appending node: " + siblingNode); | 749 dbg("Appending node: " + siblingNode); |
746 | 750 |
747 var nodeToAppend = null; | 751 var nodeToAppend = null; |
748 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P ") { | 752 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P ") { |
749 /* We have a node that isn't a common block level element, l ike a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ | 753 /* We have a node that isn't a common block level element, l ike a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ |
750 | 754 |
751 dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.'); | 755 dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.'); |
752 nodeToAppend = document.createElement("DIV"); | 756 nodeToAppend = document.createElement("DIV"); |
753 try { | 757 try { |
754 nodeToAppend.id = siblingNode.id; | 758 nodeToAppend.id = siblingNode.id; |
755 readability.moveNodeInnards(siblingNode, nodeToAppend); | 759 readability.moveNodeInnards(siblingNode, nodeToAppend); |
756 } | 760 } |
757 catch(er) { | 761 catch(er) { |
758 dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original."); | 762 dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original."); |
759 nodeToAppend = siblingNode; | 763 nodeToAppend = siblingNode; |
760 s-=1; | 764 s-=1; |
761 sl-=1; | 765 sl-=1; |
762 } | 766 } |
763 } else { | 767 } else { |
764 nodeToAppend = siblingNode; | 768 nodeToAppend = siblingNode; |
765 s-=1; | 769 s-=1; |
766 sl-=1; | 770 sl-=1; |
767 } | 771 } |
768 | 772 |
769 /* To ensure a node does not interfere with readability styles, remove its classnames */ | 773 /* To ensure a node does not interfere with readability styles, remove its classnames */ |
770 nodeToAppend.className = ""; | 774 nodeToAppend.className = ""; |
771 | 775 |
772 /* Append sibling and subtract from our list because it removes the node when you append to another node */ | 776 /* Append sibling and subtract from our list because it removes the node when you append to another node */ |
773 articleContent.appendChild(nodeToAppend); | 777 articleContent.appendChild(nodeToAppend); |
774 } | 778 } |
775 } | 779 } |
776 | 780 |
777 /** | 781 /** |
778 * So we have all of the content that we need. Now we clean it up for pr esentation. | 782 * So we have all of the content that we need. Now we clean it up for pr esentation. |
779 **/ | 783 **/ |
780 readability.distilledArticleContent = articleContent.cloneNode(true); | 784 readability.distilledArticleContent = articleContent.cloneNode(true); |
781 //readability.prepArticle(articleContent); | 785 //readability.prepArticle(articleContent); |
782 | 786 |
783 if (readability.curPageNum === 1) { | 787 if (readability.curPageNum === 1) { |
784 var newNode = document.createElement('div'); | 788 var newNode = document.createElement('div'); |
785 newNode.id = "readability-page-1"; | 789 newNode.id = "readability-page-1"; |
786 newNode.setAttribute("class", "page"); | 790 newNode.setAttribute("class", "page"); |
787 readability.moveNodeInnards(articleContent, newNode); | 791 readability.moveNodeInnards(articleContent, newNode); |
788 articleContent.appendChild(newNode); | 792 articleContent.appendChild(newNode); |
789 } | 793 } |
790 | 794 |
791 /** | 795 /** |
792 * Now that we've gone through the full algorithm, check to see if we go t any meaningful content. | 796 * Now that we've gone through the full algorithm, check to see if we go t any meaningful content. |
793 * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher | 797 * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher |
794 * likelihood of finding the content, and the sieve approach gives us a higher likelihood of | 798 * likelihood of finding the content, and the sieve approach gives us a higher likelihood of |
795 * finding the -right- content. | 799 * finding the -right- content. |
796 **/ | 800 **/ |
797 if(readability.getInnerText(articleContent, false).length < 250) { | 801 if(readability.getInnerText(articleContent, false).length < 250) { |
798 if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) { | 802 if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) { |
799 readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS); | 803 readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS); |
800 return readability.grabArticle(document.body); | 804 return readability.grabArticle(document.body); |
801 } | 805 } |
802 else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { | 806 else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { |
803 readability.removeFlag(readability.FLAG_WEIGHT_CLASSES); | 807 readability.removeFlag(readability.FLAG_WEIGHT_CLASSES); |
804 return readability.grabArticle(document.body); | 808 return readability.grabArticle(document.body); |
805 } | 809 } |
806 else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONAL LY)) { | 810 else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONAL LY)) { |
807 readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY); | 811 readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY); |
808 return readability.grabArticle(document.body); | 812 return readability.grabArticle(document.body); |
809 } else { | 813 } else { |
810 return null; | 814 return null; |
811 } | 815 } |
812 } | 816 } |
813 | 817 |
814 return articleContent; | 818 return articleContent; |
815 }, | 819 }, |
816 | 820 |
817 /** | 821 /** |
818 * Removes script tags from the document. | 822 * Removes script tags from the document. |
819 * | 823 * |
820 * @param Element | 824 * @param Element |
821 **/ | 825 **/ |
822 removeScripts: function (doc) { | 826 removeScripts: function (doc) { |
823 var scripts = doc.getElementsByTagName('script'); | 827 var scripts = doc.getElementsByTagName('script'); |
824 for(var i = scripts.length-1; i >= 0; i-=1) | 828 for(var i = scripts.length-1; i >= 0; i-=1) |
825 { | 829 { |
826 if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf ('readability') === -1 && scripts[i].src.indexOf('typekit') === -1)) | 830 if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf ('readability') === -1 && scripts[i].src.indexOf('typekit') === -1)) |
827 { | 831 { |
828 scripts[i].nodeValue=""; | 832 scripts[i].nodeValue=""; |
829 scripts[i].removeAttribute('src'); | 833 scripts[i].removeAttribute('src'); |
830 if (scripts[i].parentNode) { | 834 if (scripts[i].parentNode) { |
831 scripts[i].parentNode.removeChild(scripts[i]); | 835 scripts[i].parentNode.removeChild(scripts[i]); |
832 } | 836 } |
833 } | 837 } |
834 } | 838 } |
835 }, | 839 }, |
836 | 840 |
837 /** | 841 /** |
838 * Get the inner text of a node - cross browser compatibly. | 842 * Get the inner text of a node - cross browser compatibly. |
839 * This also strips out any excess whitespace to be found. | 843 * This also strips out any excess whitespace to be found. |
840 * | 844 * |
841 * @param Element | 845 * @param Element |
842 * @return string | 846 * @return string |
843 **/ | 847 **/ |
844 getInnerText: function (e, normalizeSpaces) { | 848 getInnerText: function (e, normalizeSpaces) { |
845 var textContent = ""; | 849 var textContent = ""; |
846 | 850 |
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
889 | 893 |
890 // Remove any root styles, if we're able. | 894 // Remove any root styles, if we're able. |
891 if(typeof e.removeAttribute === 'function' && e.className !== 'readabili ty-styled') { | 895 if(typeof e.removeAttribute === 'function' && e.className !== 'readabili ty-styled') { |
892 e.removeAttribute('style'); } | 896 e.removeAttribute('style'); } |
893 | 897 |
894 // Go until there are no more child nodes | 898 // Go until there are no more child nodes |
895 while ( cur !== null ) { | 899 while ( cur !== null ) { |
896 if ( cur.nodeType === 1 ) { | 900 if ( cur.nodeType === 1 ) { |
897 // Remove style attribute(s) : | 901 // Remove style attribute(s) : |
898 if(cur.className !== "readability-styled") { | 902 if(cur.className !== "readability-styled") { |
899 cur.removeAttribute("style"); | 903 cur.removeAttribute("style"); |
900 } | 904 } |
901 readability.cleanStyles( cur ); | 905 readability.cleanStyles( cur ); |
902 } | 906 } |
903 cur = cur.nextSibling; | 907 cur = cur.nextSibling; |
904 } | 908 } |
905 }, | 909 }, |
906 | 910 |
907 /** | 911 /** |
908 * Get the density of links as a percentage of the content | 912 * Get the density of links as a percentage of the content |
909 * This is the amount of text that is inside a link divided by the total tex t in the node. | 913 * This is the amount of text that is inside a link divided by the total tex t in the node. |
910 * | 914 * |
911 * @param Element | 915 * @param Element |
912 * @return number (float) | 916 * @return number (float) |
913 **/ | 917 **/ |
914 getLinkDensity: function (e) { | 918 getLinkDensity: function (e) { |
915 var links = e.getElementsByTagName("a"); | 919 var links = e.getElementsByTagName("a"); |
916 var textLength = readability.getInnerText(e).length; | 920 var textLength = readability.getInnerText(e).length; |
917 var linkLength = 0; | 921 var linkLength = 0; |
918 for(var i=0, il=links.length; i<il;i+=1) | 922 for(var i=0, il=links.length; i<il;i+=1) |
919 { | 923 { |
920 linkLength += readability.getInnerText(links[i]).length; | 924 linkLength += readability.getInnerText(links[i]).length; |
921 } | 925 } |
922 | 926 |
923 return linkLength / textLength; | 927 return linkLength / textLength; |
924 }, | 928 }, |
925 | 929 |
926 /** | 930 /** |
927 * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness. | 931 * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness. |
928 * | 932 * |
929 * @author Dan Lacy | 933 * @author Dan Lacy |
930 * @return string the base url | 934 * @return string the base url |
931 **/ | 935 **/ |
932 findBaseUrl: function () { | 936 findBaseUrl: function () { |
933 var noUrlParams = window.location.pathname.split("?")[0], | 937 var noUrlParams = window.location.pathname.split("?")[0], |
934 urlSlashes = noUrlParams.split("/").reverse(), | 938 urlSlashes = noUrlParams.split("/").reverse(), |
935 cleanedSegments = [], | 939 cleanedSegments = [], |
936 possibleType = ""; | 940 possibleType = ""; |
937 | 941 |
938 for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) { | 942 for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) { |
939 var segment = urlSlashes[i]; | 943 var segment = urlSlashes[i]; |
940 | 944 |
941 // Split off and save anything that looks like a file type. | 945 // Split off and save anything that looks like a file type. |
942 if (segment.indexOf(".") !== -1) { | 946 if (segment.indexOf(".") !== -1) { |
943 possibleType = segment.split(".")[1]; | 947 possibleType = segment.split(".")[1]; |
944 | 948 |
945 /* If the type isn't alpha-only, it's probably not actually a fi le extension. */ | 949 /* If the type isn't alpha-only, it's probably not actually a fi le extension. */ |
946 if(!possibleType.match(/[^a-zA-Z]/)) { | 950 if(!possibleType.match(/[^a-zA-Z]/)) { |
947 segment = segment.split(".")[0]; | 951 segment = segment.split(".")[0]; |
948 } | 952 } |
949 } | 953 } |
950 | 954 |
951 /** | 955 /** |
952 * EW-CMS specific segment replacement. Ugly. | 956 * EW-CMS specific segment replacement. Ugly. |
953 * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.htm l | 957 * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.htm l |
954 **/ | 958 **/ |
955 if(segment.indexOf(',00') !== -1) { | 959 if(segment.indexOf(',00') !== -1) { |
956 segment = segment.replace(',00', ''); | 960 segment = segment.replace(',00', ''); |
957 } | 961 } |
958 | 962 |
959 // If our first or second segment has anything looking like a page n umber, remove it. | 963 // If our first or second segment has anything looking like a page n umber, remove it. |
960 if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) { | 964 if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) { |
961 segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, " "); | 965 segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, " "); |
962 } | 966 } |
963 | 967 |
964 | 968 |
965 var del = false; | 969 var del = false; |
966 | 970 |
967 /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */ | 971 /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */ |
968 if (i < 2 && segment.match(/^\d{1,2}$/)) { | 972 if (i < 2 && segment.match(/^\d{1,2}$/)) { |
969 del = true; | 973 del = true; |
970 } | 974 } |
971 | 975 |
972 /* If this is the first segment and it's just "index", remove it. */ | 976 /* If this is the first segment and it's just "index", remove it. */ |
973 if(i === 0 && segment.toLowerCase() === "index") { | 977 if(i === 0 && segment.toLowerCase() === "index") { |
974 del = true; | 978 del = true; |
975 } | 979 } |
976 | 980 |
977 | 981 |
978 /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */ | 982 /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */ |
979 if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) { | 983 if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) { |
980 del = true; | 984 del = true; |
981 } | 985 } |
982 | 986 |
983 /* If it's not marked for deletion, push it to cleanedSegments. */ | 987 /* If it's not marked for deletion, push it to cleanedSegments. */ |
984 if (!del) { | 988 if (!del) { |
985 cleanedSegments.push(segment); | 989 cleanedSegments.push(segment); |
986 } | 990 } |
987 } | 991 } |
988 | 992 |
989 // This is our final, cleaned, base article URL. | 993 // This is our final, cleaned, base article URL. |
990 return window.location.protocol + "//" + window.location.host + cleanedS egments.reverse().join("/"); | 994 return window.location.protocol + "//" + window.location.host + cleanedS egments.reverse().join("/"); |
991 }, | 995 }, |
992 | 996 |
993 /** | 997 /** |
994 * Look for any paging links that may occur within the document. | 998 * Look for any paging links that may occur within the document. |
995 * | 999 * |
 * @param Element elem the element whose links are examined
 * @return string|null the href chosen as the next page (trailing slash removed), or null if no link is chosen
998 **/ | 1002 **/ |
999 findNextPageLink: function (elem) { | 1003 findNextPageLink: function (elem) { |
1000 var possiblePages = {}, | 1004 var possiblePages = {}, |
1001 allLinks = elem.getElementsByTagName('a'), | 1005 allLinks = elem.getElementsByTagName('a'), |
1002 articleBaseUrl = readability.findBaseUrl(); | 1006 articleBaseUrl = readability.findBaseUrl(); |
1003 | 1007 |
1004 /** | 1008 /** |
1005 * Loop through all links, looking for hints that they may be next-page links. | 1009 * Loop through all links, looking for hints that they may be next-page links. |
1006 * Things like having "page" in their textContent, className or id, or b eing a child | 1010 * Things like having "page" in their textContent, className or id, or b eing a child |
1007 * of a node with a page-y className or id. | 1011 * of a node with a page-y className or id. |
1008 * | 1012 * |
1009 * Also possible: levenshtein distance? longest common subsequence? | 1013 * Also possible: levenshtein distance? longest common subsequence? |
1010 * | 1014 * |
1011 * After we do that, assign each page a score, and | 1015 * After we do that, assign each page a score, and |
1012 **/ | 1016 **/ |
1013 for(var i = 0, il = allLinks.length; i < il; i+=1) { | 1017 for(var i = 0, il = allLinks.length; i < il; i+=1) { |
1014 var link = allLinks[i], | 1018 var link = allLinks[i], |
1015 linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, ' '); | 1019 linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, ' '); |
1016 | 1020 |
1017 /* If we've already seen this page, ignore it */ | 1021 /* If we've already seen this page, ignore it */ |
1018 if(linkHref === "" || linkHref === articleBaseUrl || linkHref === wi ndow.location.href || linkHref in readability.parsedPages) { | 1022 if(linkHref === "" || linkHref === articleBaseUrl || linkHref === wi ndow.location.href || linkHref in readability.parsedPages) { |
1019 continue; | 1023 continue; |
1020 } | 1024 } |
1021 | 1025 |
1022 /* If it's on a different domain, skip it. */ | 1026 /* If it's on a different domain, skip it. */ |
1023 if(window.location.host !== linkHref.split(/\/+/g)[1]) { | 1027 if(window.location.host !== linkHref.split(/\/+/g)[1]) { |
1024 continue; | 1028 continue; |
1025 } | 1029 } |
1026 | 1030 |
1027 var linkText = readability.getInnerText(link); | 1031 var linkText = readability.getInnerText(link); |
1028 | 1032 |
1029 /* If the linkText looks like it's not the next page, skip it. */ | 1033 /* If the linkText looks like it's not the next page, skip it. */ |
1030 if(linkText.match(readability.regexps.extraneous) || linkText.length > 25) { | 1034 if(linkText.match(readability.regexps.extraneous) || linkText.length > 25) { |
1031 continue; | 1035 continue; |
1032 } | 1036 } |
1033 | 1037 |
1034 /* If the leftovers of the URL after removing the base URL don't con tain any digits, it's certainly not a next page link. */ | 1038 /* If the leftovers of the URL after removing the base URL don't con tain any digits, it's certainly not a next page link. */ |
1035 var linkHrefLeftover = linkHref.replace(articleBaseUrl, ''); | 1039 var linkHrefLeftover = linkHref.replace(articleBaseUrl, ''); |
1036 if(!linkHrefLeftover.match(/\d/)) { | 1040 if(!linkHrefLeftover.match(/\d/)) { |
1037 continue; | 1041 continue; |
1038 } | 1042 } |
1039 | 1043 |
1040 if(!(linkHref in possiblePages)) { | 1044 if(!(linkHref in possiblePages)) { |
1041 possiblePages[linkHref] = {"score": 0, "linkText": linkText, "hr ef": linkHref}; | 1045 possiblePages[linkHref] = {"score": 0, "linkText": linkText, "hr ef": linkHref}; |
1042 } else { | 1046 } else { |
1043 possiblePages[linkHref].linkText += ' | ' + linkText; | 1047 possiblePages[linkHref].linkText += ' | ' + linkText; |
1044 } | 1048 } |
1045 | 1049 |
1046 var linkObj = possiblePages[linkHref]; | 1050 var linkObj = possiblePages[linkHref]; |
1047 | 1051 |
1048 /** | 1052 /** |
1049 * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower. | 1053 * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower. |
1050 * Example: http://www.actionscript.org/resources/articles/745/1/Jav aScript-and-VBScript-Injection-in-ActionScript-3/Page1.html | 1054 * Example: http://www.actionscript.org/resources/articles/745/1/Jav aScript-and-VBScript-Injection-in-ActionScript-3/Page1.html |
1051 **/ | 1055 **/ |
1052 if(linkHref.indexOf(articleBaseUrl) !== 0) { | 1056 if(linkHref.indexOf(articleBaseUrl) !== 0) { |
1053 linkObj.score -= 25; | 1057 linkObj.score -= 25; |
1054 } | 1058 } |
1055 | 1059 |
1056 var linkData = linkText + ' ' + link.className + ' ' + link.id; | 1060 var linkData = linkText + ' ' + link.className + ' ' + link.id; |
1057 if(linkData.match(readability.regexps.nextLink)) { | 1061 if(linkData.match(readability.regexps.nextLink)) { |
1058 linkObj.score += 50; | 1062 linkObj.score += 50; |
1059 } | 1063 } |
1060 if(linkData.match(/pag(e|ing|inat)/i)) { | 1064 if(linkData.match(/pag(e|ing|inat)/i)) { |
1061 linkObj.score += 25; | 1065 linkObj.score += 25; |
1062 } | 1066 } |
1063 if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text, | 1067 if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text, |
1064 /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */ | 1068 /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */ |
1065 if(!linkObj.linkText.match(readability.regexps.nextLink)) { | 1069 if(!linkObj.linkText.match(readability.regexps.nextLink)) { |
1066 linkObj.score -= 65; | 1070 linkObj.score -= 65; |
1067 } | 1071 } |
1068 } | 1072 } |
1069 if(linkData.match(readability.regexps.negative) || linkData.match(re adability.regexps.extraneous)) { | 1073 if(linkData.match(readability.regexps.negative) || linkData.match(re adability.regexps.extraneous)) { |
1070 linkObj.score -= 50; | 1074 linkObj.score -= 50; |
1071 } | 1075 } |
1072 if(linkData.match(readability.regexps.prevLink)) { | 1076 if(linkData.match(readability.regexps.prevLink)) { |
1073 linkObj.score -= 200; | 1077 linkObj.score -= 200; |
1074 } | 1078 } |
1075 | 1079 |
1076 /* If a parentNode contains page or paging or paginat */ | 1080 /* If a parentNode contains page or paging or paginat */ |
1077 var parentNode = link.parentNode, | 1081 var parentNode = link.parentNode, |
1078 positiveNodeMatch = false, | 1082 positiveNodeMatch = false, |
1079 negativeNodeMatch = false; | 1083 negativeNodeMatch = false; |
1080 while(parentNode) { | 1084 while(parentNode) { |
1081 var parentNodeClassAndId = parentNode.className + ' ' + parentNo de.id; | 1085 var parentNodeClassAndId = parentNode.className + ' ' + parentNo de.id; |
1082 if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(/pag(e|ing|inat)/i)) { | 1086 if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(/pag(e|ing|inat)/i)) { |
1083 positiveNodeMatch = true; | 1087 positiveNodeMatch = true; |
1084 linkObj.score += 25; | 1088 linkObj.score += 25; |
1085 } | 1089 } |
1086 if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(readability.regexps.negative)) { | 1090 if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(readability.regexps.negative)) { |
1087 /* If this is just something like "footer", give it a negati ve. If it's something like "body-and-footer", leave it be. */ | 1091 /* If this is just something like "footer", give it a negati ve. If it's something like "body-and-footer", leave it be. */ |
1088 if(!parentNodeClassAndId.match(readability.regexps.positive) ) { | 1092 if(!parentNodeClassAndId.match(readability.regexps.positive) ) { |
1089 linkObj.score -= 25; | 1093 linkObj.score -= 25; |
1090 negativeNodeMatch = true; | 1094 negativeNodeMatch = true; |
1091 } | 1095 } |
1092 } | 1096 } |
1093 | 1097 |
1094 parentNode = parentNode.parentNode; | 1098 parentNode = parentNode.parentNode; |
1095 } | 1099 } |
1096 | 1100 |
1097 /** | 1101 /** |
1098 * If the URL looks like it has paging in it, add to the score. | 1102 * If the URL looks like it has paging in it, add to the score. |
1099 * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34 | 1103 * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34 |
1100 **/ | 1104 **/ |
1101 if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) { | 1105 if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) { |
1102 linkObj.score += 25; | 1106 linkObj.score += 25; |
1103 } | 1107 } |
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1145 topPage = possiblePages[page]; | 1149 topPage = possiblePages[page]; |
1146 } | 1150 } |
1147 } | 1151 } |
1148 } | 1152 } |
1149 | 1153 |
1150 if(topPage) { | 1154 if(topPage) { |
1151 var nextHref = topPage.href.replace(/\/$/,''); | 1155 var nextHref = topPage.href.replace(/\/$/,''); |
1152 | 1156 |
1153 dbg('NEXT PAGE IS ' + nextHref); | 1157 dbg('NEXT PAGE IS ' + nextHref); |
1154 readability.parsedPages[nextHref] = true; | 1158 readability.parsedPages[nextHref] = true; |
1155 return nextHref; | 1159 return nextHref; |
1156 } | 1160 } |
1157 else { | 1161 else { |
1158 return null; | 1162 return null; |
1159 } | 1163 } |
1160 }, | 1164 }, |
1161 | 1165 |
1162 createLinkDiv: function(link) { | 1166 createLinkDiv: function(link) { |
1163 var divNode = document.createElement('div'); | 1167 var divNode = document.createElement('div'); |
1164 var aNode = document.createElement('a'); | 1168 var aNode = document.createElement('a'); |
1165 var tNode = document.createTextNode('View Next Page'); | 1169 var tNode = document.createTextNode('View Next Page'); |
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1197 } | 1201 } |
1198 else { | 1202 else { |
1199 if (options.error) { options.error(request); } | 1203 if (options.error) { options.error(request); } |
1200 } | 1204 } |
1201 } | 1205 } |
1202 } | 1206 } |
1203 | 1207 |
1204 if (typeof options === 'undefined') { options = {}; } | 1208 if (typeof options === 'undefined') { options = {}; } |
1205 | 1209 |
1206 request.onreadystatechange = respondToReadyState; | 1210 request.onreadystatechange = respondToReadyState; |
1207 | 1211 |
1208 request.open('get', url, true); | 1212 request.open('get', url, true); |
1209 request.setRequestHeader('Accept', 'text/html'); | 1213 request.setRequestHeader('Accept', 'text/html'); |
1210 | 1214 |
1211 try { | 1215 try { |
1212 request.send(options.postBody); | 1216 request.send(options.postBody); |
1213 } | 1217 } |
1214 catch (e) { | 1218 catch (e) { |
1215 if (options.error) { options.error(); } | 1219 if (options.error) { options.error(); } |
1216 } | 1220 } |
1217 | 1221 |
(...skipping 14 matching lines...) Expand all Loading... | |
1232 articlePage.innerHTML = '<p class="page-separator" title="Page ' + reada bility.curPageNum + '">§</p>'; | 1236 articlePage.innerHTML = '<p class="page-separator" title="Page ' + reada bility.curPageNum + '">§</p>'; |
1233 | 1237 |
1234 document.getElementById("readability-content").appendChild(articlePage); | 1238 document.getElementById("readability-content").appendChild(articlePage); |
1235 | 1239 |
1236 if(readability.curPageNum > readability.maxPages) { | 1240 if(readability.curPageNum > readability.maxPages) { |
1237 var linkDiv = readability.createLinkDiv(nextPageLink); | 1241 var linkDiv = readability.createLinkDiv(nextPageLink); |
1238 | 1242 |
1239 articlePage.appendChild(linkDiv); | 1243 articlePage.appendChild(linkDiv); |
1240 return; | 1244 return; |
1241 } | 1245 } |
1242 | 1246 |
1243 /** | 1247 /** |
1244 * Now that we've built the article page DOM element, get the page conte nt | 1248 * Now that we've built the article page DOM element, get the page conte nt |
1245 * asynchronously and load the cleaned content into the div we created f or it. | 1249 * asynchronously and load the cleaned content into the div we created f or it. |
1246 **/ | 1250 **/ |
1247 (function(pageUrl, thisPage) { | 1251 (function(pageUrl, thisPage) { |
1248 readability.ajax(pageUrl, { | 1252 readability.ajax(pageUrl, { |
1249 success: function(r) { | 1253 success: function(r) { |
1250 | 1254 |
1251 /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */ | 1255 /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */ |
1252 var eTag = r.getResponseHeader('ETag'); | 1256 var eTag = r.getResponseHeader('ETag'); |
1253 if(eTag) { | 1257 if(eTag) { |
1254 if(eTag in readability.pageETags) { | 1258 if(eTag in readability.pageETags) { |
1255 dbg("Exact duplicate page found via ETag. Aborting." ); | 1259 dbg("Exact duplicate page found via ETag. Aborting." ); |
1256 articlePage.style.display = 'none'; | 1260 articlePage.style.display = 'none'; |
1257 return; | 1261 return; |
1258 } else { | 1262 } else { |
1259 readability.pageETags[eTag] = 1; | 1263 readability.pageETags[eTag] = 1; |
1260 } | 1264 } |
1261 } | 1265 } |
1262 | 1266 |
1263 // TODO: this ends up doubling up page numbers on NYTimes ar ticles. Need to generically parse those away. | 1267 // TODO: this ends up doubling up page numbers on NYTimes ar ticles. Need to generically parse those away. |
1264 var page = document.createElement("DIV"); | 1268 var page = document.createElement("DIV"); |
1265 | 1269 |
1266 /** | 1270 /** |
1267 * Do some preprocessing to our HTML to make it ready for ap pending. | 1271 * Do some preprocessing to our HTML to make it ready for ap pending. |
1268 * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript. | 1272 * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript. |
1269 * • Turn any noscript tags into divs so that we can parse t hem. This allows us to find any next page links hidden via javascript. | 1273 * • Turn any noscript tags into divs so that we can parse t hem. This allows us to find any next page links hidden via javascript. |
1270 * • Turn all double br's into p's - was handled by prepDocu ment in the original view. | 1274 * • Turn all double br's into p's - was handled by prepDocu ment in the original view. |
(...skipping 30 matching lines...) Expand all Loading... | |
1301 for(var i=1; i <= readability.curPageNum; i+=1) { | 1305 for(var i=1; i <= readability.curPageNum; i+=1) { |
1302 var rPage = document.getElementById('readability-pag e-' + i); | 1306 var rPage = document.getElementById('readability-pag e-' + i); |
1303 if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML ) !== -1) { | 1307 if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML ) !== -1) { |
1304 dbg('Duplicate of page ' + i + ' - skipping.'); | 1308 dbg('Duplicate of page ' + i + ' - skipping.'); |
1305 articlePage.style.display = 'none'; | 1309 articlePage.style.display = 'none'; |
1306 readability.parsedPages[pageUrl] = true; | 1310 readability.parsedPages[pageUrl] = true; |
1307 return; | 1311 return; |
1308 } | 1312 } |
1309 } | 1313 } |
1310 } | 1314 } |
1311 | 1315 |
1312 readability.removeScripts(content); | 1316 readability.removeScripts(content); |
1313 | 1317 |
1314 readability.moveNodeInnards(content, thisPage); | 1318 readability.moveNodeInnards(content, thisPage); |
1315 | 1319 |
1316 /** | 1320 /** |
1317 * After the page has rendered, post process the content. Th is delay is necessary because, | 1321 * After the page has rendered, post process the content. Th is delay is necessary because, |
1318 * in webkit at least, offsetWidth is not set in time to det ermine image width. We have to | 1322 * in webkit at least, offsetWidth is not set in time to det ermine image width. We have to |
1319 * wait a little bit for reflow to finish before we can fix floating images. | 1323 * wait a little bit for reflow to finish before we can fix floating images. |
1320 **/ | 1324 **/ |
1321 window.setTimeout( | 1325 window.setTimeout( |
1322 function() { readability.postProcessContent(thisPage); } , | 1326 function() { readability.postProcessContent(thisPage); } , |
1323 500 | 1327 500 |
1324 ); | 1328 ); |
1325 | 1329 |
1326 if(nextPageLink) { | 1330 if(nextPageLink) { |
1327 readability.appendNextPage(nextPageLink); | 1331 readability.appendNextPage(nextPageLink); |
1328 } | 1332 } |
1329 } | 1333 } |
1330 }); | 1334 }); |
1331 }(nextPageLink, articlePage)); | 1335 }(nextPageLink, articlePage)); |
1332 }, | 1336 }, |
1333 | 1337 |
1334 /** | 1338 /** |
1335 * Get an elements class/id weight. Uses regular expressions to tell if this | 1339 * Get an elements class/id weight. Uses regular expressions to tell if this |
1336 * element looks good or bad. | 1340 * element looks good or bad. |
1337 * | 1341 * |
1338 * @param Element | 1342 * @param Element |
1339 * @return number (Integer) | 1343 * @return number (Integer) |
1340 **/ | 1344 **/ |
1341 getClassWeight: function (e) { | 1345 getClassWeight: function (e) { |
1342 if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { | 1346 if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { |
1343 return 0; | 1347 return 0; |
1344 } | 1348 } |
1345 | 1349 |
(...skipping 29 matching lines...) Expand all Loading... | |
1375 /** | 1379 /** |
1376 * Remove extraneous break tags from a node. | 1380 * Remove extraneous break tags from a node. |
1377 * | 1381 * |
1378 * @param Element | 1382 * @param Element |
1379 * @return void | 1383 * @return void |
1380 **/ | 1384 **/ |
1381 killBreaks: function (e) { | 1385 killBreaks: function (e) { |
1382 var allElements = e.getElementsByTagName('*'); | 1386 var allElements = e.getElementsByTagName('*'); |
1383 while (i < allElements.length) { | 1387 while (i < allElements.length) { |
1384 readability.deleteExtraBreaks(allElements[i]); | 1388 readability.deleteExtraBreaks(allElements[i]); |
1385 i++; | 1389 i++; |
1386 } | 1390 } |
1387 }, | 1391 }, |
1388 | 1392 |
1389 /** | 1393 /** |
1390 * Clean a node of all elements of type "tag". | 1394 * Clean a node of all elements of type "tag". |
1391 * (Unless it's a youtube/vimeo video. People love movies.) | 1395 * (Unless it's a youtube/vimeo video. People love movies.) |
1392 * | 1396 * |
1393 * @param Element | 1397 * @param Element |
1394 * @param string tag to clean | 1398 * @param string tag to clean |
1395 * @return void | 1399 * @return void |
1396 **/ | 1400 **/ |
1397 clean: function (e, tag) { | 1401 clean: function (e, tag) { |
1398 var targetList = e.getElementsByTagName( tag ); | 1402 var targetList = e.getElementsByTagName( tag ); |
1399 var isEmbed = (tag === 'object' || tag === 'embed'); | 1403 var isEmbed = (tag === 'object' || tag === 'embed'); |
1400 | 1404 |
1401 for (var y=targetList.length-1; y >= 0; y-=1) { | 1405 for (var y=targetList.length-1; y >= 0; y-=1) { |
1402 /* Allow youtube and vimeo videos through as people usually want to see those. */ | 1406 /* Allow youtube and vimeo videos through as people usually want to see those. */ |
1403 if(isEmbed) { | 1407 if(isEmbed) { |
1404 var attributeValues = ""; | 1408 var attributeValues = ""; |
1405 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) { | 1409 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) { |
1406 attributeValues += targetList[y].attributes[i].value + '|'; | 1410 attributeValues += targetList[y].attributes[i].value + '|'; |
1407 } | 1411 } |
1408 | 1412 |
1409 /* First, check the elements attributes to see if any of them co ntain youtube or vimeo */ | 1413 /* First, check the elements attributes to see if any of them co ntain youtube or vimeo */ |
1410 if (attributeValues.search(readability.regexps.videos) !== -1) { | 1414 if (attributeValues.search(readability.regexps.videos) !== -1) { |
1411 continue; | 1415 continue; |
1412 } | 1416 } |
1413 | 1417 |
1414 /* Then check the elements inside this element for the same. */ | 1418 /* Then check the elements inside this element for the same. */ |
1415 if (targetList[y].innerHTML.search(readability.regexps.videos) ! == -1) { | 1419 if (targetList[y].innerHTML.search(readability.regexps.videos) ! == -1) { |
1416 continue; | 1420 continue; |
1417 } | 1421 } |
1418 | 1422 |
1419 } | 1423 } |
1420 | 1424 |
1421 targetList[y].parentNode.removeChild(targetList[y]); | 1425 targetList[y].parentNode.removeChild(targetList[y]); |
1422 } | 1426 } |
1423 }, | 1427 }, |
1424 | 1428 |
1425 /** | 1429 /** |
1426 * Clean an element of all tags of type "tag" if they look fishy. | 1430 * Clean an element of all tags of type "tag" if they look fishy. |
1427 * "Fishy" is an algorithm based on content length, classnames, link density , number of images & embeds, etc. | 1431 * "Fishy" is an algorithm based on content length, classnames, link density , number of images & embeds, etc. |
1428 * | 1432 * |
1429 * @return void | 1433 * @return void |
1430 **/ | 1434 **/ |
1431 cleanConditionally: function (e, tag) { | 1435 cleanConditionally: function (e, tag) { |
1432 | 1436 |
1433 if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) { | 1437 if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) { |
1434 return; | 1438 return; |
1435 } | 1439 } |
1436 | 1440 |
1437 var tagsList = e.getElementsByTagName(tag); | 1441 var tagsList = e.getElementsByTagName(tag); |
1438 var curTagsLength = tagsList.length; | 1442 var curTagsLength = tagsList.length; |
1439 | 1443 |
1440 /** | 1444 /** |
1441 * Gather counts for other typical elements embedded within. | 1445 * Gather counts for other typical elements embedded within. |
1442 * Traverse backwards so we can remove nodes at the same time without ef fecting the traversal. | 1446 * Traverse backwards so we can remove nodes at the same time without ef fecting the traversal. |
1443 * | 1447 * |
1444 * TODO: Consider taking into account original contentScore here. | 1448 * TODO: Consider taking into account original contentScore here. |
1445 **/ | 1449 **/ |
1446 for (var i=curTagsLength-1; i >= 0; i-=1) { | 1450 for (var i=curTagsLength-1; i >= 0; i-=1) { |
1447 var weight = readability.getClassWeight(tagsList[i]); | 1451 var weight = readability.getClassWeight(tagsList[i]); |
1448 var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0; | 1452 var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0; |
1449 | 1453 |
1450 dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].cla ssName + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'unde fined') ? (" with score " + tagsList[i].readability.contentScore) : '')); | 1454 dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].cla ssName + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'unde fined') ? (" with score " + tagsList[i].readability.contentScore) : '')); |
1451 | 1455 |
1452 if(weight+contentScore < 0) | 1456 if(weight+contentScore < 0) |
1453 { | 1457 { |
1454 tagsList[i].parentNode.removeChild(tagsList[i]); | 1458 tagsList[i].parentNode.removeChild(tagsList[i]); |
1455 } | 1459 } |
1456 else if ( readability.getCharCount(tagsList[i],',') < 10) { | 1460 else if ( readability.getCharCount(tagsList[i],',') < 10) { |
1457 /** | 1461 /** |
1458 * If there are not very many commas, and the number of | 1462 * If there are not very many commas, and the number of |
1459 * non-paragraph elements is more than paragraphs or other omino us signs, remove the element. | 1463 * non-paragraph elements is more than paragraphs or other omino us signs, remove the element. |
1460 **/ | 1464 **/ |
1461 var p = tagsList[i].getElementsByTagName("p").length; | 1465 var p = tagsList[i].getElementsByTagName("p").length; |
1462 var img = tagsList[i].getElementsByTagName("img").length; | 1466 var img = tagsList[i].getElementsByTagName("img").length; |
1463 var li = tagsList[i].getElementsByTagName("li").length-100; | 1467 var li = tagsList[i].getElementsByTagName("li").length-100; |
1464 var input = tagsList[i].getElementsByTagName("input").length; | 1468 var input = tagsList[i].getElementsByTagName("input").length; |
1465 | 1469 |
1466 var embedCount = 0; | 1470 var embedCount = 0; |
1467 var embeds = tagsList[i].getElementsByTagName("embed"); | 1471 var embeds = tagsList[i].getElementsByTagName("embed"); |
1468 for(var ei=0,il=embeds.length; ei < il; ei+=1) { | 1472 for(var ei=0,il=embeds.length; ei < il; ei+=1) { |
1469 if (embeds[ei].src.search(readability.regexps.videos) === -1 ) { | 1473 if (embeds[ei].src.search(readability.regexps.videos) === -1 ) { |
1470 embedCount+=1; | 1474 embedCount+=1; |
1471 } | 1475 } |
1472 } | 1476 } |
1473 | 1477 |
1474 var linkDensity = readability.getLinkDensity(tagsList[i]); | 1478 var linkDensity = readability.getLinkDensity(tagsList[i]); |
1475 var contentLength = readability.getInnerText(tagsList[i]).length ; | 1479 var contentLength = readability.getInnerText(tagsList[i]).length ; |
1476 var toRemove = false; | 1480 var toRemove = false; |
1477 | 1481 |
1478 if ( img > p ) { | 1482 if ( img > p ) { |
1479 toRemove = true; | 1483 toRemove = true; |
1480 } else if(li > p && tag !== "ul" && tag !== "ol") { | 1484 } else if(li > p && tag !== "ul" && tag !== "ol") { |
1481 toRemove = true; | 1485 toRemove = true; |
1482 } else if( input > Math.floor(p/3) ) { | 1486 } else if( input > Math.floor(p/3) ) { |
1483 toRemove = true; | 1487 toRemove = true; |
1484 } else if(contentLength < 25 && (img === 0 || img > 2) ) { | 1488 } else if(contentLength < 25 && (img === 0 || img > 2) ) { |
1485 toRemove = true; | 1489 toRemove = true; |
1486 } else if(weight < 25 && linkDensity > 0.2) { | 1490 } else if(weight < 25 && linkDensity > 0.2) { |
1487 toRemove = true; | 1491 toRemove = true; |
1488 } else if(weight >= 25 && linkDensity > 0.5) { | 1492 } else if(weight >= 25 && linkDensity > 0.5) { |
1489 toRemove = true; | 1493 toRemove = true; |
1490 } else if((embedCount === 1 && contentLength < 75) || embedCount > 1) { | 1494 } else if((embedCount === 1 && contentLength < 75) || embedCount > 1) { |
1491 toRemove = true; | 1495 toRemove = true; |
1492 } | 1496 } |
1493 | 1497 |
(...skipping 21 matching lines...) Expand all Loading... | |
1515 } | 1519 } |
1516 }, | 1520 }, |
1517 | 1521 |
1518 flagIsActive: function(flag) { | 1522 flagIsActive: function(flag) { |
1519 return (readability.flags & flag) > 0; | 1523 return (readability.flags & flag) > 0; |
1520 }, | 1524 }, |
1521 | 1525 |
1522 addFlag: function(flag) { | 1526 addFlag: function(flag) { |
1523 readability.flags = readability.flags | flag; | 1527 readability.flags = readability.flags | flag; |
1524 }, | 1528 }, |
1525 | 1529 |
1526 removeFlag: function(flag) { | 1530 removeFlag: function(flag) { |
1527 readability.flags = readability.flags & ~flag; | 1531 readability.flags = readability.flags & ~flag; |
1528 }, | 1532 }, |
1529 | 1533 |
1530 // Removes the children of |src| and appends them to |dest|. | 1534 // Removes the children of |src| and appends them to |dest|. |
1531 moveNodeInnards: function(src, dest) { | 1535 moveNodeInnards: function(src, dest) { |
1532 try { | 1536 try { |
1533 while (src.firstChild) { | 1537 while (src.firstChild) { |
1534 dest.appendChild(src.removeChild(src.firstChild)); | 1538 dest.appendChild(src.removeChild(src.firstChild)); |
1535 } | 1539 } |
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1584 var lastBr = readability.isMultipleBr(node, false); | 1588 var lastBr = readability.isMultipleBr(node, false); |
1585 var ret = false; | 1589 var ret = false; |
1586 while (lastBr && lastBr != node) { | 1590 while (lastBr && lastBr != node) { |
1587 var toRemove = lastBr; | 1591 var toRemove = lastBr; |
1588 lastBr = lastBr.previousSibling; | 1592 lastBr = lastBr.previousSibling; |
1589 toRemove.parentNode.removeChild(toRemove); | 1593 toRemove.parentNode.removeChild(toRemove); |
1590 ret = true; | 1594 ret = true; |
1591 } | 1595 } |
1592 return ret; | 1596 return ret; |
1593 }, | 1597 }, |
1594 | 1598 |
1595 // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a | 1599 // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a |
1596 // <P> node, and makes all next siblings of that pair children of <P>, up | 1600 // <P> node, and makes all next siblings of that pair children of <P>, up |
1597 // until the next pair of <BR> nodes is reached. | 1601 // until the next pair of <BR> nodes is reached. |
1598 replaceDoubleBrWithP: function(node) { | 1602 replaceDoubleBrWithP: function(node) { |
1599 // Check that we are starting with a BR. | 1603 // Check that we are starting with a BR. |
1600 var second = readability.isMultipleBr(node, true); | 1604 var second = readability.isMultipleBr(node, true); |
1601 if (!second) { | 1605 if (!second) { |
1602 return; | 1606 return; |
1603 } | 1607 } |
1604 // Make all next siblings of the second BR into children of a P. | 1608 // Make all next siblings of the second BR into children of a P. |
1605 var p = document.createElement('p'); | 1609 var p = document.createElement('p'); |
1606 var curr = second.nextSibling; | 1610 var curr = second.nextSibling; |
1607 while (curr) { | 1611 while (curr) { |
1608 if (readability.isMultipleBr(curr, true)) { | 1612 if (readability.isMultipleBr(curr, true)) { |
1609 break; | 1613 break; |
1610 } | 1614 } |
1611 var next = curr.nextSibling; | 1615 var next = curr.nextSibling; |
1612 p.appendChild(curr.parentNode.removeChild(curr)); | 1616 p.appendChild(curr.parentNode.removeChild(curr)); |
1613 curr = next; | 1617 curr = next; |
1614 } | 1618 } |
1615 var ret = curr; | 1619 var ret = curr; |
1616 | 1620 |
1617 // Remove all nodes between the first and second BR. | 1621 // Remove all nodes between the first and second BR. |
1618 curr = node.nextSibling; | 1622 curr = node.nextSibling; |
1619 while (curr && curr != second) { | 1623 while (curr && curr != second) { |
1620 var next = curr.nextSibling; | 1624 var next = curr.nextSibling; |
1621 curr.parentNode.removeChild(curr); | 1625 curr.parentNode.removeChild(curr); |
1622 curr = next; | 1626 curr = next; |
1623 } | 1627 } |
1624 // Remove the second BR. | 1628 // Remove the second BR. |
1625 second.parentNode.removeChild(second); | 1629 second.parentNode.removeChild(second); |
1626 // Replace the first BR with the P. | 1630 // Replace the first BR with the P. |
1627 node.parentNode.replaceChild(p, node); | 1631 node.parentNode.replaceChild(p, node); |
1628 | 1632 |
1629 return ret; | 1633 return ret; |
1630 }, | 1634 }, |
1631 | 1635 |
1632 // Returns true if the NodeList contains a double <BR>. | 1636 // Returns true if the NodeList contains a double <BR>. |
1633 hasDoubleBr: function(nodeList) { | 1637 hasDoubleBr: function(nodeList) { |
1634 for (var i = 0; i < nodeList.length; nodeList++) { | 1638 for (var i = 0; i < nodeList.length; nodeList++) { |
1635 if (readability.isMultipleBr(nodeList[i], true)) { | 1639 if (readability.isMultipleBr(nodeList[i], true)) { |
1636 return true; | 1640 return true; |
1637 } | 1641 } |
1638 } | 1642 } |
1639 return false; | 1643 return false; |
1640 }, | 1644 }, |
1641 | 1645 |
1642 // Replaces double <BR> tags with <P> tags. | 1646 // Replaces double <BR> tags with <P> tags. |
1643 replaceDoubleBrsWithPs: function(node) { | 1647 replaceDoubleBrsWithPs: function(node) { |
1644 var allElements = node.getElementsByTagName('BR'); | 1648 var allElements = node.getElementsByTagName('BR'); |
1645 var node = null; | 1649 var node = null; |
1646 while (allElements && allElements.length > 0 && | 1650 while (allElements && allElements.length > 0 && |
1647 readability.hasDoubleBr(allElements)) { | 1651 readability.hasDoubleBr(allElements)) { |
1648 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) { | 1652 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) { |
1649 var next = node; | 1653 var next = node; |
1650 while (next = readability.replaceDoubleBrWithP(next)); | 1654 while (next = readability.replaceDoubleBrWithP(next)); |
1651 } | 1655 } |
1652 allElements = document.body.getElementsByTagName('BR'); | 1656 allElements = document.body.getElementsByTagName('BR'); |
1653 } | 1657 } |
1654 }, | 1658 }, |
1655 | 1659 |
1656 | 1660 |
1657 // Replaces a BR and the whitespace that follows it with a P. | 1661 // Replaces a BR and the whitespace that follows it with a P. |
1658 replaceBrWithP: function(node) { | 1662 replaceBrWithP: function(node) { |
1659 if (!readability.isBrNode(node)) { | 1663 if (!readability.isBrNode(node)) { |
1660 return; | 1664 return; |
1661 } | 1665 } |
1662 var p = document.createElement('p'); | 1666 var p = document.createElement('p'); |
1663 var curr = node.nextSibling; | 1667 var curr = node.nextSibling; |
1664 while (curr && !isBrNode(curr)) { | 1668 while (curr && !isBrNode(curr)) { |
1665 var next = curr.nextSibling; | 1669 var next = curr.nextSibling; |
1666 if (readability.isWhitespaceNode(curr)) { | 1670 if (readability.isWhitespaceNode(curr)) { |
1667 curr.parentNode.removeChild(curr); | 1671 curr.parentNode.removeChild(curr); |
1668 } else { | 1672 } else { |
1669 p.appendChild(curr.parentNode.removeChild(curr)); | 1673 p.appendChild(curr.parentNode.removeChild(curr)); |
1670 } | 1674 } |
1671 curr = next; | 1675 curr = next; |
1672 } | 1676 } |
1673 node.parentNode.replaceChild(p, node); | 1677 node.parentNode.replaceChild(p, node); |
1674 return curr; | 1678 return curr; |
1675 }, | 1679 }, |
1676 | 1680 |
1677 // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> t ag | 1681 // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> t ag |
1678 // children of the <P>. | 1682 // children of the <P>. |
1679 replaceBrsWithPs: function(node) { | 1683 replaceBrsWithPs: function(node) { |
1680 var allElements = node.getElementsByTagName('BR'); | 1684 var allElements = node.getElementsByTagName('BR'); |
1681 var node = null; | 1685 var node = null; |
1682 while (allElements && allElements.length > 0) { | 1686 while (allElements && allElements.length > 0) { |
1683 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) { | 1687 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) { |
1684 var next = node; | 1688 var next = node; |
1685 while (next = readability.replaceBrWithP(next)); | 1689 while (next = readability.replaceBrWithP(next)); |
1686 } | 1690 } |
1687 allElements = document.body.getElementsByTagName('BR'); | 1691 allElements = document.body.getElementsByTagName('BR'); |
1688 } | 1692 } |
1689 }, | 1693 }, |
1690 | 1694 |
1691 // Replaces any tag with any other tag. | 1695 // Replaces any tag with any other tag. |
1692 replaceTagsWithTags: function(node, srcTag, destTag) { | 1696 replaceTagsWithTags: function(node, srcTag, destTag) { |
1693 var allElements = node.getElementsByTagName(srcTag); | 1697 var allElements = node.getElementsByTagName(srcTag); |
1694 for (var i = 0; i < allElements.length; i++) { | 1698 for (var i = 0; i < allElements.length; i++) { |
1695 var dest = document.createElement(destTag); | 1699 var dest = document.createElement(destTag); |
1696 readability.moveNodeInnards(allElements[i], dest); | 1700 readability.moveNodeInnards(allElements[i], dest); |
1697 node.replaceNode(dest, allElements[i]); | 1701 allElements[i].parentNode.replaceChild(dest, allElements[i]); |
1698 } | 1702 } |
1699 }, | 1703 }, |
1700 | 1704 |
1701 // Replaces all <noscript> tags with <p> tags. | 1705 // Replaces all <noscript> tags with <p> tags. |
1702 replaceNoscriptsWithPs: function(node) { | 1706 replaceNoscriptsWithPs: function(node) { |
1703 readability.replaceTagsWithTags(node, 'noscript', 'p'); | 1707 readability.replaceTagsWithTags(node, 'noscript', 'p'); |
1704 }, | 1708 }, |
1705 | 1709 |
1706 // Replaces all <font> tags with <span> tags. | 1710 // Replaces all <font> tags with <span> tags. |
1707 replaceFontsWithSpans: function(node) { | 1711 replaceFontsWithSpans: function(node) { |
1708 readability.replaceTagsWithTags(node, 'font', 'span'); | 1712 readability.replaceTagsWithTags(node, 'font', 'span'); |
1709 }, | 1713 }, |
1710 | 1714 |
1711 // Returns a list of image URLs in the distilled article. | 1715 // Returns a list of image URLs in the distilled article. |
1712 getImages : function() { | 1716 getImages : function() { |
1713 var images = document.getElementsByTagName('img'); | 1717 var images = document.getElementsByTagName('img'); |
1714 var result = new Array(images.length); | 1718 var result = new Array(images.length); |
1715 dbg("Number of images: " + images.length); | 1719 dbg("Number of images: " + images.length); |
1716 for(i = 0; i < images.length; i++) { | 1720 for(i = 0; i < images.length; i++) { |
1717 result[i] = images[i].src; | 1721 result[i] = images[i].src; |
1718 dbg("Image: " + result[i]); | 1722 dbg("Image: " + result[i]); |
1719 } | 1723 } |
1720 return result; | 1724 return result; |
1721 }, | 1725 }, |
1722 | 1726 |
1723 // Returns the distilled article HTML from the page(s). | 1727 // Returns the distilled article HTML from the page(s). |
1724 getDistilledArticleHTML : function() { | 1728 getDistilledArticleHTML : function() { |
1725 return readability.distilledHTML; | 1729 return readability.distilledHTML; |
1730 }, | |
1731 | |
1732 // Returns the next page of this article. | |
1733 getNextPageLink : function() { | |
1734 return readability.nextPageLink; | |
1726 } | 1735 } |
1727 }; | 1736 }; |
1728 | 1737 |
// Extracts long-form content from a page and returns an array laid out as:
//   [0]   the article title,
//   [1]   HTML containing the long-form content,
//   [2]   the next-page link held in readability.nextPageLink,
//   [3..] the image URLs returned by readability.getImages().
// NOTE(review): image URLs start at index 3 because the next-page link
// occupies index 2. Any consumer that maps <img> ids to positions in this
// array (previously documented as "k - 2") should be confirmed against this
// layout.
(function () {
    readability.init();
    var result = [
        readability.getArticleTitle(),
        readability.getDistilledArticleHTML(),
        readability.getNextPageLink()
    ];
    return result.concat(readability.getImages());
}())
1741 | 1751 |
OLD | NEW |