OLD | NEW |
---|---|
1 | 1 |
cjhopman
2014/02/03 23:56:53
Since this is modified, it needs to contain a notice… [remainder of comment truncated in the review view]
shashi
2014/02/04 01:39:37
Done.
| |
2 var dbg = (typeof console !== 'undefined') ? function(s) { | 2 var dbg = (typeof console !== 'undefined') ? function(s) { |
3 console.log("Readability: " + s); | 3 console.log("Readability: " + s); |
4 } : function() {}; | 4 } : function() {}; |
5 | 5 |
6 /* | 6 /* |
7 * Readability. An Arc90 Lab Experiment. | 7 * Readability. An Arc90 Lab Experiment. |
8 * Website: http://lab.arc90.com/experiments/readability | 8 * Website: http://lab.arc90.com/experiments/readability |
9 * Source: http://code.google.com/p/arc90labs-readability | 9 * Source: http://code.google.com/p/arc90labs-readability |
10 * | 10 * |
11 * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission. | 11 * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission. |
12 * | 12 * |
13 * Copyright (c) 2010 Arc90 Inc | 13 * Copyright (c) 2010 Arc90 Inc |
14 * Readability is licensed under the Apache License, Version 2.0. | 14 * Readability is licensed under the Apache License, Version 2.0. |
15 **/ | 15 **/ |
16 var readability = { | 16 var readability = { |
17 readStyle: "style-newspaper", | 17 readStyle: "style-newspaper", |
18 readSize: "size-medium", | 18 readSize: "size-medium", |
19 readMargin: "margin-wide", | 19 readMargin: "margin-wide", |
20 | 20 |
21 distilledHTML: '', | 21 distilledHTML: '', |
22 distilledArticleContent: null, | 22 distilledArticleContent: null, |
23 nextPageLink: '', | |
23 | 24 |
24 version: '1.7.1', | 25 version: '1.7.1', |
25 iframeLoads: 0, | 26 iframeLoads: 0, |
26 convertLinksToFootnotes: false, | 27 convertLinksToFootnotes: false, |
27 reversePageScroll: false, /* If they hold shift and hit space, scroll up */ | 28 reversePageScroll: false, /* If they hold shift and hit space, scroll up */ |
28 frameHack: false, /** | 29 frameHack: false, /** |
29 * The frame hack is to work around a Firefox bug where if you | 30 * The frame hack is to work around a Firefox bug where if you |
30 * pull content out of a frame and stick it into the parent element, the scrollbar won't appear. | 31 * pull content out of a frame and stick it into the parent element, the scrollbar won't appear. |
31 * So we fake a scrollbar in the wrapping div. | 32 * So we fake a scrollbar in the wrapping div. |
32 **/ | 33 **/ |
33 biggestFrame: false, | 34 biggestFrame: false, |
34 flags: 0x1 | 0x2 | 0x4, /* Start with all flags set. */ | 35 flags: 0x1 | 0x2 | 0x4, /* Start with all flags set. */ |
35 | 36 |
36 /* constants */ | 37 /* constants */ |
37 FLAG_STRIP_UNLIKELYS: 0x1, | 38 FLAG_STRIP_UNLIKELYS: 0x1, |
38 FLAG_WEIGHT_CLASSES: 0x2, | 39 FLAG_WEIGHT_CLASSES: 0x2, |
39 FLAG_CLEAN_CONDITIONALLY: 0x4, | 40 FLAG_CLEAN_CONDITIONALLY: 0x4, |
40 | 41 |
41 maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */ | 42 maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */ |
42 parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */ | 43 parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */ |
43 pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */ | 44 pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */ |
44 | 45 |
45 /** | 46 /** |
46 * All of the regular expressions in use within readability. | 47 * All of the regular expressions in use within readability. |
47 * Defined up here so we don't instantiate them repeatedly in loops. | 48 * Defined up here so we don't instantiate them repeatedly in loops. |
48 **/ | 49 **/ |
49 regexps: { | 50 regexps: { |
50 unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header |menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popu p|tweet|twitter/i, | 51 unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header |menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popu p|tweet|twitter/i, |
51 okMaybeItsACandidate: /and|article|body|column|main|shadow/i, | 52 okMaybeItsACandidate: /and|article|body|column|main|shadow/i, |
52 positive: /article|body|content|entry|hentry|main|page|pagi nation|post|text|blog|story/i, | 53 positive: /article|body|content|entry|hentry|main|page|pagi nation|post|text|blog|story/i, |
53 negative: /combx|comment|com-|contact|foot|footer|footnote| masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopp ing|tags|tool|widget/i, | 54 negative: /combx|comment|com-|contact|foot|footer|footnote| masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopp ing|tags|tool|widget/i, |
54 extraneous: /print|archive|comment|discuss|e[\-]?mail|share|r eply|all|login|sign|single/i, | 55 extraneous: /print|archive|comment|discuss|e[\-]?mail|share|r eply|all|login|sign|single/i, |
55 divToPElements: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, | 56 divToPElements: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, |
56 replaceBrs: /(<br[^>]*>[ \n\r\t]*){2,}/gi, | 57 replaceBrs: /(<br[^>]*>[ \n\r\t]*){2,}/gi, |
57 replaceFonts: /<(\/?)font[^>]*>/gi, | 58 replaceFonts: /<(\/?)font[^>]*>/gi, |
58 trim: /^\s+|\s+$/g, | 59 trim: /^\s+|\s+$/g, |
59 normalize: /\s{2,}/g, | 60 normalize: /\s{2,}/g, |
60 killBreaks: /(<br\s*\/?>(\s| ?)*){1,}/g, | 61 killBreaks: /(<br\s*\/?>(\s| ?)*){1,}/g, |
61 videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i, | 62 videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i, |
62 skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed) \s*$/i, | 63 skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed) \s*$/i, |
63 nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last. | 64 nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last. |
64 prevLink: /(prev|earl|old|new|<|«)/i | 65 prevLink: /(prev|earl|old|new|<|«)/i |
65 }, | 66 }, |
66 | 67 |
67 /** | 68 /** |
68 * Runs readability. | 69 * Runs readability. |
69 * | 70 * |
70 * Workflow: | 71 * Workflow: |
71 * 1. Prep the document by removing script tags, css, etc. | 72 * 1. Prep the document by removing script tags, css, etc. |
72 * 2. Build readability's DOM tree. | 73 * 2. Build readability's DOM tree. |
73 * 3. Grab the article content from the current dom tree. | 74 * 3. Grab the article content from the current dom tree. |
74 * 4. Replace the current DOM tree with the new one. | 75 * 4. Replace the current DOM tree with the new one. |
75 * 5. Read peacefully. | 76 * 5. Read peacefully. |
76 * | 77 * |
77 * @return void | 78 * @return void |
78 **/ | 79 **/ |
79 init: function() { | 80 init: function() { |
80 /* Before we do anything, remove all scripts that are not readability. */ | 81 /* Before we do anything, remove all scripts that are not readability. */ |
81 window.onload = window.onunload = function() {}; | 82 window.onload = window.onunload = function() {}; |
82 | 83 |
83 readability.removeScripts(document); | 84 readability.removeScripts(document); |
84 | 85 |
85 /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */ | 86 /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */ |
86 readability.parsedPages[window.location.href.replace(/\/$/, '')] = true; | 87 readability.parsedPages[window.location.href.replace(/\/$/, '')] = true; |
87 | 88 |
88 /* Pull out any possible next page link first */ | 89 /* Pull out any possible next page link first */ |
89 var nextPageLink = readability.findNextPageLink(document.body); | 90 readability.nextPageLink = readability.findNextPageLink(document.body); |
90 | 91 |
92 /* Next-page processing is handled from C++, so set nextPageLink to null. */ | |
93 var nextPageLink = null; | |
94 | |
91 readability.prepDocument(); | 95 readability.prepDocument(); |
92 | 96 |
93 /* Build readability's DOM tree */ | 97 /* Build readability's DOM tree */ |
94 var overlay = document.createElement("DIV"); | 98 var overlay = document.createElement("DIV"); |
95 var innerDiv = document.createElement("DIV"); | 99 var innerDiv = document.createElement("DIV"); |
96 var articleTools = readability.getArticleTools(); | 100 var articleTools = readability.getArticleTools(); |
97 var articleTitleText = readability.getArticleTitle(); | 101 var articleTitleText = readability.getArticleTitle(); |
98 var articleContent = readability.grabArticle(); | 102 var articleContent = readability.grabArticle(); |
99 | 103 |
100 if(!articleContent) { | 104 if(!articleContent) { |
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
145 rootWarning.innerHTML = "<em>Readability</em> was intended for u se on individual articles and not home pages. " + | 149 rootWarning.innerHTML = "<em>Readability</em> was intended for u se on individual articles and not home pages. " + |
146 "If you'd like to try rendering this page anyway, <a onClick='ja vascript:document.getElementById(\"readability-warning\").style.display=\"none\" ;document.getElementById(\"readability-content\").style.display=\"block\";'>clic k here</a> to continue."; | 150 "If you'd like to try rendering this page anyway, <a onClick='ja vascript:document.getElementById(\"readability-warning\").style.display=\"none\" ;document.getElementById(\"readability-content\").style.display=\"block\";'>clic k here</a> to continue."; |
147 | 151 |
148 innerDiv.insertBefore( rootWarning, articleContent ); | 152 innerDiv.insertBefore( rootWarning, articleContent ); |
149 } | 153 } |
150 | 154 |
151 readability.postProcessContent(articleContent); | 155 readability.postProcessContent(articleContent); |
152 | 156 |
153 window.scrollTo(0, 0); | 157 window.scrollTo(0, 0); |
154 | 158 |
155 // TODO(bengr): Remove this assignment of null to nextPageLink when | |
156 // the processing of the next page link is safe. | |
157 nextPageLink = null; | |
158 | |
159 if (nextPageLink) { | 159 if (nextPageLink) { |
160 /** | 160 /** |
161 * Append any additional pages after a small timeout so that people | 161 * Append any additional pages after a small timeout so that people |
162 * can start reading without having to wait for this to finish processing. | 162 * can start reading without having to wait for this to finish processing. |
163 **/ | 163 **/ |
164 window.setTimeout(function() { | 164 window.setTimeout(function() { |
165 readability.appendNextPage(nextPageLink); | 165 readability.appendNextPage(nextPageLink); |
166 }, 500); | 166 }, 500); |
167 } | 167 } |
168 | 168 |
169 /** Smooth scrolling **/ | 169 /** Smooth scrolling **/ |
170 document.onkeydown = function(e) { | 170 document.onkeydown = function(e) { |
171 var code = (window.event) ? event.keyCode : e.keyCode; | 171 var code = (window.event) ? event.keyCode : e.keyCode; |
172 if (code === 16) { | 172 if (code === 16) { |
173 readability.reversePageScroll = true; | 173 readability.reversePageScroll = true; |
174 return; | 174 return; |
175 } | 175 } |
176 | 176 |
177 if (code === 32) { | 177 if (code === 32) { |
178 readability.curScrollStep = 0; | 178 readability.curScrollStep = 0; |
179 var windowHeight = window.innerHeight ? window.innerHeight : (do cument.documentElement.clientHeight ? document.documentElement.clientHeight : do cument.body.clientHeight); | 179 var windowHeight = window.innerHeight ? window.innerHeight : (do cument.documentElement.clientHeight ? document.documentElement.clientHeight : do cument.body.clientHeight); |
180 | 180 |
181 if(readability.reversePageScroll) { | 181 if(readability.reversePageScroll) { |
182 readability.scrollTo(readability.scrollTop(), readability.sc rollTop() - (windowHeight - 50), 20, 10); | 182 readability.scrollTo(readability.scrollTop(), readability.sc rollTop() - (windowHeight - 50), 20, 10); |
183 } | 183 } |
184 else { | 184 else { |
185 readability.scrollTo(readability.scrollTop(), readability.sc rollTop() + (windowHeight - 50), 20, 10); | 185 readability.scrollTo(readability.scrollTop(), readability.sc rollTop() + (windowHeight - 50), 20, 10); |
186 } | 186 } |
187 | 187 |
188 return false; | 188 return false; |
189 } | 189 } |
190 }; | 190 }; |
191 | 191 |
192 document.onkeyup = function(e) { | 192 document.onkeyup = function(e) { |
193 var code = (window.event) ? event.keyCode : e.keyCode; | 193 var code = (window.event) ? event.keyCode : e.keyCode; |
194 if (code === 16) { | 194 if (code === 16) { |
195 readability.reversePageScroll = false; | 195 readability.reversePageScroll = false; |
196 return; | 196 return; |
197 } | 197 } |
198 }; | 198 }; |
199 }, | 199 }, |
200 | 200 |
201 /** | 201 /** |
202 * Run any post-process modifications to article content as necessary. | 202 * Run any post-process modifications to article content as necessary. |
203 * | 203 * |
204 * @param Element | 204 * @param Element |
205 * @return void | 205 * @return void |
206 **/ | 206 **/ |
207 postProcessContent: function(articleContent) { | 207 postProcessContent: function(articleContent) { |
208 if(readability.convertLinksToFootnotes && !window.location.href.match(/w ikipedia\.org/g)) { | 208 if(readability.convertLinksToFootnotes && !window.location.href.match(/w ikipedia\.org/g)) { |
209 readability.addFootnotes(articleContent); | 209 readability.addFootnotes(articleContent); |
210 } | 210 } |
211 | 211 |
212 readability.fixImageFloats(articleContent); | 212 readability.fixImageFloats(articleContent); |
213 }, | 213 }, |
214 | 214 |
215 /** | 215 /** |
216 * Some content ends up looking ugly if the image is too large to be floated. | 216 * Some content ends up looking ugly if the image is too large to be floated. |
217 * If the image is wider than a threshold (currently 55%), no longer float it, | 217 * If the image is wider than a threshold (currently 55%), no longer float it, |
218 * center it instead. | 218 * center it instead. |
219 * | 219 * |
220 * @param Element | 220 * @param Element |
221 * @return void | 221 * @return void |
222 **/ | 222 **/ |
223 fixImageFloats: function (articleContent) { | 223 fixImageFloats: function (articleContent) { |
224 var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0. 55, | 224 var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0. 55, |
225 images = articleContent.getElementsByTagName('img'); | 225 images = articleContent.getElementsByTagName('img'); |
226 | 226 |
227 for(var i=0, il = images.length; i < il; i+=1) { | 227 for(var i=0, il = images.length; i < il; i+=1) { |
228 var image = images[i]; | 228 var image = images[i]; |
229 | 229 |
230 if(image.offsetWidth > imageWidthThreshold) { | 230 if(image.offsetWidth > imageWidthThreshold) { |
231 image.className += " blockImage"; | 231 image.className += " blockImage"; |
232 } | 232 } |
233 } | 233 } |
234 }, | 234 }, |
235 | 235 |
236 /** | 236 /** |
237 * Get the article tools Element that has buttons like reload, print. | 237 * Get the article tools Element that has buttons like reload, print. |
238 * | 238 * |
239 * @return Element | 239 * @return Element |
240 **/ | 240 **/ |
241 getArticleTools: function () { | 241 getArticleTools: function () { |
242 var articleTools = document.createElement("DIV"); | 242 var articleTools = document.createElement("DIV"); |
243 | 243 |
244 articleTools.id = "readTools"; | 244 articleTools.id = "readTools"; |
245 articleTools.innerHTML = | 245 articleTools.innerHTML = |
246 "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" + | 246 "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" + |
247 "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" + | 247 "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" + |
248 "<a href='#' onclick='readability.emailBox(); return false;' title=' Email page' id='email-page'>Email Page</a>"; | 248 "<a href='#' onclick='readability.emailBox(); return false;' title=' Email page' id='email-page'>Email Page</a>"; |
249 | 249 |
250 return articleTools; | 250 return articleTools; |
251 }, | 251 }, |
252 | 252 |
253 /** | 253 /** |
254 * returns the suggested direction of the string | 254 * returns the suggested direction of the string |
255 * | 255 * |
256 * @return "rtl" || "ltr" | 256 * @return "rtl" || "ltr" |
257 **/ | 257 **/ |
258 getSuggestedDirection: function(text) { | 258 getSuggestedDirection: function(text) { |
259 function sanitizeText() { | 259 function sanitizeText() { |
260 return text.replace(/@\w+/, ""); | 260 return text.replace(/@\w+/, ""); |
261 } | 261 } |
262 | 262 |
263 function countMatches(match) { | 263 function countMatches(match) { |
264 var matches = text.match(new RegExp(match, "g")); | 264 var matches = text.match(new RegExp(match, "g")); |
265 return matches !== null ? matches.length : 0; | 265 return matches !== null ? matches.length : 0; |
266 } | 266 } |
267 | 267 |
268 function isRTL() { | 268 function isRTL() { |
269 var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]"); | 269 var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]"); |
270 var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]"); | 270 var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]"); |
271 | 271 |
272 // if 20% of chars are Hebrew or Arabic then direction is rtl | 272 // if 20% of chars are Hebrew or Arabic then direction is rtl |
273 return (count_heb + count_arb) * 100 / text.length > 20; | 273 return (count_heb + count_arb) * 100 / text.length > 20; |
274 } | 274 } |
275 | 275 |
276 text = sanitizeText(text); | 276 text = sanitizeText(text); |
277 return isRTL() ? "rtl" : "ltr"; | 277 return isRTL() ? "rtl" : "ltr"; |
278 }, | 278 }, |
279 | 279 |
280 /** | 280 /** |
281 * Get the article title as an H1. | 281 * Get the article title as an H1. |
282 * | 282 * |
283 * @return void | 283 * @return void |
284 **/ | 284 **/ |
285 getArticleTitle: function () { | 285 getArticleTitle: function () { |
286 var curTitle = "", | 286 var curTitle = "", |
287 origTitle = ""; | 287 origTitle = ""; |
288 | 288 |
289 try { | 289 try { |
290 curTitle = origTitle = document.title; | 290 curTitle = origTitle = document.title; |
291 if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */ | 291 if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */ |
292 curTitle = origTitle = readability.getInnerText(document.getElem entsByTagName('title')[0]); | 292 curTitle = origTitle = readability.getInnerText(document.getElem entsByTagName('title')[0]); |
293 } | 293 } |
294 } | 294 } |
295 catch(e) {} | 295 catch(e) {} |
296 | 296 |
297 if(curTitle.match(/ [\|\-] /)) | 297 if(curTitle.match(/ [\|\-] /)) |
298 { | 298 { |
299 curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1'); | 299 curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1'); |
300 | 300 |
301 if(curTitle.split(' ').length < 3) { | 301 if(curTitle.split(' ').length < 3) { |
302 curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1'); | 302 curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1'); |
303 } | 303 } |
304 } | 304 } |
305 else if(curTitle.indexOf(': ') !== -1) | 305 else if(curTitle.indexOf(': ') !== -1) |
306 { | 306 { |
307 curTitle = origTitle.replace(/.*:(.*)/gi, '$1'); | 307 curTitle = origTitle.replace(/.*:(.*)/gi, '$1'); |
308 | 308 |
309 if(curTitle.split(' ').length < 3) { | 309 if(curTitle.split(' ').length < 3) { |
310 curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1'); | 310 curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1'); |
(...skipping 12 matching lines...) Expand all Loading... | |
323 | 323 |
324 if(curTitle.split(' ').length <= 4) { | 324 if(curTitle.split(' ').length <= 4) { |
325 curTitle = origTitle; | 325 curTitle = origTitle; |
326 } | 326 } |
327 return curTitle; | 327 return curTitle; |
328 }, | 328 }, |
329 | 329 |
330 /** | 330 /** |
331 * Prepare the HTML document for readability to scrape it. | 331 * Prepare the HTML document for readability to scrape it. |
332 * This includes things like stripping javascript, CSS, and handling terrible markup. | 332 * This includes things like stripping javascript, CSS, and handling terrible markup. |
333 * | 333 * |
334 * @return void | 334 * @return void |
335 **/ | 335 **/ |
336 prepDocument: function () { | 336 prepDocument: function () { |
337 /** | 337 /** |
338 * In some cases a body element can't be found (if the HTML is totally hosed for example) | 338 * In some cases a body element can't be found (if the HTML is totally hosed for example) |
339 * so we create a new body node and append it to the document. | 339 * so we create a new body node and append it to the document. |
340 */ | 340 */ |
341 if(document.body === null) | 341 if(document.body === null) |
342 { | 342 { |
343 var body = document.createElement("body"); | 343 var body = document.createElement("body"); |
344 try { | 344 try { |
345 document.body = body; | 345 document.body = body; |
346 } | 346 } |
347 catch(e) { | 347 catch(e) { |
348 document.documentElement.appendChild(body); | 348 document.documentElement.appendChild(body); |
349 dbg(e); | 349 dbg(e); |
350 } | 350 } |
351 } | 351 } |
352 | 352 |
353 document.body.id = "readabilityBody"; | 353 document.body.id = "readabilityBody"; |
354 | 354 |
355 var frames = document.getElementsByTagName('frame'); | 355 var frames = document.getElementsByTagName('frame'); |
(...skipping 11 matching lines...) Expand all Loading... | |
367 canAccessFrame = true; | 367 canAccessFrame = true; |
368 } | 368 } |
369 catch(eFrames) { | 369 catch(eFrames) { |
370 dbg(eFrames); | 370 dbg(eFrames); |
371 } | 371 } |
372 | 372 |
373 if(frameSize > biggestFrameSize) { | 373 if(frameSize > biggestFrameSize) { |
374 biggestFrameSize = frameSize; | 374 biggestFrameSize = frameSize; |
375 readability.biggestFrame = frames[frameIndex]; | 375 readability.biggestFrame = frames[frameIndex]; |
376 } | 376 } |
377 | 377 |
378 if(canAccessFrame && frameSize > bestFrameSize) | 378 if(canAccessFrame && frameSize > bestFrameSize) |
379 { | 379 { |
380 readability.frameHack = true; | 380 readability.frameHack = true; |
381 | 381 |
382 bestFrame = frames[frameIndex]; | 382 bestFrame = frames[frameIndex]; |
383 bestFrameSize = frameSize; | 383 bestFrameSize = frameSize; |
384 } | 384 } |
385 } | 385 } |
386 | 386 |
387 if(bestFrame) | 387 if(bestFrame) |
388 { | 388 { |
389 var newBody = document.createElement('body'); | 389 var newBody = document.createElement('body'); |
390 readability.moveNodeInnards(bestFrame.contentWindow.document.bod y, newBody); | 390 readability.moveNodeInnards(bestFrame.contentWindow.document.bod y, newBody); |
391 newBody.style.overflow = 'scroll'; | 391 newBody.style.overflow = 'scroll'; |
392 document.body = newBody; | 392 document.body = newBody; |
393 | 393 |
394 var frameset = document.getElementsByTagName('frameset')[0]; | 394 var frameset = document.getElementsByTagName('frameset')[0]; |
395 if(frameset) { | 395 if(frameset) { |
396 frameset.parentNode.removeChild(frameset); } | 396 frameset.parentNode.removeChild(frameset); } |
397 } | 397 } |
398 } | 398 } |
399 | 399 |
400 /* Remove all stylesheets */ | 400 /* Remove all stylesheets */ |
401 for (var k=0;k < document.styleSheets.length; k+=1) { | 401 for (var k=0;k < document.styleSheets.length; k+=1) { |
402 if (document.styleSheets[k].href !== null && document.styleSheets[k] .href.lastIndexOf("readability") === -1) { | 402 if (document.styleSheets[k].href !== null && document.styleSheets[k] .href.lastIndexOf("readability") === -1) { |
403 document.styleSheets[k].disabled = true; | 403 document.styleSheets[k].disabled = true; |
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
448 readability.cleanConditionally(articleContent, "table"); | 448 readability.cleanConditionally(articleContent, "table"); |
449 readability.cleanConditionally(articleContent, "ul"); | 449 readability.cleanConditionally(articleContent, "ul"); |
450 readability.cleanConditionally(articleContent, "div"); | 450 readability.cleanConditionally(articleContent, "div"); |
451 | 451 |
452 /* Remove extra paragraphs */ | 452 /* Remove extra paragraphs */ |
453 var articleParagraphs = articleContent.getElementsByTagName('p'); | 453 var articleParagraphs = articleContent.getElementsByTagName('p'); |
454 for(var i = articleParagraphs.length-1; i >= 0; i-=1) { | 454 for(var i = articleParagraphs.length-1; i >= 0; i-=1) { |
455 var imgCount = articleParagraphs[i].getElementsByTagName('img').l ength; | 455 var imgCount = articleParagraphs[i].getElementsByTagName('img').l ength; |
456 var embedCount = articleParagraphs[i].getElementsByTagName('embed') .length; | 456 var embedCount = articleParagraphs[i].getElementsByTagName('embed') .length; |
457 var objectCount = articleParagraphs[i].getElementsByTagName('object' ).length; | 457 var objectCount = articleParagraphs[i].getElementsByTagName('object' ).length; |
458 | 458 |
459 if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readab ility.getInnerText(articleParagraphs[i], false) === '') { | 459 if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readab ility.getInnerText(articleParagraphs[i], false) === '') { |
460 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i] ); | 460 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i] ); |
461 } | 461 } |
462 } | 462 } |
463 | 463 |
464 try { | 464 try { |
465 readability.replaceBrsWithPs(articleContent); | 465 readability.replaceBrsWithPs(articleContent); |
466 } | 466 } |
467 catch (e) { | 467 catch (e) { |
468 dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block -elements bug. Ignoring.: " + e); | 468 dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block -elements bug. Ignoring.: " + e); |
469 } | 469 } |
470 }, | 470 }, |
471 | 471 |
472 /** | 472 /** |
473 * Initialize a node with the readability object. Also checks the | 473 * Initialize a node with the readability object. Also checks the |
474 * className/id for special names to add to its score. | 474 * className/id for special names to add to its score. |
475 * | 475 * |
476 * @param Element | 476 * @param Element |
477 * @return void | 477 * @return void |
478 **/ | 478 **/ |
479 initializeNode: function (node) { | 479 initializeNode: function (node) { |
480 node.readability = {"contentScore": 0}; | 480 node.readability = {"contentScore": 0}; |
481 | 481 |
482 switch(node.tagName) { | 482 switch(node.tagName) { |
483 case 'DIV': | 483 case 'DIV': |
484 node.readability.contentScore += 5; | 484 node.readability.contentScore += 5; |
485 break; | 485 break; |
486 | 486 |
487 case 'PRE': | 487 case 'PRE': |
488 case 'TD': | 488 case 'TD': |
489 case 'BLOCKQUOTE': | 489 case 'BLOCKQUOTE': |
490 node.readability.contentScore += 3; | 490 node.readability.contentScore += 3; |
491 break; | 491 break; |
492 | 492 |
493 case 'ADDRESS': | 493 case 'ADDRESS': |
494 case 'OL': | 494 case 'OL': |
495 case 'UL': | 495 case 'UL': |
496 case 'DL': | 496 case 'DL': |
497 case 'DD': | 497 case 'DD': |
498 case 'DT': | 498 case 'DT': |
499 case 'LI': | 499 case 'LI': |
500 case 'FORM': | 500 case 'FORM': |
501 node.readability.contentScore -= 3; | 501 node.readability.contentScore -= 3; |
502 break; | 502 break; |
503 | 503 |
504 case 'H1': | 504 case 'H1': |
505 case 'H2': | 505 case 'H2': |
506 case 'H3': | 506 case 'H3': |
507 case 'H4': | 507 case 'H4': |
508 case 'H5': | 508 case 'H5': |
509 case 'H6': | 509 case 'H6': |
510 case 'TH': | 510 case 'TH': |
511 node.readability.contentScore -= 5; | 511 node.readability.contentScore -= 5; |
512 break; | 512 break; |
513 } | 513 } |
514 | 514 |
515 node.readability.contentScore += readability.getClassWeight(node); | 515 node.readability.contentScore += readability.getClassWeight(node); |
516 }, | 516 }, |
517 | 517 |
518 /*** | 518 /*** |
519 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is | 519 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is |
520 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. | 520 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. |
521 * | 521 * |
522 * @param page a document to run upon. Needs to be a full document, complete with body. | 522 * @param page a document to run upon. Needs to be a full document, complete with body. |
523 * @return Element | 523 * @return Element |
524 **/ | 524 **/ |
525 grabArticle: function (pageToClone) { | 525 grabArticle: function (pageToClone) { |
526 var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_ STRIP_UNLIKELYS), | 526 var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_ STRIP_UNLIKELYS), |
527 isPaging = (page !== null) ? true: false; | 527 isPaging = (page !== null) ? true: false; |
528 | 528 |
529 var page = null; | 529 var page = null; |
530 // Never work on the actual page. | 530 // Never work on the actual page. |
531 if (isPaging) { | 531 if (isPaging) { |
532 page = document.body.cloneNode(true); | 532 page = document.body.cloneNode(true); |
533 } else { | 533 } else { |
534 page = pageToClone.cloneNode(true); | 534 page = pageToClone.cloneNode(true); |
535 } | 535 } |
536 | 536 |
537 var allElements = page.getElementsByTagName('*'); | 537 var allElements = page.getElementsByTagName('*'); |
538 | 538 |
539 /** | 539 /** |
540 * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs | 540 * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs |
541 * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.) | 541 * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.) |
542 * | 542 * |
543 * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 | 543 * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 |
544 * TODO: Shouldn't this be a reverse traversal? | 544 * TODO: Shouldn't this be a reverse traversal? |
545 **/ | 545 **/ |
546 var node = null; | 546 var node = null; |
547 var nodesToScore = []; | 547 var nodesToScore = []; |
548 for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) { | 548 for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) { |
549 /* Remove unlikely candidates */ | 549 /* Remove unlikely candidates */ |
550 if (stripUnlikelyCandidates) { | 550 if (stripUnlikelyCandidates) { |
551 var unlikelyMatchString = node.className + node.id; | 551 var unlikelyMatchString = node.className + node.id; |
552 if ( | 552 if ( |
553 ( | 553 ( |
554 unlikelyMatchString.search(readability.regexps.unlikelyC andidates) !== -1 && | 554 unlikelyMatchString.search(readability.regexps.unlikelyC andidates) !== -1 && |
555 unlikelyMatchString.search(readability.regexps.okMaybeIt sACandidate) === -1 && | 555 unlikelyMatchString.search(readability.regexps.okMaybeIt sACandidate) === -1 && |
556 node.tagName !== "BODY" | 556 node.tagName !== "BODY" |
557 ) | 557 ) |
558 ) | 558 ) |
559 { | 559 { |
560 dbg("Removing unlikely candidate - " + unlikelyMatchString); | 560 dbg("Removing unlikely candidate - " + unlikelyMatchString); |
561 node.parentNode.removeChild(node); | 561 node.parentNode.removeChild(node); |
562 nodeIndex-=1; | 562 nodeIndex-=1; |
563 continue; | 563 continue; |
564 } | 564 } |
565 } | 565 } |
566 | 566 |
567 if (node.tagName === "P" || node.tagName === "TD" || node.tagName == = "PRE") { | 567 if (node.tagName === "P" || node.tagName === "TD" || node.tagName == = "PRE") { |
568 nodesToScore[nodesToScore.length] = node; | 568 nodesToScore[nodesToScore.length] = node; |
569 } | 569 } |
570 | 570 |
571 /* Turn all divs that don't have children block level elements into p's */ | 571 /* Turn all divs that don't have children block level elements into p's */ |
572 if (node.tagName === "DIV") { | 572 if (node.tagName === "DIV") { |
573 if (node.innerHTML.search(readability.regexps.divToPElements) == = -1) { | 573 if (node.innerHTML.search(readability.regexps.divToPElements) == = -1) { |
574 var newNode = document.createElement('p'); | 574 var newNode = document.createElement('p'); |
(...skipping 16 matching lines...) Expand all Loading... | |
591 if(childNode.nodeType === 3) { // Node.TEXT_NODE | 591 if(childNode.nodeType === 3) { // Node.TEXT_NODE |
592 var p = document.createElement('p'); | 592 var p = document.createElement('p'); |
593 var t = document.createTextNode(childNode.nodeValue) ; | 593 var t = document.createTextNode(childNode.nodeValue) ; |
594 p.appendChild(t); | 594 p.appendChild(t); |
595 p.style.display = 'inline'; | 595 p.style.display = 'inline'; |
596 p.className = 'readability-styled'; | 596 p.className = 'readability-styled'; |
597 childNode.parentNode.replaceChild(p, childNode); | 597 childNode.parentNode.replaceChild(p, childNode); |
598 } | 598 } |
599 } | 599 } |
600 } | 600 } |
601 } | 601 } |
602 } | 602 } |
603 | 603 |
604 /** | 604 /** |
605 * Loop through all paragraphs, and assign a score to them based on how content-y they look. | 605 * Loop through all paragraphs, and assign a score to them based on how content-y they look. |
606 * Then add their score to their parent node. | 606 * Then add their score to their parent node. |
607 * | 607 * |
608 * A score is determined by things like number of commas, class names, e tc. Maybe eventually link density. | 608 * A score is determined by things like number of commas, class names, e tc. Maybe eventually link density. |
609 **/ | 609 **/ |
610 var candidates = []; | 610 var candidates = []; |
611 for (var pt=0; pt < nodesToScore.length; pt+=1) { | 611 for (var pt=0; pt < nodesToScore.length; pt+=1) { |
(...skipping 21 matching lines...) Expand all Loading... | |
633 candidates.push(grandParentNode); | 633 candidates.push(grandParentNode); |
634 } | 634 } |
635 | 635 |
636 var contentScore = 0; | 636 var contentScore = 0; |
637 | 637 |
638 /* Add a point for the paragraph itself as a base. */ | 638 /* Add a point for the paragraph itself as a base. */ |
639 contentScore+=1; | 639 contentScore+=1; |
640 | 640 |
641 /* Add points for any commas within this paragraph */ | 641 /* Add points for any commas within this paragraph */ |
642 contentScore += innerText.split(',').length; | 642 contentScore += innerText.split(',').length; |
643 | 643 |
644 /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ | 644 /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ |
645 contentScore += Math.min(Math.floor(innerText.length / 100), 3); | 645 contentScore += Math.min(Math.floor(innerText.length / 100), 3); |
646 | 646 |
647 /* Add the score to the parent. The grandparent gets half. */ | 647 /* Add the score to the parent. The grandparent gets half. */ |
648 parentNode.readability.contentScore += contentScore; | 648 parentNode.readability.contentScore += contentScore; |
649 | 649 |
650 if(grandParentNode) { | 650 if(grandParentNode) { |
651 grandParentNode.readability.contentScore += contentScore/2; | 651 grandParentNode.readability.contentScore += contentScore/2; |
652 } | 652 } |
653 } | 653 } |
654 | 654 |
655 /** | 655 /** |
656 * After we've calculated scores, loop through all of the possible candi date nodes we found | 656 * After we've calculated scores, loop through all of the possible candi date nodes we found |
657 * and find the one with the highest score. | 657 * and find the one with the highest score. |
658 **/ | 658 **/ |
659 var topCandidate = null; | 659 var topCandidate = null; |
660 for(var c=0, cl=candidates.length; c < cl; c+=1) | 660 for(var c=0, cl=candidates.length; c < cl; c+=1) |
661 { | 661 { |
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
718 var contentBonus = 0; | 718 var contentBonus = 0; |
719 /* Give a bonus if sibling nodes and top candidates have the example same classname */ | 719 /* Give a bonus if sibling nodes and top candidates have the example same classname */ |
720 if(siblingNode.className === topCandidate.className && topCandidate. className !== "") { | 720 if(siblingNode.className === topCandidate.className && topCandidate. className !== "") { |
721 contentBonus += topCandidate.readability.contentScore * 0.2; | 721 contentBonus += topCandidate.readability.contentScore * 0.2; |
722 } | 722 } |
723 | 723 |
724 if(typeof siblingNode.readability !== 'undefined' && (siblingNode.re adability.contentScore+contentBonus) >= siblingScoreThreshold) | 724 if(typeof siblingNode.readability !== 'undefined' && (siblingNode.re adability.contentScore+contentBonus) >= siblingScoreThreshold) |
725 { | 725 { |
726 append = true; | 726 append = true; |
727 } | 727 } |
728 | 728 |
729 if(siblingNode.nodeName === "P") { | 729 if(siblingNode.nodeName === "P") { |
730 var linkDensity = readability.getLinkDensity(siblingNode); | 730 var linkDensity = readability.getLinkDensity(siblingNode); |
731 var nodeContent = readability.getInnerText(siblingNode); | 731 var nodeContent = readability.getInnerText(siblingNode); |
732 var nodeLength = nodeContent.length; | 732 var nodeLength = nodeContent.length; |
733 | 733 |
734 if(nodeLength > 80 && linkDensity < 0.25) | 734 if(nodeLength > 80 && linkDensity < 0.25) |
735 { | 735 { |
736 append = true; | 736 append = true; |
737 } | 737 } |
738 else if(nodeLength < 80 && linkDensity === 0 && nodeContent.sear ch(/\.( |$)/) !== -1) | 738 else if(nodeLength < 80 && linkDensity === 0 && nodeContent.sear ch(/\.( |$)/) !== -1) |
739 { | 739 { |
740 append = true; | 740 append = true; |
741 } | 741 } |
742 } | 742 } |
743 | 743 |
744 if(append) { | 744 if(append) { |
745 dbg("Appending node: " + siblingNode); | 745 dbg("Appending node: " + siblingNode); |
746 | 746 |
747 var nodeToAppend = null; | 747 var nodeToAppend = null; |
748 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P ") { | 748 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P ") { |
749 /* We have a node that isn't a common block level element, l ike a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ | 749 /* We have a node that isn't a common block level element, l ike a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ |
750 | 750 |
751 dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.'); | 751 dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.'); |
752 nodeToAppend = document.createElement("DIV"); | 752 nodeToAppend = document.createElement("DIV"); |
753 try { | 753 try { |
754 nodeToAppend.id = siblingNode.id; | 754 nodeToAppend.id = siblingNode.id; |
755 readability.moveNodeInnards(siblingNode, nodeToAppend); | 755 readability.moveNodeInnards(siblingNode, nodeToAppend); |
756 } | 756 } |
757 catch(er) { | 757 catch(er) { |
758 dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original."); | 758 dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original."); |
759 nodeToAppend = siblingNode; | 759 nodeToAppend = siblingNode; |
760 s-=1; | 760 s-=1; |
761 sl-=1; | 761 sl-=1; |
762 } | 762 } |
763 } else { | 763 } else { |
764 nodeToAppend = siblingNode; | 764 nodeToAppend = siblingNode; |
765 s-=1; | 765 s-=1; |
766 sl-=1; | 766 sl-=1; |
767 } | 767 } |
768 | 768 |
769 /* To ensure a node does not interfere with readability styles, remove its classnames */ | 769 /* To ensure a node does not interfere with readability styles, remove its classnames */ |
770 nodeToAppend.className = ""; | 770 nodeToAppend.className = ""; |
771 | 771 |
772 /* Append sibling and subtract from our list because it removes the node when you append to another node */ | 772 /* Append sibling and subtract from our list because it removes the node when you append to another node */ |
773 articleContent.appendChild(nodeToAppend); | 773 articleContent.appendChild(nodeToAppend); |
774 } | 774 } |
775 } | 775 } |
776 | 776 |
777 /** | 777 /** |
778 * So we have all of the content that we need. Now we clean it up for pr esentation. | 778 * So we have all of the content that we need. Now we clean it up for pr esentation. |
779 **/ | 779 **/ |
780 readability.distilledArticleContent = articleContent.cloneNode(true); | 780 readability.distilledArticleContent = articleContent.cloneNode(true); |
781 //readability.prepArticle(articleContent); | 781 //readability.prepArticle(articleContent); |
782 | 782 |
783 if (readability.curPageNum === 1) { | 783 if (readability.curPageNum === 1) { |
784 var newNode = document.createElement('div'); | 784 var newNode = document.createElement('div'); |
785 newNode.id = "readability-page-1"; | 785 newNode.id = "readability-page-1"; |
786 newNode.setAttribute("class", "page"); | 786 newNode.setAttribute("class", "page"); |
787 readability.moveNodeInnards(articleContent, newNode); | 787 readability.moveNodeInnards(articleContent, newNode); |
788 articleContent.appendChild(newNode); | 788 articleContent.appendChild(newNode); |
789 } | 789 } |
790 | 790 |
791 /** | 791 /** |
792 * Now that we've gone through the full algorithm, check to see if we go t any meaningful content. | 792 * Now that we've gone through the full algorithm, check to see if we go t any meaningful content. |
793 * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher | 793 * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher |
794 * likelihood of finding the content, and the sieve approach gives us a higher likelihood of | 794 * likelihood of finding the content, and the sieve approach gives us a higher likelihood of |
795 * finding the -right- content. | 795 * finding the -right- content. |
796 **/ | 796 **/ |
797 if(readability.getInnerText(articleContent, false).length < 250) { | 797 if(readability.getInnerText(articleContent, false).length < 250) { |
798 if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) { | 798 if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) { |
799 readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS); | 799 readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS); |
800 return readability.grabArticle(document.body); | 800 return readability.grabArticle(document.body); |
801 } | 801 } |
802 else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { | 802 else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { |
803 readability.removeFlag(readability.FLAG_WEIGHT_CLASSES); | 803 readability.removeFlag(readability.FLAG_WEIGHT_CLASSES); |
804 return readability.grabArticle(document.body); | 804 return readability.grabArticle(document.body); |
805 } | 805 } |
806 else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONAL LY)) { | 806 else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONAL LY)) { |
807 readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY); | 807 readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY); |
808 return readability.grabArticle(document.body); | 808 return readability.grabArticle(document.body); |
809 } else { | 809 } else { |
810 return null; | 810 return null; |
811 } | 811 } |
812 } | 812 } |
813 | 813 |
814 return articleContent; | 814 return articleContent; |
815 }, | 815 }, |
816 | 816 |
817 /** | 817 /** |
818 * Removes script tags from the document. | 818 * Removes script tags from the document. |
819 * | 819 * |
820 * @param Element | 820 * @param Element |
821 **/ | 821 **/ |
822 removeScripts: function (doc) { | 822 removeScripts: function (doc) { |
823 var scripts = doc.getElementsByTagName('script'); | 823 var scripts = doc.getElementsByTagName('script'); |
824 for(var i = scripts.length-1; i >= 0; i-=1) | 824 for(var i = scripts.length-1; i >= 0; i-=1) |
825 { | 825 { |
826 if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf ('readability') === -1 && scripts[i].src.indexOf('typekit') === -1)) | 826 if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf ('readability') === -1 && scripts[i].src.indexOf('typekit') === -1)) |
827 { | 827 { |
828 scripts[i].nodeValue=""; | 828 scripts[i].nodeValue=""; |
829 scripts[i].removeAttribute('src'); | 829 scripts[i].removeAttribute('src'); |
830 if (scripts[i].parentNode) { | 830 if (scripts[i].parentNode) { |
831 scripts[i].parentNode.removeChild(scripts[i]); | 831 scripts[i].parentNode.removeChild(scripts[i]); |
832 } | 832 } |
833 } | 833 } |
834 } | 834 } |
835 }, | 835 }, |
836 | 836 |
837 /** | 837 /** |
838 * Get the inner text of a node - cross browser compatibly. | 838 * Get the inner text of a node - cross browser compatibly. |
839 * This also strips out any excess whitespace to be found. | 839 * This also strips out any excess whitespace to be found. |
840 * | 840 * |
841 * @param Element | 841 * @param Element |
842 * @return string | 842 * @return string |
843 **/ | 843 **/ |
844 getInnerText: function (e, normalizeSpaces) { | 844 getInnerText: function (e, normalizeSpaces) { |
845 var textContent = ""; | 845 var textContent = ""; |
846 | 846 |
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
889 | 889 |
890 // Remove any root styles, if we're able. | 890 // Remove any root styles, if we're able. |
891 if(typeof e.removeAttribute === 'function' && e.className !== 'readabili ty-styled') { | 891 if(typeof e.removeAttribute === 'function' && e.className !== 'readabili ty-styled') { |
892 e.removeAttribute('style'); } | 892 e.removeAttribute('style'); } |
893 | 893 |
894 // Go until there are no more child nodes | 894 // Go until there are no more child nodes |
895 while ( cur !== null ) { | 895 while ( cur !== null ) { |
896 if ( cur.nodeType === 1 ) { | 896 if ( cur.nodeType === 1 ) { |
897 // Remove style attribute(s) : | 897 // Remove style attribute(s) : |
898 if(cur.className !== "readability-styled") { | 898 if(cur.className !== "readability-styled") { |
899 cur.removeAttribute("style"); | 899 cur.removeAttribute("style"); |
900 } | 900 } |
901 readability.cleanStyles( cur ); | 901 readability.cleanStyles( cur ); |
902 } | 902 } |
903 cur = cur.nextSibling; | 903 cur = cur.nextSibling; |
904 } | 904 } |
905 }, | 905 }, |
906 | 906 |
907 /** | 907 /** |
908 * Get the density of links as a percentage of the content | 908 * Get the density of links as a percentage of the content |
909 * This is the amount of text that is inside a link divided by the total tex t in the node. | 909 * This is the amount of text that is inside a link divided by the total tex t in the node. |
910 * | 910 * |
911 * @param Element | 911 * @param Element |
912 * @return number (float) | 912 * @return number (float) |
913 **/ | 913 **/ |
914 getLinkDensity: function (e) { | 914 getLinkDensity: function (e) { |
915 var links = e.getElementsByTagName("a"); | 915 var links = e.getElementsByTagName("a"); |
916 var textLength = readability.getInnerText(e).length; | 916 var textLength = readability.getInnerText(e).length; |
917 var linkLength = 0; | 917 var linkLength = 0; |
918 for(var i=0, il=links.length; i<il;i+=1) | 918 for(var i=0, il=links.length; i<il;i+=1) |
919 { | 919 { |
920 linkLength += readability.getInnerText(links[i]).length; | 920 linkLength += readability.getInnerText(links[i]).length; |
921 } | 921 } |
922 | 922 |
923 return linkLength / textLength; | 923 return linkLength / textLength; |
924 }, | 924 }, |
925 | 925 |
926 /** | 926 /** |
927 * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness. | 927 * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness. |
928 * | 928 * |
929 * @author Dan Lacy | 929 * @author Dan Lacy |
930 * @return string the base url | 930 * @return string the base url |
931 **/ | 931 **/ |
932 findBaseUrl: function () { | 932 findBaseUrl: function () { |
933 var noUrlParams = window.location.pathname.split("?")[0], | 933 var noUrlParams = window.location.pathname.split("?")[0], |
934 urlSlashes = noUrlParams.split("/").reverse(), | 934 urlSlashes = noUrlParams.split("/").reverse(), |
935 cleanedSegments = [], | 935 cleanedSegments = [], |
936 possibleType = ""; | 936 possibleType = ""; |
937 | 937 |
938 for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) { | 938 for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) { |
939 var segment = urlSlashes[i]; | 939 var segment = urlSlashes[i]; |
940 | 940 |
941 // Split off and save anything that looks like a file type. | 941 // Split off and save anything that looks like a file type. |
942 if (segment.indexOf(".") !== -1) { | 942 if (segment.indexOf(".") !== -1) { |
943 possibleType = segment.split(".")[1]; | 943 possibleType = segment.split(".")[1]; |
944 | 944 |
945 /* If the type isn't alpha-only, it's probably not actually a fi le extension. */ | 945 /* If the type isn't alpha-only, it's probably not actually a fi le extension. */ |
946 if(!possibleType.match(/[^a-zA-Z]/)) { | 946 if(!possibleType.match(/[^a-zA-Z]/)) { |
947 segment = segment.split(".")[0]; | 947 segment = segment.split(".")[0]; |
948 } | 948 } |
949 } | 949 } |
950 | 950 |
951 /** | 951 /** |
952 * EW-CMS specific segment replacement. Ugly. | 952 * EW-CMS specific segment replacement. Ugly. |
953 * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.htm l | 953 * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.htm l |
954 **/ | 954 **/ |
955 if(segment.indexOf(',00') !== -1) { | 955 if(segment.indexOf(',00') !== -1) { |
956 segment = segment.replace(',00', ''); | 956 segment = segment.replace(',00', ''); |
957 } | 957 } |
958 | 958 |
959 // If our first or second segment has anything looking like a page n umber, remove it. | 959 // If our first or second segment has anything looking like a page n umber, remove it. |
960 if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) { | 960 if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) { |
961 segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, " "); | 961 segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, " "); |
962 } | 962 } |
963 | 963 |
964 | 964 |
965 var del = false; | 965 var del = false; |
966 | 966 |
967 /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */ | 967 /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */ |
968 if (i < 2 && segment.match(/^\d{1,2}$/)) { | 968 if (i < 2 && segment.match(/^\d{1,2}$/)) { |
969 del = true; | 969 del = true; |
970 } | 970 } |
971 | 971 |
972 /* If this is the first segment and it's just "index", remove it. */ | 972 /* If this is the first segment and it's just "index", remove it. */ |
973 if(i === 0 && segment.toLowerCase() === "index") { | 973 if(i === 0 && segment.toLowerCase() === "index") { |
974 del = true; | 974 del = true; |
975 } | 975 } |
976 | 976 |
977 | 977 |
978 /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */ | 978 /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */ |
979 if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) { | 979 if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) { |
980 del = true; | 980 del = true; |
981 } | 981 } |
982 | 982 |
983 /* If it's not marked for deletion, push it to cleanedSegments. */ | 983 /* If it's not marked for deletion, push it to cleanedSegments. */ |
984 if (!del) { | 984 if (!del) { |
985 cleanedSegments.push(segment); | 985 cleanedSegments.push(segment); |
986 } | 986 } |
987 } | 987 } |
988 | 988 |
989 // This is our final, cleaned, base article URL. | 989 // This is our final, cleaned, base article URL. |
990 return window.location.protocol + "//" + window.location.host + cleanedS egments.reverse().join("/"); | 990 return window.location.protocol + "//" + window.location.host + cleanedS egments.reverse().join("/"); |
991 }, | 991 }, |
992 | 992 |
993 /** | 993 /** |
994 * Look for any paging links that may occur within the document. | 994 * Look for any paging links that may occur within the document. |
995 * | 995 * |
996 * @param body | 996 * @param body |
997 * @return object (array) | 997 * @return object (array) |
998 **/ | 998 **/ |
999 findNextPageLink: function (elem) { | 999 findNextPageLink: function (elem) { |
1000 var possiblePages = {}, | 1000 var possiblePages = {}, |
1001 allLinks = elem.getElementsByTagName('a'), | 1001 allLinks = elem.getElementsByTagName('a'), |
1002 articleBaseUrl = readability.findBaseUrl(); | 1002 articleBaseUrl = readability.findBaseUrl(); |
1003 | 1003 |
1004 /** | 1004 /** |
1005 * Loop through all links, looking for hints that they may be next-page links. | 1005 * Loop through all links, looking for hints that they may be next-page links. |
1006 * Things like having "page" in their textContent, className or id, or b eing a child | 1006 * Things like having "page" in their textContent, className or id, or b eing a child |
1007 * of a node with a page-y className or id. | 1007 * of a node with a page-y className or id. |
1008 * | 1008 * |
1009 * Also possible: levenshtein distance? longest common subsequence? | 1009 * Also possible: levenshtein distance? longest common subsequence? |
1010 * | 1010 * |
1011 * After we do that, assign each page a score, and | 1011 * After we do that, assign each page a score, and |
1012 **/ | 1012 **/ |
1013 for(var i = 0, il = allLinks.length; i < il; i+=1) { | 1013 for(var i = 0, il = allLinks.length; i < il; i+=1) { |
1014 var link = allLinks[i], | 1014 var link = allLinks[i], |
1015 linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, ' '); | 1015 linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, ' '); |
1016 | 1016 |
1017 /* If we've already seen this page, ignore it */ | 1017 /* If we've already seen this page, ignore it */ |
1018 if(linkHref === "" || linkHref === articleBaseUrl || linkHref === wi ndow.location.href || linkHref in readability.parsedPages) { | 1018 if(linkHref === "" || linkHref === articleBaseUrl || linkHref === wi ndow.location.href || linkHref in readability.parsedPages) { |
1019 continue; | 1019 continue; |
1020 } | 1020 } |
1021 | 1021 |
1022 /* If it's on a different domain, skip it. */ | 1022 /* If it's on a different domain, skip it. */ |
1023 if(window.location.host !== linkHref.split(/\/+/g)[1]) { | 1023 if(window.location.host !== linkHref.split(/\/+/g)[1]) { |
1024 continue; | 1024 continue; |
1025 } | 1025 } |
1026 | 1026 |
1027 var linkText = readability.getInnerText(link); | 1027 var linkText = readability.getInnerText(link); |
1028 | 1028 |
1029 /* If the linkText looks like it's not the next page, skip it. */ | 1029 /* If the linkText looks like it's not the next page, skip it. */ |
1030 if(linkText.match(readability.regexps.extraneous) || linkText.length > 25) { | 1030 if(linkText.match(readability.regexps.extraneous) || linkText.length > 25) { |
1031 continue; | 1031 continue; |
1032 } | 1032 } |
1033 | 1033 |
1034 /* If the leftovers of the URL after removing the base URL don't con tain any digits, it's certainly not a next page link. */ | 1034 /* If the leftovers of the URL after removing the base URL don't con tain any digits, it's certainly not a next page link. */ |
1035 var linkHrefLeftover = linkHref.replace(articleBaseUrl, ''); | 1035 var linkHrefLeftover = linkHref.replace(articleBaseUrl, ''); |
1036 if(!linkHrefLeftover.match(/\d/)) { | 1036 if(!linkHrefLeftover.match(/\d/)) { |
1037 continue; | 1037 continue; |
1038 } | 1038 } |
1039 | 1039 |
1040 if(!(linkHref in possiblePages)) { | 1040 if(!(linkHref in possiblePages)) { |
1041 possiblePages[linkHref] = {"score": 0, "linkText": linkText, "hr ef": linkHref}; | 1041 possiblePages[linkHref] = {"score": 0, "linkText": linkText, "hr ef": linkHref}; |
1042 } else { | 1042 } else { |
1043 possiblePages[linkHref].linkText += ' | ' + linkText; | 1043 possiblePages[linkHref].linkText += ' | ' + linkText; |
1044 } | 1044 } |
1045 | 1045 |
1046 var linkObj = possiblePages[linkHref]; | 1046 var linkObj = possiblePages[linkHref]; |
1047 | 1047 |
1048 /** | 1048 /** |
1049 * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower. | 1049 * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower. |
1050 * Example: http://www.actionscript.org/resources/articles/745/1/Jav aScript-and-VBScript-Injection-in-ActionScript-3/Page1.html | 1050 * Example: http://www.actionscript.org/resources/articles/745/1/Jav aScript-and-VBScript-Injection-in-ActionScript-3/Page1.html |
1051 **/ | 1051 **/ |
1052 if(linkHref.indexOf(articleBaseUrl) !== 0) { | 1052 if(linkHref.indexOf(articleBaseUrl) !== 0) { |
1053 linkObj.score -= 25; | 1053 linkObj.score -= 25; |
1054 } | 1054 } |
1055 | 1055 |
1056 var linkData = linkText + ' ' + link.className + ' ' + link.id; | 1056 var linkData = linkText + ' ' + link.className + ' ' + link.id; |
1057 if(linkData.match(readability.regexps.nextLink)) { | 1057 if(linkData.match(readability.regexps.nextLink)) { |
1058 linkObj.score += 50; | 1058 linkObj.score += 50; |
1059 } | 1059 } |
1060 if(linkData.match(/pag(e|ing|inat)/i)) { | 1060 if(linkData.match(/pag(e|ing|inat)/i)) { |
1061 linkObj.score += 25; | 1061 linkObj.score += 25; |
1062 } | 1062 } |
1063 if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text, | 1063 if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text, |
1064 /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */ | 1064 /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */ |
1065 if(!linkObj.linkText.match(readability.regexps.nextLink)) { | 1065 if(!linkObj.linkText.match(readability.regexps.nextLink)) { |
1066 linkObj.score -= 65; | 1066 linkObj.score -= 65; |
1067 } | 1067 } |
1068 } | 1068 } |
1069 if(linkData.match(readability.regexps.negative) || linkData.match(re adability.regexps.extraneous)) { | 1069 if(linkData.match(readability.regexps.negative) || linkData.match(re adability.regexps.extraneous)) { |
1070 linkObj.score -= 50; | 1070 linkObj.score -= 50; |
1071 } | 1071 } |
1072 if(linkData.match(readability.regexps.prevLink)) { | 1072 if(linkData.match(readability.regexps.prevLink)) { |
1073 linkObj.score -= 200; | 1073 linkObj.score -= 200; |
1074 } | 1074 } |
1075 | 1075 |
1076 /* If a parentNode contains page or paging or paginat */ | 1076 /* If a parentNode contains page or paging or paginat */ |
1077 var parentNode = link.parentNode, | 1077 var parentNode = link.parentNode, |
1078 positiveNodeMatch = false, | 1078 positiveNodeMatch = false, |
1079 negativeNodeMatch = false; | 1079 negativeNodeMatch = false; |
1080 while(parentNode) { | 1080 while(parentNode) { |
1081 var parentNodeClassAndId = parentNode.className + ' ' + parentNo de.id; | 1081 var parentNodeClassAndId = parentNode.className + ' ' + parentNo de.id; |
1082 if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(/pag(e|ing|inat)/i)) { | 1082 if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(/pag(e|ing|inat)/i)) { |
1083 positiveNodeMatch = true; | 1083 positiveNodeMatch = true; |
1084 linkObj.score += 25; | 1084 linkObj.score += 25; |
1085 } | 1085 } |
1086 if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(readability.regexps.negative)) { | 1086 if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(readability.regexps.negative)) { |
1087 /* If this is just something like "footer", give it a negati ve. If it's something like "body-and-footer", leave it be. */ | 1087 /* If this is just something like "footer", give it a negati ve. If it's something like "body-and-footer", leave it be. */ |
1088 if(!parentNodeClassAndId.match(readability.regexps.positive) ) { | 1088 if(!parentNodeClassAndId.match(readability.regexps.positive) ) { |
1089 linkObj.score -= 25; | 1089 linkObj.score -= 25; |
1090 negativeNodeMatch = true; | 1090 negativeNodeMatch = true; |
1091 } | 1091 } |
1092 } | 1092 } |
1093 | 1093 |
1094 parentNode = parentNode.parentNode; | 1094 parentNode = parentNode.parentNode; |
1095 } | 1095 } |
1096 | 1096 |
1097 /** | 1097 /** |
1098 * If the URL looks like it has paging in it, add to the score. | 1098 * If the URL looks like it has paging in it, add to the score. |
1099 * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34 | 1099 * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34 |
1100 **/ | 1100 **/ |
1101 if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) { | 1101 if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) { |
1102 linkObj.score += 25; | 1102 linkObj.score += 25; |
1103 } | 1103 } |
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1145 topPage = possiblePages[page]; | 1145 topPage = possiblePages[page]; |
1146 } | 1146 } |
1147 } | 1147 } |
1148 } | 1148 } |
1149 | 1149 |
1150 if(topPage) { | 1150 if(topPage) { |
1151 var nextHref = topPage.href.replace(/\/$/,''); | 1151 var nextHref = topPage.href.replace(/\/$/,''); |
1152 | 1152 |
1153 dbg('NEXT PAGE IS ' + nextHref); | 1153 dbg('NEXT PAGE IS ' + nextHref); |
1154 readability.parsedPages[nextHref] = true; | 1154 readability.parsedPages[nextHref] = true; |
1155 return nextHref; | 1155 return nextHref; |
1156 } | 1156 } |
1157 else { | 1157 else { |
1158 return null; | 1158 return null; |
1159 } | 1159 } |
1160 }, | 1160 }, |
1161 | 1161 |
1162 createLinkDiv: function(link) { | 1162 createLinkDiv: function(link) { |
1163 var divNode = document.createElement('div'); | 1163 var divNode = document.createElement('div'); |
1164 var aNode = document.createElement('a'); | 1164 var aNode = document.createElement('a'); |
1165 var tNode = document.createTextNode('View Next Page'); | 1165 var tNode = document.createTextNode('View Next Page'); |
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1197 } | 1197 } |
1198 else { | 1198 else { |
1199 if (options.error) { options.error(request); } | 1199 if (options.error) { options.error(request); } |
1200 } | 1200 } |
1201 } | 1201 } |
1202 } | 1202 } |
1203 | 1203 |
1204 if (typeof options === 'undefined') { options = {}; } | 1204 if (typeof options === 'undefined') { options = {}; } |
1205 | 1205 |
1206 request.onreadystatechange = respondToReadyState; | 1206 request.onreadystatechange = respondToReadyState; |
1207 | 1207 |
1208 request.open('get', url, true); | 1208 request.open('get', url, true); |
1209 request.setRequestHeader('Accept', 'text/html'); | 1209 request.setRequestHeader('Accept', 'text/html'); |
1210 | 1210 |
1211 try { | 1211 try { |
1212 request.send(options.postBody); | 1212 request.send(options.postBody); |
1213 } | 1213 } |
1214 catch (e) { | 1214 catch (e) { |
1215 if (options.error) { options.error(); } | 1215 if (options.error) { options.error(); } |
1216 } | 1216 } |
1217 | 1217 |
(...skipping 14 matching lines...) Expand all Loading... | |
1232 articlePage.innerHTML = '<p class="page-separator" title="Page ' + reada bility.curPageNum + '">§</p>'; | 1232 articlePage.innerHTML = '<p class="page-separator" title="Page ' + reada bility.curPageNum + '">§</p>'; |
1233 | 1233 |
1234 document.getElementById("readability-content").appendChild(articlePage); | 1234 document.getElementById("readability-content").appendChild(articlePage); |
1235 | 1235 |
1236 if(readability.curPageNum > readability.maxPages) { | 1236 if(readability.curPageNum > readability.maxPages) { |
1237 var linkDiv = readability.createLinkDiv(nextPageLink); | 1237 var linkDiv = readability.createLinkDiv(nextPageLink); |
1238 | 1238 |
1239 articlePage.appendChild(linkDiv); | 1239 articlePage.appendChild(linkDiv); |
1240 return; | 1240 return; |
1241 } | 1241 } |
1242 | 1242 |
1243 /** | 1243 /** |
1244 * Now that we've built the article page DOM element, get the page conte nt | 1244 * Now that we've built the article page DOM element, get the page conte nt |
1245 * asynchronously and load the cleaned content into the div we created f or it. | 1245 * asynchronously and load the cleaned content into the div we created f or it. |
1246 **/ | 1246 **/ |
1247 (function(pageUrl, thisPage) { | 1247 (function(pageUrl, thisPage) { |
1248 readability.ajax(pageUrl, { | 1248 readability.ajax(pageUrl, { |
1249 success: function(r) { | 1249 success: function(r) { |
1250 | 1250 |
1251 /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */ | 1251 /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */ |
1252 var eTag = r.getResponseHeader('ETag'); | 1252 var eTag = r.getResponseHeader('ETag'); |
1253 if(eTag) { | 1253 if(eTag) { |
1254 if(eTag in readability.pageETags) { | 1254 if(eTag in readability.pageETags) { |
1255 dbg("Exact duplicate page found via ETag. Aborting." ); | 1255 dbg("Exact duplicate page found via ETag. Aborting." ); |
1256 articlePage.style.display = 'none'; | 1256 articlePage.style.display = 'none'; |
1257 return; | 1257 return; |
1258 } else { | 1258 } else { |
1259 readability.pageETags[eTag] = 1; | 1259 readability.pageETags[eTag] = 1; |
1260 } | 1260 } |
1261 } | 1261 } |
1262 | 1262 |
1263 // TODO: this ends up doubling up page numbers on NYTimes ar ticles. Need to generically parse those away. | 1263 // TODO: this ends up doubling up page numbers on NYTimes ar ticles. Need to generically parse those away. |
1264 var page = document.createElement("DIV"); | 1264 var page = document.createElement("DIV"); |
1265 | 1265 |
1266 /** | 1266 /** |
1267 * Do some preprocessing to our HTML to make it ready for ap pending. | 1267 * Do some preprocessing to our HTML to make it ready for ap pending. |
1268 * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript. | 1268 * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript. |
1269 * • Turn any noscript tags into divs so that we can parse t hem. This allows us to find any next page links hidden via javascript. | 1269 * • Turn any noscript tags into divs so that we can parse t hem. This allows us to find any next page links hidden via javascript. |
1270 * • Turn all double br's into p's - was handled by prepDocu ment in the original view. | 1270 * • Turn all double br's into p's - was handled by prepDocu ment in the original view. |
(...skipping 30 matching lines...) Expand all Loading... | |
1301 for(var i=1; i <= readability.curPageNum; i+=1) { | 1301 for(var i=1; i <= readability.curPageNum; i+=1) { |
1302 var rPage = document.getElementById('readability-pag e-' + i); | 1302 var rPage = document.getElementById('readability-pag e-' + i); |
1303 if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML ) !== -1) { | 1303 if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML ) !== -1) { |
1304 dbg('Duplicate of page ' + i + ' - skipping.'); | 1304 dbg('Duplicate of page ' + i + ' - skipping.'); |
1305 articlePage.style.display = 'none'; | 1305 articlePage.style.display = 'none'; |
1306 readability.parsedPages[pageUrl] = true; | 1306 readability.parsedPages[pageUrl] = true; |
1307 return; | 1307 return; |
1308 } | 1308 } |
1309 } | 1309 } |
1310 } | 1310 } |
1311 | 1311 |
1312 readability.removeScripts(content); | 1312 readability.removeScripts(content); |
1313 | 1313 |
1314 readability.moveNodeInnards(content, thisPage); | 1314 readability.moveNodeInnards(content, thisPage); |
1315 | 1315 |
1316 /** | 1316 /** |
1317 * After the page has rendered, post process the content. Th is delay is necessary because, | 1317 * After the page has rendered, post process the content. Th is delay is necessary because, |
1318 * in webkit at least, offsetWidth is not set in time to det ermine image width. We have to | 1318 * in webkit at least, offsetWidth is not set in time to det ermine image width. We have to |
1319 * wait a little bit for reflow to finish before we can fix floating images. | 1319 * wait a little bit for reflow to finish before we can fix floating images. |
1320 **/ | 1320 **/ |
1321 window.setTimeout( | 1321 window.setTimeout( |
1322 function() { readability.postProcessContent(thisPage); } , | 1322 function() { readability.postProcessContent(thisPage); } , |
1323 500 | 1323 500 |
1324 ); | 1324 ); |
1325 | 1325 |
1326 if(nextPageLink) { | 1326 if(nextPageLink) { |
1327 readability.appendNextPage(nextPageLink); | 1327 readability.appendNextPage(nextPageLink); |
1328 } | 1328 } |
1329 } | 1329 } |
1330 }); | 1330 }); |
1331 }(nextPageLink, articlePage)); | 1331 }(nextPageLink, articlePage)); |
1332 }, | 1332 }, |
1333 | 1333 |
1334 /** | 1334 /** |
1335 * Get an elements class/id weight. Uses regular expressions to tell if this | 1335 * Get an elements class/id weight. Uses regular expressions to tell if this |
1336 * element looks good or bad. | 1336 * element looks good or bad. |
1337 * | 1337 * |
1338 * @param Element | 1338 * @param Element |
1339 * @return number (Integer) | 1339 * @return number (Integer) |
1340 **/ | 1340 **/ |
1341 getClassWeight: function (e) { | 1341 getClassWeight: function (e) { |
1342 if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { | 1342 if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { |
1343 return 0; | 1343 return 0; |
1344 } | 1344 } |
1345 | 1345 |
(...skipping 29 matching lines...) Expand all Loading... | |
1375 /** | 1375 /** |
1376 * Remove extraneous break tags from a node. | 1376 * Remove extraneous break tags from a node. |
1377 * | 1377 * |
1378 * @param Element | 1378 * @param Element |
1379 * @return void | 1379 * @return void |
1380 **/ | 1380 **/ |
1381 killBreaks: function (e) { | 1381 killBreaks: function (e) { |
1382 var allElements = e.getElementsByTagName('*'); | 1382 var allElements = e.getElementsByTagName('*'); |
1383 while (i < allElements.length) { | 1383 while (i < allElements.length) { |
1384 readability.deleteExtraBreaks(allElements[i]); | 1384 readability.deleteExtraBreaks(allElements[i]); |
1385 i++; | 1385 i++; |
1386 } | 1386 } |
1387 }, | 1387 }, |
1388 | 1388 |
1389 /** | 1389 /** |
1390 * Clean a node of all elements of type "tag". | 1390 * Clean a node of all elements of type "tag". |
1391 * (Unless it's a youtube/vimeo video. People love movies.) | 1391 * (Unless it's a youtube/vimeo video. People love movies.) |
1392 * | 1392 * |
1393 * @param Element | 1393 * @param Element |
1394 * @param string tag to clean | 1394 * @param string tag to clean |
1395 * @return void | 1395 * @return void |
1396 **/ | 1396 **/ |
1397 clean: function (e, tag) { | 1397 clean: function (e, tag) { |
1398 var targetList = e.getElementsByTagName( tag ); | 1398 var targetList = e.getElementsByTagName( tag ); |
1399 var isEmbed = (tag === 'object' || tag === 'embed'); | 1399 var isEmbed = (tag === 'object' || tag === 'embed'); |
1400 | 1400 |
1401 for (var y=targetList.length-1; y >= 0; y-=1) { | 1401 for (var y=targetList.length-1; y >= 0; y-=1) { |
1402 /* Allow youtube and vimeo videos through as people usually want to see those. */ | 1402 /* Allow youtube and vimeo videos through as people usually want to see those. */ |
1403 if(isEmbed) { | 1403 if(isEmbed) { |
1404 var attributeValues = ""; | 1404 var attributeValues = ""; |
1405 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) { | 1405 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) { |
1406 attributeValues += targetList[y].attributes[i].value + '|'; | 1406 attributeValues += targetList[y].attributes[i].value + '|'; |
1407 } | 1407 } |
1408 | 1408 |
1409 /* First, check the elements attributes to see if any of them co ntain youtube or vimeo */ | 1409 /* First, check the elements attributes to see if any of them co ntain youtube or vimeo */ |
1410 if (attributeValues.search(readability.regexps.videos) !== -1) { | 1410 if (attributeValues.search(readability.regexps.videos) !== -1) { |
1411 continue; | 1411 continue; |
1412 } | 1412 } |
1413 | 1413 |
1414 /* Then check the elements inside this element for the same. */ | 1414 /* Then check the elements inside this element for the same. */ |
1415 if (targetList[y].innerHTML.search(readability.regexps.videos) ! == -1) { | 1415 if (targetList[y].innerHTML.search(readability.regexps.videos) ! == -1) { |
1416 continue; | 1416 continue; |
1417 } | 1417 } |
1418 | 1418 |
1419 } | 1419 } |
1420 | 1420 |
1421 targetList[y].parentNode.removeChild(targetList[y]); | 1421 targetList[y].parentNode.removeChild(targetList[y]); |
1422 } | 1422 } |
1423 }, | 1423 }, |
1424 | 1424 |
1425 /** | 1425 /** |
1426 * Clean an element of all tags of type "tag" if they look fishy. | 1426 * Clean an element of all tags of type "tag" if they look fishy. |
1427 * "Fishy" is an algorithm based on content length, classnames, link density , number of images & embeds, etc. | 1427 * "Fishy" is an algorithm based on content length, classnames, link density , number of images & embeds, etc. |
1428 * | 1428 * |
1429 * @return void | 1429 * @return void |
1430 **/ | 1430 **/ |
1431 cleanConditionally: function (e, tag) { | 1431 cleanConditionally: function (e, tag) { |
1432 | 1432 |
1433 if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) { | 1433 if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) { |
1434 return; | 1434 return; |
1435 } | 1435 } |
1436 | 1436 |
1437 var tagsList = e.getElementsByTagName(tag); | 1437 var tagsList = e.getElementsByTagName(tag); |
1438 var curTagsLength = tagsList.length; | 1438 var curTagsLength = tagsList.length; |
1439 | 1439 |
1440 /** | 1440 /** |
1441 * Gather counts for other typical elements embedded within. | 1441 * Gather counts for other typical elements embedded within. |
1442 * Traverse backwards so we can remove nodes at the same time without ef fecting the traversal. | 1442 * Traverse backwards so we can remove nodes at the same time without ef fecting the traversal. |
1443 * | 1443 * |
1444 * TODO: Consider taking into account original contentScore here. | 1444 * TODO: Consider taking into account original contentScore here. |
1445 **/ | 1445 **/ |
1446 for (var i=curTagsLength-1; i >= 0; i-=1) { | 1446 for (var i=curTagsLength-1; i >= 0; i-=1) { |
1447 var weight = readability.getClassWeight(tagsList[i]); | 1447 var weight = readability.getClassWeight(tagsList[i]); |
1448 var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0; | 1448 var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0; |
1449 | 1449 |
1450 dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].cla ssName + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'unde fined') ? (" with score " + tagsList[i].readability.contentScore) : '')); | 1450 dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].cla ssName + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'unde fined') ? (" with score " + tagsList[i].readability.contentScore) : '')); |
1451 | 1451 |
1452 if(weight+contentScore < 0) | 1452 if(weight+contentScore < 0) |
1453 { | 1453 { |
1454 tagsList[i].parentNode.removeChild(tagsList[i]); | 1454 tagsList[i].parentNode.removeChild(tagsList[i]); |
1455 } | 1455 } |
1456 else if ( readability.getCharCount(tagsList[i],',') < 10) { | 1456 else if ( readability.getCharCount(tagsList[i],',') < 10) { |
1457 /** | 1457 /** |
1458 * If there are not very many commas, and the number of | 1458 * If there are not very many commas, and the number of |
1459 * non-paragraph elements is more than paragraphs or other omino us signs, remove the element. | 1459 * non-paragraph elements is more than paragraphs or other omino us signs, remove the element. |
1460 **/ | 1460 **/ |
1461 var p = tagsList[i].getElementsByTagName("p").length; | 1461 var p = tagsList[i].getElementsByTagName("p").length; |
1462 var img = tagsList[i].getElementsByTagName("img").length; | 1462 var img = tagsList[i].getElementsByTagName("img").length; |
1463 var li = tagsList[i].getElementsByTagName("li").length-100; | 1463 var li = tagsList[i].getElementsByTagName("li").length-100; |
1464 var input = tagsList[i].getElementsByTagName("input").length; | 1464 var input = tagsList[i].getElementsByTagName("input").length; |
1465 | 1465 |
1466 var embedCount = 0; | 1466 var embedCount = 0; |
1467 var embeds = tagsList[i].getElementsByTagName("embed"); | 1467 var embeds = tagsList[i].getElementsByTagName("embed"); |
1468 for(var ei=0,il=embeds.length; ei < il; ei+=1) { | 1468 for(var ei=0,il=embeds.length; ei < il; ei+=1) { |
1469 if (embeds[ei].src.search(readability.regexps.videos) === -1 ) { | 1469 if (embeds[ei].src.search(readability.regexps.videos) === -1 ) { |
1470 embedCount+=1; | 1470 embedCount+=1; |
1471 } | 1471 } |
1472 } | 1472 } |
1473 | 1473 |
1474 var linkDensity = readability.getLinkDensity(tagsList[i]); | 1474 var linkDensity = readability.getLinkDensity(tagsList[i]); |
1475 var contentLength = readability.getInnerText(tagsList[i]).length ; | 1475 var contentLength = readability.getInnerText(tagsList[i]).length ; |
1476 var toRemove = false; | 1476 var toRemove = false; |
1477 | 1477 |
1478 if ( img > p ) { | 1478 if ( img > p ) { |
1479 toRemove = true; | 1479 toRemove = true; |
1480 } else if(li > p && tag !== "ul" && tag !== "ol") { | 1480 } else if(li > p && tag !== "ul" && tag !== "ol") { |
1481 toRemove = true; | 1481 toRemove = true; |
1482 } else if( input > Math.floor(p/3) ) { | 1482 } else if( input > Math.floor(p/3) ) { |
1483 toRemove = true; | 1483 toRemove = true; |
1484 } else if(contentLength < 25 && (img === 0 || img > 2) ) { | 1484 } else if(contentLength < 25 && (img === 0 || img > 2) ) { |
1485 toRemove = true; | 1485 toRemove = true; |
1486 } else if(weight < 25 && linkDensity > 0.2) { | 1486 } else if(weight < 25 && linkDensity > 0.2) { |
1487 toRemove = true; | 1487 toRemove = true; |
1488 } else if(weight >= 25 && linkDensity > 0.5) { | 1488 } else if(weight >= 25 && linkDensity > 0.5) { |
1489 toRemove = true; | 1489 toRemove = true; |
1490 } else if((embedCount === 1 && contentLength < 75) || embedCount > 1) { | 1490 } else if((embedCount === 1 && contentLength < 75) || embedCount > 1) { |
1491 toRemove = true; | 1491 toRemove = true; |
1492 } | 1492 } |
1493 | 1493 |
(...skipping 21 matching lines...) Expand all Loading... | |
1515 } | 1515 } |
1516 }, | 1516 }, |
1517 | 1517 |
1518 flagIsActive: function(flag) { | 1518 flagIsActive: function(flag) { |
1519 return (readability.flags & flag) > 0; | 1519 return (readability.flags & flag) > 0; |
1520 }, | 1520 }, |
1521 | 1521 |
1522 addFlag: function(flag) { | 1522 addFlag: function(flag) { |
1523 readability.flags = readability.flags | flag; | 1523 readability.flags = readability.flags | flag; |
1524 }, | 1524 }, |
1525 | 1525 |
1526 removeFlag: function(flag) { | 1526 removeFlag: function(flag) { |
1527 readability.flags = readability.flags & ~flag; | 1527 readability.flags = readability.flags & ~flag; |
1528 }, | 1528 }, |
1529 | 1529 |
1530 // Removes the children of |src| and appends them to |dest|. | 1530 // Removes the children of |src| and appends them to |dest|. |
1531 moveNodeInnards: function(src, dest) { | 1531 moveNodeInnards: function(src, dest) { |
1532 try { | 1532 try { |
1533 while (src.firstChild) { | 1533 while (src.firstChild) { |
1534 dest.appendChild(src.removeChild(src.firstChild)); | 1534 dest.appendChild(src.removeChild(src.firstChild)); |
1535 } | 1535 } |
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1584 var lastBr = readability.isMultipleBr(node, false); | 1584 var lastBr = readability.isMultipleBr(node, false); |
1585 var ret = false; | 1585 var ret = false; |
1586 while (lastBr && lastBr != node) { | 1586 while (lastBr && lastBr != node) { |
1587 var toRemove = lastBr; | 1587 var toRemove = lastBr; |
1588 lastBr = lastBr.previousSibling; | 1588 lastBr = lastBr.previousSibling; |
1589 toRemove.parentNode.removeChild(toRemove); | 1589 toRemove.parentNode.removeChild(toRemove); |
1590 ret = true; | 1590 ret = true; |
1591 } | 1591 } |
1592 return ret; | 1592 return ret; |
1593 }, | 1593 }, |
1594 | 1594 |
1595 // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a | 1595 // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a |
1596 // <P> node, and makes all next siblings of that pair children of <P>, up | 1596 // <P> node, and makes all next siblings of that pair children of <P>, up |
1597 // until the next pair of <BR> nodes is reached. | 1597 // until the next pair of <BR> nodes is reached. |
1598 replaceDoubleBrWithP: function(node) { | 1598 replaceDoubleBrWithP: function(node) { |
1599 // Check that we are starting with a BR. | 1599 // Check that we are starting with a BR. |
1600 var second = readability.isMultipleBr(node, true); | 1600 var second = readability.isMultipleBr(node, true); |
1601 if (!second) { | 1601 if (!second) { |
1602 return; | 1602 return; |
1603 } | 1603 } |
1604 // Make all next siblings of the second BR into children of a P. | 1604 // Make all next siblings of the second BR into children of a P. |
1605 var p = document.createElement('p'); | 1605 var p = document.createElement('p'); |
1606 var curr = second.nextSibling; | 1606 var curr = second.nextSibling; |
1607 while (curr) { | 1607 while (curr) { |
1608 if (readability.isMultipleBr(curr, true)) { | 1608 if (readability.isMultipleBr(curr, true)) { |
1609 break; | 1609 break; |
1610 } | 1610 } |
1611 var next = curr.nextSibling; | 1611 var next = curr.nextSibling; |
1612 p.appendChild(curr.parentNode.removeChild(curr)); | 1612 p.appendChild(curr.parentNode.removeChild(curr)); |
1613 curr = next; | 1613 curr = next; |
1614 } | 1614 } |
1615 var ret = curr; | 1615 var ret = curr; |
1616 | 1616 |
1617 // Remove all nodes between the first and second BR. | 1617 // Remove all nodes between the first and second BR. |
1618 curr = node.nextSibling; | 1618 curr = node.nextSibling; |
1619 while (curr && curr != second) { | 1619 while (curr && curr != second) { |
1620 var next = curr.nextSibling; | 1620 var next = curr.nextSibling; |
1621 curr.parentNode.removeChild(curr); | 1621 curr.parentNode.removeChild(curr); |
1622 curr = next; | 1622 curr = next; |
1623 } | 1623 } |
1624 // Remove the second BR. | 1624 // Remove the second BR. |
1625 second.parentNode.removeChild(second); | 1625 second.parentNode.removeChild(second); |
1626 // Replace the first BR with the P. | 1626 // Replace the first BR with the P. |
1627 node.parentNode.replaceChild(p, node); | 1627 node.parentNode.replaceChild(p, node); |
1628 | 1628 |
1629 return ret; | 1629 return ret; |
1630 }, | 1630 }, |
1631 | 1631 |
1632 // Returns true if the NodeList contains a double <BR>. | 1632 // Returns true if the NodeList contains a double <BR>. |
1633 hasDoubleBr: function(nodeList) { | 1633 hasDoubleBr: function(nodeList) { |
1634 for (var i = 0; i < nodeList.length; nodeList++) { | 1634 for (var i = 0; i < nodeList.length; nodeList++) { |
1635 if (readability.isMultipleBr(nodeList[i], true)) { | 1635 if (readability.isMultipleBr(nodeList[i], true)) { |
1636 return true; | 1636 return true; |
1637 } | 1637 } |
1638 } | 1638 } |
1639 return false; | 1639 return false; |
1640 }, | 1640 }, |
1641 | 1641 |
1642 // Replaces double <BR> tags with <P> tags. | 1642 // Replaces double <BR> tags with <P> tags. |
1643 replaceDoubleBrsWithPs: function(node) { | 1643 replaceDoubleBrsWithPs: function(node) { |
1644 var allElements = node.getElementsByTagName('BR'); | 1644 var allElements = node.getElementsByTagName('BR'); |
1645 var node = null; | 1645 var node = null; |
1646 while (allElements && allElements.length > 0 && | 1646 while (allElements && allElements.length > 0 && |
1647 readability.hasDoubleBr(allElements)) { | 1647 readability.hasDoubleBr(allElements)) { |
1648 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) { | 1648 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) { |
1649 var next = node; | 1649 var next = node; |
1650 while (next = readability.replaceDoubleBrWithP(next)); | 1650 while (next = readability.replaceDoubleBrWithP(next)); |
1651 } | 1651 } |
1652 allElements = document.body.getElementsByTagName('BR'); | 1652 allElements = document.body.getElementsByTagName('BR'); |
1653 } | 1653 } |
1654 }, | 1654 }, |
1655 | 1655 |
1656 | 1656 |
1657 // Replaces a BR and the whitespace that follows it with a P. | 1657 // Replaces a BR and the whitespace that follows it with a P. |
1658 replaceBrWithP: function(node) { | 1658 replaceBrWithP: function(node) { |
1659 if (!readability.isBrNode(node)) { | 1659 if (!readability.isBrNode(node)) { |
1660 return; | 1660 return; |
1661 } | 1661 } |
1662 var p = document.createElement('p'); | 1662 var p = document.createElement('p'); |
1663 var curr = node.nextSibling; | 1663 var curr = node.nextSibling; |
1664 while (curr && !isBrNode(curr)) { | 1664 while (curr && !isBrNode(curr)) { |
1665 var next = curr.nextSibling; | 1665 var next = curr.nextSibling; |
1666 if (readability.isWhitespaceNode(curr)) { | 1666 if (readability.isWhitespaceNode(curr)) { |
1667 curr.parentNode.removeChild(curr); | 1667 curr.parentNode.removeChild(curr); |
1668 } else { | 1668 } else { |
1669 p.appendChild(curr.parentNode.removeChild(curr)); | 1669 p.appendChild(curr.parentNode.removeChild(curr)); |
1670 } | 1670 } |
1671 curr = next; | 1671 curr = next; |
1672 } | 1672 } |
1673 node.parentNode.replaceChild(p, node); | 1673 node.parentNode.replaceChild(p, node); |
1674 return curr; | 1674 return curr; |
1675 }, | 1675 }, |
1676 | 1676 |
1677 // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> t ag | 1677 // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> t ag |
1678 // children of the <P>. | 1678 // children of the <P>. |
1679 replaceBrsWithPs: function(node) { | 1679 replaceBrsWithPs: function(node) { |
1680 var allElements = node.getElementsByTagName('BR'); | 1680 var allElements = node.getElementsByTagName('BR'); |
1681 var node = null; | 1681 var node = null; |
1682 while (allElements && allElements.length > 0) { | 1682 while (allElements && allElements.length > 0) { |
1683 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) { | 1683 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) { |
1684 var next = node; | 1684 var next = node; |
1685 while (next = readability.replaceBrWithP(next)); | 1685 while (next = readability.replaceBrWithP(next)); |
1686 } | 1686 } |
1687 allElements = document.body.getElementsByTagName('BR'); | 1687 allElements = document.body.getElementsByTagName('BR'); |
1688 } | 1688 } |
1689 }, | 1689 }, |
1690 | 1690 |
1691 // Replaces any tag with any other tag. | 1691 // Replaces any tag with any other tag. |
1692 replaceTagsWithTags: function(node, srcTag, destTag) { | 1692 replaceTagsWithTags: function(node, srcTag, destTag) { |
1693 var allElements = node.getElementsByTagName(srcTag); | 1693 var allElements = node.getElementsByTagName(srcTag); |
1694 for (var i = 0; i < allElements.length; i++) { | 1694 for (var i = 0; i < allElements.length; i++) { |
1695 var dest = document.createElement(destTag); | 1695 var dest = document.createElement(destTag); |
1696 readability.moveNodeInnards(allElements[i], dest); | 1696 readability.moveNodeInnards(allElements[i], dest); |
1697 node.replaceNode(dest, allElements[i]); | 1697 allElements[i].parentNode.replaceChild(dest, allElements[i]); |
1698 } | 1698 } |
1699 }, | 1699 }, |
1700 | 1700 |
1701 // Replaces all <noscript> tags with <p> tags. | 1701 // Replaces all <noscript> tags with <p> tags. |
1702 replaceNoscriptsWithPs: function(node) { | 1702 replaceNoscriptsWithPs: function(node) { |
1703 readability.replaceTagsWithTags(node, 'noscript', 'p'); | 1703 readability.replaceTagsWithTags(node, 'noscript', 'p'); |
1704 }, | 1704 }, |
1705 | 1705 |
1706 // Replaces all <font> tags with <span> tags. | 1706 // Replaces all <font> tags with <span> tags. |
1707 replaceFontsWithSpans: function(node) { | 1707 replaceFontsWithSpans: function(node) { |
1708 readability.replaceTagsWithTags(node, 'font', 'span'); | 1708 readability.replaceTagsWithTags(node, 'font', 'span'); |
1709 }, | 1709 }, |
1710 | 1710 |
1711 // Returns a list of image URLs in the distilled article. | 1711 // Returns a list of image URLs in the distilled article. |
1712 getImages : function() { | 1712 getImages : function() { |
1713 var images = document.getElementsByTagName('img'); | 1713 var images = document.getElementsByTagName('img'); |
1714 var result = new Array(images.length); | 1714 var result = new Array(images.length); |
1715 dbg("Number of images: " + images.length); | 1715 dbg("Number of images: " + images.length); |
1716 for(i = 0; i < images.length; i++) { | 1716 for(i = 0; i < images.length; i++) { |
1717 result[i] = images[i].src; | 1717 result[i] = images[i].src; |
1718 dbg("Image: " + result[i]); | 1718 dbg("Image: " + result[i]); |
1719 } | 1719 } |
1720 return result; | 1720 return result; |
1721 }, | 1721 }, |
1722 | 1722 |
1723 // Returns the distilled article HTML from the page(s). | 1723 // Returns the distilled article HTML from the page(s). |
1724 getDistilledArticleHTML : function() { | 1724 getDistilledArticleHTML : function() { |
1725 return readability.distilledHTML; | 1725 return readability.distilledHTML; |
1726 }, | |
1727 | |
1728 // Returns the next page of this article. | |
1729 getNextPageLink : function() { | |
1730 return readability.nextPageLink; | |
1726 } | 1731 } |
1727 }; | 1732 }; |
1728 | 1733 |
// Extracts long-form content from a page and returns an array laid out as:
//   [0]    the article title (readability.getArticleTitle())
//   [1]    HTML containing the long-form content
//          (readability.getDistilledArticleHTML())
//   [2]    the next page link (readability.getNextPageLink())
//   [3..]  URLs of the images found by readability.getImages()
// NOTE(review): this comment used to say each <img> id is k - 2 for the URL
// at index k. That predates the next page link at index 2; image URLs now
// start at index 3, so confirm the offset against the code that assigns ids.
(function () {
    readability.init();
    var result = [
        readability.getArticleTitle(),
        readability.getDistilledArticleHTML(),
        readability.getNextPageLink()
    ];
    return result.concat(readability.getImages());
}())
1741 | 1747 |
OLD | NEW |