| OLD | NEW |
| 1 | 1 |
| 2 var dbg = (typeof console !== 'undefined') ? function(s) { | 2 var dbg = (typeof console !== 'undefined') ? function(s) { |
| 3 console.log("Readability: " + s); | 3 console.log("Readability: " + s); |
| 4 } : function() {}; | 4 } : function() {}; |
| 5 | 5 |
| 6 /* | 6 /* |
| 7 * Readability. An Arc90 Lab Experiment. | 7 * Readability. An Arc90 Lab Experiment. |
| 8 * Website: http://lab.arc90.com/experiments/readability | 8 * Website: http://lab.arc90.com/experiments/readability |
| 9 * Source: http://code.google.com/p/arc90labs-readability | 9 * Source: http://code.google.com/p/arc90labs-readability |
| 10 * | 10 * |
| 11 * "Readability" is a trademark of Arc90 Inc and may not be used without explici
t permission. | 11 * "Readability" is a trademark of Arc90 Inc and may not be used without explici
t permission. |
| 12 * | 12 * |
| 13 * Copyright (c) 2010 Arc90 Inc | 13 * Copyright (c) 2010 Arc90 Inc |
| 14 * Readability is licensed under the Apache License, Version 2.0. | 14 * Readability is licensed under the Apache License, Version 2.0. |
| 15 **/ | 15 **/ |
| 16 var readability = { | 16 var readability = { |
| 17 readStyle: "style-newspaper", | 17 readStyle: "style-newspaper", |
| 18 readSize: "size-medium", | 18 readSize: "size-medium", |
| 19 readMargin: "margin-wide", | 19 readMargin: "margin-wide", |
| 20 | 20 |
| 21 distilledHTML: '', | 21 distilledHTML: '', |
| 22 distilledArticleContent: null, | 22 distilledArticleContent: null, |
| 23 nextPageLink: '', |
| 23 | 24 |
| 24 version: '1.7.1', | 25 version: '1.7.1', |
| 25 iframeLoads: 0, | 26 iframeLoads: 0, |
| 26 convertLinksToFootnotes: false, | 27 convertLinksToFootnotes: false, |
| 27 reversePageScroll: false, /* If they hold shift and hit space, scroll
up */ | 28 reversePageScroll: false, /* If they hold shift and hit space, scroll
up */ |
| 28 frameHack: false, /** | 29 frameHack: false, /** |
| 29 * The frame hack is to workaround a firefo
x bug where if you | 30 * The frame hack is to workaround a firefo
x bug where if you |
| 30 * pull content out of a frame and stick it
into the parent element, the scrollbar won't appear. | 31 * pull content out of a frame and stick it
into the parent element, the scrollbar won't appear. |
| 31 * So we fake a scrollbar in the wrapping d
iv. | 32 * So we fake a scrollbar in the wrapping d
iv. |
| 32 **/ | 33 **/ |
| 33 biggestFrame: false, | 34 biggestFrame: false, |
| 34 flags: 0x1 | 0x2 | 0x4, /* Start with all flags set. */ | 35 flags: 0x1 | 0x2 | 0x4, /* Start with all flags set. */ |
| 35 | 36 |
| 36 /* constants */ | 37 /* constants */ |
| 37 FLAG_STRIP_UNLIKELYS: 0x1, | 38 FLAG_STRIP_UNLIKELYS: 0x1, |
| 38 FLAG_WEIGHT_CLASSES: 0x2, | 39 FLAG_WEIGHT_CLASSES: 0x2, |
| 39 FLAG_CLEAN_CONDITIONALLY: 0x4, | 40 FLAG_CLEAN_CONDITIONALLY: 0x4, |
| 40 | 41 |
| 41 maxPages: 30, /* The maximum number of pages to loop through before we ca
ll it quits and just show a link. */ | 42 maxPages: 30, /* The maximum number of pages to loop through before we ca
ll it quits and just show a link. */ |
| 42 parsedPages: {}, /* The list of pages we've parsed in this call of readabili
ty, for autopaging. As a key store for easier searching. */ | 43 parsedPages: {}, /* The list of pages we've parsed in this call of readabili
ty, for autopaging. As a key store for easier searching. */ |
| 43 pageETags: {}, /* A list of the ETag headers of pages we've parsed, in cas
e they happen to match, we'll know it's a duplicate. */ | 44 pageETags: {}, /* A list of the ETag headers of pages we've parsed, in cas
e they happen to match, we'll know it's a duplicate. */ |
| 44 | 45 |
| 45 /** | 46 /** |
| 46 * All of the regular expressions in use within readability. | 47 * All of the regular expressions in use within readability. |
| 47 * Defined up here so we don't instantiate them repeatedly in loops. | 48 * Defined up here so we don't instantiate them repeatedly in loops. |
| 48 **/ | 49 **/ |
| 49 regexps: { | 50 regexps: { |
| 50 unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header
|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popu
p|tweet|twitter/i, | 51 unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header
|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popu
p|tweet|twitter/i, |
| 51 okMaybeItsACandidate: /and|article|body|column|main|shadow/i, | 52 okMaybeItsACandidate: /and|article|body|column|main|shadow/i, |
| 52 positive: /article|body|content|entry|hentry|main|page|pagi
nation|post|text|blog|story/i, | 53 positive: /article|body|content|entry|hentry|main|page|pagi
nation|post|text|blog|story/i, |
| 53 negative: /combx|comment|com-|contact|foot|footer|footnote|
masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopp
ing|tags|tool|widget/i, | 54 negative: /combx|comment|com-|contact|foot|footer|footnote|
masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopp
ing|tags|tool|widget/i, |
| 54 extraneous: /print|archive|comment|discuss|e[\-]?mail|share|r
eply|all|login|sign|single/i, | 55 extraneous: /print|archive|comment|discuss|e[\-]?mail|share|r
eply|all|login|sign|single/i, |
| 55 divToPElements: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, | 56 divToPElements: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, |
| 56 replaceBrs: /(<br[^>]*>[ \n\r\t]*){2,}/gi, | 57 replaceBrs: /(<br[^>]*>[ \n\r\t]*){2,}/gi, |
| 57 replaceFonts: /<(\/?)font[^>]*>/gi, | 58 replaceFonts: /<(\/?)font[^>]*>/gi, |
| 58 trim: /^\s+|\s+$/g, | 59 trim: /^\s+|\s+$/g, |
| 59 normalize: /\s{2,}/g, | 60 normalize: /\s{2,}/g, |
| 60 killBreaks: /(<br\s*\/?>(\s| ?)*){1,}/g, | 61 killBreaks: /(<br\s*\/?>(\s| ?)*){1,}/g, |
| 61 videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i, | 62 videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i, |
| 62 skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)
\s*$/i, | 63 skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)
\s*$/i, |
| 63 nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
// Match: next, continue, >, >>, » but not >|, »| as those usually mean last. | 64 nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
// Match: next, continue, >, >>, » but not >|, »| as those usually mean last. |
| 64 prevLink: /(prev|earl|old|new|<|«)/i | 65 prevLink: /(prev|earl|old|new|<|«)/i |
| 65 }, | 66 }, |
| 66 | 67 |
| 67 /** | 68 /** |
| 68 * Runs readability. | 69 * Runs readability. |
| 69 * | 70 * |
| 70 * Workflow: | 71 * Workflow: |
| 71 * 1. Prep the document by removing script tags, css, etc. | 72 * 1. Prep the document by removing script tags, css, etc. |
| 72 * 2. Build readability's DOM tree. | 73 * 2. Build readability's DOM tree. |
| 73 * 3. Grab the article content from the current dom tree. | 74 * 3. Grab the article content from the current dom tree. |
| 74 * 4. Replace the current DOM tree with the new one. | 75 * 4. Replace the current DOM tree with the new one. |
| 75 * 5. Read peacefully. | 76 * 5. Read peacefully. |
| 76 * | 77 * |
| 77 * @return void | 78 * @return void |
| 78 **/ | 79 **/ |
| 79 init: function() { | 80 init: function() { |
| 80 /* Before we do anything, remove all scripts that are not readability. *
/ | 81 /* Before we do anything, remove all scripts that are not readability. *
/ |
| 81 window.onload = window.onunload = function() {}; | 82 window.onload = window.onunload = function() {}; |
| 82 | 83 |
| 83 readability.removeScripts(document); | 84 readability.removeScripts(document); |
| 84 | 85 |
| 85 /* Make sure this document is added to the list of parsed pages first, s
o we don't double up on the first page */ | 86 /* Make sure this document is added to the list of parsed pages first, s
o we don't double up on the first page */ |
| 86 readability.parsedPages[window.location.href.replace(/\/$/, '')] = true; | 87 readability.parsedPages[window.location.href.replace(/\/$/, '')] = true; |
| 87 | 88 |
| 88 /* Pull out any possible next page link first */ | 89 /* Pull out any possible next page link first */ |
| 89 var nextPageLink = readability.findNextPageLink(document.body); | 90 readability.nextPageLink = readability.findNextPageLink(document.body); |
| 90 | 91 |
| 92 /* We handle processing of the next page from C++, so set nextPageLink to null. */ |
| 93 var nextPageLink = null; |
| 94 |
| 91 readability.prepDocument(); | 95 readability.prepDocument(); |
| 92 | 96 |
| 93 /* Build readability's DOM tree */ | 97 /* Build readability's DOM tree */ |
| 94 var overlay = document.createElement("DIV"); | 98 var overlay = document.createElement("DIV"); |
| 95 var innerDiv = document.createElement("DIV"); | 99 var innerDiv = document.createElement("DIV"); |
| 96 var articleTools = readability.getArticleTools(); | 100 var articleTools = readability.getArticleTools(); |
| 97 var articleTitleText = readability.getArticleTitle(); | 101 var articleTitleText = readability.getArticleTitle(); |
| 98 var articleContent = readability.grabArticle(); | 102 var articleContent = readability.grabArticle(); |
| 99 | 103 |
| 100 if(!articleContent) { | 104 if(!articleContent) { |
| (...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 145 rootWarning.innerHTML = "<em>Readability</em> was intended for u
se on individual articles and not home pages. " + | 149 rootWarning.innerHTML = "<em>Readability</em> was intended for u
se on individual articles and not home pages. " + |
| 146 "If you'd like to try rendering this page anyway, <a onClick='ja
vascript:document.getElementById(\"readability-warning\").style.display=\"none\"
;document.getElementById(\"readability-content\").style.display=\"block\";'>clic
k here</a> to continue."; | 150 "If you'd like to try rendering this page anyway, <a onClick='ja
vascript:document.getElementById(\"readability-warning\").style.display=\"none\"
;document.getElementById(\"readability-content\").style.display=\"block\";'>clic
k here</a> to continue."; |
| 147 | 151 |
| 148 innerDiv.insertBefore( rootWarning, articleContent ); | 152 innerDiv.insertBefore( rootWarning, articleContent ); |
| 149 } | 153 } |
| 150 | 154 |
| 151 readability.postProcessContent(articleContent); | 155 readability.postProcessContent(articleContent); |
| 152 | 156 |
| 153 window.scrollTo(0, 0); | 157 window.scrollTo(0, 0); |
| 154 | 158 |
| 155 // TODO(bengr): Remove this assignment of null to nextPageLink when | |
| 156 // the processing of the next page link is safe. | |
| 157 nextPageLink = null; | |
| 158 | |
| 159 if (nextPageLink) { | 159 if (nextPageLink) { |
| 160 /** | 160 /** |
| 161 * Append any additional pages after a small timeout so that people | 161 * Append any additional pages after a small timeout so that people |
| 162 * can start reading without having to wait for this to finish proce
ssing. | 162 * can start reading without having to wait for this to finish proce
ssing. |
| 163 **/ | 163 **/ |
| 164 window.setTimeout(function() { | 164 window.setTimeout(function() { |
| 165 readability.appendNextPage(nextPageLink); | 165 readability.appendNextPage(nextPageLink); |
| 166 }, 500); | 166 }, 500); |
| 167 } | 167 } |
| 168 | 168 |
| 169 /** Smooth scrolling **/ | 169 /** Smooth scrolling **/ |
| 170 document.onkeydown = function(e) { | 170 document.onkeydown = function(e) { |
| 171 var code = (window.event) ? event.keyCode : e.keyCode; | 171 var code = (window.event) ? event.keyCode : e.keyCode; |
| 172 if (code === 16) { | 172 if (code === 16) { |
| 173 readability.reversePageScroll = true; | 173 readability.reversePageScroll = true; |
| 174 return; | 174 return; |
| 175 } | 175 } |
| 176 | 176 |
| 177 if (code === 32) { | 177 if (code === 32) { |
| 178 readability.curScrollStep = 0; | 178 readability.curScrollStep = 0; |
| 179 var windowHeight = window.innerHeight ? window.innerHeight : (do
cument.documentElement.clientHeight ? document.documentElement.clientHeight : do
cument.body.clientHeight); | 179 var windowHeight = window.innerHeight ? window.innerHeight : (do
cument.documentElement.clientHeight ? document.documentElement.clientHeight : do
cument.body.clientHeight); |
| 180 | 180 |
| 181 if(readability.reversePageScroll) { | 181 if(readability.reversePageScroll) { |
| 182 readability.scrollTo(readability.scrollTop(), readability.sc
rollTop() - (windowHeight - 50), 20, 10); | 182 readability.scrollTo(readability.scrollTop(), readability.sc
rollTop() - (windowHeight - 50), 20, 10); |
| 183 } | 183 } |
| 184 else { | 184 else { |
| 185 readability.scrollTo(readability.scrollTop(), readability.sc
rollTop() + (windowHeight - 50), 20, 10); | 185 readability.scrollTo(readability.scrollTop(), readability.sc
rollTop() + (windowHeight - 50), 20, 10); |
| 186 } | 186 } |
| 187 | 187 |
| 188 return false; | 188 return false; |
| 189 } | 189 } |
| 190 }; | 190 }; |
| 191 | 191 |
| 192 document.onkeyup = function(e) { | 192 document.onkeyup = function(e) { |
| 193 var code = (window.event) ? event.keyCode : e.keyCode; | 193 var code = (window.event) ? event.keyCode : e.keyCode; |
| 194 if (code === 16) { | 194 if (code === 16) { |
| 195 readability.reversePageScroll = false; | 195 readability.reversePageScroll = false; |
| 196 return; | 196 return; |
| 197 } | 197 } |
| 198 }; | 198 }; |
| 199 }, | 199 }, |
| 200 | 200 |
| 201 /** | 201 /** |
| 202 * Run any post-process modifications to article content as necessary. | 202 * Run any post-process modifications to article content as necessary. |
| 203 * | 203 * |
| 204 * @param Element | 204 * @param Element |
| 205 * @return void | 205 * @return void |
| 206 **/ | 206 **/ |
| 207 postProcessContent: function(articleContent) { | 207 postProcessContent: function(articleContent) { |
| 208 if(readability.convertLinksToFootnotes && !window.location.href.match(/w
ikipedia\.org/g)) { | 208 if(readability.convertLinksToFootnotes && !window.location.href.match(/w
ikipedia\.org/g)) { |
| 209 readability.addFootnotes(articleContent); | 209 readability.addFootnotes(articleContent); |
| 210 } | 210 } |
| 211 | 211 |
| 212 readability.fixImageFloats(articleContent); | 212 readability.fixImageFloats(articleContent); |
| 213 }, | 213 }, |
| 214 | 214 |
| 215 /** | 215 /** |
| 216 * Some content ends up looking ugly if the image is too large to be floated
. | 216 * Some content ends up looking ugly if the image is too large to be floated
. |
| 217 * If the image is wider than a threshold (currently 55%), no longer float i
t, | 217 * If the image is wider than a threshold (currently 55%), no longer float i
t, |
| 218 * center it instead. | 218 * center it instead. |
| 219 * | 219 * |
| 220 * @param Element | 220 * @param Element |
| 221 * @return void | 221 * @return void |
| 222 **/ | 222 **/ |
| 223 fixImageFloats: function (articleContent) { | 223 fixImageFloats: function (articleContent) { |
| 224 var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0.
55, | 224 var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0.
55, |
| 225 images = articleContent.getElementsByTagName('img'); | 225 images = articleContent.getElementsByTagName('img'); |
| 226 | 226 |
| 227 for(var i=0, il = images.length; i < il; i+=1) { | 227 for(var i=0, il = images.length; i < il; i+=1) { |
| 228 var image = images[i]; | 228 var image = images[i]; |
| 229 | 229 |
| 230 if(image.offsetWidth > imageWidthThreshold) { | 230 if(image.offsetWidth > imageWidthThreshold) { |
| 231 image.className += " blockImage"; | 231 image.className += " blockImage"; |
| 232 } | 232 } |
| 233 } | 233 } |
| 234 }, | 234 }, |
| 235 | 235 |
| 236 /** | 236 /** |
| 237 * Get the article tools Element that has buttons like reload, print. | 237 * Get the article tools Element that has buttons like reload, print. |
| 238 * | 238 * |
| 239 * @return Element | 239 * @return Element |
| 240 **/ | 240 **/ |
| 241 getArticleTools: function () { | 241 getArticleTools: function () { |
| 242 var articleTools = document.createElement("DIV"); | 242 var articleTools = document.createElement("DIV"); |
| 243 | 243 |
| 244 articleTools.id = "readTools"; | 244 articleTools.id = "readTools"; |
| 245 articleTools.innerHTML = | 245 articleTools.innerHTML = |
| 246 "<a href='#' onclick='return window.location.reload()' title='Reload
original page' id='reload-page'>Reload Original Page</a>" + | 246 "<a href='#' onclick='return window.location.reload()' title='Reload
original page' id='reload-page'>Reload Original Page</a>" + |
| 247 "<a href='#' onclick='javascript:window.print();' title='Print page'
id='print-page'>Print Page</a>" + | 247 "<a href='#' onclick='javascript:window.print();' title='Print page'
id='print-page'>Print Page</a>" + |
| 248 "<a href='#' onclick='readability.emailBox(); return false;' title='
Email page' id='email-page'>Email Page</a>"; | 248 "<a href='#' onclick='readability.emailBox(); return false;' title='
Email page' id='email-page'>Email Page</a>"; |
| 249 | 249 |
| 250 return articleTools; | 250 return articleTools; |
| 251 }, | 251 }, |
| 252 | 252 |
| 253 /** | 253 /** |
| 254 * returns the suggested direction of the string | 254 * returns the suggested direction of the string |
| 255 * | 255 * |
| 256 * @return "rtl" || "ltr" | 256 * @return "rtl" || "ltr" |
| 257 **/ | 257 **/ |
| 258 getSuggestedDirection: function(text) { | 258 getSuggestedDirection: function(text) { |
| 259 function sanitizeText() { | 259 function sanitizeText() { |
| 260 return text.replace(/@\w+/, ""); | 260 return text.replace(/@\w+/, ""); |
| 261 } | 261 } |
| 262 | 262 |
| 263 function countMatches(match) { | 263 function countMatches(match) { |
| 264 var matches = text.match(new RegExp(match, "g")); | 264 var matches = text.match(new RegExp(match, "g")); |
| 265 return matches !== null ? matches.length : 0; | 265 return matches !== null ? matches.length : 0; |
| 266 } | 266 } |
| 267 | 267 |
| 268 function isRTL() { | 268 function isRTL() { |
| 269 var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]"); | 269 var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]"); |
| 270 var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]"); | 270 var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]"); |
| 271 | 271 |
| 272 // if more than 20% of chars are Hebrew or Arabic then direction is rtl | 272 // if more than 20% of chars are Hebrew or Arabic then direction is rtl |
| 273 return (count_heb + count_arb) * 100 / text.length > 20; | 273 return (count_heb + count_arb) * 100 / text.length > 20; |
| 274 } | 274 } |
| 275 | 275 |
| 276 text = sanitizeText(text); | 276 text = sanitizeText(text); |
| 277 return isRTL() ? "rtl" : "ltr"; | 277 return isRTL() ? "rtl" : "ltr"; |
| 278 }, | 278 }, |
| 279 | 279 |
| 280 /** | 280 /** |
| 281 * Get the article title as an H1. | 281 * Get the article title as an H1. |
| 282 * | 282 * |
| 283 * @return string | 283 * @return string |
| 284 **/ | 284 **/ |
| 285 getArticleTitle: function () { | 285 getArticleTitle: function () { |
| 286 var curTitle = "", | 286 var curTitle = "", |
| 287 origTitle = ""; | 287 origTitle = ""; |
| 288 | 288 |
| 289 try { | 289 try { |
| 290 curTitle = origTitle = document.title; | 290 curTitle = origTitle = document.title; |
| 291 if(typeof curTitle !== "string") { /* If they had an element with id
"title" in their HTML */ | 291 if(typeof curTitle !== "string") { /* If they had an element with id
"title" in their HTML */ |
| 292 curTitle = origTitle = readability.getInnerText(document.getElem
entsByTagName('title')[0]); | 292 curTitle = origTitle = readability.getInnerText(document.getElem
entsByTagName('title')[0]); |
| 293 } | 293 } |
| 294 } | 294 } |
| 295 catch(e) {} | 295 catch(e) {} |
| 296 | 296 |
| 297 if(curTitle.match(/ [\|\-] /)) | 297 if(curTitle.match(/ [\|\-] /)) |
| 298 { | 298 { |
| 299 curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1'); | 299 curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1'); |
| 300 | 300 |
| 301 if(curTitle.split(' ').length < 3) { | 301 if(curTitle.split(' ').length < 3) { |
| 302 curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1'); | 302 curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1'); |
| 303 } | 303 } |
| 304 } | 304 } |
| 305 else if(curTitle.indexOf(': ') !== -1) | 305 else if(curTitle.indexOf(': ') !== -1) |
| 306 { | 306 { |
| 307 curTitle = origTitle.replace(/.*:(.*)/gi, '$1'); | 307 curTitle = origTitle.replace(/.*:(.*)/gi, '$1'); |
| 308 | 308 |
| 309 if(curTitle.split(' ').length < 3) { | 309 if(curTitle.split(' ').length < 3) { |
| 310 curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1'); | 310 curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1'); |
| (...skipping 12 matching lines...) Expand all Loading... |
| 323 | 323 |
| 324 if(curTitle.split(' ').length <= 4) { | 324 if(curTitle.split(' ').length <= 4) { |
| 325 curTitle = origTitle; | 325 curTitle = origTitle; |
| 326 } | 326 } |
| 327 return curTitle; | 327 return curTitle; |
| 328 }, | 328 }, |
| 329 | 329 |
| 330 /** | 330 /** |
| 331 * Prepare the HTML document for readability to scrape it. | 331 * Prepare the HTML document for readability to scrape it. |
| 332 * This includes things like stripping javascript, CSS, and handling terribl
e markup. | 332 * This includes things like stripping javascript, CSS, and handling terribl
e markup. |
| 333 * | 333 * |
| 334 * @return void | 334 * @return void |
| 335 **/ | 335 **/ |
| 336 prepDocument: function () { | 336 prepDocument: function () { |
| 337 /** | 337 /** |
| 338 * In some cases a body element can't be found (if the HTML is totally h
osed for example) | 338 * In some cases a body element can't be found (if the HTML is totally h
osed for example) |
| 339 * so we create a new body node and append it to the document. | 339 * so we create a new body node and append it to the document. |
| 340 */ | 340 */ |
| 341 if(document.body === null) | 341 if(document.body === null) |
| 342 { | 342 { |
| 343 var body = document.createElement("body"); | 343 var body = document.createElement("body"); |
| 344 try { | 344 try { |
| 345 document.body = body; | 345 document.body = body; |
| 346 } | 346 } |
| 347 catch(e) { | 347 catch(e) { |
| 348 document.documentElement.appendChild(body); | 348 document.documentElement.appendChild(body); |
| 349 dbg(e); | 349 dbg(e); |
| 350 } | 350 } |
| 351 } | 351 } |
| 352 | 352 |
| 353 document.body.id = "readabilityBody"; | 353 document.body.id = "readabilityBody"; |
| 354 | 354 |
| 355 var frames = document.getElementsByTagName('frame'); | 355 var frames = document.getElementsByTagName('frame'); |
| (...skipping 11 matching lines...) Expand all Loading... |
| 367 canAccessFrame = true; | 367 canAccessFrame = true; |
| 368 } | 368 } |
| 369 catch(eFrames) { | 369 catch(eFrames) { |
| 370 dbg(eFrames); | 370 dbg(eFrames); |
| 371 } | 371 } |
| 372 | 372 |
| 373 if(frameSize > biggestFrameSize) { | 373 if(frameSize > biggestFrameSize) { |
| 374 biggestFrameSize = frameSize; | 374 biggestFrameSize = frameSize; |
| 375 readability.biggestFrame = frames[frameIndex]; | 375 readability.biggestFrame = frames[frameIndex]; |
| 376 } | 376 } |
| 377 | 377 |
| 378 if(canAccessFrame && frameSize > bestFrameSize) | 378 if(canAccessFrame && frameSize > bestFrameSize) |
| 379 { | 379 { |
| 380 readability.frameHack = true; | 380 readability.frameHack = true; |
| 381 | 381 |
| 382 bestFrame = frames[frameIndex]; | 382 bestFrame = frames[frameIndex]; |
| 383 bestFrameSize = frameSize; | 383 bestFrameSize = frameSize; |
| 384 } | 384 } |
| 385 } | 385 } |
| 386 | 386 |
| 387 if(bestFrame) | 387 if(bestFrame) |
| 388 { | 388 { |
| 389 var newBody = document.createElement('body'); | 389 var newBody = document.createElement('body'); |
| 390 readability.moveNodeInnards(bestFrame.contentWindow.document.bod
y, newBody); | 390 readability.moveNodeInnards(bestFrame.contentWindow.document.bod
y, newBody); |
| 391 newBody.style.overflow = 'scroll'; | 391 newBody.style.overflow = 'scroll'; |
| 392 document.body = newBody; | 392 document.body = newBody; |
| 393 | 393 |
| 394 var frameset = document.getElementsByTagName('frameset')[0]; | 394 var frameset = document.getElementsByTagName('frameset')[0]; |
| 395 if(frameset) { | 395 if(frameset) { |
| 396 frameset.parentNode.removeChild(frameset); } | 396 frameset.parentNode.removeChild(frameset); } |
| 397 } | 397 } |
| 398 } | 398 } |
| 399 | 399 |
| 400 /* Remove all stylesheets */ | 400 /* Remove all stylesheets */ |
| 401 for (var k=0;k < document.styleSheets.length; k+=1) { | 401 for (var k=0;k < document.styleSheets.length; k+=1) { |
| 402 if (document.styleSheets[k].href !== null && document.styleSheets[k]
.href.lastIndexOf("readability") === -1) { | 402 if (document.styleSheets[k].href !== null && document.styleSheets[k]
.href.lastIndexOf("readability") === -1) { |
| 403 document.styleSheets[k].disabled = true; | 403 document.styleSheets[k].disabled = true; |
| (...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 448 readability.cleanConditionally(articleContent, "table"); | 448 readability.cleanConditionally(articleContent, "table"); |
| 449 readability.cleanConditionally(articleContent, "ul"); | 449 readability.cleanConditionally(articleContent, "ul"); |
| 450 readability.cleanConditionally(articleContent, "div"); | 450 readability.cleanConditionally(articleContent, "div"); |
| 451 | 451 |
| 452 /* Remove extra paragraphs */ | 452 /* Remove extra paragraphs */ |
| 453 var articleParagraphs = articleContent.getElementsByTagName('p'); | 453 var articleParagraphs = articleContent.getElementsByTagName('p'); |
| 454 for(var i = articleParagraphs.length-1; i >= 0; i-=1) { | 454 for(var i = articleParagraphs.length-1; i >= 0; i-=1) { |
| 455 var imgCount = articleParagraphs[i].getElementsByTagName('img').l
ength; | 455 var imgCount = articleParagraphs[i].getElementsByTagName('img').l
ength; |
| 456 var embedCount = articleParagraphs[i].getElementsByTagName('embed')
.length; | 456 var embedCount = articleParagraphs[i].getElementsByTagName('embed')
.length; |
| 457 var objectCount = articleParagraphs[i].getElementsByTagName('object'
).length; | 457 var objectCount = articleParagraphs[i].getElementsByTagName('object'
).length; |
| 458 | 458 |
| 459 if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readab
ility.getInnerText(articleParagraphs[i], false) === '') { | 459 if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readab
ility.getInnerText(articleParagraphs[i], false) === '') { |
| 460 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]
); | 460 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]
); |
| 461 } | 461 } |
| 462 } | 462 } |
| 463 | 463 |
| 464 try { | 464 try { |
| 465 readability.replaceBrsWithPs(articleContent); | 465 readability.replaceBrsWithPs(articleContent); |
| 466 } | 466 } |
| 467 catch (e) { | 467 catch (e) { |
| 468 dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block
-elements bug. Ignoring.: " + e); | 468 dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block
-elements bug. Ignoring.: " + e); |
| 469 } | 469 } |
| 470 }, | 470 }, |
| 471 | 471 |
| 472 /** | 472 /** |
| 473 * Initialize a node with the readability object. Also checks the | 473 * Initialize a node with the readability object. Also checks the |
| 474 * className/id for special names to add to its score. | 474 * className/id for special names to add to its score. |
| 475 * | 475 * |
| 476 * @param Element | 476 * @param Element |
| 477 * @return void | 477 * @return void |
| 478 **/ | 478 **/ |
| 479 initializeNode: function (node) { | 479 initializeNode: function (node) { |
| 480 node.readability = {"contentScore": 0}; | 480 node.readability = {"contentScore": 0}; |
| 481 | 481 |
| 482 switch(node.tagName) { | 482 switch(node.tagName) { |
| 483 case 'DIV': | 483 case 'DIV': |
| 484 node.readability.contentScore += 5; | 484 node.readability.contentScore += 5; |
| 485 break; | 485 break; |
| 486 | 486 |
| 487 case 'PRE': | 487 case 'PRE': |
| 488 case 'TD': | 488 case 'TD': |
| 489 case 'BLOCKQUOTE': | 489 case 'BLOCKQUOTE': |
| 490 node.readability.contentScore += 3; | 490 node.readability.contentScore += 3; |
| 491 break; | 491 break; |
| 492 | 492 |
| 493 case 'ADDRESS': | 493 case 'ADDRESS': |
| 494 case 'OL': | 494 case 'OL': |
| 495 case 'UL': | 495 case 'UL': |
| 496 case 'DL': | 496 case 'DL': |
| 497 case 'DD': | 497 case 'DD': |
| 498 case 'DT': | 498 case 'DT': |
| 499 case 'LI': | 499 case 'LI': |
| 500 case 'FORM': | 500 case 'FORM': |
| 501 node.readability.contentScore -= 3; | 501 node.readability.contentScore -= 3; |
| 502 break; | 502 break; |
| 503 | 503 |
| 504 case 'H1': | 504 case 'H1': |
| 505 case 'H2': | 505 case 'H2': |
| 506 case 'H3': | 506 case 'H3': |
| 507 case 'H4': | 507 case 'H4': |
| 508 case 'H5': | 508 case 'H5': |
| 509 case 'H6': | 509 case 'H6': |
| 510 case 'TH': | 510 case 'TH': |
| 511 node.readability.contentScore -= 5; | 511 node.readability.contentScore -= 5; |
| 512 break; | 512 break; |
| 513 } | 513 } |
| 514 | 514 |
| 515 node.readability.contentScore += readability.getClassWeight(node); | 515 node.readability.contentScore += readability.getClassWeight(node); |
| 516 }, | 516 }, |
| 517 | 517 |
| 518 /*** | 518 /*** |
| 519 * grabArticle - Using a variety of metrics (content score, classname, eleme
nt types), find the content that is | 519 * grabArticle - Using a variety of metrics (content score, classname, eleme
nt types), find the content that is |
| 520 * most likely to be the stuff a user wants to read. Then retu
rn it wrapped up in a div. | 520 * most likely to be the stuff a user wants to read. Then retu
rn it wrapped up in a div. |
| 521 * | 521 * |
| 522 * @param page a document to run upon. Needs to be a full document, complete
with body. | 522 * @param page a document to run upon. Needs to be a full document, complete
with body. |
| 523 * @return Element | 523 * @return Element |
| 524 **/ | 524 **/ |
| 525 grabArticle: function (pageToClone) { | 525 grabArticle: function (pageToClone) { |
| 526 var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_
STRIP_UNLIKELYS), | 526 var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_
STRIP_UNLIKELYS), |
| 527 isPaging = (page !== null) ? true: false; | 527 isPaging = (page !== null) ? true: false; |
| 528 | 528 |
| 529 var page = null; | 529 var page = null; |
| 530 // Never work on the actual page. | 530 // Never work on the actual page. |
| 531 if (isPaging) { | 531 if (isPaging) { |
| 532 page = document.body.cloneNode(true); | 532 page = document.body.cloneNode(true); |
| 533 } else { | 533 } else { |
| 534 page = pageToClone.cloneNode(true); | 534 page = pageToClone.cloneNode(true); |
| 535 } | 535 } |
| 536 | 536 |
| 537 var allElements = page.getElementsByTagName('*'); | 537 var allElements = page.getElementsByTagName('*'); |
| 538 | 538 |
| 539 /** | 539 /** |
| 540 * First, node prepping. Trash nodes that look cruddy (like ones with th
e class name "comment", etc), and turn divs | 540 * First, node prepping. Trash nodes that look cruddy (like ones with th
e class name "comment", etc), and turn divs |
| 541 * into P tags where they have been used inappropriately (as in, where t
hey contain no other block level elements.) | 541 * into P tags where they have been used inappropriately (as in, where t
hey contain no other block level elements.) |
| 542 * | 542 * |
| 543 * Note: Assignment from index for performance. See http://www.peachpit.
com/articles/article.aspx?p=31567&seqNum=5 | 543 * Note: Assignment from index for performance. See http://www.peachpit.
com/articles/article.aspx?p=31567&seqNum=5 |
| 544 * TODO: Shouldn't this be a reverse traversal? | 544 * TODO: Shouldn't this be a reverse traversal? |
| 545 **/ | 545 **/ |
| 546 var node = null; | 546 var node = null; |
| 547 var nodesToScore = []; | 547 var nodesToScore = []; |
| 548 for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) { | 548 for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) { |
| 549 /* Remove unlikely candidates */ | 549 /* Remove unlikely candidates */ |
| 550 if (stripUnlikelyCandidates) { | 550 if (stripUnlikelyCandidates) { |
| 551 var unlikelyMatchString = node.className + node.id; | 551 var unlikelyMatchString = node.className + node.id; |
| 552 if ( | 552 if ( |
| 553 ( | 553 ( |
| 554 unlikelyMatchString.search(readability.regexps.unlikelyC
andidates) !== -1 && | 554 unlikelyMatchString.search(readability.regexps.unlikelyC
andidates) !== -1 && |
| 555 unlikelyMatchString.search(readability.regexps.okMaybeIt
sACandidate) === -1 && | 555 unlikelyMatchString.search(readability.regexps.okMaybeIt
sACandidate) === -1 && |
| 556 node.tagName !== "BODY" | 556 node.tagName !== "BODY" |
| 557 ) | 557 ) |
| 558 ) | 558 ) |
| 559 { | 559 { |
| 560 dbg("Removing unlikely candidate - " + unlikelyMatchString); | 560 dbg("Removing unlikely candidate - " + unlikelyMatchString); |
| 561 node.parentNode.removeChild(node); | 561 node.parentNode.removeChild(node); |
| 562 nodeIndex-=1; | 562 nodeIndex-=1; |
| 563 continue; | 563 continue; |
| 564 } | 564 } |
| 565 } | 565 } |
| 566 | 566 |
| 567 if (node.tagName === "P" || node.tagName === "TD" || node.tagName ==
= "PRE") { | 567 if (node.tagName === "P" || node.tagName === "TD" || node.tagName ==
= "PRE") { |
| 568 nodesToScore[nodesToScore.length] = node; | 568 nodesToScore[nodesToScore.length] = node; |
| 569 } | 569 } |
| 570 | 570 |
| 571 /* Turn all divs that don't have children block level elements into
p's */ | 571 /* Turn all divs that don't have children block level elements into
p's */ |
| 572 if (node.tagName === "DIV") { | 572 if (node.tagName === "DIV") { |
| 573 if (node.innerHTML.search(readability.regexps.divToPElements) ==
= -1) { | 573 if (node.innerHTML.search(readability.regexps.divToPElements) ==
= -1) { |
| 574 var newNode = document.createElement('p'); | 574 var newNode = document.createElement('p'); |
| (...skipping 16 matching lines...) Expand all Loading... |
| 591 if(childNode.nodeType === 3) { // Node.TEXT_NODE | 591 if(childNode.nodeType === 3) { // Node.TEXT_NODE |
| 592 var p = document.createElement('p'); | 592 var p = document.createElement('p'); |
| 593 var t = document.createTextNode(childNode.nodeValue)
; | 593 var t = document.createTextNode(childNode.nodeValue)
; |
| 594 p.appendChild(t); | 594 p.appendChild(t); |
| 595 p.style.display = 'inline'; | 595 p.style.display = 'inline'; |
| 596 p.className = 'readability-styled'; | 596 p.className = 'readability-styled'; |
| 597 childNode.parentNode.replaceChild(p, childNode); | 597 childNode.parentNode.replaceChild(p, childNode); |
| 598 } | 598 } |
| 599 } | 599 } |
| 600 } | 600 } |
| 601 } | 601 } |
| 602 } | 602 } |
| 603 | 603 |
| 604 /** | 604 /** |
| 605 * Loop through all paragraphs, and assign a score to them based on how
content-y they look. | 605 * Loop through all paragraphs, and assign a score to them based on how
content-y they look. |
| 606 * Then add their score to their parent node. | 606 * Then add their score to their parent node. |
| 607 * | 607 * |
| 608 * A score is determined by things like number of commas, class names, e
tc. Maybe eventually link density. | 608 * A score is determined by things like number of commas, class names, e
tc. Maybe eventually link density. |
| 609 **/ | 609 **/ |
| 610 var candidates = []; | 610 var candidates = []; |
| 611 for (var pt=0; pt < nodesToScore.length; pt+=1) { | 611 for (var pt=0; pt < nodesToScore.length; pt+=1) { |
| (...skipping 21 matching lines...) Expand all Loading... |
| 633 candidates.push(grandParentNode); | 633 candidates.push(grandParentNode); |
| 634 } | 634 } |
| 635 | 635 |
| 636 var contentScore = 0; | 636 var contentScore = 0; |
| 637 | 637 |
| 638 /* Add a point for the paragraph itself as a base. */ | 638 /* Add a point for the paragraph itself as a base. */ |
| 639 contentScore+=1; | 639 contentScore+=1; |
| 640 | 640 |
| 641 /* Add points for any commas within this paragraph */ | 641 /* Add points for any commas within this paragraph */ |
| 642 contentScore += innerText.split(',').length; | 642 contentScore += innerText.split(',').length; |
| 643 | 643 |
| 644 /* For every 100 characters in this paragraph, add another point. Up
to 3 points. */ | 644 /* For every 100 characters in this paragraph, add another point. Up
to 3 points. */ |
| 645 contentScore += Math.min(Math.floor(innerText.length / 100), 3); | 645 contentScore += Math.min(Math.floor(innerText.length / 100), 3); |
| 646 | 646 |
| 647 /* Add the score to the parent. The grandparent gets half. */ | 647 /* Add the score to the parent. The grandparent gets half. */ |
| 648 parentNode.readability.contentScore += contentScore; | 648 parentNode.readability.contentScore += contentScore; |
| 649 | 649 |
| 650 if(grandParentNode) { | 650 if(grandParentNode) { |
| 651 grandParentNode.readability.contentScore += contentScore/2;
| 651 grandParentNode.readability.contentScore += contentScore/2; |
| 652 } | 652 } |
| 653 } | 653 } |
| 654 | 654 |
| 655 /** | 655 /** |
| 656 * After we've calculated scores, loop through all of the possible candi
date nodes we found | 656 * After we've calculated scores, loop through all of the possible candi
date nodes we found |
| 657 * and find the one with the highest score. | 657 * and find the one with the highest score. |
| 658 **/ | 658 **/ |
| 659 var topCandidate = null; | 659 var topCandidate = null; |
| 660 for(var c=0, cl=candidates.length; c < cl; c+=1) | 660 for(var c=0, cl=candidates.length; c < cl; c+=1) |
| 661 { | 661 { |
| (...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 718 var contentBonus = 0; | 718 var contentBonus = 0; |
| 719 /* Give a bonus if sibling nodes and top candidates have the example
same classname */ | 719 /* Give a bonus if sibling nodes and top candidates have the example
same classname */ |
| 720 if(siblingNode.className === topCandidate.className && topCandidate.
className !== "") { | 720 if(siblingNode.className === topCandidate.className && topCandidate.
className !== "") { |
| 721 contentBonus += topCandidate.readability.contentScore * 0.2; | 721 contentBonus += topCandidate.readability.contentScore * 0.2; |
| 722 } | 722 } |
| 723 | 723 |
| 724 if(typeof siblingNode.readability !== 'undefined' && (siblingNode.re
adability.contentScore+contentBonus) >= siblingScoreThreshold) | 724 if(typeof siblingNode.readability !== 'undefined' && (siblingNode.re
adability.contentScore+contentBonus) >= siblingScoreThreshold) |
| 725 { | 725 { |
| 726 append = true; | 726 append = true; |
| 727 } | 727 } |
| 728 | 728 |
| 729 if(siblingNode.nodeName === "P") { | 729 if(siblingNode.nodeName === "P") { |
| 730 var linkDensity = readability.getLinkDensity(siblingNode); | 730 var linkDensity = readability.getLinkDensity(siblingNode); |
| 731 var nodeContent = readability.getInnerText(siblingNode); | 731 var nodeContent = readability.getInnerText(siblingNode); |
| 732 var nodeLength = nodeContent.length; | 732 var nodeLength = nodeContent.length; |
| 733 | 733 |
| 734 if(nodeLength > 80 && linkDensity < 0.25) | 734 if(nodeLength > 80 && linkDensity < 0.25) |
| 735 { | 735 { |
| 736 append = true; | 736 append = true; |
| 737 } | 737 } |
| 738 else if(nodeLength < 80 && linkDensity === 0 && nodeContent.sear
ch(/\.( |$)/) !== -1) | 738 else if(nodeLength < 80 && linkDensity === 0 && nodeContent.sear
ch(/\.( |$)/) !== -1) |
| 739 { | 739 { |
| 740 append = true; | 740 append = true; |
| 741 } | 741 } |
| 742 } | 742 } |
| 743 | 743 |
| 744 if(append) { | 744 if(append) { |
| 745 dbg("Appending node: " + siblingNode); | 745 dbg("Appending node: " + siblingNode); |
| 746 | 746 |
| 747 var nodeToAppend = null; | 747 var nodeToAppend = null; |
| 748 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P
") { | 748 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P
") { |
| 749 /* We have a node that isn't a common block level element, l
ike a form or td tag. Turn it into a div so it doesn't get filtered out later by
accident. */ | 749 /* We have a node that isn't a common block level element, l
ike a form or td tag. Turn it into a div so it doesn't get filtered out later by
accident. */ |
| 750 | 750 |
| 751 dbg("Altering siblingNode of " + siblingNode.nodeName + ' to
div.'); | 751 dbg("Altering siblingNode of " + siblingNode.nodeName + ' to
div.'); |
| 752 nodeToAppend = document.createElement("DIV"); | 752 nodeToAppend = document.createElement("DIV"); |
| 753 try { | 753 try { |
| 754 nodeToAppend.id = siblingNode.id; | 754 nodeToAppend.id = siblingNode.id; |
| 755 readability.moveNodeInnards(siblingNode, nodeToAppend); | 755 readability.moveNodeInnards(siblingNode, nodeToAppend); |
| 756 } | 756 } |
| 757 catch(er) { | 757 catch(er) { |
| 758 dbg("Could not alter siblingNode to div, probably an IE
restriction, reverting back to original."); | 758 dbg("Could not alter siblingNode to div, probably an IE
restriction, reverting back to original."); |
| 759 nodeToAppend = siblingNode; | 759 nodeToAppend = siblingNode; |
| 760 s-=1; | 760 s-=1; |
| 761 sl-=1; | 761 sl-=1; |
| 762 } | 762 } |
| 763 } else { | 763 } else { |
| 764 nodeToAppend = siblingNode; | 764 nodeToAppend = siblingNode; |
| 765 s-=1; | 765 s-=1; |
| 766 sl-=1; | 766 sl-=1; |
| 767 } | 767 } |
| 768 | 768 |
| 769 /* To ensure a node does not interfere with readability styles,
remove its classnames */ | 769 /* To ensure a node does not interfere with readability styles,
remove its classnames */ |
| 770 nodeToAppend.className = ""; | 770 nodeToAppend.className = ""; |
| 771 | 771 |
| 772 /* Append sibling and subtract from our list because it removes
the node when you append to another node */ | 772 /* Append sibling and subtract from our list because it removes
the node when you append to another node */ |
| 773 articleContent.appendChild(nodeToAppend); | 773 articleContent.appendChild(nodeToAppend); |
| 774 } | 774 } |
| 775 } | 775 } |
| 776 | 776 |
| 777 /** | 777 /** |
| 778 * So we have all of the content that we need. Now we clean it up for pr
esentation. | 778 * So we have all of the content that we need. Now we clean it up for pr
esentation. |
| 779 **/ | 779 **/ |
| 780 readability.distilledArticleContent = articleContent.cloneNode(true); | 780 readability.distilledArticleContent = articleContent.cloneNode(true); |
| 781 //readability.prepArticle(articleContent); | 781 //readability.prepArticle(articleContent); |
| 782 | 782 |
| 783 if (readability.curPageNum === 1) { | 783 if (readability.curPageNum === 1) { |
| 784 var newNode = document.createElement('div'); | 784 var newNode = document.createElement('div'); |
| 785 newNode.id = "readability-page-1"; | 785 newNode.id = "readability-page-1"; |
| 786 newNode.setAttribute("class", "page"); | 786 newNode.setAttribute("class", "page"); |
| 787 readability.moveNodeInnards(articleContent, newNode); | 787 readability.moveNodeInnards(articleContent, newNode); |
| 788 articleContent.appendChild(newNode); | 788 articleContent.appendChild(newNode); |
| 789 } | 789 } |
| 790 | 790 |
| 791 /** | 791 /** |
| 792 * Now that we've gone through the full algorithm, check to see if we go
t any meaningful content. | 792 * Now that we've gone through the full algorithm, check to see if we go
t any meaningful content. |
| 793 * If we didn't, we may need to re-run grabArticle with different flags
set. This gives us a higher | 793 * If we didn't, we may need to re-run grabArticle with different flags
set. This gives us a higher |
| 794 * likelihood of finding the content, and the sieve approach gives us a
higher likelihood of | 794 * likelihood of finding the content, and the sieve approach gives us a
higher likelihood of |
| 795 * finding the -right- content. | 795 * finding the -right- content. |
| 796 **/ | 796 **/ |
| 797 if(readability.getInnerText(articleContent, false).length < 250) { | 797 if(readability.getInnerText(articleContent, false).length < 250) { |
| 798 if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) { | 798 if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) { |
| 799 readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS); | 799 readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS); |
| 800 return readability.grabArticle(document.body); | 800 return readability.grabArticle(document.body); |
| 801 } | 801 } |
| 802 else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES))
{ | 802 else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES))
{ |
| 803 readability.removeFlag(readability.FLAG_WEIGHT_CLASSES); | 803 readability.removeFlag(readability.FLAG_WEIGHT_CLASSES); |
| 804 return readability.grabArticle(document.body); | 804 return readability.grabArticle(document.body); |
| 805 } | 805 } |
| 806 else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONAL
LY)) { | 806 else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONAL
LY)) { |
| 807 readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY); | 807 readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY); |
| 808 return readability.grabArticle(document.body); | 808 return readability.grabArticle(document.body); |
| 809 } else { | 809 } else { |
| 810 return null; | 810 return null; |
| 811 } | 811 } |
| 812 } | 812 } |
| 813 | 813 |
| 814 return articleContent; | 814 return articleContent; |
| 815 }, | 815 }, |
| 816 | 816 |
| 817 /** | 817 /** |
| 818 * Removes script tags from the document. | 818 * Removes script tags from the document. |
| 819 * | 819 * |
| 820 * @param Element | 820 * @param Element |
| 821 **/ | 821 **/ |
| 822 removeScripts: function (doc) { | 822 removeScripts: function (doc) { |
| 823 var scripts = doc.getElementsByTagName('script'); | 823 var scripts = doc.getElementsByTagName('script'); |
| 824 for(var i = scripts.length-1; i >= 0; i-=1) | 824 for(var i = scripts.length-1; i >= 0; i-=1) |
| 825 { | 825 { |
| 826 if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf
('readability') === -1 && scripts[i].src.indexOf('typekit') === -1)) | 826 if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf
('readability') === -1 && scripts[i].src.indexOf('typekit') === -1)) |
| 827 { | 827 { |
| 828 scripts[i].nodeValue=""; | 828 scripts[i].nodeValue=""; |
| 829 scripts[i].removeAttribute('src'); | 829 scripts[i].removeAttribute('src'); |
| 830 if (scripts[i].parentNode) { | 830 if (scripts[i].parentNode) { |
| 831 scripts[i].parentNode.removeChild(scripts[i]); | 831 scripts[i].parentNode.removeChild(scripts[i]); |
| 832 } | 832 } |
| 833 } | 833 } |
| 834 } | 834 } |
| 835 }, | 835 }, |
| 836 | 836 |
| 837 /** | 837 /** |
| 838 * Get the inner text of a node - cross browser compatibly. | 838 * Get the inner text of a node - cross browser compatibly. |
| 839 * This also strips out any excess whitespace to be found. | 839 * This also strips out any excess whitespace to be found. |
| 840 * | 840 * |
| 841 * @param Element | 841 * @param Element |
| 842 * @return string | 842 * @return string |
| 843 **/ | 843 **/ |
| 844 getInnerText: function (e, normalizeSpaces) { | 844 getInnerText: function (e, normalizeSpaces) { |
| 845 var textContent = ""; | 845 var textContent = ""; |
| 846 | 846 |
| (...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 889 | 889 |
| 890 // Remove any root styles, if we're able. | 890 // Remove any root styles, if we're able. |
| 891 if(typeof e.removeAttribute === 'function' && e.className !== 'readabili
ty-styled') { | 891 if(typeof e.removeAttribute === 'function' && e.className !== 'readabili
ty-styled') { |
| 892 e.removeAttribute('style'); } | 892 e.removeAttribute('style'); } |
| 893 | 893 |
| 894 // Go until there are no more child nodes | 894 // Go until there are no more child nodes |
| 895 while ( cur !== null ) { | 895 while ( cur !== null ) { |
| 896 if ( cur.nodeType === 1 ) { | 896 if ( cur.nodeType === 1 ) { |
| 897 // Remove style attribute(s) : | 897 // Remove style attribute(s) : |
| 898 if(cur.className !== "readability-styled") { | 898 if(cur.className !== "readability-styled") { |
| 899 cur.removeAttribute("style"); | 899 cur.removeAttribute("style"); |
| 900 } | 900 } |
| 901 readability.cleanStyles( cur ); | 901 readability.cleanStyles( cur ); |
| 902 } | 902 } |
| 903 cur = cur.nextSibling; | 903 cur = cur.nextSibling; |
| 904 } | 904 } |
| 905 }, | 905 }, |
| 906 | 906 |
| 907 /** | 907 /** |
| 908 * Get the density of links as a percentage of the content | 908 * Get the density of links as a percentage of the content |
| 909 * This is the amount of text that is inside a link divided by the total tex
t in the node. | 909 * This is the amount of text that is inside a link divided by the total tex
t in the node. |
| 910 * | 910 * |
| 911 * @param Element | 911 * @param Element |
| 912 * @return number (float) | 912 * @return number (float) |
| 913 **/ | 913 **/ |
| 914 getLinkDensity: function (e) { | 914 getLinkDensity: function (e) { |
| 915 var links = e.getElementsByTagName("a"); | 915 var links = e.getElementsByTagName("a"); |
| 916 var textLength = readability.getInnerText(e).length; | 916 var textLength = readability.getInnerText(e).length; |
| 917 var linkLength = 0; | 917 var linkLength = 0; |
| 918 for(var i=0, il=links.length; i<il;i+=1) | 918 for(var i=0, il=links.length; i<il;i+=1) |
| 919 { | 919 { |
| 920 linkLength += readability.getInnerText(links[i]).length; | 920 linkLength += readability.getInnerText(links[i]).length; |
| 921 } | 921 } |
| 922 | 922 |
| 923 return linkLength / textLength; | 923 return linkLength / textLength; |
| 924 }, | 924 }, |
| 925 | 925 |
| 926 /** | 926 /** |
| 927 * Find a cleaned up version of the current URL, to use for comparing links
for possible next-pageyness. | 927 * Find a cleaned up version of the current URL, to use for comparing links
for possible next-pageyness. |
| 928 * | 928 * |
| 929 * @author Dan Lacy | 929 * @author Dan Lacy |
| 930 * @return string the base url | 930 * @return string the base url |
| 931 **/ | 931 **/ |
| 932 findBaseUrl: function () { | 932 findBaseUrl: function () { |
| 933 var noUrlParams = window.location.pathname.split("?")[0], | 933 var noUrlParams = window.location.pathname.split("?")[0], |
| 934 urlSlashes = noUrlParams.split("/").reverse(), | 934 urlSlashes = noUrlParams.split("/").reverse(), |
| 935 cleanedSegments = [], | 935 cleanedSegments = [], |
| 936 possibleType = ""; | 936 possibleType = ""; |
| 937 | 937 |
| 938 for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) { | 938 for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) { |
| 939 var segment = urlSlashes[i]; | 939 var segment = urlSlashes[i]; |
| 940 | 940 |
| 941 // Split off and save anything that looks like a file type. | 941 // Split off and save anything that looks like a file type. |
| 942 if (segment.indexOf(".") !== -1) { | 942 if (segment.indexOf(".") !== -1) { |
| 943 possibleType = segment.split(".")[1]; | 943 possibleType = segment.split(".")[1]; |
| 944 | 944 |
| 945 /* If the type isn't alpha-only, it's probably not actually a fi
le extension. */ | 945 /* If the type isn't alpha-only, it's probably not actually a fi
le extension. */ |
| 946 if(!possibleType.match(/[^a-zA-Z]/)) { | 946 if(!possibleType.match(/[^a-zA-Z]/)) { |
| 947 segment = segment.split(".")[0]; | 947 segment = segment.split(".")[0]; |
| 948 } | 948 } |
| 949 } | 949 } |
| 950 | 950 |
| 951 /** | 951 /** |
| 952 * EW-CMS specific segment replacement. Ugly. | 952 * EW-CMS specific segment replacement. Ugly. |
| 953 * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.htm
l | 953 * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.htm
l |
| 954 **/ | 954 **/ |
| 955 if(segment.indexOf(',00') !== -1) { | 955 if(segment.indexOf(',00') !== -1) { |
| 956 segment = segment.replace(',00', ''); | 956 segment = segment.replace(',00', ''); |
| 957 } | 957 } |
| 958 | 958 |
| 959 // If our first or second segment has anything looking like a page n
umber, remove it. | 959 // If our first or second segment has anything looking like a page n
umber, remove it. |
| 960 if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1)
|| (i === 0))) { | 960 if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1)
|| (i === 0))) { |
| 961 segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, "
"); | 961 segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, "
"); |
| 962 } | 962 } |
| 963 | 963 |
| 964 | 964 |
| 965 var del = false; | 965 var del = false; |
| 966 | 966 |
| 967 /* If this is purely a number, and it's the first or second segment,
it's probably a page number. Remove it. */ | 967 /* If this is purely a number, and it's the first or second segment,
it's probably a page number. Remove it. */ |
| 968 if (i < 2 && segment.match(/^\d{1,2}$/)) { | 968 if (i < 2 && segment.match(/^\d{1,2}$/)) { |
| 969 del = true; | 969 del = true; |
| 970 } | 970 } |
| 971 | 971 |
| 972 /* If this is the first segment and it's just "index", remove it. */ | 972 /* If this is the first segment and it's just "index", remove it. */ |
| 973 if(i === 0 && segment.toLowerCase() === "index") { | 973 if(i === 0 && segment.toLowerCase() === "index") { |
| 974 del = true; | 974 del = true; |
| 975 } | 975 } |
| 976 | 976 |
| 977 | 977 |
| 978 /* If our first or second segment is smaller than 3 characters, and
the first segment was purely alphas, remove it. */ | 978 /* If our first or second segment is smaller than 3 characters, and
the first segment was purely alphas, remove it. */ |
| 979 if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) { | 979 if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) { |
| 980 del = true; | 980 del = true; |
| 981 } | 981 } |
| 982 | 982 |
| 983 /* If it's not marked for deletion, push it to cleanedSegments. */ | 983 /* If it's not marked for deletion, push it to cleanedSegments. */ |
| 984 if (!del) { | 984 if (!del) { |
| 985 cleanedSegments.push(segment); | 985 cleanedSegments.push(segment); |
| 986 } | 986 } |
| 987 } | 987 } |
| 988 | 988 |
| 989 // This is our final, cleaned, base article URL. | 989 // This is our final, cleaned, base article URL. |
| 990 return window.location.protocol + "//" + window.location.host + cleanedS
egments.reverse().join("/"); | 990 return window.location.protocol + "//" + window.location.host + cleanedS
egments.reverse().join("/"); |
| 991 }, | 991 }, |
| 992 | 992 |
| 993 /** | 993 /** |
| 994 * Look for any paging links that may occur within the document. | 994 * Look for any paging links that may occur within the document. |
| 995 * | 995 * |
| 996 * @param body | 996 * @param body |
| 997 * @return object (array) | 997 * @return object (array) |
| 998 **/ | 998 **/ |
| 999 findNextPageLink: function (elem) { | 999 findNextPageLink: function (elem) { |
| 1000 var possiblePages = {}, | 1000 var possiblePages = {}, |
| 1001 allLinks = elem.getElementsByTagName('a'), | 1001 allLinks = elem.getElementsByTagName('a'), |
| 1002 articleBaseUrl = readability.findBaseUrl(); | 1002 articleBaseUrl = readability.findBaseUrl(); |
| 1003 | 1003 |
| 1004 /** | 1004 /** |
| 1005 * Loop through all links, looking for hints that they may be next-page
links. | 1005 * Loop through all links, looking for hints that they may be next-page
links. |
| 1006 * Things like having "page" in their textContent, className or id, or b
eing a child | 1006 * Things like having "page" in their textContent, className or id, or b
eing a child |
| 1007 * of a node with a page-y className or id. | 1007 * of a node with a page-y className or id. |
| 1008 * | 1008 * |
| 1009 * Also possible: levenshtein distance? longest common subsequence? | 1009 * Also possible: levenshtein distance? longest common subsequence? |
| 1010 * | 1010 * |
| 1011 * After we do that, assign each page a score, and | 1011 * After we do that, assign each page a score, and |
| 1012 **/ | 1012 **/ |
| 1013 for(var i = 0, il = allLinks.length; i < il; i+=1) { | 1013 for(var i = 0, il = allLinks.length; i < il; i+=1) { |
| 1014 var link = allLinks[i], | 1014 var link = allLinks[i], |
| 1015 linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, '
'); | 1015 linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, '
'); |
| 1016 | 1016 |
| 1017 /* If we've already seen this page, ignore it */ | 1017 /* If we've already seen this page, ignore it */ |
| 1018 if(linkHref === "" || linkHref === articleBaseUrl || linkHref === wi
ndow.location.href || linkHref in readability.parsedPages) { | 1018 if(linkHref === "" || linkHref === articleBaseUrl || linkHref === wi
ndow.location.href || linkHref in readability.parsedPages) { |
| 1019 continue; | 1019 continue; |
| 1020 } | 1020 } |
| 1021 | 1021 |
| 1022 /* If it's on a different domain, skip it. */ | 1022 /* If it's on a different domain, skip it. */ |
| 1023 if(window.location.host !== linkHref.split(/\/+/g)[1]) { | 1023 if(window.location.host !== linkHref.split(/\/+/g)[1]) { |
| 1024 continue; | 1024 continue; |
| 1025 } | 1025 } |
| 1026 | 1026 |
| 1027 var linkText = readability.getInnerText(link); | 1027 var linkText = readability.getInnerText(link); |
| 1028 | 1028 |
| 1029 /* If the linkText looks like it's not the next page, skip it. */ | 1029 /* If the linkText looks like it's not the next page, skip it. */ |
| 1030 if(linkText.match(readability.regexps.extraneous) || linkText.length
> 25) { | 1030 if(linkText.match(readability.regexps.extraneous) || linkText.length
> 25) { |
| 1031 continue; | 1031 continue; |
| 1032 } | 1032 } |
| 1033 | 1033 |
| 1034 /* If the leftovers of the URL after removing the base URL don't con
tain any digits, it's certainly not a next page link. */ | 1034 /* If the leftovers of the URL after removing the base URL don't con
tain any digits, it's certainly not a next page link. */ |
| 1035 var linkHrefLeftover = linkHref.replace(articleBaseUrl, ''); | 1035 var linkHrefLeftover = linkHref.replace(articleBaseUrl, ''); |
| 1036 if(!linkHrefLeftover.match(/\d/)) { | 1036 if(!linkHrefLeftover.match(/\d/)) { |
| 1037 continue; | 1037 continue; |
| 1038 } | 1038 } |
| 1039 | 1039 |
| 1040 if(!(linkHref in possiblePages)) { | 1040 if(!(linkHref in possiblePages)) { |
| 1041 possiblePages[linkHref] = {"score": 0, "linkText": linkText, "hr
ef": linkHref}; | 1041 possiblePages[linkHref] = {"score": 0, "linkText": linkText, "hr
ef": linkHref}; |
| 1042 } else { | 1042 } else { |
| 1043 possiblePages[linkHref].linkText += ' | ' + linkText; | 1043 possiblePages[linkHref].linkText += ' | ' + linkText; |
| 1044 } | 1044 } |
| 1045 | 1045 |
| 1046 var linkObj = possiblePages[linkHref]; | 1046 var linkObj = possiblePages[linkHref]; |
| 1047 | 1047 |
| 1048 /** | 1048 /** |
| 1049 * If the articleBaseUrl isn't part of this URL, penalize this link.
It could still be the link, but the odds are lower. | 1049 * If the articleBaseUrl isn't part of this URL, penalize this link.
It could still be the link, but the odds are lower. |
| 1050 * Example: http://www.actionscript.org/resources/articles/745/1/Jav
aScript-and-VBScript-Injection-in-ActionScript-3/Page1.html | 1050 * Example: http://www.actionscript.org/resources/articles/745/1/Jav
aScript-and-VBScript-Injection-in-ActionScript-3/Page1.html |
| 1051 **/ | 1051 **/ |
| 1052 if(linkHref.indexOf(articleBaseUrl) !== 0) { | 1052 if(linkHref.indexOf(articleBaseUrl) !== 0) { |
| 1053 linkObj.score -= 25; | 1053 linkObj.score -= 25; |
| 1054 } | 1054 } |
| 1055 | 1055 |
| 1056 var linkData = linkText + ' ' + link.className + ' ' + link.id; | 1056 var linkData = linkText + ' ' + link.className + ' ' + link.id; |
| 1057 if(linkData.match(readability.regexps.nextLink)) { | 1057 if(linkData.match(readability.regexps.nextLink)) { |
| 1058 linkObj.score += 50; | 1058 linkObj.score += 50; |
| 1059 } | 1059 } |
| 1060 if(linkData.match(/pag(e|ing|inat)/i)) { | 1060 if(linkData.match(/pag(e|ing|inat)/i)) { |
| 1061 linkObj.score += 25; | 1061 linkObj.score += 25; |
| 1062 } | 1062 } |
| 1063 if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any
bonuses gotten from a > or » in the text, | 1063 if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any
bonuses gotten from a > or » in the text, |
| 1064 /* If we already matched on "next", last is probably fine. If we
didn't, then it's bad. Penalize. */ | 1064 /* If we already matched on "next", last is probably fine. If we
didn't, then it's bad. Penalize. */ |
| 1065 if(!linkObj.linkText.match(readability.regexps.nextLink)) { | 1065 if(!linkObj.linkText.match(readability.regexps.nextLink)) { |
| 1066 linkObj.score -= 65; | 1066 linkObj.score -= 65; |
| 1067 } | 1067 } |
| 1068 } | 1068 } |
| 1069 if(linkData.match(readability.regexps.negative) || linkData.match(re
adability.regexps.extraneous)) { | 1069 if(linkData.match(readability.regexps.negative) || linkData.match(re
adability.regexps.extraneous)) { |
| 1070 linkObj.score -= 50; | 1070 linkObj.score -= 50; |
| 1071 } | 1071 } |
| 1072 if(linkData.match(readability.regexps.prevLink)) { | 1072 if(linkData.match(readability.regexps.prevLink)) { |
| 1073 linkObj.score -= 200; | 1073 linkObj.score -= 200; |
| 1074 } | 1074 } |
| 1075 | 1075 |
| 1076 /* If a parentNode contains page or paging or paginat */ | 1076 /* If a parentNode contains page or paging or paginat */ |
| 1077 var parentNode = link.parentNode, | 1077 var parentNode = link.parentNode, |
| 1078 positiveNodeMatch = false, | 1078 positiveNodeMatch = false, |
| 1079 negativeNodeMatch = false; | 1079 negativeNodeMatch = false; |
| 1080 while(parentNode) { | 1080 while(parentNode) { |
| 1081 var parentNodeClassAndId = parentNode.className + ' ' + parentNo
de.id; | 1081 var parentNodeClassAndId = parentNode.className + ' ' + parentNo
de.id; |
| 1082 if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClass
AndId.match(/pag(e|ing|inat)/i)) { | 1082 if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClass
AndId.match(/pag(e|ing|inat)/i)) { |
| 1083 positiveNodeMatch = true; | 1083 positiveNodeMatch = true; |
| 1084 linkObj.score += 25; | 1084 linkObj.score += 25; |
| 1085 } | 1085 } |
| 1086 if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClass
AndId.match(readability.regexps.negative)) { | 1086 if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClass
AndId.match(readability.regexps.negative)) { |
| 1087 /* If this is just something like "footer", give it a negati
ve. If it's something like "body-and-footer", leave it be. */ | 1087 /* If this is just something like "footer", give it a negati
ve. If it's something like "body-and-footer", leave it be. */ |
| 1088 if(!parentNodeClassAndId.match(readability.regexps.positive)
) { | 1088 if(!parentNodeClassAndId.match(readability.regexps.positive)
) { |
| 1089 linkObj.score -= 25; | 1089 linkObj.score -= 25; |
| 1090 negativeNodeMatch = true; | 1090 negativeNodeMatch = true; |
| 1091 } | 1091 } |
| 1092 } | 1092 } |
| 1093 | 1093 |
| 1094 parentNode = parentNode.parentNode; | 1094 parentNode = parentNode.parentNode; |
| 1095 } | 1095 } |
| 1096 | 1096 |
| 1097 /** | 1097 /** |
| 1098 * If the URL looks like it has paging in it, add to the score. | 1098 * If the URL looks like it has paging in it, add to the score. |
| 1099 * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34 | 1099 * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34 |
| 1100 **/ | 1100 **/ |
| 1101 if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) ||
linkHref.match(/(page|paging)/i)) { | 1101 if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) ||
linkHref.match(/(page|paging)/i)) { |
| 1102 linkObj.score += 25; | 1102 linkObj.score += 25; |
| 1103 } | 1103 } |
| (...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1145 topPage = possiblePages[page]; | 1145 topPage = possiblePages[page]; |
| 1146 } | 1146 } |
| 1147 } | 1147 } |
| 1148 } | 1148 } |
| 1149 | 1149 |
| 1150 if(topPage) { | 1150 if(topPage) { |
| 1151 var nextHref = topPage.href.replace(/\/$/,''); | 1151 var nextHref = topPage.href.replace(/\/$/,''); |
| 1152 | 1152 |
| 1153 dbg('NEXT PAGE IS ' + nextHref); | 1153 dbg('NEXT PAGE IS ' + nextHref); |
| 1154 readability.parsedPages[nextHref] = true; | 1154 readability.parsedPages[nextHref] = true; |
| 1155 return nextHref; | 1155 return nextHref; |
| 1156 } | 1156 } |
| 1157 else { | 1157 else { |
| 1158 return null; | 1158 return null; |
| 1159 } | 1159 } |
| 1160 }, | 1160 }, |
| 1161 | 1161 |
| 1162 createLinkDiv: function(link) { | 1162 createLinkDiv: function(link) { |
| 1163 var divNode = document.createElement('div'); | 1163 var divNode = document.createElement('div'); |
| 1164 var aNode = document.createElement('a'); | 1164 var aNode = document.createElement('a'); |
| 1165 var tNode = document.createTextNode('View Next Page'); | 1165 var tNode = document.createTextNode('View Next Page'); |
| (...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1197 } | 1197 } |
| 1198 else { | 1198 else { |
| 1199 if (options.error) { options.error(request); } | 1199 if (options.error) { options.error(request); } |
| 1200 } | 1200 } |
| 1201 } | 1201 } |
| 1202 } | 1202 } |
| 1203 | 1203 |
| 1204 if (typeof options === 'undefined') { options = {}; } | 1204 if (typeof options === 'undefined') { options = {}; } |
| 1205 | 1205 |
| 1206 request.onreadystatechange = respondToReadyState; | 1206 request.onreadystatechange = respondToReadyState; |
| 1207 | 1207 |
| 1208 request.open('get', url, true); | 1208 request.open('get', url, true); |
| 1209 request.setRequestHeader('Accept', 'text/html'); | 1209 request.setRequestHeader('Accept', 'text/html'); |
| 1210 | 1210 |
| 1211 try { | 1211 try { |
| 1212 request.send(options.postBody); | 1212 request.send(options.postBody); |
| 1213 } | 1213 } |
| 1214 catch (e) { | 1214 catch (e) { |
| 1215 if (options.error) { options.error(); } | 1215 if (options.error) { options.error(); } |
| 1216 } | 1216 } |
| 1217 | 1217 |
| (...skipping 14 matching lines...) Expand all Loading... |
| 1232 articlePage.innerHTML = '<p class="page-separator" title="Page ' + reada
bility.curPageNum + '">§</p>'; | 1232 articlePage.innerHTML = '<p class="page-separator" title="Page ' + reada
bility.curPageNum + '">§</p>'; |
| 1233 | 1233 |
| 1234 document.getElementById("readability-content").appendChild(articlePage); | 1234 document.getElementById("readability-content").appendChild(articlePage); |
| 1235 | 1235 |
| 1236 if(readability.curPageNum > readability.maxPages) { | 1236 if(readability.curPageNum > readability.maxPages) { |
| 1237 var linkDiv = readability.createLinkDiv(nextPageLink); | 1237 var linkDiv = readability.createLinkDiv(nextPageLink); |
| 1238 | 1238 |
| 1239 articlePage.appendChild(linkDiv); | 1239 articlePage.appendChild(linkDiv); |
| 1240 return; | 1240 return; |
| 1241 } | 1241 } |
| 1242 | 1242 |
| 1243 /** | 1243 /** |
| 1244 * Now that we've built the article page DOM element, get the page conte
nt | 1244 * Now that we've built the article page DOM element, get the page conte
nt |
| 1245 * asynchronously and load the cleaned content into the div we created f
or it. | 1245 * asynchronously and load the cleaned content into the div we created f
or it. |
| 1246 **/ | 1246 **/ |
| 1247 (function(pageUrl, thisPage) { | 1247 (function(pageUrl, thisPage) { |
| 1248 readability.ajax(pageUrl, { | 1248 readability.ajax(pageUrl, { |
| 1249 success: function(r) { | 1249 success: function(r) { |
| 1250 | 1250 |
| 1251 /* First, check to see if we have a matching ETag in headers
- if we do, this is a duplicate page. */ | 1251 /* First, check to see if we have a matching ETag in headers
- if we do, this is a duplicate page. */ |
| 1252 var eTag = r.getResponseHeader('ETag'); | 1252 var eTag = r.getResponseHeader('ETag'); |
| 1253 if(eTag) { | 1253 if(eTag) { |
| 1254 if(eTag in readability.pageETags) { | 1254 if(eTag in readability.pageETags) { |
| 1255 dbg("Exact duplicate page found via ETag. Aborting."
); | 1255 dbg("Exact duplicate page found via ETag. Aborting."
); |
| 1256 articlePage.style.display = 'none'; | 1256 articlePage.style.display = 'none'; |
| 1257 return; | 1257 return; |
| 1258 } else { | 1258 } else { |
| 1259 readability.pageETags[eTag] = 1; | 1259 readability.pageETags[eTag] = 1; |
| 1260 } | 1260 } |
| 1261 } | 1261 } |
| 1262 | 1262 |
| 1263 // TODO: this ends up doubling up page numbers on NYTimes ar
ticles. Need to generically parse those away. | 1263 // TODO: this ends up doubling up page numbers on NYTimes ar
ticles. Need to generically parse those away. |
| 1264 var page = document.createElement("DIV"); | 1264 var page = document.createElement("DIV"); |
| 1265 | 1265 |
| 1266 /** | 1266 /** |
| 1267 * Do some preprocessing to our HTML to make it ready for ap
pending. | 1267 * Do some preprocessing to our HTML to make it ready for ap
pending. |
| 1268 * • Remove any script tags. Swap and reswap newlines with a
unicode character because multiline regex doesn't work in javascript. | 1268 * • Remove any script tags. Swap and reswap newlines with a
unicode character because multiline regex doesn't work in javascript. |
| 1269 * • Turn any noscript tags into divs so that we can parse t
hem. This allows us to find any next page links hidden via javascript. | 1269 * • Turn any noscript tags into divs so that we can parse t
hem. This allows us to find any next page links hidden via javascript. |
| 1270 * • Turn all double br's into p's - was handled by prepDocu
ment in the original view. | 1270 * • Turn all double br's into p's - was handled by prepDocu
ment in the original view. |
| (...skipping 30 matching lines...) Expand all Loading... |
| 1301 for(var i=1; i <= readability.curPageNum; i+=1) { | 1301 for(var i=1; i <= readability.curPageNum; i+=1) { |
| 1302 var rPage = document.getElementById('readability-pag
e-' + i); | 1302 var rPage = document.getElementById('readability-pag
e-' + i); |
| 1303 if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML
) !== -1) { | 1303 if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML
) !== -1) { |
| 1304 dbg('Duplicate of page ' + i + ' - skipping.'); | 1304 dbg('Duplicate of page ' + i + ' - skipping.'); |
| 1305 articlePage.style.display = 'none'; | 1305 articlePage.style.display = 'none'; |
| 1306 readability.parsedPages[pageUrl] = true; | 1306 readability.parsedPages[pageUrl] = true; |
| 1307 return; | 1307 return; |
| 1308 } | 1308 } |
| 1309 } | 1309 } |
| 1310 } | 1310 } |
| 1311 | 1311 |
| 1312 readability.removeScripts(content); | 1312 readability.removeScripts(content); |
| 1313 | 1313 |
| 1314 readability.moveNodeInnards(content, thisPage); | 1314 readability.moveNodeInnards(content, thisPage); |
| 1315 | 1315 |
| 1316 /** | 1316 /** |
| 1317 * After the page has rendered, post process the content. Th
is delay is necessary because, | 1317 * After the page has rendered, post process the content. Th
is delay is necessary because, |
| 1318 * in webkit at least, offsetWidth is not set in time to det
ermine image width. We have to | 1318 * in webkit at least, offsetWidth is not set in time to det
ermine image width. We have to |
| 1319 * wait a little bit for reflow to finish before we can fix
floating images. | 1319 * wait a little bit for reflow to finish before we can fix
floating images. |
| 1320 **/ | 1320 **/ |
| 1321 window.setTimeout( | 1321 window.setTimeout( |
| 1322 function() { readability.postProcessContent(thisPage); }
, | 1322 function() { readability.postProcessContent(thisPage); }
, |
| 1323 500 | 1323 500 |
| 1324 ); | 1324 ); |
| 1325 | 1325 |
| 1326 if(nextPageLink) { | 1326 if(nextPageLink) { |
| 1327 readability.appendNextPage(nextPageLink); | 1327 readability.appendNextPage(nextPageLink); |
| 1328 } | 1328 } |
| 1329 } | 1329 } |
| 1330 }); | 1330 }); |
| 1331 }(nextPageLink, articlePage)); | 1331 }(nextPageLink, articlePage)); |
| 1332 }, | 1332 }, |
| 1333 | 1333 |
| 1334 /** | 1334 /** |
| 1335 * Get an elements class/id weight. Uses regular expressions to tell if this
| 1335 * Get an elements class/id weight. Uses regular expressions to tell if this |
| 1336 * element looks good or bad. | 1336 * element looks good or bad. |
| 1337 * | 1337 * |
| 1338 * @param Element | 1338 * @param Element |
| 1339 * @return number (Integer) | 1339 * @return number (Integer) |
| 1340 **/ | 1340 **/ |
| 1341 getClassWeight: function (e) { | 1341 getClassWeight: function (e) { |
| 1342 if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { | 1342 if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { |
| 1343 return 0; | 1343 return 0; |
| 1344 } | 1344 } |
| 1345 | 1345 |
| (...skipping 29 matching lines...) Expand all Loading... |
| 1375 /** | 1375 /** |
| 1376 * Remove extraneous break tags from a node. | 1376 * Remove extraneous break tags from a node. |
| 1377 * | 1377 * |
| 1378 * @param Element | 1378 * @param Element |
| 1379 * @return void | 1379 * @return void |
| 1380 **/ | 1380 **/ |
| 1381 killBreaks: function (e) { | 1381 killBreaks: function (e) { |
| 1382 var allElements = e.getElementsByTagName('*'); | 1382 var allElements = e.getElementsByTagName('*'); |
| 1383 while (i < allElements.length) { | 1383 while (i < allElements.length) { |
| 1384 readability.deleteExtraBreaks(allElements[i]); | 1384 readability.deleteExtraBreaks(allElements[i]); |
| 1385 i++; | 1385 i++; |
| 1386 } | 1386 } |
| 1387 }, | 1387 }, |
| 1388 | 1388 |
| 1389 /** | 1389 /** |
| 1390 * Clean a node of all elements of type "tag". | 1390 * Clean a node of all elements of type "tag". |
| 1391 * (Unless it's a youtube/vimeo video. People love movies.) | 1391 * (Unless it's a youtube/vimeo video. People love movies.) |
| 1392 * | 1392 * |
| 1393 * @param Element | 1393 * @param Element |
| 1394 * @param string tag to clean | 1394 * @param string tag to clean |
| 1395 * @return void | 1395 * @return void |
| 1396 **/ | 1396 **/ |
| 1397 clean: function (e, tag) { | 1397 clean: function (e, tag) { |
| 1398 var targetList = e.getElementsByTagName( tag ); | 1398 var targetList = e.getElementsByTagName( tag ); |
| 1399 var isEmbed = (tag === 'object' || tag === 'embed'); | 1399 var isEmbed = (tag === 'object' || tag === 'embed'); |
| 1400 | 1400 |
| 1401 for (var y=targetList.length-1; y >= 0; y-=1) { | 1401 for (var y=targetList.length-1; y >= 0; y-=1) { |
| 1402 /* Allow youtube and vimeo videos through as people usually want to
see those. */ | 1402 /* Allow youtube and vimeo videos through as people usually want to
see those. */ |
| 1403 if(isEmbed) { | 1403 if(isEmbed) { |
| 1404 var attributeValues = ""; | 1404 var attributeValues = ""; |
| 1405 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1)
{ | 1405 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1)
{ |
| 1406 attributeValues += targetList[y].attributes[i].value + '|'; | 1406 attributeValues += targetList[y].attributes[i].value + '|'; |
| 1407 } | 1407 } |
| 1408 | 1408 |
| 1409 /* First, check the elements attributes to see if any of them co
ntain youtube or vimeo */ | 1409 /* First, check the elements attributes to see if any of them co
ntain youtube or vimeo */ |
| 1410 if (attributeValues.search(readability.regexps.videos) !== -1) { | 1410 if (attributeValues.search(readability.regexps.videos) !== -1) { |
| 1411 continue; | 1411 continue; |
| 1412 } | 1412 } |
| 1413 | 1413 |
| 1414 /* Then check the elements inside this element for the same. */ | 1414 /* Then check the elements inside this element for the same. */ |
| 1415 if (targetList[y].innerHTML.search(readability.regexps.videos) !
== -1) { | 1415 if (targetList[y].innerHTML.search(readability.regexps.videos) !
== -1) { |
| 1416 continue; | 1416 continue; |
| 1417 } | 1417 } |
| 1418 | 1418 |
| 1419 } | 1419 } |
| 1420 | 1420 |
| 1421 targetList[y].parentNode.removeChild(targetList[y]); | 1421 targetList[y].parentNode.removeChild(targetList[y]); |
| 1422 } | 1422 } |
| 1423 }, | 1423 }, |
| 1424 | 1424 |
| 1425 /** | 1425 /** |
| 1426 * Clean an element of all tags of type "tag" if they look fishy. | 1426 * Clean an element of all tags of type "tag" if they look fishy. |
| 1427 * "Fishy" is an algorithm based on content length, classnames, link density
, number of images & embeds, etc. | 1427 * "Fishy" is an algorithm based on content length, classnames, link density
, number of images & embeds, etc. |
| 1428 * | 1428 * |
| 1429 * @return void | 1429 * @return void |
| 1430 **/ | 1430 **/ |
| 1431 cleanConditionally: function (e, tag) { | 1431 cleanConditionally: function (e, tag) { |
| 1432 | 1432 |
| 1433 if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) { | 1433 if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) { |
| 1434 return; | 1434 return; |
| 1435 } | 1435 } |
| 1436 | 1436 |
| 1437 var tagsList = e.getElementsByTagName(tag); | 1437 var tagsList = e.getElementsByTagName(tag); |
| 1438 var curTagsLength = tagsList.length; | 1438 var curTagsLength = tagsList.length; |
| 1439 | 1439 |
| 1440 /** | 1440 /** |
| 1441 * Gather counts for other typical elements embedded within. | 1441 * Gather counts for other typical elements embedded within. |
| 1442 * Traverse backwards so we can remove nodes at the same time without ef
fecting the traversal. | 1442 * Traverse backwards so we can remove nodes at the same time without ef
fecting the traversal. |
| 1443 * | 1443 * |
| 1444 * TODO: Consider taking into account original contentScore here. | 1444 * TODO: Consider taking into account original contentScore here. |
| 1445 **/ | 1445 **/ |
| 1446 for (var i=curTagsLength-1; i >= 0; i-=1) { | 1446 for (var i=curTagsLength-1; i >= 0; i-=1) { |
| 1447 var weight = readability.getClassWeight(tagsList[i]); | 1447 var weight = readability.getClassWeight(tagsList[i]); |
| 1448 var contentScore = (typeof tagsList[i].readability !== 'undefined')
? tagsList[i].readability.contentScore : 0; | 1448 var contentScore = (typeof tagsList[i].readability !== 'undefined')
? tagsList[i].readability.contentScore : 0; |
| 1449 | 1449 |
| 1450 dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].cla
ssName + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'unde
fined') ? (" with score " + tagsList[i].readability.contentScore) : '')); | 1450 dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].cla
ssName + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'unde
fined') ? (" with score " + tagsList[i].readability.contentScore) : '')); |
| 1451 | 1451 |
| 1452 if(weight+contentScore < 0) | 1452 if(weight+contentScore < 0) |
| 1453 { | 1453 { |
| 1454 tagsList[i].parentNode.removeChild(tagsList[i]); | 1454 tagsList[i].parentNode.removeChild(tagsList[i]); |
| 1455 } | 1455 } |
| 1456 else if ( readability.getCharCount(tagsList[i],',') < 10) { | 1456 else if ( readability.getCharCount(tagsList[i],',') < 10) { |
| 1457 /** | 1457 /** |
| 1458 * If there are not very many commas, and the number of | 1458 * If there are not very many commas, and the number of |
| 1459 * non-paragraph elements is more than paragraphs or other omino
us signs, remove the element. | 1459 * non-paragraph elements is more than paragraphs or other omino
us signs, remove the element. |
| 1460 **/ | 1460 **/ |
| 1461 var p = tagsList[i].getElementsByTagName("p").length; | 1461 var p = tagsList[i].getElementsByTagName("p").length; |
| 1462 var img = tagsList[i].getElementsByTagName("img").length; | 1462 var img = tagsList[i].getElementsByTagName("img").length; |
| 1463 var li = tagsList[i].getElementsByTagName("li").length-100; | 1463 var li = tagsList[i].getElementsByTagName("li").length-100; |
| 1464 var input = tagsList[i].getElementsByTagName("input").length; | 1464 var input = tagsList[i].getElementsByTagName("input").length; |
| 1465 | 1465 |
| 1466 var embedCount = 0; | 1466 var embedCount = 0; |
| 1467 var embeds = tagsList[i].getElementsByTagName("embed"); | 1467 var embeds = tagsList[i].getElementsByTagName("embed"); |
| 1468 for(var ei=0,il=embeds.length; ei < il; ei+=1) { | 1468 for(var ei=0,il=embeds.length; ei < il; ei+=1) { |
| 1469 if (embeds[ei].src.search(readability.regexps.videos) === -1
) { | 1469 if (embeds[ei].src.search(readability.regexps.videos) === -1
) { |
| 1470 embedCount+=1; | 1470 embedCount+=1; |
| 1471 } | 1471 } |
| 1472 } | 1472 } |
| 1473 | 1473 |
| 1474 var linkDensity = readability.getLinkDensity(tagsList[i]); | 1474 var linkDensity = readability.getLinkDensity(tagsList[i]); |
| 1475 var contentLength = readability.getInnerText(tagsList[i]).length
; | 1475 var contentLength = readability.getInnerText(tagsList[i]).length
; |
| 1476 var toRemove = false; | 1476 var toRemove = false; |
| 1477 | 1477 |
| 1478 if ( img > p ) { | 1478 if ( img > p ) { |
| 1479 toRemove = true; | 1479 toRemove = true; |
| 1480 } else if(li > p && tag !== "ul" && tag !== "ol") { | 1480 } else if(li > p && tag !== "ul" && tag !== "ol") { |
| 1481 toRemove = true; | 1481 toRemove = true; |
| 1482 } else if( input > Math.floor(p/3) ) { | 1482 } else if( input > Math.floor(p/3) ) { |
| 1483 toRemove = true; | 1483 toRemove = true; |
| 1484 } else if(contentLength < 25 && (img === 0 || img > 2) ) { | 1484 } else if(contentLength < 25 && (img === 0 || img > 2) ) { |
| 1485 toRemove = true; | 1485 toRemove = true; |
| 1486 } else if(weight < 25 && linkDensity > 0.2) { | 1486 } else if(weight < 25 && linkDensity > 0.2) { |
| 1487 toRemove = true; | 1487 toRemove = true; |
| 1488 } else if(weight >= 25 && linkDensity > 0.5) { | 1488 } else if(weight >= 25 && linkDensity > 0.5) { |
| 1489 toRemove = true; | 1489 toRemove = true; |
| 1490 } else if((embedCount === 1 && contentLength < 75) || embedCount
> 1) { | 1490 } else if((embedCount === 1 && contentLength < 75) || embedCount
> 1) { |
| 1491 toRemove = true; | 1491 toRemove = true; |
| 1492 } | 1492 } |
| 1493 | 1493 |
| (...skipping 21 matching lines...) Expand all Loading... |
| 1515 } | 1515 } |
| 1516 }, | 1516 }, |
| 1517 | 1517 |
| 1518 flagIsActive: function(flag) { | 1518 flagIsActive: function(flag) { |
| 1519 return (readability.flags & flag) > 0; | 1519 return (readability.flags & flag) > 0; |
| 1520 }, | 1520 }, |
| 1521 | 1521 |
| 1522 addFlag: function(flag) { | 1522 addFlag: function(flag) { |
| 1523 readability.flags = readability.flags | flag; | 1523 readability.flags = readability.flags | flag; |
| 1524 }, | 1524 }, |
| 1525 | 1525 |
| 1526 removeFlag: function(flag) { | 1526 removeFlag: function(flag) { |
| 1527 readability.flags = readability.flags & ~flag; | 1527 readability.flags = readability.flags & ~flag; |
| 1528 }, | 1528 }, |
| 1529 | 1529 |
| 1530 // Removes the children of |src| and appends them to |dest|. | 1530 // Removes the children of |src| and appends them to |dest|. |
| 1531 moveNodeInnards: function(src, dest) { | 1531 moveNodeInnards: function(src, dest) { |
| 1532 try { | 1532 try { |
| 1533 while (src.firstChild) { | 1533 while (src.firstChild) { |
| 1534 dest.appendChild(src.removeChild(src.firstChild)); | 1534 dest.appendChild(src.removeChild(src.firstChild)); |
| 1535 } | 1535 } |
| (...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1584 var lastBr = readability.isMultipleBr(node, false); | 1584 var lastBr = readability.isMultipleBr(node, false); |
| 1585 var ret = false; | 1585 var ret = false; |
| 1586 while (lastBr && lastBr != node) { | 1586 while (lastBr && lastBr != node) { |
| 1587 var toRemove = lastBr; | 1587 var toRemove = lastBr; |
| 1588 lastBr = lastBr.previousSibling; | 1588 lastBr = lastBr.previousSibling; |
| 1589 toRemove.parentNode.removeChild(toRemove); | 1589 toRemove.parentNode.removeChild(toRemove); |
| 1590 ret = true; | 1590 ret = true; |
| 1591 } | 1591 } |
| 1592 return ret; | 1592 return ret; |
| 1593 }, | 1593 }, |
| 1594 | 1594 |
| 1595 // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a | 1595 // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a |
| 1596 // <P> node, and makes all next siblings of that pair children of <P>, up | 1596 // <P> node, and makes all next siblings of that pair children of <P>, up |
| 1597 // until the next pair of <BR> nodes is reached. | 1597 // until the next pair of <BR> nodes is reached. |
| 1598 replaceDoubleBrWithP: function(node) { | 1598 replaceDoubleBrWithP: function(node) { |
| 1599 // Check that we are starting with a BR. | 1599 // Check that we are starting with a BR. |
| 1600 var second = readability.isMultipleBr(node, true); | 1600 var second = readability.isMultipleBr(node, true); |
| 1601 if (!second) { | 1601 if (!second) { |
| 1602 return; | 1602 return; |
| 1603 } | 1603 } |
| 1604 // Make all next siblings of the second BR into children of a P. | 1604 // Make all next siblings of the second BR into children of a P. |
| 1605 var p = document.createElement('p'); | 1605 var p = document.createElement('p'); |
| 1606 var curr = second.nextSibling; | 1606 var curr = second.nextSibling; |
| 1607 while (curr) { | 1607 while (curr) { |
| 1608 if (readability.isMultipleBr(curr, true)) { | 1608 if (readability.isMultipleBr(curr, true)) { |
| 1609 break; | 1609 break; |
| 1610 } | 1610 } |
| 1611 var next = curr.nextSibling; | 1611 var next = curr.nextSibling; |
| 1612 p.appendChild(curr.parentNode.removeChild(curr)); | 1612 p.appendChild(curr.parentNode.removeChild(curr)); |
| 1613 curr = next; | 1613 curr = next; |
| 1614 } | 1614 } |
| 1615 var ret = curr; | 1615 var ret = curr; |
| 1616 | 1616 |
| 1617 // Remove all nodes between the first and second BR. | 1617 // Remove all nodes between the first and second BR. |
| 1618 curr = node.nextSibling; | 1618 curr = node.nextSibling; |
| 1619 while (curr && curr != second) { | 1619 while (curr && curr != second) { |
| 1620 var next = curr.nextSibling; | 1620 var next = curr.nextSibling; |
| 1621 curr.parentNode.removeChild(curr); | 1621 curr.parentNode.removeChild(curr); |
| 1622 curr = next; | 1622 curr = next; |
| 1623 } | 1623 } |
| 1624 // Remove the second BR. | 1624 // Remove the second BR. |
| 1625 second.parentNode.removeChild(second); | 1625 second.parentNode.removeChild(second); |
| 1626 // Replace the first BR with the P. | 1626 // Replace the first BR with the P. |
| 1627 node.parentNode.replaceChild(p, node); | 1627 node.parentNode.replaceChild(p, node); |
| 1628 | 1628 |
| 1629 return ret; | 1629 return ret; |
| 1630 }, | 1630 }, |
| 1631 | 1631 |
| 1632 // Returns true if the NodeList contains a double <BR>. | 1632 // Returns true if the NodeList contains a double <BR>. |
| 1633 hasDoubleBr: function(nodeList) { | 1633 hasDoubleBr: function(nodeList) { |
| 1634 for (var i = 0; i < nodeList.length; nodeList++) { | 1634 for (var i = 0; i < nodeList.length; nodeList++) { |
| 1635 if (readability.isMultipleBr(nodeList[i], true)) { | 1635 if (readability.isMultipleBr(nodeList[i], true)) { |
| 1636 return true; | 1636 return true; |
| 1637 } | 1637 } |
| 1638 } | 1638 } |
| 1639 return false; | 1639 return false; |
| 1640 }, | 1640 }, |
| 1641 | 1641 |
| 1642 // Replaces double <BR> tags with <P> tags. | 1642 // Replaces double <BR> tags with <P> tags. |
| 1643 replaceDoubleBrsWithPs: function(node) { | 1643 replaceDoubleBrsWithPs: function(node) { |
| 1644 var allElements = node.getElementsByTagName('BR'); | 1644 var allElements = node.getElementsByTagName('BR'); |
| 1645 var node = null; | 1645 var node = null; |
| 1646 while (allElements && allElements.length > 0 && | 1646 while (allElements && allElements.length > 0 && |
| 1647 readability.hasDoubleBr(allElements)) { | 1647 readability.hasDoubleBr(allElements)) { |
| 1648 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex +
= 1) { | 1648 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex +
= 1) { |
| 1649 var next = node; | 1649 var next = node; |
| 1650 while (next = readability.replaceDoubleBrWithP(next)); | 1650 while (next = readability.replaceDoubleBrWithP(next)); |
| 1651 } | 1651 } |
| 1652 allElements = document.body.getElementsByTagName('BR'); | 1652 allElements = document.body.getElementsByTagName('BR'); |
| 1653 } | 1653 } |
| 1654 }, | 1654 }, |
| 1655 | 1655 |
| 1656 | 1656 |
| 1657 // Replaces a BR and the whitespace that follows it with a P. | 1657 // Replaces a BR and the whitespace that follows it with a P. |
| 1658 replaceBrWithP: function(node) { | 1658 replaceBrWithP: function(node) { |
| 1659 if (!readability.isBrNode(node)) { | 1659 if (!readability.isBrNode(node)) { |
| 1660 return; | 1660 return; |
| 1661 } | 1661 } |
| 1662 var p = document.createElement('p'); | 1662 var p = document.createElement('p'); |
| 1663 var curr = node.nextSibling; | 1663 var curr = node.nextSibling; |
| 1664 while (curr && !isBrNode(curr)) { | 1664 while (curr && !isBrNode(curr)) { |
| 1665 var next = curr.nextSibling; | 1665 var next = curr.nextSibling; |
| 1666 if (readability.isWhitespaceNode(curr)) { | 1666 if (readability.isWhitespaceNode(curr)) { |
| 1667 curr.parentNode.removeChild(curr); | 1667 curr.parentNode.removeChild(curr); |
| 1668 } else { | 1668 } else { |
| 1669 p.appendChild(curr.parentNode.removeChild(curr)); | 1669 p.appendChild(curr.parentNode.removeChild(curr)); |
| 1670 } | 1670 } |
| 1671 curr = next; | 1671 curr = next; |
| 1672 } | 1672 } |
| 1673 node.parentNode.replaceChild(p, node); | 1673 node.parentNode.replaceChild(p, node); |
| 1674 return curr; | 1674 return curr; |
| 1675 }, | 1675 }, |
| 1676 | 1676 |
| 1677 // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> t
ag | 1677 // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> t
ag |
| 1678 // children of the <P>. | 1678 // children of the <P>. |
| 1679 replaceBrsWithPs: function(node) { | 1679 replaceBrsWithPs: function(node) { |
| 1680 var allElements = node.getElementsByTagName('BR'); | 1680 var allElements = node.getElementsByTagName('BR'); |
| 1681 var node = null; | 1681 var node = null; |
| 1682 while (allElements && allElements.length > 0) { | 1682 while (allElements && allElements.length > 0) { |
| 1683 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex +
= 1) { | 1683 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex +
= 1) { |
| 1684 var next = node; | 1684 var next = node; |
| 1685 while (next = readability.replaceBrWithP(next)); | 1685 while (next = readability.replaceBrWithP(next)); |
| 1686 } | 1686 } |
| 1687 allElements = document.body.getElementsByTagName('BR'); | 1687 allElements = document.body.getElementsByTagName('BR'); |
| 1688 } | 1688 } |
| 1689 }, | 1689 }, |
| 1690 | 1690 |
| 1691 // Replaces any tag with any other tag. | 1691 // Replaces any tag with any other tag. |
| 1692 replaceTagsWithTags: function(node, srcTag, destTag) { | 1692 replaceTagsWithTags: function(node, srcTag, destTag) { |
| 1693 var allElements = node.getElementsByTagName(srcTag); | 1693 var allElements = node.getElementsByTagName(srcTag); |
| 1694 for (var i = 0; i < allElements.length; i++) { | 1694 for (var i = 0; i < allElements.length; i++) { |
| 1695 var dest = document.createElement(destTag); | 1695 var dest = document.createElement(destTag); |
| 1696 readability.moveNodeInnards(allElements[i], dest); | 1696 readability.moveNodeInnards(allElements[i], dest); |
| 1697 node.replaceNode(dest, allElements[i]); | 1697 allElements[i].parentNode.replaceChild(dest, allElements[i]); |
| 1698 } | 1698 } |
| 1699 }, | 1699 }, |
| 1700 | 1700 |
| 1701 // Replaces all <noscript> tags with <p> tags. | 1701 // Replaces all <noscript> tags with <p> tags. |
| 1702 replaceNoscriptsWithPs: function(node) { | 1702 replaceNoscriptsWithPs: function(node) { |
| 1703 readability.replaceTagsWithTags(node, 'noscript', 'p'); | 1703 readability.replaceTagsWithTags(node, 'noscript', 'p'); |
| 1704 }, | 1704 }, |
| 1705 | 1705 |
| 1706 // Replaces all <font> tags with <span> tags. | 1706 // Replaces all <font> tags with <span> tags. |
| 1707 replaceFontsWithSpans: function(node) { | 1707 replaceFontsWithSpans: function(node) { |
| 1708 readability.replaceTagsWithTags(node, 'font', 'span'); | 1708 readability.replaceTagsWithTags(node, 'font', 'span'); |
| 1709 }, | 1709 }, |
| 1710 | 1710 |
| 1711 // Returns a list of image URLs in the distilled article. | 1711 // Returns a list of image URLs in the distilled article. |
| 1712 getImages : function() { | 1712 getImages : function() { |
| 1713 var images = document.getElementsByTagName('img'); | 1713 var images = document.getElementsByTagName('img'); |
| 1714 var result = new Array(images.length); | 1714 var result = new Array(images.length); |
| 1715 dbg("Number of images: " + images.length); | 1715 dbg("Number of images: " + images.length); |
| 1716 for(i = 0; i < images.length; i++) { | 1716 for(i = 0; i < images.length; i++) { |
| 1717 result[i] = images[i].src; | 1717 result[i] = images[i].src; |
| 1718 dbg("Image: " + result[i]); | 1718 dbg("Image: " + result[i]); |
| 1719 } | 1719 } |
| 1720 return result; | 1720 return result; |
| 1721 }, | 1721 }, |
| 1722 | 1722 |
| 1723 // Returns the distilled article HTML from the page(s). | 1723 // Returns the distilled article HTML from the page(s). |
| 1724 getDistilledArticleHTML : function() { | 1724 getDistilledArticleHTML : function() { |
| 1725 return readability.distilledHTML; | 1725 return readability.distilledHTML; |
| 1726 }, |
| 1727 |
| 1728 // Returns the next page of this article. |
| 1729 getNextPageLink : function() { |
| 1730 return readability.nextPageLink; |
| 1726 } | 1731 } |
| 1727 }; | 1732 }; |
| 1728 | 1733 |
| 1729 // Extracts long-form content from a page and returns and array where the first | 1734 // Extracts long-form content from a page and returns and array where the first |
| 1730 // element is the article title, the second element is HTML containing the | 1735 // element is the article title, the second element is HTML containing the |
| 1731 // long-form content, and remaining elements are URLs for images referenced by | 1736 // long-form content, and remaining elements are URLs for images referenced by |
| 1732 // that HTML. Each <img> tag in the HTML has an id field set to k - 2, which | 1737 // that HTML. Each <img> tag in the HTML has an id field set to k - 2, which |
| 1733 // corresponds to a URL listed at index k in the array returned. | 1738 // corresponds to a URL listed at index k in the array returned. |
| 1734 (function () { | 1739 (function () { |
| 1735 readability.init(); | 1740 readability.init(); |
| 1736 var result = new Array(2); | 1741 var result = new Array(3); |
| 1737 result[0] = readability.getArticleTitle(); | 1742 result[0] = readability.getArticleTitle(); |
| 1738 result[1] = readability.getDistilledArticleHTML(); | 1743 result[1] = readability.getDistilledArticleHTML(); |
| 1744 result[2] = readability.getNextPageLink(); |
| 1739 return result.concat(readability.getImages()); | 1745 return result.concat(readability.getImages()); |
| 1740 }()) | 1746 }()) |
| 1741 | 1747 |
| OLD | NEW |