third_party/readability/js/readability.js - Issue 146843010: Add support for multipage distillation. - Code Review

Chromium Code Reviews

chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out

(167)

My Issues | Starred Open | Closed | All

Side by Side Diff: third_party/readability/js/readability.js

Issue 146843010: Add support for multipage distillation. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: rebase address comments. Created 6 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« components/dom_distiller/core/distiller.h ('K') | « components/dom_distiller/core/task_tracker_unittest.cc ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1	1
cjhopman 2014/02/03 23:56:53: Since this is modified, it needs to contain a notice that you've modified it. I believe that just adding the chromium copyright header is the correct approach. Also, make sure that the README.chromium is updated to reflect the fact that changes have been made to this file.
shashi 2014/02/04 01:39:37: On 2014/02/03 23:56:53, cjhopman wrote:
> Since this is modified, it needs to contain a notice that you've modified it. I
> believe that just adding the chromium copyright header is the correct approach.
>
> Also, make sure that the README.chromium is updated to reflect the fact that
> changes have been made to this file.
Done.
2 var dbg = (typeof console !== 'undefined') ? function(s) {	2 var dbg = (typeof console !== 'undefined') ? function(s) {

3 console.log("Readability: " + s);	3 console.log("Readability: " + s);

4 } : function() {};	4 } : function() {};

5	5

6 /*	6 /*

7 * Readability. An Arc90 Lab Experiment.	7 * Readability. An Arc90 Lab Experiment.

8 * Website: http://lab.arc90.com/experiments/readability	8 * Website: http://lab.arc90.com/experiments/readability

9 * Source: http://code.google.com/p/arc90labs-readability	9 * Source: http://code.google.com/p/arc90labs-readability

10 *	10 *

11 * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission.	11 * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission.

12 *	12 *

13 * Copyright (c) 2010 Arc90 Inc	13 * Copyright (c) 2010 Arc90 Inc

14 * Readability is licensed under the Apache License, Version 2.0.	14 * Readability is licensed under the Apache License, Version 2.0.

15 **/	15 **/

16 var readability = {	16 var readability = {

17 readStyle: "style-newspaper",	17 readStyle: "style-newspaper",

18 readSize: "size-medium",	18 readSize: "size-medium",

19 readMargin: "margin-wide",	19 readMargin: "margin-wide",

20	20

21 distilledHTML: '',	21 distilledHTML: '',

22 distilledArticleContent: null,	22 distilledArticleContent: null,

	23 nextPageLink: '',

23	24

24 version: '1.7.1',	25 version: '1.7.1',

25 iframeLoads: 0,	26 iframeLoads: 0,

26 convertLinksToFootnotes: false,	27 convertLinksToFootnotes: false,

27 reversePageScroll: false, /* If they hold shift and hit space, scroll up */	28 reversePageScroll: false, /* If they hold shift and hit space, scroll up */

28 frameHack: false, /**	29 frameHack: false, /**

29 * The frame hack is to workaround a firefox bug where if you	30 * The frame hack is to workaround a firefox bug where if you

30 * pull content out of a frame and stick it into the parent element, the scrollbar won't appear.	31 * pull content out of a frame and stick it into the parent element, the scrollbar won't appear.

31 * So we fake a scrollbar in the wrapping div.	32 * So we fake a scrollbar in the wrapping div.

32 **/	33 **/

33 biggestFrame: false,	34 biggestFrame: false,

34 flags: 0x1 \| 0x2 \| 0x4, /* Start with all flags set. */	35 flags: 0x1 \| 0x2 \| 0x4, /* Start with all flags set. */

35	36

36 /* constants */	37 /* constants */

37 FLAG_STRIP_UNLIKELYS: 0x1,	38 FLAG_STRIP_UNLIKELYS: 0x1,

38 FLAG_WEIGHT_CLASSES: 0x2,	39 FLAG_WEIGHT_CLASSES: 0x2,

39 FLAG_CLEAN_CONDITIONALLY: 0x4,	40 FLAG_CLEAN_CONDITIONALLY: 0x4,

40	41

41 maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */	42 maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */

42 parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */	43 parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */

43 pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */	44 pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */

44	45

45 /**	46 /**

46 * All of the regular expressions in use within readability.	47 * All of the regular expressions in use within readability.

47 * Defined up here so we don't instantiate them repeatedly in loops.	48 * Defined up here so we don't instantiate them repeatedly in loops.

48 **/	49 **/

49 regexps: {	50 regexps: {

50 unlikelyCandidates: /combx\|comment\|community\|disqus\|extra\|foot\|header\|menu\|remark\|rss\|shoutbox\|sidebar\|sponsor\|ad-break\|agegate\|pagination\|pager\|popup\|tweet\|twitter/i,	51 unlikelyCandidates: /combx\|comment\|community\|disqus\|extra\|foot\|header\|menu\|remark\|rss\|shoutbox\|sidebar\|sponsor\|ad-break\|agegate\|pagination\|pager\|popup\|tweet\|twitter/i,

51 okMaybeItsACandidate: /and\|article\|body\|column\|main\|shadow/i,	52 okMaybeItsACandidate: /and\|article\|body\|column\|main\|shadow/i,

52 positive: /article\|body\|content\|entry\|hentry\|main\|page\|pagination\|post\|text\|blog\|story/i,	53 positive: /article\|body\|content\|entry\|hentry\|main\|page\|pagination\|post\|text\|blog\|story/i,

53 negative: /combx\|comment\|com-\|contact\|foot\|footer\|footnote\|masthead\|media\|meta\|outbrain\|promo\|related\|scroll\|shoutbox\|sidebar\|sponsor\|shopping\|tags\|tool\|widget/i,	54 negative: /combx\|comment\|com-\|contact\|foot\|footer\|footnote\|masthead\|media\|meta\|outbrain\|promo\|related\|scroll\|shoutbox\|sidebar\|sponsor\|shopping\|tags\|tool\|widget/i,

54 extraneous: /print\|archive\|comment\|discuss\|e[\-]?mail\|share\|reply\|all\|login\|sign\|single/i,	55 extraneous: /print\|archive\|comment\|discuss\|e[\-]?mail\|share\|reply\|all\|login\|sign\|single/i,

55 divToPElements: /<(a\|blockquote\|dl\|div\|img\|ol\|p\|pre\|table\|ul)/i,	56 divToPElements: /<(a\|blockquote\|dl\|div\|img\|ol\|p\|pre\|table\|ul)/i,

56 replaceBrs: /(<br[^>]*>[ \n\r\t]*){2,}/gi,	57 replaceBrs: /(<br[^>]*>[ \n\r\t]*){2,}/gi,

57 replaceFonts: /<(\/?)font[^>]*>/gi,	58 replaceFonts: /<(\/?)font[^>]*>/gi,

58 trim: /^\s+\|\s+$/g,	59 trim: /^\s+\|\s+$/g,

59 normalize: /\s{2,}/g,	60 normalize: /\s{2,}/g,

60 killBreaks: /(<br\s*\/?>(\s\|&nbsp;?)*){1,}/g,	61 killBreaks: /(<br\s*\/?>(\s\|&nbsp;?)*){1,}/g,

61 videos: /http:\/\/(www\.)?(youtube\|vimeo)\.com/i,	62 videos: /http:\/\/(www\.)?(youtube\|vimeo)\.com/i,

62 skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?\|^\|edit\|citation needed)\s*$/i,	63 skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?\|^\|edit\|citation needed)\s*$/i,

63 nextLink: /(next\|weiter\|continue\|>([^\\|]\|$)\|»([^\\|]\|$))/i, // Match: next, continue, >, >>, » but not >\|, »\| as those usually mean last.	64 nextLink: /(next\|weiter\|continue\|>([^\\|]\|$)\|»([^\\|]\|$))/i, // Match: next, continue, >, >>, » but not >\|, »\| as those usually mean last.

64 prevLink: /(prev\|earl\|old\|new\|<\|«)/i	65 prevLink: /(prev\|earl\|old\|new\|<\|«)/i

65 },	66 },

66	67

67 /**	68 /**

68 * Runs readability.	69 * Runs readability.

69 *	70 *

70 * Workflow:	71 * Workflow:

71 * 1. Prep the document by removing script tags, css, etc.	72 * 1. Prep the document by removing script tags, css, etc.

72 * 2. Build readability's DOM tree.	73 * 2. Build readability's DOM tree.

73 * 3. Grab the article content from the current dom tree.	74 * 3. Grab the article content from the current dom tree.

74 * 4. Replace the current DOM tree with the new one.	75 * 4. Replace the current DOM tree with the new one.

75 * 5. Read peacefully.	76 * 5. Read peacefully.

76 *	77 *

77 * @return void	78 * @return void

78 **/	79 **/

79 init: function() {	80 init: function() {

80 /* Before we do anything, remove all scripts that are not readability. */	81 /* Before we do anything, remove all scripts that are not readability. */

81 window.onload = window.onunload = function() {};	82 window.onload = window.onunload = function() {};

82	83

83 readability.removeScripts(document);	84 readability.removeScripts(document);

84	85

85 /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */	86 /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */

86 readability.parsedPages[window.location.href.replace(/\/$/, '')] = true;	87 readability.parsedPages[window.location.href.replace(/\/$/, '')] = true;

87	88

88 /* Pull out any possible next page link first */	89 /* Pull out any possible next page link first */

89 var nextPageLink = readability.findNextPageLink(document.body);	90 readability.nextPageLink = readability.findNextPageLink(document.body);

90	91

	92 /* We handle processing of nextPage from C++ set nextPageLink to null */

	93 var nextPageLink = null;

	94

91 readability.prepDocument();	95 readability.prepDocument();

92	96

93 /* Build readability's DOM tree */	97 /* Build readability's DOM tree */

94 var overlay = document.createElement("DIV");	98 var overlay = document.createElement("DIV");

95 var innerDiv = document.createElement("DIV");	99 var innerDiv = document.createElement("DIV");

96 var articleTools = readability.getArticleTools();	100 var articleTools = readability.getArticleTools();

97 var articleTitleText = readability.getArticleTitle();	101 var articleTitleText = readability.getArticleTitle();

98 var articleContent = readability.grabArticle();	102 var articleContent = readability.grabArticle();

99	103

100 if(!articleContent) {	104 if(!articleContent) {

(...skipping 44 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
145 rootWarning.innerHTML = "<em>Readability</em> was intended for use on individual articles and not home pages. " +	149 rootWarning.innerHTML = "<em>Readability</em> was intended for use on individual articles and not home pages. " +

146 "If you'd like to try rendering this page anyway, <a onClick='javascript:document.getElementById(\"readability-warning\").style.display=\"none\";document.getElementById(\"readability-content\").style.display=\"block\";'>click here</a> to continue.";	150 "If you'd like to try rendering this page anyway, <a onClick='javascript:document.getElementById(\"readability-warning\").style.display=\"none\";document.getElementById(\"readability-content\").style.display=\"block\";'>click here</a> to continue.";

147	151

148 innerDiv.insertBefore( rootWarning, articleContent );	152 innerDiv.insertBefore( rootWarning, articleContent );

149 }	153 }

150	154

151 readability.postProcessContent(articleContent);	155 readability.postProcessContent(articleContent);

152	156

153 window.scrollTo(0, 0);	157 window.scrollTo(0, 0);

154	158

155 // TODO(bengr): Remove this assignment of null to nextPageLink when

156 // the processing of the next page link is safe.

157 nextPageLink = null;

158

159 if (nextPageLink) {	159 if (nextPageLink) {

160 /**	160 /**

161 * Append any additional pages after a small timeout so that people	161 * Append any additional pages after a small timeout so that people

162 * can start reading without having to wait for this to finish processing.	162 * can start reading without having to wait for this to finish processing.

163 **/	163 **/

164 window.setTimeout(function() {	164 window.setTimeout(function() {

165 readability.appendNextPage(nextPageLink);	165 readability.appendNextPage(nextPageLink);

166 }, 500);	166 }, 500);

167 }	167 }

168	168

169 /* Smooth scrolling */	169 /* Smooth scrolling */

170 document.onkeydown = function(e) {	170 document.onkeydown = function(e) {

171 var code = (window.event) ? event.keyCode : e.keyCode;	171 var code = (window.event) ? event.keyCode : e.keyCode;

172 if (code === 16) {	172 if (code === 16) {

173 readability.reversePageScroll = true;	173 readability.reversePageScroll = true;

174 return;	174 return;

175 }	175 }

176	176

177 if (code === 32) {	177 if (code === 32) {

178 readability.curScrollStep = 0;	178 readability.curScrollStep = 0;

179 var windowHeight = window.innerHeight ? window.innerHeight : (document.documentElement.clientHeight ? document.documentElement.clientHeight : document.body.clientHeight);	179 var windowHeight = window.innerHeight ? window.innerHeight : (document.documentElement.clientHeight ? document.documentElement.clientHeight : document.body.clientHeight);

180	180

181 if(readability.reversePageScroll) {	181 if(readability.reversePageScroll) {

182 readability.scrollTo(readability.scrollTop(), readability.scrollTop() - (windowHeight - 50), 20, 10);	182 readability.scrollTo(readability.scrollTop(), readability.scrollTop() - (windowHeight - 50), 20, 10);

183 }	183 }

184 else {	184 else {

185 readability.scrollTo(readability.scrollTop(), readability.scrollTop() + (windowHeight - 50), 20, 10);	185 readability.scrollTo(readability.scrollTop(), readability.scrollTop() + (windowHeight - 50), 20, 10);

186 }	186 }

187	187

188 return false;	188 return false;

189 }	189 }

190 };	190 };

191	191

192 document.onkeyup = function(e) {	192 document.onkeyup = function(e) {

193 var code = (window.event) ? event.keyCode : e.keyCode;	193 var code = (window.event) ? event.keyCode : e.keyCode;

194 if (code === 16) {	194 if (code === 16) {

195 readability.reversePageScroll = false;	195 readability.reversePageScroll = false;

196 return;	196 return;

197 }	197 }

198 };	198 };

199 },	199 },

200	200

201 /**	201 /**

202 * Run any post-process modifications to article content as necessary.	202 * Run any post-process modifications to article content as necessary.

203 *	203 *

204 * @param Element	204 * @param Element

205 * @return void	205 * @return void

206 **/	206 **/

207 postProcessContent: function(articleContent) {	207 postProcessContent: function(articleContent) {

208 if(readability.convertLinksToFootnotes && !window.location.href.match(/wikipedia\.org/g)) {	208 if(readability.convertLinksToFootnotes && !window.location.href.match(/wikipedia\.org/g)) {

209 readability.addFootnotes(articleContent);	209 readability.addFootnotes(articleContent);

210 }	210 }

211	211

212 readability.fixImageFloats(articleContent);	212 readability.fixImageFloats(articleContent);

213 },	213 },

214	214

215 /**	215 /**

216 * Some content ends up looking ugly if the image is too large to be floated.	216 * Some content ends up looking ugly if the image is too large to be floated.

217 * If the image is wider than a threshold (currently 55%), no longer float it,	217 * If the image is wider than a threshold (currently 55%), no longer float it,

218 * center it instead.	218 * center it instead.

219 *	219 *

220 * @param Element	220 * @param Element

221 * @return void	221 * @return void

222 **/	222 **/

223 fixImageFloats: function (articleContent) {	223 fixImageFloats: function (articleContent) {

224 var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0.55,	224 var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0.55,

225 images = articleContent.getElementsByTagName('img');	225 images = articleContent.getElementsByTagName('img');

226	226

227 for(var i=0, il = images.length; i < il; i+=1) {	227 for(var i=0, il = images.length; i < il; i+=1) {

228 var image = images[i];	228 var image = images[i];

229	229

230 if(image.offsetWidth > imageWidthThreshold) {	230 if(image.offsetWidth > imageWidthThreshold) {

231 image.className += " blockImage";	231 image.className += " blockImage";

232 }	232 }

233 }	233 }

234 },	234 },

235	235

236 /**	236 /**

237 * Get the article tools Element that has buttons like reload, print.	237 * Get the article tools Element that has buttons like reload, print.

238 *	238 *

239 * @return void	239 * @return void

240 **/	240 **/

241 getArticleTools: function () {	241 getArticleTools: function () {

242 var articleTools = document.createElement("DIV");	242 var articleTools = document.createElement("DIV");

243	243

244 articleTools.id = "readTools";	244 articleTools.id = "readTools";

245 articleTools.innerHTML =	245 articleTools.innerHTML =

246 "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" +	246 "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" +

247 "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" +	247 "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" +

248 "<a href='#' onclick='readability.emailBox(); return false;' title='Email page' id='email-page'>Email Page</a>";	248 "<a href='#' onclick='readability.emailBox(); return false;' title='Email page' id='email-page'>Email Page</a>";

249	249

250 return articleTools;	250 return articleTools;

251 },	251 },

252	252

253 /**	253 /**

254 * returns the suggested direction of the string	254 * returns the suggested direction of the string

255 *	255 *

256 * @return "rtl" \|\| "ltr"	256 * @return "rtl" \|\| "ltr"

257 **/	257 **/

258 getSuggestedDirection: function(text) {	258 getSuggestedDirection: function(text) {

259 function sanitizeText() {	259 function sanitizeText() {

260 return text.replace(/@\w+/, "");	260 return text.replace(/@\w+/, "");

261 }	261 }

262	262

263 function countMatches(match) {	263 function countMatches(match) {

264 var matches = text.match(new RegExp(match, "g"));	264 var matches = text.match(new RegExp(match, "g"));

265 return matches !== null ? matches.length : 0;	265 return matches !== null ? matches.length : 0;

266 }	266 }

267	267

268 function isRTL() {	268 function isRTL() {

269 var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]");	269 var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]");

270 var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]");	270 var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]");

271	271

272 // if 20% of chars are Hebrew or Arabic then direction is rtl	272 // if 20% of chars are Hebrew or Arabic then direction is rtl

273 return (count_heb + count_arb) * 100 / text.length > 20;	273 return (count_heb + count_arb) * 100 / text.length > 20;

274 }	274 }

275	275

276 text = sanitizeText(text);	276 text = sanitizeText(text);

277 return isRTL() ? "rtl" : "ltr";	277 return isRTL() ? "rtl" : "ltr";

278 },	278 },

279	279

280 /**	280 /**

281 * Get the article title as an H1.	281 * Get the article title as an H1.

282 *	282 *

283 * @return void	283 * @return void

284 **/	284 **/

285 getArticleTitle: function () {	285 getArticleTitle: function () {

286 var curTitle = "",	286 var curTitle = "",

287 origTitle = "";	287 origTitle = "";

288	288

289 try {	289 try {

290 curTitle = origTitle = document.title;	290 curTitle = origTitle = document.title;

291 if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */	291 if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */

292 curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]);	292 curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]);

293 }	293 }

294 }	294 }

295 catch(e) {}	295 catch(e) {}

296	296

297 if(curTitle.match(/ [\\|\-] /))	297 if(curTitle.match(/ [\\|\-] /))

298 {	298 {

299 curTitle = origTitle.replace(/(.*)[\\|\-] .*/gi,'$1');	299 curTitle = origTitle.replace(/(.*)[\\|\-] .*/gi,'$1');

300	300

301 if(curTitle.split(' ').length < 3) {	301 if(curTitle.split(' ').length < 3) {

302 curTitle = origTitle.replace(/[^\\|\-]*[\\|\-](.*)/gi,'$1');	302 curTitle = origTitle.replace(/[^\\|\-]*[\\|\-](.*)/gi,'$1');

303 }	303 }

304 }	304 }

305 else if(curTitle.indexOf(': ') !== -1)	305 else if(curTitle.indexOf(': ') !== -1)

306 {	306 {

307 curTitle = origTitle.replace(/.*:(.*)/gi, '$1');	307 curTitle = origTitle.replace(/.*:(.*)/gi, '$1');

308	308

309 if(curTitle.split(' ').length < 3) {	309 if(curTitle.split(' ').length < 3) {

310 curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1');	310 curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1');

(...skipping 12 matching lines...) Expand all Loading...
323	323

324 if(curTitle.split(' ').length <= 4) {	324 if(curTitle.split(' ').length <= 4) {

325 curTitle = origTitle;	325 curTitle = origTitle;

326 }	326 }

327 return curTitle;	327 return curTitle;

328 },	328 },

329	329

330 /**	330 /**

331 * Prepare the HTML document for readability to scrape it.	331 * Prepare the HTML document for readability to scrape it.

332 * This includes things like stripping javascript, CSS, and handling terrible markup.	332 * This includes things like stripping javascript, CSS, and handling terrible markup.

333 *	333 *

334 * @return void	334 * @return void

335 **/	335 **/

336 prepDocument: function () {	336 prepDocument: function () {

337 /**	337 /**

338 * In some cases a body element can't be found (if the HTML is totally hosed for example)	338 * In some cases a body element can't be found (if the HTML is totally hosed for example)

339 * so we create a new body node and append it to the document.	339 * so we create a new body node and append it to the document.

340 */	340 */

341 if(document.body === null)	341 if(document.body === null)

342 {	342 {

343 var body = document.createElement("body");	343 var body = document.createElement("body");

344 try {	344 try {

345 document.body = body;	345 document.body = body;

346 }	346 }

347 catch(e) {	347 catch(e) {

348 document.documentElement.appendChild(body);	348 document.documentElement.appendChild(body);

349 dbg(e);	349 dbg(e);

350 }	350 }

351 }	351 }

352	352

353 document.body.id = "readabilityBody";	353 document.body.id = "readabilityBody";

354	354

355 var frames = document.getElementsByTagName('frame');	355 var frames = document.getElementsByTagName('frame');

(...skipping 11 matching lines...) Expand all Loading...
367 canAccessFrame = true;	367 canAccessFrame = true;

368 }	368 }

369 catch(eFrames) {	369 catch(eFrames) {

370 dbg(eFrames);	370 dbg(eFrames);

371 }	371 }

372	372

373 if(frameSize > biggestFrameSize) {	373 if(frameSize > biggestFrameSize) {

374 biggestFrameSize = frameSize;	374 biggestFrameSize = frameSize;

375 readability.biggestFrame = frames[frameIndex];	375 readability.biggestFrame = frames[frameIndex];

376 }	376 }

377	377

378 if(canAccessFrame && frameSize > bestFrameSize)	378 if(canAccessFrame && frameSize > bestFrameSize)

379 {	379 {

380 readability.frameHack = true;	380 readability.frameHack = true;

381	381

382 bestFrame = frames[frameIndex];	382 bestFrame = frames[frameIndex];

383 bestFrameSize = frameSize;	383 bestFrameSize = frameSize;

384 }	384 }

385 }	385 }

386	386

387 if(bestFrame)	387 if(bestFrame)

388 {	388 {

389 var newBody = document.createElement('body');	389 var newBody = document.createElement('body');

390 readability.moveNodeInnards(bestFrame.contentWindow.document.body, newBody);	390 readability.moveNodeInnards(bestFrame.contentWindow.document.body, newBody);

391 newBody.style.overflow = 'scroll';	391 newBody.style.overflow = 'scroll';

392 document.body = newBody;	392 document.body = newBody;

393	393

394 var frameset = document.getElementsByTagName('frameset')[0];	394 var frameset = document.getElementsByTagName('frameset')[0];

395 if(frameset) {	395 if(frameset) {

396 frameset.parentNode.removeChild(frameset); }	396 frameset.parentNode.removeChild(frameset); }

397 }	397 }

398 }	398 }

399	399

400 /* Remove all stylesheets */	400 /* Remove all stylesheets */

401 for (var k=0;k < document.styleSheets.length; k+=1) {	401 for (var k=0;k < document.styleSheets.length; k+=1) {

402 if (document.styleSheets[k].href !== null && document.styleSheets[k].href.lastIndexOf("readability") === -1) {	402 if (document.styleSheets[k].href !== null && document.styleSheets[k].href.lastIndexOf("readability") === -1) {

403 document.styleSheets[k].disabled = true;	403 document.styleSheets[k].disabled = true;

(...skipping 44 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
448 readability.cleanConditionally(articleContent, "table");	448 readability.cleanConditionally(articleContent, "table");

449 readability.cleanConditionally(articleContent, "ul");	449 readability.cleanConditionally(articleContent, "ul");

450 readability.cleanConditionally(articleContent, "div");	450 readability.cleanConditionally(articleContent, "div");

451	451

452 /* Remove extra paragraphs */	452 /* Remove extra paragraphs */

453 var articleParagraphs = articleContent.getElementsByTagName('p');	453 var articleParagraphs = articleContent.getElementsByTagName('p');

454 for(var i = articleParagraphs.length-1; i >= 0; i-=1) {	454 for(var i = articleParagraphs.length-1; i >= 0; i-=1) {

455 var imgCount = articleParagraphs[i].getElementsByTagName('img').length;	455 var imgCount = articleParagraphs[i].getElementsByTagName('img').length;

456 var embedCount = articleParagraphs[i].getElementsByTagName('embed').length;	456 var embedCount = articleParagraphs[i].getElementsByTagName('embed').length;

457 var objectCount = articleParagraphs[i].getElementsByTagName('object').length;	457 var objectCount = articleParagraphs[i].getElementsByTagName('object').length;

458	458

459 if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readability.getInnerText(articleParagraphs[i], false) === '') {	459 if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readability.getInnerText(articleParagraphs[i], false) === '') {

460 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]);	460 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]);

461 }	461 }

462 }	462 }

463	463

464 try {	464 try {

465 readability.replaceBrsWithPs(articleContent);	465 readability.replaceBrsWithPs(articleContent);

466 }	466 }

467 catch (e) {	467 catch (e) {

468 dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " + e);	468 dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " + e);

469 }	469 }

470 },	470 },

471	471

472 /**	472 /**

473 * Initialize a node with the readability object. Also checks the	473 * Initialize a node with the readability object. Also checks the

474 * className/id for special names to add to its score.	474 * className/id for special names to add to its score.

475 *	475 *

476 * @param Element	476 * @param Element

477 * @return void	477 * @return void

478 **/	478 **/

479 initializeNode: function (node) {	479 initializeNode: function (node) {

480 node.readability = {"contentScore": 0};	480 node.readability = {"contentScore": 0};

481	481

482 switch(node.tagName) {	482 switch(node.tagName) {

483 case 'DIV':	483 case 'DIV':

484 node.readability.contentScore += 5;	484 node.readability.contentScore += 5;

485 break;	485 break;

486	486

487 case 'PRE':	487 case 'PRE':

488 case 'TD':	488 case 'TD':

489 case 'BLOCKQUOTE':	489 case 'BLOCKQUOTE':

490 node.readability.contentScore += 3;	490 node.readability.contentScore += 3;

491 break;	491 break;

492	492

493 case 'ADDRESS':	493 case 'ADDRESS':

494 case 'OL':	494 case 'OL':

495 case 'UL':	495 case 'UL':

496 case 'DL':	496 case 'DL':

497 case 'DD':	497 case 'DD':

498 case 'DT':	498 case 'DT':

499 case 'LI':	499 case 'LI':

500 case 'FORM':	500 case 'FORM':

501 node.readability.contentScore -= 3;	501 node.readability.contentScore -= 3;

502 break;	502 break;

503	503

504 case 'H1':	504 case 'H1':

505 case 'H2':	505 case 'H2':

506 case 'H3':	506 case 'H3':

507 case 'H4':	507 case 'H4':

508 case 'H5':	508 case 'H5':

509 case 'H6':	509 case 'H6':

510 case 'TH':	510 case 'TH':

511 node.readability.contentScore -= 5;	511 node.readability.contentScore -= 5;

512 break;	512 break;

513 }	513 }

514	514

515 node.readability.contentScore += readability.getClassWeight(node);	515 node.readability.contentScore += readability.getClassWeight(node);

516 },	516 },

517	517

518 /***	518 /***

519 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is	519 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is

520 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.	520 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.

521 *	521 *

522 * @param page a document to run upon. Needs to be a full document, complete with body.	522 * @param page a document to run upon. Needs to be a full document, complete with body.

523 * @return Element	523 * @return Element

524 **/	524 **/

525 grabArticle: function (pageToClone) {	525 grabArticle: function (pageToClone) {

526 var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS),	526 var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS),

527 isPaging = (page !== null) ? true: false;	527 isPaging = (page !== null) ? true: false;

528	528

529 var page = null;	529 var page = null;

530 // Never work on the actual page.	530 // Never work on the actual page.

531 if (isPaging) {	531 if (isPaging) {

532 page = document.body.cloneNode(true);	532 page = document.body.cloneNode(true);

533 } else {	533 } else {

534 page = pageToClone.cloneNode(true);	534 page = pageToClone.cloneNode(true);

535 }	535 }

536	536

537 var allElements = page.getElementsByTagName('*');	537 var allElements = page.getElementsByTagName('*');

538	538

539 /**	539 /**

540 * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs	540 * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs

541 * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)	541 * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)

542 *	542 *

543 * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5	543 * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5

544 * TODO: Shouldn't this be a reverse traversal?	544 * TODO: Shouldn't this be a reverse traversal?

545 **/	545 **/

546 var node = null;	546 var node = null;

547 var nodesToScore = [];	547 var nodesToScore = [];

548 for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) {	548 for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) {

549 /* Remove unlikely candidates */	549 /* Remove unlikely candidates */

550 if (stripUnlikelyCandidates) {	550 if (stripUnlikelyCandidates) {

551 var unlikelyMatchString = node.className + node.id;	551 var unlikelyMatchString = node.className + node.id;

552 if (	552 if (

553 (	553 (

554 unlikelyMatchString.search(readability.regexps.unlikelyC andidates) !== -1 &&	554 unlikelyMatchString.search(readability.regexps.unlikelyC andidates) !== -1 &&

555 unlikelyMatchString.search(readability.regexps.okMaybeIt sACandidate) === -1 &&	555 unlikelyMatchString.search(readability.regexps.okMaybeIt sACandidate) === -1 &&

556 node.tagName !== "BODY"	556 node.tagName !== "BODY"

557 )	557 )

558 )	558 )

559 {	559 {

560 dbg("Removing unlikely candidate - " + unlikelyMatchString);	560 dbg("Removing unlikely candidate - " + unlikelyMatchString);

561 node.parentNode.removeChild(node);	561 node.parentNode.removeChild(node);

562 nodeIndex-=1;	562 nodeIndex-=1;

563 continue;	563 continue;

564 }	564 }

565 }	565 }

566	566

567 if (node.tagName === "P" \|\| node.tagName === "TD" \|\| node.tagName == = "PRE") {	567 if (node.tagName === "P" \|\| node.tagName === "TD" \|\| node.tagName == = "PRE") {

568 nodesToScore[nodesToScore.length] = node;	568 nodesToScore[nodesToScore.length] = node;

569 }	569 }

570	570

571 /* Turn all divs that don't have children block level elements into p's */	571 /* Turn all divs that don't have children block level elements into p's */

572 if (node.tagName === "DIV") {	572 if (node.tagName === "DIV") {

573 if (node.innerHTML.search(readability.regexps.divToPElements) == = -1) {	573 if (node.innerHTML.search(readability.regexps.divToPElements) == = -1) {

574 var newNode = document.createElement('p');	574 var newNode = document.createElement('p');

(...skipping 16 matching lines...) Expand all Loading...
591 if(childNode.nodeType === 3) { // Node.TEXT_NODE	591 if(childNode.nodeType === 3) { // Node.TEXT_NODE

592 var p = document.createElement('p');	592 var p = document.createElement('p');

593 var t = document.createTextNode(childNode.nodeValue) ;	593 var t = document.createTextNode(childNode.nodeValue) ;

594 p.appendChild(t);	594 p.appendChild(t);

595 p.style.display = 'inline';	595 p.style.display = 'inline';

596 p.className = 'readability-styled';	596 p.className = 'readability-styled';

597 childNode.parentNode.replaceChild(p, childNode);	597 childNode.parentNode.replaceChild(p, childNode);

598 }	598 }

599 }	599 }

600 }	600 }

601 }	601 }

602 }	602 }

603	603

604 /**	604 /**

605 * Loop through all paragraphs, and assign a score to them based on how content-y they look.	605 * Loop through all paragraphs, and assign a score to them based on how content-y they look.

606 * Then add their score to their parent node.	606 * Then add their score to their parent node.

607 *	607 *

608 * A score is determined by things like number of commas, class names, e tc. Maybe eventually link density.	608 * A score is determined by things like number of commas, class names, e tc. Maybe eventually link density.

609 **/	609 **/

610 var candidates = [];	610 var candidates = [];

611 for (var pt=0; pt < nodesToScore.length; pt+=1) {	611 for (var pt=0; pt < nodesToScore.length; pt+=1) {

(...skipping 21 matching lines...) Expand all Loading...
633 candidates.push(grandParentNode);	633 candidates.push(grandParentNode);

634 }	634 }

635	635

636 var contentScore = 0;	636 var contentScore = 0;

637	637

638 /* Add a point for the paragraph itself as a base. */	638 /* Add a point for the paragraph itself as a base. */

639 contentScore+=1;	639 contentScore+=1;

640	640

641 /* Add points for any commas within this paragraph */	641 /* Add points for any commas within this paragraph */

642 contentScore += innerText.split(',').length;	642 contentScore += innerText.split(',').length;

643	643

644 /* For every 100 characters in this paragraph, add another point. Up to 3 points. */	644 /* For every 100 characters in this paragraph, add another point. Up to 3 points. */

645 contentScore += Math.min(Math.floor(innerText.length / 100), 3);	645 contentScore += Math.min(Math.floor(innerText.length / 100), 3);

646	646

647 /* Add the score to the parent. The grandparent gets half. */	647 /* Add the score to the parent. The grandparent gets half. */

648 parentNode.readability.contentScore += contentScore;	648 parentNode.readability.contentScore += contentScore;

649	649

650 if(grandParentNode) {	650 if(grandParentNode) {

651 grandParentNode.readability.contentScore += contentScore/2;	651 grandParentNode.readability.contentScore += contentScore/2;

652 }	652 }

653 }	653 }

654	654

655 /**	655 /**

656 * After we've calculated scores, loop through all of the possible candi date nodes we found	656 * After we've calculated scores, loop through all of the possible candi date nodes we found

657 * and find the one with the highest score.	657 * and find the one with the highest score.

658 **/	658 **/

659 var topCandidate = null;	659 var topCandidate = null;

660 for(var c=0, cl=candidates.length; c < cl; c+=1)	660 for(var c=0, cl=candidates.length; c < cl; c+=1)

661 {	661 {

(...skipping 56 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
718 var contentBonus = 0;	718 var contentBonus = 0;

719 /* Give a bonus if sibling nodes and top candidates have the example same classname */	719 /* Give a bonus if sibling nodes and top candidates have the example same classname */

720 if(siblingNode.className === topCandidate.className && topCandidate. className !== "") {	720 if(siblingNode.className === topCandidate.className && topCandidate. className !== "") {

721 contentBonus += topCandidate.readability.contentScore * 0.2;	721 contentBonus += topCandidate.readability.contentScore * 0.2;

722 }	722 }

723	723

724 if(typeof siblingNode.readability !== 'undefined' && (siblingNode.re adability.contentScore+contentBonus) >= siblingScoreThreshold)	724 if(typeof siblingNode.readability !== 'undefined' && (siblingNode.re adability.contentScore+contentBonus) >= siblingScoreThreshold)

725 {	725 {

726 append = true;	726 append = true;

727 }	727 }

728	728

729 if(siblingNode.nodeName === "P") {	729 if(siblingNode.nodeName === "P") {

730 var linkDensity = readability.getLinkDensity(siblingNode);	730 var linkDensity = readability.getLinkDensity(siblingNode);

731 var nodeContent = readability.getInnerText(siblingNode);	731 var nodeContent = readability.getInnerText(siblingNode);

732 var nodeLength = nodeContent.length;	732 var nodeLength = nodeContent.length;

733	733

734 if(nodeLength > 80 && linkDensity < 0.25)	734 if(nodeLength > 80 && linkDensity < 0.25)

735 {	735 {

736 append = true;	736 append = true;

737 }	737 }

738 else if(nodeLength < 80 && linkDensity === 0 && nodeContent.sear ch(/\.( \|$)/) !== -1)	738 else if(nodeLength < 80 && linkDensity === 0 && nodeContent.sear ch(/\.( \|$)/) !== -1)

739 {	739 {

740 append = true;	740 append = true;

741 }	741 }

742 }	742 }

743	743

744 if(append) {	744 if(append) {

745 dbg("Appending node: " + siblingNode);	745 dbg("Appending node: " + siblingNode);

746	746

747 var nodeToAppend = null;	747 var nodeToAppend = null;

748 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P ") {	748 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P ") {

749 /* We have a node that isn't a common block level element, l ike a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */	749 /* We have a node that isn't a common block level element, l ike a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */

750	750

751 dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');	751 dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');

752 nodeToAppend = document.createElement("DIV");	752 nodeToAppend = document.createElement("DIV");

753 try {	753 try {

754 nodeToAppend.id = siblingNode.id;	754 nodeToAppend.id = siblingNode.id;

755 readability.moveNodeInnards(siblingNode, nodeToAppend);	755 readability.moveNodeInnards(siblingNode, nodeToAppend);

756 }	756 }

757 catch(er) {	757 catch(er) {

758 dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");	758 dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");

759 nodeToAppend = siblingNode;	759 nodeToAppend = siblingNode;

760 s-=1;	760 s-=1;

761 sl-=1;	761 sl-=1;

762 }	762 }

763 } else {	763 } else {

764 nodeToAppend = siblingNode;	764 nodeToAppend = siblingNode;

765 s-=1;	765 s-=1;

766 sl-=1;	766 sl-=1;

767 }	767 }

768	768

769 /* To ensure a node does not interfere with readability styles, remove its classnames */	769 /* To ensure a node does not interfere with readability styles, remove its classnames */

770 nodeToAppend.className = "";	770 nodeToAppend.className = "";

771	771

772 /* Append sibling and subtract from our list because it removes the node when you append to another node */	772 /* Append sibling and subtract from our list because it removes the node when you append to another node */

773 articleContent.appendChild(nodeToAppend);	773 articleContent.appendChild(nodeToAppend);

774 }	774 }

775 }	775 }

776	776

777 /**	777 /**

778 * So we have all of the content that we need. Now we clean it up for pr esentation.	778 * So we have all of the content that we need. Now we clean it up for pr esentation.

779 **/	779 **/

780 readability.distilledArticleContent = articleContent.cloneNode(true);	780 readability.distilledArticleContent = articleContent.cloneNode(true);

781 //readability.prepArticle(articleContent);	781 //readability.prepArticle(articleContent);

782	782

783 if (readability.curPageNum === 1) {	783 if (readability.curPageNum === 1) {

784 var newNode = document.createElement('div');	784 var newNode = document.createElement('div');

785 newNode.id = "readability-page-1";	785 newNode.id = "readability-page-1";

786 newNode.setAttribute("class", "page");	786 newNode.setAttribute("class", "page");

787 readability.moveNodeInnards(articleContent, newNode);	787 readability.moveNodeInnards(articleContent, newNode);

788 articleContent.appendChild(newNode);	788 articleContent.appendChild(newNode);

789 }	789 }

790	790

791 /**	791 /**

792 * Now that we've gone through the full algorithm, check to see if we go t any meaningful content.	792 * Now that we've gone through the full algorithm, check to see if we go t any meaningful content.

793 * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher	793 * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher

794 * likelihood of finding the content, and the sieve approach gives us a higher likelihood of	794 * likelihood of finding the content, and the sieve approach gives us a higher likelihood of

795 * finding the -right- content.	795 * finding the -right- content.

796 **/	796 **/

797 if(readability.getInnerText(articleContent, false).length < 250) {	797 if(readability.getInnerText(articleContent, false).length < 250) {

798 if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) {	798 if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) {

799 readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS);	799 readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS);

800 return readability.grabArticle(document.body);	800 return readability.grabArticle(document.body);

801 }	801 }

802 else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {	802 else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {

803 readability.removeFlag(readability.FLAG_WEIGHT_CLASSES);	803 readability.removeFlag(readability.FLAG_WEIGHT_CLASSES);

804 return readability.grabArticle(document.body);	804 return readability.grabArticle(document.body);

805 }	805 }

806 else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONAL LY)) {	806 else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONAL LY)) {

807 readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY);	807 readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY);

808 return readability.grabArticle(document.body);	808 return readability.grabArticle(document.body);

809 } else {	809 } else {

810 return null;	810 return null;

811 }	811 }

812 }	812 }

813	813

814 return articleContent;	814 return articleContent;

815 },	815 },

816	816

817 /**	817 /**

818 * Removes script tags from the document.	818 * Removes script tags from the document.

819 *	819 *

820 * @param Element	820 * @param Element

821 **/	821 **/

822 removeScripts: function (doc) {	822 removeScripts: function (doc) {

823 var scripts = doc.getElementsByTagName('script');	823 var scripts = doc.getElementsByTagName('script');

824 for(var i = scripts.length-1; i >= 0; i-=1)	824 for(var i = scripts.length-1; i >= 0; i-=1)

825 {	825 {

826 if(typeof(scripts[i].src) === "undefined" \|\| (scripts[i].src.indexOf ('readability') === -1 && scripts[i].src.indexOf('typekit') === -1))	826 if(typeof(scripts[i].src) === "undefined" \|\| (scripts[i].src.indexOf ('readability') === -1 && scripts[i].src.indexOf('typekit') === -1))

827 {	827 {

828 scripts[i].nodeValue="";	828 scripts[i].nodeValue="";

829 scripts[i].removeAttribute('src');	829 scripts[i].removeAttribute('src');

830 if (scripts[i].parentNode) {	830 if (scripts[i].parentNode) {

831 scripts[i].parentNode.removeChild(scripts[i]);	831 scripts[i].parentNode.removeChild(scripts[i]);

832 }	832 }

833 }	833 }

834 }	834 }

835 },	835 },

836	836

837 /**	837 /**

838 * Get the inner text of a node - cross browser compatibly.	838 * Get the inner text of a node - cross browser compatibly.

839 * This also strips out any excess whitespace to be found.	839 * This also strips out any excess whitespace to be found.

840 *	840 *

841 * @param Element	841 * @param Element

842 * @return string	842 * @return string

843 **/	843 **/

844 getInnerText: function (e, normalizeSpaces) {	844 getInnerText: function (e, normalizeSpaces) {

845 var textContent = "";	845 var textContent = "";

846	846

(...skipping 42 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
889	889

890 // Remove any root styles, if we're able.	890 // Remove any root styles, if we're able.

891 if(typeof e.removeAttribute === 'function' && e.className !== 'readabili ty-styled') {	891 if(typeof e.removeAttribute === 'function' && e.className !== 'readabili ty-styled') {

892 e.removeAttribute('style'); }	892 e.removeAttribute('style'); }

893	893

894 // Go until there are no more child nodes	894 // Go until there are no more child nodes

895 while ( cur !== null ) {	895 while ( cur !== null ) {

896 if ( cur.nodeType === 1 ) {	896 if ( cur.nodeType === 1 ) {

897 // Remove style attribute(s) :	897 // Remove style attribute(s) :

898 if(cur.className !== "readability-styled") {	898 if(cur.className !== "readability-styled") {

899 cur.removeAttribute("style");	899 cur.removeAttribute("style");

900 }	900 }

901 readability.cleanStyles( cur );	901 readability.cleanStyles( cur );

902 }	902 }

903 cur = cur.nextSibling;	903 cur = cur.nextSibling;

904 }	904 }

905 },	905 },

906	906

907 /**	907 /**

908 * Get the density of links as a percentage of the content	908 * Get the density of links as a percentage of the content

909 * This is the amount of text that is inside a link divided by the total text in the node.	909 * This is the amount of text that is inside a link divided by the total text in the node.

910 *	910 *

911 * @param Element	911 * @param Element

912 * @return number (float)	912 * @return number (float)

913 **/	913 **/

914 getLinkDensity: function (e) {	914 getLinkDensity: function (e) {

915 var links = e.getElementsByTagName("a");	915 var links = e.getElementsByTagName("a");

916 var textLength = readability.getInnerText(e).length;	916 var textLength = readability.getInnerText(e).length;

917 var linkLength = 0;	917 var linkLength = 0;

918 for(var i=0, il=links.length; i<il;i+=1)	918 for(var i=0, il=links.length; i<il;i+=1)

919 {	919 {

920 linkLength += readability.getInnerText(links[i]).length;	920 linkLength += readability.getInnerText(links[i]).length;

921 }	921 }

922	922

923 return linkLength / textLength;	923 return linkLength / textLength;

924 },	924 },

925	925

926 /**	926 /**

927 * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.	927 * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.

928 *	928 *

929 * @author Dan Lacy	929 * @author Dan Lacy

930 * @return string the base url	930 * @return string the base url

931 **/	931 **/

932 findBaseUrl: function () {	932 findBaseUrl: function () {

933 var noUrlParams = window.location.pathname.split("?")[0],	933 var noUrlParams = window.location.pathname.split("?")[0],

934 urlSlashes = noUrlParams.split("/").reverse(),	934 urlSlashes = noUrlParams.split("/").reverse(),

935 cleanedSegments = [],	935 cleanedSegments = [],

936 possibleType = "";	936 possibleType = "";

937	937

938 for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) {	938 for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) {

939 var segment = urlSlashes[i];	939 var segment = urlSlashes[i];

940	940

941 // Split off and save anything that looks like a file type.	941 // Split off and save anything that looks like a file type.

942 if (segment.indexOf(".") !== -1) {	942 if (segment.indexOf(".") !== -1) {

943 possibleType = segment.split(".")[1];	943 possibleType = segment.split(".")[1];

944	944

945 /* If the type isn't alpha-only, it's probably not actually a fi le extension. */	945 /* If the type isn't alpha-only, it's probably not actually a fi le extension. */

946 if(!possibleType.match(/[^a-zA-Z]/)) {	946 if(!possibleType.match(/[^a-zA-Z]/)) {

947 segment = segment.split(".")[0];	947 segment = segment.split(".")[0];

948 }	948 }

949 }	949 }

950	950

951 /**	951 /**

952 * EW-CMS specific segment replacement. Ugly.	952 * EW-CMS specific segment replacement. Ugly.

953 * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.htm l	953 * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.htm l

954 **/	954 **/

955 if(segment.indexOf(',00') !== -1) {	955 if(segment.indexOf(',00') !== -1) {

956 segment = segment.replace(',00', '');	956 segment = segment.replace(',00', '');

957 }	957 }

958	958

959 // If our first or second segment has anything looking like a page n umber, remove it.	959 // If our first or second segment has anything looking like a page n umber, remove it.

960 if (segment.match(/((_\|-)?p[a-z]*\|(_\|-))[0-9]{1,2}$/i) && ((i === 1) \|\| (i === 0))) {	960 if (segment.match(/((_\|-)?p[a-z]*\|(_\|-))[0-9]{1,2}$/i) && ((i === 1) \|\| (i === 0))) {

961 segment = segment.replace(/((_\|-)?p[a-z]*\|(_\|-))[0-9]{1,2}$/i, " ");	961 segment = segment.replace(/((_\|-)?p[a-z]*\|(_\|-))[0-9]{1,2}$/i, " ");

962 }	962 }

963	963

964	964

965 var del = false;	965 var del = false;

966	966

967 /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */	967 /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */

968 if (i < 2 && segment.match(/^\d{1,2}$/)) {	968 if (i < 2 && segment.match(/^\d{1,2}$/)) {

969 del = true;	969 del = true;

970 }	970 }

971	971

972 /* If this is the first segment and it's just "index", remove it. */	972 /* If this is the first segment and it's just "index", remove it. */

973 if(i === 0 && segment.toLowerCase() === "index") {	973 if(i === 0 && segment.toLowerCase() === "index") {

974 del = true;	974 del = true;

975 }	975 }

976	976

977	977

978 /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */	978 /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */

979 if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) {	979 if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) {

980 del = true;	980 del = true;

981 }	981 }

982	982

983 /* If it's not marked for deletion, push it to cleanedSegments. */	983 /* If it's not marked for deletion, push it to cleanedSegments. */

984 if (!del) {	984 if (!del) {

985 cleanedSegments.push(segment);	985 cleanedSegments.push(segment);

986 }	986 }

987 }	987 }

988	988

989 // This is our final, cleaned, base article URL.	989 // This is our final, cleaned, base article URL.

990 return window.location.protocol + "//" + window.location.host + cleanedS egments.reverse().join("/");	990 return window.location.protocol + "//" + window.location.host + cleanedS egments.reverse().join("/");

991 },	991 },

992	992

993 /**	993 /**

994 * Look for any paging links that may occur within the document.	994 * Look for any paging links that may occur within the document.

995 *	995 *

996 * @param body	996 * @param body

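997 * @return string the href of the most likely next page, or null if none was found	997 * @return string the href of the most likely next page, or null if none was found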

998 **/	998 **/

999 findNextPageLink: function (elem) {	999 findNextPageLink: function (elem) {

1000 var possiblePages = {},	1000 var possiblePages = {},

1001 allLinks = elem.getElementsByTagName('a'),	1001 allLinks = elem.getElementsByTagName('a'),

1002 articleBaseUrl = readability.findBaseUrl();	1002 articleBaseUrl = readability.findBaseUrl();

1003	1003

1004 /**	1004 /**

1005 * Loop through all links, looking for hints that they may be next-page links.	1005 * Loop through all links, looking for hints that they may be next-page links.

1006 * Things like having "page" in their textContent, className or id, or b eing a child	1006 * Things like having "page" in their textContent, className or id, or b eing a child

1007 * of a node with a page-y className or id.	1007 * of a node with a page-y className or id.

1008 *	1008 *

1009 * Also possible: levenshtein distance? longest common subsequence?	1009 * Also possible: levenshtein distance? longest common subsequence?

1010 *	1010 *

1011 * After we do that, assign each page a score, and	1011 * After we do that, assign each page a score, and

1012 **/	1012 **/

1013 for(var i = 0, il = allLinks.length; i < il; i+=1) {	1013 for(var i = 0, il = allLinks.length; i < il; i+=1) {

1014 var link = allLinks[i],	1014 var link = allLinks[i],

1015 linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, ' ');	1015 linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, ' ');

1016	1016

1017 /* If we've already seen this page, ignore it */	1017 /* If we've already seen this page, ignore it */

1018 if(linkHref === "" \|\| linkHref === articleBaseUrl \|\| linkHref === wi ndow.location.href \|\| linkHref in readability.parsedPages) {	1018 if(linkHref === "" \|\| linkHref === articleBaseUrl \|\| linkHref === wi ndow.location.href \|\| linkHref in readability.parsedPages) {

1019 continue;	1019 continue;

1020 }	1020 }

1021	1021

1022 /* If it's on a different domain, skip it. */	1022 /* If it's on a different domain, skip it. */

1023 if(window.location.host !== linkHref.split(/\/+/g)[1]) {	1023 if(window.location.host !== linkHref.split(/\/+/g)[1]) {

1024 continue;	1024 continue;

1025 }	1025 }

1026	1026

1027 var linkText = readability.getInnerText(link);	1027 var linkText = readability.getInnerText(link);

1028	1028

1029 /* If the linkText looks like it's not the next page, skip it. */	1029 /* If the linkText looks like it's not the next page, skip it. */

1030 if(linkText.match(readability.regexps.extraneous) \|\| linkText.length > 25) {	1030 if(linkText.match(readability.regexps.extraneous) \|\| linkText.length > 25) {

1031 continue;	1031 continue;

1032 }	1032 }

1033	1033

1034 /* If the leftovers of the URL after removing the base URL don't con tain any digits, it's certainly not a next page link. */	1034 /* If the leftovers of the URL after removing the base URL don't con tain any digits, it's certainly not a next page link. */

1035 var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');	1035 var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');

1036 if(!linkHrefLeftover.match(/\d/)) {	1036 if(!linkHrefLeftover.match(/\d/)) {

1037 continue;	1037 continue;

1038 }	1038 }

1039	1039

1040 if(!(linkHref in possiblePages)) {	1040 if(!(linkHref in possiblePages)) {

1041 possiblePages[linkHref] = {"score": 0, "linkText": linkText, "hr ef": linkHref};	1041 possiblePages[linkHref] = {"score": 0, "linkText": linkText, "hr ef": linkHref};

1042 } else {	1042 } else {

1043 possiblePages[linkHref].linkText += ' \| ' + linkText;	1043 possiblePages[linkHref].linkText += ' \| ' + linkText;

1044 }	1044 }

1045	1045

1046 var linkObj = possiblePages[linkHref];	1046 var linkObj = possiblePages[linkHref];

1047	1047

1048 /**	1048 /**

1049 * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower.	1049 * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower.

1050 * Example: http://www.actionscript.org/resources/articles/745/1/Jav aScript-and-VBScript-Injection-in-ActionScript-3/Page1.html	1050 * Example: http://www.actionscript.org/resources/articles/745/1/Jav aScript-and-VBScript-Injection-in-ActionScript-3/Page1.html

1051 **/	1051 **/

1052 if(linkHref.indexOf(articleBaseUrl) !== 0) {	1052 if(linkHref.indexOf(articleBaseUrl) !== 0) {

1053 linkObj.score -= 25;	1053 linkObj.score -= 25;

1054 }	1054 }

1055	1055

1056 var linkData = linkText + ' ' + link.className + ' ' + link.id;	1056 var linkData = linkText + ' ' + link.className + ' ' + link.id;

1057 if(linkData.match(readability.regexps.nextLink)) {	1057 if(linkData.match(readability.regexps.nextLink)) {

1058 linkObj.score += 50;	1058 linkObj.score += 50;

1059 }	1059 }

1060 if(linkData.match(/pag(e\|ing\|inat)/i)) {	1060 if(linkData.match(/pag(e\|ing\|inat)/i)) {

1061 linkObj.score += 25;	1061 linkObj.score += 25;

1062 }	1062 }

1063 if(linkData.match(/(first\|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text,	1063 if(linkData.match(/(first\|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text,

1064 /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */	1064 /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */

1065 if(!linkObj.linkText.match(readability.regexps.nextLink)) {	1065 if(!linkObj.linkText.match(readability.regexps.nextLink)) {

1066 linkObj.score -= 65;	1066 linkObj.score -= 65;

1067 }	1067 }

1068 }	1068 }

1069 if(linkData.match(readability.regexps.negative) \|\| linkData.match(re adability.regexps.extraneous)) {	1069 if(linkData.match(readability.regexps.negative) \|\| linkData.match(re adability.regexps.extraneous)) {

1070 linkObj.score -= 50;	1070 linkObj.score -= 50;

1071 }	1071 }

1072 if(linkData.match(readability.regexps.prevLink)) {	1072 if(linkData.match(readability.regexps.prevLink)) {

1073 linkObj.score -= 200;	1073 linkObj.score -= 200;

1074 }	1074 }

1075	1075

1076 /* If a parentNode contains page or paging or paginat */	1076 /* If a parentNode contains page or paging or paginat */

1077 var parentNode = link.parentNode,	1077 var parentNode = link.parentNode,

1078 positiveNodeMatch = false,	1078 positiveNodeMatch = false,

1079 negativeNodeMatch = false;	1079 negativeNodeMatch = false;

1080 while(parentNode) {	1080 while(parentNode) {

1081 var parentNodeClassAndId = parentNode.className + ' ' + parentNo de.id;	1081 var parentNodeClassAndId = parentNode.className + ' ' + parentNo de.id;

1082 if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(/pag(e\|ing\|inat)/i)) {	1082 if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(/pag(e\|ing\|inat)/i)) {

1083 positiveNodeMatch = true;	1083 positiveNodeMatch = true;

1084 linkObj.score += 25;	1084 linkObj.score += 25;

1085 }	1085 }

1086 if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(readability.regexps.negative)) {	1086 if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(readability.regexps.negative)) {

1087 /* If this is just something like "footer", give it a negati ve. If it's something like "body-and-footer", leave it be. */	1087 /* If this is just something like "footer", give it a negati ve. If it's something like "body-and-footer", leave it be. */

1088 if(!parentNodeClassAndId.match(readability.regexps.positive) ) {	1088 if(!parentNodeClassAndId.match(readability.regexps.positive) ) {

1089 linkObj.score -= 25;	1089 linkObj.score -= 25;

1090 negativeNodeMatch = true;	1090 negativeNodeMatch = true;

1091 }	1091 }

1092 }	1092 }

1093	1093

1094 parentNode = parentNode.parentNode;	1094 parentNode = parentNode.parentNode;

1095 }	1095 }

1096	1096

1097 /**	1097 /**

1098 * If the URL looks like it has paging in it, add to the score.	1098 * If the URL looks like it has paging in it, add to the score.

1099 * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34	1099 * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34

1100 **/	1100 **/

1101 if (linkHref.match(/p(a\|g\|ag)?(e\|ing\|ination)?(=\|\/)[0-9]{1,2}/i) \|\| linkHref.match(/(page\|paging)/i)) {	1101 if (linkHref.match(/p(a\|g\|ag)?(e\|ing\|ination)?(=\|\/)[0-9]{1,2}/i) \|\| linkHref.match(/(page\|paging)/i)) {

1102 linkObj.score += 25;	1102 linkObj.score += 25;

1103 }	1103 }

(...skipping 41 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1145 topPage = possiblePages[page];	1145 topPage = possiblePages[page];

1146 }	1146 }

1147 }	1147 }

1148 }	1148 }

1149	1149

1150 if(topPage) {	1150 if(topPage) {

1151 var nextHref = topPage.href.replace(/\/$/,'');	1151 var nextHref = topPage.href.replace(/\/$/,'');

1152	1152

1153 dbg('NEXT PAGE IS ' + nextHref);	1153 dbg('NEXT PAGE IS ' + nextHref);

1154 readability.parsedPages[nextHref] = true;	1154 readability.parsedPages[nextHref] = true;

1155 return nextHref;	1155 return nextHref;

1156 }	1156 }

1157 else {	1157 else {

1158 return null;	1158 return null;

1159 }	1159 }

1160 },	1160 },

1161	1161

1162 createLinkDiv: function(link) {	1162 createLinkDiv: function(link) {

1163 var divNode = document.createElement('div');	1163 var divNode = document.createElement('div');

1164 var aNode = document.createElement('a');	1164 var aNode = document.createElement('a');

1165 var tNode = document.createTextNode('View Next Page');	1165 var tNode = document.createTextNode('View Next Page');

(...skipping 31 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1197 }	1197 }

1198 else {	1198 else {

1199 if (options.error) { options.error(request); }	1199 if (options.error) { options.error(request); }

1200 }	1200 }

1201 }	1201 }

1202 }	1202 }

1203	1203

1204 if (typeof options === 'undefined') { options = {}; }	1204 if (typeof options === 'undefined') { options = {}; }

1205	1205

1206 request.onreadystatechange = respondToReadyState;	1206 request.onreadystatechange = respondToReadyState;

1207	1207

1208 request.open('get', url, true);	1208 request.open('get', url, true);

1209 request.setRequestHeader('Accept', 'text/html');	1209 request.setRequestHeader('Accept', 'text/html');

1210	1210

1211 try {	1211 try {

1212 request.send(options.postBody);	1212 request.send(options.postBody);

1213 }	1213 }

1214 catch (e) {	1214 catch (e) {

1215 if (options.error) { options.error(); }	1215 if (options.error) { options.error(); }

1216 }	1216 }

1217	1217

(...skipping 14 matching lines...) Expand all Loading...
1232 articlePage.innerHTML = '<p class="page-separator" title="Page ' + reada bility.curPageNum + '">§</p>';	1232 articlePage.innerHTML = '<p class="page-separator" title="Page ' + reada bility.curPageNum + '">§</p>';

1233	1233

1234 document.getElementById("readability-content").appendChild(articlePage);	1234 document.getElementById("readability-content").appendChild(articlePage);

1235	1235

1236 if(readability.curPageNum > readability.maxPages) {	1236 if(readability.curPageNum > readability.maxPages) {

1237 var linkDiv = readability.createLinkDiv(nextPageLink);	1237 var linkDiv = readability.createLinkDiv(nextPageLink);

1238	1238

1239 articlePage.appendChild(linkDiv);	1239 articlePage.appendChild(linkDiv);

1240 return;	1240 return;

1241 }	1241 }

1242	1242

1243 /**	1243 /**

1244 * Now that we've built the article page DOM element, get the page content	1244 * Now that we've built the article page DOM element, get the page content

1245 * asynchronously and load the cleaned content into the div we created for it.	1245 * asynchronously and load the cleaned content into the div we created for it.

1246 **/	1246 **/

1247 (function(pageUrl, thisPage) {	1247 (function(pageUrl, thisPage) {

1248 readability.ajax(pageUrl, {	1248 readability.ajax(pageUrl, {

1249 success: function(r) {	1249 success: function(r) {

1250	1250

1251 /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */	1251 /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */

1252 var eTag = r.getResponseHeader('ETag');	1252 var eTag = r.getResponseHeader('ETag');

1253 if(eTag) {	1253 if(eTag) {

1254 if(eTag in readability.pageETags) {	1254 if(eTag in readability.pageETags) {

1255 dbg("Exact duplicate page found via ETag. Aborting." );	1255 dbg("Exact duplicate page found via ETag. Aborting." );

1256 articlePage.style.display = 'none';	1256 articlePage.style.display = 'none';

1257 return;	1257 return;

1258 } else {	1258 } else {

1259 readability.pageETags[eTag] = 1;	1259 readability.pageETags[eTag] = 1;

1260 }	1260 }

1261 }	1261 }

1262	1262

1263 // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away.	1263 // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away.

1264 var page = document.createElement("DIV");	1264 var page = document.createElement("DIV");

1265	1265

1266 /**	1266 /**

1267 * Do some preprocessing to our HTML to make it ready for appending.	1267 * Do some preprocessing to our HTML to make it ready for appending.

1268 * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript.	1268 * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript.

1269 * • Turn any noscript tags into divs so that we can parse them. This allows us to find any next page links hidden via javascript.	1269 * • Turn any noscript tags into divs so that we can parse them. This allows us to find any next page links hidden via javascript.

1270 * • Turn all double br's into p's - was handled by prepDocument in the original view.	1270 * • Turn all double br's into p's - was handled by prepDocument in the original view.

(...skipping 30 matching lines...) Expand all Loading...
1301 for(var i=1; i <= readability.curPageNum; i+=1) {	1301 for(var i=1; i <= readability.curPageNum; i+=1) {

1302 var rPage = document.getElementById('readability-page-' + i);	1302 var rPage = document.getElementById('readability-page-' + i);

1303 if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) {	1303 if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) {

1304 dbg('Duplicate of page ' + i + ' - skipping.');	1304 dbg('Duplicate of page ' + i + ' - skipping.');

1305 articlePage.style.display = 'none';	1305 articlePage.style.display = 'none';

1306 readability.parsedPages[pageUrl] = true;	1306 readability.parsedPages[pageUrl] = true;

1307 return;	1307 return;

1308 }	1308 }

1309 }	1309 }

1310 }	1310 }

1311	1311

1312 readability.removeScripts(content);	1312 readability.removeScripts(content);

1313	1313

1314 readability.moveNodeInnards(content, thisPage);	1314 readability.moveNodeInnards(content, thisPage);

1315	1315

1316 /**	1316 /**

1317 * After the page has rendered, post process the content. This delay is necessary because,	1317 * After the page has rendered, post process the content. This delay is necessary because,

1318 * in webkit at least, offsetWidth is not set in time to determine image width. We have to	1318 * in webkit at least, offsetWidth is not set in time to determine image width. We have to

1319 * wait a little bit for reflow to finish before we can fix floating images.	1319 * wait a little bit for reflow to finish before we can fix floating images.

1320 **/	1320 **/

1321 window.setTimeout(	1321 window.setTimeout(

1322 function() { readability.postProcessContent(thisPage); } ,	1322 function() { readability.postProcessContent(thisPage); } ,

1323 500	1323 500

1324 );	1324 );

1325	1325

1326 if(nextPageLink) {	1326 if(nextPageLink) {

1327 readability.appendNextPage(nextPageLink);	1327 readability.appendNextPage(nextPageLink);

1328 }	1328 }

1329 }	1329 }

1330 });	1330 });

1331 }(nextPageLink, articlePage));	1331 }(nextPageLink, articlePage));

1332 },	1332 },

1333	1333

1334 /**	1334 /**

1335 * Get an elements class/id weight. Uses regular expressions to tell if this	1335 * Get an elements class/id weight. Uses regular expressions to tell if this

1336 * element looks good or bad.	1336 * element looks good or bad.

1337 *	1337 *

1338 * @param Element	1338 * @param Element

1339 * @return number (Integer)	1339 * @return number (Integer)

1340 **/	1340 **/

1341 getClassWeight: function (e) {	1341 getClassWeight: function (e) {

1342 if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {	1342 if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {

1343 return 0;	1343 return 0;

1344 }	1344 }

1345	1345

(...skipping 29 matching lines...) Expand all Loading...
1375 /**	1375 /**

1376 * Remove extraneous break tags from a node.	1376 * Remove extraneous break tags from a node.

1377 *	1377 *

1378 * @param Element	1378 * @param Element

1379 * @return void	1379 * @return void

1380 **/	1380 **/

1381 killBreaks: function (e) {	1381 killBreaks: function (e) {

1382 var allElements = e.getElementsByTagName('*');	1382 var allElements = e.getElementsByTagName('*');

1383 while (i < allElements.length) {	1383 while (i < allElements.length) {

1384 readability.deleteExtraBreaks(allElements[i]);	1384 readability.deleteExtraBreaks(allElements[i]);

1385 i++;	1385 i++;

1386 }	1386 }

1387 },	1387 },

1388	1388

1389 /**	1389 /**

1390 * Clean a node of all elements of type "tag".	1390 * Clean a node of all elements of type "tag".

1391 * (Unless it's a youtube/vimeo video. People love movies.)	1391 * (Unless it's a youtube/vimeo video. People love movies.)

1392 *	1392 *

1393 * @param Element	1393 * @param Element

1394 * @param string tag to clean	1394 * @param string tag to clean

1395 * @return void	1395 * @return void

1396 **/	1396 **/

1397 clean: function (e, tag) {	1397 clean: function (e, tag) {

1398 var targetList = e.getElementsByTagName( tag );	1398 var targetList = e.getElementsByTagName( tag );

1399 var isEmbed = (tag === 'object' \|\| tag === 'embed');	1399 var isEmbed = (tag === 'object' \|\| tag === 'embed');

1400	1400

1401 for (var y=targetList.length-1; y >= 0; y-=1) {	1401 for (var y=targetList.length-1; y >= 0; y-=1) {

1402 /* Allow youtube and vimeo videos through as people usually want to see those. */	1402 /* Allow youtube and vimeo videos through as people usually want to see those. */

1403 if(isEmbed) {	1403 if(isEmbed) {

1404 var attributeValues = "";	1404 var attributeValues = "";

1405 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) {	1405 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) {

1406 attributeValues += targetList[y].attributes[i].value + '\|';	1406 attributeValues += targetList[y].attributes[i].value + '\|';

1407 }	1407 }

1408	1408

1409 /* First, check the elements attributes to see if any of them contain youtube or vimeo */	1409 /* First, check the elements attributes to see if any of them contain youtube or vimeo */

1410 if (attributeValues.search(readability.regexps.videos) !== -1) {	1410 if (attributeValues.search(readability.regexps.videos) !== -1) {

1411 continue;	1411 continue;

1412 }	1412 }

1413	1413

1414 /* Then check the elements inside this element for the same. */	1414 /* Then check the elements inside this element for the same. */

1415 if (targetList[y].innerHTML.search(readability.regexps.videos) !== -1) {	1415 if (targetList[y].innerHTML.search(readability.regexps.videos) !== -1) {

1416 continue;	1416 continue;

1417 }	1417 }

1418	1418

1419 }	1419 }

1420	1420

1421 targetList[y].parentNode.removeChild(targetList[y]);	1421 targetList[y].parentNode.removeChild(targetList[y]);

1422 }	1422 }

1423 },	1423 },

1424	1424

1425 /**	1425 /**

1426 * Clean an element of all tags of type "tag" if they look fishy.	1426 * Clean an element of all tags of type "tag" if they look fishy.

1427 * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.	1427 * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.

1428 *	1428 *

1429 * @return void	1429 * @return void

1430 **/	1430 **/

1431 cleanConditionally: function (e, tag) {	1431 cleanConditionally: function (e, tag) {

1432	1432

1433 if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {	1433 if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {

1434 return;	1434 return;

1435 }	1435 }

1436	1436

1437 var tagsList = e.getElementsByTagName(tag);	1437 var tagsList = e.getElementsByTagName(tag);

1438 var curTagsLength = tagsList.length;	1438 var curTagsLength = tagsList.length;

1439	1439

1440 /**	1440 /**

1441 * Gather counts for other typical elements embedded within.	1441 * Gather counts for other typical elements embedded within.

1442 * Traverse backwards so we can remove nodes at the same time without effecting the traversal.	1442 * Traverse backwards so we can remove nodes at the same time without effecting the traversal.

1443 *	1443 *

1444 * TODO: Consider taking into account original contentScore here.	1444 * TODO: Consider taking into account original contentScore here.

1445 **/	1445 **/

1446 for (var i=curTagsLength-1; i >= 0; i-=1) {	1446 for (var i=curTagsLength-1; i >= 0; i-=1) {

1447 var weight = readability.getClassWeight(tagsList[i]);	1447 var weight = readability.getClassWeight(tagsList[i]);

1448 var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0;	1448 var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0;

1449	1449

1450 dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'undefined') ? (" with score " + tagsList[i].readability.contentScore) : ''));	1450 dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'undefined') ? (" with score " + tagsList[i].readability.contentScore) : ''));

1451	1451

1452 if(weight+contentScore < 0)	1452 if(weight+contentScore < 0)

1453 {	1453 {

1454 tagsList[i].parentNode.removeChild(tagsList[i]);	1454 tagsList[i].parentNode.removeChild(tagsList[i]);

1455 }	1455 }

1456 else if ( readability.getCharCount(tagsList[i],',') < 10) {	1456 else if ( readability.getCharCount(tagsList[i],',') < 10) {

1457 /**	1457 /**

1458 * If there are not very many commas, and the number of	1458 * If there are not very many commas, and the number of

1459 * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.	1459 * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.

1460 **/	1460 **/

1461 var p = tagsList[i].getElementsByTagName("p").length;	1461 var p = tagsList[i].getElementsByTagName("p").length;

1462 var img = tagsList[i].getElementsByTagName("img").length;	1462 var img = tagsList[i].getElementsByTagName("img").length;

1463 var li = tagsList[i].getElementsByTagName("li").length-100;	1463 var li = tagsList[i].getElementsByTagName("li").length-100;

1464 var input = tagsList[i].getElementsByTagName("input").length;	1464 var input = tagsList[i].getElementsByTagName("input").length;

1465	1465

1466 var embedCount = 0;	1466 var embedCount = 0;

1467 var embeds = tagsList[i].getElementsByTagName("embed");	1467 var embeds = tagsList[i].getElementsByTagName("embed");

1468 for(var ei=0,il=embeds.length; ei < il; ei+=1) {	1468 for(var ei=0,il=embeds.length; ei < il; ei+=1) {

1469 if (embeds[ei].src.search(readability.regexps.videos) === -1 ) {	1469 if (embeds[ei].src.search(readability.regexps.videos) === -1 ) {

1470 embedCount+=1;	1470 embedCount+=1;

1471 }	1471 }

1472 }	1472 }

1473	1473

1474 var linkDensity = readability.getLinkDensity(tagsList[i]);	1474 var linkDensity = readability.getLinkDensity(tagsList[i]);

1475 var contentLength = readability.getInnerText(tagsList[i]).length;	1475 var contentLength = readability.getInnerText(tagsList[i]).length;

1476 var toRemove = false;	1476 var toRemove = false;

1477	1477

1478 if ( img > p ) {	1478 if ( img > p ) {

1479 toRemove = true;	1479 toRemove = true;

1480 } else if(li > p && tag !== "ul" && tag !== "ol") {	1480 } else if(li > p && tag !== "ul" && tag !== "ol") {

1481 toRemove = true;	1481 toRemove = true;

1482 } else if( input > Math.floor(p/3) ) {	1482 } else if( input > Math.floor(p/3) ) {

1483 toRemove = true;	1483 toRemove = true;

1484 } else if(contentLength < 25 && (img === 0 \|\| img > 2) ) {	1484 } else if(contentLength < 25 && (img === 0 \|\| img > 2) ) {

1485 toRemove = true;	1485 toRemove = true;

1486 } else if(weight < 25 && linkDensity > 0.2) {	1486 } else if(weight < 25 && linkDensity > 0.2) {

1487 toRemove = true;	1487 toRemove = true;

1488 } else if(weight >= 25 && linkDensity > 0.5) {	1488 } else if(weight >= 25 && linkDensity > 0.5) {

1489 toRemove = true;	1489 toRemove = true;

1490 } else if((embedCount === 1 && contentLength < 75) \|\| embedCount > 1) {	1490 } else if((embedCount === 1 && contentLength < 75) \|\| embedCount > 1) {

1491 toRemove = true;	1491 toRemove = true;

1492 }	1492 }

1493	1493

(...skipping 21 matching lines...) Expand all Loading...
1515 }	1515 }

1516 },	1516 },

1517	1517

1518 flagIsActive: function(flag) {	1518 flagIsActive: function(flag) {

1519 return (readability.flags & flag) > 0;	1519 return (readability.flags & flag) > 0;

1520 },	1520 },

1521	1521

1522 addFlag: function(flag) {	1522 addFlag: function(flag) {

1523 readability.flags = readability.flags \| flag;	1523 readability.flags = readability.flags \| flag;

1524 },	1524 },

1525	1525

1526 removeFlag: function(flag) {	1526 removeFlag: function(flag) {

1527 readability.flags = readability.flags & ~flag;	1527 readability.flags = readability.flags & ~flag;

1528 },	1528 },

1529	1529

1530 // Removes the children of \|src\| and appends them to \|dest\|.	1530 // Removes the children of \|src\| and appends them to \|dest\|.

1531 moveNodeInnards: function(src, dest) {	1531 moveNodeInnards: function(src, dest) {

1532 try {	1532 try {

1533 while (src.firstChild) {	1533 while (src.firstChild) {

1534 dest.appendChild(src.removeChild(src.firstChild));	1534 dest.appendChild(src.removeChild(src.firstChild));

1535 }	1535 }

(...skipping 48 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1584 var lastBr = readability.isMultipleBr(node, false);	1584 var lastBr = readability.isMultipleBr(node, false);

1585 var ret = false;	1585 var ret = false;

1586 while (lastBr && lastBr != node) {	1586 while (lastBr && lastBr != node) {

1587 var toRemove = lastBr;	1587 var toRemove = lastBr;

1588 lastBr = lastBr.previousSibling;	1588 lastBr = lastBr.previousSibling;

1589 toRemove.parentNode.removeChild(toRemove);	1589 toRemove.parentNode.removeChild(toRemove);

1590 ret = true;	1590 ret = true;

1591 }	1591 }

1592 return ret;	1592 return ret;

1593 },	1593 },

1594	1594

1595 // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a	1595 // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a

1596 // <P> node, and makes all next siblings of that pair children of <P>, up	1596 // <P> node, and makes all next siblings of that pair children of <P>, up

1597 // until the next pair of <BR> nodes is reached.	1597 // until the next pair of <BR> nodes is reached.

1598 replaceDoubleBrWithP: function(node) {	1598 replaceDoubleBrWithP: function(node) {

1599 // Check that we are starting with a BR.	1599 // Check that we are starting with a BR.

1600 var second = readability.isMultipleBr(node, true);	1600 var second = readability.isMultipleBr(node, true);

1601 if (!second) {	1601 if (!second) {

1602 return;	1602 return;

1603 }	1603 }

1604 // Make all next siblings of the second BR into children of a P.	1604 // Make all next siblings of the second BR into children of a P.

1605 var p = document.createElement('p');	1605 var p = document.createElement('p');

1606 var curr = second.nextSibling;	1606 var curr = second.nextSibling;

1607 while (curr) {	1607 while (curr) {

1608 if (readability.isMultipleBr(curr, true)) {	1608 if (readability.isMultipleBr(curr, true)) {

1609 break;	1609 break;

1610 }	1610 }

1611 var next = curr.nextSibling;	1611 var next = curr.nextSibling;

1612 p.appendChild(curr.parentNode.removeChild(curr));	1612 p.appendChild(curr.parentNode.removeChild(curr));

1613 curr = next;	1613 curr = next;

1614 }	1614 }

1615 var ret = curr;	1615 var ret = curr;

1616	1616

1617 // Remove all nodes between the first and second BR.	1617 // Remove all nodes between the first and second BR.

1618 curr = node.nextSibling;	1618 curr = node.nextSibling;

1619 while (curr && curr != second) {	1619 while (curr && curr != second) {

1620 var next = curr.nextSibling;	1620 var next = curr.nextSibling;

1621 curr.parentNode.removeChild(curr);	1621 curr.parentNode.removeChild(curr);

1622 curr = next;	1622 curr = next;

1623 }	1623 }

1624 // Remove the second BR.	1624 // Remove the second BR.

1625 second.parentNode.removeChild(second);	1625 second.parentNode.removeChild(second);

1626 // Replace the first BR with the P.	1626 // Replace the first BR with the P.

1627 node.parentNode.replaceChild(p, node);	1627 node.parentNode.replaceChild(p, node);

1628	1628

1629 return ret;	1629 return ret;

1630 },	1630 },

1631	1631

1632 // Returns true if the NodeList contains a double <BR>.	1632 // Returns true if the NodeList contains a double <BR>.

1633 hasDoubleBr: function(nodeList) {	1633 hasDoubleBr: function(nodeList) {

1634 for (var i = 0; i < nodeList.length; nodeList++) {	1634 for (var i = 0; i < nodeList.length; nodeList++) {

1635 if (readability.isMultipleBr(nodeList[i], true)) {	1635 if (readability.isMultipleBr(nodeList[i], true)) {

1636 return true;	1636 return true;

1637 }	1637 }

1638 }	1638 }

1639 return false;	1639 return false;

1640 },	1640 },

1641	1641

1642 // Replaces double <BR> tags with <P> tags.	1642 // Replaces double <BR> tags with <P> tags.

1643 replaceDoubleBrsWithPs: function(node) {	1643 replaceDoubleBrsWithPs: function(node) {

1644 var allElements = node.getElementsByTagName('BR');	1644 var allElements = node.getElementsByTagName('BR');

1645 var node = null;	1645 var node = null;

1646 while (allElements && allElements.length > 0 &&	1646 while (allElements && allElements.length > 0 &&

1647 readability.hasDoubleBr(allElements)) {	1647 readability.hasDoubleBr(allElements)) {

1648 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex += 1) {	1648 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex += 1) {

1649 var next = node;	1649 var next = node;

1650 while (next = readability.replaceDoubleBrWithP(next));	1650 while (next = readability.replaceDoubleBrWithP(next));

1651 }	1651 }

1652 allElements = document.body.getElementsByTagName('BR');	1652 allElements = document.body.getElementsByTagName('BR');

1653 }	1653 }

1654 },	1654 },

1655	1655

1656	1656

1657 // Replaces a BR and the whitespace that follows it with a P.	1657 // Replaces a BR and the whitespace that follows it with a P.

1658 replaceBrWithP: function(node) {	1658 replaceBrWithP: function(node) {

1659 if (!readability.isBrNode(node)) {	1659 if (!readability.isBrNode(node)) {

1660 return;	1660 return;

1661 }	1661 }

1662 var p = document.createElement('p');	1662 var p = document.createElement('p');

1663 var curr = node.nextSibling;	1663 var curr = node.nextSibling;

1664 while (curr && !isBrNode(curr)) {	1664 while (curr && !isBrNode(curr)) {

1665 var next = curr.nextSibling;	1665 var next = curr.nextSibling;

1666 if (readability.isWhitespaceNode(curr)) {	1666 if (readability.isWhitespaceNode(curr)) {

1667 curr.parentNode.removeChild(curr);	1667 curr.parentNode.removeChild(curr);

1668 } else {	1668 } else {

1669 p.appendChild(curr.parentNode.removeChild(curr));	1669 p.appendChild(curr.parentNode.removeChild(curr));

1670 }	1670 }

1671 curr = next;	1671 curr = next;

1672 }	1672 }

1673 node.parentNode.replaceChild(p, node);	1673 node.parentNode.replaceChild(p, node);

1674 return curr;	1674 return curr;

1675 },	1675 },

1676	1676

1677 // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> tag	1677 // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> tag

1678 // children of the <P>.	1678 // children of the <P>.

1679 replaceBrsWithPs: function(node) {	1679 replaceBrsWithPs: function(node) {

1680 var allElements = node.getElementsByTagName('BR');	1680 var allElements = node.getElementsByTagName('BR');

1681 var node = null;	1681 var node = null;

1682 while (allElements && allElements.length > 0) {	1682 while (allElements && allElements.length > 0) {

1683 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex += 1) {	1683 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex += 1) {

1684 var next = node;	1684 var next = node;

1685 while (next = readability.replaceBrWithP(next));	1685 while (next = readability.replaceBrWithP(next));

1686 }	1686 }

1687 allElements = document.body.getElementsByTagName('BR');	1687 allElements = document.body.getElementsByTagName('BR');

1688 }	1688 }

1689 },	1689 },

1690	1690

1691 // Replaces any tag with any other tag.	1691 // Replaces any tag with any other tag.

1692 replaceTagsWithTags: function(node, srcTag, destTag) {	1692 replaceTagsWithTags: function(node, srcTag, destTag) {

1693 var allElements = node.getElementsByTagName(srcTag);	1693 var allElements = node.getElementsByTagName(srcTag);

1694 for (var i = 0; i < allElements.length; i++) {	1694 for (var i = 0; i < allElements.length; i++) {

1695 var dest = document.createElement(destTag);	1695 var dest = document.createElement(destTag);

1696 readability.moveNodeInnards(allElements[i], dest);	1696 readability.moveNodeInnards(allElements[i], dest);

1697 node.replaceNode(dest, allElements[i]);	1697 allElements[i].parentNode.replaceChild(dest, allElements[i]);

1698 }	1698 }

1699 },	1699 },

1700	1700

1701 // Replaces all <noscript> tags with <p> tags.	1701 // Replaces all <noscript> tags with <p> tags.

1702 replaceNoscriptsWithPs: function(node) {	1702 replaceNoscriptsWithPs: function(node) {

1703 readability.replaceTagsWithTags(node, 'noscript', 'p');	1703 readability.replaceTagsWithTags(node, 'noscript', 'p');

1704 },	1704 },

1705	1705

1706 // Replaces all <font> tags with <span> tags.	1706 // Replaces all <font> tags with <span> tags.

1707 replaceFontsWithSpans: function(node) {	1707 replaceFontsWithSpans: function(node) {

1708 readability.replaceTagsWithTags(node, 'font', 'span');	1708 readability.replaceTagsWithTags(node, 'font', 'span');

1709 },	1709 },

1710	1710

1711 // Returns a list of image URLs in the distilled article.	1711 // Returns a list of image URLs in the distilled article.

1712 getImages : function() {	1712 getImages : function() {

1713 var images = document.getElementsByTagName('img');	1713 var images = document.getElementsByTagName('img');

1714 var result = new Array(images.length);	1714 var result = new Array(images.length);

1715 dbg("Number of images: " + images.length);	1715 dbg("Number of images: " + images.length);

1716 for(i = 0; i < images.length; i++) {	1716 for(i = 0; i < images.length; i++) {

1717 result[i] = images[i].src;	1717 result[i] = images[i].src;

1718 dbg("Image: " + result[i]);	1718 dbg("Image: " + result[i]);

1719 }	1719 }

1720 return result;	1720 return result;

1721 },	1721 },

1722	1722

1723 // Returns the distilled article HTML from the page(s).	1723 // Returns the distilled article HTML from the page(s).

1724 getDistilledArticleHTML : function() {	1724 getDistilledArticleHTML : function() {

1725 return readability.distilledHTML;	1725 return readability.distilledHTML;

	1726 },

	1727

	1728 // Returns the next page of this article.

	1729 getNextPageLink : function() {

	1730 return readability.nextPageLink;

1726 }	1731 }

1727 };	1732 };

1728	1733

1729 // Extracts long-form content from a page and returns and array where the first	1734 // Extracts long-form content from a page and returns and array where the first

1730 // element is the article title, the second element is HTML containing the	1735 // element is the article title, the second element is HTML containing the

1731 // long-form content, and remaining elements are URLs for images referenced by	1736 // long-form content, and remaining elements are URLs for images referenced by

1732 // that HTML. Each <img> tag in the HTML has an id field set to k - 2, which	1737 // that HTML. Each <img> tag in the HTML has an id field set to k - 2, which

1733 // corresponds to a URL listed at index k in the array returned.	1738 // corresponds to a URL listed at index k in the array returned.

1734 (function () {	1739 (function () {

1735 readability.init();	1740 readability.init();

1736 var result = new Array(2);	1741 var result = new Array(3);

1737 result[0] = readability.getArticleTitle();	1742 result[0] = readability.getArticleTitle();

1738 result[1] = readability.getDistilledArticleHTML();	1743 result[1] = readability.getDistilledArticleHTML();

	1744 result[2] = readability.getNextPageLink();

1739 return result.concat(readability.getImages());	1745 return result.concat(readability.getImages());

1740 }())	1746 }())

1741	1747

OLD	NEW

« components/dom_distiller/core/distiller.h ('K') | « components/dom_distiller/core/task_tracker_unittest.cc ('k') | no next file » | no next file with comments »

Powered by Google App Engine

This is Rietveld 408576698