third_party/readability/js/readability.js - Issue 146843010: Add support for multipage distillation. - Code Review

Chromium Code Reviews

chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out

(632)

My Issues | Starred Open | Closed | All

Side by Side Diff: third_party/readability/js/readability.js

Issue 146843010: Add support for multipage distillation. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Created 6 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« no previous file with comments | « third_party/readability/README.chromium ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
	1 // Copyright 2014 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 // Local modifications to this file are described in the README.chromium

	6 // file.

1	7

2 var dbg = (typeof console !== 'undefined') ? function(s) {	8 var dbg = (typeof console !== 'undefined') ? function(s) {

3 console.log("Readability: " + s);	9 console.log("Readability: " + s);

4 } : function() {};	10 } : function() {};

5	11

6 /*	12 /*

7 * Readability. An Arc90 Lab Experiment.	13 * Readability. An Arc90 Lab Experiment.

8 * Website: http://lab.arc90.com/experiments/readability	14 * Website: http://lab.arc90.com/experiments/readability

9 * Source: http://code.google.com/p/arc90labs-readability	15 * Source: http://code.google.com/p/arc90labs-readability

10 *	16 *

11 * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission.	17 * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission.

12 *	18 *

13 * Copyright (c) 2010 Arc90 Inc	19 * Copyright (c) 2010 Arc90 Inc

14 * Readability is licensed under the Apache License, Version 2.0.	20 * Readability is licensed under the Apache License, Version 2.0.

15 **/	21 **/

16 var readability = {	22 var readability = {

17 readStyle: "style-newspaper",	23 readStyle: "style-newspaper",

18 readSize: "size-medium",	24 readSize: "size-medium",

19 readMargin: "margin-wide",	25 readMargin: "margin-wide",

20	26

21 distilledHTML: '',	27 distilledHTML: '',

22 distilledArticleContent: null,	28 distilledArticleContent: null,

	29 nextPageLink: '',

23	30

24 version: '1.7.1',	31 version: '1.7.1',

25 iframeLoads: 0,	32 iframeLoads: 0,

26 convertLinksToFootnotes: false,	33 convertLinksToFootnotes: false,

27 reversePageScroll: false, /* If they hold shift and hit space, scroll up */	34 reversePageScroll: false, /* If they hold shift and hit space, scroll up */

28 frameHack: false, /**	35 frameHack: false, /**

29 * The frame hack is to workaround a firefox bug where if you	36 * The frame hack is to workaround a firefox bug where if you

30 * pull content out of a frame and stick it into the parent element, the scrollbar won't appear.	37 * pull content out of a frame and stick it into the parent element, the scrollbar won't appear.

31 * So we fake a scrollbar in the wrapping div.	38 * So we fake a scrollbar in the wrapping div.

32 **/	39 **/

33 biggestFrame: false,	40 biggestFrame: false,

34 flags: 0x1 \| 0x2 \| 0x4, /* Start with all flags set. */	41 flags: 0x1 \| 0x2 \| 0x4, /* Start with all flags set. */

35	42

36 /* constants */	43 /* constants */

37 FLAG_STRIP_UNLIKELYS: 0x1,	44 FLAG_STRIP_UNLIKELYS: 0x1,

38 FLAG_WEIGHT_CLASSES: 0x2,	45 FLAG_WEIGHT_CLASSES: 0x2,

39 FLAG_CLEAN_CONDITIONALLY: 0x4,	46 FLAG_CLEAN_CONDITIONALLY: 0x4,

40	47

41 maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */	48 maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */

42 parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */	49 parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */

43 pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */	50 pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */

44	51

45 /**	52 /**

46 * All of the regular expressions in use within readability.	53 * All of the regular expressions in use within readability.

47 * Defined up here so we don't instantiate them repeatedly in loops.	54 * Defined up here so we don't instantiate them repeatedly in loops.

48 **/	55 **/

49 regexps: {	56 regexps: {

50 unlikelyCandidates: /combx\|comment\|community\|disqus\|extra\|foot\|header \|menu\|remark\|rss\|shoutbox\|sidebar\|sponsor\|ad-break\|agegate\|pagination\|pager\|popu p\|tweet\|twitter/i,	57 unlikelyCandidates: /combx\|comment\|community\|disqus\|extra\|foot\|header \|menu\|remark\|rss\|shoutbox\|sidebar\|sponsor\|ad-break\|agegate\|pagination\|pager\|popu p\|tweet\|twitter/i,

51 okMaybeItsACandidate: /and\|article\|body\|column\|main\|shadow/i,	58 okMaybeItsACandidate: /and\|article\|body\|column\|main\|shadow/i,

52 positive: /article\|body\|content\|entry\|hentry\|main\|page\|pagi nation\|post\|text\|blog\|story/i,	59 positive: /article\|body\|content\|entry\|hentry\|main\|page\|pagi nation\|post\|text\|blog\|story/i,

53 negative: /combx\|comment\|com-\|contact\|foot\|footer\|footnote\| masthead\|media\|meta\|outbrain\|promo\|related\|scroll\|shoutbox\|sidebar\|sponsor\|shopp ing\|tags\|tool\|widget/i,	60 negative: /combx\|comment\|com-\|contact\|foot\|footer\|footnote\| masthead\|media\|meta\|outbrain\|promo\|related\|scroll\|shoutbox\|sidebar\|sponsor\|shopp ing\|tags\|tool\|widget/i,

54 extraneous: /print\|archive\|comment\|discuss\|e[\-]?mail\|share\|r eply\|all\|login\|sign\|single/i,	61 extraneous: /print\|archive\|comment\|discuss\|e[\-]?mail\|share\|r eply\|all\|login\|sign\|single/i,

55 divToPElements: /<(a\|blockquote\|dl\|div\|img\|ol\|p\|pre\|table\|ul)/i,	62 divToPElements: /<(a\|blockquote\|dl\|div\|img\|ol\|p\|pre\|table\|ul)/i,

56 replaceBrs: /(<br[^>]>[ \n\r\t]){2,}/gi,	63 replaceBrs: /(<br[^>]>[ \n\r\t]){2,}/gi,

57 replaceFonts: /<(\/?)font[^>]*>/gi,	64 replaceFonts: /<(\/?)font[^>]*>/gi,

58 trim: /^\s+\|\s+$/g,	65 trim: /^\s+\|\s+$/g,

59 normalize: /\s{2,}/g,	66 normalize: /\s{2,}/g,

60 killBreaks: /(<br\s\/?>(\s\| ?)){1,}/g,	67 killBreaks: /(<br\s\/?>(\s\| ?)){1,}/g,

61 videos: /http:\/\/(www\.)?(youtube\|vimeo)\.com/i,	68 videos: /http:\/\/(www\.)?(youtube\|vimeo)\.com/i,

62 skipFootnoteLink: /^\s(\[?[a-z0-9]{1,2}\]?\|^\|edit\|citation needed) \s$/i,	69 skipFootnoteLink: /^\s(\[?[a-z0-9]{1,2}\]?\|^\|edit\|citation needed) \s$/i,

63 nextLink: /(next\|weiter\|continue\|>([^\\|]\|$)\|»([^\\|]\|$))/i, // Match: next, continue, >, >>, » but not >\|, »\| as those usually mean last.	70 nextLink: /(next\|weiter\|continue\|>([^\\|]\|$)\|»([^\\|]\|$))/i, // Match: next, continue, >, >>, » but not >\|, »\| as those usually mean last.

64 prevLink: /(prev\|earl\|old\|new\|<\|«)/i	71 prevLink: /(prev\|earl\|old\|new\|<\|«)/i

65 },	72 },

66	73

67 /**	74 /**

68 * Runs readability.	75 * Runs readability.

69 *	76 *

70 * Workflow:	77 * Workflow:

71 * 1. Prep the document by removing script tags, css, etc.	78 * 1. Prep the document by removing script tags, css, etc.

72 * 2. Build readability's DOM tree.	79 * 2. Build readability's DOM tree.

73 * 3. Grab the article content from the current dom tree.	80 * 3. Grab the article content from the current dom tree.

74 * 4. Replace the current DOM tree with the new one.	81 * 4. Replace the current DOM tree with the new one.

75 * 5. Read peacefully.	82 * 5. Read peacefully.

76 *	83 *

77 * @return void	84 * @return void

78 **/	85 **/

79 init: function() {	86 init: function() {

80 /* Before we do anything, remove all scripts that are not readability. */	87 /* Before we do anything, remove all scripts that are not readability. */

81 window.onload = window.onunload = function() {};	88 window.onload = window.onunload = function() {};

82	89

83 readability.removeScripts(document);	90 readability.removeScripts(document);

84	91

85 /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */	92 /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */

86 readability.parsedPages[window.location.href.replace(/\/$/, '')] = true;	93 readability.parsedPages[window.location.href.replace(/\/$/, '')] = true;

87	94

88 /* Pull out any possible next page link first */	95 /* Pull out any possible next page link first */

89 var nextPageLink = readability.findNextPageLink(document.body);	96 readability.nextPageLink = readability.findNextPageLink(document.body);

90	97

	98 /* We handle processing of nextPage from C++, so set nextPageLink to null. */

	99 var nextPageLink = null;

	100

91 readability.prepDocument();	101 readability.prepDocument();

92	102

93 /* Build readability's DOM tree */	103 /* Build readability's DOM tree */

94 var overlay = document.createElement("DIV");	104 var overlay = document.createElement("DIV");

95 var innerDiv = document.createElement("DIV");	105 var innerDiv = document.createElement("DIV");

96 var articleTools = readability.getArticleTools();	106 var articleTools = readability.getArticleTools();

97 var articleTitleText = readability.getArticleTitle();	107 var articleTitleText = readability.getArticleTitle();

98 var articleContent = readability.grabArticle();	108 var articleContent = readability.grabArticle();

99	109

100 if(!articleContent) {	110 if(!articleContent) {

(...skipping 44 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
145 rootWarning.innerHTML = "<em>Readability</em> was intended for u se on individual articles and not home pages. " +	155 rootWarning.innerHTML = "<em>Readability</em> was intended for u se on individual articles and not home pages. " +

146 "If you'd like to try rendering this page anyway, <a onClick='ja vascript:document.getElementById(\"readability-warning\").style.display=\"none\" ;document.getElementById(\"readability-content\").style.display=\"block\";'>clic k here</a> to continue.";	156 "If you'd like to try rendering this page anyway, <a onClick='ja vascript:document.getElementById(\"readability-warning\").style.display=\"none\" ;document.getElementById(\"readability-content\").style.display=\"block\";'>clic k here</a> to continue.";

147	157

148 innerDiv.insertBefore( rootWarning, articleContent );	158 innerDiv.insertBefore( rootWarning, articleContent );

149 }	159 }

150	160

151 readability.postProcessContent(articleContent);	161 readability.postProcessContent(articleContent);

152	162

153 window.scrollTo(0, 0);	163 window.scrollTo(0, 0);

154	164

155 // TODO(bengr): Remove this assignment of null to nextPageLink when

156 // the processing of the next page link is safe.

157 nextPageLink = null;

158

159 if (nextPageLink) {	165 if (nextPageLink) {

160 /**	166 /**

161 * Append any additional pages after a small timeout so that people	167 * Append any additional pages after a small timeout so that people

162 * can start reading without having to wait for this to finish processing.	168 * can start reading without having to wait for this to finish processing.

163 **/	169 **/

164 window.setTimeout(function() {	170 window.setTimeout(function() {

165 readability.appendNextPage(nextPageLink);	171 readability.appendNextPage(nextPageLink);

166 }, 500);	172 }, 500);

167 }	173 }

168	174

169 /** Smooth scrolling **/	175 /** Smooth scrolling **/

170 document.onkeydown = function(e) {	176 document.onkeydown = function(e) {

171 var code = (window.event) ? event.keyCode : e.keyCode;	177 var code = (window.event) ? event.keyCode : e.keyCode;

172 if (code === 16) {	178 if (code === 16) {

173 readability.reversePageScroll = true;	179 readability.reversePageScroll = true;

174 return;	180 return;

175 }	181 }

176	182

177 if (code === 32) {	183 if (code === 32) {

178 readability.curScrollStep = 0;	184 readability.curScrollStep = 0;

179 var windowHeight = window.innerHeight ? window.innerHeight : (do cument.documentElement.clientHeight ? document.documentElement.clientHeight : do cument.body.clientHeight);	185 var windowHeight = window.innerHeight ? window.innerHeight : (do cument.documentElement.clientHeight ? document.documentElement.clientHeight : do cument.body.clientHeight);

180	186

181 if(readability.reversePageScroll) {	187 if(readability.reversePageScroll) {

182 readability.scrollTo(readability.scrollTop(), readability.sc rollTop() - (windowHeight - 50), 20, 10);	188 readability.scrollTo(readability.scrollTop(), readability.sc rollTop() - (windowHeight - 50), 20, 10);

183 }	189 }

184 else {	190 else {

185 readability.scrollTo(readability.scrollTop(), readability.sc rollTop() + (windowHeight - 50), 20, 10);	191 readability.scrollTo(readability.scrollTop(), readability.sc rollTop() + (windowHeight - 50), 20, 10);

186 }	192 }

187	193

188 return false;	194 return false;

189 }	195 }

190 };	196 };

191	197

192 document.onkeyup = function(e) {	198 document.onkeyup = function(e) {

193 var code = (window.event) ? event.keyCode : e.keyCode;	199 var code = (window.event) ? event.keyCode : e.keyCode;

194 if (code === 16) {	200 if (code === 16) {

195 readability.reversePageScroll = false;	201 readability.reversePageScroll = false;

196 return;	202 return;

197 }	203 }

198 };	204 };

199 },	205 },

200	206

201 /**	207 /**

202 * Run any post-process modifications to article content as necessary.	208 * Run any post-process modifications to article content as necessary.

203 *	209 *

204 * @param Element	210 * @param Element

205 * @return void	211 * @return void

206 **/	212 **/

207 postProcessContent: function(articleContent) {	213 postProcessContent: function(articleContent) {

208 if(readability.convertLinksToFootnotes && !window.location.href.match(/w ikipedia\.org/g)) {	214 if(readability.convertLinksToFootnotes && !window.location.href.match(/w ikipedia\.org/g)) {

209 readability.addFootnotes(articleContent);	215 readability.addFootnotes(articleContent);

210 }	216 }

211	217

212 readability.fixImageFloats(articleContent);	218 readability.fixImageFloats(articleContent);

213 },	219 },

214	220

215 /**	221 /**

216 * Some content ends up looking ugly if the image is too large to be floated.	222 * Some content ends up looking ugly if the image is too large to be floated.

217 * If the image is wider than a threshold (currently 55%), no longer float it,	223 * If the image is wider than a threshold (currently 55%), no longer float it,

218 * center it instead.	224 * center it instead.

219 *	225 *

220 * @param Element	226 * @param Element

221 * @return void	227 * @return void

222 **/	228 **/

223 fixImageFloats: function (articleContent) {	229 fixImageFloats: function (articleContent) {

224 var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0. 55,	230 var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0. 55,

225 images = articleContent.getElementsByTagName('img');	231 images = articleContent.getElementsByTagName('img');

226	232

227 for(var i=0, il = images.length; i < il; i+=1) {	233 for(var i=0, il = images.length; i < il; i+=1) {

228 var image = images[i];	234 var image = images[i];

229	235

230 if(image.offsetWidth > imageWidthThreshold) {	236 if(image.offsetWidth > imageWidthThreshold) {

231 image.className += " blockImage";	237 image.className += " blockImage";

232 }	238 }

233 }	239 }

234 },	240 },

235	241

236 /**	242 /**

237 * Get the article tools Element that has buttons like reload, print.	243 * Get the article tools Element that has buttons like reload, print.

238 *	244 *

239 * @return void	245 * @return void

240 **/	246 **/

241 getArticleTools: function () {	247 getArticleTools: function () {

242 var articleTools = document.createElement("DIV");	248 var articleTools = document.createElement("DIV");

243	249

244 articleTools.id = "readTools";	250 articleTools.id = "readTools";

245 articleTools.innerHTML =	251 articleTools.innerHTML =

246 "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" +	252 "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" +

247 "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" +	253 "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" +

248 "<a href='#' onclick='readability.emailBox(); return false;' title=' Email page' id='email-page'>Email Page</a>";	254 "<a href='#' onclick='readability.emailBox(); return false;' title=' Email page' id='email-page'>Email Page</a>";

249	255

250 return articleTools;	256 return articleTools;

251 },	257 },

252	258

253 /**	259 /**

254 * returns the suggested direction of the string	260 * returns the suggested direction of the string

255 *	261 *

256 * @return "rtl" \|\| "ltr"	262 * @return "rtl" \|\| "ltr"

257 **/	263 **/

258 getSuggestedDirection: function(text) {	264 getSuggestedDirection: function(text) {

259 function sanitizeText() {	265 function sanitizeText() {

260 return text.replace(/@\w+/, "");	266 return text.replace(/@\w+/, "");

261 }	267 }

262	268

263 function countMatches(match) {	269 function countMatches(match) {

264 var matches = text.match(new RegExp(match, "g"));	270 var matches = text.match(new RegExp(match, "g"));

265 return matches !== null ? matches.length : 0;	271 return matches !== null ? matches.length : 0;

266 }	272 }

267	273

268 function isRTL() {	274 function isRTL() {

269 var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]");	275 var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]");

270 var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]");	276 var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]");

271	277

272 // if 20% of chars are Hebrew or Arabic then direction is rtl	278 // if 20% of chars are Hebrew or Arabic then direction is rtl

273 return (count_heb + count_arb) * 100 / text.length > 20;	279 return (count_heb + count_arb) * 100 / text.length > 20;

274 }	280 }

275	281

276 text = sanitizeText(text);	282 text = sanitizeText(text);

277 return isRTL() ? "rtl" : "ltr";	283 return isRTL() ? "rtl" : "ltr";

278 },	284 },

279	285

280 /**	286 /**

281 * Get the article title as an H1.	287 * Get the article title as an H1.

282 *	288 *

283 * @return void	289 * @return void

284 **/	290 **/

285 getArticleTitle: function () {	291 getArticleTitle: function () {

286 var curTitle = "",	292 var curTitle = "",

287 origTitle = "";	293 origTitle = "";

288	294

289 try {	295 try {

290 curTitle = origTitle = document.title;	296 curTitle = origTitle = document.title;

291 if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */	297 if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */

292 curTitle = origTitle = readability.getInnerText(document.getElem entsByTagName('title')[0]);	298 curTitle = origTitle = readability.getInnerText(document.getElem entsByTagName('title')[0]);

293 }	299 }

294 }	300 }

295 catch(e) {}	301 catch(e) {}

296	302

297 if(curTitle.match(/ [\\|\-] /))	303 if(curTitle.match(/ [\\|\-] /))

298 {	304 {

299 curTitle = origTitle.replace(/(.)[\\|\-] ./gi,'$1');	305 curTitle = origTitle.replace(/(.)[\\|\-] ./gi,'$1');

300	306

301 if(curTitle.split(' ').length < 3) {	307 if(curTitle.split(' ').length < 3) {

302 curTitle = origTitle.replace(/[^\\|\-][\\|\-](.)/gi,'$1');	308 curTitle = origTitle.replace(/[^\\|\-][\\|\-](.)/gi,'$1');

303 }	309 }

304 }	310 }

305 else if(curTitle.indexOf(': ') !== -1)	311 else if(curTitle.indexOf(': ') !== -1)

306 {	312 {

307 curTitle = origTitle.replace(/.:(.)/gi, '$1');	313 curTitle = origTitle.replace(/.:(.)/gi, '$1');

308	314

309 if(curTitle.split(' ').length < 3) {	315 if(curTitle.split(' ').length < 3) {

310 curTitle = origTitle.replace(/[^:][:](.)/gi,'$1');	316 curTitle = origTitle.replace(/[^:][:](.)/gi,'$1');

(...skipping 12 matching lines...) Expand all Loading...
323	329

324 if(curTitle.split(' ').length <= 4) {	330 if(curTitle.split(' ').length <= 4) {

325 curTitle = origTitle;	331 curTitle = origTitle;

326 }	332 }

327 return curTitle;	333 return curTitle;

328 },	334 },

329	335

330 /**	336 /**

331 * Prepare the HTML document for readability to scrape it.	337 * Prepare the HTML document for readability to scrape it.

332 * This includes things like stripping javascript, CSS, and handling terrible markup.	338 * This includes things like stripping javascript, CSS, and handling terrible markup.

333 *	339 *

334 * @return void	340 * @return void

335 **/	341 **/

336 prepDocument: function () {	342 prepDocument: function () {

337 /**	343 /**

338 * In some cases a body element can't be found (if the HTML is totally hosed for example)	344 * In some cases a body element can't be found (if the HTML is totally hosed for example)

339 * so we create a new body node and append it to the document.	345 * so we create a new body node and append it to the document.

340 */	346 */

341 if(document.body === null)	347 if(document.body === null)

342 {	348 {

343 var body = document.createElement("body");	349 var body = document.createElement("body");

344 try {	350 try {

345 document.body = body;	351 document.body = body;

346 }	352 }

347 catch(e) {	353 catch(e) {

348 document.documentElement.appendChild(body);	354 document.documentElement.appendChild(body);

349 dbg(e);	355 dbg(e);

350 }	356 }

351 }	357 }

352	358

353 document.body.id = "readabilityBody";	359 document.body.id = "readabilityBody";

354	360

355 var frames = document.getElementsByTagName('frame');	361 var frames = document.getElementsByTagName('frame');

(...skipping 11 matching lines...) Expand all Loading...
367 canAccessFrame = true;	373 canAccessFrame = true;

368 }	374 }

369 catch(eFrames) {	375 catch(eFrames) {

370 dbg(eFrames);	376 dbg(eFrames);

371 }	377 }

372	378

373 if(frameSize > biggestFrameSize) {	379 if(frameSize > biggestFrameSize) {

374 biggestFrameSize = frameSize;	380 biggestFrameSize = frameSize;

375 readability.biggestFrame = frames[frameIndex];	381 readability.biggestFrame = frames[frameIndex];

376 }	382 }

377	383

378 if(canAccessFrame && frameSize > bestFrameSize)	384 if(canAccessFrame && frameSize > bestFrameSize)

379 {	385 {

380 readability.frameHack = true;	386 readability.frameHack = true;

381	387

382 bestFrame = frames[frameIndex];	388 bestFrame = frames[frameIndex];

383 bestFrameSize = frameSize;	389 bestFrameSize = frameSize;

384 }	390 }

385 }	391 }

386	392

387 if(bestFrame)	393 if(bestFrame)

388 {	394 {

389 var newBody = document.createElement('body');	395 var newBody = document.createElement('body');

390 readability.moveNodeInnards(bestFrame.contentWindow.document.bod y, newBody);	396 readability.moveNodeInnards(bestFrame.contentWindow.document.bod y, newBody);

391 newBody.style.overflow = 'scroll';	397 newBody.style.overflow = 'scroll';

392 document.body = newBody;	398 document.body = newBody;

393	399

394 var frameset = document.getElementsByTagName('frameset')[0];	400 var frameset = document.getElementsByTagName('frameset')[0];

395 if(frameset) {	401 if(frameset) {

396 frameset.parentNode.removeChild(frameset); }	402 frameset.parentNode.removeChild(frameset); }

397 }	403 }

398 }	404 }

399	405

400 /* Remove all stylesheets */	406 /* Remove all stylesheets */

401 for (var k=0;k < document.styleSheets.length; k+=1) {	407 for (var k=0;k < document.styleSheets.length; k+=1) {

402 if (document.styleSheets[k].href !== null && document.styleSheets[k] .href.lastIndexOf("readability") === -1) {	408 if (document.styleSheets[k].href !== null && document.styleSheets[k] .href.lastIndexOf("readability") === -1) {

403 document.styleSheets[k].disabled = true;	409 document.styleSheets[k].disabled = true;

(...skipping 44 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
448 readability.cleanConditionally(articleContent, "table");	454 readability.cleanConditionally(articleContent, "table");

449 readability.cleanConditionally(articleContent, "ul");	455 readability.cleanConditionally(articleContent, "ul");

450 readability.cleanConditionally(articleContent, "div");	456 readability.cleanConditionally(articleContent, "div");

451	457

452 /* Remove extra paragraphs */	458 /* Remove extra paragraphs */

453 var articleParagraphs = articleContent.getElementsByTagName('p');	459 var articleParagraphs = articleContent.getElementsByTagName('p');

454 for(var i = articleParagraphs.length-1; i >= 0; i-=1) {	460 for(var i = articleParagraphs.length-1; i >= 0; i-=1) {

455 var imgCount = articleParagraphs[i].getElementsByTagName('img').l ength;	461 var imgCount = articleParagraphs[i].getElementsByTagName('img').l ength;

456 var embedCount = articleParagraphs[i].getElementsByTagName('embed') .length;	462 var embedCount = articleParagraphs[i].getElementsByTagName('embed') .length;

457 var objectCount = articleParagraphs[i].getElementsByTagName('object' ).length;	463 var objectCount = articleParagraphs[i].getElementsByTagName('object' ).length;

458	464

459 if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readab ility.getInnerText(articleParagraphs[i], false) === '') {	465 if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readab ility.getInnerText(articleParagraphs[i], false) === '') {

460 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i] );	466 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i] );

461 }	467 }

462 }	468 }

463	469

464 try {	470 try {

465 readability.replaceBrsWithPs(articleContent);	471 readability.replaceBrsWithPs(articleContent);

466 }	472 }

467 catch (e) {	473 catch (e) {

468 dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block -elements bug. Ignoring.: " + e);	474 dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block -elements bug. Ignoring.: " + e);

469 }	475 }

470 },	476 },

471	477

472 /**	478 /**

473 * Initialize a node with the readability object. Also checks the	479 * Initialize a node with the readability object. Also checks the

474 * className/id for special names to add to its score.	480 * className/id for special names to add to its score.

475 *	481 *

476 * @param Element	482 * @param Element

477 * @return void	483 * @return void

478 **/	484 **/

479 initializeNode: function (node) {	485 initializeNode: function (node) {

480 node.readability = {"contentScore": 0};	486 node.readability = {"contentScore": 0};

481	487

482 switch(node.tagName) {	488 switch(node.tagName) {

483 case 'DIV':	489 case 'DIV':

484 node.readability.contentScore += 5;	490 node.readability.contentScore += 5;

485 break;	491 break;

486	492

487 case 'PRE':	493 case 'PRE':

488 case 'TD':	494 case 'TD':

489 case 'BLOCKQUOTE':	495 case 'BLOCKQUOTE':

490 node.readability.contentScore += 3;	496 node.readability.contentScore += 3;

491 break;	497 break;

492	498

493 case 'ADDRESS':	499 case 'ADDRESS':

494 case 'OL':	500 case 'OL':

495 case 'UL':	501 case 'UL':

496 case 'DL':	502 case 'DL':

497 case 'DD':	503 case 'DD':

498 case 'DT':	504 case 'DT':

499 case 'LI':	505 case 'LI':

500 case 'FORM':	506 case 'FORM':

501 node.readability.contentScore -= 3;	507 node.readability.contentScore -= 3;

502 break;	508 break;

503	509

504 case 'H1':	510 case 'H1':

505 case 'H2':	511 case 'H2':

506 case 'H3':	512 case 'H3':

507 case 'H4':	513 case 'H4':

508 case 'H5':	514 case 'H5':

509 case 'H6':	515 case 'H6':

510 case 'TH':	516 case 'TH':

511 node.readability.contentScore -= 5;	517 node.readability.contentScore -= 5;

512 break;	518 break;

513 }	519 }

514	520

515 node.readability.contentScore += readability.getClassWeight(node);	521 node.readability.contentScore += readability.getClassWeight(node);

516 },	522 },

517	523

518 /***	524 /***

519 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is	525 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is

520 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.	526 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.

521 *	527 *

522 * @param page a document to run upon. Needs to be a full document, complete with body.	528 * @param page a document to run upon. Needs to be a full document, complete with body.

523 * @return Element	529 * @return Element

524 **/	530 **/

525 grabArticle: function (pageToClone) {	531 grabArticle: function (pageToClone) {

526 var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_ STRIP_UNLIKELYS),	532 var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_ STRIP_UNLIKELYS),

527 isPaging = (page !== null) ? true: false;	533 isPaging = (page !== null) ? true: false;

528	534

529 var page = null;	535 var page = null;

530 // Never work on the actual page.	536 // Never work on the actual page.

531 if (isPaging) {	537 if (isPaging) {

532 page = document.body.cloneNode(true);	538 page = document.body.cloneNode(true);

533 } else {	539 } else {

534 page = pageToClone.cloneNode(true);	540 page = pageToClone.cloneNode(true);

535 }	541 }

536	542

537 var allElements = page.getElementsByTagName('*');	543 var allElements = page.getElementsByTagName('*');

538	544

539 /**	545 /**

540 * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs	546 * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs

541 * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)	547 * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)

542 *	548 *

543 * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5	549 * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5

544 * TODO: Shouldn't this be a reverse traversal?	550 * TODO: Shouldn't this be a reverse traversal?

545 **/	551 **/

546 var node = null;	552 var node = null;

547 var nodesToScore = [];	553 var nodesToScore = [];

548 for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) {	554 for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) {

549 /* Remove unlikely candidates */	555 /* Remove unlikely candidates */

550 if (stripUnlikelyCandidates) {	556 if (stripUnlikelyCandidates) {

551 var unlikelyMatchString = node.className + node.id;	557 var unlikelyMatchString = node.className + node.id;

552 if (	558 if (

553 (	559 (

554 unlikelyMatchString.search(readability.regexps.unlikelyC andidates) !== -1 &&	560 unlikelyMatchString.search(readability.regexps.unlikelyC andidates) !== -1 &&

555 unlikelyMatchString.search(readability.regexps.okMaybeIt sACandidate) === -1 &&	561 unlikelyMatchString.search(readability.regexps.okMaybeIt sACandidate) === -1 &&

556 node.tagName !== "BODY"	562 node.tagName !== "BODY"

557 )	563 )

558 )	564 )

559 {	565 {

560 dbg("Removing unlikely candidate - " + unlikelyMatchString);	566 dbg("Removing unlikely candidate - " + unlikelyMatchString);

561 node.parentNode.removeChild(node);	567 node.parentNode.removeChild(node);

562 nodeIndex-=1;	568 nodeIndex-=1;

563 continue;	569 continue;

564 }	570 }

565 }	571 }

566	572

567 if (node.tagName === "P" \|\| node.tagName === "TD" \|\| node.tagName == = "PRE") {	573 if (node.tagName === "P" \|\| node.tagName === "TD" \|\| node.tagName == = "PRE") {

568 nodesToScore[nodesToScore.length] = node;	574 nodesToScore[nodesToScore.length] = node;

569 }	575 }

570	576

571 /* Turn all divs that don't have children block level elements into p's */	577 /* Turn all divs that don't have children block level elements into p's */

572 if (node.tagName === "DIV") {	578 if (node.tagName === "DIV") {

573 if (node.innerHTML.search(readability.regexps.divToPElements) == = -1) {	579 if (node.innerHTML.search(readability.regexps.divToPElements) == = -1) {

574 var newNode = document.createElement('p');	580 var newNode = document.createElement('p');

(...skipping 16 matching lines...) Expand all Loading...
591 if(childNode.nodeType === 3) { // Node.TEXT_NODE	597 if(childNode.nodeType === 3) { // Node.TEXT_NODE

592 var p = document.createElement('p');	598 var p = document.createElement('p');

593 var t = document.createTextNode(childNode.nodeValue) ;	599 var t = document.createTextNode(childNode.nodeValue) ;

594 p.appendChild(t);	600 p.appendChild(t);

595 p.style.display = 'inline';	601 p.style.display = 'inline';

596 p.className = 'readability-styled';	602 p.className = 'readability-styled';

597 childNode.parentNode.replaceChild(p, childNode);	603 childNode.parentNode.replaceChild(p, childNode);

598 }	604 }

599 }	605 }

600 }	606 }

601 }	607 }

602 }	608 }

603	609

604 /**	610 /**

605 * Loop through all paragraphs, and assign a score to them based on how content-y they look.	611 * Loop through all paragraphs, and assign a score to them based on how content-y they look.

606 * Then add their score to their parent node.	612 * Then add their score to their parent node.

607 *	613 *

608 * A score is determined by things like number of commas, class names, e tc. Maybe eventually link density.	614 * A score is determined by things like number of commas, class names, e tc. Maybe eventually link density.

609 **/	615 **/

610 var candidates = [];	616 var candidates = [];

611 for (var pt=0; pt < nodesToScore.length; pt+=1) {	617 for (var pt=0; pt < nodesToScore.length; pt+=1) {

(...skipping 21 matching lines...) Expand all Loading...
633 candidates.push(grandParentNode);	639 candidates.push(grandParentNode);

634 }	640 }

635	641

636 var contentScore = 0;	642 var contentScore = 0;

637	643

638 /* Add a point for the paragraph itself as a base. */	644 /* Add a point for the paragraph itself as a base. */

639 contentScore+=1;	645 contentScore+=1;

640	646

641 /* Add points for any commas within this paragraph */	647 /* Add points for any commas within this paragraph */

642 contentScore += innerText.split(',').length;	648 contentScore += innerText.split(',').length;

643	649

644 /* For every 100 characters in this paragraph, add another point. Up to 3 points. */	650 /* For every 100 characters in this paragraph, add another point. Up to 3 points. */

645 contentScore += Math.min(Math.floor(innerText.length / 100), 3);	651 contentScore += Math.min(Math.floor(innerText.length / 100), 3);

646	652

647 /* Add the score to the parent. The grandparent gets half. */	653 /* Add the score to the parent. The grandparent gets half. */

648 parentNode.readability.contentScore += contentScore;	654 parentNode.readability.contentScore += contentScore;

649	655

650 if(grandParentNode) {	656 if(grandParentNode) {

651 grandParentNode.readability.contentScore += contentScore/2;	657 grandParentNode.readability.contentScore += contentScore/2;

652 }	658 }

653 }	659 }

654	660

655 /**	661 /**

656 * After we've calculated scores, loop through all of the possible candi date nodes we found	662 * After we've calculated scores, loop through all of the possible candi date nodes we found

657 * and find the one with the highest score.	663 * and find the one with the highest score.

658 **/	664 **/

659 var topCandidate = null;	665 var topCandidate = null;

660 for(var c=0, cl=candidates.length; c < cl; c+=1)	666 for(var c=0, cl=candidates.length; c < cl; c+=1)

661 {	667 {

(...skipping 56 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
718 var contentBonus = 0;	724 var contentBonus = 0;

719 /* Give a bonus if sibling nodes and top candidates have the exact same classname */	725 /* Give a bonus if sibling nodes and top candidates have the exact same classname */

720 if(siblingNode.className === topCandidate.className && topCandidate. className !== "") {	726 if(siblingNode.className === topCandidate.className && topCandidate. className !== "") {

721 contentBonus += topCandidate.readability.contentScore * 0.2;	727 contentBonus += topCandidate.readability.contentScore * 0.2;

722 }	728 }

723	729

724 if(typeof siblingNode.readability !== 'undefined' && (siblingNode.re adability.contentScore+contentBonus) >= siblingScoreThreshold)	730 if(typeof siblingNode.readability !== 'undefined' && (siblingNode.re adability.contentScore+contentBonus) >= siblingScoreThreshold)

725 {	731 {

726 append = true;	732 append = true;

727 }	733 }

728	734

729 if(siblingNode.nodeName === "P") {	735 if(siblingNode.nodeName === "P") {

730 var linkDensity = readability.getLinkDensity(siblingNode);	736 var linkDensity = readability.getLinkDensity(siblingNode);

731 var nodeContent = readability.getInnerText(siblingNode);	737 var nodeContent = readability.getInnerText(siblingNode);

732 var nodeLength = nodeContent.length;	738 var nodeLength = nodeContent.length;

733	739

734 if(nodeLength > 80 && linkDensity < 0.25)	740 if(nodeLength > 80 && linkDensity < 0.25)

735 {	741 {

736 append = true;	742 append = true;

737 }	743 }

738 else if(nodeLength < 80 && linkDensity === 0 && nodeContent.sear ch(/\.( \|$)/) !== -1)	744 else if(nodeLength < 80 && linkDensity === 0 && nodeContent.sear ch(/\.( \|$)/) !== -1)

739 {	745 {

740 append = true;	746 append = true;

741 }	747 }

742 }	748 }

743	749

744 if(append) {	750 if(append) {

745 dbg("Appending node: " + siblingNode);	751 dbg("Appending node: " + siblingNode);

746	752

747 var nodeToAppend = null;	753 var nodeToAppend = null;

748 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P ") {	754 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P ") {

749 /* We have a node that isn't a common block level element, l ike a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */	755 /* We have a node that isn't a common block level element, l ike a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */

750	756

751 dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');	757 dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');

752 nodeToAppend = document.createElement("DIV");	758 nodeToAppend = document.createElement("DIV");

753 try {	759 try {

754 nodeToAppend.id = siblingNode.id;	760 nodeToAppend.id = siblingNode.id;

755 readability.moveNodeInnards(siblingNode, nodeToAppend);	761 readability.moveNodeInnards(siblingNode, nodeToAppend);

756 }	762 }

757 catch(er) {	763 catch(er) {

758 dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");	764 dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");

759 nodeToAppend = siblingNode;	765 nodeToAppend = siblingNode;

760 s-=1;	766 s-=1;

761 sl-=1;	767 sl-=1;

762 }	768 }

763 } else {	769 } else {

764 nodeToAppend = siblingNode;	770 nodeToAppend = siblingNode;

765 s-=1;	771 s-=1;

766 sl-=1;	772 sl-=1;

767 }	773 }

768	774

769 /* To ensure a node does not interfere with readability styles, remove its classnames */	775 /* To ensure a node does not interfere with readability styles, remove its classnames */

770 nodeToAppend.className = "";	776 nodeToAppend.className = "";

771	777

772 /* Append sibling and subtract from our list because it removes the node when you append to another node */	778 /* Append sibling and subtract from our list because it removes the node when you append to another node */

773 articleContent.appendChild(nodeToAppend);	779 articleContent.appendChild(nodeToAppend);

774 }	780 }

775 }	781 }

776	782

777 /**	783 /**

778 * So we have all of the content that we need. Now we clean it up for pr esentation.	784 * So we have all of the content that we need. Now we clean it up for pr esentation.

779 **/	785 **/

780 readability.distilledArticleContent = articleContent.cloneNode(true);	786 readability.distilledArticleContent = articleContent.cloneNode(true);

781 //readability.prepArticle(articleContent);	787 //readability.prepArticle(articleContent);

782	788

783 if (readability.curPageNum === 1) {	789 if (readability.curPageNum === 1) {

784 var newNode = document.createElement('div');	790 var newNode = document.createElement('div');

785 newNode.id = "readability-page-1";	791 newNode.id = "readability-page-1";

786 newNode.setAttribute("class", "page");	792 newNode.setAttribute("class", "page");

787 readability.moveNodeInnards(articleContent, newNode);	793 readability.moveNodeInnards(articleContent, newNode);

788 articleContent.appendChild(newNode);	794 articleContent.appendChild(newNode);

789 }	795 }

790	796

791 /**	797 /**

792 * Now that we've gone through the full algorithm, check to see if we go t any meaningful content.	798 * Now that we've gone through the full algorithm, check to see if we go t any meaningful content.

793 * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher	799 * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher

794 * likelihood of finding the content, and the sieve approach gives us a higher likelihood of	800 * likelihood of finding the content, and the sieve approach gives us a higher likelihood of

795 * finding the -right- content.	801 * finding the -right- content.

796 **/	802 **/

797 if(readability.getInnerText(articleContent, false).length < 250) {	803 if(readability.getInnerText(articleContent, false).length < 250) {

798 if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) {	804 if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) {

799 readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS);	805 readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS);

800 return readability.grabArticle(document.body);	806 return readability.grabArticle(document.body);

801 }	807 }

802 else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {	808 else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {

803 readability.removeFlag(readability.FLAG_WEIGHT_CLASSES);	809 readability.removeFlag(readability.FLAG_WEIGHT_CLASSES);

804 return readability.grabArticle(document.body);	810 return readability.grabArticle(document.body);

805 }	811 }

806 else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONAL LY)) {	812 else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONAL LY)) {

807 readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY);	813 readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY);

808 return readability.grabArticle(document.body);	814 return readability.grabArticle(document.body);

809 } else {	815 } else {

810 return null;	816 return null;

811 }	817 }

812 }	818 }

813	819

814 return articleContent;	820 return articleContent;

815 },	821 },

816	822

817 /**	823 /**

818 * Removes script tags from the document.	824 * Removes script tags from the document.

819 *	825 *

820 * @param Element	826 * @param Element

821 **/	827 **/

822 removeScripts: function (doc) {	828 removeScripts: function (doc) {

823 var scripts = doc.getElementsByTagName('script');	829 var scripts = doc.getElementsByTagName('script');

824 for(var i = scripts.length-1; i >= 0; i-=1)	830 for(var i = scripts.length-1; i >= 0; i-=1)

825 {	831 {

826 if(typeof(scripts[i].src) === "undefined" \|\| (scripts[i].src.indexOf ('readability') === -1 && scripts[i].src.indexOf('typekit') === -1))	832 if(typeof(scripts[i].src) === "undefined" \|\| (scripts[i].src.indexOf ('readability') === -1 && scripts[i].src.indexOf('typekit') === -1))

827 {	833 {

828 scripts[i].nodeValue="";	834 scripts[i].nodeValue="";

829 scripts[i].removeAttribute('src');	835 scripts[i].removeAttribute('src');

830 if (scripts[i].parentNode) {	836 if (scripts[i].parentNode) {

831 scripts[i].parentNode.removeChild(scripts[i]);	837 scripts[i].parentNode.removeChild(scripts[i]);

832 }	838 }

833 }	839 }

834 }	840 }

835 },	841 },

836	842

837 /**	843 /**

838 * Get the inner text of a node - cross browser compatibly.	844 * Get the inner text of a node - cross browser compatibly.

839 * This also strips out any excess whitespace to be found.	845 * This also strips out any excess whitespace to be found.

840 *	846 *

841 * @param Element	847 * @param Element

842 * @return string	848 * @return string

843 **/	849 **/

844 getInnerText: function (e, normalizeSpaces) {	850 getInnerText: function (e, normalizeSpaces) {

845 var textContent = "";	851 var textContent = "";

846	852

(...skipping 42 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
889	895

890 // Remove any root styles, if we're able.	896 // Remove any root styles, if we're able.

891 if(typeof e.removeAttribute === 'function' && e.className !== 'readabili ty-styled') {	897 if(typeof e.removeAttribute === 'function' && e.className !== 'readabili ty-styled') {

892 e.removeAttribute('style'); }	898 e.removeAttribute('style'); }

893	899

894 // Go until there are no more child nodes	900 // Go until there are no more child nodes

895 while ( cur !== null ) {	901 while ( cur !== null ) {

896 if ( cur.nodeType === 1 ) {	902 if ( cur.nodeType === 1 ) {

897 // Remove style attribute(s) :	903 // Remove style attribute(s) :

898 if(cur.className !== "readability-styled") {	904 if(cur.className !== "readability-styled") {

899 cur.removeAttribute("style");	905 cur.removeAttribute("style");

900 }	906 }

901 readability.cleanStyles( cur );	907 readability.cleanStyles( cur );

902 }	908 }

903 cur = cur.nextSibling;	909 cur = cur.nextSibling;

904 }	910 }

905 },	911 },

906	912

907 /**	913 /**

908 * Get the density of links as a percentage of the content	914 * Get the density of links as a percentage of the content

909 * This is the amount of text that is inside a link divided by the total tex t in the node.	915 * This is the amount of text that is inside a link divided by the total tex t in the node.

910 *	916 *

911 * @param Element	917 * @param Element

912 * @return number (float)	918 * @return number (float)

913 **/	919 **/

914 getLinkDensity: function (e) {	920 getLinkDensity: function (e) {

915 var links = e.getElementsByTagName("a");	921 var links = e.getElementsByTagName("a");

916 var textLength = readability.getInnerText(e).length;	922 var textLength = readability.getInnerText(e).length;

917 var linkLength = 0;	923 var linkLength = 0;

918 for(var i=0, il=links.length; i<il;i+=1)	924 for(var i=0, il=links.length; i<il;i+=1)

919 {	925 {

920 linkLength += readability.getInnerText(links[i]).length;	926 linkLength += readability.getInnerText(links[i]).length;

921 }	927 }

922	928

923 return linkLength / textLength;	929 return linkLength / textLength;

924 },	930 },

925	931

926 /**	932 /**

927 * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.	933 * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.

928 *	934 *

929 * @author Dan Lacy	935 * @author Dan Lacy

930 * @return string the base url	936 * @return string the base url

931 **/	937 **/

932 findBaseUrl: function () {	938 findBaseUrl: function () {

933 var noUrlParams = window.location.pathname.split("?")[0],	939 var noUrlParams = window.location.pathname.split("?")[0],

934 urlSlashes = noUrlParams.split("/").reverse(),	940 urlSlashes = noUrlParams.split("/").reverse(),

935 cleanedSegments = [],	941 cleanedSegments = [],

936 possibleType = "";	942 possibleType = "";

937	943

938 for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) {	944 for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) {

939 var segment = urlSlashes[i];	945 var segment = urlSlashes[i];

940	946

941 // Split off and save anything that looks like a file type.	947 // Split off and save anything that looks like a file type.

942 if (segment.indexOf(".") !== -1) {	948 if (segment.indexOf(".") !== -1) {

943 possibleType = segment.split(".")[1];	949 possibleType = segment.split(".")[1];

944	950

945 /* If the type isn't alpha-only, it's probably not actually a fi le extension. */	951 /* If the type isn't alpha-only, it's probably not actually a fi le extension. */

946 if(!possibleType.match(/[^a-zA-Z]/)) {	952 if(!possibleType.match(/[^a-zA-Z]/)) {

947 segment = segment.split(".")[0];	953 segment = segment.split(".")[0];

948 }	954 }

949 }	955 }

950	956

951 /**	957 /**

952 * EW-CMS specific segment replacement. Ugly.	958 * EW-CMS specific segment replacement. Ugly.

953 * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.htm l	959 * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.htm l

954 **/	960 **/

955 if(segment.indexOf(',00') !== -1) {	961 if(segment.indexOf(',00') !== -1) {

956 segment = segment.replace(',00', '');	962 segment = segment.replace(',00', '');

957 }	963 }

958	964

959 // If our first or second segment has anything looking like a page n umber, remove it.	965 // If our first or second segment has anything looking like a page n umber, remove it.

960 if (segment.match(/((_\|-)?p[a-z]*\|(_\|-))[0-9]{1,2}$/i) && ((i === 1) \|\| (i === 0))) {	966 if (segment.match(/((_\|-)?p[a-z]*\|(_\|-))[0-9]{1,2}$/i) && ((i === 1) \|\| (i === 0))) {

961 segment = segment.replace(/((_\|-)?p[a-z]*\|(_\|-))[0-9]{1,2}$/i, " ");	967 segment = segment.replace(/((_\|-)?p[a-z]*\|(_\|-))[0-9]{1,2}$/i, " ");

962 }	968 }

963	969

964	970

965 var del = false;	971 var del = false;

966	972

967 /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */	973 /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */

968 if (i < 2 && segment.match(/^\d{1,2}$/)) {	974 if (i < 2 && segment.match(/^\d{1,2}$/)) {

969 del = true;	975 del = true;

970 }	976 }

971	977

972 /* If this is the first segment and it's just "index", remove it. */	978 /* If this is the first segment and it's just "index", remove it. */

973 if(i === 0 && segment.toLowerCase() === "index") {	979 if(i === 0 && segment.toLowerCase() === "index") {

974 del = true;	980 del = true;

975 }	981 }

976	982

977	983

978 /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */	984 /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */

979 if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) {	985 if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) {

980 del = true;	986 del = true;

981 }	987 }

982	988

983 /* If it's not marked for deletion, push it to cleanedSegments. */	989 /* If it's not marked for deletion, push it to cleanedSegments. */

984 if (!del) {	990 if (!del) {

985 cleanedSegments.push(segment);	991 cleanedSegments.push(segment);

986 }	992 }

987 }	993 }

988	994

989 // This is our final, cleaned, base article URL.	995 // This is our final, cleaned, base article URL.

990 return window.location.protocol + "//" + window.location.host + cleanedS egments.reverse().join("/");	996 return window.location.protocol + "//" + window.location.host + cleanedS egments.reverse().join("/");

991 },	997 },

992	998

993 /**	999 /**

994 * Look for any paging links that may occur within the document.	1000 * Look for any paging links that may occur within the document.

995 *	1001 *

996 * @param body	1002 * @param body

997 * @return object (array)	1003 * @return object (array)

998 **/	1004 **/

999 findNextPageLink: function (elem) {	1005 findNextPageLink: function (elem) {

1000 var possiblePages = {},	1006 var possiblePages = {},

1001 allLinks = elem.getElementsByTagName('a'),	1007 allLinks = elem.getElementsByTagName('a'),

1002 articleBaseUrl = readability.findBaseUrl();	1008 articleBaseUrl = readability.findBaseUrl();

1003	1009

1004 /**	1010 /**

1005 * Loop through all links, looking for hints that they may be next-page links.	1011 * Loop through all links, looking for hints that they may be next-page links.

1006 * Things like having "page" in their textContent, className or id, or b eing a child	1012 * Things like having "page" in their textContent, className or id, or b eing a child

1007 * of a node with a page-y className or id.	1013 * of a node with a page-y className or id.

1008 *	1014 *

1009 * Also possible: levenshtein distance? longest common subsequence?	1015 * Also possible: levenshtein distance? longest common subsequence?

1010 *	1016 *

1011 * After we do that, assign each page a score, and	1017 * After we do that, assign each page a score, and

1012 **/	1018 **/

1013 for(var i = 0, il = allLinks.length; i < il; i+=1) {	1019 for(var i = 0, il = allLinks.length; i < il; i+=1) {

1014 var link = allLinks[i],	1020 var link = allLinks[i],

1015 linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, ' ');	1021 linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, ' ');

1016	1022

1017 /* If we've already seen this page, ignore it */	1023 /* If we've already seen this page, ignore it */

1018 if(linkHref === "" \|\| linkHref === articleBaseUrl \|\| linkHref === wi ndow.location.href \|\| linkHref in readability.parsedPages) {	1024 if(linkHref === "" \|\| linkHref === articleBaseUrl \|\| linkHref === wi ndow.location.href \|\| linkHref in readability.parsedPages) {

1019 continue;	1025 continue;

1020 }	1026 }

1021	1027

1022 /* If it's on a different domain, skip it. */	1028 /* If it's on a different domain, skip it. */

1023 if(window.location.host !== linkHref.split(/\/+/g)[1]) {	1029 if(window.location.host !== linkHref.split(/\/+/g)[1]) {

1024 continue;	1030 continue;

1025 }	1031 }

1026	1032

1027 var linkText = readability.getInnerText(link);	1033 var linkText = readability.getInnerText(link);

1028	1034

1029 /* If the linkText looks like it's not the next page, skip it. */	1035 /* If the linkText looks like it's not the next page, skip it. */

1030 if(linkText.match(readability.regexps.extraneous) \|\| linkText.length > 25) {	1036 if(linkText.match(readability.regexps.extraneous) \|\| linkText.length > 25) {

1031 continue;	1037 continue;

1032 }	1038 }

1033	1039

1034 /* If the leftovers of the URL after removing the base URL don't con tain any digits, it's certainly not a next page link. */	1040 /* If the leftovers of the URL after removing the base URL don't con tain any digits, it's certainly not a next page link. */

1035 var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');	1041 var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');

1036 if(!linkHrefLeftover.match(/\d/)) {	1042 if(!linkHrefLeftover.match(/\d/)) {

1037 continue;	1043 continue;

1038 }	1044 }

1039	1045

1040 if(!(linkHref in possiblePages)) {	1046 if(!(linkHref in possiblePages)) {

1041 possiblePages[linkHref] = {"score": 0, "linkText": linkText, "hr ef": linkHref};	1047 possiblePages[linkHref] = {"score": 0, "linkText": linkText, "hr ef": linkHref};

1042 } else {	1048 } else {

1043 possiblePages[linkHref].linkText += ' \| ' + linkText;	1049 possiblePages[linkHref].linkText += ' \| ' + linkText;

1044 }	1050 }

1045	1051

1046 var linkObj = possiblePages[linkHref];	1052 var linkObj = possiblePages[linkHref];

1047	1053

1048 /**	1054 /**

1049 * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower.	1055 * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower.

1050 * Example: http://www.actionscript.org/resources/articles/745/1/Jav aScript-and-VBScript-Injection-in-ActionScript-3/Page1.html	1056 * Example: http://www.actionscript.org/resources/articles/745/1/Jav aScript-and-VBScript-Injection-in-ActionScript-3/Page1.html

1051 **/	1057 **/

1052 if(linkHref.indexOf(articleBaseUrl) !== 0) {	1058 if(linkHref.indexOf(articleBaseUrl) !== 0) {

1053 linkObj.score -= 25;	1059 linkObj.score -= 25;

1054 }	1060 }

1055	1061

1056 var linkData = linkText + ' ' + link.className + ' ' + link.id;	1062 var linkData = linkText + ' ' + link.className + ' ' + link.id;

1057 if(linkData.match(readability.regexps.nextLink)) {	1063 if(linkData.match(readability.regexps.nextLink)) {

1058 linkObj.score += 50;	1064 linkObj.score += 50;

1059 }	1065 }

1060 if(linkData.match(/pag(e\|ing\|inat)/i)) {	1066 if(linkData.match(/pag(e\|ing\|inat)/i)) {

1061 linkObj.score += 25;	1067 linkObj.score += 25;

1062 }	1068 }

1063 if(linkData.match(/(first\|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text,	1069 if(linkData.match(/(first\|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text,

1064 /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */	1070 /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */

1065 if(!linkObj.linkText.match(readability.regexps.nextLink)) {	1071 if(!linkObj.linkText.match(readability.regexps.nextLink)) {

1066 linkObj.score -= 65;	1072 linkObj.score -= 65;

1067 }	1073 }

1068 }	1074 }

1069 if(linkData.match(readability.regexps.negative) \|\| linkData.match(re adability.regexps.extraneous)) {	1075 if(linkData.match(readability.regexps.negative) \|\| linkData.match(re adability.regexps.extraneous)) {

1070 linkObj.score -= 50;	1076 linkObj.score -= 50;

1071 }	1077 }

1072 if(linkData.match(readability.regexps.prevLink)) {	1078 if(linkData.match(readability.regexps.prevLink)) {

1073 linkObj.score -= 200;	1079 linkObj.score -= 200;

1074 }	1080 }

1075	1081

1076 /* If a parentNode contains page or paging or paginat */	1082 /* If a parentNode contains page or paging or paginat */

1077 var parentNode = link.parentNode,	1083 var parentNode = link.parentNode,

1078 positiveNodeMatch = false,	1084 positiveNodeMatch = false,

1079 negativeNodeMatch = false;	1085 negativeNodeMatch = false;

1080 while(parentNode) {	1086 while(parentNode) {

1081 var parentNodeClassAndId = parentNode.className + ' ' + parentNo de.id;	1087 var parentNodeClassAndId = parentNode.className + ' ' + parentNo de.id;

1082 if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(/pag(e\|ing\|inat)/i)) {	1088 if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(/pag(e\|ing\|inat)/i)) {

1083 positiveNodeMatch = true;	1089 positiveNodeMatch = true;

1084 linkObj.score += 25;	1090 linkObj.score += 25;

1085 }	1091 }

1086 if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(readability.regexps.negative)) {	1092 if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(readability.regexps.negative)) {

1087 /* If this is just something like "footer", give it a negati ve. If it's something like "body-and-footer", leave it be. */	1093 /* If this is just something like "footer", give it a negati ve. If it's something like "body-and-footer", leave it be. */

1088 if(!parentNodeClassAndId.match(readability.regexps.positive) ) {	1094 if(!parentNodeClassAndId.match(readability.regexps.positive) ) {

1089 linkObj.score -= 25;	1095 linkObj.score -= 25;

1090 negativeNodeMatch = true;	1096 negativeNodeMatch = true;

1091 }	1097 }

1092 }	1098 }

1093	1099

1094 parentNode = parentNode.parentNode;	1100 parentNode = parentNode.parentNode;

1095 }	1101 }

1096	1102

1097 /**	1103 /**

1098 * If the URL looks like it has paging in it, add to the score.	1104 * If the URL looks like it has paging in it, add to the score.

1099 * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34	1105 * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34

1100 **/	1106 **/

1101 if (linkHref.match(/p(a\|g\|ag)?(e\|ing\|ination)?(=\|\/)[0-9]{1,2}/i) \|\| linkHref.match(/(page\|paging)/i)) {	1107 if (linkHref.match(/p(a\|g\|ag)?(e\|ing\|ination)?(=\|\/)[0-9]{1,2}/i) \|\| linkHref.match(/(page\|paging)/i)) {

1102 linkObj.score += 25;	1108 linkObj.score += 25;

1103 }	1109 }

(...skipping 41 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1145 topPage = possiblePages[page];	1151 topPage = possiblePages[page];

1146 }	1152 }

1147 }	1153 }

1148 }	1154 }

1149	1155

1150 if(topPage) {	1156 if(topPage) {

1151 var nextHref = topPage.href.replace(/\/$/,'');	1157 var nextHref = topPage.href.replace(/\/$/,'');

1152	1158

1153 dbg('NEXT PAGE IS ' + nextHref);	1159 dbg('NEXT PAGE IS ' + nextHref);

1154 readability.parsedPages[nextHref] = true;	1160 readability.parsedPages[nextHref] = true;

1155 return nextHref;	1161 return nextHref;

1156 }	1162 }

1157 else {	1163 else {

1158 return null;	1164 return null;

1159 }	1165 }

1160 },	1166 },

1161	1167

1162 createLinkDiv: function(link) {	1168 createLinkDiv: function(link) {

1163 var divNode = document.createElement('div');	1169 var divNode = document.createElement('div');

1164 var aNode = document.createElement('a');	1170 var aNode = document.createElement('a');

1165 var tNode = document.createTextNode('View Next Page');	1171 var tNode = document.createTextNode('View Next Page');

(...skipping 31 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1197 }	1203 }

1198 else {	1204 else {

1199 if (options.error) { options.error(request); }	1205 if (options.error) { options.error(request); }

1200 }	1206 }

1201 }	1207 }

1202 }	1208 }

1203	1209

1204 if (typeof options === 'undefined') { options = {}; }	1210 if (typeof options === 'undefined') { options = {}; }

1205	1211

1206 request.onreadystatechange = respondToReadyState;	1212 request.onreadystatechange = respondToReadyState;

1207	1213

1208 request.open('get', url, true);	1214 request.open('get', url, true);

1209 request.setRequestHeader('Accept', 'text/html');	1215 request.setRequestHeader('Accept', 'text/html');

1210	1216

1211 try {	1217 try {

1212 request.send(options.postBody);	1218 request.send(options.postBody);

1213 }	1219 }

1214 catch (e) {	1220 catch (e) {

1215 if (options.error) { options.error(); }	1221 if (options.error) { options.error(); }

1216 }	1222 }

1217	1223

(...skipping 14 matching lines...) Expand all Loading...
1232 articlePage.innerHTML = '<p class="page-separator" title="Page ' + reada bility.curPageNum + '">§</p>';	1238 articlePage.innerHTML = '<p class="page-separator" title="Page ' + reada bility.curPageNum + '">§</p>';

1233	1239

1234 document.getElementById("readability-content").appendChild(articlePage);	1240 document.getElementById("readability-content").appendChild(articlePage);

1235	1241

1236 if(readability.curPageNum > readability.maxPages) {	1242 if(readability.curPageNum > readability.maxPages) {

1237 var linkDiv = readability.createLinkDiv(nextPageLink);	1243 var linkDiv = readability.createLinkDiv(nextPageLink);

1238	1244

1239 articlePage.appendChild(linkDiv);	1245 articlePage.appendChild(linkDiv);

1240 return;	1246 return;

1241 }	1247 }

1242	1248

1243 /**	1249 /**

1244 * Now that we've built the article page DOM element, get the page conte nt	1250 * Now that we've built the article page DOM element, get the page conte nt

1245 * asynchronously and load the cleaned content into the div we created f or it.	1251 * asynchronously and load the cleaned content into the div we created f or it.

1246 **/	1252 **/

1247 (function(pageUrl, thisPage) {	1253 (function(pageUrl, thisPage) {

1248 readability.ajax(pageUrl, {	1254 readability.ajax(pageUrl, {

1249 success: function(r) {	1255 success: function(r) {

1250	1256

1251 /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */	1257 /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */

1252 var eTag = r.getResponseHeader('ETag');	1258 var eTag = r.getResponseHeader('ETag');

1253 if(eTag) {	1259 if(eTag) {

1254 if(eTag in readability.pageETags) {	1260 if(eTag in readability.pageETags) {

1255 dbg("Exact duplicate page found via ETag. Aborting." );	1261 dbg("Exact duplicate page found via ETag. Aborting." );

1256 articlePage.style.display = 'none';	1262 articlePage.style.display = 'none';

1257 return;	1263 return;

1258 } else {	1264 } else {

1259 readability.pageETags[eTag] = 1;	1265 readability.pageETags[eTag] = 1;

1260 }	1266 }

1261 }	1267 }

1262	1268

1263 // TODO: this ends up doubling up page numbers on NYTimes ar ticles. Need to generically parse those away.	1269 // TODO: this ends up doubling up page numbers on NYTimes ar ticles. Need to generically parse those away.

1264 var page = document.createElement("DIV");	1270 var page = document.createElement("DIV");

1265	1271

1266 /**	1272 /**

1267 * Do some preprocessing to our HTML to make it ready for ap pending.	1273 * Do some preprocessing to our HTML to make it ready for ap pending.

1268 * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript.	1274 * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript.

1269 * • Turn any noscript tags into divs so that we can parse t hem. This allows us to find any next page links hidden via javascript.	1275 * • Turn any noscript tags into divs so that we can parse t hem. This allows us to find any next page links hidden via javascript.

1270 * • Turn all double br's into p's - was handled by prepDocu ment in the original view.	1276 * • Turn all double br's into p's - was handled by prepDocu ment in the original view.

(...skipping 30 matching lines...) Expand all Loading...
1301 for(var i=1; i <= readability.curPageNum; i+=1) {	1307 for(var i=1; i <= readability.curPageNum; i+=1) {

1302 var rPage = document.getElementById('readability-pag e-' + i);	1308 var rPage = document.getElementById('readability-pag e-' + i);

1303 if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML ) !== -1) {	1309 if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML ) !== -1) {

1304 dbg('Duplicate of page ' + i + ' - skipping.');	1310 dbg('Duplicate of page ' + i + ' - skipping.');

1305 articlePage.style.display = 'none';	1311 articlePage.style.display = 'none';

1306 readability.parsedPages[pageUrl] = true;	1312 readability.parsedPages[pageUrl] = true;

1307 return;	1313 return;

1308 }	1314 }

1309 }	1315 }

1310 }	1316 }

1311	1317

1312 readability.removeScripts(content);	1318 readability.removeScripts(content);

1313	1319

1314 readability.moveNodeInnards(content, thisPage);	1320 readability.moveNodeInnards(content, thisPage);

1315	1321

1316 /**	1322 /**

1317 * After the page has rendered, post process the content. Th is delay is necessary because,	1323 * After the page has rendered, post process the content. Th is delay is necessary because,

1318 * in webkit at least, offsetWidth is not set in time to det ermine image width. We have to	1324 * in webkit at least, offsetWidth is not set in time to det ermine image width. We have to

1319 * wait a little bit for reflow to finish before we can fix floating images.	1325 * wait a little bit for reflow to finish before we can fix floating images.

1320 **/	1326 **/

1321 window.setTimeout(	1327 window.setTimeout(

1322 function() { readability.postProcessContent(thisPage); } ,	1328 function() { readability.postProcessContent(thisPage); } ,

1323 500	1329 500

1324 );	1330 );

1325	1331

1326 if(nextPageLink) {	1332 if(nextPageLink) {

1327 readability.appendNextPage(nextPageLink);	1333 readability.appendNextPage(nextPageLink);

1328 }	1334 }

1329 }	1335 }

1330 });	1336 });

1331 }(nextPageLink, articlePage));	1337 }(nextPageLink, articlePage));

1332 },	1338 },

1333	1339

1334 /**	1340 /**

1335 * Get an elements class/id weight. Uses regular expressions to tell if this	1341 * Get an elements class/id weight. Uses regular expressions to tell if this

1336 * element looks good or bad.	1342 * element looks good or bad.

1337 *	1343 *

1338 * @param Element	1344 * @param Element

1339 * @return number (Integer)	1345 * @return number (Integer)

1340 **/	1346 **/

1341 getClassWeight: function (e) {	1347 getClassWeight: function (e) {

1342 if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {	1348 if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {

1343 return 0;	1349 return 0;

1344 }	1350 }

1345	1351

(...skipping 29 matching lines...) Expand all Loading...
1375 /**	1381 /**

1376 * Remove extraneous break tags from a node.	1382 * Remove extraneous break tags from a node.

1377 *	1383 *

1378 * @param Element	1384 * @param Element

1379 * @return void	1385 * @return void

1380 **/	1386 **/

1381 killBreaks: function (e) {	1387 killBreaks: function (e) {

1382 var allElements = e.getElementsByTagName('*');	1388 var allElements = e.getElementsByTagName('*');

1383 while (i < allElements.length) {	1389 while (i < allElements.length) {

1384 readability.deleteExtraBreaks(allElements[i]);	1390 readability.deleteExtraBreaks(allElements[i]);

1385 i++;	1391 i++;

1386 }	1392 }

1387 },	1393 },

1388	1394

1389 /**	1395 /**

1390 * Clean a node of all elements of type "tag".	1396 * Clean a node of all elements of type "tag".

1391 * (Unless it's a youtube/vimeo video. People love movies.)	1397 * (Unless it's a youtube/vimeo video. People love movies.)

1392 *	1398 *

1393 * @param Element	1399 * @param Element

1394 * @param string tag to clean	1400 * @param string tag to clean

1395 * @return void	1401 * @return void

1396 **/	1402 **/

1397 clean: function (e, tag) {	1403 clean: function (e, tag) {

1398 var targetList = e.getElementsByTagName( tag );	1404 var targetList = e.getElementsByTagName( tag );

1399 var isEmbed = (tag === 'object' \|\| tag === 'embed');	1405 var isEmbed = (tag === 'object' \|\| tag === 'embed');

1400	1406

1401 for (var y=targetList.length-1; y >= 0; y-=1) {	1407 for (var y=targetList.length-1; y >= 0; y-=1) {

1402 /* Allow youtube and vimeo videos through as people usually want to see those. */	1408 /* Allow youtube and vimeo videos through as people usually want to see those. */

1403 if(isEmbed) {	1409 if(isEmbed) {

1404 var attributeValues = "";	1410 var attributeValues = "";

1405 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) {	1411 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) {

1406 attributeValues += targetList[y].attributes[i].value + '\|';	1412 attributeValues += targetList[y].attributes[i].value + '\|';

1407 }	1413 }

1408	1414

1409 /* First, check the elements attributes to see if any of them co ntain youtube or vimeo */	1415 /* First, check the elements attributes to see if any of them co ntain youtube or vimeo */

1410 if (attributeValues.search(readability.regexps.videos) !== -1) {	1416 if (attributeValues.search(readability.regexps.videos) !== -1) {

1411 continue;	1417 continue;

1412 }	1418 }

1413	1419

1414 /* Then check the elements inside this element for the same. */	1420 /* Then check the elements inside this element for the same. */

1415 if (targetList[y].innerHTML.search(readability.regexps.videos) ! == -1) {	1421 if (targetList[y].innerHTML.search(readability.regexps.videos) ! == -1) {

1416 continue;	1422 continue;

1417 }	1423 }

1418	1424

1419 }	1425 }

1420	1426

1421 targetList[y].parentNode.removeChild(targetList[y]);	1427 targetList[y].parentNode.removeChild(targetList[y]);

1422 }	1428 }

1423 },	1429 },

1424	1430

1425 /**	1431 /**

1426 * Clean an element of all tags of type "tag" if they look fishy.	1432 * Clean an element of all tags of type "tag" if they look fishy.

1427 * "Fishy" is an algorithm based on content length, classnames, link density , number of images & embeds, etc.	1433 * "Fishy" is an algorithm based on content length, classnames, link density , number of images & embeds, etc.

1428 *	1434 *

1429 * @return void	1435 * @return void

1430 **/	1436 **/

1431 cleanConditionally: function (e, tag) {	1437 cleanConditionally: function (e, tag) {

1432	1438

1433 if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {	1439 if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {

1434 return;	1440 return;

1435 }	1441 }

1436	1442

1437 var tagsList = e.getElementsByTagName(tag);	1443 var tagsList = e.getElementsByTagName(tag);

1438 var curTagsLength = tagsList.length;	1444 var curTagsLength = tagsList.length;

1439	1445

1440 /**	1446 /**

1441 * Gather counts for other typical elements embedded within.	1447 * Gather counts for other typical elements embedded within.

1442 * Traverse backwards so we can remove nodes at the same time without ef fecting the traversal.	1448 * Traverse backwards so we can remove nodes at the same time without ef fecting the traversal.

1443 *	1449 *

1444 * TODO: Consider taking into account original contentScore here.	1450 * TODO: Consider taking into account original contentScore here.

1445 **/	1451 **/

1446 for (var i=curTagsLength-1; i >= 0; i-=1) {	1452 for (var i=curTagsLength-1; i >= 0; i-=1) {

1447 var weight = readability.getClassWeight(tagsList[i]);	1453 var weight = readability.getClassWeight(tagsList[i]);

1448 var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0;	1454 var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0;

1449	1455

1450 dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].cla ssName + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'unde fined') ? (" with score " + tagsList[i].readability.contentScore) : ''));	1456 dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].cla ssName + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'unde fined') ? (" with score " + tagsList[i].readability.contentScore) : ''));

1451	1457

1452 if(weight+contentScore < 0)	1458 if(weight+contentScore < 0)

1453 {	1459 {

1454 tagsList[i].parentNode.removeChild(tagsList[i]);	1460 tagsList[i].parentNode.removeChild(tagsList[i]);

1455 }	1461 }

1456 else if ( readability.getCharCount(tagsList[i],',') < 10) {	1462 else if ( readability.getCharCount(tagsList[i],',') < 10) {

1457 /**	1463 /**

1458 * If there are not very many commas, and the number of	1464 * If there are not very many commas, and the number of

1459 * non-paragraph elements is more than paragraphs or other omino us signs, remove the element.	1465 * non-paragraph elements is more than paragraphs or other omino us signs, remove the element.

1460 **/	1466 **/

1461 var p = tagsList[i].getElementsByTagName("p").length;	1467 var p = tagsList[i].getElementsByTagName("p").length;

1462 var img = tagsList[i].getElementsByTagName("img").length;	1468 var img = tagsList[i].getElementsByTagName("img").length;

1463 var li = tagsList[i].getElementsByTagName("li").length-100;	1469 var li = tagsList[i].getElementsByTagName("li").length-100;

1464 var input = tagsList[i].getElementsByTagName("input").length;	1470 var input = tagsList[i].getElementsByTagName("input").length;

1465	1471

1466 var embedCount = 0;	1472 var embedCount = 0;

1467 var embeds = tagsList[i].getElementsByTagName("embed");	1473 var embeds = tagsList[i].getElementsByTagName("embed");

1468 for(var ei=0,il=embeds.length; ei < il; ei+=1) {	1474 for(var ei=0,il=embeds.length; ei < il; ei+=1) {

1469 if (embeds[ei].src.search(readability.regexps.videos) === -1 ) {	1475 if (embeds[ei].src.search(readability.regexps.videos) === -1 ) {

1470 embedCount+=1;	1476 embedCount+=1;

1471 }	1477 }

1472 }	1478 }

1473	1479

1474 var linkDensity = readability.getLinkDensity(tagsList[i]);	1480 var linkDensity = readability.getLinkDensity(tagsList[i]);

1475 var contentLength = readability.getInnerText(tagsList[i]).length ;	1481 var contentLength = readability.getInnerText(tagsList[i]).length ;

1476 var toRemove = false;	1482 var toRemove = false;

1477	1483

1478 if ( img > p ) {	1484 if ( img > p ) {

1479 toRemove = true;	1485 toRemove = true;

1480 } else if(li > p && tag !== "ul" && tag !== "ol") {	1486 } else if(li > p && tag !== "ul" && tag !== "ol") {

1481 toRemove = true;	1487 toRemove = true;

1482 } else if( input > Math.floor(p/3) ) {	1488 } else if( input > Math.floor(p/3) ) {

1483 toRemove = true;	1489 toRemove = true;

1484 } else if(contentLength < 25 && (img === 0 \|\| img > 2) ) {	1490 } else if(contentLength < 25 && (img === 0 \|\| img > 2) ) {

1485 toRemove = true;	1491 toRemove = true;

1486 } else if(weight < 25 && linkDensity > 0.2) {	1492 } else if(weight < 25 && linkDensity > 0.2) {

1487 toRemove = true;	1493 toRemove = true;

1488 } else if(weight >= 25 && linkDensity > 0.5) {	1494 } else if(weight >= 25 && linkDensity > 0.5) {

1489 toRemove = true;	1495 toRemove = true;

1490 } else if((embedCount === 1 && contentLength < 75) \|\| embedCount > 1) {	1496 } else if((embedCount === 1 && contentLength < 75) \|\| embedCount > 1) {

1491 toRemove = true;	1497 toRemove = true;

1492 }	1498 }

1493	1499

(...skipping 21 matching lines...) Expand all Loading...
1515 }	1521 }

1516 },	1522 },

1517	1523

1518 flagIsActive: function(flag) {	1524 flagIsActive: function(flag) {

1519 return (readability.flags & flag) > 0;	1525 return (readability.flags & flag) > 0;

1520 },	1526 },

1521	1527

1522 addFlag: function(flag) {	1528 addFlag: function(flag) {

1523 readability.flags = readability.flags \| flag;	1529 readability.flags = readability.flags \| flag;

1524 },	1530 },

1525	1531

1526 removeFlag: function(flag) {	1532 removeFlag: function(flag) {

1527 readability.flags = readability.flags & ~flag;	1533 readability.flags = readability.flags & ~flag;

1528 },	1534 },

1529	1535

1530 // Removes the children of \|src\| and appends them to \|dest\|.	1536 // Removes the children of \|src\| and appends them to \|dest\|.

1531 moveNodeInnards: function(src, dest) {	1537 moveNodeInnards: function(src, dest) {

1532 try {	1538 try {

1533 while (src.firstChild) {	1539 while (src.firstChild) {

1534 dest.appendChild(src.removeChild(src.firstChild));	1540 dest.appendChild(src.removeChild(src.firstChild));

1535 }	1541 }

(...skipping 48 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1584 var lastBr = readability.isMultipleBr(node, false);	1590 var lastBr = readability.isMultipleBr(node, false);

1585 var ret = false;	1591 var ret = false;

1586 while (lastBr && lastBr != node) {	1592 while (lastBr && lastBr != node) {

1587 var toRemove = lastBr;	1593 var toRemove = lastBr;

1588 lastBr = lastBr.previousSibling;	1594 lastBr = lastBr.previousSibling;

1589 toRemove.parentNode.removeChild(toRemove);	1595 toRemove.parentNode.removeChild(toRemove);

1590 ret = true;	1596 ret = true;

1591 }	1597 }

1592 return ret;	1598 return ret;

1593 },	1599 },

1594	1600

1595 // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a	1601 // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a

1596 // <P> node, and makes all next siblings of that pair children of <P>, up	1602 // <P> node, and makes all next siblings of that pair children of <P>, up

1597 // until the next pair of <BR> nodes is reached.	1603 // until the next pair of <BR> nodes is reached.

1598 replaceDoubleBrWithP: function(node) {	1604 replaceDoubleBrWithP: function(node) {

1599 // Check that we are starting with a BR.	1605 // Check that we are starting with a BR.

1600 var second = readability.isMultipleBr(node, true);	1606 var second = readability.isMultipleBr(node, true);

1601 if (!second) {	1607 if (!second) {

1602 return;	1608 return;

1603 }	1609 }

1604 // Make all next siblings of the second BR into children of a P.	1610 // Make all next siblings of the second BR into children of a P.

1605 var p = document.createElement('p');	1611 var p = document.createElement('p');

1606 var curr = second.nextSibling;	1612 var curr = second.nextSibling;

1607 while (curr) {	1613 while (curr) {

1608 if (readability.isMultipleBr(curr, true)) {	1614 if (readability.isMultipleBr(curr, true)) {

1609 break;	1615 break;

1610 }	1616 }

1611 var next = curr.nextSibling;	1617 var next = curr.nextSibling;

1612 p.appendChild(curr.parentNode.removeChild(curr));	1618 p.appendChild(curr.parentNode.removeChild(curr));

1613 curr = next;	1619 curr = next;

1614 }	1620 }

1615 var ret = curr;	1621 var ret = curr;

1616	1622

1617 // Remove all nodes between the first and second BR.	1623 // Remove all nodes between the first and second BR.

1618 curr = node.nextSibling;	1624 curr = node.nextSibling;

1619 while (curr && curr != second) {	1625 while (curr && curr != second) {

1620 var next = curr.nextSibling;	1626 var next = curr.nextSibling;

1621 curr.parentNode.removeChild(curr);	1627 curr.parentNode.removeChild(curr);

1622 curr = next;	1628 curr = next;

1623 }	1629 }

1624 // Remove the second BR.	1630 // Remove the second BR.

1625 second.parentNode.removeChild(second);	1631 second.parentNode.removeChild(second);

1626 // Replace the first BR with the P.	1632 // Replace the first BR with the P.

1627 node.parentNode.replaceChild(p, node);	1633 node.parentNode.replaceChild(p, node);

1628	1634

1629 return ret;	1635 return ret;

1630 },	1636 },

1631	1637

1632 // Returns true if the NodeList contains a double <BR>.	1638 // Returns true if the NodeList contains a double <BR>.

1633 hasDoubleBr: function(nodeList) {	1639 hasDoubleBr: function(nodeList) {

1634 for (var i = 0; i < nodeList.length; nodeList++) {	1640 for (var i = 0; i < nodeList.length; nodeList++) {

1635 if (readability.isMultipleBr(nodeList[i], true)) {	1641 if (readability.isMultipleBr(nodeList[i], true)) {

1636 return true;	1642 return true;

1637 }	1643 }

1638 }	1644 }

1639 return false;	1645 return false;

1640 },	1646 },

1641	1647

1642 // Replaces double <BR> tags with <P> tags.	1648 // Replaces double <BR> tags with <P> tags.

1643 replaceDoubleBrsWithPs: function(node) {	1649 replaceDoubleBrsWithPs: function(node) {

1644 var allElements = node.getElementsByTagName('BR');	1650 var allElements = node.getElementsByTagName('BR');

1645 var node = null;	1651 var node = null;

1646 while (allElements && allElements.length > 0 &&	1652 while (allElements && allElements.length > 0 &&

1647 readability.hasDoubleBr(allElements)) {	1653 readability.hasDoubleBr(allElements)) {

1648 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) {	1654 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) {

1649 var next = node;	1655 var next = node;

1650 while (next = readability.replaceDoubleBrWithP(next));	1656 while (next = readability.replaceDoubleBrWithP(next));

1651 }	1657 }

1652 allElements = document.body.getElementsByTagName('BR');	1658 allElements = document.body.getElementsByTagName('BR');

1653 }	1659 }

1654 },	1660 },

1655	1661

1656	1662

1657 // Replaces a BR and the whitespace that follows it with a P.	1663 // Replaces a BR and the whitespace that follows it with a P.

1658 replaceBrWithP: function(node) {	1664 replaceBrWithP: function(node) {

1659 if (!readability.isBrNode(node)) {	1665 if (!readability.isBrNode(node)) {

1660 return;	1666 return;

1661 }	1667 }

1662 var p = document.createElement('p');	1668 var p = document.createElement('p');

1663 var curr = node.nextSibling;	1669 var curr = node.nextSibling;

1664 while (curr && !isBrNode(curr)) {	1670 while (curr && !isBrNode(curr)) {

1665 var next = curr.nextSibling;	1671 var next = curr.nextSibling;

1666 if (readability.isWhitespaceNode(curr)) {	1672 if (readability.isWhitespaceNode(curr)) {

1667 curr.parentNode.removeChild(curr);	1673 curr.parentNode.removeChild(curr);

1668 } else {	1674 } else {

1669 p.appendChild(curr.parentNode.removeChild(curr));	1675 p.appendChild(curr.parentNode.removeChild(curr));

1670 }	1676 }

1671 curr = next;	1677 curr = next;

1672 }	1678 }

1673 node.parentNode.replaceChild(p, node);	1679 node.parentNode.replaceChild(p, node);

1674 return curr;	1680 return curr;

1675 },	1681 },

1676	1682

1677 // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> t ag	1683 // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> t ag

1678 // children of the <P>.	1684 // children of the <P>.

1679 replaceBrsWithPs: function(node) {	1685 replaceBrsWithPs: function(node) {

1680 var allElements = node.getElementsByTagName('BR');	1686 var allElements = node.getElementsByTagName('BR');

1681 var node = null;	1687 var node = null;

1682 while (allElements && allElements.length > 0) {	1688 while (allElements && allElements.length > 0) {

1683 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) {	1689 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) {

1684 var next = node;	1690 var next = node;

1685 while (next = readability.replaceBrWithP(next));	1691 while (next = readability.replaceBrWithP(next));

1686 }	1692 }

1687 allElements = document.body.getElementsByTagName('BR');	1693 allElements = document.body.getElementsByTagName('BR');

1688 }	1694 }

1689 },	1695 },

1690	1696

1691 // Replaces any tag with any other tag.	1697 // Replaces any tag with any other tag.

1692 replaceTagsWithTags: function(node, srcTag, destTag) {	1698 replaceTagsWithTags: function(node, srcTag, destTag) {

1693 var allElements = node.getElementsByTagName(srcTag);	1699 var allElements = node.getElementsByTagName(srcTag);

1694 for (var i = 0; i < allElements.length; i++) {	1700 for (var i = 0; i < allElements.length; i++) {

1695 var dest = document.createElement(destTag);	1701 var dest = document.createElement(destTag);

1696 readability.moveNodeInnards(allElements[i], dest);	1702 readability.moveNodeInnards(allElements[i], dest);

1697 node.replaceNode(dest, allElements[i]);	1703 allElements[i].parentNode.replaceChild(dest, allElements[i]);

1698 }	1704 }

1699 },	1705 },

1700	1706

1701 // Replaces all <noscript> tags with <p> tags.	1707 // Replaces all <noscript> tags with <p> tags.

1702 replaceNoscriptsWithPs: function(node) {	1708 replaceNoscriptsWithPs: function(node) {

1703 readability.replaceTagsWithTags(node, 'noscript', 'p');	1709 readability.replaceTagsWithTags(node, 'noscript', 'p');

1704 },	1710 },

1705	1711

1706 // Replaces all <font> tags with <span> tags.	1712 // Replaces all <font> tags with <span> tags.

1707 replaceFontsWithSpans: function(node) {	1713 replaceFontsWithSpans: function(node) {

1708 readability.replaceTagsWithTags(node, 'font', 'span');	1714 readability.replaceTagsWithTags(node, 'font', 'span');

1709 },	1715 },

1710	1716

1711 // Returns a list of image URLs in the distilled article.	1717 // Returns a list of image URLs in the distilled article.

1712 getImages : function() {	1718 getImages : function() {

1713 var images = document.getElementsByTagName('img');	1719 var images = document.getElementsByTagName('img');

1714 var result = new Array(images.length);	1720 var result = new Array(images.length);

1715 dbg("Number of images: " + images.length);	1721 dbg("Number of images: " + images.length);

1716 for(i = 0; i < images.length; i++) {	1722 for(i = 0; i < images.length; i++) {

1717 result[i] = images[i].src;	1723 result[i] = images[i].src;

1718 dbg("Image: " + result[i]);	1724 dbg("Image: " + result[i]);

1719 }	1725 }

1720 return result;	1726 return result;

1721 },	1727 },

1722	1728

1723 // Returns the distilled article HTML from the page(s).	1729 // Returns the distilled article HTML from the page(s).

1724 getDistilledArticleHTML : function() {	1730 getDistilledArticleHTML : function() {

1725 return readability.distilledHTML;	1731 return readability.distilledHTML;

	1732 },

	1733

	1734 // Returns the next page of this article.

	1735 getNextPageLink : function() {

	1736 return readability.nextPageLink;

1726 }	1737 }

1727 };	1738 };

1728	1739

// Extracts long-form content from a page and returns an array laid out as:
//   [0]   the article title (readability.getArticleTitle()),
//   [1]   HTML containing the long-form content
//         (readability.getDistilledArticleHTML()),
//   [2]   the link to the next page of the article
//         (readability.getNextPageLink()),
//   [3..] image URLs as returned by readability.getImages().
// NOTE(review): the earlier comment said each <img> id equals k - 2 for the
// URL at index k. With the next-page link now at index 2 the image URLs start
// one slot later, so that offset is presumably k - 3; confirm against the
// code that assigns the ids.
// The closing parenthesis is deliberately left without a trailing semicolon,
// as in the original, so the script still ends in the same bare expression.
(function () {
    readability.init();
    var result = [
        readability.getArticleTitle(),
        readability.getDistilledArticleHTML(),
        readability.getNextPageLink()
    ];
    return result.concat(readability.getImages());
}())

1741	1753

OLD	NEW

« no previous file with comments | « third_party/readability/README.chromium ('k') | no next file » | no next file with comments »

Powered by Google App Engine

This is Rietveld 408576698