third_party/readability/js/readability.js - Issue 146843010: Add support for multipage distillation. - Code Review

Chromium Code Reviews

chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out

(261)

My Issues | Starred Open | Closed | All

Side by Side Diff: third_party/readability/js/readability.js

Issue 146843010: Add support for multipage distillation. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Created 6 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« components/dom_distiller/core/distiller.cc ('K') | « components/dom_distiller/core/task_tracker_unittest.cc ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1	1

2 var dbg = (typeof console !== 'undefined') ? function(s) {	2 var dbg = (typeof console !== 'undefined') ? function(s) {

3 console.log("Readability: " + s);	3 console.log("Readability: " + s);

4 } : function() {};	4 } : function() {};

5	5

6 /*	6 /*

7 * Readability. An Arc90 Lab Experiment.	7 * Readability. An Arc90 Lab Experiment.

8 * Website: http://lab.arc90.com/experiments/readability	8 * Website: http://lab.arc90.com/experiments/readability

9 * Source: http://code.google.com/p/arc90labs-readability	9 * Source: http://code.google.com/p/arc90labs-readability

10 *	10 *

11 * "Readability" is a trademark of Arc90 Inc and may not be used without explici t permission.	11 * "Readability" is a trademark of Arc90 Inc and may not be used without explici t permission.

12 *	12 *

13 * Copyright (c) 2010 Arc90 Inc	13 * Copyright (c) 2010 Arc90 Inc

14 * Readability is licensed under the Apache License, Version 2.0.	14 * Readability is licensed under the Apache License, Version 2.0.

15 **/	15 **/

16 var readability = {	16 var readability = {

17 readStyle: "style-newspaper",	17 readStyle: "style-newspaper",

18 readSize: "size-medium",	18 readSize: "size-medium",

19 readMargin: "margin-wide",	19 readMargin: "margin-wide",

20	20

21 distilledHTML: '',	21 distilledHTML: '',

22 distilledArticleContent: null,	22 distilledArticleContent: null,

	23 nextPageLink: '',

23	24

24 version: '1.7.1',	25 version: '1.7.1',

25 iframeLoads: 0,	26 iframeLoads: 0,

26 convertLinksToFootnotes: false,	27 convertLinksToFootnotes: false,

27 reversePageScroll: false, /* If they hold shift and hit space, scroll up */	28 reversePageScroll: false, /* If they hold shift and hit space, scroll up */

28 frameHack: false, /**	29 frameHack: false, /**

29 * The frame hack is to workaround a firefo x bug where if you	30 * The frame hack is to workaround a firefo x bug where if you

30 * pull content out of a frame and stick it into the parent element, the scrollbar won't appear.	31 * pull content out of a frame and stick it into the parent element, the scrollbar won't appear.

31 * So we fake a scrollbar in the wrapping d iv.	32 * So we fake a scrollbar in the wrapping d iv.

32 **/	33 **/

33 biggestFrame: false,	34 biggestFrame: false,

34 flags: 0x1 \| 0x2 \| 0x4, /* Start with all flags set. */	35 flags: 0x1 \| 0x2 \| 0x4, /* Start with all flags set. */

35	36

36 /* constants */	37 /* constants */

37 FLAG_STRIP_UNLIKELYS: 0x1,	38 FLAG_STRIP_UNLIKELYS: 0x1,

38 FLAG_WEIGHT_CLASSES: 0x2,	39 FLAG_WEIGHT_CLASSES: 0x2,

39 FLAG_CLEAN_CONDITIONALLY: 0x4,	40 FLAG_CLEAN_CONDITIONALLY: 0x4,

40	41

41 maxPages: 30, /* The maximum number of pages to loop through before we ca ll it quits and just show a link. */	42 maxPages: 30, /* The maximum number of pages to loop through before we ca ll it quits and just show a link. */

42 parsedPages: {}, /* The list of pages we've parsed in this call of readabili ty, for autopaging. As a key store for easier searching. */	43 parsedPages: {}, /* The list of pages we've parsed in this call of readabili ty, for autopaging. As a key store for easier searching. */

43 pageETags: {}, /* A list of the ETag headers of pages we've parsed, in cas e they happen to match, we'll know it's a duplicate. */	44 pageETags: {}, /* A list of the ETag headers of pages we've parsed, in cas e they happen to match, we'll know it's a duplicate. */

44	45

45 /**	46 /**

46 * All of the regular expressions in use within readability.	47 * All of the regular expressions in use within readability.

47 * Defined up here so we don't instantiate them repeatedly in loops.	48 * Defined up here so we don't instantiate them repeatedly in loops.

48 **/	49 **/

49 regexps: {	50 regexps: {

50 unlikelyCandidates: /combx\|comment\|community\|disqus\|extra\|foot\|header \|menu\|remark\|rss\|shoutbox\|sidebar\|sponsor\|ad-break\|agegate\|pagination\|pager\|popu p\|tweet\|twitter/i,	51 unlikelyCandidates: /combx\|comment\|community\|disqus\|extra\|foot\|header \|menu\|remark\|rss\|shoutbox\|sidebar\|sponsor\|ad-break\|agegate\|pagination\|pager\|popu p\|tweet\|twitter/i,

51 okMaybeItsACandidate: /and\|article\|body\|column\|main\|shadow/i,	52 okMaybeItsACandidate: /and\|article\|body\|column\|main\|shadow/i,

52 positive: /article\|body\|content\|entry\|hentry\|main\|page\|pagi nation\|post\|text\|blog\|story/i,	53 positive: /article\|body\|content\|entry\|hentry\|main\|page\|pagi nation\|post\|text\|blog\|story/i,

53 negative: /combx\|comment\|com-\|contact\|foot\|footer\|footnote\| masthead\|media\|meta\|outbrain\|promo\|related\|scroll\|shoutbox\|sidebar\|sponsor\|shopp ing\|tags\|tool\|widget/i,	54 negative: /combx\|comment\|com-\|contact\|foot\|footer\|footnote\| masthead\|media\|meta\|outbrain\|promo\|related\|scroll\|shoutbox\|sidebar\|sponsor\|shopp ing\|tags\|tool\|widget/i,

54 extraneous: /print\|archive\|comment\|discuss\|e[\-]?mail\|share\|r eply\|all\|login\|sign\|single/i,	55 extraneous: /print\|archive\|comment\|discuss\|e[\-]?mail\|share\|r eply\|all\|login\|sign\|single/i,

55 divToPElements: /<(a\|blockquote\|dl\|div\|img\|ol\|p\|pre\|table\|ul)/i,	56 divToPElements: /<(a\|blockquote\|dl\|div\|img\|ol\|p\|pre\|table\|ul)/i,

56 replaceBrs: /(<br[^>]>[ \n\r\t]){2,}/gi,	57 replaceBrs: /(<br[^>]>[ \n\r\t]){2,}/gi,

57 replaceFonts: /<(\/?)font[^>]*>/gi,	58 replaceFonts: /<(\/?)font[^>]*>/gi,

58 trim: /^\s+\|\s+$/g,	59 trim: /^\s+\|\s+$/g,

59 normalize: /\s{2,}/g,	60 normalize: /\s{2,}/g,

60 killBreaks: /(<br\s\/?>(\s\| ?)){1,}/g,	61 killBreaks: /(<br\s\/?>(\s\| ?)){1,}/g,

61 videos: /http:\/\/(www\.)?(youtube\|vimeo)\.com/i,	62 videos: /http:\/\/(www\.)?(youtube\|vimeo)\.com/i,

62 skipFootnoteLink: /^\s(\[?[a-z0-9]{1,2}\]?\|^\|edit\|citation needed) \s$/i,	63 skipFootnoteLink: /^\s(\[?[a-z0-9]{1,2}\]?\|^\|edit\|citation needed) \s$/i,

63 nextLink: /(next\|weiter\|continue\|>([^\\|]\|$)\|»([^\\|]\|$))/i, // Match: next, continue, >, >>, » but not >\|, »\| as those usually mean last.	64 nextLink: /(next\|weiter\|continue\|>([^\\|]\|$)\|»([^\\|]\|$))/i, // Match: next, continue, >, >>, » but not >\|, »\| as those usually mean last.

64 prevLink: /(prev\|earl\|old\|new\|<\|«)/i	65 prevLink: /(prev\|earl\|old\|new\|<\|«)/i

65 },	66 },

66	67

67 /**	68 /**

68 * Runs readability.	69 * Runs readability.

69 *	70 *

70 * Workflow:	71 * Workflow:

71 * 1. Prep the document by removing script tags, css, etc.	72 * 1. Prep the document by removing script tags, css, etc.

72 * 2. Build readability's DOM tree.	73 * 2. Build readability's DOM tree.

73 * 3. Grab the article content from the current dom tree.	74 * 3. Grab the article content from the current dom tree.

74 * 4. Replace the current DOM tree with the new one.	75 * 4. Replace the current DOM tree with the new one.

75 * 5. Read peacefully.	76 * 5. Read peacefully.

76 *	77 *

77 * @return void	78 * @return void

78 **/	79 **/

79 init: function() {	80 init: function() {

80 /* Before we do anything, remove all scripts that are not readability. * /	81 /* Before we do anything, remove all scripts that are not readability. * /

81 window.onload = window.onunload = function() {};	82 window.onload = window.onunload = function() {};

82	83

83 readability.removeScripts(document);	84 readability.removeScripts(document);

84	85

85 /* Make sure this document is added to the list of parsed pages first, s o we don't double up on the first page */	86 /* Make sure this document is added to the list of parsed pages first, s o we don't double up on the first page */

86 readability.parsedPages[window.location.href.replace(/\/$/, '')] = true;	87 readability.parsedPages[window.location.href.replace(/\/$/, '')] = true;

87	88

88 /* Pull out any possible next page link first */	89 /* Pull out any possible next page link first */

89 var nextPageLink = readability.findNextPageLink(document.body);	90 readability.nextPageLink = readability.findNextPageLink(document.body);

90	91

	92 /* We handle processing of nextPage from C++ set nextPageLink to null */

	93 var nextPageLink = null;

	94

91 readability.prepDocument();	95 readability.prepDocument();

92	96

93 /* Build readability's DOM tree */	97 /* Build readability's DOM tree */

94 var overlay = document.createElement("DIV");	98 var overlay = document.createElement("DIV");

95 var innerDiv = document.createElement("DIV");	99 var innerDiv = document.createElement("DIV");

96 var articleTools = readability.getArticleTools();	100 var articleTools = readability.getArticleTools();

97 var articleTitleText = readability.getArticleTitle();	101 var articleTitleText = readability.getArticleTitle();

98 var articleContent = readability.grabArticle();	102 var articleContent = readability.grabArticle();

99	103

100 if(!articleContent) {	104 if(!articleContent) {

(...skipping 44 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
145 rootWarning.innerHTML = "<em>Readability</em> was intended for u se on individual articles and not home pages. " +	149 rootWarning.innerHTML = "<em>Readability</em> was intended for u se on individual articles and not home pages. " +

146 "If you'd like to try rendering this page anyway, <a onClick='ja vascript:document.getElementById(\"readability-warning\").style.display=\"none\" ;document.getElementById(\"readability-content\").style.display=\"block\";'>clic k here</a> to continue.";	150 "If you'd like to try rendering this page anyway, <a onClick='ja vascript:document.getElementById(\"readability-warning\").style.display=\"none\" ;document.getElementById(\"readability-content\").style.display=\"block\";'>clic k here</a> to continue.";

147	151

148 innerDiv.insertBefore( rootWarning, articleContent );	152 innerDiv.insertBefore( rootWarning, articleContent );

149 }	153 }

150	154

151 readability.postProcessContent(articleContent);	155 readability.postProcessContent(articleContent);

152	156

153 window.scrollTo(0, 0);	157 window.scrollTo(0, 0);

154	158

155 // TODO(bengr): Remove this assignment of null to nextPageLink when	159 // TODO(bengr): Remove this assignment of null to nextPageLink when
	Yaron 2014/01/29 20:03:41: Please remove this to-do and subsequent null-ing out. shashi 2014/01/29 22:51:37: Done. (On 2014/01/29 20:03:41, Yaron wrote: > Please remove this to-do and subsequent null-ing out)
156 // the processing of the next page link is safe.	160 // the processing of the next page link is safe.

157 nextPageLink = null;	161 nextPageLink = null;

158	162

159 if (nextPageLink) {	163 if (nextPageLink) {

160 /**	164 /**

161 * Append any additional pages after a small timeout so that people	165 * Append any additional pages after a small timeout so that people

162 * can start reading without having to wait for this to finish proce ssing.	166 * can start reading without having to wait for this to finish proce ssing.

163 **/	167 **/

164 window.setTimeout(function() {	168 window.setTimeout(function() {

165 readability.appendNextPage(nextPageLink);	169 readability.appendNextPage(nextPageLink);

166 }, 500);	170 }, 500);

167 }	171 }

168	172

169 /* Smooth scrolling */	173 /* Smooth scrolling */

170 document.onkeydown = function(e) {	174 document.onkeydown = function(e) {

171 var code = (window.event) ? event.keyCode : e.keyCode;	175 var code = (window.event) ? event.keyCode : e.keyCode;

172 if (code === 16) {	176 if (code === 16) {

173 readability.reversePageScroll = true;	177 readability.reversePageScroll = true;

174 return;	178 return;

175 }	179 }

176	180

177 if (code === 32) {	181 if (code === 32) {

178 readability.curScrollStep = 0;	182 readability.curScrollStep = 0;

179 var windowHeight = window.innerHeight ? window.innerHeight : (do cument.documentElement.clientHeight ? document.documentElement.clientHeight : do cument.body.clientHeight);	183 var windowHeight = window.innerHeight ? window.innerHeight : (do cument.documentElement.clientHeight ? document.documentElement.clientHeight : do cument.body.clientHeight);

180	184

181 if(readability.reversePageScroll) {	185 if(readability.reversePageScroll) {

182 readability.scrollTo(readability.scrollTop(), readability.sc rollTop() - (windowHeight - 50), 20, 10);	186 readability.scrollTo(readability.scrollTop(), readability.sc rollTop() - (windowHeight - 50), 20, 10);

183 }	187 }

184 else {	188 else {

185 readability.scrollTo(readability.scrollTop(), readability.sc rollTop() + (windowHeight - 50), 20, 10);	189 readability.scrollTo(readability.scrollTop(), readability.sc rollTop() + (windowHeight - 50), 20, 10);

186 }	190 }

187	191

188 return false;	192 return false;

189 }	193 }

190 };	194 };

191	195

192 document.onkeyup = function(e) {	196 document.onkeyup = function(e) {

193 var code = (window.event) ? event.keyCode : e.keyCode;	197 var code = (window.event) ? event.keyCode : e.keyCode;

194 if (code === 16) {	198 if (code === 16) {

195 readability.reversePageScroll = false;	199 readability.reversePageScroll = false;

196 return;	200 return;

197 }	201 }

198 };	202 };

199 },	203 },

200	204

201 /**	205 /**

202 * Run any post-process modifications to article content as necessary.	206 * Run any post-process modifications to article content as necessary.

203 *	207 *

204 * @param Element	208 * @param Element

205 * @return void	209 * @return void

206 **/	210 **/

207 postProcessContent: function(articleContent) {	211 postProcessContent: function(articleContent) {

208 if(readability.convertLinksToFootnotes && !window.location.href.match(/w ikipedia\.org/g)) {	212 if(readability.convertLinksToFootnotes && !window.location.href.match(/w ikipedia\.org/g)) {

209 readability.addFootnotes(articleContent);	213 readability.addFootnotes(articleContent);

210 }	214 }

211	215

212 readability.fixImageFloats(articleContent);	216 readability.fixImageFloats(articleContent);

213 },	217 },

214	218

215 /**	219 /**

216 * Some content ends up looking ugly if the image is too large to be floated .	220 * Some content ends up looking ugly if the image is too large to be floated .

217 * If the image is wider than a threshold (currently 55%), no longer float i t,	221 * If the image is wider than a threshold (currently 55%), no longer float i t,

218 * center it instead.	222 * center it instead.

219 *	223 *

220 * @param Element	224 * @param Element

221 * @return void	225 * @return void

222 **/	226 **/

223 fixImageFloats: function (articleContent) {	227 fixImageFloats: function (articleContent) {

224 var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0. 55,	228 var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0. 55,

225 images = articleContent.getElementsByTagName('img');	229 images = articleContent.getElementsByTagName('img');

226	230

227 for(var i=0, il = images.length; i < il; i+=1) {	231 for(var i=0, il = images.length; i < il; i+=1) {

228 var image = images[i];	232 var image = images[i];

229	233

230 if(image.offsetWidth > imageWidthThreshold) {	234 if(image.offsetWidth > imageWidthThreshold) {

231 image.className += " blockImage";	235 image.className += " blockImage";

232 }	236 }

233 }	237 }

234 },	238 },

235	239

236 /**	240 /**

237 * Get the article tools Element that has buttons like reload, print.	241 * Get the article tools Element that has buttons like reload, print.

238 *	242 *

239 * @return void	243 * @return void

240 **/	244 **/

241 getArticleTools: function () {	245 getArticleTools: function () {

242 var articleTools = document.createElement("DIV");	246 var articleTools = document.createElement("DIV");

243	247

244 articleTools.id = "readTools";	248 articleTools.id = "readTools";

245 articleTools.innerHTML =	249 articleTools.innerHTML =

246 "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" +	250 "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" +

247 "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" +	251 "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" +

248 "<a href='#' onclick='readability.emailBox(); return false;' title=' Email page' id='email-page'>Email Page</a>";	252 "<a href='#' onclick='readability.emailBox(); return false;' title=' Email page' id='email-page'>Email Page</a>";

249	253

250 return articleTools;	254 return articleTools;

251 },	255 },

252	256

253 /**	257 /**

254 * retuns the suggested direction of the string	258 * retuns the suggested direction of the string

255 *	259 *

256 * @return "rtl" \|\| "ltr"	260 * @return "rtl" \|\| "ltr"

257 **/	261 **/

258 getSuggestedDirection: function(text) {	262 getSuggestedDirection: function(text) {

259 function sanitizeText() {	263 function sanitizeText() {

260 return text.replace(/@\w+/, "");	264 return text.replace(/@\w+/, "");

261 }	265 }

262	266

263 function countMatches(match) {	267 function countMatches(match) {

264 var matches = text.match(new RegExp(match, "g"));	268 var matches = text.match(new RegExp(match, "g"));

265 return matches !== null ? matches.length : 0;	269 return matches !== null ? matches.length : 0;

266 }	270 }

267	271

268 function isRTL() {	272 function isRTL() {

269 var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]");	273 var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]");

270 var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]");	274 var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]");

271	275

272 // if 20% of chars are Hebrew or Arbic then direction is rtl	276 // if 20% of chars are Hebrew or Arbic then direction is rtl

273 return (count_heb + count_arb) * 100 / text.length > 20;	277 return (count_heb + count_arb) * 100 / text.length > 20;

274 }	278 }

275	279

276 text = sanitizeText(text);	280 text = sanitizeText(text);

277 return isRTL() ? "rtl" : "ltr";	281 return isRTL() ? "rtl" : "ltr";

278 },	282 },

279	283

280 /**	284 /**

281 * Get the article title as an H1.	285 * Get the article title as an H1.

282 *	286 *

283 * @return void	287 * @return void

284 **/	288 **/

285 getArticleTitle: function () {	289 getArticleTitle: function () {

286 var curTitle = "",	290 var curTitle = "",

287 origTitle = "";	291 origTitle = "";

288	292

289 try {	293 try {

290 curTitle = origTitle = document.title;	294 curTitle = origTitle = document.title;

291 if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */	295 if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */

292 curTitle = origTitle = readability.getInnerText(document.getElem entsByTagName('title')[0]);	296 curTitle = origTitle = readability.getInnerText(document.getElem entsByTagName('title')[0]);

293 }	297 }

294 }	298 }

295 catch(e) {}	299 catch(e) {}

296	300

297 if(curTitle.match(/ [\\|\-] /))	301 if(curTitle.match(/ [\\|\-] /))

298 {	302 {

299 curTitle = origTitle.replace(/(.)[\\|\-] ./gi,'$1');	303 curTitle = origTitle.replace(/(.)[\\|\-] ./gi,'$1');

300	304

301 if(curTitle.split(' ').length < 3) {	305 if(curTitle.split(' ').length < 3) {

302 curTitle = origTitle.replace(/[^\\|\-][\\|\-](.)/gi,'$1');	306 curTitle = origTitle.replace(/[^\\|\-][\\|\-](.)/gi,'$1');

303 }	307 }

304 }	308 }

305 else if(curTitle.indexOf(': ') !== -1)	309 else if(curTitle.indexOf(': ') !== -1)

306 {	310 {

307 curTitle = origTitle.replace(/.:(.)/gi, '$1');	311 curTitle = origTitle.replace(/.:(.)/gi, '$1');

308	312

309 if(curTitle.split(' ').length < 3) {	313 if(curTitle.split(' ').length < 3) {

310 curTitle = origTitle.replace(/[^:][:](.)/gi,'$1');	314 curTitle = origTitle.replace(/[^:][:](.)/gi,'$1');

(...skipping 12 matching lines...) Expand all Loading...
323	327

324 if(curTitle.split(' ').length <= 4) {	328 if(curTitle.split(' ').length <= 4) {

325 curTitle = origTitle;	329 curTitle = origTitle;

326 }	330 }

327 return curTitle;	331 return curTitle;

328 },	332 },

329	333

330 /**	334 /**

331 * Prepare the HTML document for readability to scrape it.	335 * Prepare the HTML document for readability to scrape it.

332 * This includes things like stripping javascript, CSS, and handling terribl e markup.	336 * This includes things like stripping javascript, CSS, and handling terribl e markup.

333 *	337 *

334 * @return void	338 * @return void

335 **/	339 **/

336 prepDocument: function () {	340 prepDocument: function () {

337 /**	341 /**

338 * In some cases a body element can't be found (if the HTML is totally h osed for example)	342 * In some cases a body element can't be found (if the HTML is totally h osed for example)

339 * so we create a new body node and append it to the document.	343 * so we create a new body node and append it to the document.

340 */	344 */

341 if(document.body === null)	345 if(document.body === null)

342 {	346 {

343 var body = document.createElement("body");	347 var body = document.createElement("body");

344 try {	348 try {

345 document.body = body;	349 document.body = body;

346 }	350 }

347 catch(e) {	351 catch(e) {

348 document.documentElement.appendChild(body);	352 document.documentElement.appendChild(body);

349 dbg(e);	353 dbg(e);

350 }	354 }

351 }	355 }

352	356

353 document.body.id = "readabilityBody";	357 document.body.id = "readabilityBody";

354	358

355 var frames = document.getElementsByTagName('frame');	359 var frames = document.getElementsByTagName('frame');

(...skipping 11 matching lines...) Expand all Loading...
367 canAccessFrame = true;	371 canAccessFrame = true;

368 }	372 }

369 catch(eFrames) {	373 catch(eFrames) {

370 dbg(eFrames);	374 dbg(eFrames);

371 }	375 }

372	376

373 if(frameSize > biggestFrameSize) {	377 if(frameSize > biggestFrameSize) {

374 biggestFrameSize = frameSize;	378 biggestFrameSize = frameSize;

375 readability.biggestFrame = frames[frameIndex];	379 readability.biggestFrame = frames[frameIndex];

376 }	380 }

377	381

378 if(canAccessFrame && frameSize > bestFrameSize)	382 if(canAccessFrame && frameSize > bestFrameSize)

379 {	383 {

380 readability.frameHack = true;	384 readability.frameHack = true;

381	385

382 bestFrame = frames[frameIndex];	386 bestFrame = frames[frameIndex];

383 bestFrameSize = frameSize;	387 bestFrameSize = frameSize;

384 }	388 }

385 }	389 }

386	390

387 if(bestFrame)	391 if(bestFrame)

388 {	392 {

389 var newBody = document.createElement('body');	393 var newBody = document.createElement('body');

390 readability.moveNodeInnards(bestFrame.contentWindow.document.bod y, newBody);	394 readability.moveNodeInnards(bestFrame.contentWindow.document.bod y, newBody);

391 newBody.style.overflow = 'scroll';	395 newBody.style.overflow = 'scroll';

392 document.body = newBody;	396 document.body = newBody;

393	397

394 var frameset = document.getElementsByTagName('frameset')[0];	398 var frameset = document.getElementsByTagName('frameset')[0];

395 if(frameset) {	399 if(frameset) {

396 frameset.parentNode.removeChild(frameset); }	400 frameset.parentNode.removeChild(frameset); }

397 }	401 }

398 }	402 }

399	403

400 /* Remove all stylesheets */	404 /* Remove all stylesheets */

401 for (var k=0;k < document.styleSheets.length; k+=1) {	405 for (var k=0;k < document.styleSheets.length; k+=1) {

402 if (document.styleSheets[k].href !== null && document.styleSheets[k] .href.lastIndexOf("readability") === -1) {	406 if (document.styleSheets[k].href !== null && document.styleSheets[k] .href.lastIndexOf("readability") === -1) {

403 document.styleSheets[k].disabled = true;	407 document.styleSheets[k].disabled = true;

(...skipping 44 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
448 readability.cleanConditionally(articleContent, "table");	452 readability.cleanConditionally(articleContent, "table");

449 readability.cleanConditionally(articleContent, "ul");	453 readability.cleanConditionally(articleContent, "ul");

450 readability.cleanConditionally(articleContent, "div");	454 readability.cleanConditionally(articleContent, "div");

451	455

452 /* Remove extra paragraphs */	456 /* Remove extra paragraphs */

453 var articleParagraphs = articleContent.getElementsByTagName('p');	457 var articleParagraphs = articleContent.getElementsByTagName('p');

454 for(var i = articleParagraphs.length-1; i >= 0; i-=1) {	458 for(var i = articleParagraphs.length-1; i >= 0; i-=1) {

455 var imgCount = articleParagraphs[i].getElementsByTagName('img').l ength;	459 var imgCount = articleParagraphs[i].getElementsByTagName('img').l ength;

456 var embedCount = articleParagraphs[i].getElementsByTagName('embed') .length;	460 var embedCount = articleParagraphs[i].getElementsByTagName('embed') .length;

457 var objectCount = articleParagraphs[i].getElementsByTagName('object' ).length;	461 var objectCount = articleParagraphs[i].getElementsByTagName('object' ).length;

458	462

459 if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readab ility.getInnerText(articleParagraphs[i], false) === '') {	463 if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readab ility.getInnerText(articleParagraphs[i], false) === '') {

460 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i] );	464 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i] );

461 }	465 }

462 }	466 }

463	467

464 try {	468 try {

465 readability.replaceBrsWithPs(articleContent);	469 readability.replaceBrsWithPs(articleContent);

466 }	470 }

467 catch (e) {	471 catch (e) {

468 dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block -elements bug. Ignoring.: " + e);	472 dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block -elements bug. Ignoring.: " + e);

469 }	473 }

470 },	474 },

471	475

472 /**	476 /**

473 * Initialize a node with the readability object. Also checks the	477 * Initialize a node with the readability object. Also checks the

474 * className/id for special names to add to its score.	478 * className/id for special names to add to its score.

475 *	479 *

476 * @param Element	480 * @param Element

477 * @return void	481 * @return void

478 **/	482 **/

479 initializeNode: function (node) {	483 initializeNode: function (node) {

480 node.readability = {"contentScore": 0};	484 node.readability = {"contentScore": 0};

481	485

482 switch(node.tagName) {	486 switch(node.tagName) {

483 case 'DIV':	487 case 'DIV':

484 node.readability.contentScore += 5;	488 node.readability.contentScore += 5;

485 break;	489 break;

486	490

487 case 'PRE':	491 case 'PRE':

488 case 'TD':	492 case 'TD':

489 case 'BLOCKQUOTE':	493 case 'BLOCKQUOTE':

490 node.readability.contentScore += 3;	494 node.readability.contentScore += 3;

491 break;	495 break;

492	496

493 case 'ADDRESS':	497 case 'ADDRESS':

494 case 'OL':	498 case 'OL':

495 case 'UL':	499 case 'UL':

496 case 'DL':	500 case 'DL':

497 case 'DD':	501 case 'DD':

498 case 'DT':	502 case 'DT':

499 case 'LI':	503 case 'LI':

500 case 'FORM':	504 case 'FORM':

501 node.readability.contentScore -= 3;	505 node.readability.contentScore -= 3;

502 break;	506 break;

503	507

504 case 'H1':	508 case 'H1':

505 case 'H2':	509 case 'H2':

506 case 'H3':	510 case 'H3':

507 case 'H4':	511 case 'H4':

508 case 'H5':	512 case 'H5':

509 case 'H6':	513 case 'H6':

510 case 'TH':	514 case 'TH':

511 node.readability.contentScore -= 5;	515 node.readability.contentScore -= 5;

512 break;	516 break;

513 }	517 }

514	518

515 node.readability.contentScore += readability.getClassWeight(node);	519 node.readability.contentScore += readability.getClassWeight(node);

516 },	520 },

517	521

518 /***	522 /***

519 * grabArticle - Using a variety of metrics (content score, classname, eleme nt types), find the content that is	523 * grabArticle - Using a variety of metrics (content score, classname, eleme nt types), find the content that is

520 * most likely to be the stuff a user wants to read. Then retu rn it wrapped up in a div.	524 * most likely to be the stuff a user wants to read. Then retu rn it wrapped up in a div.

521 *	525 *

522 * @param page a document to run upon. Needs to be a full document, complete with body.	526 * @param page a document to run upon. Needs to be a full document, complete with body.

523 * @return Element	527 * @return Element

524 **/	528 **/

525 grabArticle: function (pageToClone) {	529 grabArticle: function (pageToClone) {

526 var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_ STRIP_UNLIKELYS),	530 var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_ STRIP_UNLIKELYS),

527 isPaging = (page !== null) ? true: false;	531 isPaging = (page !== null) ? true: false;

528	532

529 var page = null;	533 var page = null;

530 // Never work on the actual page.	534 // Never work on the actual page.

531 if (isPaging) {	535 if (isPaging) {

532 page = document.body.cloneNode(true);	536 page = document.body.cloneNode(true);

533 } else {	537 } else {

534 page = pageToClone.cloneNode(true);	538 page = pageToClone.cloneNode(true);

535 }	539 }

536	540

537 var allElements = page.getElementsByTagName('*');	541 var allElements = page.getElementsByTagName('*');

538	542

539 /**	543 /**

540 * First, node prepping. Trash nodes that look cruddy (like ones with th e class name "comment", etc), and turn divs	544 * First, node prepping. Trash nodes that look cruddy (like ones with th e class name "comment", etc), and turn divs

541 * into P tags where they have been used inappropriately (as in, where t hey contain no other block level elements.)	545 * into P tags where they have been used inappropriately (as in, where t hey contain no other block level elements.)

542 *	546 *

543 * Note: Assignment from index for performance. See http://www.peachpit. com/articles/article.aspx?p=31567&seqNum=5	547 * Note: Assignment from index for performance. See http://www.peachpit. com/articles/article.aspx?p=31567&seqNum=5

544 * TODO: Shouldn't this be a reverse traversal?	548 * TODO: Shouldn't this be a reverse traversal?

545 **/	549 **/

546 var node = null;	550 var node = null;

547 var nodesToScore = [];	551 var nodesToScore = [];

548 for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) {	552 for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) {

549 /* Remove unlikely candidates */	553 /* Remove unlikely candidates */

550 if (stripUnlikelyCandidates) {	554 if (stripUnlikelyCandidates) {

551 var unlikelyMatchString = node.className + node.id;	555 var unlikelyMatchString = node.className + node.id;

552 if (	556 if (

553 (	557 (

554 unlikelyMatchString.search(readability.regexps.unlikelyC andidates) !== -1 &&	558 unlikelyMatchString.search(readability.regexps.unlikelyC andidates) !== -1 &&

555 unlikelyMatchString.search(readability.regexps.okMaybeIt sACandidate) === -1 &&	559 unlikelyMatchString.search(readability.regexps.okMaybeIt sACandidate) === -1 &&

556 node.tagName !== "BODY"	560 node.tagName !== "BODY"

557 )	561 )

558 )	562 )

559 {	563 {

560 dbg("Removing unlikely candidate - " + unlikelyMatchString);	564 dbg("Removing unlikely candidate - " + unlikelyMatchString);

561 node.parentNode.removeChild(node);	565 node.parentNode.removeChild(node);

562 nodeIndex-=1;	566 nodeIndex-=1;

563 continue;	567 continue;

564 }	568 }

565 }	569 }

566	570

567 if (node.tagName === "P" \|\| node.tagName === "TD" \|\| node.tagName == = "PRE") {	571 if (node.tagName === "P" \|\| node.tagName === "TD" \|\| node.tagName == = "PRE") {

568 nodesToScore[nodesToScore.length] = node;	572 nodesToScore[nodesToScore.length] = node;

569 }	573 }

570	574

571 /* Turn all divs that don't have children block level elements into p's */	575 /* Turn all divs that don't have children block level elements into p's */

572 if (node.tagName === "DIV") {	576 if (node.tagName === "DIV") {

573 if (node.innerHTML.search(readability.regexps.divToPElements) == = -1) {	577 if (node.innerHTML.search(readability.regexps.divToPElements) == = -1) {

574 var newNode = document.createElement('p');	578 var newNode = document.createElement('p');

(...skipping 16 matching lines...) Expand all Loading...
591 if(childNode.nodeType === 3) { // Node.TEXT_NODE	595 if(childNode.nodeType === 3) { // Node.TEXT_NODE

592 var p = document.createElement('p');	596 var p = document.createElement('p');

593 var t = document.createTextNode(childNode.nodeValue) ;	597 var t = document.createTextNode(childNode.nodeValue) ;

594 p.appendChild(t);	598 p.appendChild(t);

595 p.style.display = 'inline';	599 p.style.display = 'inline';

596 p.className = 'readability-styled';	600 p.className = 'readability-styled';

597 childNode.parentNode.replaceChild(p, childNode);	601 childNode.parentNode.replaceChild(p, childNode);

598 }	602 }

599 }	603 }

600 }	604 }

601 }	605 }

602 }	606 }

603	607

604 /**	608 /**

605 * Loop through all paragraphs, and assign a score to them based on how content-y they look.	609 * Loop through all paragraphs, and assign a score to them based on how content-y they look.

606 * Then add their score to their parent node.	610 * Then add their score to their parent node.

607 *	611 *

608 * A score is determined by things like number of commas, class names, e tc. Maybe eventually link density.	612 * A score is determined by things like number of commas, class names, e tc. Maybe eventually link density.

609 **/	613 **/

610 var candidates = [];	614 var candidates = [];

611 for (var pt=0; pt < nodesToScore.length; pt+=1) {	615 for (var pt=0; pt < nodesToScore.length; pt+=1) {

(...skipping 21 matching lines...) Expand all Loading...
633 candidates.push(grandParentNode);	637 candidates.push(grandParentNode);

634 }	638 }

635	639

636 var contentScore = 0;	640 var contentScore = 0;

637	641

638 /* Add a point for the paragraph itself as a base. */	642 /* Add a point for the paragraph itself as a base. */

639 contentScore+=1;	643 contentScore+=1;

640	644

641 /* Add points for any commas within this paragraph */	645 /* Add points for any commas within this paragraph */

642 contentScore += innerText.split(',').length;	646 contentScore += innerText.split(',').length;

643	647

644 /* For every 100 characters in this paragraph, add another point. Up to 3 points. */	648 /* For every 100 characters in this paragraph, add another point. Up to 3 points. */

645 contentScore += Math.min(Math.floor(innerText.length / 100), 3);	649 contentScore += Math.min(Math.floor(innerText.length / 100), 3);

646	650

647 /* Add the score to the parent. The grandparent gets half. */	651 /* Add the score to the parent. The grandparent gets half. */

648 parentNode.readability.contentScore += contentScore;	652 parentNode.readability.contentScore += contentScore;

649	653

650 if(grandParentNode) {	654 if(grandParentNode) {

651 grandParentNode.readability.contentScore += contentScore/2;	655 grandParentNode.readability.contentScore += contentScore/2;

652 }	656 }

653 }	657 }

654	658

655 /**	659 /**

656 * After we've calculated scores, loop through all of the possible candi date nodes we found	660 * After we've calculated scores, loop through all of the possible candi date nodes we found

657 * and find the one with the highest score.	661 * and find the one with the highest score.

658 **/	662 **/

659 var topCandidate = null;	663 var topCandidate = null;

660 for(var c=0, cl=candidates.length; c < cl; c+=1)	664 for(var c=0, cl=candidates.length; c < cl; c+=1)

661 {	665 {

(...skipping 56 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
718 var contentBonus = 0;	722 var contentBonus = 0;

719 /* Give a bonus if sibling nodes and top candidates have the example same classname */	723 /* Give a bonus if sibling nodes and top candidates have the example same classname */

720 if(siblingNode.className === topCandidate.className && topCandidate. className !== "") {	724 if(siblingNode.className === topCandidate.className && topCandidate. className !== "") {

721 contentBonus += topCandidate.readability.contentScore * 0.2;	725 contentBonus += topCandidate.readability.contentScore * 0.2;

722 }	726 }

723	727

724 if(typeof siblingNode.readability !== 'undefined' && (siblingNode.re adability.contentScore+contentBonus) >= siblingScoreThreshold)	728 if(typeof siblingNode.readability !== 'undefined' && (siblingNode.re adability.contentScore+contentBonus) >= siblingScoreThreshold)

725 {	729 {

726 append = true;	730 append = true;

727 }	731 }

728	732

729 if(siblingNode.nodeName === "P") {	733 if(siblingNode.nodeName === "P") {

730 var linkDensity = readability.getLinkDensity(siblingNode);	734 var linkDensity = readability.getLinkDensity(siblingNode);

731 var nodeContent = readability.getInnerText(siblingNode);	735 var nodeContent = readability.getInnerText(siblingNode);

732 var nodeLength = nodeContent.length;	736 var nodeLength = nodeContent.length;

733	737

734 if(nodeLength > 80 && linkDensity < 0.25)	738 if(nodeLength > 80 && linkDensity < 0.25)

735 {	739 {

736 append = true;	740 append = true;

737 }	741 }

738 else if(nodeLength < 80 && linkDensity === 0 && nodeContent.sear ch(/\.( \|$)/) !== -1)	742 else if(nodeLength < 80 && linkDensity === 0 && nodeContent.sear ch(/\.( \|$)/) !== -1)

739 {	743 {

740 append = true;	744 append = true;

741 }	745 }

742 }	746 }

743	747

744 if(append) {	748 if(append) {

745 dbg("Appending node: " + siblingNode);	749 dbg("Appending node: " + siblingNode);

746	750

747 var nodeToAppend = null;	751 var nodeToAppend = null;

748 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P ") {	752 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P ") {

749 /* We have a node that isn't a common block level element, l ike a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */	753 /* We have a node that isn't a common block level element, l ike a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */

750	754

751 dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');	755 dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');

752 nodeToAppend = document.createElement("DIV");	756 nodeToAppend = document.createElement("DIV");

753 try {	757 try {

754 nodeToAppend.id = siblingNode.id;	758 nodeToAppend.id = siblingNode.id;

755 readability.moveNodeInnards(siblingNode, nodeToAppend);	759 readability.moveNodeInnards(siblingNode, nodeToAppend);

756 }	760 }

757 catch(er) {	761 catch(er) {

758 dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");	762 dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");

759 nodeToAppend = siblingNode;	763 nodeToAppend = siblingNode;

760 s-=1;	764 s-=1;

761 sl-=1;	765 sl-=1;

762 }	766 }

763 } else {	767 } else {

764 nodeToAppend = siblingNode;	768 nodeToAppend = siblingNode;

765 s-=1;	769 s-=1;

766 sl-=1;	770 sl-=1;

767 }	771 }

768	772

769 /* To ensure a node does not interfere with readability styles, remove its classnames */	773 /* To ensure a node does not interfere with readability styles, remove its classnames */

770 nodeToAppend.className = "";	774 nodeToAppend.className = "";

771	775

772 /* Append sibling and subtract from our list because it removes the node when you append to another node */	776 /* Append sibling and subtract from our list because it removes the node when you append to another node */

773 articleContent.appendChild(nodeToAppend);	777 articleContent.appendChild(nodeToAppend);

774 }	778 }

775 }	779 }

776	780

777 /**	781 /**

778 * So we have all of the content that we need. Now we clean it up for pr esentation.	782 * So we have all of the content that we need. Now we clean it up for pr esentation.

779 **/	783 **/

780 readability.distilledArticleContent = articleContent.cloneNode(true);	784 readability.distilledArticleContent = articleContent.cloneNode(true);

781 //readability.prepArticle(articleContent);	785 //readability.prepArticle(articleContent);

782	786

783 if (readability.curPageNum === 1) {	787 if (readability.curPageNum === 1) {

784 var newNode = document.createElement('div');	788 var newNode = document.createElement('div');

785 newNode.id = "readability-page-1";	789 newNode.id = "readability-page-1";

786 newNode.setAttribute("class", "page");	790 newNode.setAttribute("class", "page");

787 readability.moveNodeInnards(articleContent, newNode);	791 readability.moveNodeInnards(articleContent, newNode);

788 articleContent.appendChild(newNode);	792 articleContent.appendChild(newNode);

789 }	793 }

790	794

791 /**	795 /**

792 * Now that we've gone through the full algorithm, check to see if we go t any meaningful content.	796 * Now that we've gone through the full algorithm, check to see if we go t any meaningful content.

793 * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher	797 * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher

794 * likelihood of finding the content, and the sieve approach gives us a higher likelihood of	798 * likelihood of finding the content, and the sieve approach gives us a higher likelihood of

795 * finding the -right- content.	799 * finding the -right- content.

796 **/	800 **/

797 if(readability.getInnerText(articleContent, false).length < 250) {	801 if(readability.getInnerText(articleContent, false).length < 250) {

798 if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) {	802 if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) {

799 readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS);	803 readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS);

800 return readability.grabArticle(document.body);	804 return readability.grabArticle(document.body);

801 }	805 }

802 else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {	806 else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {

803 readability.removeFlag(readability.FLAG_WEIGHT_CLASSES);	807 readability.removeFlag(readability.FLAG_WEIGHT_CLASSES);

804 return readability.grabArticle(document.body);	808 return readability.grabArticle(document.body);

805 }	809 }

806 else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONAL LY)) {	810 else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONAL LY)) {

807 readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY);	811 readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY);

808 return readability.grabArticle(document.body);	812 return readability.grabArticle(document.body);

809 } else {	813 } else {

810 return null;	814 return null;

811 }	815 }

812 }	816 }

813	817

814 return articleContent;	818 return articleContent;

815 },	819 },

816	820

817 /**	821 /**

818 * Removes script tags from the document.	822 * Removes script tags from the document.

819 *	823 *

820 * @param Element	824 * @param Element

821 **/	825 **/

822 removeScripts: function (doc) {	826 removeScripts: function (doc) {

823 var scripts = doc.getElementsByTagName('script');	827 var scripts = doc.getElementsByTagName('script');

824 for(var i = scripts.length-1; i >= 0; i-=1)	828 for(var i = scripts.length-1; i >= 0; i-=1)

825 {	829 {

826 if(typeof(scripts[i].src) === "undefined" \|\| (scripts[i].src.indexOf ('readability') === -1 && scripts[i].src.indexOf('typekit') === -1))	830 if(typeof(scripts[i].src) === "undefined" \|\| (scripts[i].src.indexOf ('readability') === -1 && scripts[i].src.indexOf('typekit') === -1))

827 {	831 {

828 scripts[i].nodeValue="";	832 scripts[i].nodeValue="";

829 scripts[i].removeAttribute('src');	833 scripts[i].removeAttribute('src');

830 if (scripts[i].parentNode) {	834 if (scripts[i].parentNode) {

831 scripts[i].parentNode.removeChild(scripts[i]);	835 scripts[i].parentNode.removeChild(scripts[i]);

832 }	836 }

833 }	837 }

834 }	838 }

835 },	839 },

836	840

837 /**	841 /**

838 * Get the inner text of a node - cross browser compatibly.	842 * Get the inner text of a node - cross browser compatibly.

839 * This also strips out any excess whitespace to be found.	843 * This also strips out any excess whitespace to be found.

840 *	844 *

841 * @param Element	845 * @param Element

842 * @return string	846 * @return string

843 **/	847 **/

844 getInnerText: function (e, normalizeSpaces) {	848 getInnerText: function (e, normalizeSpaces) {

845 var textContent = "";	849 var textContent = "";

846	850

(...skipping 42 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
889	893

890 // Remove any root styles, if we're able.	894 // Remove any root styles, if we're able.

891 if(typeof e.removeAttribute === 'function' && e.className !== 'readabili ty-styled') {	895 if(typeof e.removeAttribute === 'function' && e.className !== 'readabili ty-styled') {

892 e.removeAttribute('style'); }	896 e.removeAttribute('style'); }

893	897

894 // Go until there are no more child nodes	898 // Go until there are no more child nodes

895 while ( cur !== null ) {	899 while ( cur !== null ) {

896 if ( cur.nodeType === 1 ) {	900 if ( cur.nodeType === 1 ) {

897 // Remove style attribute(s) :	901 // Remove style attribute(s) :

898 if(cur.className !== "readability-styled") {	902 if(cur.className !== "readability-styled") {

899 cur.removeAttribute("style");	903 cur.removeAttribute("style");

900 }	904 }

901 readability.cleanStyles( cur );	905 readability.cleanStyles( cur );

902 }	906 }

903 cur = cur.nextSibling;	907 cur = cur.nextSibling;

904 }	908 }

905 },	909 },

906	910

907 /**	911 /**

908 * Get the density of links as a percentage of the content	912 * Get the density of links as a percentage of the content

909 * This is the amount of text that is inside a link divided by the total text in the node.	913 * This is the amount of text that is inside a link divided by the total text in the node.

910 *	914 *

911 * @param Element	915 * @param Element

912 * @return number (float)	916 * @return number (float)

913 **/	917 **/

914 getLinkDensity: function (e) {	918 getLinkDensity: function (e) {

915 var links = e.getElementsByTagName("a");	919 var links = e.getElementsByTagName("a");

916 var textLength = readability.getInnerText(e).length;	920 var textLength = readability.getInnerText(e).length;

917 var linkLength = 0;	921 var linkLength = 0;

918 for(var i=0, il=links.length; i<il;i+=1)	922 for(var i=0, il=links.length; i<il;i+=1)

919 {	923 {

920 linkLength += readability.getInnerText(links[i]).length;	924 linkLength += readability.getInnerText(links[i]).length;

921 }	925 }

922	926

923 return linkLength / textLength;	927 return linkLength / textLength;

924 },	928 },

925	929

926 /**	930 /**

927 * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.	931 * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.

928 *	932 *

929 * @author Dan Lacy	933 * @author Dan Lacy

930 * @return string the base url	934 * @return string the base url

931 **/	935 **/

932 findBaseUrl: function () {	936 findBaseUrl: function () {

933 var noUrlParams = window.location.pathname.split("?")[0],	937 var noUrlParams = window.location.pathname.split("?")[0],

934 urlSlashes = noUrlParams.split("/").reverse(),	938 urlSlashes = noUrlParams.split("/").reverse(),

935 cleanedSegments = [],	939 cleanedSegments = [],

936 possibleType = "";	940 possibleType = "";

937	941

938 for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) {	942 for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) {

939 var segment = urlSlashes[i];	943 var segment = urlSlashes[i];

940	944

941 // Split off and save anything that looks like a file type.	945 // Split off and save anything that looks like a file type.

942 if (segment.indexOf(".") !== -1) {	946 if (segment.indexOf(".") !== -1) {

943 possibleType = segment.split(".")[1];	947 possibleType = segment.split(".")[1];

944	948

945 /* If the type isn't alpha-only, it's probably not actually a fi le extension. */	949 /* If the type isn't alpha-only, it's probably not actually a fi le extension. */

946 if(!possibleType.match(/[^a-zA-Z]/)) {	950 if(!possibleType.match(/[^a-zA-Z]/)) {

947 segment = segment.split(".")[0];	951 segment = segment.split(".")[0];

948 }	952 }

949 }	953 }

950	954

951 /**	955 /**

952 * EW-CMS specific segment replacement. Ugly.	956 * EW-CMS specific segment replacement. Ugly.

953 * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.htm l	957 * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.htm l

954 **/	958 **/

955 if(segment.indexOf(',00') !== -1) {	959 if(segment.indexOf(',00') !== -1) {

956 segment = segment.replace(',00', '');	960 segment = segment.replace(',00', '');

957 }	961 }

958	962

959 // If our first or second segment has anything looking like a page n umber, remove it.	963 // If our first or second segment has anything looking like a page n umber, remove it.

960 if (segment.match(/((_\|-)?p[a-z]*\|(_\|-))[0-9]{1,2}$/i) && ((i === 1) \|\| (i === 0))) {	964 if (segment.match(/((_\|-)?p[a-z]*\|(_\|-))[0-9]{1,2}$/i) && ((i === 1) \|\| (i === 0))) {

961 segment = segment.replace(/((_\|-)?p[a-z]*\|(_\|-))[0-9]{1,2}$/i, " ");	965 segment = segment.replace(/((_\|-)?p[a-z]*\|(_\|-))[0-9]{1,2}$/i, " ");

962 }	966 }

963	967

964	968

965 var del = false;	969 var del = false;

966	970

967 /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */	971 /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */

968 if (i < 2 && segment.match(/^\d{1,2}$/)) {	972 if (i < 2 && segment.match(/^\d{1,2}$/)) {

969 del = true;	973 del = true;

970 }	974 }

971	975

972 /* If this is the first segment and it's just "index", remove it. */	976 /* If this is the first segment and it's just "index", remove it. */

973 if(i === 0 && segment.toLowerCase() === "index") {	977 if(i === 0 && segment.toLowerCase() === "index") {

974 del = true;	978 del = true;

975 }	979 }

976	980

977	981

978 /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */	982 /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */

979 if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) {	983 if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) {

980 del = true;	984 del = true;

981 }	985 }

982	986

983 /* If it's not marked for deletion, push it to cleanedSegments. */	987 /* If it's not marked for deletion, push it to cleanedSegments. */

984 if (!del) {	988 if (!del) {

985 cleanedSegments.push(segment);	989 cleanedSegments.push(segment);

986 }	990 }

987 }	991 }

988	992

989 // This is our final, cleaned, base article URL.	993 // This is our final, cleaned, base article URL.

990 return window.location.protocol + "//" + window.location.host + cleanedS egments.reverse().join("/");	994 return window.location.protocol + "//" + window.location.host + cleanedS egments.reverse().join("/");

991 },	995 },

992	996

993 /**	997 /**

994 * Look for any paging links that may occur within the document.	998 * Look for any paging links that may occur within the document.

995 *	999 *

996 * @param body	1000 * @param body

997 * @return object (array)	1001 * @return object (array)

998 **/	1002 **/

999 findNextPageLink: function (elem) {	1003 findNextPageLink: function (elem) {

1000 var possiblePages = {},	1004 var possiblePages = {},

1001 allLinks = elem.getElementsByTagName('a'),	1005 allLinks = elem.getElementsByTagName('a'),

1002 articleBaseUrl = readability.findBaseUrl();	1006 articleBaseUrl = readability.findBaseUrl();

1003	1007

1004 /**	1008 /**

1005 * Loop through all links, looking for hints that they may be next-page links.	1009 * Loop through all links, looking for hints that they may be next-page links.

1006 * Things like having "page" in their textContent, className or id, or b eing a child	1010 * Things like having "page" in their textContent, className or id, or b eing a child

1007 * of a node with a page-y className or id.	1011 * of a node with a page-y className or id.

1008 *	1012 *

1009 * Also possible: levenshtein distance? longest common subsequence?	1013 * Also possible: levenshtein distance? longest common subsequence?

1010 *	1014 *

1011 * After we do that, assign each page a score, and	1015 * After we do that, assign each page a score, and

1012 **/	1016 **/

1013 for(var i = 0, il = allLinks.length; i < il; i+=1) {	1017 for(var i = 0, il = allLinks.length; i < il; i+=1) {

1014 var link = allLinks[i],	1018 var link = allLinks[i],

1015 linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, ' ');	1019 linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, ' ');

1016	1020

1017 /* If we've already seen this page, ignore it */	1021 /* If we've already seen this page, ignore it */

1018 if(linkHref === "" \|\| linkHref === articleBaseUrl \|\| linkHref === wi ndow.location.href \|\| linkHref in readability.parsedPages) {	1022 if(linkHref === "" \|\| linkHref === articleBaseUrl \|\| linkHref === wi ndow.location.href \|\| linkHref in readability.parsedPages) {

1019 continue;	1023 continue;

1020 }	1024 }

1021	1025

1022 /* If it's on a different domain, skip it. */	1026 /* If it's on a different domain, skip it. */

1023 if(window.location.host !== linkHref.split(/\/+/g)[1]) {	1027 if(window.location.host !== linkHref.split(/\/+/g)[1]) {

1024 continue;	1028 continue;

1025 }	1029 }

1026	1030

1027 var linkText = readability.getInnerText(link);	1031 var linkText = readability.getInnerText(link);

1028	1032

1029 /* If the linkText looks like it's not the next page, skip it. */	1033 /* If the linkText looks like it's not the next page, skip it. */

1030 if(linkText.match(readability.regexps.extraneous) \|\| linkText.length > 25) {	1034 if(linkText.match(readability.regexps.extraneous) \|\| linkText.length > 25) {

1031 continue;	1035 continue;

1032 }	1036 }

1033	1037

1034 /* If the leftovers of the URL after removing the base URL don't con tain any digits, it's certainly not a next page link. */	1038 /* If the leftovers of the URL after removing the base URL don't con tain any digits, it's certainly not a next page link. */

1035 var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');	1039 var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');

1036 if(!linkHrefLeftover.match(/\d/)) {	1040 if(!linkHrefLeftover.match(/\d/)) {

1037 continue;	1041 continue;

1038 }	1042 }

1039	1043

1040 if(!(linkHref in possiblePages)) {	1044 if(!(linkHref in possiblePages)) {

1041 possiblePages[linkHref] = {"score": 0, "linkText": linkText, "hr ef": linkHref};	1045 possiblePages[linkHref] = {"score": 0, "linkText": linkText, "hr ef": linkHref};

1042 } else {	1046 } else {

1043 possiblePages[linkHref].linkText += ' \| ' + linkText;	1047 possiblePages[linkHref].linkText += ' \| ' + linkText;

1044 }	1048 }

1045	1049

1046 var linkObj = possiblePages[linkHref];	1050 var linkObj = possiblePages[linkHref];

1047	1051

1048 /**	1052 /**

1049 * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower.	1053 * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower.

1050 * Example: http://www.actionscript.org/resources/articles/745/1/Jav aScript-and-VBScript-Injection-in-ActionScript-3/Page1.html	1054 * Example: http://www.actionscript.org/resources/articles/745/1/Jav aScript-and-VBScript-Injection-in-ActionScript-3/Page1.html

1051 **/	1055 **/

1052 if(linkHref.indexOf(articleBaseUrl) !== 0) {	1056 if(linkHref.indexOf(articleBaseUrl) !== 0) {

1053 linkObj.score -= 25;	1057 linkObj.score -= 25;

1054 }	1058 }

1055	1059

1056 var linkData = linkText + ' ' + link.className + ' ' + link.id;	1060 var linkData = linkText + ' ' + link.className + ' ' + link.id;

1057 if(linkData.match(readability.regexps.nextLink)) {	1061 if(linkData.match(readability.regexps.nextLink)) {

1058 linkObj.score += 50;	1062 linkObj.score += 50;

1059 }	1063 }

1060 if(linkData.match(/pag(e\|ing\|inat)/i)) {	1064 if(linkData.match(/pag(e\|ing\|inat)/i)) {

1061 linkObj.score += 25;	1065 linkObj.score += 25;

1062 }	1066 }

1063 if(linkData.match(/(first\|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text,	1067 if(linkData.match(/(first\|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text,

1064 /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */	1068 /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */

1065 if(!linkObj.linkText.match(readability.regexps.nextLink)) {	1069 if(!linkObj.linkText.match(readability.regexps.nextLink)) {

1066 linkObj.score -= 65;	1070 linkObj.score -= 65;

1067 }	1071 }

1068 }	1072 }

1069 if(linkData.match(readability.regexps.negative) \|\| linkData.match(re adability.regexps.extraneous)) {	1073 if(linkData.match(readability.regexps.negative) \|\| linkData.match(re adability.regexps.extraneous)) {

1070 linkObj.score -= 50;	1074 linkObj.score -= 50;

1071 }	1075 }

1072 if(linkData.match(readability.regexps.prevLink)) {	1076 if(linkData.match(readability.regexps.prevLink)) {

1073 linkObj.score -= 200;	1077 linkObj.score -= 200;

1074 }	1078 }

1075	1079

1076 /* If a parentNode contains page or paging or paginat */	1080 /* If a parentNode contains page or paging or paginat */

1077 var parentNode = link.parentNode,	1081 var parentNode = link.parentNode,

1078 positiveNodeMatch = false,	1082 positiveNodeMatch = false,

1079 negativeNodeMatch = false;	1083 negativeNodeMatch = false;

1080 while(parentNode) {	1084 while(parentNode) {

1081 var parentNodeClassAndId = parentNode.className + ' ' + parentNo de.id;	1085 var parentNodeClassAndId = parentNode.className + ' ' + parentNo de.id;

1082 if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(/pag(e\|ing\|inat)/i)) {	1086 if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(/pag(e\|ing\|inat)/i)) {

1083 positiveNodeMatch = true;	1087 positiveNodeMatch = true;

1084 linkObj.score += 25;	1088 linkObj.score += 25;

1085 }	1089 }

1086 if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(readability.regexps.negative)) {	1090 if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClass AndId.match(readability.regexps.negative)) {

1087 /* If this is just something like "footer", give it a negati ve. If it's something like "body-and-footer", leave it be. */	1091 /* If this is just something like "footer", give it a negati ve. If it's something like "body-and-footer", leave it be. */

1088 if(!parentNodeClassAndId.match(readability.regexps.positive) ) {	1092 if(!parentNodeClassAndId.match(readability.regexps.positive) ) {

1089 linkObj.score -= 25;	1093 linkObj.score -= 25;

1090 negativeNodeMatch = true;	1094 negativeNodeMatch = true;

1091 }	1095 }

1092 }	1096 }

1093	1097

1094 parentNode = parentNode.parentNode;	1098 parentNode = parentNode.parentNode;

1095 }	1099 }

1096	1100

1097 /**	1101 /**

1098 * If the URL looks like it has paging in it, add to the score.	1102 * If the URL looks like it has paging in it, add to the score.

1099 * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34	1103 * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34

1100 **/	1104 **/

1101 if (linkHref.match(/p(a\|g\|ag)?(e\|ing\|ination)?(=\|\/)[0-9]{1,2}/i) \|\| linkHref.match(/(page\|paging)/i)) {	1105 if (linkHref.match(/p(a\|g\|ag)?(e\|ing\|ination)?(=\|\/)[0-9]{1,2}/i) \|\| linkHref.match(/(page\|paging)/i)) {

1102 linkObj.score += 25;	1106 linkObj.score += 25;

1103 }	1107 }

(...skipping 41 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1145 topPage = possiblePages[page];	1149 topPage = possiblePages[page];

1146 }	1150 }

1147 }	1151 }

1148 }	1152 }

1149	1153

1150 if(topPage) {	1154 if(topPage) {

1151 var nextHref = topPage.href.replace(/\/$/,'');	1155 var nextHref = topPage.href.replace(/\/$/,'');

1152	1156

1153 dbg('NEXT PAGE IS ' + nextHref);	1157 dbg('NEXT PAGE IS ' + nextHref);

1154 readability.parsedPages[nextHref] = true;	1158 readability.parsedPages[nextHref] = true;

1155 return nextHref;	1159 return nextHref;

1156 }	1160 }

1157 else {	1161 else {

1158 return null;	1162 return null;

1159 }	1163 }

1160 },	1164 },

1161	1165

1162 createLinkDiv: function(link) {	1166 createLinkDiv: function(link) {

1163 var divNode = document.createElement('div');	1167 var divNode = document.createElement('div');

1164 var aNode = document.createElement('a');	1168 var aNode = document.createElement('a');

1165 var tNode = document.createTextNode('View Next Page');	1169 var tNode = document.createTextNode('View Next Page');

(...skipping 31 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1197 }	1201 }

1198 else {	1202 else {

1199 if (options.error) { options.error(request); }	1203 if (options.error) { options.error(request); }

1200 }	1204 }

1201 }	1205 }

1202 }	1206 }

1203	1207

1204 if (typeof options === 'undefined') { options = {}; }	1208 if (typeof options === 'undefined') { options = {}; }

1205	1209

1206 request.onreadystatechange = respondToReadyState;	1210 request.onreadystatechange = respondToReadyState;

1207	1211

1208 request.open('get', url, true);	1212 request.open('get', url, true);

1209 request.setRequestHeader('Accept', 'text/html');	1213 request.setRequestHeader('Accept', 'text/html');

1210	1214

1211 try {	1215 try {

1212 request.send(options.postBody);	1216 request.send(options.postBody);

1213 }	1217 }

1214 catch (e) {	1218 catch (e) {

1215 if (options.error) { options.error(); }	1219 if (options.error) { options.error(); }

1216 }	1220 }

1217	1221

(...skipping 14 matching lines...) Expand all Loading...
1232 articlePage.innerHTML = '<p class="page-separator" title="Page ' + reada bility.curPageNum + '">§</p>';	1236 articlePage.innerHTML = '<p class="page-separator" title="Page ' + reada bility.curPageNum + '">§</p>';

1233	1237

1234 document.getElementById("readability-content").appendChild(articlePage);	1238 document.getElementById("readability-content").appendChild(articlePage);

1235	1239

1236 if(readability.curPageNum > readability.maxPages) {	1240 if(readability.curPageNum > readability.maxPages) {

1237 var linkDiv = readability.createLinkDiv(nextPageLink);	1241 var linkDiv = readability.createLinkDiv(nextPageLink);

1238	1242

1239 articlePage.appendChild(linkDiv);	1243 articlePage.appendChild(linkDiv);

1240 return;	1244 return;

1241 }	1245 }

1242	1246

1243 /**	1247 /**

1244 * Now that we've built the article page DOM element, get the page conte nt	1248 * Now that we've built the article page DOM element, get the page conte nt

1245 * asynchronously and load the cleaned content into the div we created f or it.	1249 * asynchronously and load the cleaned content into the div we created f or it.

1246 **/	1250 **/

1247 (function(pageUrl, thisPage) {	1251 (function(pageUrl, thisPage) {

1248 readability.ajax(pageUrl, {	1252 readability.ajax(pageUrl, {

1249 success: function(r) {	1253 success: function(r) {

1250	1254

1251 /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */	1255 /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */

1252 var eTag = r.getResponseHeader('ETag');	1256 var eTag = r.getResponseHeader('ETag');

1253 if(eTag) {	1257 if(eTag) {

1254 if(eTag in readability.pageETags) {	1258 if(eTag in readability.pageETags) {

1255 dbg("Exact duplicate page found via ETag. Aborting." );	1259 dbg("Exact duplicate page found via ETag. Aborting." );

1256 articlePage.style.display = 'none';	1260 articlePage.style.display = 'none';

1257 return;	1261 return;

1258 } else {	1262 } else {

1259 readability.pageETags[eTag] = 1;	1263 readability.pageETags[eTag] = 1;

1260 }	1264 }

1261 }	1265 }

1262	1266

1263 // TODO: this ends up doubling up page numbers on NYTimes ar ticles. Need to generically parse those away.	1267 // TODO: this ends up doubling up page numbers on NYTimes ar ticles. Need to generically parse those away.

1264 var page = document.createElement("DIV");	1268 var page = document.createElement("DIV");

1265	1269

1266 /**	1270 /**

1267 * Do some preprocessing to our HTML to make it ready for ap pending.	1271 * Do some preprocessing to our HTML to make it ready for ap pending.

1268 * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript.	1272 * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript.

1269 * • Turn any noscript tags into divs so that we can parse t hem. This allows us to find any next page links hidden via javascript.	1273 * • Turn any noscript tags into divs so that we can parse t hem. This allows us to find any next page links hidden via javascript.

1270 * • Turn all double br's into p's - was handled by prepDocu ment in the original view.	1274 * • Turn all double br's into p's - was handled by prepDocu ment in the original view.

(...skipping 30 matching lines...) Expand all Loading...
1301 for(var i=1; i <= readability.curPageNum; i+=1) {	1305 for(var i=1; i <= readability.curPageNum; i+=1) {

1302 var rPage = document.getElementById('readability-pag e-' + i);	1306 var rPage = document.getElementById('readability-pag e-' + i);

1303 if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML ) !== -1) {	1307 if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML ) !== -1) {

1304 dbg('Duplicate of page ' + i + ' - skipping.');	1308 dbg('Duplicate of page ' + i + ' - skipping.');

1305 articlePage.style.display = 'none';	1309 articlePage.style.display = 'none';

1306 readability.parsedPages[pageUrl] = true;	1310 readability.parsedPages[pageUrl] = true;

1307 return;	1311 return;

1308 }	1312 }

1309 }	1313 }

1310 }	1314 }

1311	1315

1312 readability.removeScripts(content);	1316 readability.removeScripts(content);

1313	1317

1314 readability.moveNodeInnards(content, thisPage);	1318 readability.moveNodeInnards(content, thisPage);

1315	1319

1316 /**	1320 /**

1317 * After the page has rendered, post process the content. Th is delay is necessary because,	1321 * After the page has rendered, post process the content. Th is delay is necessary because,

1318 * in webkit at least, offsetWidth is not set in time to det ermine image width. We have to	1322 * in webkit at least, offsetWidth is not set in time to det ermine image width. We have to

1319 * wait a little bit for reflow to finish before we can fix floating images.	1323 * wait a little bit for reflow to finish before we can fix floating images.

1320 **/	1324 **/

1321 window.setTimeout(	1325 window.setTimeout(

1322 function() { readability.postProcessContent(thisPage); } ,	1326 function() { readability.postProcessContent(thisPage); } ,

1323 500	1327 500

1324 );	1328 );

1325	1329

1326 if(nextPageLink) {	1330 if(nextPageLink) {

1327 readability.appendNextPage(nextPageLink);	1331 readability.appendNextPage(nextPageLink);

1328 }	1332 }

1329 }	1333 }

1330 });	1334 });

1331 }(nextPageLink, articlePage));	1335 }(nextPageLink, articlePage));

1332 },	1336 },

1333	1337

1334 /**	1338 /**

1335 * Get an elements class/id weight. Uses regular expressions to tell if this	1339 * Get an elements class/id weight. Uses regular expressions to tell if this

1336 * element looks good or bad.	1340 * element looks good or bad.

1337 *	1341 *

1338 * @param Element	1342 * @param Element

1339 * @return number (Integer)	1343 * @return number (Integer)

1340 **/	1344 **/

1341 getClassWeight: function (e) {	1345 getClassWeight: function (e) {

1342 if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {	1346 if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {

1343 return 0;	1347 return 0;

1344 }	1348 }

1345	1349

(...skipping 29 matching lines...) Expand all Loading...
1375 /**	1379 /**

1376 * Remove extraneous break tags from a node.	1380 * Remove extraneous break tags from a node.

1377 *	1381 *

1378 * @param Element	1382 * @param Element

1379 * @return void	1383 * @return void

1380 **/	1384 **/

1381 killBreaks: function (e) {	1385 killBreaks: function (e) {

1382 var allElements = e.getElementsByTagName('*');	1386 var allElements = e.getElementsByTagName('*');

1383 while (i < allElements.length) {	1387 while (i < allElements.length) {

1384 readability.deleteExtraBreaks(allElements[i]);	1388 readability.deleteExtraBreaks(allElements[i]);

1385 i++;	1389 i++;

1386 }	1390 }

1387 },	1391 },

1388	1392

1389 /**	1393 /**

1390 * Clean a node of all elements of type "tag".	1394 * Clean a node of all elements of type "tag".

1391 * (Unless it's a youtube/vimeo video. People love movies.)	1395 * (Unless it's a youtube/vimeo video. People love movies.)

1392 *	1396 *

1393 * @param Element	1397 * @param Element

1394 * @param string tag to clean	1398 * @param string tag to clean

1395 * @return void	1399 * @return void

1396 **/	1400 **/

1397 clean: function (e, tag) {	1401 clean: function (e, tag) {

1398 var targetList = e.getElementsByTagName( tag );	1402 var targetList = e.getElementsByTagName( tag );

1399 var isEmbed = (tag === 'object' \|\| tag === 'embed');	1403 var isEmbed = (tag === 'object' \|\| tag === 'embed');

1400	1404

1401 for (var y=targetList.length-1; y >= 0; y-=1) {	1405 for (var y=targetList.length-1; y >= 0; y-=1) {

1402 /* Allow youtube and vimeo videos through as people usually want to see those. */	1406 /* Allow youtube and vimeo videos through as people usually want to see those. */

1403 if(isEmbed) {	1407 if(isEmbed) {

1404 var attributeValues = "";	1408 var attributeValues = "";

1405 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) {	1409 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) {

1406 attributeValues += targetList[y].attributes[i].value + '\|';	1410 attributeValues += targetList[y].attributes[i].value + '\|';

1407 }	1411 }

1408	1412

1409 /* First, check the elements attributes to see if any of them co ntain youtube or vimeo */	1413 /* First, check the elements attributes to see if any of them co ntain youtube or vimeo */

1410 if (attributeValues.search(readability.regexps.videos) !== -1) {	1414 if (attributeValues.search(readability.regexps.videos) !== -1) {

1411 continue;	1415 continue;

1412 }	1416 }

1413	1417

1414 /* Then check the elements inside this element for the same. */	1418 /* Then check the elements inside this element for the same. */

1415 if (targetList[y].innerHTML.search(readability.regexps.videos) ! == -1) {	1419 if (targetList[y].innerHTML.search(readability.regexps.videos) ! == -1) {

1416 continue;	1420 continue;

1417 }	1421 }

1418	1422

1419 }	1423 }

1420	1424

1421 targetList[y].parentNode.removeChild(targetList[y]);	1425 targetList[y].parentNode.removeChild(targetList[y]);

1422 }	1426 }

1423 },	1427 },

1424	1428

1425 /**	1429 /**

1426 * Clean an element of all tags of type "tag" if they look fishy.	1430 * Clean an element of all tags of type "tag" if they look fishy.

1427 * "Fishy" is an algorithm based on content length, classnames, link density , number of images & embeds, etc.	1431 * "Fishy" is an algorithm based on content length, classnames, link density , number of images & embeds, etc.

1428 *	1432 *

1429 * @return void	1433 * @return void

1430 **/	1434 **/

1431 cleanConditionally: function (e, tag) {	1435 cleanConditionally: function (e, tag) {

1432	1436

1433 if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {	1437 if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {

1434 return;	1438 return;

1435 }	1439 }

1436	1440

1437 var tagsList = e.getElementsByTagName(tag);	1441 var tagsList = e.getElementsByTagName(tag);

1438 var curTagsLength = tagsList.length;	1442 var curTagsLength = tagsList.length;

1439	1443

1440 /**	1444 /**

1441 * Gather counts for other typical elements embedded within.	1445 * Gather counts for other typical elements embedded within.

1442 * Traverse backwards so we can remove nodes at the same time without ef fecting the traversal.	1446 * Traverse backwards so we can remove nodes at the same time without ef fecting the traversal.

1443 *	1447 *

1444 * TODO: Consider taking into account original contentScore here.	1448 * TODO: Consider taking into account original contentScore here.

1445 **/	1449 **/

1446 for (var i=curTagsLength-1; i >= 0; i-=1) {	1450 for (var i=curTagsLength-1; i >= 0; i-=1) {

1447 var weight = readability.getClassWeight(tagsList[i]);	1451 var weight = readability.getClassWeight(tagsList[i]);

1448 var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0;	1452 var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0;

1449	1453

1450 dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].cla ssName + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'unde fined') ? (" with score " + tagsList[i].readability.contentScore) : ''));	1454 dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].cla ssName + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'unde fined') ? (" with score " + tagsList[i].readability.contentScore) : ''));

1451	1455

1452 if(weight+contentScore < 0)	1456 if(weight+contentScore < 0)

1453 {	1457 {

1454 tagsList[i].parentNode.removeChild(tagsList[i]);	1458 tagsList[i].parentNode.removeChild(tagsList[i]);

1455 }	1459 }

1456 else if ( readability.getCharCount(tagsList[i],',') < 10) {	1460 else if ( readability.getCharCount(tagsList[i],',') < 10) {

1457 /**	1461 /**

1458 * If there are not very many commas, and the number of	1462 * If there are not very many commas, and the number of

1459 * non-paragraph elements is more than paragraphs or other omino us signs, remove the element.	1463 * non-paragraph elements is more than paragraphs or other omino us signs, remove the element.

1460 **/	1464 **/

1461 var p = tagsList[i].getElementsByTagName("p").length;	1465 var p = tagsList[i].getElementsByTagName("p").length;

1462 var img = tagsList[i].getElementsByTagName("img").length;	1466 var img = tagsList[i].getElementsByTagName("img").length;

1463 var li = tagsList[i].getElementsByTagName("li").length-100;	1467 var li = tagsList[i].getElementsByTagName("li").length-100;

1464 var input = tagsList[i].getElementsByTagName("input").length;	1468 var input = tagsList[i].getElementsByTagName("input").length;

1465	1469

1466 var embedCount = 0;	1470 var embedCount = 0;

1467 var embeds = tagsList[i].getElementsByTagName("embed");	1471 var embeds = tagsList[i].getElementsByTagName("embed");

1468 for(var ei=0,il=embeds.length; ei < il; ei+=1) {	1472 for(var ei=0,il=embeds.length; ei < il; ei+=1) {

1469 if (embeds[ei].src.search(readability.regexps.videos) === -1 ) {	1473 if (embeds[ei].src.search(readability.regexps.videos) === -1 ) {

1470 embedCount+=1;	1474 embedCount+=1;

1471 }	1475 }

1472 }	1476 }

1473	1477

1474 var linkDensity = readability.getLinkDensity(tagsList[i]);	1478 var linkDensity = readability.getLinkDensity(tagsList[i]);

1475 var contentLength = readability.getInnerText(tagsList[i]).length ;	1479 var contentLength = readability.getInnerText(tagsList[i]).length ;

1476 var toRemove = false;	1480 var toRemove = false;

1477	1481

1478 if ( img > p ) {	1482 if ( img > p ) {

1479 toRemove = true;	1483 toRemove = true;

1480 } else if(li > p && tag !== "ul" && tag !== "ol") {	1484 } else if(li > p && tag !== "ul" && tag !== "ol") {

1481 toRemove = true;	1485 toRemove = true;

1482 } else if( input > Math.floor(p/3) ) {	1486 } else if( input > Math.floor(p/3) ) {

1483 toRemove = true;	1487 toRemove = true;

1484 } else if(contentLength < 25 && (img === 0 \|\| img > 2) ) {	1488 } else if(contentLength < 25 && (img === 0 \|\| img > 2) ) {

1485 toRemove = true;	1489 toRemove = true;

1486 } else if(weight < 25 && linkDensity > 0.2) {	1490 } else if(weight < 25 && linkDensity > 0.2) {

1487 toRemove = true;	1491 toRemove = true;

1488 } else if(weight >= 25 && linkDensity > 0.5) {	1492 } else if(weight >= 25 && linkDensity > 0.5) {

1489 toRemove = true;	1493 toRemove = true;

1490 } else if((embedCount === 1 && contentLength < 75) \|\| embedCount > 1) {	1494 } else if((embedCount === 1 && contentLength < 75) \|\| embedCount > 1) {

1491 toRemove = true;	1495 toRemove = true;

1492 }	1496 }

1493	1497

(...skipping 21 matching lines...) Expand all Loading...
1515 }	1519 }

1516 },	1520 },

1517	1521

1518 flagIsActive: function(flag) {	1522 flagIsActive: function(flag) {

1519 return (readability.flags & flag) > 0;	1523 return (readability.flags & flag) > 0;

1520 },	1524 },

1521	1525

1522 addFlag: function(flag) {	1526 addFlag: function(flag) {

1523 readability.flags = readability.flags \| flag;	1527 readability.flags = readability.flags \| flag;

1524 },	1528 },

1525	1529

1526 removeFlag: function(flag) {	1530 removeFlag: function(flag) {

1527 readability.flags = readability.flags & ~flag;	1531 readability.flags = readability.flags & ~flag;

1528 },	1532 },

1529	1533

1530 // Removes the children of \|src\| and appends them to \|dest\|.	1534 // Removes the children of \|src\| and appends them to \|dest\|.

1531 moveNodeInnards: function(src, dest) {	1535 moveNodeInnards: function(src, dest) {

1532 try {	1536 try {

1533 while (src.firstChild) {	1537 while (src.firstChild) {

1534 dest.appendChild(src.removeChild(src.firstChild));	1538 dest.appendChild(src.removeChild(src.firstChild));

1535 }	1539 }

(...skipping 48 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1584 var lastBr = readability.isMultipleBr(node, false);	1588 var lastBr = readability.isMultipleBr(node, false);

1585 var ret = false;	1589 var ret = false;

1586 while (lastBr && lastBr != node) {	1590 while (lastBr && lastBr != node) {

1587 var toRemove = lastBr;	1591 var toRemove = lastBr;

1588 lastBr = lastBr.previousSibling;	1592 lastBr = lastBr.previousSibling;

1589 toRemove.parentNode.removeChild(toRemove);	1593 toRemove.parentNode.removeChild(toRemove);

1590 ret = true;	1594 ret = true;

1591 }	1595 }

1592 return ret;	1596 return ret;

1593 },	1597 },

1594	1598

1595 // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a	1599 // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a

1596 // <P> node, and makes all next siblings of that pair children of <P>, up	1600 // <P> node, and makes all next siblings of that pair children of <P>, up

1597 // until the next pair of <BR> nodes is reached.	1601 // until the next pair of <BR> nodes is reached.

1598 replaceDoubleBrWithP: function(node) {	1602 replaceDoubleBrWithP: function(node) {

1599 // Check that we are starting with a BR.	1603 // Check that we are starting with a BR.

1600 var second = readability.isMultipleBr(node, true);	1604 var second = readability.isMultipleBr(node, true);

1601 if (!second) {	1605 if (!second) {

1602 return;	1606 return;

1603 }	1607 }

1604 // Make all next siblings of the second BR into children of a P.	1608 // Make all next siblings of the second BR into children of a P.

1605 var p = document.createElement('p');	1609 var p = document.createElement('p');

1606 var curr = second.nextSibling;	1610 var curr = second.nextSibling;

1607 while (curr) {	1611 while (curr) {

1608 if (readability.isMultipleBr(curr, true)) {	1612 if (readability.isMultipleBr(curr, true)) {

1609 break;	1613 break;

1610 }	1614 }

1611 var next = curr.nextSibling;	1615 var next = curr.nextSibling;

1612 p.appendChild(curr.parentNode.removeChild(curr));	1616 p.appendChild(curr.parentNode.removeChild(curr));

1613 curr = next;	1617 curr = next;

1614 }	1618 }

1615 var ret = curr;	1619 var ret = curr;

1616	1620

1617 // Remove all nodes between the first and second BR.	1621 // Remove all nodes between the first and second BR.

1618 curr = node.nextSibling;	1622 curr = node.nextSibling;

1619 while (curr && curr != second) {	1623 while (curr && curr != second) {

1620 var next = curr.nextSibling;	1624 var next = curr.nextSibling;

1621 curr.parentNode.removeChild(curr);	1625 curr.parentNode.removeChild(curr);

1622 curr = next;	1626 curr = next;

1623 }	1627 }

1624 // Remove the second BR.	1628 // Remove the second BR.

1625 second.parentNode.removeChild(second);	1629 second.parentNode.removeChild(second);

1626 // Replace the first BR with the P.	1630 // Replace the first BR with the P.

1627 node.parentNode.replaceChild(p, node);	1631 node.parentNode.replaceChild(p, node);

1628	1632

1629 return ret;	1633 return ret;

1630 },	1634 },

1631	1635

1632 // Returns true if the NodeList contains a double <BR>.	1636 // Returns true if the NodeList contains a double <BR>.

1633 hasDoubleBr: function(nodeList) {	1637 hasDoubleBr: function(nodeList) {

1634 for (var i = 0; i < nodeList.length; nodeList++) {	1638 for (var i = 0; i < nodeList.length; nodeList++) {

1635 if (readability.isMultipleBr(nodeList[i], true)) {	1639 if (readability.isMultipleBr(nodeList[i], true)) {

1636 return true;	1640 return true;

1637 }	1641 }

1638 }	1642 }

1639 return false;	1643 return false;

1640 },	1644 },

1641	1645

1642 // Replaces double <BR> tags with <P> tags.	1646 // Replaces double <BR> tags with <P> tags.

1643 replaceDoubleBrsWithPs: function(node) {	1647 replaceDoubleBrsWithPs: function(node) {

1644 var allElements = node.getElementsByTagName('BR');	1648 var allElements = node.getElementsByTagName('BR');

1645 var node = null;	1649 var node = null;

1646 while (allElements && allElements.length > 0 &&	1650 while (allElements && allElements.length > 0 &&

1647 readability.hasDoubleBr(allElements)) {	1651 readability.hasDoubleBr(allElements)) {

1648 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) {	1652 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) {

1649 var next = node;	1653 var next = node;

1650 while (next = readability.replaceDoubleBrWithP(next));	1654 while (next = readability.replaceDoubleBrWithP(next));

1651 }	1655 }

1652 allElements = document.body.getElementsByTagName('BR');	1656 allElements = document.body.getElementsByTagName('BR');

1653 }	1657 }

1654 },	1658 },

1655	1659

1656	1660

1657 // Replaces a BR and the whitespace that follows it with a P.	1661 // Replaces a BR and the whitespace that follows it with a P.

1658 replaceBrWithP: function(node) {	1662 replaceBrWithP: function(node) {

1659 if (!readability.isBrNode(node)) {	1663 if (!readability.isBrNode(node)) {

1660 return;	1664 return;

1661 }	1665 }

1662 var p = document.createElement('p');	1666 var p = document.createElement('p');

1663 var curr = node.nextSibling;	1667 var curr = node.nextSibling;

1664 while (curr && !isBrNode(curr)) {	1668 while (curr && !isBrNode(curr)) {

1665 var next = curr.nextSibling;	1669 var next = curr.nextSibling;

1666 if (readability.isWhitespaceNode(curr)) {	1670 if (readability.isWhitespaceNode(curr)) {

1667 curr.parentNode.removeChild(curr);	1671 curr.parentNode.removeChild(curr);

1668 } else {	1672 } else {

1669 p.appendChild(curr.parentNode.removeChild(curr));	1673 p.appendChild(curr.parentNode.removeChild(curr));

1670 }	1674 }

1671 curr = next;	1675 curr = next;

1672 }	1676 }

1673 node.parentNode.replaceChild(p, node);	1677 node.parentNode.replaceChild(p, node);

1674 return curr;	1678 return curr;

1675 },	1679 },

1676	1680

1677 // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> t ag	1681 // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> t ag

1678 // children of the <P>.	1682 // children of the <P>.

1679 replaceBrsWithPs: function(node) {	1683 replaceBrsWithPs: function(node) {

1680 var allElements = node.getElementsByTagName('BR');	1684 var allElements = node.getElementsByTagName('BR');

1681 var node = null;	1685 var node = null;

1682 while (allElements && allElements.length > 0) {	1686 while (allElements && allElements.length > 0) {

1683 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) {	1687 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) {

1684 var next = node;	1688 var next = node;

1685 while (next = readability.replaceBrWithP(next));	1689 while (next = readability.replaceBrWithP(next));

1686 }	1690 }

1687 allElements = document.body.getElementsByTagName('BR');	1691 allElements = document.body.getElementsByTagName('BR');

1688 }	1692 }

1689 },	1693 },

1690	1694

1691 // Replaces any tag with any other tag.	1695 // Replaces any tag with any other tag.

1692 replaceTagsWithTags: function(node, srcTag, destTag) {	1696 replaceTagsWithTags: function(node, srcTag, destTag) {

1693 var allElements = node.getElementsByTagName(srcTag);	1697 var allElements = node.getElementsByTagName(srcTag);

1694 for (var i = 0; i < allElements.length; i++) {	1698 for (var i = 0; i < allElements.length; i++) {

1695 var dest = document.createElement(destTag);	1699 var dest = document.createElement(destTag);

1696 readability.moveNodeInnards(allElements[i], dest);	1700 readability.moveNodeInnards(allElements[i], dest);

1697 node.replaceNode(dest, allElements[i]);	1701 allElements[i].parentNode.replaceChild(dest, allElements[i]);

1698 }	1702 }

1699 },	1703 },

1700	1704

1701 // Replaces all <noscript> tags with <p> tags.	1705 // Replaces all <noscript> tags with <p> tags.

1702 replaceNoscriptsWithPs: function(node) {	1706 replaceNoscriptsWithPs: function(node) {

1703 readability.replaceTagsWithTags(node, 'noscript', 'p');	1707 readability.replaceTagsWithTags(node, 'noscript', 'p');

1704 },	1708 },

1705	1709

1706 // Replaces all <font> tags with <span> tags.	1710 // Replaces all <font> tags with <span> tags.

1707 replaceFontsWithSpans: function(node) {	1711 replaceFontsWithSpans: function(node) {

1708 readability.replaceTagsWithTags(node, 'font', 'span');	1712 readability.replaceTagsWithTags(node, 'font', 'span');

1709 },	1713 },

1710	1714

1711 // Returns a list of image URLs in the distilled article.	1715 // Returns a list of image URLs in the distilled article.

1712 getImages : function() {	1716 getImages : function() {

1713 var images = document.getElementsByTagName('img');	1717 var images = document.getElementsByTagName('img');

1714 var result = new Array(images.length);	1718 var result = new Array(images.length);

1715 dbg("Number of images: " + images.length);	1719 dbg("Number of images: " + images.length);

1716 for(i = 0; i < images.length; i++) {	1720 for(i = 0; i < images.length; i++) {

1717 result[i] = images[i].src;	1721 result[i] = images[i].src;

1718 dbg("Image: " + result[i]);	1722 dbg("Image: " + result[i]);

1719 }	1723 }

1720 return result;	1724 return result;

1721 },	1725 },

1722	1726

1723 // Returns the distilled article HTML from the page(s).	1727 // Returns the distilled article HTML from the page(s).

1724 getDistilledArticleHTML : function() {	1728 getDistilledArticleHTML : function() {

1725 return readability.distilledHTML;	1729 return readability.distilledHTML;

	1730 },

	1731

	1732 // Returns the next page of this article.

	1733 getNextPageLink : function() {

	1734 return readability.nextPageLink;

1726 }	1735 }

1727 };	1736 };

1728	1737

// Extracts long-form content from a page and returns an array laid out as:
//   [0]   the article title,
//   [1]   HTML containing the long-form content,
//   [2]   the next page link (readability.getNextPageLink()),
//   [3..] URLs for the images returned by readability.getImages().
// NOTE(review): this comment previously said each <img> id is k - 2 for the
// URL at index k. With the next page link now occupying index 2 the image
// URLs start at index 3, so that offset needs to be confirmed where the ids
// are assigned.
(function () {
    readability.init();
    var result = [
        readability.getArticleTitle(),
        readability.getDistilledArticleHTML(),
        readability.getNextPageLink()
    ];
    return result.concat(readability.getImages());
}())

1741	1751

OLD	NEW

« components/dom_distiller/core/distiller.cc ('K') | « components/dom_distiller/core/task_tracker_unittest.cc ('k') | no next file » | no next file with comments »

Powered by Google App Engine

This is Rietveld 408576698