third_party/readability/js/readability.js - Issue 146843010: Add support for multipage distillation. - Code Review

Chromium Code Reviews

chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out

(83)

My Issues | Starred Open | Closed | All

Side by Side Diff: third_party/readability/js/readability.js

Issue 146843010: Add support for multipage distillation. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Address Chris' comments. Created 6 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« components/dom_distiller/core/page_distiller.h ('K') | « components/dom_distiller/core/task_tracker_unittest.cc ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1	1

2 var dbg = (typeof console !== 'undefined') ? function(s) {	2 var dbg = (typeof console !== 'undefined') ? function(s) {

3 console.log("Readability: " + s);	3 console.log("Readability: " + s);

4 } : function() {};	4 } : function() {};

5	5

6 /*	6 /*

7 * Readability. An Arc90 Lab Experiment.	7 * Readability. An Arc90 Lab Experiment.

8 * Website: http://lab.arc90.com/experiments/readability	8 * Website: http://lab.arc90.com/experiments/readability

9 * Source: http://code.google.com/p/arc90labs-readability	9 * Source: http://code.google.com/p/arc90labs-readability

10 *	10 *

11 * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission.	11 * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission.

12 *	12 *

13 * Copyright (c) 2010 Arc90 Inc	13 * Copyright (c) 2010 Arc90 Inc

14 * Readability is licensed under the Apache License, Version 2.0.	14 * Readability is licensed under the Apache License, Version 2.0.

15 **/	15 **/

16 var readability = {	16 var readability = {

17 readStyle: "style-newspaper",	17 readStyle: "style-newspaper",

18 readSize: "size-medium",	18 readSize: "size-medium",

19 readMargin: "margin-wide",	19 readMargin: "margin-wide",

20	20

21 distilledHTML: '',	21 distilledHTML: '',

22 distilledArticleContent: null,	22 distilledArticleContent: null,

	23 nextPageLink: '',

23	24

24 version: '1.7.1',	25 version: '1.7.1',

25 iframeLoads: 0,	26 iframeLoads: 0,

26 convertLinksToFootnotes: false,	27 convertLinksToFootnotes: false,

27 reversePageScroll: false, /* If they hold shift and hit space, scroll up */	28 reversePageScroll: false, /* If they hold shift and hit space, scroll up */

28 frameHack: false, /**	29 frameHack: false, /**

29 * The frame hack is to workaround a firefox bug where if you	30 * The frame hack is to workaround a firefox bug where if you

30 * pull content out of a frame and stick it into the parent element, the scrollbar won't appear.	31 * pull content out of a frame and stick it into the parent element, the scrollbar won't appear.

31 * So we fake a scrollbar in the wrapping div.	32 * So we fake a scrollbar in the wrapping div.

32 **/	33 **/

33 biggestFrame: false,	34 biggestFrame: false,

34 flags: 0x1 \| 0x2 \| 0x4, /* Start with all flags set. */	35 flags: 0x1 \| 0x2 \| 0x4, /* Start with all flags set. */

35	36

36 /* constants */	37 /* constants */

37 FLAG_STRIP_UNLIKELYS: 0x1,	38 FLAG_STRIP_UNLIKELYS: 0x1,

38 FLAG_WEIGHT_CLASSES: 0x2,	39 FLAG_WEIGHT_CLASSES: 0x2,

39 FLAG_CLEAN_CONDITIONALLY: 0x4,	40 FLAG_CLEAN_CONDITIONALLY: 0x4,

40	41

41 maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */	42 maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */

42 parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */	43 parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */

43 pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */	44 pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */

44	45

45 /**	46 /**

46 * All of the regular expressions in use within readability.	47 * All of the regular expressions in use within readability.

47 * Defined up here so we don't instantiate them repeatedly in loops.	48 * Defined up here so we don't instantiate them repeatedly in loops.

48 **/	49 **/

49 regexps: {	50 regexps: {

50 unlikelyCandidates: /combx\|comment\|community\|disqus\|extra\|foot\|header\|menu\|remark\|rss\|shoutbox\|sidebar\|sponsor\|ad-break\|agegate\|pagination\|pager\|popup\|tweet\|twitter/i,	51 unlikelyCandidates: /combx\|comment\|community\|disqus\|extra\|foot\|header\|menu\|remark\|rss\|shoutbox\|sidebar\|sponsor\|ad-break\|agegate\|pagination\|pager\|popup\|tweet\|twitter/i,

51 okMaybeItsACandidate: /and\|article\|body\|column\|main\|shadow/i,	52 okMaybeItsACandidate: /and\|article\|body\|column\|main\|shadow/i,

52 positive: /article\|body\|content\|entry\|hentry\|main\|page\|pagination\|post\|text\|blog\|story/i,	53 positive: /article\|body\|content\|entry\|hentry\|main\|page\|pagination\|post\|text\|blog\|story/i,

53 negative: /combx\|comment\|com-\|contact\|foot\|footer\|footnote\|masthead\|media\|meta\|outbrain\|promo\|related\|scroll\|shoutbox\|sidebar\|sponsor\|shopping\|tags\|tool\|widget/i,	54 negative: /combx\|comment\|com-\|contact\|foot\|footer\|footnote\|masthead\|media\|meta\|outbrain\|promo\|related\|scroll\|shoutbox\|sidebar\|sponsor\|shopping\|tags\|tool\|widget/i,

54 extraneous: /print\|archive\|comment\|discuss\|e[\-]?mail\|share\|reply\|all\|login\|sign\|single/i,	55 extraneous: /print\|archive\|comment\|discuss\|e[\-]?mail\|share\|reply\|all\|login\|sign\|single/i,

55 divToPElements: /<(a\|blockquote\|dl\|div\|img\|ol\|p\|pre\|table\|ul)/i,	56 divToPElements: /<(a\|blockquote\|dl\|div\|img\|ol\|p\|pre\|table\|ul)/i,

56 replaceBrs: /(<br[^>]*>[ \n\r\t]*){2,}/gi,	57 replaceBrs: /(<br[^>]*>[ \n\r\t]*){2,}/gi,

57 replaceFonts: /<(\/?)font[^>]*>/gi,	58 replaceFonts: /<(\/?)font[^>]*>/gi,

58 trim: /^\s+\|\s+$/g,	59 trim: /^\s+\|\s+$/g,

59 normalize: /\s{2,}/g,	60 normalize: /\s{2,}/g,

60 killBreaks: /(<br\s*\/?>(\s\|&nbsp;?)*){1,}/g,	61 killBreaks: /(<br\s*\/?>(\s\|&nbsp;?)*){1,}/g,

61 videos: /http:\/\/(www\.)?(youtube\|vimeo)\.com/i,	62 videos: /http:\/\/(www\.)?(youtube\|vimeo)\.com/i,

62 skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?\|^\|edit\|citation needed)\s*$/i,	63 skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?\|^\|edit\|citation needed)\s*$/i,

63 nextLink: /(next\|weiter\|continue\|>([^\\|]\|$)\|»([^\\|]\|$))/i, // Match: next, continue, >, >>, » but not >\|, »\| as those usually mean last.	64 nextLink: /(next\|weiter\|continue\|>([^\\|]\|$)\|»([^\\|]\|$))/i, // Match: next, continue, >, >>, » but not >\|, »\| as those usually mean last.

64 prevLink: /(prev\|earl\|old\|new\|<\|«)/i	65 prevLink: /(prev\|earl\|old\|new\|<\|«)/i

65 },	66 },

66	67

67 /**	68 /**

68 * Runs readability.	69 * Runs readability.

69 *	70 *

70 * Workflow:	71 * Workflow:

71 * 1. Prep the document by removing script tags, css, etc.	72 * 1. Prep the document by removing script tags, css, etc.

72 * 2. Build readability's DOM tree.	73 * 2. Build readability's DOM tree.

73 * 3. Grab the article content from the current dom tree.	74 * 3. Grab the article content from the current dom tree.

74 * 4. Replace the current DOM tree with the new one.	75 * 4. Replace the current DOM tree with the new one.

75 * 5. Read peacefully.	76 * 5. Read peacefully.

76 *	77 *

77 * @return void	78 * @return void

78 **/	79 **/

79 init: function() {	80 init: function() {

80 /* Before we do anything, remove all scripts that are not readability. */	81 /* Before we do anything, remove all scripts that are not readability. */

81 window.onload = window.onunload = function() {};	82 window.onload = window.onunload = function() {};

82	83

83 readability.removeScripts(document);	84 readability.removeScripts(document);

84	85

85 /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */	86 /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */

86 readability.parsedPages[window.location.href.replace(/\/$/, '')] = true;	87 readability.parsedPages[window.location.href.replace(/\/$/, '')] = true;

87	88

88 /* Pull out any possible next page link first */	89 /* Pull out any possible next page link first */

89 var nextPageLink = readability.findNextPageLink(document.body);	90 readability.nextPageLink = readability.findNextPageLink(document.body);

90	91

	92 /* We handle processing of nextPage from C++ set nextPageLink to null */

	93 var nextPageLink = null;

	94

91 readability.prepDocument();	95 readability.prepDocument();

92	96

93 /* Build readability's DOM tree */	97 /* Build readability's DOM tree */

94 var overlay = document.createElement("DIV");	98 var overlay = document.createElement("DIV");

95 var innerDiv = document.createElement("DIV");	99 var innerDiv = document.createElement("DIV");

96 var articleTools = readability.getArticleTools();	100 var articleTools = readability.getArticleTools();

97 var articleTitleText = readability.getArticleTitle();	101 var articleTitleText = readability.getArticleTitle();

98 var articleContent = readability.grabArticle();	102 var articleContent = readability.grabArticle();

99	103

100 if(!articleContent) {	104 if(!articleContent) {

(...skipping 44 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
145 rootWarning.innerHTML = "<em>Readability</em> was intended for use on individual articles and not home pages. " +	149 rootWarning.innerHTML = "<em>Readability</em> was intended for use on individual articles and not home pages. " +

146 "If you'd like to try rendering this page anyway, <a onClick='javascript:document.getElementById(\"readability-warning\").style.display=\"none\";document.getElementById(\"readability-content\").style.display=\"block\";'>click here</a> to continue.";	150 "If you'd like to try rendering this page anyway, <a onClick='javascript:document.getElementById(\"readability-warning\").style.display=\"none\";document.getElementById(\"readability-content\").style.display=\"block\";'>click here</a> to continue.";

147	151

148 innerDiv.insertBefore( rootWarning, articleContent );	152 innerDiv.insertBefore( rootWarning, articleContent );

149 }	153 }

150	154

151 readability.postProcessContent(articleContent);	155 readability.postProcessContent(articleContent);

152	156

153 window.scrollTo(0, 0);	157 window.scrollTo(0, 0);

154	158

155 // TODO(bengr): Remove this assignment of null to nextPageLink when

156 // the processing of the next page link is safe.

157 nextPageLink = null;

158

159 if (nextPageLink) {	159 if (nextPageLink) {

160 /**	160 /**

161 * Append any additional pages after a small timeout so that people	161 * Append any additional pages after a small timeout so that people

162 * can start reading without having to wait for this to finish proce ssing.	162 * can start reading without having to wait for this to finish proce ssing.

163 **/	163 **/

164 window.setTimeout(function() {	164 window.setTimeout(function() {

165 readability.appendNextPage(nextPageLink);	165 readability.appendNextPage(nextPageLink);

166 }, 500);	166 }, 500);

167 }	167 }

168	168

169 /* Smooth scrolling */	169 /* Smooth scrolling */

170 document.onkeydown = function(e) {	170 document.onkeydown = function(e) {

171 var code = (window.event) ? event.keyCode : e.keyCode;	171 var code = (window.event) ? event.keyCode : e.keyCode;

172 if (code === 16) {	172 if (code === 16) {

173 readability.reversePageScroll = true;	173 readability.reversePageScroll = true;

174 return;	174 return;

175 }	175 }

176	176

177 if (code === 32) {	177 if (code === 32) {

178 readability.curScrollStep = 0;	178 readability.curScrollStep = 0;

179 var windowHeight = window.innerHeight ? window.innerHeight : (do cument.documentElement.clientHeight ? document.documentElement.clientHeight : do cument.body.clientHeight);	179 var windowHeight = window.innerHeight ? window.innerHeight : (do cument.documentElement.clientHeight ? document.documentElement.clientHeight : do cument.body.clientHeight);

180	180

181 if(readability.reversePageScroll) {	181 if(readability.reversePageScroll) {

182 readability.scrollTo(readability.scrollTop(), readability.sc rollTop() - (windowHeight - 50), 20, 10);	182 readability.scrollTo(readability.scrollTop(), readability.sc rollTop() - (windowHeight - 50), 20, 10);

183 }	183 }

184 else {	184 else {

185 readability.scrollTo(readability.scrollTop(), readability.sc rollTop() + (windowHeight - 50), 20, 10);	185 readability.scrollTo(readability.scrollTop(), readability.sc rollTop() + (windowHeight - 50), 20, 10);

186 }	186 }

187	187

188 return false;	188 return false;

189 }	189 }

190 };	190 };

191	191

192 document.onkeyup = function(e) {	192 document.onkeyup = function(e) {

193 var code = (window.event) ? event.keyCode : e.keyCode;	193 var code = (window.event) ? event.keyCode : e.keyCode;

194 if (code === 16) {	194 if (code === 16) {

195 readability.reversePageScroll = false;	195 readability.reversePageScroll = false;

196 return;	196 return;

197 }	197 }

198 };	198 };

199 },	199 },

200	200

201 /**	201 /**

202 * Run any post-process modifications to article content as necessary.	202 * Run any post-process modifications to article content as necessary.

203 *	203 *

204 * @param Element	204 * @param Element

205 * @return void	205 * @return void

206 **/	206 **/

207 postProcessContent: function(articleContent) {	207 postProcessContent: function(articleContent) {

208 if(readability.convertLinksToFootnotes && !window.location.href.match(/w ikipedia\.org/g)) {	208 if(readability.convertLinksToFootnotes && !window.location.href.match(/w ikipedia\.org/g)) {

209 readability.addFootnotes(articleContent);	209 readability.addFootnotes(articleContent);

210 }	210 }

211	211

212 readability.fixImageFloats(articleContent);	212 readability.fixImageFloats(articleContent);

213 },	213 },

214	214

215 /**	215 /**

216 * Some content ends up looking ugly if the image is too large to be floated .	216 * Some content ends up looking ugly if the image is too large to be floated .

217 * If the image is wider than a threshold (currently 55%), no longer float i t,	217 * If the image is wider than a threshold (currently 55%), no longer float i t,

218 * center it instead.	218 * center it instead.

219 *	219 *

220 * @param Element	220 * @param Element

221 * @return void	221 * @return void

222 **/	222 **/

223 fixImageFloats: function (articleContent) {	223 fixImageFloats: function (articleContent) {

224 var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0. 55,	224 var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0. 55,

225 images = articleContent.getElementsByTagName('img');	225 images = articleContent.getElementsByTagName('img');

226	226

227 for(var i=0, il = images.length; i < il; i+=1) {	227 for(var i=0, il = images.length; i < il; i+=1) {

228 var image = images[i];	228 var image = images[i];

229	229

230 if(image.offsetWidth > imageWidthThreshold) {	230 if(image.offsetWidth > imageWidthThreshold) {

231 image.className += " blockImage";	231 image.className += " blockImage";

232 }	232 }

233 }	233 }

234 },	234 },

235	235

236 /**	236 /**

237 * Get the article tools Element that has buttons like reload, print.	237 * Get the article tools Element that has buttons like reload, print.

238 *	238 *

239 * @return void	239 * @return void

240 **/	240 **/

241 getArticleTools: function () {	241 getArticleTools: function () {

242 var articleTools = document.createElement("DIV");	242 var articleTools = document.createElement("DIV");

243	243

244 articleTools.id = "readTools";	244 articleTools.id = "readTools";

245 articleTools.innerHTML =	245 articleTools.innerHTML =

246 "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" +	246 "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" +

247 "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" +	247 "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" +

248 "<a href='#' onclick='readability.emailBox(); return false;' title=' Email page' id='email-page'>Email Page</a>";	248 "<a href='#' onclick='readability.emailBox(); return false;' title=' Email page' id='email-page'>Email Page</a>";

249	249

250 return articleTools;	250 return articleTools;

251 },	251 },

252	252

253 /**	253 /**

254 * retuns the suggested direction of the string	254 * retuns the suggested direction of the string

255 *	255 *

256 * @return "rtl" \|\| "ltr"	256 * @return "rtl" \|\| "ltr"

257 **/	257 **/

258 getSuggestedDirection: function(text) {	258 getSuggestedDirection: function(text) {

259 function sanitizeText() {	259 function sanitizeText() {

260 return text.replace(/@\w+/, "");	260 return text.replace(/@\w+/, "");

261 }	261 }

262	262

263 function countMatches(match) {	263 function countMatches(match) {

264 var matches = text.match(new RegExp(match, "g"));	264 var matches = text.match(new RegExp(match, "g"));

265 return matches !== null ? matches.length : 0;	265 return matches !== null ? matches.length : 0;

266 }	266 }

267	267

268 function isRTL() {	268 function isRTL() {

269 var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]");	269 var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]");

270 var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]");	270 var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]");

271	271

272 // if 20% of chars are Hebrew or Arbic then direction is rtl	272 // if 20% of chars are Hebrew or Arbic then direction is rtl

273 return (count_heb + count_arb) * 100 / text.length > 20;	273 return (count_heb + count_arb) * 100 / text.length > 20;

274 }	274 }

275	275

276 text = sanitizeText(text);	276 text = sanitizeText(text);

277 return isRTL() ? "rtl" : "ltr";	277 return isRTL() ? "rtl" : "ltr";

278 },	278 },

279	279

280 /**	280 /**

281 * Get the article title as an H1.	281 * Get the article title as an H1.

282 *	282 *

283 * @return void	283 * @return void

284 **/	284 **/

285 getArticleTitle: function () {	285 getArticleTitle: function () {

286 var curTitle = "",	286 var curTitle = "",

287 origTitle = "";	287 origTitle = "";

288	288

289 try {	289 try {

290 curTitle = origTitle = document.title;	290 curTitle = origTitle = document.title;

291 if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */	291 if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */

292 curTitle = origTitle = readability.getInnerText(document.getElem entsByTagName('title')[0]);	292 curTitle = origTitle = readability.getInnerText(document.getElem entsByTagName('title')[0]);

293 }	293 }

294 }	294 }

295 catch(e) {}	295 catch(e) {}

296	296

297 if(curTitle.match(/ [\\|\-] /))	297 if(curTitle.match(/ [\\|\-] /))

298 {	298 {

299 curTitle = origTitle.replace(/(.*)[\\|\-] .*/gi,'$1');	299 curTitle = origTitle.replace(/(.*)[\\|\-] .*/gi,'$1');

300	300

301 if(curTitle.split(' ').length < 3) {	301 if(curTitle.split(' ').length < 3) {

302 curTitle = origTitle.replace(/[^\\|\-]*[\\|\-](.*)/gi,'$1');	302 curTitle = origTitle.replace(/[^\\|\-]*[\\|\-](.*)/gi,'$1');

303 }	303 }

304 }	304 }

305 else if(curTitle.indexOf(': ') !== -1)	305 else if(curTitle.indexOf(': ') !== -1)

306 {	306 {

307 curTitle = origTitle.replace(/.*:(.*)/gi, '$1');	307 curTitle = origTitle.replace(/.*:(.*)/gi, '$1');

308	308

309 if(curTitle.split(' ').length < 3) {	309 if(curTitle.split(' ').length < 3) {

310 curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1');	310 curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1');

(...skipping 12 matching lines...) Expand all Loading...
323	323

324 if(curTitle.split(' ').length <= 4) {	324 if(curTitle.split(' ').length <= 4) {

325 curTitle = origTitle;	325 curTitle = origTitle;

326 }	326 }

327 return curTitle;	327 return curTitle;

328 },	328 },

329	329

330 /**	330 /**

331 * Prepare the HTML document for readability to scrape it.	331 * Prepare the HTML document for readability to scrape it.

332 * This includes things like stripping javascript, CSS, and handling terribl e markup.	332 * This includes things like stripping javascript, CSS, and handling terribl e markup.

333 *	333 *

334 * @return void	334 * @return void

335 **/	335 **/

336 prepDocument: function () {	336 prepDocument: function () {

337 /**	337 /**

338 * In some cases a body element can't be found (if the HTML is totally h osed for example)	338 * In some cases a body element can't be found (if the HTML is totally h osed for example)

339 * so we create a new body node and append it to the document.	339 * so we create a new body node and append it to the document.

340 */	340 */

341 if(document.body === null)	341 if(document.body === null)

342 {	342 {

343 var body = document.createElement("body");	343 var body = document.createElement("body");

344 try {	344 try {

345 document.body = body;	345 document.body = body;

346 }	346 }

347 catch(e) {	347 catch(e) {

348 document.documentElement.appendChild(body);	348 document.documentElement.appendChild(body);

349 dbg(e);	349 dbg(e);

350 }	350 }

351 }	351 }

352	352

353 document.body.id = "readabilityBody";	353 document.body.id = "readabilityBody";

354	354

355 var frames = document.getElementsByTagName('frame');	355 var frames = document.getElementsByTagName('frame');

(...skipping 11 matching lines...) Expand all Loading...
367 canAccessFrame = true;	367 canAccessFrame = true;

368 }	368 }

369 catch(eFrames) {	369 catch(eFrames) {

370 dbg(eFrames);	370 dbg(eFrames);

371 }	371 }

372	372

373 if(frameSize > biggestFrameSize) {	373 if(frameSize > biggestFrameSize) {

374 biggestFrameSize = frameSize;	374 biggestFrameSize = frameSize;

375 readability.biggestFrame = frames[frameIndex];	375 readability.biggestFrame = frames[frameIndex];

376 }	376 }

377	377

378 if(canAccessFrame && frameSize > bestFrameSize)	378 if(canAccessFrame && frameSize > bestFrameSize)

379 {	379 {

380 readability.frameHack = true;	380 readability.frameHack = true;

381	381

382 bestFrame = frames[frameIndex];	382 bestFrame = frames[frameIndex];

383 bestFrameSize = frameSize;	383 bestFrameSize = frameSize;

384 }	384 }

385 }	385 }

386	386

387 if(bestFrame)	387 if(bestFrame)

388 {	388 {

389 var newBody = document.createElement('body');	389 var newBody = document.createElement('body');

390 readability.moveNodeInnards(bestFrame.contentWindow.document.bod y, newBody);	390 readability.moveNodeInnards(bestFrame.contentWindow.document.bod y, newBody);

391 newBody.style.overflow = 'scroll';	391 newBody.style.overflow = 'scroll';

392 document.body = newBody;	392 document.body = newBody;

393	393

394 var frameset = document.getElementsByTagName('frameset')[0];	394 var frameset = document.getElementsByTagName('frameset')[0];

395 if(frameset) {	395 if(frameset) {

396 frameset.parentNode.removeChild(frameset); }	396 frameset.parentNode.removeChild(frameset); }

397 }	397 }

398 }	398 }

399	399

400 /* Remove all stylesheets */	400 /* Remove all stylesheets */

401 for (var k=0;k < document.styleSheets.length; k+=1) {	401 for (var k=0;k < document.styleSheets.length; k+=1) {

402 if (document.styleSheets[k].href !== null && document.styleSheets[k] .href.lastIndexOf("readability") === -1) {	402 if (document.styleSheets[k].href !== null && document.styleSheets[k] .href.lastIndexOf("readability") === -1) {

403 document.styleSheets[k].disabled = true;	403 document.styleSheets[k].disabled = true;

(...skipping 44 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
448 readability.cleanConditionally(articleContent, "table");	448 readability.cleanConditionally(articleContent, "table");

449 readability.cleanConditionally(articleContent, "ul");	449 readability.cleanConditionally(articleContent, "ul");

450 readability.cleanConditionally(articleContent, "div");	450 readability.cleanConditionally(articleContent, "div");

451	451

452 /* Remove extra paragraphs */	452 /* Remove extra paragraphs */

453 var articleParagraphs = articleContent.getElementsByTagName('p');	453 var articleParagraphs = articleContent.getElementsByTagName('p');

454 for(var i = articleParagraphs.length-1; i >= 0; i-=1) {	454 for(var i = articleParagraphs.length-1; i >= 0; i-=1) {

455 var imgCount = articleParagraphs[i].getElementsByTagName('img').l ength;	455 var imgCount = articleParagraphs[i].getElementsByTagName('img').l ength;

456 var embedCount = articleParagraphs[i].getElementsByTagName('embed') .length;	456 var embedCount = articleParagraphs[i].getElementsByTagName('embed') .length;

457 var objectCount = articleParagraphs[i].getElementsByTagName('object' ).length;	457 var objectCount = articleParagraphs[i].getElementsByTagName('object' ).length;

458	458

459 if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readab ility.getInnerText(articleParagraphs[i], false) === '') {	459 if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readab ility.getInnerText(articleParagraphs[i], false) === '') {

460 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i] );	460 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i] );

461 }	461 }

462 }	462 }

463	463

464 try {	464 try {

465 readability.replaceBrsWithPs(articleContent);	465 readability.replaceBrsWithPs(articleContent);

466 }	466 }

467 catch (e) {	467 catch (e) {

468 dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block -elements bug. Ignoring.: " + e);	468 dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block -elements bug. Ignoring.: " + e);

469 }	469 }

470 },	470 },

471	471

472 /**	472 /**

473 * Initialize a node with the readability object. Also checks the	473 * Initialize a node with the readability object. Also checks the

474 * className/id for special names to add to its score.	474 * className/id for special names to add to its score.

475 *	475 *

476 * @param Element	476 * @param Element

477 * @return void	477 * @return void

478 **/	478 **/

479 initializeNode: function (node) {	479 initializeNode: function (node) {

480 node.readability = {"contentScore": 0};	480 node.readability = {"contentScore": 0};

481	481

482 switch(node.tagName) {	482 switch(node.tagName) {

483 case 'DIV':	483 case 'DIV':

484 node.readability.contentScore += 5;	484 node.readability.contentScore += 5;

485 break;	485 break;

486	486

487 case 'PRE':	487 case 'PRE':

488 case 'TD':	488 case 'TD':

489 case 'BLOCKQUOTE':	489 case 'BLOCKQUOTE':

490 node.readability.contentScore += 3;	490 node.readability.contentScore += 3;

491 break;	491 break;

492	492

493 case 'ADDRESS':	493 case 'ADDRESS':

494 case 'OL':	494 case 'OL':

495 case 'UL':	495 case 'UL':

496 case 'DL':	496 case 'DL':

497 case 'DD':	497 case 'DD':

498 case 'DT':	498 case 'DT':

499 case 'LI':	499 case 'LI':

500 case 'FORM':	500 case 'FORM':

501 node.readability.contentScore -= 3;	501 node.readability.contentScore -= 3;

502 break;	502 break;

503	503

504 case 'H1':	504 case 'H1':

505 case 'H2':	505 case 'H2':

506 case 'H3':	506 case 'H3':

507 case 'H4':	507 case 'H4':

508 case 'H5':	508 case 'H5':

509 case 'H6':	509 case 'H6':

510 case 'TH':	510 case 'TH':

511 node.readability.contentScore -= 5;	511 node.readability.contentScore -= 5;

512 break;	512 break;

513 }	513 }

514	514

515 node.readability.contentScore += readability.getClassWeight(node);	515 node.readability.contentScore += readability.getClassWeight(node);

516 },	516 },

517	517

518 /***	518 /***

519 * grabArticle - Using a variety of metrics (content score, classname, eleme nt types), find the content that is	519 * grabArticle - Using a variety of metrics (content score, classname, eleme nt types), find the content that is

520 * most likely to be the stuff a user wants to read. Then retu rn it wrapped up in a div.	520 * most likely to be the stuff a user wants to read. Then retu rn it wrapped up in a div.

521 *	521 *

522 * @param page a document to run upon. Needs to be a full document, complete with body.	522 * @param page a document to run upon. Needs to be a full document, complete with body.

523 * @return Element	523 * @return Element

524 **/	524 **/

525 grabArticle: function (pageToClone) {	525 grabArticle: function (pageToClone) {

526 var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_ STRIP_UNLIKELYS),	526 var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_ STRIP_UNLIKELYS),

527 isPaging = (page !== null) ? true: false;	527 isPaging = (page !== null) ? true: false;

528	528

529 var page = null;	529 var page = null;

530 // Never work on the actual page.	530 // Never work on the actual page.

531 if (isPaging) {	531 if (isPaging) {

532 page = document.body.cloneNode(true);	532 page = document.body.cloneNode(true);

533 } else {	533 } else {

534 page = pageToClone.cloneNode(true);	534 page = pageToClone.cloneNode(true);

535 }	535 }

536	536

537 var allElements = page.getElementsByTagName('*');	537 var allElements = page.getElementsByTagName('*');

538	538

539 /**	539 /**

540 * First, node prepping. Trash nodes that look cruddy (like ones with th e class name "comment", etc), and turn divs	540 * First, node prepping. Trash nodes that look cruddy (like ones with th e class name "comment", etc), and turn divs

541 * into P tags where they have been used inappropriately (as in, where t hey contain no other block level elements.)	541 * into P tags where they have been used inappropriately (as in, where t hey contain no other block level elements.)

542 *	542 *

543 * Note: Assignment from index for performance. See http://www.peachpit. com/articles/article.aspx?p=31567&seqNum=5	543 * Note: Assignment from index for performance. See http://www.peachpit. com/articles/article.aspx?p=31567&seqNum=5

544 * TODO: Shouldn't this be a reverse traversal?	544 * TODO: Shouldn't this be a reverse traversal?

545 **/	545 **/

546 var node = null;	546 var node = null;

547 var nodesToScore = [];	547 var nodesToScore = [];

548 for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) {	548 for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) {

549 /* Remove unlikely candidates */	549 /* Remove unlikely candidates */

550 if (stripUnlikelyCandidates) {	550 if (stripUnlikelyCandidates) {

551 var unlikelyMatchString = node.className + node.id;	551 var unlikelyMatchString = node.className + node.id;

552 if (	552 if (

553 (	553 (

554 unlikelyMatchString.search(readability.regexps.unlikelyC andidates) !== -1 &&	554 unlikelyMatchString.search(readability.regexps.unlikelyC andidates) !== -1 &&

555 unlikelyMatchString.search(readability.regexps.okMaybeIt sACandidate) === -1 &&	555 unlikelyMatchString.search(readability.regexps.okMaybeIt sACandidate) === -1 &&

556 node.tagName !== "BODY"	556 node.tagName !== "BODY"

557 )	557 )

558 )	558 )

559 {	559 {

560 dbg("Removing unlikely candidate - " + unlikelyMatchString);	560 dbg("Removing unlikely candidate - " + unlikelyMatchString);

561 node.parentNode.removeChild(node);	561 node.parentNode.removeChild(node);

562 nodeIndex-=1;	562 nodeIndex-=1;

563 continue;	563 continue;

564 }	564 }

565 }	565 }

566	566

567 if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE") {	567 if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE") {

568 nodesToScore[nodesToScore.length] = node;	568 nodesToScore[nodesToScore.length] = node;

569 }	569 }

570	570

571 /* Turn all divs that don't have children block level elements into p's */	571 /* Turn all divs that don't have children block level elements into p's */

572 if (node.tagName === "DIV") {	572 if (node.tagName === "DIV") {

573 if (node.innerHTML.search(readability.regexps.divToPElements) === -1) {	573 if (node.innerHTML.search(readability.regexps.divToPElements) === -1) {

574 var newNode = document.createElement('p');	574 var newNode = document.createElement('p');

(...skipping 16 matching lines...) Expand all Loading...
591 if(childNode.nodeType === 3) { // Node.TEXT_NODE	591 if(childNode.nodeType === 3) { // Node.TEXT_NODE

592 var p = document.createElement('p');	592 var p = document.createElement('p');

593 var t = document.createTextNode(childNode.nodeValue);	593 var t = document.createTextNode(childNode.nodeValue);

594 p.appendChild(t);	594 p.appendChild(t);

595 p.style.display = 'inline';	595 p.style.display = 'inline';

596 p.className = 'readability-styled';	596 p.className = 'readability-styled';

597 childNode.parentNode.replaceChild(p, childNode);	597 childNode.parentNode.replaceChild(p, childNode);

598 }	598 }

599 }	599 }

600 }	600 }

601 }	601 }

602 }	602 }

603	603

604 /**	604 /**

605 * Loop through all paragraphs, and assign a score to them based on how content-y they look.	605 * Loop through all paragraphs, and assign a score to them based on how content-y they look.

606 * Then add their score to their parent node.	606 * Then add their score to their parent node.

607 *	607 *

608 * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.	608 * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.

609 **/	609 **/

610 var candidates = [];	610 var candidates = [];

611 for (var pt=0; pt < nodesToScore.length; pt+=1) {	611 for (var pt=0; pt < nodesToScore.length; pt+=1) {

(...skipping 21 matching lines...) Expand all Loading...
633 candidates.push(grandParentNode);	633 candidates.push(grandParentNode);

634 }	634 }

635	635

636 var contentScore = 0;	636 var contentScore = 0;

637	637

638 /* Add a point for the paragraph itself as a base. */	638 /* Add a point for the paragraph itself as a base. */

639 contentScore+=1;	639 contentScore+=1;

640	640

641 /* Add points for any commas within this paragraph */	641 /* Add points for any commas within this paragraph */

642 contentScore += innerText.split(',').length;	642 contentScore += innerText.split(',').length;

643	643

644 /* For every 100 characters in this paragraph, add another point. Up to 3 points. */	644 /* For every 100 characters in this paragraph, add another point. Up to 3 points. */

645 contentScore += Math.min(Math.floor(innerText.length / 100), 3);	645 contentScore += Math.min(Math.floor(innerText.length / 100), 3);

646	646

647 /* Add the score to the parent. The grandparent gets half. */	647 /* Add the score to the parent. The grandparent gets half. */

648 parentNode.readability.contentScore += contentScore;	648 parentNode.readability.contentScore += contentScore;

649	649

650 if(grandParentNode) {	650 if(grandParentNode) {

651 grandParentNode.readability.contentScore += contentScore/2;	651 grandParentNode.readability.contentScore += contentScore/2;

652 }	652 }

653 }	653 }

654	654

655 /**	655 /**

656 * After we've calculated scores, loop through all of the possible candidate nodes we found	656 * After we've calculated scores, loop through all of the possible candidate nodes we found

657 * and find the one with the highest score.	657 * and find the one with the highest score.

658 **/	658 **/

659 var topCandidate = null;	659 var topCandidate = null;

660 for(var c=0, cl=candidates.length; c < cl; c+=1)	660 for(var c=0, cl=candidates.length; c < cl; c+=1)

661 {	661 {

(...skipping 56 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
718 var contentBonus = 0;	718 var contentBonus = 0;

719 /* Give a bonus if sibling nodes and top candidates have the example same classname */	719 /* Give a bonus if sibling nodes and top candidates have the example same classname */

720 if(siblingNode.className === topCandidate.className && topCandidate.className !== "") {	720 if(siblingNode.className === topCandidate.className && topCandidate.className !== "") {

721 contentBonus += topCandidate.readability.contentScore * 0.2;	721 contentBonus += topCandidate.readability.contentScore * 0.2;

722 }	722 }

723	723

724 if(typeof siblingNode.readability !== 'undefined' && (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold)	724 if(typeof siblingNode.readability !== 'undefined' && (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold)

725 {	725 {

726 append = true;	726 append = true;

727 }	727 }

728	728

729 if(siblingNode.nodeName === "P") {	729 if(siblingNode.nodeName === "P") {

730 var linkDensity = readability.getLinkDensity(siblingNode);	730 var linkDensity = readability.getLinkDensity(siblingNode);

731 var nodeContent = readability.getInnerText(siblingNode);	731 var nodeContent = readability.getInnerText(siblingNode);

732 var nodeLength = nodeContent.length;	732 var nodeLength = nodeContent.length;

733	733

734 if(nodeLength > 80 && linkDensity < 0.25)	734 if(nodeLength > 80 && linkDensity < 0.25)

735 {	735 {

736 append = true;	736 append = true;

737 }	737 }

738 else if(nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1)	738 else if(nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1)

739 {	739 {

740 append = true;	740 append = true;

741 }	741 }

742 }	742 }

743	743

744 if(append) {	744 if(append) {

745 dbg("Appending node: " + siblingNode);	745 dbg("Appending node: " + siblingNode);

746	746

747 var nodeToAppend = null;	747 var nodeToAppend = null;

748 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") {	748 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") {

749 /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */	749 /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */

750	750

751 dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');	751 dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');

752 nodeToAppend = document.createElement("DIV");	752 nodeToAppend = document.createElement("DIV");

753 try {	753 try {

754 nodeToAppend.id = siblingNode.id;	754 nodeToAppend.id = siblingNode.id;

755 readability.moveNodeInnards(siblingNode, nodeToAppend);	755 readability.moveNodeInnards(siblingNode, nodeToAppend);

756 }	756 }

757 catch(er) {	757 catch(er) {

758 dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");	758 dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");

759 nodeToAppend = siblingNode;	759 nodeToAppend = siblingNode;

760 s-=1;	760 s-=1;

761 sl-=1;	761 sl-=1;

762 }	762 }

763 } else {	763 } else {

764 nodeToAppend = siblingNode;	764 nodeToAppend = siblingNode;

765 s-=1;	765 s-=1;

766 sl-=1;	766 sl-=1;

767 }	767 }

768	768

769 /* To ensure a node does not interfere with readability styles, remove its classnames */	769 /* To ensure a node does not interfere with readability styles, remove its classnames */

770 nodeToAppend.className = "";	770 nodeToAppend.className = "";

771	771

772 /* Append sibling and subtract from our list because it removes the node when you append to another node */	772 /* Append sibling and subtract from our list because it removes the node when you append to another node */

773 articleContent.appendChild(nodeToAppend);	773 articleContent.appendChild(nodeToAppend);

774 }	774 }

775 }	775 }

776	776

777 /**	777 /**

778 * So we have all of the content that we need. Now we clean it up for presentation.	778 * So we have all of the content that we need. Now we clean it up for presentation.

779 **/	779 **/

780 readability.distilledArticleContent = articleContent.cloneNode(true);	780 readability.distilledArticleContent = articleContent.cloneNode(true);

781 //readability.prepArticle(articleContent);	781 //readability.prepArticle(articleContent);

782	782

783 if (readability.curPageNum === 1) {	783 if (readability.curPageNum === 1) {

784 var newNode = document.createElement('div');	784 var newNode = document.createElement('div');

785 newNode.id = "readability-page-1";	785 newNode.id = "readability-page-1";

786 newNode.setAttribute("class", "page");	786 newNode.setAttribute("class", "page");

787 readability.moveNodeInnards(articleContent, newNode);	787 readability.moveNodeInnards(articleContent, newNode);

788 articleContent.appendChild(newNode);	788 articleContent.appendChild(newNode);

789 }	789 }

790	790

791 /**	791 /**

792 * Now that we've gone through the full algorithm, check to see if we got any meaningful content.	792 * Now that we've gone through the full algorithm, check to see if we got any meaningful content.

793 * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher	793 * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher

794 * likelihood of finding the content, and the sieve approach gives us a higher likelihood of	794 * likelihood of finding the content, and the sieve approach gives us a higher likelihood of

795 * finding the -right- content.	795 * finding the -right- content.

796 **/	796 **/

797 if(readability.getInnerText(articleContent, false).length < 250) {	797 if(readability.getInnerText(articleContent, false).length < 250) {

798 if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) {	798 if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) {

799 readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS);	799 readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS);

800 return readability.grabArticle(document.body);	800 return readability.grabArticle(document.body);

801 }	801 }

802 else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {	802 else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {

803 readability.removeFlag(readability.FLAG_WEIGHT_CLASSES);	803 readability.removeFlag(readability.FLAG_WEIGHT_CLASSES);

804 return readability.grabArticle(document.body);	804 return readability.grabArticle(document.body);

805 }	805 }

806 else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {	806 else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {

807 readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY);	807 readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY);

808 return readability.grabArticle(document.body);	808 return readability.grabArticle(document.body);

809 } else {	809 } else {

810 return null;	810 return null;

811 }	811 }

812 }	812 }

813	813

814 return articleContent;	814 return articleContent;

815 },	815 },

816	816

817 /**	817 /**

818 * Removes script tags from the document.	818 * Removes script tags from the document.

819 *	819 *

820 * @param Element	820 * @param Element

821 **/	821 **/

822 removeScripts: function (doc) {	822 removeScripts: function (doc) {

823 var scripts = doc.getElementsByTagName('script');	823 var scripts = doc.getElementsByTagName('script');

824 for(var i = scripts.length-1; i >= 0; i-=1)	824 for(var i = scripts.length-1; i >= 0; i-=1)

825 {	825 {

826 if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf('readability') === -1 && scripts[i].src.indexOf('typekit') === -1))	826 if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf('readability') === -1 && scripts[i].src.indexOf('typekit') === -1))

827 {	827 {

828 scripts[i].nodeValue="";	828 scripts[i].nodeValue="";

829 scripts[i].removeAttribute('src');	829 scripts[i].removeAttribute('src');

830 if (scripts[i].parentNode) {	830 if (scripts[i].parentNode) {

831 scripts[i].parentNode.removeChild(scripts[i]);	831 scripts[i].parentNode.removeChild(scripts[i]);

832 }	832 }

833 }	833 }

834 }	834 }

835 },	835 },

836	836

837 /**	837 /**

838 * Get the inner text of a node - cross browser compatibly.	838 * Get the inner text of a node - cross browser compatibly.

839 * This also strips out any excess whitespace to be found.	839 * This also strips out any excess whitespace to be found.

840 *	840 *

841 * @param Element	841 * @param Element

842 * @return string	842 * @return string

843 **/	843 **/

844 getInnerText: function (e, normalizeSpaces) {	844 getInnerText: function (e, normalizeSpaces) {

845 var textContent = "";	845 var textContent = "";

846	846

(...skipping 42 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
889	889

890 // Remove any root styles, if we're able.	890 // Remove any root styles, if we're able.

891 if(typeof e.removeAttribute === 'function' && e.className !== 'readability-styled') {	891 if(typeof e.removeAttribute === 'function' && e.className !== 'readability-styled') {

892 e.removeAttribute('style'); }	892 e.removeAttribute('style'); }

893	893

894 // Go until there are no more child nodes	894 // Go until there are no more child nodes

895 while ( cur !== null ) {	895 while ( cur !== null ) {

896 if ( cur.nodeType === 1 ) {	896 if ( cur.nodeType === 1 ) {

897 // Remove style attribute(s) :	897 // Remove style attribute(s) :

898 if(cur.className !== "readability-styled") {	898 if(cur.className !== "readability-styled") {

899 cur.removeAttribute("style");	899 cur.removeAttribute("style");

900 }	900 }

901 readability.cleanStyles( cur );	901 readability.cleanStyles( cur );

902 }	902 }

903 cur = cur.nextSibling;	903 cur = cur.nextSibling;

904 }	904 }

905 },	905 },

906	906

907 /**	907 /**

908 * Get the density of links as a percentage of the content	908 * Get the density of links as a percentage of the content

909 * This is the amount of text that is inside a link divided by the total text in the node.	909 * This is the amount of text that is inside a link divided by the total text in the node.

910 *	910 *

911 * @param Element	911 * @param Element

912 * @return number (float)	912 * @return number (float)

913 **/	913 **/

914 getLinkDensity: function (e) {	914 getLinkDensity: function (e) {

915 var links = e.getElementsByTagName("a");	915 var links = e.getElementsByTagName("a");

916 var textLength = readability.getInnerText(e).length;	916 var textLength = readability.getInnerText(e).length;

917 var linkLength = 0;	917 var linkLength = 0;

918 for(var i=0, il=links.length; i<il;i+=1)	918 for(var i=0, il=links.length; i<il;i+=1)

919 {	919 {

920 linkLength += readability.getInnerText(links[i]).length;	920 linkLength += readability.getInnerText(links[i]).length;

921 }	921 }

922	922

923 return linkLength / textLength;	923 return linkLength / textLength;

924 },	924 },

925	925

926 /**	926 /**

927 * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.	927 * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.

928 *	928 *

929 * @author Dan Lacy	929 * @author Dan Lacy

930 * @return string the base url	930 * @return string the base url

931 **/	931 **/

932 findBaseUrl: function () {	932 findBaseUrl: function () {

933 var noUrlParams = window.location.pathname.split("?")[0],	933 var noUrlParams = window.location.pathname.split("?")[0],

934 urlSlashes = noUrlParams.split("/").reverse(),	934 urlSlashes = noUrlParams.split("/").reverse(),

935 cleanedSegments = [],	935 cleanedSegments = [],

936 possibleType = "";	936 possibleType = "";

937	937

938 for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) {	938 for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) {

939 var segment = urlSlashes[i];	939 var segment = urlSlashes[i];

940	940

941 // Split off and save anything that looks like a file type.	941 // Split off and save anything that looks like a file type.

942 if (segment.indexOf(".") !== -1) {	942 if (segment.indexOf(".") !== -1) {

943 possibleType = segment.split(".")[1];	943 possibleType = segment.split(".")[1];

944	944

945 /* If the type isn't alpha-only, it's probably not actually a file extension. */	945 /* If the type isn't alpha-only, it's probably not actually a file extension. */

946 if(!possibleType.match(/[^a-zA-Z]/)) {	946 if(!possibleType.match(/[^a-zA-Z]/)) {

947 segment = segment.split(".")[0];	947 segment = segment.split(".")[0];

948 }	948 }

949 }	949 }

950	950

951 /**	951 /**

952 * EW-CMS specific segment replacement. Ugly.	952 * EW-CMS specific segment replacement. Ugly.

953 * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html	953 * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html

954 **/	954 **/

955 if(segment.indexOf(',00') !== -1) {	955 if(segment.indexOf(',00') !== -1) {

956 segment = segment.replace(',00', '');	956 segment = segment.replace(',00', '');

957 }	957 }

958	958

959 // If our first or second segment has anything looking like a page number, remove it.	959 // If our first or second segment has anything looking like a page number, remove it.

960 if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) {	960 if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) {

961 segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, "");	961 segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, "");

962 }	962 }

963	963

964	964

965 var del = false;	965 var del = false;

966	966

967 /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */	967 /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */

968 if (i < 2 && segment.match(/^\d{1,2}$/)) {	968 if (i < 2 && segment.match(/^\d{1,2}$/)) {

969 del = true;	969 del = true;

970 }	970 }

971	971

972 /* If this is the first segment and it's just "index", remove it. */	972 /* If this is the first segment and it's just "index", remove it. */

973 if(i === 0 && segment.toLowerCase() === "index") {	973 if(i === 0 && segment.toLowerCase() === "index") {

974 del = true;	974 del = true;

975 }	975 }

976	976

977	977

978 /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */	978 /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */

979 if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) {	979 if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) {

980 del = true;	980 del = true;

981 }	981 }

982	982

983 /* If it's not marked for deletion, push it to cleanedSegments. */	983 /* If it's not marked for deletion, push it to cleanedSegments. */

984 if (!del) {	984 if (!del) {

985 cleanedSegments.push(segment);	985 cleanedSegments.push(segment);

986 }	986 }

987 }	987 }

988	988

989 // This is our final, cleaned, base article URL.	989 // This is our final, cleaned, base article URL.

990 return window.location.protocol + "//" + window.location.host + cleanedSegments.reverse().join("/");	990 return window.location.protocol + "//" + window.location.host + cleanedSegments.reverse().join("/");

991 },	991 },

992	992

993 /**	993 /**

994 * Look for any paging links that may occur within the document.	994 * Look for any paging links that may occur within the document.

995 *	995 *

996 * @param body	996 * @param body

997 * @return object (array)	997 * @return object (array)

998 **/	998 **/

999 findNextPageLink: function (elem) {	999 findNextPageLink: function (elem) {

1000 var possiblePages = {},	1000 var possiblePages = {},

1001 allLinks = elem.getElementsByTagName('a'),	1001 allLinks = elem.getElementsByTagName('a'),

1002 articleBaseUrl = readability.findBaseUrl();	1002 articleBaseUrl = readability.findBaseUrl();

1003	1003

1004 /**	1004 /**

1005 * Loop through all links, looking for hints that they may be next-page links.	1005 * Loop through all links, looking for hints that they may be next-page links.

1006 * Things like having "page" in their textContent, className or id, or being a child	1006 * Things like having "page" in their textContent, className or id, or being a child

1007 * of a node with a page-y className or id.	1007 * of a node with a page-y className or id.

1008 *	1008 *

1009 * Also possible: levenshtein distance? longest common subsequence?	1009 * Also possible: levenshtein distance? longest common subsequence?

1010 *	1010 *

1011 * After we do that, assign each page a score, and	1011 * After we do that, assign each page a score, and

1012 **/	1012 **/

1013 for(var i = 0, il = allLinks.length; i < il; i+=1) {	1013 for(var i = 0, il = allLinks.length; i < il; i+=1) {

1014 var link = allLinks[i],	1014 var link = allLinks[i],

1015 linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, '');	1015 linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, '');

1016	1016

1017 /* If we've already seen this page, ignore it */	1017 /* If we've already seen this page, ignore it */

1018 if(linkHref === "" || linkHref === articleBaseUrl || linkHref === window.location.href || linkHref in readability.parsedPages) {	1018 if(linkHref === "" || linkHref === articleBaseUrl || linkHref === window.location.href || linkHref in readability.parsedPages) {

1019 continue;	1019 continue;

1020 }	1020 }

1021	1021

1022 /* If it's on a different domain, skip it. */	1022 /* If it's on a different domain, skip it. */

1023 if(window.location.host !== linkHref.split(/\/+/g)[1]) {	1023 if(window.location.host !== linkHref.split(/\/+/g)[1]) {

1024 continue;	1024 continue;

1025 }	1025 }

1026	1026

1027 var linkText = readability.getInnerText(link);	1027 var linkText = readability.getInnerText(link);

1028	1028

1029 /* If the linkText looks like it's not the next page, skip it. */	1029 /* If the linkText looks like it's not the next page, skip it. */

1030 if(linkText.match(readability.regexps.extraneous) || linkText.length > 25) {	1030 if(linkText.match(readability.regexps.extraneous) || linkText.length > 25) {

1031 continue;	1031 continue;

1032 }	1032 }

1033	1033

1034 /* If the leftovers of the URL after removing the base URL don't contain any digits, it's certainly not a next page link. */	1034 /* If the leftovers of the URL after removing the base URL don't contain any digits, it's certainly not a next page link. */

1035 var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');	1035 var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');

1036 if(!linkHrefLeftover.match(/\d/)) {	1036 if(!linkHrefLeftover.match(/\d/)) {

1037 continue;	1037 continue;

1038 }	1038 }

1039	1039

1040 if(!(linkHref in possiblePages)) {	1040 if(!(linkHref in possiblePages)) {

1041 possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref};	1041 possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref};

1042 } else {	1042 } else {

1043 possiblePages[linkHref].linkText += ' | ' + linkText;	1043 possiblePages[linkHref].linkText += ' | ' + linkText;

1044 }	1044 }

1045	1045

1046 var linkObj = possiblePages[linkHref];	1046 var linkObj = possiblePages[linkHref];

1047	1047

1048 /**	1048 /**

1049 * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower.	1049 * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower.

1050 * Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html	1050 * Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html

1051 **/	1051 **/

1052 if(linkHref.indexOf(articleBaseUrl) !== 0) {	1052 if(linkHref.indexOf(articleBaseUrl) !== 0) {

1053 linkObj.score -= 25;	1053 linkObj.score -= 25;

1054 }	1054 }

1055	1055

1056 var linkData = linkText + ' ' + link.className + ' ' + link.id;	1056 var linkData = linkText + ' ' + link.className + ' ' + link.id;

1057 if(linkData.match(readability.regexps.nextLink)) {	1057 if(linkData.match(readability.regexps.nextLink)) {

1058 linkObj.score += 50;	1058 linkObj.score += 50;

1059 }	1059 }

1060 if(linkData.match(/pag(e|ing|inat)/i)) {	1060 if(linkData.match(/pag(e|ing|inat)/i)) {

1061 linkObj.score += 25;	1061 linkObj.score += 25;

1062 }	1062 }

1063 if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text,	1063 if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text,

1064 /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */	1064 /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */

1065 if(!linkObj.linkText.match(readability.regexps.nextLink)) {	1065 if(!linkObj.linkText.match(readability.regexps.nextLink)) {

1066 linkObj.score -= 65;	1066 linkObj.score -= 65;

1067 }	1067 }

1068 }	1068 }

1069 if(linkData.match(readability.regexps.negative) || linkData.match(readability.regexps.extraneous)) {	1069 if(linkData.match(readability.regexps.negative) || linkData.match(readability.regexps.extraneous)) {

1070 linkObj.score -= 50;	1070 linkObj.score -= 50;

1071 }	1071 }

1072 if(linkData.match(readability.regexps.prevLink)) {	1072 if(linkData.match(readability.regexps.prevLink)) {

1073 linkObj.score -= 200;	1073 linkObj.score -= 200;

1074 }	1074 }

1075	1075

1076 /* If a parentNode contains page or paging or paginat */	1076 /* If a parentNode contains page or paging or paginat */

1077 var parentNode = link.parentNode,	1077 var parentNode = link.parentNode,

1078 positiveNodeMatch = false,	1078 positiveNodeMatch = false,

1079 negativeNodeMatch = false;	1079 negativeNodeMatch = false;

1080 while(parentNode) {	1080 while(parentNode) {

1081 var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id;	1081 var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id;

1082 if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) {	1082 if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) {

1083 positiveNodeMatch = true;	1083 positiveNodeMatch = true;

1084 linkObj.score += 25;	1084 linkObj.score += 25;

1085 }	1085 }

1086 if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(readability.regexps.negative)) {	1086 if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(readability.regexps.negative)) {

1087 /* If this is just something like "footer", give it a negative. If it's something like "body-and-footer", leave it be. */	1087 /* If this is just something like "footer", give it a negative. If it's something like "body-and-footer", leave it be. */

1088 if(!parentNodeClassAndId.match(readability.regexps.positive)) {	1088 if(!parentNodeClassAndId.match(readability.regexps.positive)) {

1089 linkObj.score -= 25;	1089 linkObj.score -= 25;

1090 negativeNodeMatch = true;	1090 negativeNodeMatch = true;

1091 }	1091 }

1092 }	1092 }

1093	1093

1094 parentNode = parentNode.parentNode;	1094 parentNode = parentNode.parentNode;

1095 }	1095 }

1096	1096

1097 /**	1097 /**

1098 * If the URL looks like it has paging in it, add to the score.	1098 * If the URL looks like it has paging in it, add to the score.

1099 * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34	1099 * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34

1100 **/	1100 **/

1101 if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) {	1101 if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) {

1102 linkObj.score += 25;	1102 linkObj.score += 25;

1103 }	1103 }

(...skipping 41 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1145 topPage = possiblePages[page];	1145 topPage = possiblePages[page];

1146 }	1146 }

1147 }	1147 }

1148 }	1148 }

1149	1149

1150 if(topPage) {	1150 if(topPage) {

1151 var nextHref = topPage.href.replace(/\/$/,'');	1151 var nextHref = topPage.href.replace(/\/$/,'');

1152	1152

1153 dbg('NEXT PAGE IS ' + nextHref);	1153 dbg('NEXT PAGE IS ' + nextHref);

1154 readability.parsedPages[nextHref] = true;	1154 readability.parsedPages[nextHref] = true;

1155 return nextHref;	1155 return nextHref;

1156 }	1156 }

1157 else {	1157 else {

1158 return null;	1158 return null;

1159 }	1159 }

1160 },	1160 },

1161	1161

1162 createLinkDiv: function(link) {	1162 createLinkDiv: function(link) {

1163 var divNode = document.createElement('div');	1163 var divNode = document.createElement('div');

1164 var aNode = document.createElement('a');	1164 var aNode = document.createElement('a');

1165 var tNode = document.createTextNode('View Next Page');	1165 var tNode = document.createTextNode('View Next Page');

(...skipping 31 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1197 }	1197 }

1198 else {	1198 else {

1199 if (options.error) { options.error(request); }	1199 if (options.error) { options.error(request); }

1200 }	1200 }

1201 }	1201 }

1202 }	1202 }

1203	1203

1204 if (typeof options === 'undefined') { options = {}; }	1204 if (typeof options === 'undefined') { options = {}; }

1205	1205

1206 request.onreadystatechange = respondToReadyState;	1206 request.onreadystatechange = respondToReadyState;

1207	1207

1208 request.open('get', url, true);	1208 request.open('get', url, true);

1209 request.setRequestHeader('Accept', 'text/html');	1209 request.setRequestHeader('Accept', 'text/html');

1210	1210

1211 try {	1211 try {

1212 request.send(options.postBody);	1212 request.send(options.postBody);

1213 }	1213 }

1214 catch (e) {	1214 catch (e) {

1215 if (options.error) { options.error(); }	1215 if (options.error) { options.error(); }

1216 }	1216 }

1217	1217

(...skipping 14 matching lines...) Expand all Loading...
1232 articlePage.innerHTML = '<p class="page-separator" title="Page ' + readability.curPageNum + '">§</p>';	1232 articlePage.innerHTML = '<p class="page-separator" title="Page ' + readability.curPageNum + '">§</p>';

1233	1233

1234 document.getElementById("readability-content").appendChild(articlePage);	1234 document.getElementById("readability-content").appendChild(articlePage);

1235	1235

1236 if(readability.curPageNum > readability.maxPages) {	1236 if(readability.curPageNum > readability.maxPages) {

1237 var linkDiv = readability.createLinkDiv(nextPageLink);	1237 var linkDiv = readability.createLinkDiv(nextPageLink);

1238	1238

1239 articlePage.appendChild(linkDiv);	1239 articlePage.appendChild(linkDiv);

1240 return;	1240 return;

1241 }	1241 }

1242	1242

1243 /**	1243 /**

1244 * Now that we've built the article page DOM element, get the page content	1244 * Now that we've built the article page DOM element, get the page content

1245 * asynchronously and load the cleaned content into the div we created for it.	1245 * asynchronously and load the cleaned content into the div we created for it.

1246 **/	1246 **/

1247 (function(pageUrl, thisPage) {	1247 (function(pageUrl, thisPage) {

1248 readability.ajax(pageUrl, {	1248 readability.ajax(pageUrl, {

1249 success: function(r) {	1249 success: function(r) {

1250	1250

1251 /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */	1251 /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */

1252 var eTag = r.getResponseHeader('ETag');	1252 var eTag = r.getResponseHeader('ETag');

1253 if(eTag) {	1253 if(eTag) {

1254 if(eTag in readability.pageETags) {	1254 if(eTag in readability.pageETags) {

1255 dbg("Exact duplicate page found via ETag. Aborting." );	1255 dbg("Exact duplicate page found via ETag. Aborting." );

1256 articlePage.style.display = 'none';	1256 articlePage.style.display = 'none';

1257 return;	1257 return;

1258 } else {	1258 } else {

1259 readability.pageETags[eTag] = 1;	1259 readability.pageETags[eTag] = 1;

1260 }	1260 }

1261 }	1261 }

1262	1262

1263 // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away.	1263 // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away.

1264 var page = document.createElement("DIV");	1264 var page = document.createElement("DIV");

1265	1265

1266 /**	1266 /**

1267 * Do some preprocessing to our HTML to make it ready for appending.	1267 * Do some preprocessing to our HTML to make it ready for appending.

1268 * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript.	1268 * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript.

1269 * • Turn any noscript tags into divs so that we can parse them. This allows us to find any next page links hidden via javascript.	1269 * • Turn any noscript tags into divs so that we can parse them. This allows us to find any next page links hidden via javascript.

1270 * • Turn all double br's into p's - was handled by prepDocument in the original view.	1270 * • Turn all double br's into p's - was handled by prepDocument in the original view.

(...skipping 30 matching lines...) Expand all Loading...
1301 for(var i=1; i <= readability.curPageNum; i+=1) {	1301 for(var i=1; i <= readability.curPageNum; i+=1) {

1302 var rPage = document.getElementById('readability-pag e-' + i);	1302 var rPage = document.getElementById('readability-pag e-' + i);

1303 if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML ) !== -1) {	1303 if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML ) !== -1) {

1304 dbg('Duplicate of page ' + i + ' - skipping.');	1304 dbg('Duplicate of page ' + i + ' - skipping.');

1305 articlePage.style.display = 'none';	1305 articlePage.style.display = 'none';

1306 readability.parsedPages[pageUrl] = true;	1306 readability.parsedPages[pageUrl] = true;

1307 return;	1307 return;

1308 }	1308 }

1309 }	1309 }

1310 }	1310 }

1311	1311

1312 readability.removeScripts(content);	1312 readability.removeScripts(content);

1313	1313

1314 readability.moveNodeInnards(content, thisPage);	1314 readability.moveNodeInnards(content, thisPage);

1315	1315

1316 /**	1316 /**

1317 * After the page has rendered, post process the content. This delay is necessary because,	1317 * After the page has rendered, post process the content. This delay is necessary because,

1318 * in webkit at least, offsetWidth is not set in time to determine image width. We have to	1318 * in webkit at least, offsetWidth is not set in time to determine image width. We have to

1319 * wait a little bit for reflow to finish before we can fix floating images.	1319 * wait a little bit for reflow to finish before we can fix floating images.

1320 **/	1320 **/

1321 window.setTimeout(	1321 window.setTimeout(

1322 function() { readability.postProcessContent(thisPage); } ,	1322 function() { readability.postProcessContent(thisPage); } ,

1323 500	1323 500

1324 );	1324 );

1325	1325

1326 if(nextPageLink) {	1326 if(nextPageLink) {

1327 readability.appendNextPage(nextPageLink);	1327 readability.appendNextPage(nextPageLink);

1328 }	1328 }

1329 }	1329 }

1330 });	1330 });

1331 }(nextPageLink, articlePage));	1331 }(nextPageLink, articlePage));

1332 },	1332 },

1333	1333

1334 /**	1334 /**

1335 * Get an element's class/id weight. Uses regular expressions to tell if this	1335 * Get an element's class/id weight. Uses regular expressions to tell if this

1336 * element looks good or bad.	1336 * element looks good or bad.

1337 *	1337 *

1338 * @param Element	1338 * @param Element

1339 * @return number (Integer)	1339 * @return number (Integer)

1340 **/	1340 **/

1341 getClassWeight: function (e) {	1341 getClassWeight: function (e) {

1342 if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {	1342 if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {

1343 return 0;	1343 return 0;

1344 }	1344 }

1345	1345

(...skipping 29 matching lines...) Expand all Loading...
1375 /**	1375 /**

1376 * Remove extraneous break tags from a node.	1376 * Remove extraneous break tags from a node.

1377 *	1377 *

1378 * @param Element	1378 * @param Element

1379 * @return void	1379 * @return void

1380 **/	1380 **/

1381 killBreaks: function (e) {	1381 killBreaks: function (e) {

1382 var allElements = e.getElementsByTagName('*');	1382 var allElements = e.getElementsByTagName('*');

1383 while (i < allElements.length) {	1383 while (i < allElements.length) {

1384 readability.deleteExtraBreaks(allElements[i]);	1384 readability.deleteExtraBreaks(allElements[i]);

1385 i++;	1385 i++;

1386 }	1386 }

1387 },	1387 },

1388	1388

1389 /**	1389 /**

1390 * Clean a node of all elements of type "tag".	1390 * Clean a node of all elements of type "tag".

1391 * (Unless it's a youtube/vimeo video. People love movies.)	1391 * (Unless it's a youtube/vimeo video. People love movies.)

1392 *	1392 *

1393 * @param Element	1393 * @param Element

1394 * @param string tag to clean	1394 * @param string tag to clean

1395 * @return void	1395 * @return void

1396 **/	1396 **/

1397 clean: function (e, tag) {	1397 clean: function (e, tag) {

1398 var targetList = e.getElementsByTagName( tag );	1398 var targetList = e.getElementsByTagName( tag );

1399 var isEmbed = (tag === 'object' \|\| tag === 'embed');	1399 var isEmbed = (tag === 'object' \|\| tag === 'embed');

1400	1400

1401 for (var y=targetList.length-1; y >= 0; y-=1) {	1401 for (var y=targetList.length-1; y >= 0; y-=1) {

1402 /* Allow youtube and vimeo videos through as people usually want to see those. */	1402 /* Allow youtube and vimeo videos through as people usually want to see those. */

1403 if(isEmbed) {	1403 if(isEmbed) {

1404 var attributeValues = "";	1404 var attributeValues = "";

1405 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) {	1405 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) {

1406 attributeValues += targetList[y].attributes[i].value + '\|';	1406 attributeValues += targetList[y].attributes[i].value + '\|';

1407 }	1407 }

1408	1408

1409 /* First, check the element's attributes to see if any of them contain youtube or vimeo */	1409 /* First, check the element's attributes to see if any of them contain youtube or vimeo */

1410 if (attributeValues.search(readability.regexps.videos) !== -1) {	1410 if (attributeValues.search(readability.regexps.videos) !== -1) {

1411 continue;	1411 continue;

1412 }	1412 }

1413	1413

1414 /* Then check the elements inside this element for the same. */	1414 /* Then check the elements inside this element for the same. */

1415 if (targetList[y].innerHTML.search(readability.regexps.videos) ! == -1) {	1415 if (targetList[y].innerHTML.search(readability.regexps.videos) ! == -1) {

1416 continue;	1416 continue;

1417 }	1417 }

1418	1418

1419 }	1419 }

1420	1420

1421 targetList[y].parentNode.removeChild(targetList[y]);	1421 targetList[y].parentNode.removeChild(targetList[y]);

1422 }	1422 }

1423 },	1423 },

1424	1424

1425 /**	1425 /**

1426 * Clean an element of all tags of type "tag" if they look fishy.	1426 * Clean an element of all tags of type "tag" if they look fishy.

1427 * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.	1427 * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.

1428 *	1428 *

1429 * @return void	1429 * @return void

1430 **/	1430 **/

1431 cleanConditionally: function (e, tag) {	1431 cleanConditionally: function (e, tag) {

1432	1432

1433 if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {	1433 if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {

1434 return;	1434 return;

1435 }	1435 }

1436	1436

1437 var tagsList = e.getElementsByTagName(tag);	1437 var tagsList = e.getElementsByTagName(tag);

1438 var curTagsLength = tagsList.length;	1438 var curTagsLength = tagsList.length;

1439	1439

1440 /**	1440 /**

1441 * Gather counts for other typical elements embedded within.	1441 * Gather counts for other typical elements embedded within.

1442 * Traverse backwards so we can remove nodes at the same time without affecting the traversal.	1442 * Traverse backwards so we can remove nodes at the same time without affecting the traversal.

1443 *	1443 *

1444 * TODO: Consider taking into account original contentScore here.	1444 * TODO: Consider taking into account original contentScore here.

1445 **/	1445 **/

1446 for (var i=curTagsLength-1; i >= 0; i-=1) {	1446 for (var i=curTagsLength-1; i >= 0; i-=1) {

1447 var weight = readability.getClassWeight(tagsList[i]);	1447 var weight = readability.getClassWeight(tagsList[i]);

1448 var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0;	1448 var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0;

1449	1449

1450 dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].cla ssName + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'unde fined') ? (" with score " + tagsList[i].readability.contentScore) : ''));	1450 dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].cla ssName + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'unde fined') ? (" with score " + tagsList[i].readability.contentScore) : ''));

1451	1451

1452 if(weight+contentScore < 0)	1452 if(weight+contentScore < 0)

1453 {	1453 {

1454 tagsList[i].parentNode.removeChild(tagsList[i]);	1454 tagsList[i].parentNode.removeChild(tagsList[i]);

1455 }	1455 }

1456 else if ( readability.getCharCount(tagsList[i],',') < 10) {	1456 else if ( readability.getCharCount(tagsList[i],',') < 10) {

1457 /**	1457 /**

1458 * If there are not very many commas, and the number of	1458 * If there are not very many commas, and the number of

1459 * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.	1459 * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.

1460 **/	1460 **/

1461 var p = tagsList[i].getElementsByTagName("p").length;	1461 var p = tagsList[i].getElementsByTagName("p").length;

1462 var img = tagsList[i].getElementsByTagName("img").length;	1462 var img = tagsList[i].getElementsByTagName("img").length;

1463 var li = tagsList[i].getElementsByTagName("li").length-100;	1463 var li = tagsList[i].getElementsByTagName("li").length-100;

1464 var input = tagsList[i].getElementsByTagName("input").length;	1464 var input = tagsList[i].getElementsByTagName("input").length;

1465	1465

1466 var embedCount = 0;	1466 var embedCount = 0;

1467 var embeds = tagsList[i].getElementsByTagName("embed");	1467 var embeds = tagsList[i].getElementsByTagName("embed");

1468 for(var ei=0,il=embeds.length; ei < il; ei+=1) {	1468 for(var ei=0,il=embeds.length; ei < il; ei+=1) {

1469 if (embeds[ei].src.search(readability.regexps.videos) === -1 ) {	1469 if (embeds[ei].src.search(readability.regexps.videos) === -1 ) {

1470 embedCount+=1;	1470 embedCount+=1;

1471 }	1471 }

1472 }	1472 }

1473	1473

1474 var linkDensity = readability.getLinkDensity(tagsList[i]);	1474 var linkDensity = readability.getLinkDensity(tagsList[i]);

1475 var contentLength = readability.getInnerText(tagsList[i]).length ;	1475 var contentLength = readability.getInnerText(tagsList[i]).length ;

1476 var toRemove = false;	1476 var toRemove = false;

1477	1477

1478 if ( img > p ) {	1478 if ( img > p ) {

1479 toRemove = true;	1479 toRemove = true;

1480 } else if(li > p && tag !== "ul" && tag !== "ol") {	1480 } else if(li > p && tag !== "ul" && tag !== "ol") {

1481 toRemove = true;	1481 toRemove = true;

1482 } else if( input > Math.floor(p/3) ) {	1482 } else if( input > Math.floor(p/3) ) {

1483 toRemove = true;	1483 toRemove = true;

1484 } else if(contentLength < 25 && (img === 0 \|\| img > 2) ) {	1484 } else if(contentLength < 25 && (img === 0 \|\| img > 2) ) {

1485 toRemove = true;	1485 toRemove = true;

1486 } else if(weight < 25 && linkDensity > 0.2) {	1486 } else if(weight < 25 && linkDensity > 0.2) {

1487 toRemove = true;	1487 toRemove = true;

1488 } else if(weight >= 25 && linkDensity > 0.5) {	1488 } else if(weight >= 25 && linkDensity > 0.5) {

1489 toRemove = true;	1489 toRemove = true;

1490 } else if((embedCount === 1 && contentLength < 75) \|\| embedCount > 1) {	1490 } else if((embedCount === 1 && contentLength < 75) \|\| embedCount > 1) {

1491 toRemove = true;	1491 toRemove = true;

1492 }	1492 }

1493	1493

(...skipping 21 matching lines...) Expand all Loading...
1515 }	1515 }

1516 },	1516 },

1517	1517

1518 flagIsActive: function(flag) {	1518 flagIsActive: function(flag) {

1519 return (readability.flags & flag) > 0;	1519 return (readability.flags & flag) > 0;

1520 },	1520 },

1521	1521

1522 addFlag: function(flag) {	1522 addFlag: function(flag) {

1523 readability.flags = readability.flags \| flag;	1523 readability.flags = readability.flags \| flag;

1524 },	1524 },

1525	1525

1526 removeFlag: function(flag) {	1526 removeFlag: function(flag) {

1527 readability.flags = readability.flags & ~flag;	1527 readability.flags = readability.flags & ~flag;

1528 },	1528 },

1529	1529

1530 // Removes the children of \|src\| and appends them to \|dest\|.	1530 // Removes the children of \|src\| and appends them to \|dest\|.

1531 moveNodeInnards: function(src, dest) {	1531 moveNodeInnards: function(src, dest) {

1532 try {	1532 try {

1533 while (src.firstChild) {	1533 while (src.firstChild) {

1534 dest.appendChild(src.removeChild(src.firstChild));	1534 dest.appendChild(src.removeChild(src.firstChild));

1535 }	1535 }

(...skipping 48 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1584 var lastBr = readability.isMultipleBr(node, false);	1584 var lastBr = readability.isMultipleBr(node, false);

1585 var ret = false;	1585 var ret = false;

1586 while (lastBr && lastBr != node) {	1586 while (lastBr && lastBr != node) {

1587 var toRemove = lastBr;	1587 var toRemove = lastBr;

1588 lastBr = lastBr.previousSibling;	1588 lastBr = lastBr.previousSibling;

1589 toRemove.parentNode.removeChild(toRemove);	1589 toRemove.parentNode.removeChild(toRemove);

1590 ret = true;	1590 ret = true;

1591 }	1591 }

1592 return ret;	1592 return ret;

1593 },	1593 },

1594	1594

1595 // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a	1595 // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a

1596 // <P> node, and makes all next siblings of that pair children of <P>, up	1596 // <P> node, and makes all next siblings of that pair children of <P>, up

1597 // until the next pair of <BR> nodes is reached.	1597 // until the next pair of <BR> nodes is reached.

1598 replaceDoubleBrWithP: function(node) {	1598 replaceDoubleBrWithP: function(node) {

1599 // Check that we are starting with a BR.	1599 // Check that we are starting with a BR.

1600 var second = readability.isMultipleBr(node, true);	1600 var second = readability.isMultipleBr(node, true);

1601 if (!second) {	1601 if (!second) {

1602 return;	1602 return;

1603 }	1603 }

1604 // Make all next siblings of the second BR into children of a P.	1604 // Make all next siblings of the second BR into children of a P.

1605 var p = document.createElement('p');	1605 var p = document.createElement('p');

1606 var curr = second.nextSibling;	1606 var curr = second.nextSibling;

1607 while (curr) {	1607 while (curr) {

1608 if (readability.isMultipleBr(curr, true)) {	1608 if (readability.isMultipleBr(curr, true)) {

1609 break;	1609 break;

1610 }	1610 }

1611 var next = curr.nextSibling;	1611 var next = curr.nextSibling;

1612 p.appendChild(curr.parentNode.removeChild(curr));	1612 p.appendChild(curr.parentNode.removeChild(curr));

1613 curr = next;	1613 curr = next;

1614 }	1614 }

1615 var ret = curr;	1615 var ret = curr;

1616	1616

1617 // Remove all nodes between the first and second BR.	1617 // Remove all nodes between the first and second BR.

1618 curr = node.nextSibling;	1618 curr = node.nextSibling;

1619 while (curr && curr != second) {	1619 while (curr && curr != second) {

1620 var next = curr.nextSibling;	1620 var next = curr.nextSibling;

1621 curr.parentNode.removeChild(curr);	1621 curr.parentNode.removeChild(curr);

1622 curr = next;	1622 curr = next;

1623 }	1623 }

1624 // Remove the second BR.	1624 // Remove the second BR.

1625 second.parentNode.removeChild(second);	1625 second.parentNode.removeChild(second);

1626 // Replace the first BR with the P.	1626 // Replace the first BR with the P.

1627 node.parentNode.replaceChild(p, node);	1627 node.parentNode.replaceChild(p, node);

1628	1628

1629 return ret;	1629 return ret;

1630 },	1630 },

1631	1631

1632 // Returns true if the NodeList contains a double <BR>.	1632 // Returns true if the NodeList contains a double <BR>.

1633 hasDoubleBr: function(nodeList) {	1633 hasDoubleBr: function(nodeList) {

1634 for (var i = 0; i < nodeList.length; nodeList++) {	1634 for (var i = 0; i < nodeList.length; nodeList++) {

1635 if (readability.isMultipleBr(nodeList[i], true)) {	1635 if (readability.isMultipleBr(nodeList[i], true)) {

1636 return true;	1636 return true;

1637 }	1637 }

1638 }	1638 }

1639 return false;	1639 return false;

1640 },	1640 },

1641	1641

1642 // Replaces double <BR> tags with <P> tags.	1642 // Replaces double <BR> tags with <P> tags.

1643 replaceDoubleBrsWithPs: function(node) {	1643 replaceDoubleBrsWithPs: function(node) {

1644 var allElements = node.getElementsByTagName('BR');	1644 var allElements = node.getElementsByTagName('BR');

1645 var node = null;	1645 var node = null;

1646 while (allElements && allElements.length > 0 &&	1646 while (allElements && allElements.length > 0 &&

1647 readability.hasDoubleBr(allElements)) {	1647 readability.hasDoubleBr(allElements)) {

1648 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) {	1648 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) {

1649 var next = node;	1649 var next = node;

1650 while (next = readability.replaceDoubleBrWithP(next));	1650 while (next = readability.replaceDoubleBrWithP(next));

1651 }	1651 }

1652 allElements = document.body.getElementsByTagName('BR');	1652 allElements = document.body.getElementsByTagName('BR');

1653 }	1653 }

1654 },	1654 },

1655	1655

1656	1656

1657 // Replaces a BR and the whitespace that follows it with a P.	1657 // Replaces a BR and the whitespace that follows it with a P.

1658 replaceBrWithP: function(node) {	1658 replaceBrWithP: function(node) {

1659 if (!readability.isBrNode(node)) {	1659 if (!readability.isBrNode(node)) {

1660 return;	1660 return;

1661 }	1661 }

1662 var p = document.createElement('p');	1662 var p = document.createElement('p');

1663 var curr = node.nextSibling;	1663 var curr = node.nextSibling;

1664 while (curr && !isBrNode(curr)) {	1664 while (curr && !isBrNode(curr)) {

1665 var next = curr.nextSibling;	1665 var next = curr.nextSibling;

1666 if (readability.isWhitespaceNode(curr)) {	1666 if (readability.isWhitespaceNode(curr)) {

1667 curr.parentNode.removeChild(curr);	1667 curr.parentNode.removeChild(curr);

1668 } else {	1668 } else {

1669 p.appendChild(curr.parentNode.removeChild(curr));	1669 p.appendChild(curr.parentNode.removeChild(curr));

1670 }	1670 }

1671 curr = next;	1671 curr = next;

1672 }	1672 }

1673 node.parentNode.replaceChild(p, node);	1673 node.parentNode.replaceChild(p, node);

1674 return curr;	1674 return curr;

1675 },	1675 },

1676	1676

1677 // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> tag	1677 // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> tag

1678 // children of the <P>.	1678 // children of the <P>.

1679 replaceBrsWithPs: function(node) {	1679 replaceBrsWithPs: function(node) {

1680 var allElements = node.getElementsByTagName('BR');	1680 var allElements = node.getElementsByTagName('BR');

1681 var node = null;	1681 var node = null;

1682 while (allElements && allElements.length > 0) {	1682 while (allElements && allElements.length > 0) {

1683 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) {	1683 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex + = 1) {

1684 var next = node;	1684 var next = node;

1685 while (next = readability.replaceBrWithP(next));	1685 while (next = readability.replaceBrWithP(next));

1686 }	1686 }

1687 allElements = document.body.getElementsByTagName('BR');	1687 allElements = document.body.getElementsByTagName('BR');

1688 }	1688 }

1689 },	1689 },

1690	1690

1691 // Replaces any tag with any other tag.	1691 // Replaces any tag with any other tag.

1692 replaceTagsWithTags: function(node, srcTag, destTag) {	1692 replaceTagsWithTags: function(node, srcTag, destTag) {

1693 var allElements = node.getElementsByTagName(srcTag);	1693 var allElements = node.getElementsByTagName(srcTag);

1694 for (var i = 0; i < allElements.length; i++) {	1694 for (var i = 0; i < allElements.length; i++) {

1695 var dest = document.createElement(destTag);	1695 var dest = document.createElement(destTag);

1696 readability.moveNodeInnards(allElements[i], dest);	1696 readability.moveNodeInnards(allElements[i], dest);

1697 node.replaceNode(dest, allElements[i]);	1697 allElements[i].parentNode.replaceChild(dest, allElements[i]);

1698 }	1698 }

1699 },	1699 },

1700	1700

1701 // Replaces all <noscript> tags with <p> tags.	1701 // Replaces all <noscript> tags with <p> tags.

1702 replaceNoscriptsWithPs: function(node) {	1702 replaceNoscriptsWithPs: function(node) {

1703 readability.replaceTagsWithTags(node, 'noscript', 'p');	1703 readability.replaceTagsWithTags(node, 'noscript', 'p');

1704 },	1704 },

1705	1705

1706 // Replaces all <font> tags with <span> tags.	1706 // Replaces all <font> tags with <span> tags.

1707 replaceFontsWithSpans: function(node) {	1707 replaceFontsWithSpans: function(node) {

1708 readability.replaceTagsWithTags(node, 'font', 'span');	1708 readability.replaceTagsWithTags(node, 'font', 'span');

1709 },	1709 },

1710	1710

1711 // Returns a list of image URLs in the distilled article.	1711 // Returns a list of image URLs in the distilled article.

1712 getImages : function() {	1712 getImages : function() {

1713 var images = document.getElementsByTagName('img');	1713 var images = document.getElementsByTagName('img');

1714 var result = new Array(images.length);	1714 var result = new Array(images.length);

1715 dbg("Number of images: " + images.length);	1715 dbg("Number of images: " + images.length);

1716 for(i = 0; i < images.length; i++) {	1716 for(i = 0; i < images.length; i++) {

1717 result[i] = images[i].src;	1717 result[i] = images[i].src;

1718 dbg("Image: " + result[i]);	1718 dbg("Image: " + result[i]);

1719 }	1719 }

1720 return result;	1720 return result;

1721 },	1721 },

1722	1722

1723 // Returns the distilled article HTML from the page(s).	1723 // Returns the distilled article HTML from the page(s).

1724 getDistilledArticleHTML : function() {	1724 getDistilledArticleHTML : function() {

1725 return readability.distilledHTML;	1725 return readability.distilledHTML;

	1726 },

	1727

	1728 // Returns the next page of this article.

	1729 getNextPageLink : function() {

	1730 return readability.nextPageLink;

1726 }	1731 }

1727 };	1732 };

1728	1733

1729 // Extracts long-form content from a page and returns an array where the first	1734 // Extracts long-form content from a page and returns an array where the first

1730 // element is the article title, the second element is HTML containing the	1735 // element is the article title, the second element is HTML containing the

1731 // long-form content, and remaining elements are URLs for images referenced by	1736 // long-form content, and remaining elements are URLs for images referenced by

1732 // that HTML. Each <img> tag in the HTML has an id field set to k - 2, which	1737 // that HTML. Each <img> tag in the HTML has an id field set to k - 2, which

1733 // corresponds to a URL listed at index k in the array returned.	1738 // corresponds to a URL listed at index k in the array returned.

1734 (function () {	1739 (function () {

1735 readability.init();	1740 readability.init();

1736 var result = new Array(2);	1741 var result = new Array(3);

1737 result[0] = readability.getArticleTitle();	1742 result[0] = readability.getArticleTitle();

1738 result[1] = readability.getDistilledArticleHTML();	1743 result[1] = readability.getDistilledArticleHTML();

	1744 result[2] = readability.getNextPageLink();

1739 return result.concat(readability.getImages());	1745 return result.concat(readability.getImages());

1740 }())	1746 }())

1741	1747

OLD	NEW

« components/dom_distiller/core/page_distiller.h ('K') | « components/dom_distiller/core/task_tracker_unittest.cc ('k') | no next file » | no next file with comments »

Powered by Google App Engine

This is Rietveld 408576698