Chromium Code Reviews
| Index: third_party/readability/js/readability.js |
| diff --git a/third_party/readability/js/readability.js b/third_party/readability/js/readability.js |
| index 68a0286497ad9b1a06091004889b035720bb9896..4f648f864613b931b956bb3adb02d7ebe36b4416 100644 |
| --- a/third_party/readability/js/readability.js |
| +++ b/third_party/readability/js/readability.js |
| @@ -4,11 +4,11 @@ var dbg = (typeof console !== 'undefined') ? function(s) { |
| } : function() {}; |
| /* |
| - * Readability. An Arc90 Lab Experiment. |
| + * Readability. An Arc90 Lab Experiment. |
| * Website: http://lab.arc90.com/experiments/readability |
| * Source: http://code.google.com/p/arc90labs-readability |
| * |
| - * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission. |
| + * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission. |
| * |
| * Copyright (c) 2010 Arc90 Inc |
| * Readability is licensed under the Apache License, Version 2.0. |
| @@ -20,6 +20,7 @@ var readability = { |
| distilledHTML: '', |
| distilledArticleContent: null, |
| + nextPageLink: '', |
| version: '1.7.1', |
| iframeLoads: 0, |
| @@ -41,7 +42,7 @@ var readability = { |
| maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */ |
| parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */ |
| pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */ |
| - |
| + |
| /** |
| * All of the regular expressions in use within readability. |
| * Defined up here so we don't instantiate them repeatedly in loops. |
| @@ -66,7 +67,7 @@ var readability = { |
| /** |
| * Runs readability. |
| - * |
| + * |
| * Workflow: |
| * 1. Prep the document by removing script tags, css, etc. |
| * 2. Build readability's DOM tree. |
| @@ -86,8 +87,11 @@ var readability = { |
| readability.parsedPages[window.location.href.replace(/\/$/, '')] = true; |
| /* Pull out any possible next page link first */ |
| - var nextPageLink = readability.findNextPageLink(document.body); |
| - |
| + readability.nextPageLink = readability.findNextPageLink(document.body); |
| + |
| + /* We handle processing of nextPage from C++ set nextPageLink to null */ |
| + var nextPageLink = null; |
| + |
| readability.prepDocument(); |
| /* Build readability's DOM tree */ |
| @@ -157,7 +161,7 @@ var readability = { |
| nextPageLink = null; |
| if (nextPageLink) { |
| - /** |
| + /** |
| * Append any additional pages after a small timeout so that people |
| * can start reading without having to wait for this to finish processing. |
| **/ |
| @@ -179,16 +183,16 @@ var readability = { |
| var windowHeight = window.innerHeight ? window.innerHeight : (document.documentElement.clientHeight ? document.documentElement.clientHeight : document.body.clientHeight); |
| if(readability.reversePageScroll) { |
| - readability.scrollTo(readability.scrollTop(), readability.scrollTop() - (windowHeight - 50), 20, 10); |
| + readability.scrollTo(readability.scrollTop(), readability.scrollTop() - (windowHeight - 50), 20, 10); |
| } |
| else { |
| - readability.scrollTo(readability.scrollTop(), readability.scrollTop() + (windowHeight - 50), 20, 10); |
| + readability.scrollTo(readability.scrollTop(), readability.scrollTop() + (windowHeight - 50), 20, 10); |
| } |
| - |
| + |
| return false; |
| } |
| }; |
| - |
| + |
| document.onkeyup = function(e) { |
| var code = (window.event) ? event.keyCode : e.keyCode; |
| if (code === 16) { |
| @@ -200,7 +204,7 @@ var readability = { |
| /** |
| * Run any post-process modifications to article content as necessary. |
| - * |
| + * |
| * @param Element |
| * @return void |
| **/ |
| @@ -226,7 +230,7 @@ var readability = { |
| for(var i=0, il = images.length; i < il; i+=1) { |
| var image = images[i]; |
| - |
| + |
| if(image.offsetWidth > imageWidthThreshold) { |
| image.className += " blockImage"; |
| } |
| @@ -242,7 +246,7 @@ var readability = { |
| var articleTools = document.createElement("DIV"); |
| articleTools.id = "readTools"; |
| - articleTools.innerHTML = |
| + articleTools.innerHTML = |
| "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" + |
| "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" + |
| "<a href='#' onclick='readability.emailBox(); return false;' title='Email page' id='email-page'>Email Page</a>"; |
| @@ -259,13 +263,13 @@ var readability = { |
| function sanitizeText() { |
| return text.replace(/@\w+/, ""); |
| } |
| - |
| + |
| function countMatches(match) { |
| var matches = text.match(new RegExp(match, "g")); |
| - return matches !== null ? matches.length : 0; |
| + return matches !== null ? matches.length : 0; |
| } |
| - |
| - function isRTL() { |
| + |
| + function isRTL() { |
| var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]"); |
| var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]"); |
| @@ -289,15 +293,15 @@ var readability = { |
| try { |
| curTitle = origTitle = document.title; |
| if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */ |
| - curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]); |
| + curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]); |
| } |
| } |
| catch(e) {} |
| - |
| + |
| if(curTitle.match(/ [\|\-] /)) |
| { |
| curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1'); |
| - |
| + |
| if(curTitle.split(' ').length < 3) { |
| curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1'); |
| } |
| @@ -330,7 +334,7 @@ var readability = { |
| /** |
| * Prepare the HTML document for readability to scrape it. |
| * This includes things like stripping javascript, CSS, and handling terrible markup. |
| - * |
| + * |
| * @return void |
| **/ |
| prepDocument: function () { |
| @@ -342,7 +346,7 @@ var readability = { |
| { |
| var body = document.createElement("body"); |
| try { |
| - document.body = body; |
| + document.body = body; |
| } |
| catch(e) { |
| document.documentElement.appendChild(body); |
| @@ -374,11 +378,11 @@ var readability = { |
| biggestFrameSize = frameSize; |
| readability.biggestFrame = frames[frameIndex]; |
| } |
| - |
| + |
| if(canAccessFrame && frameSize > bestFrameSize) |
| { |
| readability.frameHack = true; |
| - |
| + |
| bestFrame = frames[frameIndex]; |
| bestFrameSize = frameSize; |
| } |
| @@ -390,7 +394,7 @@ var readability = { |
| readability.moveNodeInnards(bestFrame.contentWindow.document.body, newBody); |
| newBody.style.overflow = 'scroll'; |
| document.body = newBody; |
| - |
| + |
| var frameset = document.getElementsByTagName('frameset')[0]; |
| if(frameset) { |
| frameset.parentNode.removeChild(frameset); } |
| @@ -455,7 +459,7 @@ var readability = { |
| var imgCount = articleParagraphs[i].getElementsByTagName('img').length; |
| var embedCount = articleParagraphs[i].getElementsByTagName('embed').length; |
| var objectCount = articleParagraphs[i].getElementsByTagName('object').length; |
| - |
| + |
| if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readability.getInnerText(articleParagraphs[i], false) === '') { |
| articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]); |
| } |
| @@ -468,7 +472,7 @@ var readability = { |
| dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " + e); |
| } |
| }, |
| - |
| + |
| /** |
| * Initialize a node with the readability object. Also checks the |
| * className/id for special names to add to its score. |
| @@ -477,7 +481,7 @@ var readability = { |
| * @return void |
| **/ |
| initializeNode: function (node) { |
| - node.readability = {"contentScore": 0}; |
| + node.readability = {"contentScore": 0}; |
| switch(node.tagName) { |
| case 'DIV': |
| @@ -489,7 +493,7 @@ var readability = { |
| case 'BLOCKQUOTE': |
| node.readability.contentScore += 3; |
| break; |
| - |
| + |
| case 'ADDRESS': |
| case 'OL': |
| case 'UL': |
| @@ -511,10 +515,10 @@ var readability = { |
| node.readability.contentScore -= 5; |
| break; |
| } |
| - |
| + |
| node.readability.contentScore += readability.getClassWeight(node); |
| }, |
| - |
| + |
| /*** |
| * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is |
| * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. |
| @@ -525,7 +529,7 @@ var readability = { |
| grabArticle: function (pageToClone) { |
| var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS), |
| isPaging = (page !== null) ? true: false; |
| - |
| + |
| var page = null; |
| // Never work on the actual page. |
| if (isPaging) { |
| @@ -533,7 +537,7 @@ var readability = { |
| } else { |
| page = pageToClone.cloneNode(true); |
| } |
| - |
| + |
| var allElements = page.getElementsByTagName('*'); |
| /** |
| @@ -561,7 +565,7 @@ var readability = { |
| node.parentNode.removeChild(node); |
| nodeIndex-=1; |
| continue; |
| - } |
| + } |
| } |
| if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE") { |
| @@ -598,7 +602,7 @@ var readability = { |
| } |
| } |
| } |
| - } |
| + } |
| } |
| /** |
| @@ -640,15 +644,15 @@ var readability = { |
| /* Add points for any commas within this paragraph */ |
| contentScore += innerText.split(',').length; |
| - |
| + |
| /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ |
| contentScore += Math.min(Math.floor(innerText.length / 100), 3); |
| - |
| + |
| /* Add the score to the parent. The grandparent gets half. */ |
| parentNode.readability.contentScore += contentScore; |
| if(grandParentNode) { |
| - grandParentNode.readability.contentScore += contentScore/2; |
| + grandParentNode.readability.contentScore += contentScore/2; |
| } |
| } |
| @@ -725,12 +729,12 @@ var readability = { |
| { |
| append = true; |
| } |
| - |
| + |
| if(siblingNode.nodeName === "P") { |
| var linkDensity = readability.getLinkDensity(siblingNode); |
| var nodeContent = readability.getInnerText(siblingNode); |
| var nodeLength = nodeContent.length; |
| - |
| + |
| if(nodeLength > 80 && linkDensity < 0.25) |
| { |
| append = true; |
| @@ -747,7 +751,7 @@ var readability = { |
| var nodeToAppend = null; |
| if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") { |
| /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ |
| - |
| + |
| dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.'); |
| nodeToAppend = document.createElement("DIV"); |
| try { |
| @@ -765,7 +769,7 @@ var readability = { |
| s-=1; |
| sl-=1; |
| } |
| - |
| + |
| /* To ensure a node does not interfere with readability styles, remove its classnames */ |
| nodeToAppend.className = ""; |
| @@ -779,15 +783,15 @@ var readability = { |
| **/ |
| readability.distilledArticleContent = articleContent.cloneNode(true); |
| //readability.prepArticle(articleContent); |
| - |
| + |
| if (readability.curPageNum === 1) { |
| var newNode = document.createElement('div'); |
| newNode.id = "readability-page-1"; |
| newNode.setAttribute("class", "page"); |
| readability.moveNodeInnards(articleContent, newNode); |
| articleContent.appendChild(newNode); |
| - } |
| - |
| + } |
| + |
| /** |
| * Now that we've gone through the full algorithm, check to see if we got any meaningful content. |
| * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher |
| @@ -813,7 +817,7 @@ var readability = { |
| return articleContent; |
| }, |
| - |
| + |
| /** |
| * Removes script tags from the document. |
| * |
| @@ -828,12 +832,12 @@ var readability = { |
| scripts[i].nodeValue=""; |
| scripts[i].removeAttribute('src'); |
| if (scripts[i].parentNode) { |
| - scripts[i].parentNode.removeChild(scripts[i]); |
| + scripts[i].parentNode.removeChild(scripts[i]); |
| } |
| } |
| } |
| }, |
| - |
| + |
| /** |
| * Get the inner text of a node - cross browser compatibly. |
| * This also strips out any excess whitespace to be found. |
| @@ -896,18 +900,18 @@ var readability = { |
| if ( cur.nodeType === 1 ) { |
| // Remove style attribute(s) : |
| if(cur.className !== "readability-styled") { |
| - cur.removeAttribute("style"); |
| + cur.removeAttribute("style"); |
| } |
| readability.cleanStyles( cur ); |
| } |
| cur = cur.nextSibling; |
| - } |
| + } |
| }, |
| - |
| + |
| /** |
| * Get the density of links as a percentage of the content |
| * This is the amount of text that is inside a link divided by the total text in the node. |
| - * |
| + * |
| * @param Element |
| * @return number (float) |
| **/ |
| @@ -918,11 +922,11 @@ var readability = { |
| for(var i=0, il=links.length; i<il;i+=1) |
| { |
| linkLength += readability.getInnerText(links[i]).length; |
| - } |
| + } |
| return linkLength / textLength; |
| }, |
| - |
| + |
| /** |
| * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness. |
| * |
| @@ -944,10 +948,10 @@ var readability = { |
| /* If the type isn't alpha-only, it's probably not actually a file extension. */ |
| if(!possibleType.match(/[^a-zA-Z]/)) { |
| - segment = segment.split(".")[0]; |
| + segment = segment.split(".")[0]; |
| } |
| } |
| - |
| + |
| /** |
| * EW-CMS specific segment replacement. Ugly. |
| * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html |
| @@ -968,7 +972,7 @@ var readability = { |
| if (i < 2 && segment.match(/^\d{1,2}$/)) { |
| del = true; |
| } |
| - |
| + |
| /* If this is the first segment and it's just "index", remove it. */ |
| if(i === 0 && segment.toLowerCase() === "index") { |
| del = true; |
| @@ -992,7 +996,7 @@ var readability = { |
| /** |
| * Look for any paging links that may occur within the document. |
| - * |
| + * |
| * @param body |
| * @return object (array) |
| **/ |
| @@ -1008,7 +1012,7 @@ var readability = { |
| * |
| * Also possible: levenshtein distance? longest common subsequence? |
| * |
| - * After we do that, assign each page a score, and |
| + * After we do that, assign each page a score, and |
| **/ |
| for(var i = 0, il = allLinks.length; i < il; i+=1) { |
| var link = allLinks[i], |
| @@ -1018,12 +1022,12 @@ var readability = { |
| if(linkHref === "" || linkHref === articleBaseUrl || linkHref === window.location.href || linkHref in readability.parsedPages) { |
| continue; |
| } |
| - |
| + |
| /* If it's on a different domain, skip it. */ |
| if(window.location.host !== linkHref.split(/\/+/g)[1]) { |
| continue; |
| } |
| - |
| + |
| var linkText = readability.getInnerText(link); |
| /* If the linkText looks like it's not the next page, skip it. */ |
| @@ -1036,9 +1040,9 @@ var readability = { |
| if(!linkHrefLeftover.match(/\d/)) { |
| continue; |
| } |
| - |
| + |
| if(!(linkHref in possiblePages)) { |
| - possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref}; |
| + possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref}; |
| } else { |
| possiblePages[linkHref].linkText += ' | ' + linkText; |
| } |
| @@ -1060,7 +1064,7 @@ var readability = { |
| if(linkData.match(/pag(e|ing|inat)/i)) { |
| linkObj.score += 25; |
| } |
| - if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text, |
| + if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text, |
| /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */ |
| if(!linkObj.linkText.match(readability.regexps.nextLink)) { |
| linkObj.score -= 65; |
| @@ -1087,10 +1091,10 @@ var readability = { |
| /* If this is just something like "footer", give it a negative. If it's something like "body-and-footer", leave it be. */ |
| if(!parentNodeClassAndId.match(readability.regexps.positive)) { |
| linkObj.score -= 25; |
| - negativeNodeMatch = true; |
| + negativeNodeMatch = true; |
| } |
| } |
| - |
| + |
| parentNode = parentNode.parentNode; |
| } |
| @@ -1152,7 +1156,7 @@ var readability = { |
| dbg('NEXT PAGE IS ' + nextHref); |
| readability.parsedPages[nextHref] = true; |
| - return nextHref; |
| + return nextHref; |
| } |
| else { |
| return null; |
| @@ -1204,7 +1208,7 @@ var readability = { |
| if (typeof options === 'undefined') { options = {}; } |
| request.onreadystatechange = respondToReadyState; |
| - |
| + |
| request.open('get', url, true); |
| request.setRequestHeader('Accept', 'text/html'); |
| @@ -1239,7 +1243,7 @@ var readability = { |
| articlePage.appendChild(linkDiv); |
| return; |
| } |
| - |
| + |
| /** |
| * Now that we've built the article page DOM element, get the page content |
| * asynchronously and load the cleaned content into the div we created for it. |
| @@ -1257,7 +1261,7 @@ var readability = { |
| return; |
| } else { |
| readability.pageETags[eTag] = 1; |
| - } |
| + } |
| } |
| // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away. |
| @@ -1308,7 +1312,7 @@ var readability = { |
| } |
| } |
| } |
| - |
| + |
| readability.removeScripts(content); |
| readability.moveNodeInnards(content, thisPage); |
| @@ -1330,9 +1334,9 @@ var readability = { |
| }); |
| }(nextPageLink, articlePage)); |
| }, |
| - |
| + |
| /** |
| - * Get an elements class/id weight. Uses regular expressions to tell if this |
| + * Get an elements class/id weight. Uses regular expressions to tell if this |
| * element looks good or bad. |
| * |
| * @param Element |
| @@ -1382,7 +1386,7 @@ var readability = { |
| var allElements = e.getElementsByTagName('*'); |
| while (i < allElements.length) { |
| readability.deleteExtraBreaks(allElements[i]); |
| - i++; |
| + i++; |
| } |
| }, |
| @@ -1397,7 +1401,7 @@ var readability = { |
| clean: function (e, tag) { |
| var targetList = e.getElementsByTagName( tag ); |
| var isEmbed = (tag === 'object' || tag === 'embed'); |
| - |
| + |
| for (var y=targetList.length-1; y >= 0; y-=1) { |
| /* Allow youtube and vimeo videos through as people usually want to see those. */ |
| if(isEmbed) { |
| @@ -1405,7 +1409,7 @@ var readability = { |
| for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) { |
| attributeValues += targetList[y].attributes[i].value + '|'; |
| } |
| - |
| + |
| /* First, check the elements attributes to see if any of them contain youtube or vimeo */ |
| if (attributeValues.search(readability.regexps.videos) !== -1) { |
| continue; |
| @@ -1415,13 +1419,13 @@ var readability = { |
| if (targetList[y].innerHTML.search(readability.regexps.videos) !== -1) { |
| continue; |
| } |
| - |
| + |
| } |
| targetList[y].parentNode.removeChild(targetList[y]); |
| } |
| }, |
| - |
| + |
| /** |
| * Clean an element of all tags of type "tag" if they look fishy. |
| * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. |
| @@ -1446,7 +1450,7 @@ var readability = { |
| for (var i=curTagsLength-1; i >= 0; i-=1) { |
| var weight = readability.getClassWeight(tagsList[i]); |
| var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0; |
| - |
| + |
| dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'undefined') ? (" with score " + tagsList[i].readability.contentScore) : '')); |
| if(weight+contentScore < 0) |
| @@ -1467,7 +1471,7 @@ var readability = { |
| var embeds = tagsList[i].getElementsByTagName("embed"); |
| for(var ei=0,il=embeds.length; ei < il; ei+=1) { |
| if (embeds[ei].src.search(readability.regexps.videos) === -1) { |
| - embedCount+=1; |
| + embedCount+=1; |
| } |
| } |
| @@ -1480,7 +1484,7 @@ var readability = { |
| } else if(li > p && tag !== "ul" && tag !== "ol") { |
| toRemove = true; |
| } else if( input > Math.floor(p/3) ) { |
| - toRemove = true; |
| + toRemove = true; |
| } else if(contentLength < 25 && (img === 0 || img > 2) ) { |
| toRemove = true; |
| } else if(weight < 25 && linkDensity > 0.2) { |
| @@ -1522,7 +1526,7 @@ var readability = { |
| addFlag: function(flag) { |
| readability.flags = readability.flags | flag; |
| }, |
| - |
| + |
| removeFlag: function(flag) { |
| readability.flags = readability.flags & ~flag; |
| }, |
| @@ -1591,7 +1595,7 @@ var readability = { |
| } |
| return ret; |
| }, |
| - |
| + |
| // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a |
| // <P> node, and makes all next siblings of that pair children of <P>, up |
| // until the next pair of <BR> nodes is reached. |
| @@ -1600,7 +1604,7 @@ var readability = { |
| var second = readability.isMultipleBr(node, true); |
| if (!second) { |
| return; |
| - } |
| + } |
| // Make all next siblings of the second BR into children of a P. |
| var p = document.createElement('p'); |
| var curr = second.nextSibling; |
| @@ -1613,7 +1617,7 @@ var readability = { |
| curr = next; |
| } |
| var ret = curr; |
| - |
| + |
| // Remove all nodes between the first and second BR. |
| curr = node.nextSibling; |
| while (curr && curr != second) { |
| @@ -1625,10 +1629,10 @@ var readability = { |
| second.parentNode.removeChild(second); |
| // Replace the first BR with the P. |
| node.parentNode.replaceChild(p, node); |
| - |
| + |
| return ret; |
| }, |
| - |
| + |
| // Returns true if the NodeList contains a double <BR>. |
| hasDoubleBr: function(nodeList) { |
| for (var i = 0; i < nodeList.length; nodeList++) { |
| @@ -1637,8 +1641,8 @@ var readability = { |
| } |
| } |
| return false; |
| - }, |
| - |
| + }, |
| + |
| // Replaces double <BR> tags with <P> tags. |
| replaceDoubleBrsWithPs: function(node) { |
| var allElements = node.getElementsByTagName('BR'); |
| @@ -1652,8 +1656,8 @@ var readability = { |
| allElements = document.body.getElementsByTagName('BR'); |
| } |
| }, |
| - |
| - |
| + |
| + |
| // Replaces a BR and the whitespace that follows it with a P. |
| replaceBrWithP: function(node) { |
| if (!readability.isBrNode(node)) { |
| @@ -1673,7 +1677,7 @@ var readability = { |
| node.parentNode.replaceChild(p, node); |
| return curr; |
| }, |
| - |
| + |
| // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> tag |
| // children of the <P>. |
| replaceBrsWithPs: function(node) { |
| @@ -1687,27 +1691,27 @@ var readability = { |
| allElements = document.body.getElementsByTagName('BR'); |
| } |
| }, |
| - |
| + |
| // Replaces any tag with any other tag. |
| replaceTagsWithTags: function(node, srcTag, destTag) { |
| var allElements = node.getElementsByTagName(srcTag); |
| for (var i = 0; i < allElements.length; i++) { |
| var dest = document.createElement(destTag); |
| readability.moveNodeInnards(allElements[i], dest); |
| - node.replaceNode(dest, allElements[i]); |
| + allElements[i].parentNode.replaceChild(dest, allElements[i]); |
| } |
| }, |
| - |
| + |
| // Replaces all <noscript> tags with <p> tags. |
| replaceNoscriptsWithPs: function(node) { |
| readability.replaceTagsWithTags(node, 'noscript', 'p'); |
| }, |
| - |
| + |
| // Replaces all <font> tags with <span> tags. |
| replaceFontsWithSpans: function(node) { |
| readability.replaceTagsWithTags(node, 'font', 'span'); |
| }, |
| - |
| + |
| // Returns a list of image URLs in the distilled article. |
| getImages : function() { |
| var images = document.getElementsByTagName('img'); |
| @@ -1719,10 +1723,15 @@ var readability = { |
| } |
| return result; |
| }, |
| - |
| + |
| // Returns the distilled article HTML from the page(s). |
| getDistilledArticleHTML : function() { |
| return readability.distilledHTML; |
| + }, |
| + |
| + // Returns the next page of this article. |
| + getNextPageLink : function() { |
| + return readability.nextPageLink; |
| } |
| }; |
| @@ -1730,12 +1739,13 @@ var readability = { |
| // element is the article title, the second element is HTML containing the |
| // long-form content, and remaining elements are URLs for images referenced by |
| // that HTML. Each <img> tag in the HTML has an id field set to k - 2, which |
| -// corresponds to a URL listed at index k in the array returned. |
| +// corresponds to a URL listed at index k in the array returned. |
| (function () { |
| readability.init(); |
| - var result = new Array(2); |
| + var result = new Array(3); |
| result[0] = readability.getArticleTitle(); |
|
| cjhopman 2014/01/29 21:44:13: We should probably change this to a dictionary at |
| shashi 2014/01/29 22:51:37: I actually tried to change it to dictionary, but m |
| result[1] = readability.getDistilledArticleHTML(); |
| + result[2] = readability.getNextPageLink(); |
| return result.concat(readability.getImages()); |
| }()) |
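
For context on how the patched entry point might be consumed, here is a minimal sketch, assuming a hypothetical caller on the embedder side; the helper name and field names are illustrative and not part of this patch. It only restates the array layout produced by the IIFE above: title, distilled HTML, next-page link, then image URLs.

// Minimal sketch of unpacking the array the IIFE above evaluates to.
// Assumes: result = [title, distilledHTML, nextPageLink, imageUrl0, imageUrl1, ...]
// The helper name "unpackDistillerResult" is hypothetical, not part of the patch.
function unpackDistillerResult(result) {
  return {
    title: result[0],
    html: result[1],
    // Empty string or null when findNextPageLink() detected no next page.
    nextPageLink: result[2],
    // Remaining entries are the image URLs from readability.getImages().
    images: result.slice(3)
  };
}

As the review thread above suggests, a dictionary-shaped return value would make this indexing self-describing; the patch as written keeps the positional array form.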