java/org/chromium/distiller/PageParameterParser.java - Issue 1178633002: implement parser for new pagination algorithm

Side by Side Diff: java/org/chromium/distiller/PageParameterParser.java

Issue 1178633002: implement parser for new pagination algorithm (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master

Patch Set: addr wychen's comments Created 5 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 // Copyright 2015 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 package org.chromium.distiller;

	6

	7 import org.chromium.distiller.proto.DomDistillerProtos;

	8 import org.chromium.distiller.proto.DomDistillerProtos.TimingInfo;

	9

	10 import com.google.gwt.dom.client.AnchorElement;

	11 import com.google.gwt.dom.client.Document;

	12 import com.google.gwt.dom.client.Element;

	13 import com.google.gwt.dom.client.Node;

	14 import com.google.gwt.dom.client.NodeList;

	15 import com.google.gwt.dom.client.Style;

	16 import com.google.gwt.regexp.shared.MatchResult;

	17 import com.google.gwt.regexp.shared.RegExp;

	18

	19 /**

	20 * Background:

	21 * The long article/news/forum thread/blog document may be partitioned into se veral partial pages

	22 * by webmaster. Each partial page has outlinks pointing to the adjacent part ial pages. The

	23 * anchor text of those outlinks is numeric.

	24 *

	25 * This class parses the document to collect groups of adjacent plain text numbe rs and outlinks with

	26 * digital anchor text. These are then passed to PageParameterParser which woul d spit out the

	27 * pagination URLs if available.

	28 */

	29 public class PageParameterParser {

	30 // If the numeric value of a link's anchor text is greater than this number, we don't think it

	31 // represents the page number of the link.

	32 private static final int MAX_NUM_FOR_PAGE_PARAM = 100;

	33

	34 /**

	35 * Stores PageParamInfo.PageInfo and the anchor's text, specifically returne d by

	36 * getPageInfoAndText().

	37 */

	38 private static class PageInfoAndText {

	39 private final PageParamInfo.PageInfo mPageInfo;

	40 private final String mText;

	41

	42 PageInfoAndText(int number, String url, String text) {

	43 mPageInfo = new PageParamInfo.PageInfo(number, url);

	44 mText = text;

	45 }

	46 }

	47

	48 /**

	49 * Entry point for PageParameterParser.

	50 * Parses the document to collect outlinks with numeric anchor text and nume ric text around

	51 * them. These are then passed to PageParameterParser to detect pagination URLs.

	52 *

	53 * @return PageParamInfo (see PageParamInfo.java), always. If no page param eter is detected or

	54 * determined to be best, its mType is PageParamInfo.Type.UNSET.

	55 *

	56 * @param originalUrl the original URL of the document to be parsed.

	57 * @param timingInfo for tracking performance.

	58 */

	59 public static PageParamInfo parse(String originalUrl, TimingInfo timingInfo) {

	60 PageParameterParser parser = new PageParameterParser(timingInfo);

	61 return parser.parseDocument(Document.get().getDocumentElement(), origina lUrl);

	62 }

	63

	64 private final TimingInfo mTimingInfo;

	65 private String mDocUrl = "";

	66 private ParsedUrl mParsedUrl = null;

	67 private final MonotonicPageInfosGroups mAdjacentNumbersGroups = new Monotoni cPageInfosGroups();

	68 private int mNumForwardLinksProcessed = 0;

	69

	70 private static RegExp sHrefCleaner = RegExp.compile("\\/$");

	71 private static RegExp sInvalidParentWrapper = null;

	72

	73 private PageParameterParser(TimingInfo timingInfo) {

	74 mTimingInfo = timingInfo;

	75 }

	76

	77 /**

	78 * Acutually implements PageParameterParser.parse(), see above description f or parse().

	79 */

	80 private PageParamInfo parseDocument(Element root, String originalUrl) {

	81 double startTime = DomUtil.getTime();

	82

	83 mDocUrl = sHrefCleaner.replace(originalUrl, "");

	84 mParsedUrl = ParsedUrl.create(mDocUrl);

	85 if (mParsedUrl == null) return new PageParamInfo(); // Invalid document URL.

	86

	87 AnchorElement baseAnchor = PagingLinksFinder.createAnchorWithBase(

	88 PagingLinksFinder.getBaseUrlForRelative(root, originalUrl));

	89

	90 NodeList<Element> allLinks = root.getElementsByTagName("A");

	91 int idx = 0;

	92 while (idx < allLinks.getLength()) {

	93 final AnchorElement link = AnchorElement.as(allLinks.getItem(idx));

	94 PageInfoAndText pageInfoAndText = getPageInfoAndText(link, baseAncho r);

	95 if (pageInfoAndText == null) {

	96 idx++;

	97 continue;

	98 }

	99

	100 // This link is a good candidate for pagination.

	101

	102 // Close current group of adjacent numbers, add a new group if neces sary.

	103 mAdjacentNumbersGroups.addGroup();

	104

	105 // Before we append the link to the new group of adjacent numbers, c heck if it's

	106 // preceded by a text node with numeric text; if so, add it before t he link.

	107 findAndAddClosestValidLeafNodes(link, false, true, null);

	108

	109 // Add the link to the current group of adjacent numbers.

	110 mAdjacentNumbersGroups.addPageInfo(pageInfoAndText.mPageInfo);

	111

	112 // Add all following text nodes and links with numeric text.

	113 mNumForwardLinksProcessed = 0;

	114 findAndAddClosestValidLeafNodes(link, false, false, baseAnchor);

	115

	116 // Skip the current link and links already processed in the forward

	117 // findandAddClosestValidLeafNodes().

	118 idx += 1 + mNumForwardLinksProcessed;

	119 } // while there're links.

	120

	121 mAdjacentNumbersGroups.cleanup();

	122

	123 LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterParser");

	124

	125 startTime = DomUtil.getTime();

	126 PageParamInfo info = PageParameterDetector.detect(mAdjacentNumbersGroups , mDocUrl);

	127 LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterDetector");

	128 return info;

	129 }

	130

	131 /**

	132 * @return a populated PageInfoAndText if given link is to be added to mAdja centNumbersGroups.

	133 * Otherwise, returns null if link is to be ignored.

	134 * "javascript:" links with numeric text are considered valid links to be ad ded.

	135 *

	136 * @param link to process.

	137 * @param baseAnchor created for the current document.

	138 */

	139 private PageInfoAndText getPageInfoAndText(AnchorElement link, AnchorElement baseAnchor) {

	140 // Ignore invisible links.

	141 if (!DomUtil.isVisible(link)) return null;

	142

	143 // Use javascript innerText (instead of javascript textContent) to only get visible text.

	144 String linkText = StringUtil.jsTrim(DomUtil.getInnerText(link));

	145 int number = linkTextToNumber(linkText);

	146 if (!isPlainPageNumber(number)) return null;

	147

	148 String linkHref = resolveLinkHref(link, baseAnchor);

	149 final boolean isEmptyHref = linkHref.isEmpty();

	150 boolean isJavascriptLink = false;

	151 ParsedUrl url = null;

	152 if (!isEmptyHref) {

	153 isJavascriptLink = isJavascriptHref(linkHref);

	154 url = ParsedUrl.create(linkHref);

	155 if (url == null \|\|

	156 (!isJavascriptLink && !url.getHost().equalsIgnoreCase(mParse dUrl.getHost()))) {

	157 return null;

	158 }

	159 url.setHash("");

	160 }

	161

	162 if (isEmptyHref \|\| isJavascriptLink \|\| isDisabledLink(link)) {

	163 return new PageInfoAndText(number, "", linkText);

	164 }

	165

	166 return new PageInfoAndText(number, sHrefCleaner.replace(url.toString(), ""), linkText);

	167 }

	168

	169 /**

	170 * Finds and adds the leaf node(s) closest to the given start node.

	171 * This recurses and keeps finding and, if necessary, adding the numeric tex t of valid nodes,

	172 * collecting the PageParamInfo.PageInfo's for the current adjacency group.

	173 * For backward search, i.e. nodes before start node, search terminates (i.e . recursion stops)

	174 * once a text node or anchor is encountered. If the text node contains num eric text, it's

	175 * added to the current adjacency group. Otherwise, a new group is created to break the

	176 * adjacency.

	177 * For forward search, i.e. nodes after start node, search continues (i.e. r ecursion continues)

	178 * until a text node or anchor with non-numeric text is encountered. In the process, text nodes

	179 * and anchors with numeric text are added to the current adjaency group. W hen a non-numeric

	180 * text node or anchor is encountered, a new group is started to break the a djacency, and search

	181 * ends.

	182 *

	183 * @return true to continue search, false to stop.

	184 *

	185 * @param start node to work on.

	186 * @param checkStart true to check start node. Otherwise, the previous or n ext sibling of the

	187 * start node is checked.

	188 * @param backward true to search backward (i.e. nodes before start node), f alse to search

	189 * forward (i.e. nodes after start node).

	190 * @param baseAnchor created for the current document, only needed for forwa rd search.

	191 */

	192 private boolean findAndAddClosestValidLeafNodes(Node start, boolean checkSta rt,

	193 boolean backward, AnchorElement baseAnchor) {

	194 Node node = checkStart ? start :

	195 (backward ? start.getPreviousSibling() : start.getNextSibling()) ;

	196 if (node == null) { // No sibling, try parent.

	197 node = start.getParentNode();

	198 if (sInvalidParentWrapper == null) {

	199 sInvalidParentWrapper = RegExp.compile("(BODY)\|(HTML)");

	200 }

	201 if (sInvalidParentWrapper.test(node.getNodeName())) return false;

	202 return findAndAddClosestValidLeafNodes(node, false, backward, baseAn chor);

	203 }

	204

	205 checkStart = false;

	206 switch (node.getNodeType()) {

	207 case Node.TEXT_NODE:

	208 String text = node.getNodeValue();

	209 // Text must contain words.

	210 if (text.isEmpty() \|\| StringUtil.countWords(text) == 0) break;

	211 boolean added = addNonLinkTextIfValid(node.getNodeValue());

	212 // For backward search, we're done regardless if text was added.

	213 // For forward search, we're done only if text was invalid, othe rwise continue.

	214 if (backward \|\| !added) return false;

	215 break;

	216

	217 case Node.ELEMENT_NODE:

	218 Element e = Element.as(node);

	219 if (e.hasTagName("A")) {

	220 // For backward search, we're done because we've already pro cessed the anchor.

	221 if (backward) return false;

	222 // For forward search, we're done only if link was invalid, otherwise continue.

	223 mNumForwardLinksProcessed++;

	224 if (!addLinkIfValid(AnchorElement.as(e), baseAnchor)) return false;

	225 break;

	226 }

	227 // Intentionally fall through.

	228

	229 default:

	230 // Check children nodes.

	231 if (!node.hasChildNodes()) break;

	232 checkStart = true; // We want to check the child node.

	233 if (backward) {

	234 // Start the backward search with the rightmost child i.e. l ast and closest to

	235 // given node.

	236 node = node.getLastChild();

	237 } else {

	238 // Start the forward search with the leftmost child i.e. fir st and closest to

	239 // given node.

	240 node = node.getFirstChild();

	241 }

	242 break;

	243 }

	244

	245 return findAndAddClosestValidLeafNodes(node, checkStart, backward, baseA nchor);

	246 }

	247

	248 private static RegExp sTermsRegExp = null; // Match terms i.e. words.

	249 private static RegExp sSurroundingDigitsRegExp = null; // Match term with o nly digits.

	250

	251 /**

	252 * Handle the text for a non-link node. Each numeric term in the text that is a valid plain

	253 * page number adds a PageParamInfo.PageInfo into the current adjacent group . All other terms

	254 * break the adjacency in the current group, adding a new group instead.

	255 *

	256 * @Return true if text was added to current group of adjacent numbers. Oth erwise, false with

	257 * a new group created to break the current adjacency.

	258 */

	259 private boolean addNonLinkTextIfValid(String text) {

	260 if (!StringUtil.containsDigit(text)) {

	261 // The text does not contain valid number(s); if necessary, current group of adjacent

	262 // numbers should be closed, adding a new group if possible.

	263 mAdjacentNumbersGroups.addGroup();

	264 return false;

	265 }

	266

	267 if (sTermsRegExp == null) {

	268 sTermsRegExp = RegExp.compile("(\\S[\\w\u00C0-\u1FFF\u2C00-\uD7FF]\ \S)", "gi");

	269 } else {

	270 sTermsRegExp.setLastIndex(0);

	271 }

	272 if (sSurroundingDigitsRegExp == null) {

	273 sSurroundingDigitsRegExp = RegExp.compile("^[\\W_](\\d+)[\\W_]$", "i");

	274 }

	275

	276 // Extract terms from the text, differentiating between those that conta in only digits and

	277 // those that contain non-digits.

	278 boolean added = false;

	279 while (true) {

	280 MatchResult match = sTermsRegExp.exec(text);

	281 if (match == null) break;

	282 if (match.getGroupCount() <= 1) continue;

	283

	284 String term = match.getGroup(1);

	285 MatchResult termWithDigits = sSurroundingDigitsRegExp.exec(term);

	286 int number = -1;

	287 if (termWithDigits != null && termWithDigits.getGroupCount() > 1) {

	288 number = StringUtil.toNumber(termWithDigits.getGroup(1));

	289 }

	290 if (isPlainPageNumber(number)) {

	291 // This text is a valid candidate of plain text page number, add it to last group of

	292 // adjacent numbers.

	293 mAdjacentNumbersGroups.addNumber(number, "");

	294 added = true;

	295 } else {

	296 // The text is not a valid number, so current group of adjacent numbers should be

	297 // closed, adding a new group if possible.

	298 mAdjacentNumbersGroups.addGroup();

	299 }

	300 } // while there're matches

	301

	302 return added;

	303 }

	304

	305 /**

	306 * Adds PageParamInfo.PageInfo to the current adjacent group for a link if i ts text is numeric.

	307 * Otherwise, add a new group to break the adjacency.

	308 *

	309 * @Return true if link was added, false otherwise.

	310 */

	311 private boolean addLinkIfValid(AnchorElement link, AnchorElement baseAnchor) {

	312 PageInfoAndText pageInfoAndText = getPageInfoAndText(link, baseAnchor);

	313 if (pageInfoAndText != null) {

	314 mAdjacentNumbersGroups.addPageInfo(pageInfoAndText.mPageInfo);

	315 return true;

	316 }

	317 mAdjacentNumbersGroups.addGroup();

	318 return false;

	319 }

	320

	321 /**

	322 * @return true if link is disabled i.e. not clickable because it has a text cursor.

	323 */

	324 private static boolean isDisabledLink(AnchorElement link) {

	325 Style style = DomUtil.getComputedStyle(link);

	326 return Style.Cursor.valueOf(style.getCursor().toUpperCase()) == Style.Cu rsor.TEXT;

	327 }

	328

	329 /**

	330 * @return true if href starts with "javascript:".

	331 */

	332 private static boolean isJavascriptHref(String href) {

	333 return href.startsWith("javascript:");

	334 }

	335

	336 private static String resolveLinkHref(AnchorElement link, AnchorElement base Anchor) {

	337 // Anchors without "href" attribute are not considered potential paginat ion links.

	338 String linkHref = link.getAttribute("href");

	339 if (linkHref.isEmpty()) return "";

	340 baseAnchor.setAttribute("href", linkHref);

	341 return baseAnchor.getHref();

	342 }

	343

	344 private static int linkTextToNumber(String linkText) {

	345 linkText = linkText.replaceAll("[()\\[\\]{}]", "");

	346 linkText = linkText.trim(); // Remove leading and trailing whitespaces.

	347 return StringUtil.toNumber(linkText);

	348 }

	349

	350 /**

	351 * @returns true if number is >= 0 && < MAX_NUM_FOR_PAGE_PARAM.

	352 */

	353 private static boolean isPlainPageNumber(int number) {

	354 return number >= 0 && number <= MAX_NUM_FOR_PAGE_PARAM;

	355 }

	356

	357 }

OLD	NEW

« no previous file with comments | « java/org/chromium/distiller/MonotonicPageInfosGroups.java ('k') | java/org/chromium/distiller/ParsedUrl.java » ('j') | no next file with comments »