OLD | NEW |
(Empty) | |
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 package org.chromium.distiller; |
| 6 |
| 7 import org.chromium.distiller.proto.DomDistillerProtos; |
| 8 import org.chromium.distiller.proto.DomDistillerProtos.TimingInfo; |
| 9 |
| 10 import com.google.gwt.dom.client.AnchorElement; |
| 11 import com.google.gwt.dom.client.Document; |
| 12 import com.google.gwt.dom.client.Element; |
| 13 import com.google.gwt.dom.client.Node; |
| 14 import com.google.gwt.dom.client.NodeList; |
| 15 import com.google.gwt.dom.client.Style; |
| 16 import com.google.gwt.regexp.shared.MatchResult; |
| 17 import com.google.gwt.regexp.shared.RegExp; |
| 18 |
| 19 /** |
| 20 * Background: |
| 21 * A long article, news story, forum thread or blog document may be partitioned into several partial pages |
| 22 * by the webmaster. Each partial page has outlinks pointing to the adjacent partial pages. The |
| 23 * anchor text of those outlinks is numeric. |
| 24 * |
| 25 * This class parses the document to collect groups of adjacent plain text numbers and outlinks with |
| 26 * numeric anchor text. These are then passed to PageParameterDetector, which returns the |
| 27 * pagination URL information if available. |
| 28 */ |
| 29 public class PageParameterParser { |
| 30 // If the numeric value of a link's anchor text is greater than this number,
we don't think it |
| 31 // represents the page number of the link. |
| 32 private static final int MAX_NUM_FOR_PAGE_PARAM = 100; |
| 33 |
/**
 * Stores PageParamInfo.PageInfo and the anchor's text, specifically returned by
 * getPageInfoAndText().
 */
private static class PageInfoAndText {
    // Page number and URL pair built from the constructor arguments.
    private final PageParamInfo.PageInfo mPageInfo;
    // The text passed in alongside the number and URL.
    private final String mText;

    /**
     * @param number page number stored in the PageInfo.
     * @param url URL stored in the PageInfo.
     * @param text text kept next to the PageInfo.
     */
    PageInfoAndText(int number, String url, String text) {
        mPageInfo = new PageParamInfo.PageInfo(number, url);
        mText = text;
    }
}
| 47 |
| 48 /** |
| 49 * Entry point for PageParameterParser. |
| 50 * Parses the document to collect outlinks with numeric anchor text and nume
ric text around |
| 51 * them. These are then passed to PageParameterParser to detect pagination
URLs. |
| 52 * |
| 53 * @return PageParamInfo (see PageParamInfo.java), always. If no page param
eter is detected or |
| 54 * determined to be best, its mType is PageParamInfo.Type.UNSET. |
| 55 * |
| 56 * @param originalUrl the original URL of the document to be parsed. |
| 57 * @param timingInfo for tracking performance. |
| 58 */ |
| 59 public static PageParamInfo parse(String originalUrl, TimingInfo timingInfo)
{ |
| 60 PageParameterParser parser = new PageParameterParser(timingInfo); |
| 61 return parser.parseDocument(Document.get().getDocumentElement(), origina
lUrl); |
| 62 } |
| 63 |
// Timing record supplied by the caller; parseDocument() logs its durations into it.
private final TimingInfo mTimingInfo;
// Document URL after sHrefCleaner has been applied; set in parseDocument().
private String mDocUrl = "";
// Parsed form of mDocUrl; its host is compared against the host of each candidate link.
private ParsedUrl mParsedUrl = null;
// Groups of adjacent numbers collected while walking the document; passed to
// PageParameterDetector.detect() at the end of parseDocument().
private final MonotonicPageInfosGroups mAdjacentNumbersGroups = new MonotonicPageInfosGroups();
// Number of anchors met during the latest forward search; parseDocument() uses it to skip
// links that were already handled.
private int mNumForwardLinksProcessed = 0;

// Matches a single trailing "/" so it can be removed from URLs.
private static RegExp sHrefCleaner = RegExp.compile("\\/$");
// Compiled lazily in findAndAddClosestValidLeafNodes(); tested against a parent's node name to
// decide where the upward climb stops.
private static RegExp sInvalidParentWrapper = null;
/**
 * Private constructor; instances are created by the static parse() entry point.
 *
 * @param timingInfo timing record stored for use by parseDocument().
 */
private PageParameterParser(TimingInfo timingInfo) {
    mTimingInfo = timingInfo;
}
| 76 |
| 77 /** |
| 78 * Acutually implements PageParameterParser.parse(), see above description f
or parse(). |
| 79 */ |
| 80 private PageParamInfo parseDocument(Element root, String originalUrl) { |
| 81 double startTime = DomUtil.getTime(); |
| 82 |
| 83 mDocUrl = sHrefCleaner.replace(originalUrl, ""); |
| 84 mParsedUrl = ParsedUrl.create(mDocUrl); |
| 85 if (mParsedUrl == null) return new PageParamInfo(); // Invalid document
URL. |
| 86 |
| 87 AnchorElement baseAnchor = PagingLinksFinder.createAnchorWithBase( |
| 88 PagingLinksFinder.getBaseUrlForRelative(root, originalUrl)); |
| 89 |
| 90 NodeList<Element> allLinks = root.getElementsByTagName("A"); |
| 91 int idx = 0; |
| 92 while (idx < allLinks.getLength()) { |
| 93 final AnchorElement link = AnchorElement.as(allLinks.getItem(idx)); |
| 94 PageInfoAndText pageInfoAndText = getPageInfoAndText(link, baseAncho
r); |
| 95 if (pageInfoAndText == null) { |
| 96 idx++; |
| 97 continue; |
| 98 } |
| 99 |
| 100 // This link is a good candidate for pagination. |
| 101 |
| 102 // Close current group of adjacent numbers, add a new group if neces
sary. |
| 103 mAdjacentNumbersGroups.addGroup(); |
| 104 |
| 105 // Before we append the link to the new group of adjacent numbers, c
heck if it's |
| 106 // preceded by a text node with numeric text; if so, add it before t
he link. |
| 107 findAndAddClosestValidLeafNodes(link, false, true, null); |
| 108 |
| 109 // Add the link to the current group of adjacent numbers. |
| 110 mAdjacentNumbersGroups.addPageInfo(pageInfoAndText.mPageInfo); |
| 111 |
| 112 // Add all following text nodes and links with numeric text. |
| 113 mNumForwardLinksProcessed = 0; |
| 114 findAndAddClosestValidLeafNodes(link, false, false, baseAnchor); |
| 115 |
| 116 // Skip the current link and links already processed in the forward |
| 117 // findandAddClosestValidLeafNodes(). |
| 118 idx += 1 + mNumForwardLinksProcessed; |
| 119 } // while there're links. |
| 120 |
| 121 mAdjacentNumbersGroups.cleanup(); |
| 122 |
| 123 LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterParser"); |
| 124 |
| 125 startTime = DomUtil.getTime(); |
| 126 PageParamInfo info = PageParameterDetector.detect(mAdjacentNumbersGroups
, mDocUrl); |
| 127 LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterDetector"); |
| 128 return info; |
| 129 } |
| 130 |
| 131 /** |
| 132 * @return a populated PageInfoAndText if given link is to be added to mAdja
centNumbersGroups. |
| 133 * Otherwise, returns null if link is to be ignored. |
| 134 * "javascript:" links with numeric text are considered valid links to be ad
ded. |
| 135 * |
| 136 * @param link to process. |
| 137 * @param baseAnchor created for the current document. |
| 138 */ |
| 139 private PageInfoAndText getPageInfoAndText(AnchorElement link, AnchorElement
baseAnchor) { |
| 140 // Ignore invisible links. |
| 141 if (!DomUtil.isVisible(link)) return null; |
| 142 |
| 143 // Use javascript innerText (instead of javascript textContent) to only
get visible text. |
| 144 String linkText = StringUtil.jsTrim(DomUtil.getInnerText(link)); |
| 145 int number = linkTextToNumber(linkText); |
| 146 if (!isPlainPageNumber(number)) return null; |
| 147 |
| 148 String linkHref = resolveLinkHref(link, baseAnchor); |
| 149 final boolean isEmptyHref = linkHref.isEmpty(); |
| 150 boolean isJavascriptLink = false; |
| 151 ParsedUrl url = null; |
| 152 if (!isEmptyHref) { |
| 153 isJavascriptLink = isJavascriptHref(linkHref); |
| 154 url = ParsedUrl.create(linkHref); |
| 155 if (url == null || |
| 156 (!isJavascriptLink && !url.getHost().equalsIgnoreCase(mParse
dUrl.getHost()))) { |
| 157 return null; |
| 158 } |
| 159 url.setHash(""); |
| 160 } |
| 161 |
| 162 if (isEmptyHref || isJavascriptLink || isDisabledLink(link)) { |
| 163 return new PageInfoAndText(number, "", linkText); |
| 164 } |
| 165 |
| 166 return new PageInfoAndText(number, sHrefCleaner.replace(url.toString(),
""), linkText); |
| 167 } |
| 168 |
| 169 /** |
| 170 * Finds and adds the leaf node(s) closest to the given start node. |
| 171 * This recurses and keeps finding and, if necessary, adding the numeric tex
t of valid nodes, |
| 172 * collecting the PageParamInfo.PageInfo's for the current adjacency group. |
| 173 * For backward search, i.e. nodes before start node, search terminates (i.e
. recursion stops) |
| 174 * once a text node or anchor is encountered. If the text node contains num
eric text, it's |
| 175 * added to the current adjacency group. Otherwise, a new group is created
to break the |
| 176 * adjacency. |
| 177 * For forward search, i.e. nodes after start node, search continues (i.e. r
ecursion continues) |
| 178 * until a text node or anchor with non-numeric text is encountered. In the
process, text nodes |
| 179 * and anchors with numeric text are added to the current adjaency group. W
hen a non-numeric |
| 180 * text node or anchor is encountered, a new group is started to break the a
djacency, and search |
| 181 * ends. |
| 182 * |
| 183 * @return true to continue search, false to stop. |
| 184 * |
| 185 * @param start node to work on. |
| 186 * @param checkStart true to check start node. Otherwise, the previous or n
ext sibling of the |
| 187 * start node is checked. |
| 188 * @param backward true to search backward (i.e. nodes before start node), f
alse to search |
| 189 * forward (i.e. nodes after start node). |
| 190 * @param baseAnchor created for the current document, only needed for forwa
rd search. |
| 191 */ |
| 192 private boolean findAndAddClosestValidLeafNodes(Node start, boolean checkSta
rt, |
| 193 boolean backward, AnchorElement baseAnchor) { |
| 194 Node node = checkStart ? start : |
| 195 (backward ? start.getPreviousSibling() : start.getNextSibling())
; |
| 196 if (node == null) { // No sibling, try parent. |
| 197 node = start.getParentNode(); |
| 198 if (sInvalidParentWrapper == null) { |
| 199 sInvalidParentWrapper = RegExp.compile("(BODY)|(HTML)"); |
| 200 } |
| 201 if (sInvalidParentWrapper.test(node.getNodeName())) return false; |
| 202 return findAndAddClosestValidLeafNodes(node, false, backward, baseAn
chor); |
| 203 } |
| 204 |
| 205 checkStart = false; |
| 206 switch (node.getNodeType()) { |
| 207 case Node.TEXT_NODE: |
| 208 String text = node.getNodeValue(); |
| 209 // Text must contain words. |
| 210 if (text.isEmpty() || StringUtil.countWords(text) == 0) break; |
| 211 boolean added = addNonLinkTextIfValid(node.getNodeValue()); |
| 212 // For backward search, we're done regardless if text was added. |
| 213 // For forward search, we're done only if text was invalid, othe
rwise continue. |
| 214 if (backward || !added) return false; |
| 215 break; |
| 216 |
| 217 case Node.ELEMENT_NODE: |
| 218 Element e = Element.as(node); |
| 219 if (e.hasTagName("A")) { |
| 220 // For backward search, we're done because we've already pro
cessed the anchor. |
| 221 if (backward) return false; |
| 222 // For forward search, we're done only if link was invalid,
otherwise continue. |
| 223 mNumForwardLinksProcessed++; |
| 224 if (!addLinkIfValid(AnchorElement.as(e), baseAnchor)) return
false; |
| 225 break; |
| 226 } |
| 227 // Intentionally fall through. |
| 228 |
| 229 default: |
| 230 // Check children nodes. |
| 231 if (!node.hasChildNodes()) break; |
| 232 checkStart = true; // We want to check the child node. |
| 233 if (backward) { |
| 234 // Start the backward search with the rightmost child i.e. l
ast and closest to |
| 235 // given node. |
| 236 node = node.getLastChild(); |
| 237 } else { |
| 238 // Start the forward search with the leftmost child i.e. fir
st and closest to |
| 239 // given node. |
| 240 node = node.getFirstChild(); |
| 241 } |
| 242 break; |
| 243 } |
| 244 |
| 245 return findAndAddClosestValidLeafNodes(node, checkStart, backward, baseA
nchor); |
| 246 } |
| 247 |
// Both patterns are compiled lazily on first use in addNonLinkTextIfValid().
// Matches terms i.e. words: a run of non-whitespace containing at least one word character.
private static RegExp sTermsRegExp = null;
// Matches a term whose only word characters are digits; group 1 captures the digits.
private static RegExp sSurroundingDigitsRegExp = null;
| 251 /** |
| 252 * Handle the text for a non-link node. Each numeric term in the text that
is a valid plain |
| 253 * page number adds a PageParamInfo.PageInfo into the current adjacent group
. All other terms |
| 254 * break the adjacency in the current group, adding a new group instead. |
| 255 * |
| 256 * @Return true if text was added to current group of adjacent numbers. Oth
erwise, false with |
| 257 * a new group created to break the current adjacency. |
| 258 */ |
| 259 private boolean addNonLinkTextIfValid(String text) { |
| 260 if (!StringUtil.containsDigit(text)) { |
| 261 // The text does not contain valid number(s); if necessary, current
group of adjacent |
| 262 // numbers should be closed, adding a new group if possible. |
| 263 mAdjacentNumbersGroups.addGroup(); |
| 264 return false; |
| 265 } |
| 266 |
| 267 if (sTermsRegExp == null) { |
| 268 sTermsRegExp = RegExp.compile("(\\S*[\\w\u00C0-\u1FFF\u2C00-\uD7FF]\
\S*)", "gi"); |
| 269 } else { |
| 270 sTermsRegExp.setLastIndex(0); |
| 271 } |
| 272 if (sSurroundingDigitsRegExp == null) { |
| 273 sSurroundingDigitsRegExp = RegExp.compile("^[\\W_]*(\\d+)[\\W_]*$",
"i"); |
| 274 } |
| 275 |
| 276 // Extract terms from the text, differentiating between those that conta
in only digits and |
| 277 // those that contain non-digits. |
| 278 boolean added = false; |
| 279 while (true) { |
| 280 MatchResult match = sTermsRegExp.exec(text); |
| 281 if (match == null) break; |
| 282 if (match.getGroupCount() <= 1) continue; |
| 283 |
| 284 String term = match.getGroup(1); |
| 285 MatchResult termWithDigits = sSurroundingDigitsRegExp.exec(term); |
| 286 int number = -1; |
| 287 if (termWithDigits != null && termWithDigits.getGroupCount() > 1) { |
| 288 number = StringUtil.toNumber(termWithDigits.getGroup(1)); |
| 289 } |
| 290 if (isPlainPageNumber(number)) { |
| 291 // This text is a valid candidate of plain text page number, add
it to last group of |
| 292 // adjacent numbers. |
| 293 mAdjacentNumbersGroups.addNumber(number, ""); |
| 294 added = true; |
| 295 } else { |
| 296 // The text is not a valid number, so current group of adjacent
numbers should be |
| 297 // closed, adding a new group if possible. |
| 298 mAdjacentNumbersGroups.addGroup(); |
| 299 } |
| 300 } // while there're matches |
| 301 |
| 302 return added; |
| 303 } |
| 304 |
| 305 /** |
| 306 * Adds PageParamInfo.PageInfo to the current adjacent group for a link if i
ts text is numeric. |
| 307 * Otherwise, add a new group to break the adjacency. |
| 308 * |
| 309 * @Return true if link was added, false otherwise. |
| 310 */ |
| 311 private boolean addLinkIfValid(AnchorElement link, AnchorElement baseAnchor)
{ |
| 312 PageInfoAndText pageInfoAndText = getPageInfoAndText(link, baseAnchor); |
| 313 if (pageInfoAndText != null) { |
| 314 mAdjacentNumbersGroups.addPageInfo(pageInfoAndText.mPageInfo); |
| 315 return true; |
| 316 } |
| 317 mAdjacentNumbersGroups.addGroup(); |
| 318 return false; |
| 319 } |
| 320 |
| 321 /** |
| 322 * @return true if link is disabled i.e. not clickable because it has a text
cursor. |
| 323 */ |
| 324 private static boolean isDisabledLink(AnchorElement link) { |
| 325 Style style = DomUtil.getComputedStyle(link); |
| 326 return Style.Cursor.valueOf(style.getCursor().toUpperCase()) == Style.Cu
rsor.TEXT; |
| 327 } |
| 328 |
| 329 /** |
| 330 * @return true if href starts with "javascript:". |
| 331 */ |
| 332 private static boolean isJavascriptHref(String href) { |
| 333 return href.startsWith("javascript:"); |
| 334 } |
| 335 |
| 336 private static String resolveLinkHref(AnchorElement link, AnchorElement base
Anchor) { |
| 337 // Anchors without "href" attribute are not considered potential paginat
ion links. |
| 338 String linkHref = link.getAttribute("href"); |
| 339 if (linkHref.isEmpty()) return ""; |
| 340 baseAnchor.setAttribute("href", linkHref); |
| 341 return baseAnchor.getHref(); |
| 342 } |
| 343 |
| 344 private static int linkTextToNumber(String linkText) { |
| 345 linkText = linkText.replaceAll("[()\\[\\]{}]", ""); |
| 346 linkText = linkText.trim(); // Remove leading and trailing whitespaces. |
| 347 return StringUtil.toNumber(linkText); |
| 348 } |
| 349 |
| 350 /** |
| 351 * @returns true if number is >= 0 && < MAX_NUM_FOR_PAGE_PARAM. |
| 352 */ |
| 353 private static boolean isPlainPageNumber(int number) { |
| 354 return number >= 0 && number <= MAX_NUM_FOR_PAGE_PARAM; |
| 355 } |
| 356 |
| 357 } |
OLD | NEW |