Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(543)

Side by Side Diff: java/org/chromium/distiller/PageParameterParser.java

Issue 1178633002: implement parser for new pagination algorithm (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master
Patch Set: addr chris's comments, fixes for dataset Created 5 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 package org.chromium.distiller;
6
7 import org.chromium.distiller.proto.DomDistillerProtos;
8 import org.chromium.distiller.proto.DomDistillerProtos.TimingInfo;
9
10 import com.google.gwt.dom.client.AnchorElement;
11 import com.google.gwt.dom.client.Document;
12 import com.google.gwt.dom.client.Element;
13 import com.google.gwt.dom.client.Node;
14 import com.google.gwt.dom.client.NodeList;
15 import com.google.gwt.dom.client.Style;
16 import com.google.gwt.regexp.shared.MatchResult;
17 import com.google.gwt.regexp.shared.RegExp;
18
19 /**
20 * Background:
21 * A long article/news/forum thread/blog document may be partitioned into several partial pages
22 * by the webmaster. Each partial page has outlinks pointing to the adjacent partial pages. The
23 * anchor text of those outlinks is numeric.
24 *
25 * This class parses the document to collect groups of adjacent plain text numbers and outlinks with
26 * numeric anchor text. These are then passed to PageParameterDetector which would spit out the
27 * pagination URLs if available.
28 */
29 public class PageParameterParser {
30 // Inclusive upper bound for page numbers: if the numeric value of a link's anchor text is
31 // greater than this number, we don't think it represents the page number of the link.
32 private static final int MAX_NUM_FOR_PAGE_PARAM = 100;
33
34 /**
35 * Stores a PageParamInfo.PageInfo (built from the page number and URL) together with the
36 * anchor's text; specifically returned by getPageInfoAndText().
37 */
38 private static class PageInfoAndText {
39 private final PageParamInfo.PageInfo mPageInfo;
40 private final String mText;
41
42 PageInfoAndText(int number, String url, String text) {
43 mPageInfo = new PageParamInfo.PageInfo(number, url);
44 mText = text;
45 }
46 }
47
48 /**
49 * Entry point for PageParameterParser.
50 * Parses the document to collect outlinks with digital anchor text and numeric text around
wychen 2015/09/21 23:08:03 Does digital mean the same thing as numeric?
kuan 2015/10/02 15:59:17 Done.
51 * them. These are then passed to PageParameterDetector to detect pagination URLs.
52 *
53 * @return PageParamInfo (see PageParamInfo.java), always. If no page parameter is detected or
54 * determined to be best, its mType is PageParamInfo.Type.UNSET.
55 *
56 * @param originalUrl the original URL of the document to be parsed.
57 * @param timingInfo for tracking performance.
58 */
59 public static PageParamInfo parse(String originalUrl, TimingInfo timingInfo) {
60 PageParameterParser parser = new PageParameterParser(timingInfo);
61 return parser.parseDocument(Document.get().getDocumentElement(), originalUrl);
62 }
63
64 private final TimingInfo mTimingInfo;
65 private String mDocUrl = ""; // Document URL with one trailing "/" removed.
66 private ParsedUrl mParsedUrl = null; // Parsed form of mDocUrl.
67 private final MonotonicPageInfosGroups mAdjacentNumbersGroups = new MonotonicPageInfosGroups();
68 private int mNumForwardLinksProcessed = 0; // Anchors visited by the latest forward search.
69
70 private static RegExp sHrefCleaner = null; // Lazily compiled; strips one trailing "/".
71 private static RegExp sInvalidParentWrapper = null; // Lazily compiled; parents that end a search.
72
73 private PageParameterParser(TimingInfo timingInfo) {
74 mTimingInfo = timingInfo; // Kept so that parseDocument() can record timings.
75 }
76
77 /**
78 * Actually implements PageParameterParser.parse(), see above description for parse().
79 */
80 private PageParamInfo parseDocument(Element root, String originalUrl) {
81 double startTime = DomUtil.getTime();
82
83 if (sHrefCleaner == null) sHrefCleaner = RegExp.compile("\\/$");
wychen 2015/09/21 23:08:03 Is this faster than eager initialization? If these
kuan 2015/10/02 15:59:17 sHrefCleaner is always used, so i've changed to in
84 if (sInvalidParentWrapper == null) sInvalidParentWrapper = RegExp.compile("^(BODY|HTML)$");
85 // The pattern above is anchored so that only BODY and HTML (not e.g. TBODY) end a search.
86 mDocUrl = sHrefCleaner.replace(originalUrl, "");
87 mParsedUrl = ParsedUrl.create(mDocUrl);
88 if (mParsedUrl == null) return new PageParamInfo(); // Invalid document URL.
89
90 AnchorElement baseAnchor = PagingLinksFinder.createAnchorWithBase(
91 PagingLinksFinder.getBaseUrlForRelative(root, originalUrl));
92
93 NodeList<Element> allLinks = root.getElementsByTagName("A");
94 int idx = 0;
95 while (idx < allLinks.getLength()) {
96 final AnchorElement link = AnchorElement.as(allLinks.getItem(idx));
97 PageInfoAndText pageInfoAndText = getPageInfoAndText(link, baseAnchor);
98 if (pageInfoAndText == null) {
99 idx++;
100 continue;
101 }
102
103 // This link is a good candidate for pagination.
104
105 // Close current group of adjacent numbers, add a new group if necessary.
106 mAdjacentNumbersGroups.addGroup();
107
108 // Before we append the link to the new group of adjacent numbers, check if it's
109 // preceded by a text node with numeric text; if so, add it before the link.
110 findAndAddClosestValidLeafNodes(link, false, true, null);
111
112 // Add the link to the current group of adjacent numbers.
113 mAdjacentNumbersGroups.addPageInfo(pageInfoAndText.mPageInfo);
114
115 // Add all following text nodes and links with numeric text.
116 mNumForwardLinksProcessed = 0;
117 findAndAddClosestValidLeafNodes(link, false, false, baseAnchor);
118
119 // Skip the current link and links already processed in the forward
120 // findAndAddClosestValidLeafNodes().
121 idx += 1 + mNumForwardLinksProcessed;
122 } // while there're links.
123
124 mAdjacentNumbersGroups.cleanup();
125
126 LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterParser");
127
128 startTime = DomUtil.getTime();
129 PageParamInfo info = PageParameterDetector.detect(mAdjacentNumbersGroups, mDocUrl);
130 LogUtil.addTimingInfo(startTime, mTimingInfo, "PageParameterDetector");
131 return info;
132 }
133
134 /**
135 * @return a populated PageInfoAndText if given link is to be added to mAdjacentNumbersGroups.
136 * Otherwise, returns null if link is to be ignored.
137 * "javascript:void" links with numeric text are considered valid links to be added.
wychen 2015/09/21 23:08:03 nit: not necessarily void.
kuan 2015/10/02 15:59:17 Done.
138 *
139 * @param link to process.
140 * @param baseAnchor created for the current document.
141 */
142 private PageInfoAndText getPageInfoAndText(AnchorElement link, AnchorElement baseAnchor) {
143 // Ignore invisible links.
144 if (!DomUtil.isVisible(link)) return null;
145
146 String linkHref = resolveLinkHref(link, baseAnchor);
147 final boolean isEmptyHref = linkHref.isEmpty();
148 boolean isJavascriptLink = false;
149 ParsedUrl url = null;
150 if (!isEmptyHref) {
151 isJavascriptLink = isJavascriptHref(linkHref);
152 url = ParsedUrl.create(linkHref);
153 if (url == null ||
154 (!isJavascriptLink && !url.getHost().equalsIgnoreCase(mParsedUrl.getHost()))) {
155 return null;
156 }
157 url.setHash(""); // Drop the fragment before the URL is stored.
158 }
159
160 // Use javascript innerText (instead of javascript textContent) to only get visible text.
161 String linkText = StringUtil.jsTrim(DomUtil.getInnerText(link));
162 int number = linkTextToNumber(linkText);
163 if (!isPlainPageNumber(number)) return null;
wychen 2015/09/21 23:08:03 Since most links aren't numbers, can we use this a
kuan 2015/10/02 15:59:17 i can't move it to beginning of while loop - it ne
164
165 if (isEmptyHref || isJavascriptLink || isDisabledLink(link)) {
166 return new PageInfoAndText(number, "", linkText); // Page number without a usable URL.
167 }
168
169 return new PageInfoAndText(number, sHrefCleaner.replace(url.toString(), ""), linkText);
170 }
171
172 /**
173 * Finds and adds the leaf node(s) closest to the given start node.
174 * This recurses and keeps finding and, if necessary, adding the numeric text of valid nodes,
175 * collecting the PageParamInfo.PageInfo's for the current adjacency group.
176 * For backward search, i.e. nodes before start node, search terminates (i.e. recursion stops)
177 * once a text node or anchor is encountered. If the text node contains numeric text, it's
178 * added to the current adjacency group. Otherwise, a new group is created to break the
179 * adjacency.
180 * For forward search, i.e. nodes after start node, search continues (i.e. recursion continues)
181 * until a text node or anchor with non-numeric text is encountered. In the process, text nodes
182 * and anchors with numeric text are added to the current adjacency group. When a non-numeric
183 * text node or anchor is encountered, a new group is started to break the adjacency, and search
184 * ends. Search also ends at a parent matched by sInvalidParentWrapper or when no parent exists.
185 *
186 * @return true to continue search, false to stop.
187 *
188 * @param start node to work on.
189 * @param checkStart true to check start node. Otherwise, the previous or next sibling of the
190 * start node is checked.
191 * @param backward true to search backward (i.e. nodes before start node), false to search
192 * forward (i.e. nodes after start node).
193 * @param baseAnchor created for the current document, only needed for forward search.
194 */
195 private boolean findAndAddClosestValidLeafNodes(Node start, boolean checkStart,
196 boolean backward, AnchorElement baseAnchor) {
197 Node node = checkStart ? start :
198 (backward ? start.getPreviousSibling() : start.getNextSibling());
199 if (node == null) { // No sibling, try parent.
200 node = start.getParentNode();
201 if (node == null || sInvalidParentWrapper.test(node.getNodeName())) return false;
202 return findAndAddClosestValidLeafNodes(node, false, backward, baseAnchor);
203 }
204
205 checkStart = false;
206 switch (node.getNodeType()) {
207 case Node.TEXT_NODE:
208 String text = node.getNodeValue();
209 // Text must contain words.
210 if (text.isEmpty() || StringUtil.countWords(text) == 0) break;
211 boolean added = addNonLinkTextIfValid(text);
212 // For backward search, we're done regardless if text was added.
213 // For forward search, we're done only if text was invalid, otherwise continue.
214 if (backward || !added) return false;
215 break;
216
217 case Node.ELEMENT_NODE:
218 Element e = Element.as(node);
219 if (e.hasTagName("A")) {
220 // For backward search, we're done because we've already processed the anchor.
221 if (backward) return false;
222 // For forward search, we're done only if link was invalid, otherwise continue.
223 mNumForwardLinksProcessed++;
224 if (!addLinkIfValid(AnchorElement.as(e), baseAnchor)) return false;
225 break;
226 }
227 // Intentionally fall through.
228
229 default:
230 // Check children nodes.
231 if (!node.hasChildNodes()) break;
232 checkStart = true; // We want to check the child node.
233 if (backward) {
234 // Start the backward search with the rightmost child i.e. last and closest to
235 // given node.
236 node = node.getLastChild();
237 } else {
238 // Start the forward search with the leftmost child i.e. first and closest to
239 // given node.
wychen 2015/09/21 23:08:03 nit: indentation
kuan 2015/10/02 15:59:17 Done.
240 node = node.getFirstChild();
241 }
242 break;
243 }
244
245 return findAndAddClosestValidLeafNodes(node, checkStart, backward, baseAnchor);
246 }
247
248 private static RegExp sTermsRegExp = null; // Match terms i.e. words.
249 private static RegExp sSurroundingDigitsRegExp = null; // Match term with only digits.
250
251 /**
252 * Handle the text for a non-link node. Each numeric term in the text that is a valid plain
253 * page number adds a PageParamInfo.PageInfo into the current adjacent group. All other terms
254 * break the adjacency in the current group, adding a new group instead.
255 *
256 * @return true if text was added to current group of adjacent numbers. Otherwise, false with
257 * a new group created to break the current adjacency.
258 */
259 private boolean addNonLinkTextIfValid(String text) {
260 if (!StringUtil.containsDigit(text)) {
261 // The text does not contain valid number(s); if necessary, current group of adjacent
262 // numbers should be closed, adding a new group if possible.
263 mAdjacentNumbersGroups.addGroup();
264 return false;
265 }
266
267 if (sTermsRegExp == null) {
268 sTermsRegExp = RegExp.compile("(\\S*[\\w\u00C0-\u1FFF\u2C00-\uD7FF]\\S*)", "gi");
269 } else {
270 sTermsRegExp.setLastIndex(0); // The shared global regex must restart for each new text.
271 }
272 if (sSurroundingDigitsRegExp == null) {
273 sSurroundingDigitsRegExp = RegExp.compile("^[\\W_]*(\\d+)[\\W_]*$", "i");
274 }
275
276 // Extract terms from the text, differentiating between those that contain only digits and
277 // those that contain non-digits.
278 boolean added = false;
279 while (true) {
280 MatchResult match = sTermsRegExp.exec(text);
281 if (match == null) break;
282 if (match.getGroupCount() <= 1) continue;
283
284 String term = match.getGroup(1);
285 MatchResult termWithDigits = sSurroundingDigitsRegExp.exec(term);
286 int number = -1;
287 if (termWithDigits != null && termWithDigits.getGroupCount() > 1) {
288 number = StringUtil.toNumber(termWithDigits.getGroup(1));
289 }
290 if (isPlainPageNumber(number)) {
291 // This text is a valid candidate of plain text page number, add it to last group of
292 // adjacent numbers.
293 mAdjacentNumbersGroups.addNumber(number, "");
294 added = true;
295 } else {
296 // The text is not a valid number, so current group of adjacent numbers should be
297 // closed, adding a new group if possible.
298 mAdjacentNumbersGroups.addGroup();
299 }
300 } // while there're matches
301
302 return added;
303 }
304
305 /**
306 * Adds PageParamInfo.PageInfo to the current adjacent group for a link if its text is numeric.
307 * Otherwise, add a new group to break the adjacency.
308 *
309 * @return true if link was added, false otherwise.
310 */
311 private boolean addLinkIfValid(AnchorElement link, AnchorElement baseAnchor) {
312 PageInfoAndText pageInfoAndText = getPageInfoAndText(link, baseAnchor);
313 if (pageInfoAndText != null) {
314 mAdjacentNumbersGroups.addPageInfo(pageInfoAndText.mPageInfo);
315 return true;
316 }
317 mAdjacentNumbersGroups.addGroup();
318 return false;
319 }
320
321 /** @return true if link is disabled i.e. not clickable because it has a text cursor.
322 * CSS names are compared directly because Cursor.valueOf() throws on values like "ne-resize".
323 */
324 private static boolean isDisabledLink(AnchorElement link) {
325 Style style = DomUtil.getComputedStyle(link);
326 return Style.Cursor.TEXT.getCssName().equalsIgnoreCase(style.getCursor());
wychen 2015/09/21 23:08:03 Even if the cursor style is different, the link is
kuan 2015/10/02 15:59:17 no, the link is not clickable - it behaves like re
327 }
328
329 /**
330 * @return true if href starts with "javascript:" (case-sensitive prefix check).
331 */
332 private static boolean isJavascriptHref(String href) {
333 return href.startsWith("javascript:");
334 }
335
336 private static String resolveLinkHref(AnchorElement link, AnchorElement baseAnchor) {
337 String linkHref = link.getAttribute("href"); // Raw attribute value, not the resolved URL.
338 if (linkHref.isEmpty()) return ""; // Empty href is reported as "" and never resolved.
wychen 2015/09/21 23:08:03 If href="", it means the current URL. What's the r
kuan 2015/10/02 15:59:17 anchors w/out "href" attr are not considered pagin
339 baseAnchor.setAttribute("href", linkHref); // Presumably resolved against the document base.
340 return baseAnchor.getHref();
341 }
342
343 private static int linkTextToNumber(String linkText) {
344 linkText = linkText.replaceAll("[()\\[\\]{}]", ""); // Strip all bracket characters.
345 linkText = linkText.trim(); // Remove leading and trailing whitespaces.
346 // Remove duplicate internal whitespaces ("{2,}" is a quantifier, so it must not be escaped).
347 linkText = linkText.replaceAll("\\s{2,}", " ");
wychen 2015/09/21 23:08:03 Why is this necessary?
kuan 2015/10/02 15:59:17 the original code has this, so i follow suit. how
348 return StringUtil.toNumber(linkText);
349 }
350
351 /**
352 * @return true if number is >= 0 && <= MAX_NUM_FOR_PAGE_PARAM (both bounds inclusive).
353 */
354 private static boolean isPlainPageNumber(int number) {
355 return number >= 0 && number <= MAX_NUM_FOR_PAGE_PARAM;
356 }
357
358 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698