Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(732)

Unified Diff: javatests/org/chromium/distiller/PageParameterParserTest.java

Issue 1178633002: implement parser for new pagination algorithm (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master
Patch Set: addr chris's comments, fixes for dataset Created 5 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: javatests/org/chromium/distiller/PageParameterParserTest.java
diff --git a/javatests/org/chromium/distiller/PageParameterParserTest.java b/javatests/org/chromium/distiller/PageParameterParserTest.java
new file mode 100644
index 0000000000000000000000000000000000000000..9180a333f68da04836a3b722fab5e9242cf5a171
--- /dev/null
+++ b/javatests/org/chromium/distiller/PageParameterParserTest.java
@@ -0,0 +1,254 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+package org.chromium.distiller;
+
+import com.google.gwt.dom.client.BaseElement;
+import com.google.gwt.dom.client.Document;
+
+public class PageParameterParserTest extends DomDistillerJsTestCase {
+ private static final String BASE_URL = "http://www.test.com/";
+ private static final String TEST_URL = BASE_URL + "foo/bar";
+
+    // Smoke test: a plain-text "1" followed by one numeric link is expected to give
+    // two page entries; adding a second numeric link is expected to give three.
+    public void testBasic() {
+        final String twoPageHtml =
+                "1<br>" +
+                "<a href=\"/foo/bar/2\">2</a>";
+        final PageParamInfo twoPages = processDocument(twoPageHtml);
+        assertEquals(2, twoPages.mAllPageInfo.size());
+
+        final String threePageHtml =
+                "1<br>" +
+                "<a href=\"/foo/bar/2\">2</a>" +
+                "<a href=\"/foo/bar/3\">3</a>";
+        final PageParamInfo threePages = processDocument(threePageHtml);
+        assertEquals(3, threePages.mAllPageInfo.size());
+    }
+
+    // A numeric outlink to page 2 that is not preceded by a plain-text "1" is
+    // expected to produce an empty PageParamInfo, i.e. no pagination.
+    public void testRejectOnlyPage2LinkWithoutCurrentPageText() {
+        final String html =
+                "If there were a '1', pagination should be detected. But there isn't." +
+                "<a href=\"/foo/bar/2\">2</a>" +
+                "Main content";
+        final PageParamInfo result = processDocument(html);
+        PageParameterDetectorTest.expectEmptyPageParamInfo(result);
+    }
+
+    // Numeric links separated from the "1" and from each other by unrelated text
+    // are expected to produce an empty PageParamInfo.
+    public void testRejectNonAdjacentOutlinks() {
+        final String html =
+                "1<br>" +
+                "Unrelated terms<br>" +
+                "<a href=\"/foo/bar/2\">2</a>" +
+                "Unrelated terms<br>" +
+                "<a href=\"/foo/bar/3\">3</a>" +
+                "<a href=\"/foo/bar/all\">All</a>";
+        final PageParamInfo result = processDocument(html);
+        PageParameterDetectorTest.expectEmptyPageParamInfo(result);
+    }
+
+    // An unrelated numeric link precedes the main content; the adjacent "1", "2",
+    // "3" run after it is expected to be reported as pages 1-3, with page 2 as the
+    // next paging URL.
+    public void testAcceptAdjacentOutlinks() {
+        final String html =
+                "Unrelated link: <a href=\"http://www.test.com/other/2\">2</a>" +
+                "<p>Main content</p>" +
+                "1<br>" +
+                "<a href=\"http://www.test.com/foo/bar/2\">2</a>" +
+                "<a href=\"http://www.test.com/foo/bar/3\">3</a>";
+        final PageParamInfo result = processDocumentWithoutBase(html, TEST_URL);
+
+        // Expected URL for page (index + 1).
+        final String[] expectedUrls = {
+            BASE_URL + "foo/bar",
+            BASE_URL + "foo/bar/2",
+            BASE_URL + "foo/bar/3",
+        };
+        assertEquals(3, result.mAllPageInfo.size());
+        for (int i = 0; i < expectedUrls.length; i++) {
+            final PageParamInfo.PageInfo entry = result.mAllPageInfo.get(i);
+            assertEquals(i + 1, entry.mPageNum);
+            assertEquals(expectedUrls[i], entry.mUrl);
+        }
+        assertEquals(BASE_URL + "foo/bar/2", result.mNextPagingUrl);
+    }
+
+    // The same "1 2 3" pager appears both before and after the main content; the
+    // test expects a single set of three pages, with page 2 as the next paging URL.
+    public void testAcceptDuplicatePatterns() {
+        final String html =
+                "1<br>" +
+                "<a href=\"http://www.test.com/foo/bar/2\">2</a>" +
+                "<a href=\"http://www.test.com/foo/bar/3\">3</a>" +
+                "<p>Main content</p>" +
+                "1<br>" +
+                "<a href=\"http://www.test.com/foo/bar/2\">2</a>" +
+                "<a href=\"http://www.test.com/foo/bar/3\">3</a>";
+        final PageParamInfo result = processDocument(html);
+
+        // Expected URL for page (index + 1).
+        final String[] expectedUrls = {
+            BASE_URL + "foo/bar",
+            BASE_URL + "foo/bar/2",
+            BASE_URL + "foo/bar/3",
+        };
+        assertEquals(3, result.mAllPageInfo.size());
+        for (int i = 0; i < expectedUrls.length; i++) {
+            final PageParamInfo.PageInfo entry = result.mAllPageInfo.get(i);
+            assertEquals(i + 1, entry.mPageNum);
+            assertEquals(expectedUrls[i], entry.mUrl);
+        }
+        assertEquals(BASE_URL + "foo/bar/2", result.mNextPagingUrl);
+    }
+
+    // The document has a "25 / 50 / 100" group of size links as well as a "1 2 3"
+    // pager; the test expects the result to be of type PAGE_NUMBER and to list the
+    // pager's three pages rather than the size links.
+    public void testPreferPageNumber() {
+        final String html =
+                "<a href=\"http://www.test.com/foo/bar/size-25\">25</a>" +
+                "<a href=\"http://www.test.com/foo/bar/size-50\">50</a>" +
+                "<a href=\"http://www.test.com/foo/bar/size-100\">100</a>" +
+                "<p>Main content</p>" +
+                "1<br>" +
+                "<a href=\"http://www.test.com/foo/bar/2\">2</a>" +
+                "<a href=\"http://www.test.com/foo/bar/3\">3</a>";
+        final PageParamInfo result = processDocument(html);
+        assertEquals(PageParamInfo.Type.PAGE_NUMBER, result.mType);
+
+        // Expected URL for page (index + 1).
+        final String[] expectedUrls = {
+            BASE_URL + "foo/bar",
+            BASE_URL + "foo/bar/2",
+            BASE_URL + "foo/bar/3",
+        };
+        assertEquals(3, result.mAllPageInfo.size());
+        for (int i = 0; i < expectedUrls.length; i++) {
+            final PageParamInfo.PageInfo entry = result.mAllPageInfo.get(i);
+            assertEquals(i + 1, entry.mPageNum);
+            assertEquals(expectedUrls[i], entry.mUrl);
+        }
+        assertEquals(BASE_URL + "foo/bar/2", result.mNextPagingUrl);
+    }
+
+    // Two competing numeric pagers are present: one keyed on "start=", one keyed on
+    // "offset=". The test expects only the "start=" pager to be reported, as pages
+    // 1-4 of type PAGE_NUMBER with a linear formula of coefficient 10 and delta -10.
+    public void testRejectMultiplePageNumberPatterns() {
+        final String originalUrl = "http://www.google.com/test/list.php";
+        final String html =
+                "<a href=\"http://www.google.com/test/list.php?start=10\">2</a>" +
+                "<a href=\"http://www.google.com/test/list.php?start=20\">3</a>" +
+                "<a href=\"http://www.google.com/test/list.php?start=30\">4</a>" +
+                "<p>Main content</p>" +
+                "<a href=\"http://www.google.com/test/list.php?offset=10\">2</a>" +
+                "<a href=\"http://www.google.com/test/list.php?offset=20\">3</a>" +
+                "<a href=\"http://www.google.com/test/list.php?offset=30\">4</a>" +
+                "<a href=\"http://www.google.com/test/list.php?offset=all\">All</a>";
+        final PageParamInfo result = processDocumentWithoutBase(html, originalUrl);
+
+        assertEquals(PageParamInfo.Type.PAGE_NUMBER, result.mType);
+
+        // Expected URL for page (index + 1).
+        final String[] expectedUrls = {
+            "http://www.google.com/test/list.php",
+            "http://www.google.com/test/list.php?start=10",
+            "http://www.google.com/test/list.php?start=20",
+            "http://www.google.com/test/list.php?start=30",
+        };
+        assertEquals(4, result.mAllPageInfo.size());
+        for (int i = 0; i < expectedUrls.length; i++) {
+            final PageParamInfo.PageInfo entry = result.mAllPageInfo.get(i);
+            assertEquals(i + 1, entry.mPageNum);
+            assertEquals(expectedUrls[i], entry.mUrl);
+        }
+
+        assertTrue(result.mFormula != null);
+        assertEquals(10, result.mFormula.mCoefficient);
+        assertEquals(-10, result.mFormula.mDelta);
+        assertEquals("http://www.google.com/test/list.php?start=10", result.mNextPagingUrl);
+    }
+
+    // A "2" anchor whose href is "javascript:void(0)" is expected to produce an
+    // empty PageParamInfo.
+    public void testInvalidAndVoidLinks() {
+        final String html =
+                "1<br>" +
+                "<a href=\"javascript:void(0)\">2</a>";
+        final PageParamInfo result = processDocument(html);
+        PageParameterDetectorTest.expectEmptyPageParamInfo(result);
+    }
+
+    // A "2" anchor pointing at www.foo.com while the original URL is on
+    // www.test.com is expected to produce an empty PageParamInfo.
+    public void testDifferentHostLinks() {
+        final String html =
+                "1<br>" +
+                "<a href=\"http://www.foo.com/foo/bar/2\">2</a>";
+        final PageParamInfo result = processDocumentWithoutBase(html, TEST_URL);
+        PageParameterDetectorTest.expectEmptyPageParamInfo(result);
+    }
+
+    // A whitespace-only text run between the "1" and the "2" link is expected to
+    // leave the two-page result intact.
+    public void testWhitespaceSibling() {
+        final String html =
+                "1<br>" +
+                " " +
+                "<a href=\"/foo/bar/2\">2</a>";
+        final PageParamInfo result = processDocument(html);
+        assertEquals(2, result.mAllPageInfo.size());
+    }
+
+    // A lone comma between the "1" link and the "2" link is expected to leave the
+    // two-page result intact.
+    public void testPunctuationSibling() {
+        final String html =
+                "<a href=\"/foo/bar/1\">1</a>" +
+                "," +
+                "<a href=\"/foo/bar/2\">2</a>";
+        final PageParamInfo result = processDocument(html);
+        assertEquals(2, result.mAllPageInfo.size());
+    }
+
+ // Layout under test: the current page number "1" is plain text inside a <strong>,
+ // while pages 2 and 3 are anchors that each sit in their own sibling <div>, all
+ // wrapped in one outer <div> between the words "begin" and "end".
+ // The test expects three pages, with page 1 mapped to TEST_URL and page 2 as the
+ // next paging URL.
+ public void testParentSibling0() {
wychen 2015/09/21 23:08:03 Should we add tests for things like this to test s
kuan 2015/10/02 15:59:17 Done. fyi, i already had testPuncationSibling() t
+ // Parsed without a <base> element; the anchors below use absolute URLs.
+ PageParamInfo info = processDocumentWithoutBase(
+ "<div>begin" +
+ "<strong>1</strong>" +
+ "<div><a href=\"http://www.test.com/foo/bar/2\">2</a></div>" +
+ "<div><a href=\"http://www.test.com/foo/bar/3\">3</a></div>" +
+ "end</div>",
+ TEST_URL);
+ assertEquals(3, info.mAllPageInfo.size());
+ // Page 1 has no anchor in the markup; its URL is expected to be the original URL.
+ PageParamInfo.PageInfo page = info.mAllPageInfo.get(0);
+ assertEquals(1, page.mPageNum);
+ assertEquals(TEST_URL, page.mUrl);
+ page = info.mAllPageInfo.get(1);
+ assertEquals(2, page.mPageNum);
+ assertEquals(TEST_URL + "/2", page.mUrl);
+ page = info.mAllPageInfo.get(2);
+ assertEquals(3, page.mPageNum);
+ assertEquals(TEST_URL + "/3", page.mUrl);
+ assertEquals("http://www.test.com/foo/bar/2", info.mNextPagingUrl);
+ }
+
+    // Same nested-<div> layout, but the current page is the middle one: "2" is
+    // plain text in a <strong> and the original URL is page 2. The test expects
+    // only the two linked pages (1 and 3) to be listed, with page 3 as the next
+    // paging URL.
+    public void testParentSibling1() {
+        final String currentUrl = "http://www.test.com/foo/bar/2";
+        final String html =
+                "<div>begin" +
+                "<div><a href=\"http://www.test.com/foo/bar\">1</a></div>" +
+                "<strong>2</strong>" +
+                "<div><a href=\"http://www.test.com/foo/bar/3\">3</a></div>" +
+                "end</div>";
+        final PageParamInfo result = processDocumentWithoutBase(html, currentUrl);
+        assertEquals(2, result.mAllPageInfo.size());
+
+        final PageParamInfo.PageInfo firstEntry = result.mAllPageInfo.get(0);
+        assertEquals(1, firstEntry.mPageNum);
+        assertEquals(TEST_URL, firstEntry.mUrl);
+
+        final PageParamInfo.PageInfo secondEntry = result.mAllPageInfo.get(1);
+        assertEquals(3, secondEntry.mPageNum);
+        assertEquals(TEST_URL + "/3", secondEntry.mUrl);
+
+        assertEquals("http://www.test.com/foo/bar/3", result.mNextPagingUrl);
+    }
+
+    // Same nested-<div> layout with the current page last: "3" is plain text in a
+    // <strong> and the original URL is page 3. The test expects the two linked
+    // pages (1 and 2) to be listed and the next paging URL to be empty.
+    public void testParentSibling2() {
+        final String currentUrl = "http://www.test.com/foo/bar/3";
+        final String html =
+                "<div>begin" +
+                "<div><a href=\"http://www.test.com/foo/bar\">1</a></div>" +
+                "<div><a href=\"http://www.test.com/foo/bar/2\">2</a></div>" +
+                "<strong>3</strong>" +
+                "end</div>";
+        final PageParamInfo result = processDocumentWithoutBase(html, currentUrl);
+        assertEquals(2, result.mAllPageInfo.size());
+
+        final PageParamInfo.PageInfo firstEntry = result.mAllPageInfo.get(0);
+        assertEquals(1, firstEntry.mPageNum);
+        assertEquals(TEST_URL, firstEntry.mUrl);
+
+        final PageParamInfo.PageInfo secondEntry = result.mAllPageInfo.get(1);
+        assertEquals(2, secondEntry.mPageNum);
+        assertEquals(TEST_URL + "/2", secondEntry.mUrl);
+
+        assertTrue(result.mNextPagingUrl.isEmpty());
+    }
+
+    // Parses |content| as the document body with TEST_URL as the original URL,
+    // while a temporary <base href=BASE_URL> element is attached to mHead so that
+    // relative hrefs in |content| resolve against BASE_URL.
+    // Returns whatever PageParameterParser.parse() produces for that document.
+    // mHead and mBody are not declared in this class; presumably they are the
+    // head/body fixtures provided by DomDistillerJsTestCase.
+    private PageParamInfo processDocument(String content) {
+        // Create and add a <base> element so that all anchors are based off it.
+        BaseElement baseTag = Document.get().createBaseElement();
+        baseTag.setHref(BASE_URL);
+        mHead.appendChild(baseTag);
+        try {
+            // Replace the body's markup with the content under test.
+            mBody.setInnerHTML(content);
+            return PageParameterParser.parse(TEST_URL, null);
+        } finally {
+            // Detach the <base> element even when parsing throws; otherwise it would
+            // stay in the head and change how later tests resolve their hrefs.
+            mHead.removeChild(baseTag);
+        }
+    }
+
+    // Sets |content| as the body's markup and parses the document with
+    // |originalUrl| as the original URL. No <base> element is added, so callers
+    // pass markup whose hrefs are already absolute.
+    private PageParamInfo processDocumentWithoutBase(String content, String originalUrl) {
+        mBody.setInnerHTML(content);
+        final PageParamInfo parsed = PageParameterParser.parse(originalUrl, null);
+        return parsed;
+    }
+
+}

Powered by Google App Engine
This is Rietveld 408576698