Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(558)

Side by Side Diff: javatests/org/chromium/distiller/PageParameterParserTest.java

Issue 1178633002: implement parser for new pagination algorithm (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master
Patch Set: addr wychen's comments Created 5 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 package org.chromium.distiller;
6
7 import com.google.gwt.dom.client.BaseElement;
8 import com.google.gwt.dom.client.Document;
9
10 public class PageParameterParserTest extends DomDistillerJsTestCase {
11 private static final String BASE_URL = "http://www.test.com/";
12 private static final String TEST_URL = BASE_URL + "foo/bar";
13
14 public void testBasic() {
15 PageParamInfo info = processDocument(
16 "1<br>" +
17 "<a href=\"/foo/bar/2\">2</a>");
18 assertEquals(2, info.mAllPageInfo.size());
19
20 info = processDocument(
21 "1<br>" +
22 "<a href=\"/foo/bar/2\">2</a>" +
23 "<a href=\"/foo/bar/3\">3</a>");
24 assertEquals(3, info.mAllPageInfo.size());
25 }
26
27 public void testRejectOnlyPage2LinkWithoutCurrentPageText() {
28 // Although there is a digital outlink to 2nd page, there is no plain te xt "1"
29 // before it, so there is no pagination.
30 PageParamInfo info = processDocument(
31 "If there were a '1', pagination should be detected. But there isn't ." +
32 "<a href=\"/foo/bar/2\">2</a>" +
33 "Main content");
34 PageParameterDetectorTest.expectEmptyPageParamInfo(info);
35 }
36
37 public void testRejectNonAdjacentOutlinks() {
38 PageParamInfo info = processDocument(
39 "1<br>" +
40 "Unrelated terms<br>" +
41 "<a href=\"/foo/bar/2\">2</a>" +
42 "Unrelated terms<br>" +
43 "<a href=\"/foo/bar/3\">3</a>" +
44 "<a href=\"/foo/bar/all\">All</a>");
45 PageParameterDetectorTest.expectEmptyPageParamInfo(info);
46 }
47
48 public void testAcceptAdjacentOutlinks() {
49 PageParamInfo info = processDocumentWithoutBase(
50 "Unrelated link: <a href=\"http://www.test.com/other/2\">2</a>" +
51 "<p>Main content</p>" +
52 "1<br>" +
53 "<a href=\"http://www.test.com/foo/bar/2\">2</a>" +
54 "<a href=\"http://www.test.com/foo/bar/3\">3</a>",
55 TEST_URL);
56 assertEquals(3, info.mAllPageInfo.size());
57 PageParamInfo.PageInfo page = info.mAllPageInfo.get(0);
58 assertEquals(1, page.mPageNum);
59 assertEquals(BASE_URL + "foo/bar", page.mUrl);
60 page = info.mAllPageInfo.get(1);
61 assertEquals(2, page.mPageNum);
62 assertEquals(BASE_URL + "foo/bar/2", page.mUrl);
63 page = info.mAllPageInfo.get(2);
64 assertEquals(3, page.mPageNum);
65 assertEquals(BASE_URL + "foo/bar/3", page.mUrl);
66 assertEquals(BASE_URL + "foo/bar/2", info.mNextPagingUrl);
67 }
68
69 public void testAcceptDuplicatePatterns() {
70 PageParamInfo info = processDocument(
71 "1<br>" +
72 "<a href=\"http://www.test.com/foo/bar/2\">2</a>" +
73 "<a href=\"http://www.test.com/foo/bar/3\">3</a>" +
74 "<p>Main content</p>" +
75 "1<br>" +
76 "<a href=\"http://www.test.com/foo/bar/2\">2</a>" +
77 "<a href=\"http://www.test.com/foo/bar/3\">3</a>");
78 assertEquals(3, info.mAllPageInfo.size());
79 PageParamInfo.PageInfo page = info.mAllPageInfo.get(0);
80 assertEquals(1, page.mPageNum);
81 assertEquals(BASE_URL + "foo/bar", page.mUrl);
82 page = info.mAllPageInfo.get(1);
83 assertEquals(2, page.mPageNum);
84 assertEquals(BASE_URL + "foo/bar/2", page.mUrl);
85 page = info.mAllPageInfo.get(2);
86 assertEquals(3, page.mPageNum);
87 assertEquals(BASE_URL + "foo/bar/3", page.mUrl);
88 assertEquals(BASE_URL + "foo/bar/2", info.mNextPagingUrl);
89 }
90
91 public void testPreferPageNumber() {
92 PageParamInfo info = processDocument(
93 "<a href=\"http://www.test.com/foo/bar/size-25\">25</a>" +
94 "<a href=\"http://www.test.com/foo/bar/size-50\">50</a>" +
95 "<a href=\"http://www.test.com/foo/bar/size-100\">100</a>" +
96 "<p>Main content</p>" +
97 "1<br>" +
98 "<a href=\"http://www.test.com/foo/bar/2\">2</a>" +
99 "<a href=\"http://www.test.com/foo/bar/3\">3</a>");
100 assertEquals(PageParamInfo.Type.PAGE_NUMBER, info.mType);
101 assertEquals(3, info.mAllPageInfo.size());
102 PageParamInfo.PageInfo page = info.mAllPageInfo.get(0);
103 assertEquals(1, page.mPageNum);
104 assertEquals(BASE_URL + "foo/bar", page.mUrl);
105 page = info.mAllPageInfo.get(1);
106 assertEquals(2, page.mPageNum);
107 assertEquals(BASE_URL + "foo/bar/2", page.mUrl);
108 page = info.mAllPageInfo.get(2);
109 assertEquals(3, page.mPageNum);
110 assertEquals(BASE_URL + "foo/bar/3", page.mUrl);
111 assertEquals(BASE_URL + "foo/bar/2", info.mNextPagingUrl);
112 }
113
114 public void testRejectMultiplePageNumberPatterns() {
115 PageParamInfo info = processDocumentWithoutBase(
116 "<a href=\"http://www.google.com/test/list.php?start=10\">2</a>" +
117 "<a href=\"http://www.google.com/test/list.php?start=20\">3</a>" +
118 "<a href=\"http://www.google.com/test/list.php?start=30\">4</a>" +
119 "<p>Main content</p>" +
120 "<a href=\"http://www.google.com/test/list.php?offset=10\">2</a>" +
121 "<a href=\"http://www.google.com/test/list.php?offset=20\">3</a>" +
122 "<a href=\"http://www.google.com/test/list.php?offset=30\">4</a>" +
123 "<a href=\"http://www.google.com/test/list.php?offset=all\">All</a>" ,
124 "http://www.google.com/test/list.php");
125
126 assertEquals(PageParamInfo.Type.PAGE_NUMBER, info.mType);
127 assertEquals(4, info.mAllPageInfo.size());
128 PageParamInfo.PageInfo page = info.mAllPageInfo.get(0);
129 assertEquals(1, page.mPageNum);
130 assertEquals("http://www.google.com/test/list.php", page.mUrl);
131 page = info.mAllPageInfo.get(1);
132 assertEquals(2, page.mPageNum);
133 assertEquals("http://www.google.com/test/list.php?start=10", page.mUrl);
134 page = info.mAllPageInfo.get(2);
135 assertEquals(3, page.mPageNum);
136 assertEquals("http://www.google.com/test/list.php?start=20", page.mUrl);
137 page = info.mAllPageInfo.get(3);
138 assertEquals(4, page.mPageNum);
139 assertEquals("http://www.google.com/test/list.php?start=30", page.mUrl);
140 assertTrue(info.mFormula != null);
141 assertEquals(10, info.mFormula.mCoefficient);
142 assertEquals(-10, info.mFormula.mDelta);
143 assertEquals("http://www.google.com/test/list.php?start=10", info.mNextP agingUrl);
144 }
145
146 public void testInvalidAndVoidLinks() {
147 PageParamInfo info = processDocument(
148 "1<br>" +
149 "<a href=\"javascript:void(0)\">2</a>");
150 PageParameterDetectorTest.expectEmptyPageParamInfo(info);
151 }
152
153 public void testDifferentHostLinks() {
154 PageParamInfo info = processDocumentWithoutBase(
155 "1<br>" +
156 "<a href=\"http://www.foo.com/foo/bar/2\">2</a>",
157 TEST_URL);
158 PageParameterDetectorTest.expectEmptyPageParamInfo(info);
159 }
160
161 public void testWhitespaceSibling() {
162 PageParamInfo info = processDocument(
163 "1<br>" +
164 " " +
165 "<a href=\"/foo/bar/2\">2</a>");
166 assertEquals(2, info.mAllPageInfo.size());
167 }
168
169 public void testPunctuationSibling() {
170 PageParamInfo info = processDocument(
171 "<a href=\"/foo/bar/1\">1</a>" +
172 "," +
173 "<a href=\"/foo/bar/2\">2</a>");
174 assertEquals(2, info.mAllPageInfo.size());
175 }
176
177 public void testSeparatorSibling() {
178 PageParamInfo info = processDocument(
179 "<div>" +
180 "1 | " +
181 "<a href=\"/foo/bar/2\">2</a>" +
182 " | " +
183 "<a href=\"/foo/bar/3\">3</a>" +
184 "</div>");
185 assertEquals(3, info.mAllPageInfo.size());
186 }
187
188 public void testParentSibling0() {
189 PageParamInfo info = processDocumentWithoutBase(
190 "<div>begin" +
191 "<strong>1</strong>" +
192 "<div><a href=\"http://www.test.com/foo/bar/2\">2</a></div>" +
193 "<div><a href=\"http://www.test.com/foo/bar/3\">3</a></div>" +
194 "end</div>",
195 TEST_URL);
196 assertEquals(3, info.mAllPageInfo.size());
197 PageParamInfo.PageInfo page = info.mAllPageInfo.get(0);
198 assertEquals(1, page.mPageNum);
199 assertEquals(TEST_URL, page.mUrl);
200 page = info.mAllPageInfo.get(1);
201 assertEquals(2, page.mPageNum);
202 assertEquals(TEST_URL + "/2", page.mUrl);
203 page = info.mAllPageInfo.get(2);
204 assertEquals(3, page.mPageNum);
205 assertEquals(TEST_URL + "/3", page.mUrl);
206 assertEquals("http://www.test.com/foo/bar/2", info.mNextPagingUrl);
207 }
208
209 public void testParentSibling1() {
210 PageParamInfo info = processDocumentWithoutBase(
211 "<div>begin" +
212 "<div><a href=\"http://www.test.com/foo/bar\">1</a></div>" +
213 "<strong>2</strong>" +
214 "<div><a href=\"http://www.test.com/foo/bar/3\">3</a></div>" +
215 "end</div>",
216 "http://www.test.com/foo/bar/2");
217 assertEquals(2, info.mAllPageInfo.size());
218 PageParamInfo.PageInfo page = info.mAllPageInfo.get(0);
219 assertEquals(1, page.mPageNum);
220 assertEquals(TEST_URL, page.mUrl);
221 page = info.mAllPageInfo.get(1);
222 assertEquals(3, page.mPageNum);
223 assertEquals(TEST_URL + "/3", page.mUrl);
224 assertEquals("http://www.test.com/foo/bar/3", info.mNextPagingUrl);
225 }
226
227 public void testParentSibling2() {
228 PageParamInfo info = processDocumentWithoutBase(
229 "<div>begin" +
230 "<div><a href=\"http://www.test.com/foo/bar\">1</a></div>" +
231 "<div><a href=\"http://www.test.com/foo/bar/2\">2</a></div>" +
232 "<strong>3</strong>" +
233 "end</div>",
234 "http://www.test.com/foo/bar/3");
235 assertEquals(2, info.mAllPageInfo.size());
236 PageParamInfo.PageInfo page = info.mAllPageInfo.get(0);
237 assertEquals(1, page.mPageNum);
238 assertEquals(TEST_URL, page.mUrl);
239 page = info.mAllPageInfo.get(1);
240 assertEquals(2, page.mPageNum);
241 assertEquals(TEST_URL + "/2", page.mUrl);
242 assertTrue(info.mNextPagingUrl.isEmpty());
243 }
244
245 public void testNestedStructure() {
246 PageParamInfo info = processDocumentWithoutBase(
247 "<div>begin" +
248 "<span><a href=\"http://www.test.com/foo?page=2\">&lsaquo;&lsaquo; P rev</a></span>" +
249 "<span><a href=\"http://www.test.com/foo?page=1\">1</a></span>" +
250 "<span><a href=\"http://www.test.com/foo?page=2\">2</a></span>" +
251 "<span>3</span>" +
252 "<span><a href=\"http://www.test.com/foo?page=4\">4</a></span>" +
253 "<span><a href=\"http://www.test.com/foo?page=5\">5</a></span>" +
254 "<span>...</span>" +
255 "<span><a href=\"http://www.test.com/foo?page=48\">48</a></span>" +
256 "<span><a href=\"http://www.test.com/foo?page=4\">Next &rsaquo;&rsaq uo;</a></span>" +
257 "</div>",
258 "http://www.test.com/foo?page=3");
259 assertEquals(5, info.mAllPageInfo.size());
260 final String urlPrefix = "http://www.test.com/foo?page=";
261 PageParamInfo.PageInfo page = info.mAllPageInfo.get(0);
262 assertEquals(1, page.mPageNum);
263 assertEquals(urlPrefix + "1", page.mUrl);
264 page = info.mAllPageInfo.get(1);
265 assertEquals(2, page.mPageNum);
266 assertEquals(urlPrefix + "2", page.mUrl);
267 page = info.mAllPageInfo.get(2);
268 assertEquals(4, page.mPageNum);
269 assertEquals(urlPrefix + "4", page.mUrl);
270 page = info.mAllPageInfo.get(3);
271 assertEquals(5, page.mPageNum);
272 assertEquals(urlPrefix + "5", page.mUrl);
273 page = info.mAllPageInfo.get(4);
274 assertEquals(48, page.mPageNum);
275 assertEquals(urlPrefix + "48", page.mUrl);
276 assertEquals(urlPrefix + "4", info.mNextPagingUrl);
277 }
278
279 private PageParamInfo processDocument(String content) {
280 // Create and add a <base> element so that all anchors are based off it.
281 BaseElement baseTag = Document.get().createBaseElement();
282 baseTag.setHref(BASE_URL);
283 mHead.appendChild(baseTag);
284
285 // Append content to body.
286 mBody.setInnerHTML(content);
287
288 PageParamInfo info = PageParameterParser.parse(TEST_URL, null);
289 mHead.removeChild(baseTag);
290 return info;
291 }
292
293 private PageParamInfo processDocumentWithoutBase(String content, String orig inalUrl) {
294 // Append content to body.
295 mBody.setInnerHTML(content);
296 return PageParameterParser.parse(originalUrl, null);
297 }
298
299 }
OLDNEW
« no previous file with comments | « java/org/chromium/distiller/ParsedUrl.java ('k') | javatests/org/chromium/distiller/ParsedUrlTest.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698