Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(315)

Side by Side Diff: java/org/chromium/distiller/ContentExtractor.java

Issue 1230583006: Fix for keeping lists structure (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master
Patch Set: canBeNested move out of the switch. Created 5 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | java/org/chromium/distiller/webdocument/DomConverter.java » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 package org.chromium.distiller; 5 package org.chromium.distiller;
6 6
7 import org.chromium.distiller.document.TextDocument; 7 import org.chromium.distiller.document.TextDocument;
8 import org.chromium.distiller.document.TextDocumentStatistics; 8 import org.chromium.distiller.document.TextDocumentStatistics;
9 import org.chromium.distiller.extractors.ArticleExtractor; 9 import org.chromium.distiller.extractors.ArticleExtractor;
10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo; 10 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo;
11 import org.chromium.distiller.proto.DomDistillerProtos.TimingEntry; 11 import org.chromium.distiller.proto.DomDistillerProtos.TimingEntry;
12 import org.chromium.distiller.proto.DomDistillerProtos.TimingInfo; 12 import org.chromium.distiller.proto.DomDistillerProtos.TimingInfo;
13 import org.chromium.distiller.webdocument.DomConverter; 13 import org.chromium.distiller.webdocument.DomConverter;
14 import org.chromium.distiller.webdocument.WebDocument; 14 import org.chromium.distiller.webdocument.WebDocument;
15 import org.chromium.distiller.webdocument.WebDocumentBuilder; 15 import org.chromium.distiller.webdocument.WebDocumentBuilder;
16 import org.chromium.distiller.webdocument.WebImage; 16 import org.chromium.distiller.webdocument.WebImage;
17 import org.chromium.distiller.webdocument.filters.RelevantElements; 17 import org.chromium.distiller.webdocument.filters.RelevantElements;
18 import org.chromium.distiller.webdocument.filters.LeadImageFinder; 18 import org.chromium.distiller.webdocument.filters.LeadImageFinder;
19 import org.chromium.distiller.webdocument.filters.NestedElementRetainer;
19 20
20 import com.google.gwt.dom.client.Document; 21 import com.google.gwt.dom.client.Document;
21 import com.google.gwt.dom.client.Element; 22 import com.google.gwt.dom.client.Element;
22 import com.google.gwt.dom.client.Node; 23 import com.google.gwt.dom.client.Node;
23 import com.google.gwt.dom.client.NodeList; 24 import com.google.gwt.dom.client.NodeList;
24 25
25 import java.util.ArrayList; 26 import java.util.ArrayList;
26 import java.util.LinkedList; 27 import java.util.LinkedList;
27 import java.util.List; 28 import java.util.List;
28 import java.util.Set; 29 import java.util.Set;
(...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after
86 87
87 public String extractContent(boolean textOnly) { 88 public String extractContent(boolean textOnly) {
88 double now = DomUtil.getTime(); 89 double now = DomUtil.getTime();
89 WebDocumentInfo documentInfo = createWebDocumentInfoFromPage(); 90 WebDocumentInfo documentInfo = createWebDocumentInfoFromPage();
90 mTimingInfo.setDocumentConstructionTime(DomUtil.getTime() - now); 91 mTimingInfo.setDocumentConstructionTime(DomUtil.getTime() - now);
91 92
92 now = DomUtil.getTime(); 93 now = DomUtil.getTime();
93 processDocument(documentInfo.document); 94 processDocument(documentInfo.document);
94 RelevantElements.process(documentInfo.document); 95 RelevantElements.process(documentInfo.document);
95 LeadImageFinder.process(documentInfo.document); 96 LeadImageFinder.process(documentInfo.document);
97 NestedElementRetainer.process(documentInfo.document);
96 98
97 List<WebImage> images = documentInfo.document.getContentImages(); 99 List<WebImage> images = documentInfo.document.getContentImages();
98 for (WebImage wi : images) { 100 for (WebImage wi : images) {
99 imageUrls.add(wi.getSrc()); 101 imageUrls.add(wi.getSrc());
100 } 102 }
101 mTimingInfo.setArticleProcessingTime(DomUtil.getTime() - now); 103 mTimingInfo.setArticleProcessingTime(DomUtil.getTime() - now);
102 104
103 now = DomUtil.getTime(); 105 now = DomUtil.getTime();
104 String html = documentInfo.document.generateOutput(textOnly); 106 String html = documentInfo.document.generateOutput(textOnly);
105 mTimingInfo.setFormattingTime(DomUtil.getTime() - now); 107 mTimingInfo.setFormattingTime(DomUtil.getTime() - now);
(...skipping 100 matching lines...) Expand 10 before | Expand all | Expand 10 after
206 * 208 *
207 * @param document the WebDocument representation of the page extracted from the DOM. 209 * @param document the WebDocument representation of the page extracted from the DOM.
208 */ 210 */
209 private void processDocument(WebDocument document) { 211 private void processDocument(WebDocument document) {
210 TextDocument textDocument = document.createTextDocumentView(); 212 TextDocument textDocument = document.createTextDocumentView();
211 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles); 213 ArticleExtractor.INSTANCE.process(textDocument, candidateTitles);
212 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent( textDocument)); 214 mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent( textDocument));
213 textDocument.applyToModel(); 215 textDocument.applyToModel();
214 } 216 }
215 } 217 }
OLD | NEW
« no previous file with comments | « no previous file | java/org/chromium/distiller/webdocument/DomConverter.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698