Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(298)

Unified Diff: java/org/chromium/distiller/webdocument/filters/NestedElementRetainer.java

Issue 1230583006: Fix for keeping lists structure (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master
Patch Set: canBeNested move out of the switch. Created 5 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: java/org/chromium/distiller/webdocument/filters/NestedElementRetainer.java
diff --git a/java/org/chromium/distiller/webdocument/filters/NestedElementRetainer.java b/java/org/chromium/distiller/webdocument/filters/NestedElementRetainer.java
new file mode 100644
index 0000000000000000000000000000000000000000..47b590fd39645f7c2c09f132cbd77149959d802b
--- /dev/null
+++ b/java/org/chromium/distiller/webdocument/filters/NestedElementRetainer.java
@@ -0,0 +1,49 @@
+package org.chromium.distiller.webdocument.filters;
+
+import org.chromium.distiller.webdocument.WebDocument;
+import org.chromium.distiller.webdocument.WebElement;
+import org.chromium.distiller.webdocument.WebTag;
+
+import java.util.Stack;
+
+/**
+ * This class identifies which {@link WebTag}s should be marked as
+ * <i>isContent</i>, based on the {@link WebElement}s nested inside them.
+ * A {@link WebTag} is content when either of the following holds:
+ * <ul>
+ * <li>It has any {@link WebElement} which is content.</li>
+ * <li>It has at least one nested {@link WebTag} which is content.</li>
+ * </ul>
+ */
+public class NestedElementRetainer {
+ public static void process(WebDocument document) { // Single pass over document.getElements(); sets isContent on every WebTag in place.
+ boolean isContent = false; // Whether a content (non-tag) element has been seen at the current nesting level.
+ int stackMark = -1; // Stack index of the open tag whose nested tag last closed as content; -1 when there is none.
+ Stack<WebTag> stack = new Stack<>(); // Start tags whose end tag has not been reached yet.
+
+ for (WebElement e : document.getElements()) {
+ if (!(e instanceof WebTag)) { // Non-tag element.
+ if (!isContent) { // Latch: once true, the flag stays true until the next tag is reached.
+ isContent = e.getIsContent();
+ }
+ } else {
+ WebTag webTag = (WebTag) e;
+ if (webTag.isStartTag()) {
+ webTag.setIsContent(isContent); // Park the enclosing level's flag on the start tag; read back and overwritten at the matching end tag.
+ stack.push(webTag);
+ isContent = false; // Begin a fresh flag for the newly opened nesting level.
+ } else { // End tag.
+ WebTag startWebTag = stack.pop(); // NOTE(review): assumes balanced tags; Stack.pop() throws EmptyStackException on an empty stack.
+ isContent |= stackMark >= stack.size(); // After the pop, size() is this tag's former index; a mark there means a nested tag was content.
+ if (isContent) {
+ stackMark = stack.size() - 1; // Move the mark to the enclosing tag (-1 if none) so it counts as content when it closes.
+ }
+ boolean wasContent = startWebTag.getIsContent(); // Recover the enclosing level's flag parked on the start tag.
+ startWebTag.setIsContent(isContent); // Final value for the start tag.
+ webTag.setIsContent(isContent); // The end tag receives the same value as its start tag.
+ isContent = wasContent; // Resume tracking for the enclosing level.
+ }
+ }
+ }
+ }
+}
« no previous file with comments | « java/org/chromium/distiller/webdocument/WebText.java ('k') | javatests/org/chromium/distiller/ContentExtractorTest.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698