Index: components/dom_distiller/core/page_features.h |
diff --git a/components/dom_distiller/core/page_features.h b/components/dom_distiller/core/page_features.h |
new file mode 100644 |
index 0000000000000000000000000000000000000000..919a90a1e4d18857140c8c8d964e54d6567fbb98 |
--- /dev/null |
+++ b/components/dom_distiller/core/page_features.h |
@@ -0,0 +1,39 @@ |
+// Copyright 2015 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+#ifndef COMPONENTS_DOM_DISTILLER_CORE_PAGE_FEATURES_H_ |
+#define COMPONENTS_DOM_DISTILLER_CORE_PAGE_FEATURES_H_ |
+ |
+#include <string> |
+#include <vector> |
+ |
+#include "base/values.h" |
+#include "url/gurl.h" |
+ |
+namespace dom_distiller { |
+ |
+// The distillable page detector is a model trained on a list of numeric |
+// features that are derived from more complex "core" features of a webpage |
+// (like the body's .textContent). This function derives those numeric features |
+// from the set of core features passed in as arguments. |
+// |
+// Note: It is crucial that these features are derived in the same way and are |
+// in the same order as in the training pipeline. See //heuristics/distillable |
+// in the external DomDistillerJs repo. |
+std::vector<double> CalculateDerivedFeatures(bool isOGArticle, |
+ const GURL& url, |
+ double numElements, |
+ double numAnchors, |
+ double numForms, |
+ const std::string& innerText, |
+ const std::string& textContent, |
+ const std::string& innerHTML); |
+ |
+// Calculates the derived features from |json|, the JSON value returned by the |
+// JavaScript core feature extraction. |
+std::vector<double> CalculateDerivedFeaturesFromJSON(const base::Value* json); |
+ |
+} // namespace dom_distiller |
+ |
+#endif // COMPONENTS_DOM_DISTILLER_CORE_PAGE_FEATURES_H_ |