Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(123)

Unified Diff: components/dom_distiller/core/page_features.cc

Issue 1042053003: Add calculation of derived features for distillable page model (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: rebase Created 5 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: components/dom_distiller/core/page_features.cc
diff --git a/components/dom_distiller/core/page_features.cc b/components/dom_distiller/core/page_features.cc
new file mode 100644
index 0000000000000000000000000000000000000000..057adbf924fad00b3c3c57a5edef0c236698722e
--- /dev/null
+++ b/components/dom_distiller/core/page_features.cc
@@ -0,0 +1,165 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "components/dom_distiller/core/page_features.h"
+
+#include <string>
+
+#include "third_party/re2/re2/re2.h"
+
+namespace dom_distiller {
+/* This code needs to derive features in the same way and order in which they
+ * are derived when training the model. Parts of that code are reproduced in the
+ * comments below.
+ */
+
+namespace {
+std::string GetLastSegment(const std::string& path) {
+ // re.search('[^/]*\/?$', path).group(0): last segment plus its trailing '/'.
+ if (path.size() < 2)
+ return path;
+ size_t start = path.rfind('/', path.size() - 2);
+ return start == std::string::npos ? path : path.substr(start + 1);
+}
+
+int CountMatches(const std::string& s, const std::string& p) {
+ // return len(re.findall(p, s))
+ // Counts how many times FindAndConsume() succeeds on what is left of |s|.
+ const re2::RE2 regexp(p);
+ re2::StringPiece remaining(s);
+ int matches = 0;
+ for (; re2::RE2::FindAndConsume(&remaining, regexp); ++matches) {}
+ return matches;
+}
+
+int GetWordCount(const std::string& s) {
+ return CountMatches(s, "\\w+");  // Counts the matches of \w+ in |s|.
+}
+
+bool Contains(const std::string& n, const std::string& h) {
+ return h.find(n) != std::string::npos;  // True iff |n| occurs within |h|.
+}
+
+bool EndsWith(const std::string& t, const std::string& s) {
+ const bool fits = t.size() <= s.size();  // |t| is the suffix to look for.
+ return fits && s.compare(s.size() - t.size(), t.size(), t) == 0;
+}
+}
+
+// Derives the model's input features. The values must stay in the order used
+// when training the model; each one is preceded by its training-side formula.
+std::vector<double> CalculateDerivedFeatures(bool isOGArticle,
+ const GURL& url,
+ double numElements,
+ double numAnchors,
+ double numForms,
+ const std::string& innerText,
+ const std::string& textContent,
+ const std::string& innerHTML) {
+ // In the training pipeline, the strings are explicitly encoded in utf-8 (as
+ // they are here).
+ const std::string& path = url.path();
+ const double innerTextLength = innerText.size();
+ const double textContentLength = textContent.size();
+ const double innerHTMLLength = innerHTML.size();
+ const double innerTextWords = GetWordCount(innerText);
+ const double textContentWords = GetWordCount(textContent);
+ const double innerHTMLWords = GetWordCount(innerHTML);
+ std::vector<double> features;
+ features.reserve(29);  // One slot per push_back() below.
+ // 'opengraph', opengraph,
+ features.push_back(isOGArticle);
+ // 'forum', 'forum' in path,
+ features.push_back(Contains("forum", path));
+ // 'index', 'index' in path,
+ features.push_back(Contains("index", path));
+ // 'view', 'view' in path,
+ features.push_back(Contains("view", path));
+ // 'asp', '.asp' in path,
+ features.push_back(Contains(".asp", path));
+ // 'phpbb', 'phpbb' in path,
+ features.push_back(Contains("phpbb", path));
+ // 'php', path.endswith('.php'),
+ features.push_back(EndsWith(".php", path));
+ // 'pathlength', len(path),
+ features.push_back(path.size());
+ // 'domain', len(path) < 2,
+ features.push_back(path.size() < 2);
+ // 'pathcomponents', CountMatches(path, r'\/.'),
+ features.push_back(CountMatches(path, "\\/."));
+ // 'slugdetector', CountMatches(path, r'[^\w/]'),
+ features.push_back(CountMatches(path, "[^\\w/]"));
+ // 'pathnumbers', CountMatches(path, r'\d+'),
+ features.push_back(CountMatches(path, "\\d+"));
+ // 'lastSegmentLength', len(GetLastSegment(path)),
+ features.push_back(GetLastSegment(path).size());
+ // 'formcount', numForms,
+ features.push_back(numForms);
+ // 'anchorcount', numAnchors,
+ features.push_back(numAnchors);
+ // 'elementcount', numElements,
+ features.push_back(numElements);
+ // 'anchorratio', float(numAnchors) / max(1, numElements),
+ features.push_back(numAnchors / std::max(1.0, numElements));
+ // 'innertextlength', len(innerText),
+ features.push_back(innerTextLength);
+ // 'textcontentlength', len(textContent),
+ features.push_back(textContentLength);
+ // 'innerhtmllength', len(innerHTML),
+ features.push_back(innerHTMLLength);
+ // 'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)),
+ features.push_back(innerTextLength / std::max(1.0, innerHTMLLength));
+ // 'textcontentlengthratio', float(len(textContent)) / max(1, len(innerHTML)),
+ features.push_back(textContentLength / std::max(1.0, innerHTMLLength));
+ // 'innertexttextcontentlengthratio',
+ // float(len(innerText)) / max(1, len(textContent)),
+ features.push_back(innerTextLength / std::max(1.0, textContentLength));
+ // 'innertextwordcount', innerTextWords,
+ features.push_back(innerTextWords);
+ // 'textcontentwordcount', textContentWords,
+ features.push_back(textContentWords);
+ // 'innerhtmlwordcount', innerHTMLWords,
+ features.push_back(innerHTMLWords);
+ // 'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords),
+ features.push_back(innerTextWords / std::max(1.0, innerHTMLWords));
+ // 'textcontentwordcountratio',
+ // float(textContentWords) / max(1, innerHTMLWords),
+ features.push_back(textContentWords / std::max(1.0, innerHTMLWords));
+ // 'innertexttextcontentwordcountratio',
+ // float(innerTextWords) / max(1, textContentWords),
+ features.push_back(innerTextWords / std::max(1.0, textContentWords));
+ return features;
+}
+
+// Reads the page statistics out of the dictionary |json| and forwards them to
+// CalculateDerivedFeatures(). Returns an empty vector if |json| is null or not
+// a dictionary, if any expected key cannot be read, or if "url" is invalid.
+std::vector<double> CalculateDerivedFeaturesFromJSON(const base::Value* json) {
+ const base::DictionaryValue* dict;
+ if (!json || !json->GetAsDictionary(&dict))
+ return std::vector<double>();
+
+ bool isOGArticle = false;
+ std::string url, innerText, textContent, innerHTML;
+ double numElements = 0.0, numAnchors = 0.0, numForms = 0.0;
+
+ if (!(dict->GetBoolean("opengraph", &isOGArticle) &&
+ dict->GetString("url", &url) &&
+ dict->GetDouble("numElements", &numElements) &&
+ dict->GetDouble("numAnchors", &numAnchors) &&
+ dict->GetDouble("numForms", &numForms) &&
+ dict->GetString("innerText", &innerText) &&
+ dict->GetString("textContent", &textContent) &&
+ dict->GetString("innerHTML", &innerHTML))) {
+ return std::vector<double>();
+ }
+
+ GURL parsed_url(url);
+ if (!parsed_url.is_valid())
+ return std::vector<double>();
+ return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements,
+ numAnchors, numForms, innerText, textContent,
+ innerHTML);
+}
+}
« no previous file with comments | « components/dom_distiller/core/page_features.h ('k') | components/dom_distiller/core/page_features_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698