OLD | NEW |
(Empty) | |
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 #include "components/dom_distiller/core/page_features.h" |
| 6 |
| 7 #include <string> |
| 8 |
| 9 #include "third_party/re2/re2/re2.h" |
| 10 |
| 11 namespace dom_distiller { |
| 12 /* This code needs to derive features in the same way and order in which they |
| 13 * are derived when training the model. Parts of that code are reproduced in the |
| 14 * comments below. |
| 15 */ |
| 16 |
| 17 namespace { |
// Returns the last segment of |path|, keeping a single trailing '/' when one
// is present. This mirrors the training pipeline's
//   re.search('[^/]*\/?$', path).group(0)
// Examples: "/a/b" -> "b", "/a/b/" -> "b/", "/" -> "/", "" -> "".
std::string GetLastSegment(const std::string& path) {
  // With zero or one character the whole path is the match.
  if (path.size() <= 1)
    return path;
  // Begin the backwards search one character before the end so that a
  // trailing '/' is treated as part of the last segment rather than as the
  // separator that precedes it.
  const size_t separator = path.rfind('/', path.size() - 2);
  // No earlier separator: the reference expression matches the entire path.
  return separator == std::string::npos ? path : path.substr(separator + 1);
}
| 25 |
| 26 int CountMatches(const std::string& s, const std::string& p) { |
| 27 // return len(re.findall(p, s)) |
| 28 re2::StringPiece sp(s); |
| 29 re2::RE2 regexp(p); |
| 30 int count = 0; |
| 31 while (re2::RE2::FindAndConsume(&sp, regexp)) |
| 32 count++; |
| 33 return count; |
| 34 } |
| 35 |
| 36 int GetWordCount(const std::string& s) { |
| 37 return CountMatches(s, "\\w+"); |
| 38 } |
| 39 |
// Reports whether the needle |n| occurs anywhere inside the haystack |h|.
// Note the argument order: needle first, haystack second.
bool Contains(const std::string& n, const std::string& h) {
  const std::string::size_type position = h.find(n);
  if (position == std::string::npos)
    return false;
  return true;
}
| 43 |
// Reports whether the string |s| ends with the suffix |t|.
// Note the argument order: suffix first, full string second.
bool EndsWith(const std::string& t, const std::string& s) {
  // A suffix longer than the string can never match.
  if (t.size() > s.size())
    return false;
  const std::string::size_type offset = s.size() - t.size();
  return s.compare(offset, t.size(), t) == 0;
}
| 48 } |
| 49 |
| 50 std::vector<double> CalculateDerivedFeatures(bool isOGArticle, |
| 51 const GURL& url, |
| 52 double numElements, |
| 53 double numAnchors, |
| 54 double numForms, |
| 55 const std::string& innerText, |
| 56 const std::string& textContent, |
| 57 const std::string& innerHTML) { |
| 58 // In the training pipeline, the strings are explicitly encoded in utf-8 (as |
| 59 // they are here). |
| 60 const std::string& path = url.path(); |
| 61 int innerTextWords = GetWordCount(innerText); |
| 62 int textContentWords = GetWordCount(textContent); |
| 63 int innerHTMLWords = GetWordCount(innerHTML); |
| 64 std::vector<double> features; |
| 65 // 'opengraph', opengraph, |
| 66 features.push_back(isOGArticle); |
| 67 // 'forum', 'forum' in path, |
| 68 features.push_back(Contains("forum", path)); |
| 69 // 'index', 'index' in path, |
| 70 features.push_back(Contains("index", path)); |
| 71 // 'view', 'view' in path, |
| 72 features.push_back(Contains("view", path)); |
| 73 // 'asp', '.asp' in path, |
| 74 features.push_back(Contains(".asp", path)); |
| 75 // 'phpbb', 'phpbb' in path, |
| 76 features.push_back(Contains("phpbb", path)); |
| 77 // 'php', path.endswith('.php'), |
| 78 features.push_back(EndsWith(".php", path)); |
| 79 // 'pathlength', len(path), |
| 80 features.push_back(path.size()); |
| 81 // 'domain', len(path) < 2, |
| 82 features.push_back(path.size() < 2); |
| 83 // 'pathcomponents', CountMatches(path, r'\/.'), |
| 84 features.push_back(CountMatches(path, "\\/.")); |
| 85 // 'slugdetector', CountMatches(path, r'[^\w/]'), |
| 86 features.push_back(CountMatches(path, "[^\\w/]")); |
| 87 // 'pathnumbers', CountMatches(path, r'\d+'), |
| 88 features.push_back(CountMatches(path, "\\d+")); |
| 89 // 'lastSegmentLength', len(GetLastSegment(path)), |
| 90 features.push_back(GetLastSegment(path).size()); |
| 91 // 'formcount', numForms, |
| 92 features.push_back(numForms); |
| 93 // 'anchorcount', numAnchors, |
| 94 features.push_back(numAnchors); |
| 95 // 'elementcount', numElements, |
| 96 features.push_back(numElements); |
| 97 // 'anchorratio', float(numAnchors) / max(1, numElements), |
| 98 features.push_back(double(numAnchors) / std::max<double>(1, numElements)); |
| 99 // 'innertextlength', len(innerText), |
| 100 features.push_back(innerText.size()); |
| 101 // 'textcontentlength', len(textContent), |
| 102 features.push_back(textContent.size()); |
| 103 // 'innerhtmllength', len(innerHTML), |
| 104 features.push_back(innerHTML.size()); |
| 105 // 'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)), |
| 106 features.push_back(double(innerText.size()) / |
| 107 std::max<double>(1.0, innerHTML.size())); |
| 108 // 'textcontentlengthratio', float(len(textContent)) / max(1, len(innerHTML)), |
| 109 features.push_back(double(textContent.size()) / |
| 110 std::max<double>(1.0, innerHTML.size())); |
| 111 // 'innertexttextcontentlengthratio', |
| 112 // float(len(innerText)) / max(1, len(textContent)), |
| 113 features.push_back(double(innerText.size()) / |
| 114 std::max<double>(1.0, textContent.size())); |
| 115 // 'innertextwordcount', innerTextWords, |
| 116 features.push_back(innerTextWords); |
| 117 // 'textcontentwordcount', textContentWords, |
| 118 features.push_back(textContentWords); |
| 119 // 'innerhtmlwordcount', innerHTMLWords, |
| 120 features.push_back(innerHTMLWords); |
| 121 // 'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords), |
| 122 features.push_back(double(innerTextWords) / |
| 123 std::max<int>(1.0, innerHTMLWords)); |
| 124 // 'textcontentwordcountratio', |
| 125 // float(textContentWords) / max(1, innerHTMLWords), |
| 126 features.push_back(double(textContentWords) / |
| 127 std::max<int>(1.0, innerHTMLWords)); |
| 128 // 'innertexttextcontentwordcountratio', |
| 129 // float(innerTextWords) / max(1, textContentWords), |
| 130 features.push_back(double(innerTextWords) / |
| 131 std::max<int>(1.0, textContentWords)); |
| 132 return features; |
| 133 } |
| 134 |
| 135 std::vector<double> CalculateDerivedFeaturesFromJSON(const base::Value* json) { |
| 136 const base::DictionaryValue* dict; |
| 137 if (!json->GetAsDictionary(&dict)) { |
| 138 return std::vector<double>(); |
| 139 } |
| 140 |
| 141 bool isOGArticle = false; |
| 142 std::string url, innerText, textContent, innerHTML; |
| 143 double numElements = 0.0, numAnchors = 0.0, numForms = 0.0; |
| 144 |
| 145 if (!(dict->GetBoolean("opengraph", &isOGArticle) && |
| 146 dict->GetString("url", &url) && |
| 147 dict->GetDouble("numElements", &numElements) && |
| 148 dict->GetDouble("numAnchors", &numAnchors) && |
| 149 dict->GetDouble("numForms", &numForms) && |
| 150 dict->GetString("innerText", &innerText) && |
| 151 dict->GetString("textContent", &textContent) && |
| 152 dict->GetString("innerHTML", &innerHTML))) { |
| 153 return std::vector<double>(); |
| 154 } |
| 155 |
| 156 GURL parsed_url(url); |
| 157 if (!parsed_url.is_valid()) { |
| 158 return std::vector<double>(); |
| 159 } |
| 160 |
| 161 return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements, |
| 162 numAnchors, numForms, innerText, textContent, |
| 163 innerHTML); |
| 164 } |
| 165 } |
OLD | NEW |