OLD | NEW |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "extensions/common/matcher/substring_set_matcher.h" | 5 #include "extensions/common/matcher/substring_set_matcher.h" |
6 | 6 |
| 7 #include <algorithm> |
7 #include <queue> | 8 #include <queue> |
8 | 9 |
9 #include "base/logging.h" | 10 #include "base/logging.h" |
10 #include "base/stl_util.h" | 11 #include "base/stl_util.h" |
11 | 12 |
12 namespace extensions { | 13 namespace extensions { |
13 | 14 |
| 15 namespace { |
| 16 |
| 17 // Compare StringPattern instances based on their string patterns. |
| 18 bool ComparePatterns(const StringPattern* a, const StringPattern* b) { |
| 19 return a->pattern() < b->pattern(); |
| 20 } |
| 21 |
| 22 // Given the set of patterns, compute how many nodes will the corresponding |
| 23 // Aho-Corasick tree have. Note that |patterns| need to be sorted. |
| 24 uint32 TreeSize(const std::vector<const StringPattern*>& patterns) { |
| 25 uint32 result = 1u; // 1 for the root node. |
| 26 if (patterns.empty()) |
| 27 return result; |
| 28 |
| 29 std::vector<const StringPattern*>::const_iterator last = patterns.begin(); |
| 30 std::vector<const StringPattern*>::const_iterator current = last + 1; |
| 31 // For the first pattern, each letter is a label of an edge to a new node. |
| 32 result += (*last)->pattern().size(); |
| 33 |
| 34 // For the subsequent patterns, only count the edges which were not counted |
| 35 // yet. For this it suffices to test against the previous pattern, because the |
| 36 // patterns are sorted. |
| 37 for (; current != patterns.end(); ++last, ++current) { |
| 38 const std::string& last_pattern = (*last)->pattern(); |
| 39 const std::string& current_pattern = (*current)->pattern(); |
| 40 const uint32 prefix_bound = |
| 41 std::min(last_pattern.size(), current_pattern.size()); |
| 42 |
| 43 uint32 common_prefix = 0; |
| 44 while (common_prefix < prefix_bound && |
| 45 last_pattern[common_prefix] == current_pattern[common_prefix]) |
| 46 ++common_prefix; |
| 47 result += current_pattern.size() - common_prefix; |
| 48 } |
| 49 return result; |
| 50 } |
| 51 |
| 52 } // namespace |
| 53 |
14 // | 54 // |
15 // SubstringSetMatcher | 55 // SubstringSetMatcher |
16 // | 56 // |
17 | 57 |
18 SubstringSetMatcher::SubstringSetMatcher() { | 58 SubstringSetMatcher::SubstringSetMatcher() { |
19 RebuildAhoCorasickTree(); | 59 RebuildAhoCorasickTree(SubstringPatternVector()); |
20 } | 60 } |
21 | 61 |
22 SubstringSetMatcher::~SubstringSetMatcher() {} | 62 SubstringSetMatcher::~SubstringSetMatcher() {} |
23 | 63 |
24 void SubstringSetMatcher::RegisterPatterns( | 64 void SubstringSetMatcher::RegisterPatterns( |
25 const std::vector<const StringPattern*>& patterns) { | 65 const std::vector<const StringPattern*>& patterns) { |
26 RegisterAndUnregisterPatterns(patterns, | 66 RegisterAndUnregisterPatterns(patterns, |
27 std::vector<const StringPattern*>()); | 67 std::vector<const StringPattern*>()); |
28 } | 68 } |
29 | 69 |
(...skipping 12 matching lines...) Expand all Loading... |
42 DCHECK(patterns_.find((*i)->id()) == patterns_.end()); | 82 DCHECK(patterns_.find((*i)->id()) == patterns_.end()); |
43 patterns_[(*i)->id()] = *i; | 83 patterns_[(*i)->id()] = *i; |
44 } | 84 } |
45 | 85 |
46 // Unregister patterns | 86 // Unregister patterns |
47 for (std::vector<const StringPattern*>::const_iterator i = | 87 for (std::vector<const StringPattern*>::const_iterator i = |
48 to_unregister.begin(); i != to_unregister.end(); ++i) { | 88 to_unregister.begin(); i != to_unregister.end(); ++i) { |
49 patterns_.erase((*i)->id()); | 89 patterns_.erase((*i)->id()); |
50 } | 90 } |
51 | 91 |
52 RebuildAhoCorasickTree(); | 92 // Now we compute the total number of tree nodes needed. |
| 93 SubstringPatternVector sorted_patterns; |
| 94 sorted_patterns.resize(patterns_.size()); |
| 95 |
| 96 size_t next = 0; |
| 97 for (SubstringPatternMap::const_iterator i = patterns_.begin(); |
| 98 i != patterns_.end(); |
| 99 ++i, ++next) { |
| 100 sorted_patterns[next] = i->second; |
| 101 } |
| 102 |
| 103 std::sort(sorted_patterns.begin(), sorted_patterns.end(), ComparePatterns); |
| 104 tree_.reserve(TreeSize(sorted_patterns)); |
| 105 |
| 106 RebuildAhoCorasickTree(sorted_patterns); |
53 } | 107 } |
54 | 108 |
55 bool SubstringSetMatcher::Match(const std::string& text, | 109 bool SubstringSetMatcher::Match(const std::string& text, |
56 std::set<StringPattern::ID>* matches) const { | 110 std::set<StringPattern::ID>* matches) const { |
57 size_t old_number_of_matches = matches->size(); | 111 const size_t old_number_of_matches = matches->size(); |
58 | 112 |
59 // Handle patterns matching the empty string. | 113 // Handle patterns matching the empty string. |
60 matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); | 114 matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); |
61 | 115 |
62 int current_node = 0; | 116 uint32 current_node = 0; |
63 size_t text_length = text.length(); | 117 for (std::string::const_iterator i = text.begin(); i != text.end(); ++i) { |
64 for (size_t i = 0; i < text_length; ++i) { | 118 uint32 edge_from_current = tree_[current_node].GetEdge(*i); |
65 while (!tree_[current_node].HasEdge(text[i]) && current_node != 0) | 119 while (edge_from_current == AhoCorasickNode::kNoSuchEdge && |
| 120 current_node != 0) { |
66 current_node = tree_[current_node].failure(); | 121 current_node = tree_[current_node].failure(); |
67 if (tree_[current_node].HasEdge(text[i])) { | 122 edge_from_current = tree_[current_node].GetEdge(*i); |
68 current_node = tree_[current_node].GetEdge(text[i]); | 123 } |
| 124 if (edge_from_current != AhoCorasickNode::kNoSuchEdge) { |
| 125 current_node = edge_from_current; |
69 matches->insert(tree_[current_node].matches().begin(), | 126 matches->insert(tree_[current_node].matches().begin(), |
70 tree_[current_node].matches().end()); | 127 tree_[current_node].matches().end()); |
71 } else { | 128 } else { |
72 DCHECK_EQ(0, current_node); | 129 DCHECK_EQ(0u, current_node); |
73 } | 130 } |
74 } | 131 } |
75 | 132 |
76 return old_number_of_matches != matches->size(); | 133 return old_number_of_matches != matches->size(); |
77 } | 134 } |
78 | 135 |
79 bool SubstringSetMatcher::IsEmpty() const { | 136 bool SubstringSetMatcher::IsEmpty() const { |
80 // An empty tree consists of only the root node. | 137 // An empty tree consists of only the root node. |
81 return patterns_.empty() && tree_.size() == 1u; | 138 return patterns_.empty() && tree_.size() == 1u; |
82 } | 139 } |
83 | 140 |
84 void SubstringSetMatcher::RebuildAhoCorasickTree() { | 141 void SubstringSetMatcher::RebuildAhoCorasickTree( |
| 142 const SubstringPatternVector& sorted_patterns) { |
85 tree_.clear(); | 143 tree_.clear(); |
86 | 144 |
87 // Initialize root note of tree. | 145 // Initialize root note of tree. |
88 AhoCorasickNode root; | 146 AhoCorasickNode root; |
89 root.set_failure(0); | 147 root.set_failure(0); |
90 tree_.push_back(root); | 148 tree_.push_back(root); |
91 | 149 |
92 // Insert all patterns. | 150 // Insert all patterns. |
93 for (SubstringPatternSet::const_iterator i = patterns_.begin(); | 151 for (SubstringPatternVector::const_iterator i = sorted_patterns.begin(); |
94 i != patterns_.end(); ++i) { | 152 i != sorted_patterns.end(); |
95 InsertPatternIntoAhoCorasickTree(i->second); | 153 ++i) { |
| 154 InsertPatternIntoAhoCorasickTree(*i); |
96 } | 155 } |
97 | 156 |
98 CreateFailureEdges(); | 157 CreateFailureEdges(); |
99 } | 158 } |
100 | 159 |
101 void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( | 160 void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( |
102 const StringPattern* pattern) { | 161 const StringPattern* pattern) { |
103 const std::string& text = pattern->pattern(); | 162 const std::string& text = pattern->pattern(); |
104 size_t text_length = text.length(); | 163 const std::string::const_iterator text_end = text.end(); |
105 | 164 |
106 // Iterators on the tree and the text. | 165 // Iterators on the tree and the text. |
107 int current_node = 0; | 166 uint32 current_node = 0; |
108 size_t text_pos = 0; | 167 std::string::const_iterator i = text.begin(); |
109 | 168 |
110 // Follow existing paths for as long as possible. | 169 // Follow existing paths for as long as possible. |
111 while (text_pos < text_length && | 170 while (i != text_end) { |
112 tree_[current_node].HasEdge(text[text_pos])) { | 171 uint32 edge_from_current = tree_[current_node].GetEdge(*i); |
113 current_node = tree_[current_node].GetEdge(text[text_pos]); | 172 if (edge_from_current == AhoCorasickNode::kNoSuchEdge) |
114 ++text_pos; | 173 break; |
| 174 current_node = edge_from_current; |
| 175 ++i; |
115 } | 176 } |
116 | 177 |
117 // Create new nodes if necessary. | 178 // Create new nodes if necessary. |
118 while (text_pos < text_length) { | 179 while (i != text_end) { |
119 AhoCorasickNode new_node; | 180 tree_.push_back(AhoCorasickNode()); |
120 tree_.push_back(new_node); | 181 tree_[current_node].SetEdge(*i, tree_.size() - 1); |
121 tree_[current_node].SetEdge(text[text_pos], tree_.size() - 1); | |
122 current_node = tree_.size() - 1; | 182 current_node = tree_.size() - 1; |
123 ++text_pos; | 183 ++i; |
124 } | 184 } |
125 | 185 |
126 // Register match. | 186 // Register match. |
127 tree_[current_node].AddMatch(pattern->id()); | 187 tree_[current_node].AddMatch(pattern->id()); |
128 } | 188 } |
129 | 189 |
130 void SubstringSetMatcher::CreateFailureEdges() { | 190 void SubstringSetMatcher::CreateFailureEdges() { |
131 typedef AhoCorasickNode::Edges Edges; | 191 typedef AhoCorasickNode::Edges Edges; |
132 | 192 |
133 std::queue<int> queue; | 193 std::queue<uint32> queue; |
134 | 194 |
135 AhoCorasickNode& root = tree_[0]; | 195 AhoCorasickNode& root = tree_[0]; |
136 root.set_failure(0); | 196 root.set_failure(0); |
137 const AhoCorasickNode::Edges& root_edges = root.edges(); | 197 const Edges& root_edges = root.edges(); |
138 for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end(); | 198 for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end(); |
139 ++e) { | 199 ++e) { |
140 int leads_to = e->second; | 200 const uint32& leads_to = e->second; |
141 tree_[leads_to].set_failure(0); | 201 tree_[leads_to].set_failure(0); |
142 queue.push(leads_to); | 202 queue.push(leads_to); |
143 } | 203 } |
144 | 204 |
145 while (!queue.empty()) { | 205 while (!queue.empty()) { |
146 AhoCorasickNode& current_node = tree_[queue.front()]; | 206 AhoCorasickNode& current_node = tree_[queue.front()]; |
147 queue.pop(); | 207 queue.pop(); |
148 for (Edges::const_iterator e = current_node.edges().begin(); | 208 for (Edges::const_iterator e = current_node.edges().begin(); |
149 e != current_node.edges().end(); ++e) { | 209 e != current_node.edges().end(); ++e) { |
150 char edge_label = e->first; | 210 const char& edge_label = e->first; |
151 int leads_to = e->second; | 211 const uint32& leads_to = e->second; |
152 queue.push(leads_to); | 212 queue.push(leads_to); |
153 | 213 |
154 int failure = current_node.failure(); | 214 uint32 failure = current_node.failure(); |
155 while (!tree_[failure].HasEdge(edge_label) && failure != 0) | 215 uint32 edge_from_failure = tree_[failure].GetEdge(edge_label); |
| 216 while (edge_from_failure == AhoCorasickNode::kNoSuchEdge && |
| 217 failure != 0) { |
156 failure = tree_[failure].failure(); | 218 failure = tree_[failure].failure(); |
| 219 edge_from_failure = tree_[failure].GetEdge(edge_label); |
| 220 } |
157 | 221 |
158 int follow_in_case_of_failure = tree_[failure].HasEdge(e->first) ? | 222 const uint32 follow_in_case_of_failure = |
159 tree_[failure].GetEdge(edge_label) : 0; | 223 edge_from_failure != AhoCorasickNode::kNoSuchEdge |
| 224 ? edge_from_failure |
| 225 : 0; |
160 tree_[leads_to].set_failure(follow_in_case_of_failure); | 226 tree_[leads_to].set_failure(follow_in_case_of_failure); |
161 tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches()); | 227 tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches()); |
162 } | 228 } |
163 } | 229 } |
164 } | 230 } |
165 | 231 |
| 232 const uint32 SubstringSetMatcher::AhoCorasickNode::kNoSuchEdge = ~0; |
| 233 |
166 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() | 234 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() |
167 : failure_(-1) {} | 235 : failure_(kNoSuchEdge) {} |
168 | 236 |
169 SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} | 237 SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} |
170 | 238 |
171 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode( | 239 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode( |
172 const SubstringSetMatcher::AhoCorasickNode& other) | 240 const SubstringSetMatcher::AhoCorasickNode& other) |
173 : edges_(other.edges_), | 241 : edges_(other.edges_), |
174 failure_(other.failure_), | 242 failure_(other.failure_), |
175 matches_(other.matches_) {} | 243 matches_(other.matches_) {} |
176 | 244 |
177 SubstringSetMatcher::AhoCorasickNode& | 245 SubstringSetMatcher::AhoCorasickNode& |
178 SubstringSetMatcher::AhoCorasickNode::operator=( | 246 SubstringSetMatcher::AhoCorasickNode::operator=( |
179 const SubstringSetMatcher::AhoCorasickNode& other) { | 247 const SubstringSetMatcher::AhoCorasickNode& other) { |
180 edges_ = other.edges_; | 248 edges_ = other.edges_; |
181 failure_ = other.failure_; | 249 failure_ = other.failure_; |
182 matches_ = other.matches_; | 250 matches_ = other.matches_; |
183 return *this; | 251 return *this; |
184 } | 252 } |
185 | 253 |
186 bool SubstringSetMatcher::AhoCorasickNode::HasEdge(char c) const { | 254 uint32 SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { |
187 return edges_.find(c) != edges_.end(); | 255 Edges::const_iterator i = edges_.find(c); |
| 256 return i == edges_.end() ? kNoSuchEdge : i->second; |
188 } | 257 } |
189 | 258 |
190 int SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { | 259 void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, uint32 node) { |
191 std::map<char, int>::const_iterator i = edges_.find(c); | |
192 return i == edges_.end() ? -1 : i->second; | |
193 } | |
194 | |
195 void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, int node) { | |
196 edges_[c] = node; | 260 edges_[c] = node; |
197 } | 261 } |
198 | 262 |
199 void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) { | 263 void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) { |
200 matches_.insert(id); | 264 matches_.insert(id); |
201 } | 265 } |
202 | 266 |
203 void SubstringSetMatcher::AhoCorasickNode::AddMatches( | 267 void SubstringSetMatcher::AhoCorasickNode::AddMatches( |
204 const SubstringSetMatcher::AhoCorasickNode::Matches& matches) { | 268 const SubstringSetMatcher::AhoCorasickNode::Matches& matches) { |
205 matches_.insert(matches.begin(), matches.end()); | 269 matches_.insert(matches.begin(), matches.end()); |
206 } | 270 } |
207 | 271 |
208 } // namespace extensions | 272 } // namespace extensions |
OLD | NEW |