OLD | NEW |
| (Empty) |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #ifndef CHROME_COMMON_EXTENSIONS_MATCHER_SUBSTRING_SET_MATCHER_H_ | |
6 #define CHROME_COMMON_EXTENSIONS_MATCHER_SUBSTRING_SET_MATCHER_H_ | |
7 | |
8 #include <map> | |
9 #include <set> | |
10 #include <string> | |
11 #include <vector> | |
12 | |
13 #include "base/basictypes.h" | |
14 #include "chrome/common/extensions/matcher/string_pattern.h" | |
15 | |
16 namespace extensions { | |
17 | |
18 // Class that store a set of string patterns and can find for a string S, | |
19 // which string patterns occur in S. | |
20 class SubstringSetMatcher { | |
21 public: | |
22 SubstringSetMatcher(); | |
23 ~SubstringSetMatcher(); | |
24 | |
25 // Registers all |patterns|. The ownership remains with the caller. | |
26 // The same pattern cannot be registered twice and each pattern needs to have | |
27 // a unique ID. | |
28 // Ownership of the patterns remains with the caller. | |
29 void RegisterPatterns(const std::vector<const StringPattern*>& patterns); | |
30 | |
31 // Unregisters the passed |patterns|. | |
32 void UnregisterPatterns(const std::vector<const StringPattern*>& patterns); | |
33 | |
34 // Analogous to RegisterPatterns and UnregisterPatterns but executes both | |
35 // operations in one step, which is cheaper in the execution. | |
36 void RegisterAndUnregisterPatterns( | |
37 const std::vector<const StringPattern*>& to_register, | |
38 const std::vector<const StringPattern*>& to_unregister); | |
39 | |
40 // Matches |text| against all registered StringPatterns. Stores the IDs | |
41 // of matching patterns in |matches|. |matches| is not cleared before adding | |
42 // to it. | |
43 bool Match(const std::string& text, | |
44 std::set<StringPattern::ID>* matches) const; | |
45 | |
46 // Returns true if this object retains no allocated data. Only for debugging. | |
47 bool IsEmpty() const; | |
48 | |
49 private: | |
50 // A node of an Aho Corasick Tree. This is implemented according to | |
51 // http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides04.pdf | |
52 // | |
53 // The algorithm is based on the idea of building a trie of all registered | |
54 // patterns. Each node of the tree is annotated with a set of pattern | |
55 // IDs that are used to report matches. | |
56 // | |
57 // The root of the trie represents an empty match. If we were looking whether | |
58 // any registered pattern matches a text at the beginning of the text (i.e. | |
59 // whether any pattern is a prefix of the text), we could just follow | |
60 // nodes in the trie according to the matching characters in the text. | |
61 // E.g., if text == "foobar", we would follow the trie from the root node | |
62 // to its child labeled 'f', from there to child 'o', etc. In this process we | |
63 // would report all pattern IDs associated with the trie nodes as matches. | |
64 // | |
65 // As we are not looking for all prefix matches but all substring matches, | |
66 // this algorithm would need to compare text.substr(0), text.substr(1), ... | |
67 // against the trie, which is in O(|text|^2). | |
68 // | |
69 // The Aho Corasick algorithm improves this runtime by using failure edges. | |
70 // In case we have found a partial match of length k in the text | |
71 // (text[i, ..., i + k - 1]) in the trie starting at the root and ending at | |
72 // a node at depth k, but cannot find a match in the trie for character | |
73 // text[i + k] at depth k + 1, we follow a failure edge. This edge | |
74 // corresponds to the longest proper suffix of text[i, ..., i + k - 1] that | |
75 // is a prefix of any registered pattern. | |
76 // | |
77 // If your brain thinks "Forget it, let's go shopping.", don't worry. | |
78 // Take a nap and read an introductory text on the Aho Corasick algorithm. | |
79 // It will make sense. Eventually. | |
80 class AhoCorasickNode { | |
81 public: | |
82 // Key: label of the edge, value: node index in |tree_| of parent class. | |
83 typedef std::map<char, int> Edges; | |
84 typedef std::set<StringPattern::ID> Matches; | |
85 | |
86 AhoCorasickNode(); | |
87 ~AhoCorasickNode(); | |
88 AhoCorasickNode(const AhoCorasickNode& other); | |
89 AhoCorasickNode& operator=(const AhoCorasickNode& other); | |
90 | |
91 bool HasEdge(char c) const; | |
92 int GetEdge(char c) const; | |
93 void SetEdge(char c, int node); | |
94 const Edges& edges() const { return edges_; } | |
95 | |
96 int failure() const { return failure_; } | |
97 void set_failure(int failure) { failure_ = failure; } | |
98 | |
99 void AddMatch(StringPattern::ID id); | |
100 void AddMatches(const Matches& matches); | |
101 const Matches& matches() const { return matches_; } | |
102 | |
103 private: | |
104 // Outgoing edges of current node. | |
105 Edges edges_; | |
106 | |
107 // Node index that failure edge leads to. | |
108 int failure_; | |
109 | |
110 // Identifiers of matches. | |
111 Matches matches_; | |
112 }; | |
113 | |
114 void RebuildAhoCorasickTree(); | |
115 | |
116 // Inserts a path for |pattern->pattern()| into the tree and adds | |
117 // |pattern->id()| to the set of matches. Ownership of |pattern| remains with | |
118 // the caller. | |
119 void InsertPatternIntoAhoCorasickTree(const StringPattern* pattern); | |
120 void CreateFailureEdges(); | |
121 | |
122 // Set of all registered StringPatterns. Used to regenerate the | |
123 // Aho-Corasick tree in case patterns are registered or unregistered. | |
124 typedef std::map<StringPattern::ID, const StringPattern*> | |
125 SubstringPatternSet; | |
126 SubstringPatternSet patterns_; | |
127 | |
128 // The nodes of a Aho-Corasick tree. | |
129 std::vector<AhoCorasickNode> tree_; | |
130 | |
131 DISALLOW_COPY_AND_ASSIGN(SubstringSetMatcher); | |
132 }; | |
133 | |
134 } // namespace extensions | |
135 | |
136 #endif // CHROME_COMMON_EXTENSIONS_MATCHER_SUBSTRING_SET_MATCHER_H_ | |
OLD | NEW |