Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(187)

Side by Side Diff: chrome/common/extensions/matcher/substring_set_matcher.h

Issue 12092096: Move c/c/extensions/matcher/ to top level extension dir (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: TOT Created 7 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifndef CHROME_COMMON_EXTENSIONS_MATCHER_SUBSTRING_SET_MATCHER_H_
6 #define CHROME_COMMON_EXTENSIONS_MATCHER_SUBSTRING_SET_MATCHER_H_
7
8 #include <map>
9 #include <set>
10 #include <string>
11 #include <vector>
12
13 #include "base/basictypes.h"
14 #include "chrome/common/extensions/matcher/string_pattern.h"
15
16 namespace extensions {
17
18 // Class that store a set of string patterns and can find for a string S,
19 // which string patterns occur in S.
20 class SubstringSetMatcher {
21 public:
22 SubstringSetMatcher();
23 ~SubstringSetMatcher();
24
25 // Registers all |patterns|. The ownership remains with the caller.
26 // The same pattern cannot be registered twice and each pattern needs to have
27 // a unique ID.
28 // Ownership of the patterns remains with the caller.
29 void RegisterPatterns(const std::vector<const StringPattern*>& patterns);
30
31 // Unregisters the passed |patterns|.
32 void UnregisterPatterns(const std::vector<const StringPattern*>& patterns);
33
34 // Analogous to RegisterPatterns and UnregisterPatterns but executes both
35 // operations in one step, which is cheaper in the execution.
36 void RegisterAndUnregisterPatterns(
37 const std::vector<const StringPattern*>& to_register,
38 const std::vector<const StringPattern*>& to_unregister);
39
40 // Matches |text| against all registered StringPatterns. Stores the IDs
41 // of matching patterns in |matches|. |matches| is not cleared before adding
42 // to it.
43 bool Match(const std::string& text,
44 std::set<StringPattern::ID>* matches) const;
45
46 // Returns true if this object retains no allocated data. Only for debugging.
47 bool IsEmpty() const;
48
49 private:
50 // A node of an Aho Corasick Tree. This is implemented according to
51 // http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides04.pdf
52 //
53 // The algorithm is based on the idea of building a trie of all registered
54 // patterns. Each node of the tree is annotated with a set of pattern
55 // IDs that are used to report matches.
56 //
57 // The root of the trie represents an empty match. If we were looking whether
58 // any registered pattern matches a text at the beginning of the text (i.e.
59 // whether any pattern is a prefix of the text), we could just follow
60 // nodes in the trie according to the matching characters in the text.
61 // E.g., if text == "foobar", we would follow the trie from the root node
62 // to its child labeled 'f', from there to child 'o', etc. In this process we
63 // would report all pattern IDs associated with the trie nodes as matches.
64 //
65 // As we are not looking for all prefix matches but all substring matches,
66 // this algorithm would need to compare text.substr(0), text.substr(1), ...
67 // against the trie, which is in O(|text|^2).
68 //
69 // The Aho Corasick algorithm improves this runtime by using failure edges.
70 // In case we have found a partial match of length k in the text
71 // (text[i, ..., i + k - 1]) in the trie starting at the root and ending at
72 // a node at depth k, but cannot find a match in the trie for character
73 // text[i + k] at depth k + 1, we follow a failure edge. This edge
74 // corresponds to the longest proper suffix of text[i, ..., i + k - 1] that
75 // is a prefix of any registered pattern.
76 //
77 // If your brain thinks "Forget it, let's go shopping.", don't worry.
78 // Take a nap and read an introductory text on the Aho Corasick algorithm.
79 // It will make sense. Eventually.
80 class AhoCorasickNode {
81 public:
82 // Key: label of the edge, value: node index in |tree_| of parent class.
83 typedef std::map<char, int> Edges;
84 typedef std::set<StringPattern::ID> Matches;
85
86 AhoCorasickNode();
87 ~AhoCorasickNode();
88 AhoCorasickNode(const AhoCorasickNode& other);
89 AhoCorasickNode& operator=(const AhoCorasickNode& other);
90
91 bool HasEdge(char c) const;
92 int GetEdge(char c) const;
93 void SetEdge(char c, int node);
94 const Edges& edges() const { return edges_; }
95
96 int failure() const { return failure_; }
97 void set_failure(int failure) { failure_ = failure; }
98
99 void AddMatch(StringPattern::ID id);
100 void AddMatches(const Matches& matches);
101 const Matches& matches() const { return matches_; }
102
103 private:
104 // Outgoing edges of current node.
105 Edges edges_;
106
107 // Node index that failure edge leads to.
108 int failure_;
109
110 // Identifiers of matches.
111 Matches matches_;
112 };
113
114 void RebuildAhoCorasickTree();
115
116 // Inserts a path for |pattern->pattern()| into the tree and adds
117 // |pattern->id()| to the set of matches. Ownership of |pattern| remains with
118 // the caller.
119 void InsertPatternIntoAhoCorasickTree(const StringPattern* pattern);
120 void CreateFailureEdges();
121
122 // Set of all registered StringPatterns. Used to regenerate the
123 // Aho-Corasick tree in case patterns are registered or unregistered.
124 typedef std::map<StringPattern::ID, const StringPattern*>
125 SubstringPatternSet;
126 SubstringPatternSet patterns_;
127
128 // The nodes of a Aho-Corasick tree.
129 std::vector<AhoCorasickNode> tree_;
130
131 DISALLOW_COPY_AND_ASSIGN(SubstringSetMatcher);
132 };
133
134 } // namespace extensions
135
136 #endif // CHROME_COMMON_EXTENSIONS_MATCHER_SUBSTRING_SET_MATCHER_H_
OLDNEW
« no previous file with comments | « chrome/common/extensions/matcher/string_pattern_unittest.cc ('k') | chrome/common/extensions/matcher/substring_set_matcher.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698