OLD | NEW |
| (Empty) |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #ifndef CHROME_COMMON_EXTENSIONS_MATCHER_URL_MATCHER_H_ | |
6 #define CHROME_COMMON_EXTENSIONS_MATCHER_URL_MATCHER_H_ | |
7 | |
8 #include <set> | |
9 #include <vector> | |
10 | |
11 #include "base/memory/ref_counted.h" | |
12 #include "base/memory/scoped_ptr.h" | |
13 #include "base/memory/scoped_vector.h" | |
14 #include "chrome/common/extensions/matcher/regex_set_matcher.h" | |
15 #include "chrome/common/extensions/matcher/substring_set_matcher.h" | |
16 | |
17 class GURL; | |
18 | |
// NOTE(review): base::DictionaryValue is forward-declared below but is not
// referenced anywhere else in this header -- confirm it is still needed.
19 namespace base { | |
20 class DictionaryValue; | |
21 } | |
22 | |
23 namespace extensions { | |
24 | |
25 // This class represents a single URL matching condition, e.g. a match on the | |
26 // host suffix or the containment of a string in the query component of a GURL. | |
27 // | |
28 // The difference from a simple StringPattern is that this also supports | |
29 // checking whether the {Host, Path, Query} of a URL contains a string. The | |
30 // reduction of URL matching conditions to StringPatterns conducted by | |
31 // URLMatcherConditionFactory is not capable of expressing that alone. | |
32 // | |
33 // Also supported is matching regular expressions against the URL (URL_MATCHES). | |
34 class URLMatcherCondition { | |
35 public: | |
// Names the URL property a condition tests: a URL part (HOST, PATH, QUERY or
// the whole URL) combined with a match kind (PREFIX, SUFFIX, CONTAINS,
// EQUALS). The HOST_*_PATH_PREFIX values combine a host test with a path
// prefix test; URL_MATCHES is the regular expression case.
36 enum Criterion { | |
37 HOST_PREFIX, | |
38 HOST_SUFFIX, | |
39 HOST_CONTAINS, | |
40 HOST_EQUALS, | |
41 PATH_PREFIX, | |
42 PATH_SUFFIX, | |
43 PATH_CONTAINS, | |
44 PATH_EQUALS, | |
45 QUERY_PREFIX, | |
46 QUERY_SUFFIX, | |
47 QUERY_CONTAINS, | |
48 QUERY_EQUALS, | |
49 HOST_SUFFIX_PATH_PREFIX, | |
50 HOST_EQUALS_PATH_PREFIX, | |
51 URL_PREFIX, | |
52 URL_SUFFIX, | |
53 URL_CONTAINS, | |
54 URL_EQUALS, | |
55 URL_MATCHES, | |
56 }; | |
57 | |
58 URLMatcherCondition(); | |
59 ~URLMatcherCondition(); | |
// Creates a condition for |criterion| backed by the given pattern.
// NOTE(review): the parameter is called |substring_pattern| although the
// member is |string_pattern_| and the pattern presumably may also be a regex
// (URL_MATCHES) -- confirm, and consider aligning the name.
60 URLMatcherCondition(Criterion criterion, | |
61 const StringPattern* substring_pattern); | |
62 URLMatcherCondition(const URLMatcherCondition& rhs); | |
63 URLMatcherCondition& operator=(const URLMatcherCondition& rhs); | |
// Ordering that allows conditions to be kept in a std::set (see
// URLMatcherConditionSet::Conditions).
64 bool operator<(const URLMatcherCondition& rhs) const; | |
65 | |
66 Criterion criterion() const { return criterion_; } | |
67 const StringPattern* string_pattern() const { | |
68 return string_pattern_; | |
69 } | |
70 | |
71 // Returns whether this URLMatcherCondition needs to be executed on a | |
72 // full URL rather than the individual components (see | |
73 // URLMatcherConditionFactory). | |
74 bool IsFullURLCondition() const; | |
75 | |
76 // Returns whether this URLMatcherCondition is a regular expression to be | |
77 // handled by a regex matcher instead of a substring matcher. | |
78 bool IsRegexCondition() const; | |
79 | |
80 // Returns whether this condition is fulfilled according to | |
81 // |matching_patterns| and |url|. | |
82 bool IsMatch(const std::set<StringPattern::ID>& matching_patterns, | |
83 const GURL& url) const; | |
84 | |
85 private: | |
86 // |criterion_| and |string_pattern_| describe together what property a URL | |
87 // needs to fulfill to be considered a match. | |
88 Criterion criterion_; | |
89 | |
90 // This is the StringPattern that is used in a SubstringSetMatcher. | |
// Not owned by this object: per the URLMatcherConditionFactory comment, the
// factory owns the StringPatterns referenced by the conditions it creates.
// NOTE(review): for regex conditions the pattern is presumably handed to a
// RegexSetMatcher rather than a SubstringSetMatcher -- confirm.
91 const StringPattern* string_pattern_; | |
92 }; | |
93 | |
94 // Class to map the problem of finding {host, path, query} {prefixes, suffixes, | |
95 // containments, and equality} in GURLs to the substring matching problem. | |
96 // | |
97 // Say, you want to check whether the path of a URL starts with "/index.html". | |
98 // This class preprocesses a URL like "www.google.com/index.html" into something | |
99 // like "www.google.com|/index.html". After preprocessing, you can search for | |
100 // "|/index.html" in the string and see that this candidate URL actually has | |
101 // a path that starts with "/index.html". On the contrary, | |
102 // "www.google.com/images/index.html" would be normalized to | |
103 // "www.google.com|/images/index.html". It is easy to see that it contains | |
104 // "/index.html" but the path of the URL does not start with "/index.html". | |
105 // | |
106 // This preprocessing is important if you want to match a URL against many | |
107 // patterns because it reduces the matching to a "discover all substrings | |
108 // of a dictionary in a text" problem, which can be solved very efficiently | |
109 // by the Aho-Corasick algorithm. | |
110 // | |
111 // IMPORTANT: The URLMatcherConditionFactory owns the StringPattern | |
112 // referenced by created URLMatcherConditions. Therefore, it must outlive | |
113 // all created URLMatcherCondition and the SubstringSetMatcher. | |
//
// NOTE(review): this header uses std::string but does not include <string>
// itself; presumably it arrives through one of the matcher headers -- confirm.
114 class URLMatcherConditionFactory { | |
115 public: | |
116 URLMatcherConditionFactory(); | |
117 ~URLMatcherConditionFactory(); | |
118 | |
119 // Canonicalizes a URL for "Create{Host,Path,Query}*Condition" searches. | |
120 std::string CanonicalizeURLForComponentSearches(const GURL& url) const; | |
121 | |
122 // Factory methods for various condition types. | |
123 // | |
124 // Note that these methods fill the |*_pattern_singletons_| sets. If you create | |
125 // conditions and don't register them to a URLMatcher, they will continue to | |
126 // consume memory. You need to call ForgetUnusedPatterns() or | |
127 // URLMatcher::ClearUnusedConditionSets() in this case. | |
128 URLMatcherCondition CreateHostPrefixCondition(const std::string& prefix); | |
129 URLMatcherCondition CreateHostSuffixCondition(const std::string& suffix); | |
130 URLMatcherCondition CreateHostContainsCondition(const std::string& str); | |
131 URLMatcherCondition CreateHostEqualsCondition(const std::string& str); | |
132 | |
133 URLMatcherCondition CreatePathPrefixCondition(const std::string& prefix); | |
134 URLMatcherCondition CreatePathSuffixCondition(const std::string& suffix); | |
135 URLMatcherCondition CreatePathContainsCondition(const std::string& str); | |
136 URLMatcherCondition CreatePathEqualsCondition(const std::string& str); | |
137 | |
138 URLMatcherCondition CreateQueryPrefixCondition(const std::string& prefix); | |
139 URLMatcherCondition CreateQuerySuffixCondition(const std::string& suffix); | |
140 URLMatcherCondition CreateQueryContainsCondition(const std::string& str); | |
141 URLMatcherCondition CreateQueryEqualsCondition(const std::string& str); | |
142 | |
143 // This covers the common case, where you don't care whether a domain | |
144 // "foobar.com" is expressed as "foobar.com" or "www.foobar.com", and it | |
145 // should be followed by a given |path_prefix|. | |
146 URLMatcherCondition CreateHostSuffixPathPrefixCondition( | |
147 const std::string& host_suffix, | |
148 const std::string& path_prefix); | |
149 URLMatcherCondition CreateHostEqualsPathPrefixCondition( | |
150 const std::string& host, | |
151 const std::string& path_prefix); | |
152 | |
153 // Canonicalizes a URL for "CreateURL*Condition" searches. | |
154 std::string CanonicalizeURLForFullSearches(const GURL& url) const; | |
155 | |
156 // Canonicalizes a URL for "CreateURLMatchesCondition" searches. | |
157 std::string CanonicalizeURLForRegexSearches(const GURL& url) const; | |
158 | |
159 URLMatcherCondition CreateURLPrefixCondition(const std::string& prefix); | |
160 URLMatcherCondition CreateURLSuffixCondition(const std::string& suffix); | |
161 URLMatcherCondition CreateURLContainsCondition(const std::string& str); | |
162 URLMatcherCondition CreateURLEqualsCondition(const std::string& str); | |
163 | |
// Creates a URL_MATCHES style condition from the regular expression |regex|.
164 URLMatcherCondition CreateURLMatchesCondition(const std::string& regex); | |
165 | |
166 // Removes all patterns from the |*_pattern_singletons_| sets that are not | |
167 // listed in |used_patterns|. These patterns are not referenced any more and | |
168 // get freed. | |
// NOTE(review): this comment used to name a single |pattern_singletons_|
// member; presumably both the substring and the regex set are pruned --
// confirm against the implementation.
169 void ForgetUnusedPatterns( | |
170 const std::set<StringPattern::ID>& used_patterns); | |
171 | |
172 // Returns true if this object retains no allocated data. Only for debugging. | |
173 bool IsEmpty() const; | |
174 | |
175 private: | |
176 // Creates a URLMatcherCondition according to the parameters passed. | |
177 // The URLMatcherCondition will refer to a StringPattern that is | |
178 // owned by one of the |*_pattern_singletons_| sets. | |
179 URLMatcherCondition CreateCondition(URLMatcherCondition::Criterion criterion, | |
180 const std::string& pattern); | |
181 | |
182 // Prepends a "." to the hostname if it does not start with one. | |
183 std::string CanonicalizeHostname(const std::string& hostname) const; | |
184 | |
185 // Counter that ensures that all created StringPatterns have unique IDs. | |
186 // Note that substring patterns and regex patterns will use different IDs. | |
187 int id_counter_; | |
188 | |
189 // This comparison considers only the pattern() value of the | |
190 // StringPatterns. | |
191 struct StringPatternPointerCompare { | |
192 bool operator()(StringPattern* lhs, StringPattern* rhs) const; | |
193 }; | |
194 // Set to ensure that we generate only one StringPattern for each content | |
195 // of StringPattern::pattern(). | |
196 typedef std::set<StringPattern*, StringPatternPointerCompare> | |
197 PatternSingletons; | |
// Separate singleton sets for substring patterns and for regex patterns.
198 PatternSingletons substring_pattern_singletons_; | |
199 PatternSingletons regex_pattern_singletons_; | |
200 | |
201 DISALLOW_COPY_AND_ASSIGN(URLMatcherConditionFactory); | |
202 }; | |
203 | |
204 // This class represents a filter for the URL scheme to be hooked up into a | |
205 // URLMatcherConditionSet. | |
206 class URLMatcherSchemeFilter { | |
207 public: | |
// Constructs a filter from a single scheme |filter| or from a list of
// |filters|.
// NOTE(review): presumably a URL passes if its scheme equals any of the
// stored entries -- confirm against the implementation.
208 explicit URLMatcherSchemeFilter(const std::string& filter); | |
209 explicit URLMatcherSchemeFilter(const std::vector<std::string>& filters); | |
210 ~URLMatcherSchemeFilter(); | |
// Returns whether |url| passes this scheme filter.
211 bool IsMatch(const GURL& url) const; | |
212 | |
213 private: | |
// Filter strings; presumably the ones handed to the constructors.
214 std::vector<std::string> filters_; | |
215 | |
216 DISALLOW_COPY_AND_ASSIGN(URLMatcherSchemeFilter); | |
217 }; | |
218 | |
219 // This class represents a filter for port numbers to be hooked up into a | |
220 // URLMatcherConditionSet. | |
221 class URLMatcherPortFilter { | |
222 public: | |
223 // Boundaries of a port range (both ends are included). | |
// NOTE(review): std::pair is used without a direct #include <utility> in this
// header; presumably it arrives transitively -- confirm.
224 typedef std::pair<int, int> Range; | |
// Constructs a filter from the given port |ranges|.
225 explicit URLMatcherPortFilter(const std::vector<Range>& ranges); | |
226 ~URLMatcherPortFilter(); | |
// Returns whether |url| passes this port filter.
// NOTE(review): presumably true if the URL's port lies in any stored range;
// handling of URLs without an explicit port is not visible here -- confirm.
227 bool IsMatch(const GURL& url) const; | |
228 | |
229 // Creates a port range [from, to]; both ends are included. | |
230 static Range CreateRange(int from, int to); | |
231 // Creates a port range containing a single port. | |
232 static Range CreateRange(int port); | |
233 | |
234 private: | |
// Port ranges; presumably the ones handed to the constructor.
235 std::vector<Range> ranges_; | |
236 | |
237 DISALLOW_COPY_AND_ASSIGN(URLMatcherPortFilter); | |
238 }; | |
239 | |
240 // This class represents a set of conditions that all need to match on a | |
241 // given URL in order to be considered a match. | |
//
// Instances are reference counted (base::RefCounted) and are passed around as
// scoped_refptr, see the |Vector| typedef.
242 class URLMatcherConditionSet : public base::RefCounted<URLMatcherConditionSet> { | |
243 public: | |
// Identifier of a condition set; URLMatcher requires it to be unique among
// the sets registered with it (see URLMatcher::AddConditionSets).
244 typedef int ID; | |
245 typedef std::set<URLMatcherCondition> Conditions; | |
246 typedef std::vector<scoped_refptr<URLMatcherConditionSet> > Vector; | |
247 | |
248 // Matches if all conditions in |conditions| are fulfilled. | |
249 URLMatcherConditionSet(ID id, const Conditions& conditions); | |
250 | |
251 // Matches if all conditions in |conditions|, |scheme_filter| and | |
252 // |port_filter| are fulfilled. |scheme_filter| and |port_filter| may be NULL, | |
253 // in which case, no restrictions are imposed on the scheme/port of a URL. | |
// NOTE(review): the filters are taken as scoped_ptr by value, so ownership is
// presumably transferred to this object -- confirm.
254 URLMatcherConditionSet(ID id, const Conditions& conditions, | |
255 scoped_ptr<URLMatcherSchemeFilter> scheme_filter, | |
256 scoped_ptr<URLMatcherPortFilter> port_filter); | |
257 | |
258 ID id() const { return id_; } | |
259 const Conditions& conditions() const { return conditions_; } | |
260 | |
// Returns whether this condition set is fulfilled for |url|, given the IDs of
// the StringPatterns that matched (|matching_patterns|).
261 bool IsMatch(const std::set<StringPattern::ID>& matching_patterns, | |
262 const GURL& url) const; | |
263 | |
264 private: | |
// The destructor is private and base::RefCounted is a friend, so instances
// cannot be destroyed directly by outside code.
265 friend class base::RefCounted<URLMatcherConditionSet>; | |
266 ~URLMatcherConditionSet(); | |
267 ID id_; | |
268 Conditions conditions_; | |
// Optional filters; may be NULL (see the constructor comment above).
269 scoped_ptr<URLMatcherSchemeFilter> scheme_filter_; | |
270 scoped_ptr<URLMatcherPortFilter> port_filter_; | |
271 | |
272 DISALLOW_COPY_AND_ASSIGN(URLMatcherConditionSet); | |
273 }; | |
274 | |
275 // This class allows matching one URL against a large set of | |
276 // URLMatcherConditionSets at the same time. | |
277 class URLMatcher { | |
278 public: | |
279 URLMatcher(); | |
280 ~URLMatcher(); | |
281 | |
282 // Adds new URLMatcherConditionSet to this URL Matcher. Each condition set | |
283 // must have a unique ID. | |
284 // This is an expensive operation as it triggers pre-calculations on the | |
285 // currently registered condition sets. Do not call this operation many | |
286 // times with a single condition set in each call. | |
287 void AddConditionSets(const URLMatcherConditionSet::Vector& condition_sets); | |
288 | |
289 // Removes the listed condition sets. All |condition_set_ids| must be | |
290 // currently registered. This function should be called with large batches | |
291 // of |condition_set_ids| at a time to improve performance. | |
292 void RemoveConditionSets( | |
293 const std::vector<URLMatcherConditionSet::ID>& condition_set_ids); | |
294 | |
295 // Removes all unused condition sets from the ConditionFactory. | |
296 void ClearUnusedConditionSets(); | |
297 | |
298 // Returns the IDs of all URLMatcherConditionSet that match to this |url|. | |
299 std::set<URLMatcherConditionSet::ID> MatchURL(const GURL& url) const; | |
300 | |
301 // Returns the URLMatcherConditionFactory that must be used to create | |
302 // URLMatcherConditionSets for this URLMatcher. | |
// The returned pointer refers to a member of this object.
303 URLMatcherConditionFactory* condition_factory() { | |
304 return &condition_factory_; | |
305 } | |
306 | |
307 // Returns true if this object retains no allocated data. Only for debugging. | |
308 bool IsEmpty() const; | |
309 | |
310 private: | |
// Helpers that refresh the internal matching state.
// NOTE(review): presumably run after condition sets are added or removed, with
// UpdateInternalDatastructures() driving the others -- confirm.
311 void UpdateSubstringSetMatcher(bool full_url_conditions); | |
312 void UpdateRegexSetMatcher(); | |
313 void UpdateTriggers(); | |
314 void UpdateConditionFactory(); | |
315 void UpdateInternalDatastructures(); | |
316 | |
317 URLMatcherConditionFactory condition_factory_; | |
318 | |
319 // Maps the ID of a URLMatcherConditionSet to the respective | |
320 // URLMatcherConditionSet. | |
// NOTE(review): std::map is used here without a direct #include <map> in this
// header; presumably it arrives through one of the matcher headers -- confirm.
321 typedef std::map<URLMatcherConditionSet::ID, | |
322 scoped_refptr<URLMatcherConditionSet> > | |
323 URLMatcherConditionSets; | |
324 URLMatcherConditionSets url_matcher_condition_sets_; | |
325 | |
326 // Maps a StringPattern ID to the IDs of the URLMatcherConditionSets that | |
327 // need to be triggered in case of a StringPattern match. | |
328 typedef std::map<StringPattern::ID, std::set<URLMatcherConditionSet::ID> > | |
329 StringPatternTriggers; | |
330 StringPatternTriggers substring_match_triggers_; | |
331 | |
// Matchers and the pattern sets registered with the two substring matchers.
// NOTE(review): presumably |full_url_matcher_| serves conditions for which
// URLMatcherCondition::IsFullURLCondition() holds, |url_component_matcher_|
// the remaining substring conditions and |regex_set_matcher_| the regex
// conditions -- confirm.
332 SubstringSetMatcher full_url_matcher_; | |
333 SubstringSetMatcher url_component_matcher_; | |
334 RegexSetMatcher regex_set_matcher_; | |
335 std::set<const StringPattern*> registered_full_url_patterns_; | |
336 std::set<const StringPattern*> registered_url_component_patterns_; | |
337 | |
338 DISALLOW_COPY_AND_ASSIGN(URLMatcher); | |
339 }; | |
340 | |
341 } // namespace extensions | |
342 | |
343 #endif // CHROME_COMMON_EXTENSIONS_MATCHER_URL_MATCHER_H_ | |
OLD | NEW |