OLD | NEW |
| (Empty) |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "chrome/common/extensions/matcher/url_matcher.h" | |
6 | |
7 #include <algorithm> | |
8 #include <iterator> | |
9 | |
10 #include "base/logging.h" | |
11 #include "content/public/common/url_constants.h" | |
12 #include "googleurl/src/gurl.h" | |
13 #include "googleurl/src/url_canon.h" | |
14 | |
15 namespace extensions { | |
16 | |
17 // This set of classes implement a mapping of URL Component Patterns, such as | |
18 // host_prefix, host_suffix, host_equals, ..., etc., to StringPatterns | |
19 // for use in substring comparisons. | |
20 // | |
21 // The idea of this mapping is to reduce the problem of comparing many | |
22 // URL Component Patterns against one URL to the problem of searching many | |
23 // substrings in one string: | |
24 // | |
25 // ---------------------- ----------------- | |
26 // | URL Query operator | ----translate----> | StringPattern | | |
27 // ---------------------- ----------------- | |
28 // ^ | |
29 // | | |
30 // compare | |
31 // | | |
32 // v | |
33 // ---------------------- ----------------- | |
34 // | URL to compare | | | | |
35 // | to all URL Query | ----translate----> | String | | |
36 // | operators | | | | |
37 // ---------------------- ----------------- | |
38 // | |
39 // The reason for this problem reduction is that there are efficient algorithms | |
40 // for searching many substrings in one string (see Aho-Corasick algorithm). | |
41 // | |
42 // Additionally, some of the same pieces are reused to implement regular | |
43 // expression comparisons. The FilteredRE2 implementation for matching many | |
44 // regular expressions against one string uses prefiltering, in which a set | |
45 // of substrings (derived from the regexes) are first searched for, to reduce | |
46 // the number of regular expressions to test; the prefiltering step also | |
47 // uses Aho-Corasick. | |
48 // | |
49 // Case 1: {host,path,query}_{prefix,suffix,equals} searches. | |
50 // ========================================================== | |
51 // | |
52 // For searches in this class, we normalize URLs as follows: | |
53 // | |
54 // Step 1: | |
55 // Remove scheme, port and segment from URL: | |
56 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes | |
57 // www.example.com/index.html?search=foo | |
58 // | |
59 // We remove the scheme and port number because they can be checked later | |
60 // in a secondary filter step. We remove the segment (the #... part) because | |
61 // this is not guaranteed to be ASCII-7 encoded. | |
62 // | |
63 // Step 2: | |
64 // Translate URL to String and add the following position markers: | |
65 // - BU = Beginning of URL | |
66 // - ED = End of Domain | |
67 // - EP = End of Path | |
68 // - EU = End of URL | |
69 // Furthermore, the hostname is canonicalized to start with a ".". | |
70 // | |
71 // Position markers are represented as characters >127, which are therefore | |
72 // guaranteed not to be part of the ASCII-7 encoded URL character set. | |
73 // | |
74 // -> www.example.com/index.html?search=foo becomes | |
75 // BU .www.example.com ED /index.html EP ?search=foo EU | |
76 // | |
77 // -> www.example.com/index.html becomes | |
78 // BU .www.example.com ED /index.html EP EU | |
79 // | |
80 // Step 3: | |
81 // Translate URL Component Patterns as follows: | |
82 // | |
83 // host_prefix(prefix) = BU add_missing_dot_prefix(prefix) | |
84 // -> host_prefix("www.example") = BU .www.example | |
85 // | |
86 // host_suffix(suffix) = suffix ED | |
87 // -> host_suffix("example.com") = example.com ED | |
88 // -> host_suffix(".example.com") = .example.com ED | |
89 // | |
90 // host_equals(domain) = BU add_missing_dot_prefix(domain) ED | |
91 // -> host_equals("www.example.com") = BU .www.example.com ED | |
92 // | |
93 // Similarly for path query parameters ({path, query}_{prefix, suffix, equals}). | |
94 // | |
95 // With this, we can search the StringPatterns in the normalized URL. | |
96 // | |
97 // | |
98 // Case 2: url_{prefix,suffix,equals,contains} searches. | |
99 // ===================================================== | |
100 // | |
101 // Step 1: as above, except that | |
102 // - the scheme is not removed | |
103 // - the port is not removed if it is specified and does not match the default | |
104 // port for the given scheme. | |
105 // | |
106 // Step 2: | |
107 // Translate URL to String and add the following position markers: | |
108 // - BU = Beginning of URL | |
109 // - EU = End of URL | |
110 // | |
111 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes | |
112 // BU http://www.example.com:8080/index.html?search=foo EU | |
113 // -> http://www.example.com:80/index.html?search=foo#first_match becomes | |
114 // BU http://www.example.com/index.html?search=foo EU | |
115 // | |
116 // url_prefix(prefix) = BU prefix | |
117 // -> url_prefix("http://www.example") = BU http://www.example | |
118 // | |
119 // url_contains(substring) = substring | |
120 // -> url_contains("index") = index | |
121 // | |
122 // | |
123 // Case 3: {host,path,query}_contains searches. | |
124 // ============================================ | |
125 // | |
126 // These kinds of searches are not supported directly but can be derived | |
127 // by a combination of a url_contains() query followed by an explicit test: | |
128 // | |
129 // host_contains(str) = url_contains(str) followed by test whether str occurs | |
130 // in host component of original URL. | |
131 // -> host_contains("example.co") = example.co | |
132 // followed by gurl.host().find("example.co"); | |
133 // | |
134 // [similarly for path_contains and query_contains]. | |
135 // | |
136 // | |
137 // Regular expression matching (url_matches searches) | |
138 // ================================================== | |
139 // | |
140 // This class also supports matching regular expressions (RE2 syntax) | |
141 // against full URLs, which are transformed as in case 2. | |
142 | |
143 namespace { | |
144 | |
145 bool IsRegexCriterion(URLMatcherCondition::Criterion criterion) { | |
146 return criterion == URLMatcherCondition::URL_MATCHES; | |
147 } | |
148 | |
149 } // namespace | |
150 | |
151 // | |
152 // URLMatcherCondition | |
153 // | |
154 | |
155 URLMatcherCondition::URLMatcherCondition() | |
156 : criterion_(HOST_PREFIX), | |
157 string_pattern_(NULL) {} | |
158 | |
159 URLMatcherCondition::~URLMatcherCondition() {} | |
160 | |
161 URLMatcherCondition::URLMatcherCondition( | |
162 Criterion criterion, | |
163 const StringPattern* string_pattern) | |
164 : criterion_(criterion), | |
165 string_pattern_(string_pattern) {} | |
166 | |
167 URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition& rhs) | |
168 : criterion_(rhs.criterion_), | |
169 string_pattern_(rhs.string_pattern_) {} | |
170 | |
171 URLMatcherCondition& URLMatcherCondition::operator=( | |
172 const URLMatcherCondition& rhs) { | |
173 criterion_ = rhs.criterion_; | |
174 string_pattern_ = rhs.string_pattern_; | |
175 return *this; | |
176 } | |
177 | |
178 bool URLMatcherCondition::operator<(const URLMatcherCondition& rhs) const { | |
179 if (criterion_ < rhs.criterion_) return true; | |
180 if (criterion_ > rhs.criterion_) return false; | |
181 if (string_pattern_ != NULL && rhs.string_pattern_ != NULL) | |
182 return *string_pattern_ < *rhs.string_pattern_; | |
183 if (string_pattern_ == NULL && rhs.string_pattern_ != NULL) return true; | |
184 // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL, | |
185 // or both are NULL. | |
186 return false; | |
187 } | |
188 | |
189 bool URLMatcherCondition::IsFullURLCondition() const { | |
190 // For these criteria the SubstringMatcher needs to be executed on the | |
191 // GURL that is canonicalized with | |
192 // URLMatcherConditionFactory::CanonicalizeURLForFullSearches. | |
193 switch (criterion_) { | |
194 case HOST_CONTAINS: | |
195 case PATH_CONTAINS: | |
196 case QUERY_CONTAINS: | |
197 case URL_PREFIX: | |
198 case URL_SUFFIX: | |
199 case URL_CONTAINS: | |
200 case URL_EQUALS: | |
201 return true; | |
202 default: | |
203 break; | |
204 } | |
205 return false; | |
206 } | |
207 | |
208 bool URLMatcherCondition::IsRegexCondition() const { | |
209 return IsRegexCriterion(criterion_); | |
210 } | |
211 | |
212 bool URLMatcherCondition::IsMatch( | |
213 const std::set<StringPattern::ID>& matching_patterns, | |
214 const GURL& url) const { | |
215 DCHECK(string_pattern_); | |
216 if (!ContainsKey(matching_patterns, string_pattern_->id())) | |
217 return false; | |
218 // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on | |
219 // a substring match on the raw URL. In case of a match, we need to verify | |
220 // that the match was found in the correct component of the URL. | |
221 switch (criterion_) { | |
222 case HOST_CONTAINS: | |
223 return url.host().find(string_pattern_->pattern()) != | |
224 std::string::npos; | |
225 case PATH_CONTAINS: | |
226 return url.path().find(string_pattern_->pattern()) != | |
227 std::string::npos; | |
228 case QUERY_CONTAINS: | |
229 return url.query().find(string_pattern_->pattern()) != | |
230 std::string::npos; | |
231 default: | |
232 break; | |
233 } | |
234 return true; | |
235 } | |
236 | |
237 // | |
238 // URLMatcherConditionFactory | |
239 // | |
240 | |
241 namespace { | |
242 // These are symbols that are not contained in 7-bit ASCII used in GURLs. | |
243 const char kBeginningOfURL[] = {static_cast<char>(-1), 0}; | |
244 const char kEndOfDomain[] = {static_cast<char>(-2), 0}; | |
245 const char kEndOfPath[] = {static_cast<char>(-3), 0}; | |
246 const char kEndOfURL[] = {static_cast<char>(-4), 0}; | |
247 } // namespace | |
248 | |
249 URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {} | |
250 | |
251 URLMatcherConditionFactory::~URLMatcherConditionFactory() { | |
252 STLDeleteElements(&substring_pattern_singletons_); | |
253 STLDeleteElements(®ex_pattern_singletons_); | |
254 } | |
255 | |
256 std::string URLMatcherConditionFactory::CanonicalizeURLForComponentSearches( | |
257 const GURL& url) const { | |
258 return kBeginningOfURL + CanonicalizeHostname(url.host()) + kEndOfDomain + | |
259 url.path() + kEndOfPath + (url.has_query() ? "?" + url.query() : "") + | |
260 kEndOfURL; | |
261 } | |
262 | |
263 URLMatcherCondition URLMatcherConditionFactory::CreateHostPrefixCondition( | |
264 const std::string& prefix) { | |
265 return CreateCondition(URLMatcherCondition::HOST_PREFIX, | |
266 kBeginningOfURL + CanonicalizeHostname(prefix)); | |
267 } | |
268 | |
269 URLMatcherCondition URLMatcherConditionFactory::CreateHostSuffixCondition( | |
270 const std::string& suffix) { | |
271 return CreateCondition(URLMatcherCondition::HOST_SUFFIX, | |
272 suffix + kEndOfDomain); | |
273 } | |
274 | |
275 URLMatcherCondition URLMatcherConditionFactory::CreateHostContainsCondition( | |
276 const std::string& str) { | |
277 return CreateCondition(URLMatcherCondition::HOST_CONTAINS, str); | |
278 } | |
279 | |
280 URLMatcherCondition URLMatcherConditionFactory::CreateHostEqualsCondition( | |
281 const std::string& str) { | |
282 return CreateCondition(URLMatcherCondition::HOST_EQUALS, | |
283 kBeginningOfURL + CanonicalizeHostname(str) + kEndOfDomain); | |
284 } | |
285 | |
286 URLMatcherCondition URLMatcherConditionFactory::CreatePathPrefixCondition( | |
287 const std::string& prefix) { | |
288 return CreateCondition(URLMatcherCondition::PATH_PREFIX, | |
289 kEndOfDomain + prefix); | |
290 } | |
291 | |
292 URLMatcherCondition URLMatcherConditionFactory::CreatePathSuffixCondition( | |
293 const std::string& suffix) { | |
294 return CreateCondition(URLMatcherCondition::PATH_SUFFIX, suffix + kEndOfPath); | |
295 } | |
296 | |
297 URLMatcherCondition URLMatcherConditionFactory::CreatePathContainsCondition( | |
298 const std::string& str) { | |
299 return CreateCondition(URLMatcherCondition::PATH_CONTAINS, str); | |
300 } | |
301 | |
302 URLMatcherCondition URLMatcherConditionFactory::CreatePathEqualsCondition( | |
303 const std::string& str) { | |
304 return CreateCondition(URLMatcherCondition::PATH_EQUALS, | |
305 kEndOfDomain + str + kEndOfPath); | |
306 } | |
307 | |
308 URLMatcherCondition URLMatcherConditionFactory::CreateQueryPrefixCondition( | |
309 const std::string& prefix) { | |
310 std::string pattern; | |
311 if (!prefix.empty() && prefix[0] == '?') | |
312 pattern = kEndOfPath + prefix; | |
313 else | |
314 pattern = kEndOfPath + ('?' + prefix); | |
315 | |
316 return CreateCondition(URLMatcherCondition::QUERY_PREFIX, pattern); | |
317 } | |
318 | |
319 URLMatcherCondition URLMatcherConditionFactory::CreateQuerySuffixCondition( | |
320 const std::string& suffix) { | |
321 if (!suffix.empty() && suffix[0] == '?') { | |
322 return CreateQueryEqualsCondition(suffix); | |
323 } else { | |
324 return CreateCondition(URLMatcherCondition::QUERY_SUFFIX, | |
325 suffix + kEndOfURL); | |
326 } | |
327 } | |
328 | |
329 URLMatcherCondition URLMatcherConditionFactory::CreateQueryContainsCondition( | |
330 const std::string& str) { | |
331 if (!str.empty() && str[0] == '?') | |
332 return CreateQueryPrefixCondition(str); | |
333 else | |
334 return CreateCondition(URLMatcherCondition::QUERY_CONTAINS, str); | |
335 } | |
336 | |
337 URLMatcherCondition URLMatcherConditionFactory::CreateQueryEqualsCondition( | |
338 const std::string& str) { | |
339 std::string pattern; | |
340 if (!str.empty() && str[0] == '?') | |
341 pattern = kEndOfPath + str + kEndOfURL; | |
342 else | |
343 pattern = kEndOfPath + ('?' + str) + kEndOfURL; | |
344 | |
345 return CreateCondition(URLMatcherCondition::QUERY_EQUALS, pattern); | |
346 } | |
347 | |
348 URLMatcherCondition | |
349 URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition( | |
350 const std::string& host_suffix, | |
351 const std::string& path_prefix) { | |
352 return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX, | |
353 host_suffix + kEndOfDomain + path_prefix); | |
354 } | |
355 | |
356 URLMatcherCondition | |
357 URLMatcherConditionFactory::CreateHostEqualsPathPrefixCondition( | |
358 const std::string& host, | |
359 const std::string& path_prefix) { | |
360 return CreateCondition(URLMatcherCondition::HOST_EQUALS_PATH_PREFIX, | |
361 kBeginningOfURL + CanonicalizeHostname(host) + kEndOfDomain + | |
362 path_prefix); | |
363 } | |
364 | |
365 std::string URLMatcherConditionFactory::CanonicalizeURLForFullSearches( | |
366 const GURL& url) const { | |
367 GURL::Replacements replacements; | |
368 replacements.ClearPassword(); | |
369 replacements.ClearUsername(); | |
370 replacements.ClearRef(); | |
371 // Clear port if it is implicit from scheme. | |
372 if (url.has_port()) { | |
373 const std::string& port = url.scheme(); | |
374 if (url_canon::DefaultPortForScheme(port.c_str(), port.size()) == | |
375 url.EffectiveIntPort()) { | |
376 replacements.ClearPort(); | |
377 } | |
378 } | |
379 return kBeginningOfURL + url.ReplaceComponents(replacements).spec() + | |
380 kEndOfURL; | |
381 } | |
382 | |
383 std::string URLMatcherConditionFactory::CanonicalizeURLForRegexSearches( | |
384 const GURL& url) const { | |
385 GURL::Replacements replacements; | |
386 replacements.ClearPassword(); | |
387 replacements.ClearUsername(); | |
388 replacements.ClearRef(); | |
389 // Clear port if it is implicit from scheme. | |
390 if (url.has_port()) { | |
391 const std::string& port = url.scheme(); | |
392 if (url_canon::DefaultPortForScheme(port.c_str(), port.size()) == | |
393 url.EffectiveIntPort()) { | |
394 replacements.ClearPort(); | |
395 } | |
396 } | |
397 return url.ReplaceComponents(replacements).spec(); | |
398 } | |
399 | |
400 URLMatcherCondition URLMatcherConditionFactory::CreateURLPrefixCondition( | |
401 const std::string& prefix) { | |
402 return CreateCondition(URLMatcherCondition::URL_PREFIX, | |
403 kBeginningOfURL + prefix); | |
404 } | |
405 | |
406 URLMatcherCondition URLMatcherConditionFactory::CreateURLSuffixCondition( | |
407 const std::string& suffix) { | |
408 return CreateCondition(URLMatcherCondition::URL_SUFFIX, suffix + kEndOfURL); | |
409 } | |
410 | |
411 URLMatcherCondition URLMatcherConditionFactory::CreateURLContainsCondition( | |
412 const std::string& str) { | |
413 return CreateCondition(URLMatcherCondition::URL_CONTAINS, str); | |
414 } | |
415 | |
416 URLMatcherCondition URLMatcherConditionFactory::CreateURLEqualsCondition( | |
417 const std::string& str) { | |
418 return CreateCondition(URLMatcherCondition::URL_EQUALS, | |
419 kBeginningOfURL + str + kEndOfURL); | |
420 } | |
421 | |
422 URLMatcherCondition URLMatcherConditionFactory::CreateURLMatchesCondition( | |
423 const std::string& regex) { | |
424 return CreateCondition(URLMatcherCondition::URL_MATCHES, regex); | |
425 } | |
426 | |
427 void URLMatcherConditionFactory::ForgetUnusedPatterns( | |
428 const std::set<StringPattern::ID>& used_patterns) { | |
429 PatternSingletons::iterator i = substring_pattern_singletons_.begin(); | |
430 while (i != substring_pattern_singletons_.end()) { | |
431 if (used_patterns.find((*i)->id()) != used_patterns.end()) { | |
432 ++i; | |
433 } else { | |
434 delete *i; | |
435 substring_pattern_singletons_.erase(i++); | |
436 } | |
437 } | |
438 i = regex_pattern_singletons_.begin(); | |
439 while (i != regex_pattern_singletons_.end()) { | |
440 if (used_patterns.find((*i)->id()) != used_patterns.end()) { | |
441 ++i; | |
442 } else { | |
443 delete *i; | |
444 regex_pattern_singletons_.erase(i++); | |
445 } | |
446 } | |
447 } | |
448 | |
449 bool URLMatcherConditionFactory::IsEmpty() const { | |
450 return substring_pattern_singletons_.empty() && | |
451 regex_pattern_singletons_.empty(); | |
452 } | |
453 | |
454 URLMatcherCondition URLMatcherConditionFactory::CreateCondition( | |
455 URLMatcherCondition::Criterion criterion, | |
456 const std::string& pattern) { | |
457 StringPattern search_pattern(pattern, 0); | |
458 PatternSingletons* pattern_singletons = | |
459 IsRegexCriterion(criterion) ? ®ex_pattern_singletons_ | |
460 : &substring_pattern_singletons_; | |
461 | |
462 PatternSingletons::const_iterator iter = | |
463 pattern_singletons->find(&search_pattern); | |
464 | |
465 if (iter != pattern_singletons->end()) { | |
466 return URLMatcherCondition(criterion, *iter); | |
467 } else { | |
468 StringPattern* new_pattern = | |
469 new StringPattern(pattern, id_counter_++); | |
470 pattern_singletons->insert(new_pattern); | |
471 return URLMatcherCondition(criterion, new_pattern); | |
472 } | |
473 } | |
474 | |
475 std::string URLMatcherConditionFactory::CanonicalizeHostname( | |
476 const std::string& hostname) const { | |
477 if (!hostname.empty() && hostname[0] == '.') | |
478 return hostname; | |
479 else | |
480 return "." + hostname; | |
481 } | |
482 | |
483 bool URLMatcherConditionFactory::StringPatternPointerCompare::operator()( | |
484 StringPattern* lhs, | |
485 StringPattern* rhs) const { | |
486 if (lhs == NULL && rhs != NULL) return true; | |
487 if (lhs != NULL && rhs != NULL) | |
488 return lhs->pattern() < rhs->pattern(); | |
489 // Either both are NULL or only rhs is NULL. | |
490 return false; | |
491 } | |
492 | |
493 // | |
494 // URLMatcherSchemeFilter | |
495 // | |
496 | |
497 URLMatcherSchemeFilter::URLMatcherSchemeFilter(const std::string& filter) | |
498 : filters_(1) { | |
499 filters_.push_back(filter); | |
500 } | |
501 | |
502 URLMatcherSchemeFilter::URLMatcherSchemeFilter( | |
503 const std::vector<std::string>& filters) | |
504 : filters_(filters) {} | |
505 | |
506 URLMatcherSchemeFilter::~URLMatcherSchemeFilter() {} | |
507 | |
508 bool URLMatcherSchemeFilter::IsMatch(const GURL& url) const { | |
509 return std::find(filters_.begin(), filters_.end(), url.scheme()) != | |
510 filters_.end(); | |
511 } | |
512 | |
513 // | |
514 // URLMatcherPortFilter | |
515 // | |
516 | |
517 URLMatcherPortFilter::URLMatcherPortFilter( | |
518 const std::vector<URLMatcherPortFilter::Range>& ranges) | |
519 : ranges_(ranges) {} | |
520 | |
521 URLMatcherPortFilter::~URLMatcherPortFilter() {} | |
522 | |
523 bool URLMatcherPortFilter::IsMatch(const GURL& url) const { | |
524 int port = url.EffectiveIntPort(); | |
525 for (std::vector<Range>::const_iterator i = ranges_.begin(); | |
526 i != ranges_.end(); ++i) { | |
527 if (i->first <= port && port <= i->second) | |
528 return true; | |
529 } | |
530 return false; | |
531 } | |
532 | |
533 // static | |
534 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int from, | |
535 int to) { | |
536 return Range(from, to); | |
537 } | |
538 | |
539 // static | |
540 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int port) { | |
541 return Range(port, port); | |
542 } | |
543 | |
544 // | |
545 // URLMatcherConditionSet | |
546 // | |
547 | |
548 URLMatcherConditionSet::~URLMatcherConditionSet() {} | |
549 | |
550 URLMatcherConditionSet::URLMatcherConditionSet( | |
551 ID id, | |
552 const Conditions& conditions) | |
553 : id_(id), | |
554 conditions_(conditions) {} | |
555 | |
556 URLMatcherConditionSet::URLMatcherConditionSet( | |
557 ID id, | |
558 const Conditions& conditions, | |
559 scoped_ptr<URLMatcherSchemeFilter> scheme_filter, | |
560 scoped_ptr<URLMatcherPortFilter> port_filter) | |
561 : id_(id), | |
562 conditions_(conditions), | |
563 scheme_filter_(scheme_filter.Pass()), | |
564 port_filter_(port_filter.Pass()) {} | |
565 | |
566 bool URLMatcherConditionSet::IsMatch( | |
567 const std::set<StringPattern::ID>& matching_patterns, | |
568 const GURL& url) const { | |
569 for (Conditions::const_iterator i = conditions_.begin(); | |
570 i != conditions_.end(); ++i) { | |
571 if (!i->IsMatch(matching_patterns, url)) | |
572 return false; | |
573 } | |
574 if (scheme_filter_.get() && !scheme_filter_->IsMatch(url)) | |
575 return false; | |
576 if (port_filter_.get() && !port_filter_->IsMatch(url)) | |
577 return false; | |
578 return true; | |
579 } | |
580 | |
581 // | |
582 // URLMatcher | |
583 // | |
584 | |
585 URLMatcher::URLMatcher() {} | |
586 | |
587 URLMatcher::~URLMatcher() {} | |
588 | |
589 void URLMatcher::AddConditionSets( | |
590 const URLMatcherConditionSet::Vector& condition_sets) { | |
591 for (URLMatcherConditionSet::Vector::const_iterator i = | |
592 condition_sets.begin(); i != condition_sets.end(); ++i) { | |
593 DCHECK(url_matcher_condition_sets_.find((*i)->id()) == | |
594 url_matcher_condition_sets_.end()); | |
595 url_matcher_condition_sets_[(*i)->id()] = *i; | |
596 } | |
597 UpdateInternalDatastructures(); | |
598 } | |
599 | |
600 void URLMatcher::RemoveConditionSets( | |
601 const std::vector<URLMatcherConditionSet::ID>& condition_set_ids) { | |
602 for (std::vector<URLMatcherConditionSet::ID>::const_iterator i = | |
603 condition_set_ids.begin(); i != condition_set_ids.end(); ++i) { | |
604 DCHECK(url_matcher_condition_sets_.find(*i) != | |
605 url_matcher_condition_sets_.end()); | |
606 url_matcher_condition_sets_.erase(*i); | |
607 } | |
608 UpdateInternalDatastructures(); | |
609 } | |
610 | |
611 void URLMatcher::ClearUnusedConditionSets() { | |
612 UpdateConditionFactory(); | |
613 } | |
614 | |
615 std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL( | |
616 const GURL& url) const { | |
617 // Find all IDs of StringPatterns that match |url|. | |
618 // See URLMatcherConditionFactory for the canonicalization of URLs and the | |
619 // distinction between full url searches and url component searches. | |
620 std::set<StringPattern::ID> matches; | |
621 full_url_matcher_.Match( | |
622 condition_factory_.CanonicalizeURLForFullSearches(url), &matches); | |
623 url_component_matcher_.Match( | |
624 condition_factory_.CanonicalizeURLForComponentSearches(url), &matches); | |
625 regex_set_matcher_.Match( | |
626 condition_factory_.CanonicalizeURLForRegexSearches(url), &matches); | |
627 | |
628 // Calculate all URLMatcherConditionSets for which all URLMatcherConditions | |
629 // were fulfilled. | |
630 std::set<URLMatcherConditionSet::ID> result; | |
631 for (std::set<StringPattern::ID>::const_iterator i = matches.begin(); | |
632 i != matches.end(); ++i) { | |
633 // For each URLMatcherConditionSet there is exactly one condition | |
634 // registered in substring_match_triggers_. This means that the following | |
635 // logic tests each URLMatcherConditionSet exactly once if it can be | |
636 // completely fulfilled. | |
637 StringPatternTriggers::const_iterator triggered_condition_sets_iter = | |
638 substring_match_triggers_.find(*i); | |
639 if (triggered_condition_sets_iter == substring_match_triggers_.end()) | |
640 continue; // Not all substring matches are triggers for a condition set. | |
641 const std::set<URLMatcherConditionSet::ID>& condition_sets = | |
642 triggered_condition_sets_iter->second; | |
643 for (std::set<URLMatcherConditionSet::ID>::const_iterator j = | |
644 condition_sets.begin(); j != condition_sets.end(); ++j) { | |
645 URLMatcherConditionSets::const_iterator condition_set_iter = | |
646 url_matcher_condition_sets_.find(*j); | |
647 DCHECK(condition_set_iter != url_matcher_condition_sets_.end()); | |
648 if (condition_set_iter->second->IsMatch(matches, url)) | |
649 result.insert(*j); | |
650 } | |
651 } | |
652 | |
653 return result; | |
654 } | |
655 | |
656 bool URLMatcher::IsEmpty() const { | |
657 return condition_factory_.IsEmpty() && | |
658 url_matcher_condition_sets_.empty() && | |
659 substring_match_triggers_.empty() && | |
660 full_url_matcher_.IsEmpty() && | |
661 url_component_matcher_.IsEmpty() && | |
662 registered_full_url_patterns_.empty() && | |
663 registered_url_component_patterns_.empty(); | |
664 } | |
665 | |
666 void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) { | |
667 // The purpose of |full_url_conditions| is just that we need to execute | |
668 // the same logic once for Full URL searches and once for URL Component | |
669 // searches (see URLMatcherConditionFactory). | |
670 | |
671 // Determine which patterns need to be registered when this function | |
672 // terminates. | |
673 std::set<const StringPattern*> new_patterns; | |
674 for (URLMatcherConditionSets::const_iterator condition_set_iter = | |
675 url_matcher_condition_sets_.begin(); | |
676 condition_set_iter != url_matcher_condition_sets_.end(); | |
677 ++condition_set_iter) { | |
678 const URLMatcherConditionSet::Conditions& conditions = | |
679 condition_set_iter->second->conditions(); | |
680 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = | |
681 conditions.begin(); condition_iter != conditions.end(); | |
682 ++condition_iter) { | |
683 // If we are called to process Full URL searches, ignore others, and | |
684 // vice versa. (Regex conditions are updated in UpdateRegexSetMatcher.) | |
685 if (!condition_iter->IsRegexCondition() && | |
686 full_url_conditions == condition_iter->IsFullURLCondition()) | |
687 new_patterns.insert(condition_iter->string_pattern()); | |
688 } | |
689 } | |
690 | |
691 // This is the set of patterns that were registered before this function | |
692 // is called. | |
693 std::set<const StringPattern*>& registered_patterns = | |
694 full_url_conditions ? registered_full_url_patterns_ | |
695 : registered_url_component_patterns_; | |
696 | |
697 // Add all patterns that are in new_patterns but not in registered_patterns. | |
698 std::vector<const StringPattern*> patterns_to_register; | |
699 std::set_difference( | |
700 new_patterns.begin(), new_patterns.end(), | |
701 registered_patterns.begin(), registered_patterns.end(), | |
702 std::back_inserter(patterns_to_register)); | |
703 | |
704 // Remove all patterns that are in registered_patterns but not in | |
705 // new_patterns. | |
706 std::vector<const StringPattern*> patterns_to_unregister; | |
707 std::set_difference( | |
708 registered_patterns.begin(), registered_patterns.end(), | |
709 new_patterns.begin(), new_patterns.end(), | |
710 std::back_inserter(patterns_to_unregister)); | |
711 | |
712 // Update the SubstringSetMatcher. | |
713 SubstringSetMatcher& url_matcher = | |
714 full_url_conditions ? full_url_matcher_ : url_component_matcher_; | |
715 url_matcher.RegisterAndUnregisterPatterns(patterns_to_register, | |
716 patterns_to_unregister); | |
717 | |
718 // Update the set of registered_patterns for the next time this function | |
719 // is being called. | |
720 registered_patterns.swap(new_patterns); | |
721 } | |
722 | |
723 void URLMatcher::UpdateRegexSetMatcher() { | |
724 std::vector<const StringPattern*> new_patterns; | |
725 | |
726 for (URLMatcherConditionSets::const_iterator condition_set_iter = | |
727 url_matcher_condition_sets_.begin(); | |
728 condition_set_iter != url_matcher_condition_sets_.end(); | |
729 ++condition_set_iter) { | |
730 const URLMatcherConditionSet::Conditions& conditions = | |
731 condition_set_iter->second->conditions(); | |
732 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = | |
733 conditions.begin(); condition_iter != conditions.end(); | |
734 ++condition_iter) { | |
735 if (condition_iter->IsRegexCondition()) | |
736 new_patterns.push_back(condition_iter->string_pattern()); | |
737 } | |
738 } | |
739 | |
740 // Start over from scratch. We can't really do better than this, since the | |
741 // FilteredRE2 backend doesn't support incremental updates. | |
742 regex_set_matcher_.ClearPatterns(); | |
743 regex_set_matcher_.AddPatterns(new_patterns); | |
744 } | |
745 | |
746 void URLMatcher::UpdateTriggers() { | |
747 // Count substring pattern frequencies. | |
748 std::map<StringPattern::ID, size_t> substring_pattern_frequencies; | |
749 for (URLMatcherConditionSets::const_iterator condition_set_iter = | |
750 url_matcher_condition_sets_.begin(); | |
751 condition_set_iter != url_matcher_condition_sets_.end(); | |
752 ++condition_set_iter) { | |
753 const URLMatcherConditionSet::Conditions& conditions = | |
754 condition_set_iter->second->conditions(); | |
755 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = | |
756 conditions.begin(); condition_iter != conditions.end(); | |
757 ++condition_iter) { | |
758 const StringPattern* pattern = condition_iter->string_pattern(); | |
759 substring_pattern_frequencies[pattern->id()]++; | |
760 } | |
761 } | |
762 | |
763 // Update trigger conditions: Determine for each URLMatcherConditionSet which | |
764 // URLMatcherCondition contains a StringPattern that occurs least | |
765 // frequently in this URLMatcher. We assume that this condition is very | |
766 // specific and occurs rarely in URLs. If a match occurs for this | |
767 // URLMatcherCondition, we want to test all other URLMatcherCondition in the | |
768 // respective URLMatcherConditionSet as well to see whether the entire | |
769 // URLMatcherConditionSet is considered matching. | |
770 substring_match_triggers_.clear(); | |
771 for (URLMatcherConditionSets::const_iterator condition_set_iter = | |
772 url_matcher_condition_sets_.begin(); | |
773 condition_set_iter != url_matcher_condition_sets_.end(); | |
774 ++condition_set_iter) { | |
775 const URLMatcherConditionSet::Conditions& conditions = | |
776 condition_set_iter->second->conditions(); | |
777 if (conditions.empty()) | |
778 continue; | |
779 URLMatcherConditionSet::Conditions::const_iterator condition_iter = | |
780 conditions.begin(); | |
781 StringPattern::ID trigger = condition_iter->string_pattern()->id(); | |
782 // We skip the first element in the following loop. | |
783 ++condition_iter; | |
784 for (; condition_iter != conditions.end(); ++condition_iter) { | |
785 StringPattern::ID current_id = | |
786 condition_iter->string_pattern()->id(); | |
787 if (substring_pattern_frequencies[trigger] > | |
788 substring_pattern_frequencies[current_id]) { | |
789 trigger = current_id; | |
790 } | |
791 } | |
792 substring_match_triggers_[trigger].insert(condition_set_iter->second->id()); | |
793 } | |
794 } | |
795 | |
796 void URLMatcher::UpdateConditionFactory() { | |
797 std::set<StringPattern::ID> used_patterns; | |
798 for (URLMatcherConditionSets::const_iterator condition_set_iter = | |
799 url_matcher_condition_sets_.begin(); | |
800 condition_set_iter != url_matcher_condition_sets_.end(); | |
801 ++condition_set_iter) { | |
802 const URLMatcherConditionSet::Conditions& conditions = | |
803 condition_set_iter->second->conditions(); | |
804 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = | |
805 conditions.begin(); condition_iter != conditions.end(); | |
806 ++condition_iter) { | |
807 used_patterns.insert(condition_iter->string_pattern()->id()); | |
808 } | |
809 } | |
810 condition_factory_.ForgetUnusedPatterns(used_patterns); | |
811 } | |
812 | |
813 void URLMatcher::UpdateInternalDatastructures() { | |
814 UpdateSubstringSetMatcher(false); | |
815 UpdateSubstringSetMatcher(true); | |
816 UpdateRegexSetMatcher(); | |
817 UpdateTriggers(); | |
818 UpdateConditionFactory(); | |
819 } | |
820 | |
821 } // namespace extensions | |
OLD | NEW |