| Index: chrome/common/extensions/matcher/url_matcher.cc
|
| diff --git a/chrome/common/extensions/matcher/url_matcher.cc b/chrome/common/extensions/matcher/url_matcher.cc
|
| index 0eb5ab507351d641e4655d34802bbe4bf4224f77..ac639d996f5423a831dce99b1797319f65f67f21 100644
|
| --- a/chrome/common/extensions/matcher/url_matcher.cc
|
| +++ b/chrome/common/extensions/matcher/url_matcher.cc
|
| @@ -15,29 +15,37 @@
|
| namespace extensions {
|
|
|
| // This set of classes implement a mapping of URL Component Patterns, such as
|
| -// host_prefix, host_suffix, host_equals, ..., etc., to SubstringPatterns.
|
| +// host_prefix, host_suffix, host_equals, ..., etc., to StringPatterns
|
| +// for use in substring comparisons.
|
| //
|
| // The idea of this mapping is to reduce the problem of comparing many
|
| // URL Component Patterns against one URL to the problem of searching many
|
| // substrings in one string:
|
| //
|
| -// ---------------------- --------------------
|
| -// | URL Query operator | ----translate----> | SubstringPattern |
|
| -// ---------------------- --------------------
|
| -// ^
|
| -// |
|
| -// compare
|
| -// |
|
| -// v
|
| -// ---------------------- --------------------
|
| -// | URL to compare | | |
|
| -// | to all URL Query | ----translate----> | String |
|
| -// | operators | | |
|
| -// ---------------------- --------------------
|
| +// ---------------------- -----------------
|
| +// | URL Query operator | ----translate----> | StringPattern |
|
| +// ---------------------- -----------------
|
| +// ^
|
| +// |
|
| +// compare
|
| +// |
|
| +// v
|
| +// ---------------------- -----------------
|
| +// | URL to compare | | |
|
| +// | to all URL Query | ----translate----> | String |
|
| +// | operators | | |
|
| +// ---------------------- -----------------
|
| //
|
| // The reason for this problem reduction is that there are efficient algorithms
|
| // for searching many substrings in one string (see Aho-Corasick algorithm).
|
| //
|
| +// Additionally, some of the same pieces are reused to implement regular
|
| +// expression comparisons. The FilteredRE2 implementation for matching many
|
| +// regular expressions against one string uses prefiltering, in which a set
|
| +// of substrings (derived from the regexes) are first searched for, to reduce
|
| +// the number of regular expressions to test; the prefiltering step also
|
| +// uses Aho-Corasick.
|
| +//
|
| // Case 1: {host,path,query}_{prefix,suffix,equals} searches.
|
| // ==========================================================
|
| //
|
| @@ -84,7 +92,7 @@ namespace extensions {
|
| //
|
| // Similarly for path query parameters ({path, query}_{prefix, suffix, equals}).
|
| //
|
| -// With this, we can search the SubstringPatterns in the normalized URL.
|
| +// With this, we can search the StringPatterns in the normalized URL.
|
| //
|
| //
|
| // Case 2: url_{prefix,suffix,equals,contains} searches.
|
| @@ -124,7 +132,21 @@ namespace extensions {
|
| // followed by gurl.host().find("example.co");
|
| //
|
| // [similarly for path_contains and query_contains].
|
| +//
|
| +//
|
| +// Regular expression matching (url_matches searches)
|
| +// ==================================================
|
| +//
|
| +// This class also supports matching regular expressions (RE2 syntax)
|
| +// against full URLs, which are transformed as in case 2.
|
|
|
| +namespace {
|
| +
|
| +bool IsRegexCriterion(URLMatcherCondition::Criterion criterion) {
|
| + return criterion == URLMatcherCondition::URL_MATCHES;
|
| +}
|
| +
|
| +} // namespace
|
|
|
| //
|
| // URLMatcherCondition
|
| @@ -132,34 +154,34 @@ namespace extensions {
|
|
|
| URLMatcherCondition::URLMatcherCondition()
|
| : criterion_(HOST_PREFIX),
|
| - substring_pattern_(NULL) {}
|
| + string_pattern_(NULL) {}
|
|
|
| URLMatcherCondition::~URLMatcherCondition() {}
|
|
|
| URLMatcherCondition::URLMatcherCondition(
|
| Criterion criterion,
|
| - const SubstringPattern* substring_pattern)
|
| + const StringPattern* string_pattern)
|
| : criterion_(criterion),
|
| - substring_pattern_(substring_pattern) {}
|
| + string_pattern_(string_pattern) {}
|
|
|
| URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition& rhs)
|
| : criterion_(rhs.criterion_),
|
| - substring_pattern_(rhs.substring_pattern_) {}
|
| + string_pattern_(rhs.string_pattern_) {}
|
|
|
| URLMatcherCondition& URLMatcherCondition::operator=(
|
| const URLMatcherCondition& rhs) {
|
| criterion_ = rhs.criterion_;
|
| - substring_pattern_ = rhs.substring_pattern_;
|
| + string_pattern_ = rhs.string_pattern_;
|
| return *this;
|
| }
|
|
|
| bool URLMatcherCondition::operator<(const URLMatcherCondition& rhs) const {
|
| if (criterion_ < rhs.criterion_) return true;
|
| if (criterion_ > rhs.criterion_) return false;
|
| - if (substring_pattern_ != NULL && rhs.substring_pattern_ != NULL)
|
| - return *substring_pattern_ < *rhs.substring_pattern_;
|
| - if (substring_pattern_ == NULL && rhs.substring_pattern_ != NULL) return true;
|
| - // Either substring_pattern_ != NULL && rhs.substring_pattern_ == NULL,
|
| + if (string_pattern_ != NULL && rhs.string_pattern_ != NULL)
|
| + return *string_pattern_ < *rhs.string_pattern_;
|
| + if (string_pattern_ == NULL && rhs.string_pattern_ != NULL) return true;
|
| + // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL,
|
| // or both are NULL.
|
| return false;
|
| }
|
| @@ -183,25 +205,28 @@ bool URLMatcherCondition::IsFullURLCondition() const {
|
| return false;
|
| }
|
|
|
| +bool URLMatcherCondition::IsRegexCondition() const {
|
| + return IsRegexCriterion(criterion_);
|
| +}
|
| +
|
| bool URLMatcherCondition::IsMatch(
|
| - const std::set<SubstringPattern::ID>& matching_substring_patterns,
|
| + const std::set<StringPattern::ID>& matching_patterns,
|
| const GURL& url) const {
|
| - DCHECK(substring_pattern_);
|
| - if (matching_substring_patterns.find(substring_pattern_->id()) ==
|
| - matching_substring_patterns.end())
|
| + DCHECK(string_pattern_);
|
| + if (!ContainsKey(matching_patterns, string_pattern_->id()))
|
| return false;
|
| // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on
|
| // a substring match on the raw URL. In case of a match, we need to verify
|
| // that the match was found in the correct component of the URL.
|
| switch (criterion_) {
|
| case HOST_CONTAINS:
|
| - return url.host().find(substring_pattern_->pattern()) !=
|
| + return url.host().find(string_pattern_->pattern()) !=
|
| std::string::npos;
|
| case PATH_CONTAINS:
|
| - return url.path().find(substring_pattern_->pattern()) !=
|
| + return url.path().find(string_pattern_->pattern()) !=
|
| std::string::npos;
|
| case QUERY_CONTAINS:
|
| - return url.query().find(substring_pattern_->pattern()) !=
|
| + return url.query().find(string_pattern_->pattern()) !=
|
| std::string::npos;
|
| default:
|
| break;
|
| @@ -224,7 +249,8 @@ const char kEndOfURL[] = {static_cast<char>(-4), 0};
|
| URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {}
|
|
|
| URLMatcherConditionFactory::~URLMatcherConditionFactory() {
|
| - STLDeleteElements(&pattern_singletons_);
|
| + STLDeleteElements(&substring_pattern_singletons_);
|
| + STLDeleteElements(®ex_pattern_singletons_);
|
| }
|
|
|
| std::string URLMatcherConditionFactory::CanonicalizeURLForComponentSearches(
|
| @@ -336,6 +362,23 @@ std::string URLMatcherConditionFactory::CanonicalizeURLForFullSearches(
|
| kEndOfURL;
|
| }
|
|
|
| +std::string URLMatcherConditionFactory::CanonicalizeURLForRegexSearches(
|
| + const GURL& url) {
|
| + GURL::Replacements replacements;
|
| + replacements.ClearPassword();
|
| + replacements.ClearUsername();
|
| + replacements.ClearRef();
|
| + // Clear port if it is implicit from scheme.
|
| + if (url.has_port()) {
|
| + const std::string& port = url.scheme();
|
| + if (url_canon::DefaultPortForScheme(port.c_str(), port.size()) ==
|
| + url.EffectiveIntPort()) {
|
| + replacements.ClearPort();
|
| + }
|
| + }
|
| + return url.ReplaceComponents(replacements).spec();
|
| +}
|
| +
|
| URLMatcherCondition URLMatcherConditionFactory::CreateURLPrefixCondition(
|
| const std::string& prefix) {
|
| return CreateCondition(URLMatcherCondition::URL_PREFIX,
|
| @@ -358,35 +401,55 @@ URLMatcherCondition URLMatcherConditionFactory::CreateURLEqualsCondition(
|
| kBeginningOfURL + str + kEndOfURL);
|
| }
|
|
|
| +URLMatcherCondition URLMatcherConditionFactory::CreateURLMatchesCondition(
|
| + const std::string& regex) {
|
| + return CreateCondition(URLMatcherCondition::URL_MATCHES, regex);
|
| +}
|
| +
|
| void URLMatcherConditionFactory::ForgetUnusedPatterns(
|
| - const std::set<SubstringPattern::ID>& used_patterns) {
|
| - PatternSingletons::iterator i = pattern_singletons_.begin();
|
| - while (i != pattern_singletons_.end()) {
|
| + const std::set<StringPattern::ID>& used_patterns) {
|
| + PatternSingletons::iterator i = substring_pattern_singletons_.begin();
|
| + while (i != substring_pattern_singletons_.end()) {
|
| if (used_patterns.find((*i)->id()) != used_patterns.end()) {
|
| ++i;
|
| } else {
|
| delete *i;
|
| - pattern_singletons_.erase(i++);
|
| + substring_pattern_singletons_.erase(i++);
|
| + }
|
| + }
|
| + i = regex_pattern_singletons_.begin();
|
| + while (i != regex_pattern_singletons_.end()) {
|
| + if (used_patterns.find((*i)->id()) != used_patterns.end()) {
|
| + ++i;
|
| + } else {
|
| + delete *i;
|
| + regex_pattern_singletons_.erase(i++);
|
| }
|
| }
|
| }
|
|
|
| bool URLMatcherConditionFactory::IsEmpty() const {
|
| - return pattern_singletons_.empty();
|
| + return substring_pattern_singletons_.empty() &&
|
| + regex_pattern_singletons_.empty();
|
| }
|
|
|
| URLMatcherCondition URLMatcherConditionFactory::CreateCondition(
|
| URLMatcherCondition::Criterion criterion,
|
| const std::string& pattern) {
|
| - SubstringPattern search_pattern(pattern, 0);
|
| + StringPattern search_pattern(pattern, 0);
|
| + PatternSingletons* pattern_singletons =
|
| + IsRegexCriterion(criterion) ? ®ex_pattern_singletons_
|
| + : &substring_pattern_singletons_;
|
| +
|
| PatternSingletons::const_iterator iter =
|
| - pattern_singletons_.find(&search_pattern);
|
| - if (iter != pattern_singletons_.end()) {
|
| + pattern_singletons->find(&search_pattern);
|
| +
|
| + if (iter != pattern_singletons->end()) {
|
| return URLMatcherCondition(criterion, *iter);
|
| } else {
|
| - SubstringPattern* new_pattern =
|
| - new SubstringPattern(pattern, id_counter_++);
|
| - pattern_singletons_.insert(new_pattern);
|
| + StringPattern* new_pattern =
|
| + new StringPattern(pattern, id_counter_++);
|
| + pattern_singletons->insert(new_pattern);
|
| return URLMatcherCondition(criterion, new_pattern);
|
| }
|
| }
|
| @@ -399,9 +462,9 @@ std::string URLMatcherConditionFactory::CanonicalizeHostname(
|
| return "." + hostname;
|
| }
|
|
|
| -bool URLMatcherConditionFactory::SubstringPatternPointerCompare::operator()(
|
| - SubstringPattern* lhs,
|
| - SubstringPattern* rhs) const {
|
| +bool URLMatcherConditionFactory::StringPatternPointerCompare::operator()(
|
| + StringPattern* lhs,
|
| + StringPattern* rhs) const {
|
| if (lhs == NULL && rhs != NULL) return true;
|
| if (lhs != NULL && rhs != NULL)
|
| return lhs->pattern() < rhs->pattern();
|
| @@ -483,11 +546,11 @@ URLMatcherConditionSet::URLMatcherConditionSet(
|
| port_filter_(port_filter.Pass()) {}
|
|
|
| bool URLMatcherConditionSet::IsMatch(
|
| - const std::set<SubstringPattern::ID>& matching_substring_patterns,
|
| + const std::set<StringPattern::ID>& matching_patterns,
|
| const GURL& url) const {
|
| for (Conditions::const_iterator i = conditions_.begin();
|
| i != conditions_.end(); ++i) {
|
| - if (!i->IsMatch(matching_substring_patterns, url))
|
| + if (!i->IsMatch(matching_patterns, url))
|
| return false;
|
| }
|
| if (scheme_filter_.get() && !scheme_filter_->IsMatch(url))
|
| @@ -497,7 +560,6 @@ bool URLMatcherConditionSet::IsMatch(
|
| return true;
|
| }
|
|
|
| -
|
| //
|
| // URLMatcher
|
| //
|
| @@ -533,19 +595,21 @@ void URLMatcher::ClearUnusedConditionSets() {
|
| }
|
|
|
| std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL(const GURL& url) {
|
| - // Find all IDs of SubstringPatterns that match |url|.
|
| + // Find all IDs of StringPatterns that match |url|.
|
| // See URLMatcherConditionFactory for the canonicalization of URLs and the
|
| // distinction between full url searches and url component searches.
|
| - std::set<SubstringPattern::ID> matches;
|
| + std::set<StringPattern::ID> matches;
|
| full_url_matcher_.Match(
|
| condition_factory_.CanonicalizeURLForFullSearches(url), &matches);
|
| url_component_matcher_.Match(
|
| condition_factory_.CanonicalizeURLForComponentSearches(url), &matches);
|
| + regex_set_matcher_.Match(
|
| + condition_factory_.CanonicalizeURLForRegexSearches(url), &matches);
|
|
|
| // Calculate all URLMatcherConditionSets for which all URLMatcherConditions
|
| // were fulfilled.
|
| std::set<URLMatcherConditionSet::ID> result;
|
| - for (std::set<SubstringPattern::ID>::const_iterator i = matches.begin();
|
| + for (std::set<StringPattern::ID>::const_iterator i = matches.begin();
|
| i != matches.end(); ++i) {
|
| // For each URLMatcherConditionSet there is exactly one condition
|
| // registered in substring_match_triggers_. This means that the following
|
| @@ -580,7 +644,7 @@ void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
|
|
|
| // Determine which patterns need to be registered when this function
|
| // terminates.
|
| - std::set<const SubstringPattern*> new_patterns;
|
| + std::set<const StringPattern*> new_patterns;
|
| for (URLMatcherConditionSets::const_iterator condition_set_iter =
|
| url_matcher_condition_sets_.begin();
|
| condition_set_iter != url_matcher_condition_sets_.end();
|
| @@ -590,21 +654,22 @@ void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
|
| for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
|
| conditions.begin(); condition_iter != conditions.end();
|
| ++condition_iter) {
|
| - // If we are called to process Full URL searches, ignore all others,
|
| - // and vice versa.
|
| - if (full_url_conditions == condition_iter->IsFullURLCondition())
|
| - new_patterns.insert(condition_iter->substring_pattern());
|
| + // If we are called to process Full URL searches, ignore others, and
|
| + // vice versa. (Regex conditions are updated in UpdateRegexSetMatcher.)
|
| + if (!condition_iter->IsRegexCondition() &&
|
| + full_url_conditions == condition_iter->IsFullURLCondition())
|
| + new_patterns.insert(condition_iter->string_pattern());
|
| }
|
| }
|
|
|
| // This is the set of patterns that were registered before this function
|
| // is called.
|
| - std::set<const SubstringPattern*>& registered_patterns =
|
| + std::set<const StringPattern*>& registered_patterns =
|
| full_url_conditions ? registered_full_url_patterns_
|
| : registered_url_component_patterns_;
|
|
|
| // Add all patterns that are in new_patterns but not in registered_patterns.
|
| - std::vector<const SubstringPattern*> patterns_to_register;
|
| + std::vector<const StringPattern*> patterns_to_register;
|
| std::set_difference(
|
| new_patterns.begin(), new_patterns.end(),
|
| registered_patterns.begin(), registered_patterns.end(),
|
| @@ -612,7 +677,7 @@ void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
|
|
|
| // Remove all patterns that are in registered_patterns but not in
|
| // new_patterns.
|
| - std::vector<const SubstringPattern*> patterns_to_unregister;
|
| + std::vector<const StringPattern*> patterns_to_unregister;
|
| std::set_difference(
|
| registered_patterns.begin(), registered_patterns.end(),
|
| new_patterns.begin(), new_patterns.end(),
|
| @@ -629,9 +694,32 @@ void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
|
| registered_patterns.swap(new_patterns);
|
| }
|
|
|
| +void URLMatcher::UpdateRegexSetMatcher() {
|
| + std::vector<const StringPattern*> new_patterns;
|
| +
|
| + for (URLMatcherConditionSets::const_iterator condition_set_iter =
|
| + url_matcher_condition_sets_.begin();
|
| + condition_set_iter != url_matcher_condition_sets_.end();
|
| + ++condition_set_iter) {
|
| + const URLMatcherConditionSet::Conditions& conditions =
|
| + condition_set_iter->second->conditions();
|
| + for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
|
| + conditions.begin(); condition_iter != conditions.end();
|
| + ++condition_iter) {
|
| + if (condition_iter->IsRegexCondition())
|
| + new_patterns.push_back(condition_iter->string_pattern());
|
| + }
|
| + }
|
| +
|
| + // Start over from scratch. We can't really do better than this, since the
|
| + // FilteredRE2 backend doesn't support incremental updates.
|
| + regex_set_matcher_.ClearPatterns();
|
| + regex_set_matcher_.AddPatterns(new_patterns);
|
| +}
|
| +
|
| void URLMatcher::UpdateTriggers() {
|
| // Count substring pattern frequencies.
|
| - std::map<SubstringPattern::ID, size_t> substring_pattern_frequencies;
|
| + std::map<StringPattern::ID, size_t> substring_pattern_frequencies;
|
| for (URLMatcherConditionSets::const_iterator condition_set_iter =
|
| url_matcher_condition_sets_.begin();
|
| condition_set_iter != url_matcher_condition_sets_.end();
|
| @@ -641,13 +729,13 @@ void URLMatcher::UpdateTriggers() {
|
| for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
|
| conditions.begin(); condition_iter != conditions.end();
|
| ++condition_iter) {
|
| - const SubstringPattern* pattern = condition_iter->substring_pattern();
|
| + const StringPattern* pattern = condition_iter->string_pattern();
|
| substring_pattern_frequencies[pattern->id()]++;
|
| }
|
| }
|
|
|
| // Update trigger conditions: Determine for each URLMatcherConditionSet which
|
| - // URLMatcherCondition contains a SubstringPattern that occurs least
|
| + // URLMatcherCondition contains a StringPattern that occurs least
|
| // frequently in this URLMatcher. We assume that this condition is very
|
| // specific and occurs rarely in URLs. If a match occurs for this
|
| // URLMatcherCondition, we want to test all other URLMatcherCondition in the
|
| @@ -664,12 +752,12 @@ void URLMatcher::UpdateTriggers() {
|
| continue;
|
| URLMatcherConditionSet::Conditions::const_iterator condition_iter =
|
| conditions.begin();
|
| - SubstringPattern::ID trigger = condition_iter->substring_pattern()->id();
|
| + StringPattern::ID trigger = condition_iter->string_pattern()->id();
|
| // We skip the first element in the following loop.
|
| ++condition_iter;
|
| for (; condition_iter != conditions.end(); ++condition_iter) {
|
| - SubstringPattern::ID current_id =
|
| - condition_iter->substring_pattern()->id();
|
| + StringPattern::ID current_id =
|
| + condition_iter->string_pattern()->id();
|
| if (substring_pattern_frequencies[trigger] >
|
| substring_pattern_frequencies[current_id]) {
|
| trigger = current_id;
|
| @@ -680,7 +768,7 @@ void URLMatcher::UpdateTriggers() {
|
| }
|
|
|
| void URLMatcher::UpdateConditionFactory() {
|
| - std::set<SubstringPattern::ID> used_patterns;
|
| + std::set<StringPattern::ID> used_patterns;
|
| for (URLMatcherConditionSets::const_iterator condition_set_iter =
|
| url_matcher_condition_sets_.begin();
|
| condition_set_iter != url_matcher_condition_sets_.end();
|
| @@ -690,7 +778,7 @@ void URLMatcher::UpdateConditionFactory() {
|
| for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
|
| conditions.begin(); condition_iter != conditions.end();
|
| ++condition_iter) {
|
| - used_patterns.insert(condition_iter->substring_pattern()->id());
|
| + used_patterns.insert(condition_iter->string_pattern()->id());
|
| }
|
| }
|
| condition_factory_.ForgetUnusedPatterns(used_patterns);
|
| @@ -699,6 +787,7 @@ void URLMatcher::UpdateConditionFactory() {
|
| void URLMatcher::UpdateInternalDatastructures() {
|
| UpdateSubstringSetMatcher(false);
|
| UpdateSubstringSetMatcher(true);
|
| + UpdateRegexSetMatcher();
|
| UpdateTriggers();
|
| UpdateConditionFactory();
|
| }
|
|
|