Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(3298)

Unified Diff: chrome/browser/extensions/api/declarative/url_matcher.h

Issue 9390018: Implementation of a Matching strategy for URLs in the Declarative WebRequest API. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: MSVC does not support EXPECT_NE on iterators Created 8 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: chrome/browser/extensions/api/declarative/url_matcher.h
diff --git a/chrome/browser/extensions/api/declarative/url_matcher.h b/chrome/browser/extensions/api/declarative/url_matcher.h
new file mode 100644
index 0000000000000000000000000000000000000000..ee145a974e9a6c565720f78f708557609774fc4f
--- /dev/null
+++ b/chrome/browser/extensions/api/declarative/url_matcher.h
@@ -0,0 +1,262 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef CHROME_BROWSER_EXTENSIONS_API_DECLARATIVE_URL_MATCHER_H_
+#define CHROME_BROWSER_EXTENSIONS_API_DECLARATIVE_URL_MATCHER_H_
+#pragma once
+
+#include <set>
+#include <vector>
+
+#include "base/memory/scoped_ptr.h"
+#include "base/memory/scoped_vector.h"
+#include "chrome/browser/extensions/api/declarative/substring_set_matcher.h"
+
+class GURL;
+
+namespace base {
+class DictionaryValue;
+}  // namespace base
+
+namespace extensions {
+
+// This class represents a single URL matching condition, e.g. a match on the
+// host suffix or the containment of a string in the query component of a GURL.
+//
+// The difference from a simple SubstringPattern is that this also supports
+// checking whether the {Host, Path, Query} of a URL contains a string. The
+// reduction of URL matching conditions to SubstringPatterns conducted by
+// URLMatcherConditionFactory is not capable of expressing that alone.
+class URLMatcherCondition {
+ public:
+ enum Criterion {  // URL part under test and the kind of string comparison.
+ HOST_PREFIX,
+ HOST_SUFFIX,
+ HOST_CONTAINS,
+ HOST_EQUALS,
+ PATH_PREFIX,
+ PATH_SUFFIX,
+ PATH_CONTAINS,
+ PATH_EQUALS,
+ QUERY_PREFIX,
+ QUERY_SUFFIX,
+ QUERY_CONTAINS,
+ QUERY_EQUALS,
+ HOST_SUFFIX_PATH_PREFIX,  // Host suffix followed by a path prefix.
+ URL_PREFIX,
+ URL_SUFFIX,
+ URL_CONTAINS,
+ URL_EQUALS,
+ };
+
+ URLMatcherCondition();
+ ~URLMatcherCondition();
+ URLMatcherCondition(Criterion criterion,
+ const SubstringPattern* substring_pattern);
+ URLMatcherCondition(const URLMatcherCondition& rhs);
+ URLMatcherCondition& operator=(const URLMatcherCondition& rhs);
+ bool operator<(const URLMatcherCondition& rhs) const;  // For std::set keys.
+
+ Criterion criterion() const { return criterion_; }
+ const SubstringPattern* substring_pattern() const {
+ return substring_pattern_;
+ }
+
+ // Returns whether this URLMatcherCondition needs to be executed on a
+ // full URL rather than the individual components (see
+ // URLMatcherConditionFactory).
+ bool IsFullURLCondition() const;
+
+ // Returns whether |url| fulfills this condition, given the IDs of the
+ // SubstringPatterns that matched (|matching_substring_patterns|).
+ bool IsMatch(
+ const std::set<SubstringPattern::ID>& matching_substring_patterns,
+ const GURL& url) const;
+
+ private:
+ // |criterion_| and |substring_pattern_| together describe the property a
+ // URL needs to fulfill to be considered a match.
+ Criterion criterion_;
+
+ // Pattern used in a SubstringSetMatcher; owned by the creating factory.
+ const SubstringPattern* substring_pattern_;
+};
+
+// Class to map the problem of finding {host, path, query} {prefixes, suffixes,
+// containments, and equality} in GURLs to the substring matching problem.
+//
+// Say, you want to check whether the path of a URL starts with "/index.html".
+// This class preprocesses a URL like "www.google.com/index.html" into something
+// like "www.google.com|/index.html". After preprocessing, you can search for
+// "|/index.html" in the string and see that this candidate URL actually has
+// a path that starts with "/index.html". In contrast,
+// "www.google.com/images/index.html" would be normalized to
+// "www.google.com|/images/index.html". It is easy to see that it contains
+// "/index.html" but the path of the URL does not start with "/index.html".
+//
+// This preprocessing is important if you want to match a URL against many
+// patterns because it reduces the matching to a "discover all substrings
+// of a dictionary in a text" problem, which can be solved very efficiently
+// by the Aho-Corasick algorithm.
+//
+// IMPORTANT: The URLMatcherConditionFactory owns the SubstringPattern
+// referenced by created URLMatcherConditions. Therefore, it must outlive
+// all created URLMatcherConditions and the SubstringSetMatcher.
+class URLMatcherConditionFactory {
+ public:
+ URLMatcherConditionFactory();
+ ~URLMatcherConditionFactory();
+
+ // Canonicalizes a URL for "Create{Host,Path,Query}*Condition" searches.
+ std::string CanonicalizeURLForComponentSearches(const GURL& url);
+
+ // Factory methods, named after the URLMatcherCondition::Criterion values.
+ URLMatcherCondition CreateHostPrefixCondition(const std::string& prefix);
+ URLMatcherCondition CreateHostSuffixCondition(const std::string& suffix);
+ URLMatcherCondition CreateHostContainsCondition(const std::string& str);
+ URLMatcherCondition CreateHostEqualsCondition(const std::string& str);
+
+ URLMatcherCondition CreatePathPrefixCondition(const std::string& prefix);
+ URLMatcherCondition CreatePathSuffixCondition(const std::string& suffix);
+ URLMatcherCondition CreatePathContainsCondition(const std::string& str);
+ URLMatcherCondition CreatePathEqualsCondition(const std::string& str);
+
+ URLMatcherCondition CreateQueryPrefixCondition(const std::string& prefix);
+ URLMatcherCondition CreateQuerySuffixCondition(const std::string& suffix);
+ URLMatcherCondition CreateQueryContainsCondition(const std::string& str);
+ URLMatcherCondition CreateQueryEqualsCondition(const std::string& str);
+
+ // This covers the common case, where you don't care whether a domain
+ // "foobar.com" is expressed as "foobar.com" or "www.foobar.com", and it
+ // should be followed by a given |path_prefix|.
+ URLMatcherCondition CreateHostSuffixPathPrefixCondition(
+ const std::string& host_suffix,
+ const std::string& path_prefix);
+
+ // Canonicalizes a URL for "CreateURL*Condition" searches.
+ std::string CanonicalizeURLForFullSearches(const GURL& url);
+
+ URLMatcherCondition CreateURLPrefixCondition(const std::string& prefix);
+ URLMatcherCondition CreateURLSuffixCondition(const std::string& suffix);
+ URLMatcherCondition CreateURLContainsCondition(const std::string& str);
+ URLMatcherCondition CreateURLEqualsCondition(const std::string& str);
+
+ // Removes all patterns from |pattern_singletons_| that are not listed in
+ // |used_patterns|. These patterns are no longer referenced and get
+ // freed.
+ void ForgetUnusedPatterns(
+ const std::set<SubstringPattern::ID>& used_patterns);
+
+ private:
+ // Creates a URLMatcherCondition according to the parameters passed.
+ // The URLMatcherCondition will refer to a SubstringPattern that is
+ // owned by |pattern_singletons_|.
+ URLMatcherCondition CreateCondition(URLMatcherCondition::Criterion criterion,
+ const std::string& pattern);
+
+ // Prepends a "." to the hostname if it does not start with one.
+ std::string CanonicalizeHostname(const std::string& hostname) const;
+
+ // Counter that ensures that all created SubstringPatterns have unique IDs.
+ int id_counter_;
+
+ // Comparator for SubstringPattern pointers that considers only the
+ // pattern() value of the pointed-to SubstringPatterns.
+ struct SubstringPatternPointerCompare {
+ bool operator()(SubstringPattern* lhs, SubstringPattern* rhs) const;
+ };
+ // Set to ensure that we generate only one SubstringPattern for each content
+ // of SubstringPattern::pattern().
+ typedef std::set<SubstringPattern*, SubstringPatternPointerCompare>
+ PatternSingletons;
+ PatternSingletons pattern_singletons_;  // Owns the pointed-to patterns.
+
+ DISALLOW_COPY_AND_ASSIGN(URLMatcherConditionFactory);
+};
+
+// This class represents a set of conditions that all need to match on a
+// given URL in order to be considered a match.
+class URLMatcherConditionSet {
+ public:
+ typedef int ID;
+ typedef std::set<URLMatcherCondition> Conditions;  // Uses operator<.
+
+ URLMatcherConditionSet();
+ ~URLMatcherConditionSet();
+ URLMatcherConditionSet(ID id, const Conditions& conditions);
+ URLMatcherConditionSet(const URLMatcherConditionSet& rhs);
+ URLMatcherConditionSet& operator=(const URLMatcherConditionSet& rhs);
+
+ ID id() const { return id_; }
+ const Conditions& conditions() const { return conditions_; }
+
+ bool IsMatch(
+ const std::set<SubstringPattern::ID>& matching_substring_patterns,
+ const GURL& url) const;  // Expected: true iff all conditions match.
+
+ private:
+ ID id_;  // Must be unique among the sets added to one URLMatcher.
+ Conditions conditions_;
+};
+
+// This class allows matching one URL against a large set of
+// URLMatcherConditionSets at the same time.
+class URLMatcher {
+ public:
+ URLMatcher();
+ ~URLMatcher();
+
+ // Adds new URLMatcherConditionSets to this URLMatcher. Each condition set
+ // must have a unique ID.
+ // This is an expensive operation as it triggers pre-calculations on the
+ // currently registered condition sets. Do not call this operation many
+ // times with a single condition set in each call.
+ void AddConditionSets(
+ const std::vector<URLMatcherConditionSet>& condition_sets);
+
+ // Removes the listed condition sets. All |condition_set_ids| must be
+ // currently registered. This function should be called with large batches
+ // of |condition_set_ids| at a time to improve performance.
+ void RemoveConditionSets(
+ const std::vector<URLMatcherConditionSet::ID>& condition_set_ids);
+
+ // Returns the IDs of all URLMatcherConditionSets that match |url|.
+ std::set<URLMatcherConditionSet::ID> MatchURL(const GURL& url);
+
+ // Returns the URLMatcherConditionFactory that must be used to create
+ // URLMatcherConditionSets for this URLMatcher.
+ URLMatcherConditionFactory* condition_factory() {
+ return &condition_factory_;
+ }
+
+ private:
+ void UpdateSubstringSetMatcher(bool full_url_conditions);
+ void UpdateTriggers();  // Presumably rebuilds substring_match_triggers_.
+ void UpdateConditionFactory();
+ void UpdateInternalDatastructures();
+
+ URLMatcherConditionFactory condition_factory_;
+
+ // Maps each URLMatcherConditionSet::ID to its URLMatcherConditionSet.
+ // NOTE(review): std::map is used, but <map> is not included directly here.
+ typedef std::map<URLMatcherConditionSet::ID, URLMatcherConditionSet>
+ URLMatcherConditionSets;
+ URLMatcherConditionSets url_matcher_condition_sets_;
+
+ // Maps a SubstringPattern ID to the IDs of the URLMatcherConditionSets that
+ // need to be triggered in case of a SubstringPattern match.
+ std::map<SubstringPattern::ID, std::set<URLMatcherConditionSet::ID> >
+ substring_match_triggers_;
+
+ SubstringSetMatcher full_url_matcher_;  // Cf. IsFullURLCondition().
+ SubstringSetMatcher url_component_matcher_;
+ std::set<const SubstringPattern*> registered_full_url_patterns_;
+ std::set<const SubstringPattern*> registered_url_component_patterns_;
+
+ DISALLOW_COPY_AND_ASSIGN(URLMatcher);
+};
+
+} // namespace extensions
+
+#endif // CHROME_BROWSER_EXTENSIONS_API_DECLARATIVE_URL_MATCHER_H_

Powered by Google App Engine
This is Rietveld 408576698