Chromium Code Reviews| Index: chrome/browser/extensions/api/declarative/url_component_patterns.cc |
| diff --git a/chrome/browser/extensions/api/declarative/url_component_patterns.cc b/chrome/browser/extensions/api/declarative/url_component_patterns.cc |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..53cb7c4406dd932fa82d5be3335a43d36d9ba568 |
| --- /dev/null |
| +++ b/chrome/browser/extensions/api/declarative/url_component_patterns.cc |
| @@ -0,0 +1,266 @@ |
| +// Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| +// Use of this source code is governed by a BSD-style license that can be |
| +// found in the LICENSE file. |
| + |
| +#include "chrome/browser/extensions/api/declarative/url_component_patterns.h" |
| + |
| +#include "base/string_util.h" |
| +#include "googleurl/src/gurl.h" |
| + |
| +// This class implements a mapping of URL Component Patterns, such as |
| +// host_prefix, host_suffix, host_equals, ..., etc., to SubstringPatterns. |
| +// |
| +// The idea of this mapping is to reduce the problem of comparing many |
| +// URL Component Patterns against one URL to the problem of searching many |
| +// substrings in one string: |
| +// |
| +// ---------------------- -------------------- |
| +// | URL Query operator | ----translate----> | SubstringPattern | |
| +// ---------------------- -------------------- |
| +// ^ |
| +// | |
| +// compare |
| +// | |
| +// v |
| +// ---------------------- -------------------- |
| +// | URL to compare | | | |
| +// | to all URL Query | ----translate----> | String | |
| +// | operators | | | |
| +// ---------------------- -------------------- |
| +// |
| +// The reason for this problem reduction is that there are efficient algorithms |
| +// for searching many substrings in one string (see Aho-Corasick algorithm). |
| +// |
| +// Case 1: {host,path,query}_{prefix,suffix,equals} searches. |
| +// ========================================================== |
| +// |
| +// For searches in this class, we normalize URLs as follows: |
| +// |
| +// Step 1: |
| +// Remove scheme, port and segment from URL: |
| +// -> http://www.example.com:8080/index.html?search=foo#first_match becomes |
| +// www.example.com/index.html?search=foo |
| +// |
| +// We remove the scheme and port number because they can be checked later |
| +// in a secondary filter step. We remove the segment (the #... part) because |
| +// this is not guaranteed to be ASCII-7 encoded. |
|
Matt Perry
2012/02/14 01:38:34
guaranteed*
Matt Perry
2012/02/14 01:38:34
Forgive my ignorance, but are you sure that URLs a
battre
2012/02/14 19:32:21
Done.
battre
2012/02/14 19:32:21
According to GURL::spec():
// Returns the raw s
|
| +// |
| +// Step 2: |
| +// Translate URL to String and add the following position markers: |
| +// - BU = Beginning of URL |
| +// - ED = End of Domain |
| +// - EP = End of Path |
| +// - EU = End of URL |
| +// Furthermore, the hostname is canonicalized to start with a ".". |
| +// |
| +// Position markers are represented as characters >127, which are therefore |
| +// guaranteed not to be part of the ASCII-7 encoded URL character set. |
| +// |
| +// -> www.example.com/index.html?search=foo becomes |
| +// BU .www.example.com ED /index.html EP ?search=foo EU |
| +// |
| +// -> www.example.com/index.html becomes |
| +// BU .www.example.com ED /index.html EP EU |
| +// |
| +// Step 3: |
| +// Translate URL Component Patterns as follows: |
| +// |
| +// host_prefix(prefix) = BU add_missing_dot_prefix(prefix) |
| +// -> host_prefix("www.example") = BU .www.example |
| +// |
| +// host_suffix(suffix) = suffix ED |
| +// -> host_suffix("example.com") = example.com ED |
| +// -> host_suffix(".example.com") = .example.com ED |
| +// |
| +// host_equals(domain) = BU add_missing_dot_prefix(domain) ED |
| +// -> host_equals("www.example.com") = BU .www.example.com ED |
| +// |
| +// |
| +// path_prefix(prefix) = ED prefix |
| +// -> path_prefix("/index.html") = ED /index.html |
| +// |
| +// path_suffix(suffix) = suffix EP |
| +// -> path_suffix("index.html") = index.html EP |
| +// |
| +// path_equals(path) = ED path EP |
| +// -> path_equals("/index.html") = ED /index.html EP |
|
Matt Perry
2012/02/14 01:38:34
nit: good examples, but IMO less is more. 1 or 2 e
battre
2012/02/14 19:32:21
Done.
|
| +// |
| +// |
| +// [Similarly for query parameters (query_{prefix, suffix, equals})] |
| +// |
| +// With this, we can search the SubstringPatterns in the normalized URL. |
| +// |
| +// |
| +// Case 2: url_{prefix,suffix,equals,contains} searches. |
| +// ===================================================== |
| +// |
| +// Step 1: as above |
| +// |
| +// Step 2: |
| +// Translate URL to String and add the following position markers: |
| +// - BU = Beginning of URL |
| +// - EU = End of URL |
| +// Furthermore, the hostname is canonicalized to start with a ".". |
| +// |
| +// -> www.example.com/index.html?search=foo becomes |
| +// BU .www.example.com/index.html?search=foo EU |
| +// |
| +// url_prefix(prefix) = BU add_missing_dot_prefix(prefix) |
| +// -> url_prefix("www.example") = BU .www.example |
| +// |
| +// url_suffix(suffix) = suffix EU |
| +// -> url_suffix("index.html") = index.html EU |
| +// |
| +// url_contains(substring) = substring |
| +// -> url_contains("index") = index |
| +// |
| +// url_equals(url) = BU add_missing_dot_prefix(url) EU |
| +// -> url_equals("www.example.com/index.html") = |
| +// BU .www.example.com/index.html EU |
| +// |
| +// |
| +// Case 3: {host,path,query}_contains searches. |
| +// ============================================ |
| +// |
| +// These kinds of searches are not supported directly but can be derived |
| +// by a combination of a url_contains() query followed by an explicit test: |
| +// |
| +// host_contains(str) = url_contains(str) followed by test whether str occurs |
| +// in host component of original URL. |
| +// -> host_contains("example.co") = example.co |
| +// followed by gurl.host().find("example.co"); |
| +// |
| +// [similarly for path_contains and query_contains]. |
| + |
| +namespace { |
| +// These are symbols that are not contained in 7-bit ASCII used in GURLs. |
| +// Beginning-of-URL (BU) marker: the single byte 0x80 followed by a NUL |
| +// terminator. Declared const so the marker cannot be modified at runtime, and |
| +// spelled as a character literal so no out-of-range integer is narrowed into |
| +// a (possibly signed) char. |
| +const char BEGINNING_OF_URL[] = {'\x80', 0}; |
|
Matt Perry
2012/02/14 01:38:34
style: const char kBeginningOfURL[] = {128, 0};
an
battre
2012/02/14 19:32:21
Done.
|
| +// End-of-domain (ED), end-of-path (EP) and end-of-URL (EU) markers: the single |
| +// bytes 0x81, 0x82 and 0x83, each followed by a NUL terminator. Declared const |
| +// so the markers cannot be modified at runtime, and spelled as character |
| +// literals so no out-of-range integer is narrowed into a (possibly signed) |
| +// char. |
| +const char END_OF_DOMAIN[] = {'\x81', 0}; |
| +const char END_OF_PATH[] = {'\x82', 0}; |
| +const char END_OF_URL[] = {'\x83', 0}; |
| +} // namespace |
| + |
| +namespace extensions { |
| + |
| +// Creates an instance whose pattern id counter starts at 0. |
| +UrlComponentPatterns::UrlComponentPatterns() |
| +    : id_counter_(0) { |
| +} |
| + |
| +// Builds the string that {host,path,query}_* patterns are searched in: |
| +//   BU <host with leading "."> ED <path> EP [? <query>] EU |
| +// The "?" and the query are only emitted when the URL has a query. |
| +std::string UrlComponentPatterns::CanonlicalizeURLForComponentSearches( |
| +    const GURL& url) { |
| +  std::string canonical(BEGINNING_OF_URL); |
| +  canonical += CanonicalizeHostname(url.host()); |
| +  canonical += END_OF_DOMAIN; |
| +  canonical += url.path(); |
| +  canonical += END_OF_PATH; |
| +  if (url.has_query()) { |
| +    canonical += "?"; |
| +    canonical += url.query(); |
| +  } |
| +  canonical += END_OF_URL; |
| +  return canonical; |
| +} |
| + |
| +// Returns the singleton pattern for a host-prefix match: the BU marker |
| +// followed by the prefix, which CanonicalizeHostname gives a leading "." if |
| +// it does not already have one. |
| +SubstringPattern UrlComponentPatterns::CreateHostPrefixPattern( |
| +    const std::string& prefix) { |
| +  std::string needle(BEGINNING_OF_URL); |
| +  needle += CanonicalizeHostname(prefix); |
| +  return CreateSingletonPattern(needle); |
| +} |
| + |
| +// Returns the singleton pattern for a host-suffix match: the suffix, used |
| +// verbatim (no leading "." is added), followed by the ED marker. |
| +SubstringPattern UrlComponentPatterns::CreateHostSuffixPattern( |
| +    const std::string& suffix) { |
| +  std::string needle(suffix); |
| +  needle += END_OF_DOMAIN; |
| +  return CreateSingletonPattern(needle); |
| +} |
| + |
| +// Returns the singleton pattern for an exact host match: the BU marker, the |
| +// host with a leading "." ensured by CanonicalizeHostname, then the ED marker. |
| +SubstringPattern UrlComponentPatterns::CreateHostEqualsPattern( |
| +    const std::string& str) { |
| +  std::string needle(BEGINNING_OF_URL); |
| +  needle += CanonicalizeHostname(str); |
| +  needle += END_OF_DOMAIN; |
| +  return CreateSingletonPattern(needle); |
| +} |
| + |
| +// Returns the singleton pattern for a path-prefix match: the ED marker |
| +// followed by the prefix, used verbatim. |
| +SubstringPattern UrlComponentPatterns::CreatePathPrefixPattern( |
| +    const std::string& prefix) { |
| +  std::string needle(END_OF_DOMAIN); |
| +  needle += prefix; |
| +  return CreateSingletonPattern(needle); |
| +} |
| + |
| +// Returns the singleton pattern for a path-suffix match: the suffix, used |
| +// verbatim, followed by the EP marker. |
| +SubstringPattern UrlComponentPatterns::CreatePathSuffixPattern( |
| +    const std::string& suffix) { |
| +  std::string needle(suffix); |
| +  needle += END_OF_PATH; |
| +  return CreateSingletonPattern(needle); |
| +} |
| + |
| +// Returns the singleton pattern for an exact path match: the path, used |
| +// verbatim, enclosed between the ED and EP markers. |
| +SubstringPattern UrlComponentPatterns::CreatePathEqualsPattern( |
| +    const std::string& str) { |
| +  std::string needle(END_OF_DOMAIN); |
| +  needle += str; |
| +  needle += END_OF_PATH; |
| +  return CreateSingletonPattern(needle); |
| +} |
| + |
| +// Returns the singleton pattern for a query-prefix match: the EP marker |
| +// followed by the prefix, used verbatim. |
| +// NOTE(review): CanonlicalizeURLForComponentSearches emits "?" directly after |
| +// the EP marker when a query is present, so a non-empty prefix presumably has |
| +// to start with "?" in order to match - confirm against callers. |
| +SubstringPattern UrlComponentPatterns::CreateQueryPrefixPattern( |
| +    const std::string& prefix) { |
| +  std::string needle(END_OF_PATH); |
| +  needle += prefix; |
| +  return CreateSingletonPattern(needle); |
| +} |
| + |
| +// Returns the singleton pattern for a query-suffix match: the suffix, used |
| +// verbatim, followed by the EU marker. |
| +SubstringPattern UrlComponentPatterns::CreateQuerySuffixPattern( |
| +    const std::string& suffix) { |
| +  std::string needle(suffix); |
| +  needle += END_OF_URL; |
| +  return CreateSingletonPattern(needle); |
| +} |
| + |
| +// Returns the singleton pattern for an exact query match: the query string, |
| +// used verbatim, enclosed between the EP and EU markers. |
| +// NOTE(review): CanonlicalizeURLForComponentSearches emits "?" directly after |
| +// the EP marker when a query is present, so a non-empty argument presumably |
| +// has to start with "?" in order to match - confirm against callers. |
| +SubstringPattern UrlComponentPatterns::CreateQueryEqualsPattern( |
| +    const std::string& str) { |
| +  std::string needle(END_OF_PATH); |
| +  needle += str; |
| +  needle += END_OF_URL; |
| +  return CreateSingletonPattern(needle); |
| +} |
| + |
| +// Returns the singleton pattern matching a host suffix immediately followed |
| +// by a path prefix: host_suffix, the ED marker, then path_prefix. Both |
| +// arguments are used verbatim. |
| +SubstringPattern UrlComponentPatterns::CreateHostSuffixPathPrefixPattern( |
| +    const std::string& host_suffix, |
| +    const std::string& path_prefix) { |
| +  std::string needle(host_suffix); |
| +  needle += END_OF_DOMAIN; |
| +  needle += path_prefix; |
| +  return CreateSingletonPattern(needle); |
| +} |
| + |
| +// Builds the string that url_* patterns are searched in: |
| +//   BU <host with leading "."> <path> [? <query>] EU |
| +// No ED/EP markers are inserted; the "?" and the query are only emitted when |
| +// the URL has a query. |
| +std::string UrlComponentPatterns::CanonlicalizeURLForFullSearches( |
| +    const GURL& url) { |
| +  std::string canonical(BEGINNING_OF_URL); |
| +  canonical += CanonicalizeHostname(url.host()); |
| +  canonical += url.path(); |
| +  if (url.has_query()) { |
| +    canonical += "?"; |
| +    canonical += url.query(); |
| +  } |
| +  canonical += END_OF_URL; |
| +  return canonical; |
| +} |
| + |
| +// Returns the singleton pattern for a URL-prefix match: the BU marker |
| +// followed by the prefix, which CanonicalizeHostname gives a leading "." if |
| +// it does not already have one. |
| +SubstringPattern UrlComponentPatterns::CreateURLPrefixPattern( |
| +    const std::string& prefix) { |
| +  std::string needle(BEGINNING_OF_URL); |
| +  needle += CanonicalizeHostname(prefix); |
| +  return CreateSingletonPattern(needle); |
| +} |
| + |
| +// Returns the singleton pattern for a URL-suffix match: the suffix, used |
| +// verbatim, followed by the EU marker. |
| +SubstringPattern UrlComponentPatterns::CreateURLSuffixPattern( |
| +    const std::string& suffix) { |
| +  std::string needle(suffix); |
| +  needle += END_OF_URL; |
| +  return CreateSingletonPattern(needle); |
| +} |
| + |
| +// Returns the singleton pattern for a URL-contains match. No position marker |
| +// is attached; the substring itself is the pattern. |
| +SubstringPattern UrlComponentPatterns::CreateURLContainsPattern( |
| +    const std::string& str) { |
| +  const std::string& needle = str; |
| +  return CreateSingletonPattern(needle); |
| +} |
| + |
| +// Returns the singleton pattern for an exact URL match: the BU marker, the |
| +// argument with a leading "." ensured by CanonicalizeHostname, then the EU |
| +// marker. |
| +SubstringPattern UrlComponentPatterns::CreateURLEqualsPattern( |
| +    const std::string& str) { |
| +  std::string needle(BEGINNING_OF_URL); |
| +  needle += CanonicalizeHostname(str); |
| +  needle += END_OF_URL; |
| +  return CreateSingletonPattern(needle); |
| +} |
| + |
| +// Returns the SubstringPattern registered for the given pattern string. If |
| +// none is registered yet, a new one is created with the next value of |
| +// id_counter_ (which is then incremented), stored in pattern_singletons_ and |
| +// returned. Repeated calls with the same string therefore yield the stored |
| +// entry for as long as it remains in the map. |
| +SubstringPattern UrlComponentPatterns::CreateSingletonPattern( |
| +    const std::string& pattern) { |
| +  std::map<std::string, SubstringPattern>::const_iterator existing = |
| +      pattern_singletons_.find(pattern); |
| +  if (existing == pattern_singletons_.end()) { |
| +    // Not registered yet: create the map slot and fill it with a fresh id. |
| +    SubstringPattern& slot = pattern_singletons_[pattern]; |
| +    slot = SubstringPattern(pattern, id_counter_++); |
| +    return slot; |
| +  } |
| +  return existing->second; |
| +} |
| + |
| +// Returns the hostname unchanged if it already starts with ".", otherwise |
| +// returns it with a "." prepended. |
| +std::string UrlComponentPatterns::CanonicalizeHostname( |
| +    const std::string hostname) const { |
| +  const bool has_leading_dot = StartsWithASCII(hostname, ".", true); |

Matt Perry
2012/02/14 01:38:34
nit: for this simple test, I'd just check hostname
battre
2012/02/14 19:32:21
Done.
|
| +  return has_leading_dot ? hostname : "." + hostname; |
| +} |
| + |
| +// Removes the entry keyed by the pattern's string from pattern_singletons_. |
| +// Nothing happens if no such entry exists. |
| +void UrlComponentPatterns::DestroySingletonPattern( |
| +    const SubstringPattern& pattern) { |
| +  const std::string& key = pattern.pattern(); |
| +  pattern_singletons_.erase(key); |
| +} |
| + |
| +// Calls DestroySingletonPattern for every element of the vector, in order. |
| +void UrlComponentPatterns::DestroySingletonPatterns( |
| +    const std::vector<SubstringPattern>& patterns) { |
| +  typedef std::vector<SubstringPattern>::size_type Index; |
| +  for (Index pos = 0; pos < patterns.size(); ++pos) |
| +    DestroySingletonPattern(patterns[pos]); |
| +} |
| + |
| +} // namespace extensions |