Chromium Code Reviews| Index: content/browser/renderer_host/duplicate_content_resource_handler.cc |
| =================================================================== |
| --- content/browser/renderer_host/duplicate_content_resource_handler.cc (revision 0) |
| +++ content/browser/renderer_host/duplicate_content_resource_handler.cc (revision 0) |
| @@ -0,0 +1,135 @@ |
| +// Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| +// Use of this source code is governed by a BSD-style license that can be |
| +// found in the LICENSE file. |
| + |
| +#include "content/browser/renderer_host/duplicate_content_resource_handler.h" |
| + |
| +#include <set> |
| + |
| +#include "base/logging.h" |
| +#include "base/memory/singleton.h" |
| +#include "base/metrics/histogram.h" |
| +#include "content/browser/renderer_host/resource_request_info_impl.h" |
| +#include "net/base/io_buffer.h" |
| +#include "net/url_request/url_request.h" |
| + |
| +namespace content { |
| + |
|
darin (slow to review)
2012/07/25 22:02:06
nit: no new line here
frankwang
2012/07/26 21:46:19
Done.
|
| +namespace { |
| + |
// Holder for the two sets of 32-bit hashes that
// DuplicateContentResourceHandler::RecordContentMetrics() looks up and
// inserts into: one set for hashes of response contents alone, one for
// hashes of contents followed by the request URL.
// NOTE(review): no locking is visible in this class; presumably both sets are
// only ever touched from a single thread -- confirm against the callers.
| +class GlobalDuplicateRecords { |
| + public: |

darin (slow to review)
2012/07/25 22:02:06
nit: indentation
frankwang
2012/07/26 21:46:19
Done.
|
// Returns the instance managed by Singleton<GlobalDuplicateRecords>.
| + static GlobalDuplicateRecords* GetInstance() { |
| + return Singleton<GlobalDuplicateRecords>::get(); |

darin (slow to review)
2012/07/25 22:02:06
we should only use Singleton<> for static objects
frankwang
2012/07/26 21:46:19
Done.
|
| + } |
| + |
// Mutable access to the set of content-only hashes; the pointer refers to a
// member of this object.
| + std::set<MH_UINT32>* content_matches() { |
| + return &content_matches_; |
| + } |
| + |
// Mutable access to the set of content-plus-URL hashes; the pointer refers to
// a member of this object.
| + std::set<MH_UINT32>* content_and_url_matches() { |
| + return &content_and_url_matches_; |
| + } |
| + |
| + private: |
// Friend declarations give the Singleton machinery access to the private
// constructor and destructor below.
| + friend class Singleton<GlobalDuplicateRecords>; |
| + friend struct DefaultSingletonTraits<GlobalDuplicateRecords>; |
| + |
| + GlobalDuplicateRecords() {} |
| + ~GlobalDuplicateRecords() {} |
| + |
// Nothing in this class ever removes entries from either set.
| + std::set<MH_UINT32> content_matches_; |

darin (slow to review)
2012/07/25 22:02:06
how big can these sets grow? do we need to worry
frankwang
2012/07/26 21:46:19
We crawled the Alexa top 25,000 list and found tha
|
| + std::set<MH_UINT32> content_and_url_matches_; |
| +}; |
| +} // namespace |
|
darin (slow to review)
2012/07/25 22:02:06
nit: add a new line above the close of the namespace
frankwang
2012/07/26 21:46:19
Done.
|
| + |
| +// Wraps |next_handler| and starts with an empty hash state: no bytes counted |
| +// yet and both PMurHash32 accumulators at zero. |request| is stored as a raw |
| +// pointer and later queried for its URL. |
| +DuplicateContentResourceHandler::DuplicateContentResourceHandler( |
| +    scoped_ptr<ResourceHandler> next_handler, |
| +    ResourceType::Type resource_type, |
| +    net::URLRequest* request) |
| +    : LayeredResourceHandler(next_handler.Pass()), |
| +      resource_type_(resource_type), |
| +      bytes_read_(0), |
| +      request_(request), |
| +      pmurhash_ph1_(0), |
| +      pmurhash_pcarry_(0) {} |
| + |
| +// Nothing to release explicitly; the body is intentionally empty. |
| +DuplicateContentResourceHandler::~DuplicateContentResourceHandler() {} |
| + |
| +// Asks the next handler for a read buffer. When that succeeds, the buffer is |
| +// remembered in read_buffer_ so OnReadCompleted() can hash the data read |
| +// into it. Returns whatever the next handler returned. |
| +bool DuplicateContentResourceHandler::OnWillRead(int request_id, |
| +                                                 net::IOBuffer** buf, |
| +                                                 int* buf_size, |
| +                                                 int min_size) { |
| +  DCHECK_EQ(-1, min_size); |
| + |
| +  const bool accepted = |
| +      next_handler_->OnWillRead(request_id, buf, buf_size, min_size); |
| +  if (accepted) |
| +    read_buffer_ = *buf; |
| +  return accepted; |
| +} |
| + |
// Folds the bytes just read into the running PMurHash32 state, adds their
// count to bytes_read_, then forwards the notification to the next handler
// and returns that handler's result.
| +bool DuplicateContentResourceHandler:: |
| + OnReadCompleted(int request_id, int bytes_read, bool* defer) { |
// Hashes the first bytes_read bytes of read_buffer_, which OnWillRead() set.
| + PMurHash32_Process(&pmurhash_ph1_, &pmurhash_pcarry_, |

darin (slow to review)
2012/07/25 22:02:06
how do we know that this function isn't going to s
frankwang
2012/07/26 21:46:19
PMurHash is relatively fast. The MurMur hash famil
|
| + read_buffer_->data(), bytes_read); |
// Running total; later passed to PMurHash32_Result() as the hashed length.
| + bytes_read_ += bytes_read; |
| + return next_handler_->OnReadCompleted(request_id, bytes_read, defer); |
| +} |
| + |
// Records duplicate-content metrics for requests that finished successfully,
// then forwards the completion to the next handler and returns its result.
| +bool DuplicateContentResourceHandler::OnResponseCompleted( |
| + int request_id, |
| + const net::URLRequestStatus& status, |
| + const std::string& security_info) { |
| + |
// Unsuccessful requests are skipped: nothing is recorded for them.
| + if (status.is_success()) |
| + DuplicateContentResourceHandler::RecordContentMetrics(status); |

darin (slow to review)
2012/07/25 22:02:06
no need for the "DuplicateContentResourceHandler::
frankwang
2012/07/26 21:46:19
Done.
|
| + |
| + return next_handler_->OnResponseCompleted(request_id, status, security_info); |
| +} |
| + |
// Finalizes the content hash, derives a second hash that also covers the
// request URL, reports to UMA whether each hash was already present in the
// GlobalDuplicateRecords sets, and then inserts both hashes into those sets.
// Finishes by clearing bytes_read_ and read_buffer_.
// The status argument is not read anywhere in this function.
| +void DuplicateContentResourceHandler:: |
| + RecordContentMetrics(const net::URLRequestStatus& status) { |

darin (slow to review)
2012/07/25 22:02:06
you don't seem to need this parameter.
frankwang
2012/07/26 21:46:19
Deleted.
|
| + MH_UINT32 contents_hash = PMurHash32_Result(pmurhash_ph1_, |

darin (slow to review)
2012/07/25 22:02:06
what about the runtime of this function? is it co
frankwang
2012/07/26 21:46:19
Comment left above about PMurHash.
|
| + pmurhash_pcarry_, bytes_read_); |

darin (slow to review)
2012/07/25 22:02:06
nit: indentation of the parameters
frankwang
2012/07/26 21:46:19
Done.
|
| + |
| + // Combine the contents_hash with the url, so we can test if future content |
| + // identical resources have the same original url or not. |
| + MH_UINT32 hashed_with_url; |
| + const std::string url_spec = request_->url().spec(); |

darin (slow to review)
2012/07/25 22:02:06
nit: you should use a const ref here: "const std:
frankwang
2012/07/26 21:46:19
Done.
|
// The URL bytes are fed into the same member hash state that already holds
// the body, so the second hash covers body followed by URL.
// NOTE(review): pmurhash_ph1_ / pmurhash_pcarry_ are modified here and are not
// reset at the end of this function; presumably it runs at most once per
// handler -- confirm.
| + PMurHash32_Process(&pmurhash_ph1_, &pmurhash_pcarry_, |
| + url_spec.data(), url_spec.length()); |
| + hashed_with_url = PMurHash32_Result(pmurhash_ph1_, pmurhash_pcarry_, |
| + url_spec.length() + bytes_read_); |
| + |
| + DVLOG(4) << "url: " << url_spec; |
| + DVLOG(4) << "contents hash: " << contents_hash; |
| + DVLOG(4) << "hash with url: " << hashed_with_url; |
| + |
| + std::set<MH_UINT32>* content_matches = |
| + GlobalDuplicateRecords::GetInstance()->content_matches(); |
| + std::set<MH_UINT32>* content_and_url_matches = |
| + GlobalDuplicateRecords::GetInstance()->content_and_url_matches(); |
| + |
// Both lookups happen before this response's own hashes are inserted, so a
// hit means an earlier response produced the same hash.
| + const bool did_match_contents = content_matches->count(contents_hash) > 0; |
| + const bool did_match_contents_and_url = |
| + content_and_url_matches->count(hashed_with_url) > 0; |
| + |
| + UMA_HISTOGRAM_BOOLEAN("Duplicate.Hits", did_match_contents); |
| + UMA_HISTOGRAM_BOOLEAN("Duplicate.HitsSameUrl", did_match_contents && |
| + did_match_contents_and_url); |
// Same contents seen before, but not in combination with this URL.
// The insert inside this branch repeats the unconditional insert of the same
// value a few rows further down.
| + if (did_match_contents && !did_match_contents_and_url) { |
| + content_and_url_matches->insert(hashed_with_url); |

darin (slow to review)
2012/07/25 22:02:06
nit: indentation
frankwang
2012/07/26 21:46:19
Done.
|
| + UMA_HISTOGRAM_CUSTOM_COUNTS("Duplicate.Size.HashHitUrlMiss", bytes_read_, |
| + 1, 0x7FFFFFFF, 50); |
| + UMA_HISTOGRAM_ENUMERATION("Duplicate.ResourceType.HashHitUrlMiss", |
| + resource_type_, ResourceType::LAST_TYPE); |
| + } |
// Remember both hashes so later responses can be compared against them.
| + content_matches->insert(contents_hash); |
| + content_and_url_matches->insert(hashed_with_url); |
| + |
| + bytes_read_ = 0; |
| + read_buffer_ = NULL; |
| +} |
| +} // namespace content |
|
darin (slow to review)
2012/07/25 22:02:06
nit: add a new line above the close of the namespace
frankwang
2012/07/26 21:46:19
Done.
|
| + |
|
darin (slow to review)
2012/07/25 22:02:06
nit: extraneous blank line?
frankwang
2012/07/26 21:46:19
Done.
|