| Index: components/precache/core/precache_fetcher.cc
|
| diff --git a/components/precache/core/precache_fetcher.cc b/components/precache/core/precache_fetcher.cc
|
| index 8f45bff3136821d26dcf92c8a563c76179d90e90..77766e1588e61f4af7fcaaf351b457915a93311c 100644
|
| --- a/components/precache/core/precache_fetcher.cc
|
| +++ b/components/precache/core/precache_fetcher.cc
|
| @@ -6,6 +6,7 @@
|
|
|
| #include <algorithm>
|
| #include <limits>
|
| +#include <set>
|
| #include <utility>
|
| #include <vector>
|
|
|
| @@ -53,11 +54,14 @@ const int kNoTracking =
|
| net::LOAD_DO_NOT_SAVE_COOKIES | net::LOAD_DO_NOT_SEND_COOKIES |
|
| net::LOAD_DO_NOT_SEND_AUTH_DATA;
|
|
|
| -namespace {
|
| -
|
| -// The maximum number of URLFetcher requests that can be on flight in parallel.
|
| +// The maximum number of URLFetcher requests that can be in flight in parallel.
|
| +// Note that OnManifestFetchComplete and OnResourceFetchComplete perform
|
| +// remove_if operations which are O(kMaxParallelFetches). Those should be
|
| +// optimized before increasing this value significantly.
|
| const int kMaxParallelFetches = 10;
|
|
|
| +namespace {
|
| +
|
| // The maximum for the Precache.Fetch.ResponseBytes.* histograms. We set this to
|
| // a number we expect to be in the 99th percentile for the histogram, give or
|
| // take.
|
| @@ -183,25 +187,25 @@ std::string GetResourceURLBase64Hash(const std::vector<GURL>& urls) {
|
| // hosts in |hosts_to_fetch|, is added to |hosts_info|.
|
| std::deque<ManifestHostInfo> RetrieveManifestInfo(
|
| const base::WeakPtr<PrecacheDatabase>& precache_database,
|
| - std::vector<std::string> hosts_to_fetch) {
|
| + std::vector<std::pair<std::string, int64_t>> hosts_to_fetch) {
|
| std::deque<ManifestHostInfo> hosts_info;
|
| if (!precache_database)
|
| return hosts_info;
|
|
|
| for (const auto& host : hosts_to_fetch) {
|
| - auto referrer_host_info = precache_database->GetReferrerHost(host);
|
| + auto referrer_host_info = precache_database->GetReferrerHost(host.first);
|
| if (referrer_host_info.id != PrecacheReferrerHostEntry::kInvalidId) {
|
| std::vector<GURL> used_urls, unused_urls;
|
| precache_database->GetURLListForReferrerHost(referrer_host_info.id,
|
| &used_urls, &unused_urls);
|
| hosts_info.push_back(
|
| - ManifestHostInfo(referrer_host_info.manifest_id, host,
|
| - GetResourceURLBase64Hash(used_urls),
|
| + ManifestHostInfo(referrer_host_info.manifest_id, host.first,
|
| + host.second, GetResourceURLBase64Hash(used_urls),
|
| GetResourceURLBase64Hash(unused_urls)));
|
| } else {
|
| hosts_info.push_back(
|
| - ManifestHostInfo(PrecacheReferrerHostEntry::kInvalidId, host,
|
| - std::string(), std::string()));
|
| + ManifestHostInfo(PrecacheReferrerHostEntry::kInvalidId, host.first,
|
| + host.second, std::string(), std::string()));
|
| }
|
| }
|
| return hosts_info;
|
| @@ -225,6 +229,10 @@ bool IsQuotaTimeExpired(const PrecacheQuota& quota,
|
| start_time + base::TimeDelta::FromDays(1) < time_now;
|
| }
|
|
|
| +double ResourceWeight(const PrecacheResource& resource, int64_t host_visits) {
|
| + return resource.weight_ratio() * host_visits;
|
| +}
|
| +
|
| } // namespace
|
|
|
| PrecacheFetcher::Fetcher::Fetcher(
|
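The ResourceWeight() helper added above multiplies a resource's manifest weight_ratio() by the visit count of its referring host; OnManifestFetchComplete (further down in this patch) then drops any resource whose weight falls below the configured min_weight() before ranking. The following standalone sketch, which is not part of the patch, illustrates that interaction; the simplified signature, URLs, ratios, visit counts, and min_weight value are all invented for illustration.

    // Hypothetical illustration of ResourceWeight() plus the min_weight filter.
    // Real inputs come from the manifest proto and the precache database.
    #include <cstdint>
    #include <iostream>

    double ResourceWeight(double weight_ratio, int64_t host_visits) {
      return weight_ratio * host_visits;
    }

    int main() {
      const double kMinWeight = 1.0;  // Assumed stand-in for min_weight().
      struct Sample {
        const char* url;
        double weight_ratio;  // From the host's manifest.
        int64_t host_visits;  // From the local history/precache database.
      } samples[] = {
          {"https://news.example/app.js", 0.8, 50},      // weight 40  -> kept
          {"https://news.example/promo.gif", 0.02, 50},  // weight 1   -> kept
          {"https://blog.example/rare.css", 0.3, 2},     // weight 0.6 -> dropped
      };
      for (const Sample& s : samples) {
        const double weight = ResourceWeight(s.weight_ratio, s.host_visits);
        std::cout << s.url << " weight=" << weight
                  << (weight >= kMinWeight ? " (queued for ranking)"
                                           : " (skipped)")
                  << "\n";
      }
      return 0;
    }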
| @@ -301,9 +309,6 @@ void PrecacheFetcher::Fetcher::OnURLFetchDownloadProgress(
|
| // |current_network_bytes| is guaranteed to be non-negative, so this cast
|
| // is safe.
|
| static_cast<size_t>(current_network_bytes) > max_bytes_) {
|
| - VLOG(1) << "Cancelling " << url_ << ": (" << current << "/" << total
|
| - << ") is over " << max_bytes_;
|
| -
|
| // Call the completion callback, to attempt the next download, or to trigger
|
| // cleanup in precache_delegate_->OnDone().
|
| response_bytes_ = current;
|
| @@ -363,30 +368,22 @@ void PrecacheFetcher::RecordCompletionStatistics(
|
| base::TimeDelta::FromSeconds(1),
|
| base::TimeDelta::FromHours(4), 50);
|
|
|
| - // Number of manifests for which we have downloaded all resources.
|
| - int manifests_completed =
|
| - unfinished_work.num_manifest_urls() - remaining_manifest_urls_to_fetch;
|
| -
|
| - // If there are resource URLs left to fetch, the last manifest is not yet
|
| - // completed.
|
| - if (remaining_resource_urls_to_fetch > 0)
|
| - --manifests_completed;
|
| -
|
| - DCHECK_GE(manifests_completed, 0);
|
| - int percent_completed = unfinished_work.num_manifest_urls() == 0
|
| - ? 0
|
| - : (static_cast<double>(manifests_completed) /
|
| - unfinished_work.num_manifest_urls() * 100);
|
| + int num_total_resources = unfinished_work.num_resource_urls();
|
| + int percent_completed =
|
| + num_total_resources == 0
|
| + ? 101 // Overflow bucket.
|
| + : (100 * (static_cast<double>(num_total_resources -
|
| + remaining_resource_urls_to_fetch) /
|
| + num_total_resources));
|
|
|
| UMA_HISTOGRAM_PERCENTAGE("Precache.Fetch.PercentCompleted",
|
| percent_completed);
|
| - UMA_HISTOGRAM_CUSTOM_COUNTS("Precache.Fetch.ResponseBytes.Total",
|
| - unfinished_work.total_bytes(),
|
| - 1, kMaxResponseBytes, 100);
|
| + UMA_HISTOGRAM_CUSTOM_COUNTS("Precache.Fetch.ResponseBytes.Total",
|
| + unfinished_work.total_bytes(), 1,
|
| + kMaxResponseBytes, 100);
|
| UMA_HISTOGRAM_CUSTOM_COUNTS("Precache.Fetch.ResponseBytes.Network",
|
| - unfinished_work.network_bytes(),
|
| - 1, kMaxResponseBytes,
|
| - 100);
|
| + unfinished_work.network_bytes(), 1,
|
| + kMaxResponseBytes, 100);
|
| }
|
|
|
| // static
|
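Worked example for the PercentCompleted change above: with num_resource_urls() == 200 and 50 resource URLs still left to fetch, the reported value is 100 * (200 - 50) / 200 = 75; when num_resource_urls() is 0, the sentinel 101 lands in the histogram's overflow bucket rather than reporting 0% as the old manifest-based computation did.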
| @@ -426,8 +423,10 @@ PrecacheFetcher::PrecacheFetcher(
|
| // keeping track of the current resource index.
|
| for (const auto& resource : unfinished_work->resource()) {
|
| if (resource.has_url() && resource.has_top_host_name()) {
|
| + // Weight doesn't matter, as the resources have already been sorted by
|
| + // this point.
|
| resources_to_fetch_.emplace_back(GURL(resource.url()),
|
| - resource.top_host_name());
|
| + resource.top_host_name(), 0);
|
| }
|
| }
|
| unfinished_work_ = std::move(unfinished_work);
|
| @@ -446,28 +445,24 @@ std::unique_ptr<PrecacheUnfinishedWork> PrecacheFetcher::CancelPrecaching() {
|
| // If config fetch is incomplete, |top_hosts_to_fetch_| will be empty and
|
| // top hosts should be left as is in |unfinished_work_|.
|
| unfinished_work_->clear_top_host();
|
| - for (const auto& top_host : top_hosts_to_fetch_) {
|
| + for (const auto& top_host : top_hosts_fetching_)
|
| unfinished_work_->add_top_host()->set_hostname(top_host.hostname);
|
| - }
|
| + for (const auto& top_host : top_hosts_to_fetch_)
|
| + unfinished_work_->add_top_host()->set_hostname(top_host.hostname);
|
| + }
|
| + for (const auto& resource : resources_fetching_) {
|
| + auto new_resource = unfinished_work_->add_resource();
|
| + new_resource->set_url(resource.url.spec());
|
| + new_resource->set_top_host_name(resource.referrer);
|
| }
|
| for (const auto& resource : resources_to_fetch_) {
|
| auto new_resource = unfinished_work_->add_resource();
|
| - new_resource->set_url(resource.first.spec());
|
| - new_resource->set_top_host_name(resource.second);
|
| - }
|
| - for (const auto& it : pool_.elements()) {
|
| - const Fetcher* fetcher = it.first;
|
| - GURL config_url =
|
| - config_url_.is_empty() ? GetDefaultConfigURL() : config_url_;
|
| - if (fetcher->is_resource_request()) {
|
| - auto resource = unfinished_work_->add_resource();
|
| - resource->set_url(fetcher->url().spec());
|
| - resource->set_top_host_name(fetcher->referrer());
|
| - } else if (fetcher->url() != config_url) {
|
| - unfinished_work_->add_top_host()->set_hostname(fetcher->referrer());
|
| - }
|
| + new_resource->set_url(resource.url.spec());
|
| + new_resource->set_top_host_name(resource.referrer);
|
| }
|
| + top_hosts_fetching_.clear();
|
| top_hosts_to_fetch_.clear();
|
| + resources_fetching_.clear();
|
| resources_to_fetch_.clear();
|
| pool_.DeleteAll();
|
| return std::move(unfinished_work_);
|
| @@ -488,7 +483,6 @@ void PrecacheFetcher::Start() {
|
|
|
| // Fetch the precache configuration settings from the server.
|
| DCHECK(pool_.IsEmpty()) << "All parallel requests should be available";
|
| - VLOG(3) << "Fetching " << config_url;
|
| pool_.Add(base::MakeUnique<Fetcher>(
|
| request_context_.get(), config_url, std::string(),
|
| base::Bind(&PrecacheFetcher::OnConfigFetchComplete, AsWeakPtr()),
|
| @@ -498,35 +492,35 @@ void PrecacheFetcher::Start() {
|
| void PrecacheFetcher::StartNextResourceFetch() {
|
| DCHECK(unfinished_work_->has_config_settings());
|
| while (!resources_to_fetch_.empty() && pool_.IsAvailable()) {
|
| - const auto& resource = resources_to_fetch_.front();
|
| + ResourceInfo& resource = resources_to_fetch_.front();
|
| const size_t max_bytes = std::min(
|
| quota_.remaining(),
|
| std::min(unfinished_work_->config_settings().max_bytes_per_resource(),
|
| unfinished_work_->config_settings().max_bytes_total() -
|
| unfinished_work_->total_bytes()));
|
| - VLOG(3) << "Fetching " << resource.first << " " << resource.second;
|
| pool_.Add(base::MakeUnique<Fetcher>(
|
| - request_context_.get(), resource.first, resource.second,
|
| + request_context_.get(), resource.url, resource.referrer,
|
| base::Bind(&PrecacheFetcher::OnResourceFetchComplete, AsWeakPtr()),
|
| true /* is_resource_request */, max_bytes));
|
|
|
| + resources_fetching_.push_back(std::move(resource));
|
| resources_to_fetch_.pop_front();
|
| }
|
| }
|
|
|
| -void PrecacheFetcher::StartNextManifestFetch() {
|
| - if (top_hosts_to_fetch_.empty() || !pool_.IsAvailable())
|
| - return;
|
| -
|
| - // We only fetch one manifest at a time to keep the size of
|
| - // resources_to_fetch_ as small as possible.
|
| - VLOG(3) << "Fetching " << top_hosts_to_fetch_.front().manifest_url;
|
| - pool_.Add(base::MakeUnique<Fetcher>(
|
| - request_context_.get(), top_hosts_to_fetch_.front().manifest_url,
|
| - top_hosts_to_fetch_.front().hostname,
|
| - base::Bind(&PrecacheFetcher::OnManifestFetchComplete, AsWeakPtr()),
|
| - false /* is_resource_request */, std::numeric_limits<int32_t>::max()));
|
| - top_hosts_to_fetch_.pop_front();
|
| +void PrecacheFetcher::StartNextManifestFetches() {
|
| + // We fetch as many manifests at a time as possible, as we need all resource
|
| + // URLs in memory in order to rank them.
|
| + while (!top_hosts_to_fetch_.empty() && pool_.IsAvailable()) {
|
| + ManifestHostInfo& top_host = top_hosts_to_fetch_.front();
|
| + pool_.Add(base::MakeUnique<Fetcher>(
|
| + request_context_.get(), top_host.manifest_url, top_host.hostname,
|
| + base::Bind(&PrecacheFetcher::OnManifestFetchComplete, AsWeakPtr(),
|
| + top_host.visits),
|
| + false /* is_resource_request */, std::numeric_limits<int32_t>::max()));
|
| + top_hosts_fetching_.push_back(std::move(top_host));
|
| + top_hosts_to_fetch_.pop_front();
|
| + }
|
| }
|
|
|
| void PrecacheFetcher::NotifyDone(
|
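The two fetch-start loops above share the same bookkeeping pattern: an entry moves from its *_to_fetch_ deque into the corresponding *_fetching_ list when its Fetcher is created, and the completion callbacks later erase it from the in-flight list by URL with remove_if. A minimal standalone sketch of that pattern follows; the types, container choices, and URLs here are assumptions for illustration, not the patch's own declarations.

    // Sketch of the pending/in-flight bookkeeping used for top hosts and
    // resources: start requests from the deque, drop them from the list by
    // URL when they complete.
    #include <deque>
    #include <iostream>
    #include <list>
    #include <string>

    struct Item {
      std::string url;
    };

    int main() {
      std::deque<Item> to_fetch = {{"https://a.test/m"}, {"https://b.test/m"}};
      std::list<Item> fetching;

      // "Start" every request the pool has room for (here: all of them).
      while (!to_fetch.empty()) {
        fetching.push_back(std::move(to_fetch.front()));
        to_fetch.pop_front();
      }

      // "Complete" one request: remove it from the in-flight list by URL,
      // mirroring the remove_if calls in the completion callbacks.
      const std::string completed_url = "https://a.test/m";
      fetching.remove_if([&completed_url](const Item& item) {
        return item.url == completed_url;
      });

      std::cout << "still fetching: " << fetching.size() << "\n";  // Prints 1.
    }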
| @@ -545,23 +539,14 @@ void PrecacheFetcher::StartNextFetch() {
|
| if ((unfinished_work_->total_bytes() >
|
| unfinished_work_->config_settings().max_bytes_total()) ||
|
| quota_.remaining() == 0) {
|
| - size_t pending_manifests_in_pool = 0;
|
| - size_t pending_resources_in_pool = 0;
|
| - for (const auto& element_pair : pool_.elements()) {
|
| - const Fetcher* fetcher = element_pair.first;
|
| - if (fetcher->is_resource_request())
|
| - pending_resources_in_pool++;
|
| - else if (fetcher->url() != config_url_)
|
| - pending_manifests_in_pool++;
|
| - }
|
| pool_.DeleteAll();
|
| - NotifyDone(top_hosts_to_fetch_.size() + pending_manifests_in_pool,
|
| - resources_to_fetch_.size() + pending_resources_in_pool);
|
| + NotifyDone(top_hosts_to_fetch_.size() + top_hosts_fetching_.size(),
|
| + resources_to_fetch_.size() + resources_fetching_.size());
|
| return;
|
| }
|
|
|
| StartNextResourceFetch();
|
| - StartNextManifestFetch();
|
| + StartNextManifestFetches();
|
| if (top_hosts_to_fetch_.empty() && resources_to_fetch_.empty() &&
|
| pool_.IsEmpty()) {
|
| // There are no more URLs to fetch, so end the precache cycle.
|
| @@ -589,9 +574,7 @@ void PrecacheFetcher::OnConfigFetchComplete(const Fetcher& source) {
|
| void PrecacheFetcher::DetermineManifests() {
|
| DCHECK(unfinished_work_->has_config_settings());
|
|
|
| - std::vector<std::string> top_hosts_to_fetch;
|
| - std::unique_ptr<std::deque<ManifestHostInfo>> top_hosts_info(
|
| - new std::deque<ManifestHostInfo>);
|
| + std::vector<std::pair<std::string, int64_t>> top_hosts_to_fetch;
|
| // Keep track of manifest URLs that are being fetched, in order to elide
|
| // duplicates.
|
| std::set<base::StringPiece> seen_top_hosts;
|
| @@ -602,7 +585,7 @@ void PrecacheFetcher::DetermineManifests() {
|
| if (rank > unfinished_work_->config_settings().top_sites_count())
|
| break;
|
| if (seen_top_hosts.insert(host.hostname()).second)
|
| - top_hosts_to_fetch.push_back(host.hostname());
|
| + top_hosts_to_fetch.emplace_back(host.hostname(), host.visits());
|
| }
|
|
|
| // Attempt to fetch manifests for starting hosts up to the maximum top sites
|
| @@ -613,12 +596,15 @@ void PrecacheFetcher::DetermineManifests() {
|
| if (resources_to_fetch_.empty()) {
|
| for (const std::string& host :
|
| unfinished_work_->config_settings().forced_site()) {
|
| + // We add a forced site with visits == 0, which means its resources will
|
| + // be downloaded last. TODO(twifkak): Consider removing support for
|
| + // forced_site.
|
| if (seen_top_hosts.insert(host).second)
|
| - top_hosts_to_fetch.push_back(host);
|
| + top_hosts_to_fetch.emplace_back(host, 0);
|
| }
|
| }
|
| - // We only fetch one manifest at a time to keep the size of
|
| - // resources_to_fetch_ as small as possible.
|
| + // We retrieve manifest usage and quota info from the local database before
|
| + // fetching the manifests.
|
| PostTaskAndReplyWithResult(
|
| db_task_runner_.get(), FROM_HERE,
|
| base::Bind(&RetrieveManifestInfo, precache_database_,
|
| @@ -636,7 +622,7 @@ void PrecacheFetcher::OnManifestInfoRetrieved(
|
| // is invalid.
|
| top_hosts_to_fetch_.clear();
|
| unfinished_work_->set_num_manifest_urls(manifests_info.size());
|
| - NotifyDone(manifests_info.size(), resources_to_fetch_.size());
|
| + NotifyDone(manifests_info.size(), resources_to_rank_.size());
|
| return;
|
| }
|
|
|
| @@ -683,10 +669,12 @@ void PrecacheFetcher::OnQuotaInfoRetrieved(const PrecacheQuota& quota) {
|
|
|
| ManifestHostInfo::ManifestHostInfo(int64_t manifest_id,
|
| const std::string& hostname,
|
| + int64_t visits,
|
| const std::string& used_url_hash,
|
| const std::string& unused_url_hash)
|
| : manifest_id(manifest_id),
|
| hostname(hostname),
|
| + visits(visits),
|
| used_url_hash(used_url_hash),
|
| unused_url_hash(unused_url_hash) {}
|
|
|
| @@ -696,7 +684,19 @@ ManifestHostInfo::ManifestHostInfo(ManifestHostInfo&&) = default;
|
|
|
| ManifestHostInfo& ManifestHostInfo::operator=(ManifestHostInfo&&) = default;
|
|
|
| -void PrecacheFetcher::OnManifestFetchComplete(const Fetcher& source) {
|
| +ResourceInfo::ResourceInfo(const GURL& url,
|
| + const std::string& referrer,
|
| + double weight)
|
| + : url(url), referrer(referrer), weight(weight) {}
|
| +
|
| +ResourceInfo::~ResourceInfo() {}
|
| +
|
| +ResourceInfo::ResourceInfo(ResourceInfo&&) = default;
|
| +
|
| +ResourceInfo& ResourceInfo::operator=(ResourceInfo&&) = default;
|
| +
|
| +void PrecacheFetcher::OnManifestFetchComplete(int64_t host_visits,
|
| + const Fetcher& source) {
|
| DCHECK(unfinished_work_->has_config_settings());
|
| UpdateStats(source.response_bytes(), source.network_response_bytes());
|
| if (source.network_url_fetcher() == nullptr) {
|
| @@ -715,7 +715,9 @@ void PrecacheFetcher::OnManifestFetchComplete(const Fetcher& source) {
|
| manifest.resource(i).has_url()) {
|
| GURL url(manifest.resource(i).url());
|
| if (url.is_valid()) {
|
| - resources_to_fetch_.emplace_back(url, source.referrer());
|
| + double weight = ResourceWeight(manifest.resource(i), host_visits);
|
| + if (weight >= unfinished_work_->config_settings().min_weight())
|
| + resources_to_rank_.emplace_back(url, source.referrer(), weight);
|
| }
|
| }
|
| }
|
| @@ -726,10 +728,43 @@ void PrecacheFetcher::OnManifestFetchComplete(const Fetcher& source) {
|
| }
|
| }
|
|
|
| + top_hosts_fetching_.remove_if([&source](const ManifestHostInfo& top_host) {
|
| + return top_host.manifest_url == source.url();
|
| + });
|
| +
|
| pool_.Delete(source);
|
| +
|
| + if (top_hosts_to_fetch_.empty() && top_hosts_fetching_.empty())
|
| + QueueResourcesForFetch();
|
| +
|
| StartNextFetch();
|
| }
|
|
|
| +void PrecacheFetcher::QueueResourcesForFetch() {
|
| + // Done fetching manifests. Now move resources_to_rank_ into
|
| + // resources_to_fetch_, so that StartNextFetch will begin fetching resources.
|
| + resources_to_fetch_ = std::move(resources_to_rank_);
|
| +
|
| + if (unfinished_work_->config_settings().global_ranking()) {
|
| + // Sort resources_to_fetch_ by descending weight.
|
| + std::stable_sort(resources_to_fetch_.begin(), resources_to_fetch_.end(),
|
| + [](const ResourceInfo& first, const ResourceInfo& second) {
|
| + return first.weight > second.weight;
|
| + });
|
| + }
|
| +
|
| + // Truncate to size |total_resources_count|.
|
| + const size_t num_resources = std::min(
|
| + resources_to_fetch_.size(),
|
| + static_cast<size_t>(
|
| + unfinished_work_->config_settings().total_resources_count()));
|
| + resources_to_fetch_.erase(resources_to_fetch_.begin() + num_resources,
|
| + resources_to_fetch_.end());
|
| +
|
| + // Save denominator for PercentCompleted UMA.
|
| + unfinished_work_->set_num_resource_urls(resources_to_fetch_.size());
|
| +}
|
| +
|
| void PrecacheFetcher::OnResourceFetchComplete(const Fetcher& source) {
|
| UpdateStats(source.response_bytes(), source.network_response_bytes());
|
|
|
| @@ -739,6 +774,10 @@ void PrecacheFetcher::OnResourceFetchComplete(const Fetcher& source) {
|
| source.url(), source.referrer(), base::Time::Now(),
|
| source.was_cached(), source.response_bytes()));
|
|
|
| + resources_fetching_.remove_if([&source](const ResourceInfo& resource) {
|
| + return resource.url == source.url();
|
| + });
|
| +
|
| pool_.Delete(source);
|
|
|
| // The resource has already been put in the cache during the fetch process, so
|
|
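Finally, a small self-contained sketch of the sort-then-truncate step performed by QueueResourcesForFetch() above: resources are stable-sorted by descending weight (so ties keep their manifest order) and the queue is cut down to total_resources_count entries. The ResourceInfo fields are reduced to what the comparison needs, and the URLs, weights, and count below are invented.

    // Illustration of the ranking-and-truncation step; not part of the patch.
    #include <algorithm>
    #include <deque>
    #include <iostream>
    #include <string>

    struct ResourceInfo {
      std::string url;
      double weight;
    };

    int main() {
      std::deque<ResourceInfo> resources = {
          {"https://a.test/1.js", 2.5},
          {"https://b.test/2.css", 7.0},
          {"https://c.test/3.png", 7.0},  // Ties keep manifest order (stable).
          {"https://d.test/4.js", 0.4},
      };
      const size_t kTotalResourcesCount = 3;  // Assumed config value.

      // Sort by descending weight, then keep only the first
      // kTotalResourcesCount entries.
      std::stable_sort(resources.begin(), resources.end(),
                       [](const ResourceInfo& a, const ResourceInfo& b) {
                         return a.weight > b.weight;
                       });
      const size_t num = std::min(resources.size(), kTotalResourcesCount);
      resources.erase(resources.begin() + num, resources.end());

      for (const ResourceInfo& r : resources)
        std::cout << r.url << " (" << r.weight << ")\n";
      // Prints b.test/2.css, c.test/3.png, a.test/1.js; d.test/4.js is cut.
    }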
|