Chromium Code Reviews| Index: chrome/browser/metrics/metrics_service.cc |
| diff --git a/chrome/browser/metrics/metrics_service.cc b/chrome/browser/metrics/metrics_service.cc |
| index fef349d7b40345cacbbd1b1211308e31cdb039f1..7da0645e910daea91119553543a78ff2c10ffea4 100644 |
| --- a/chrome/browser/metrics/metrics_service.cc |
| +++ b/chrome/browser/metrics/metrics_service.cc |
| @@ -239,11 +239,6 @@ const size_t kUploadLogAvoidRetransmitSize = 50000; |
| // Interval, in minutes, between state saves. |
| const int kSaveStateIntervalMinutes = 5; |
| -// Used to indicate that the response code is currently not set at all -- |
| -// RESPONSE_CODE_INVALID can sometimes be returned in response to a request if, |
| -// e.g., the server is down. |
| -const int kNoResponseCode = net::URLFetcher::RESPONSE_CODE_INVALID - 1; |
| - |
| enum ResponseStatus { |
| UNKNOWN_FAILURE, |
| SUCCESS, |
| @@ -1017,8 +1012,12 @@ void MetricsService::PushPendingLogsToPersistentStorage() { |
| // We may race here, and send second copy of initial log later. |
| if (state_ == INITIAL_LOG_READY) |
| state_ = SENDING_OLD_LOGS; |
| - MetricsLogManager::StoreType store_type = current_fetch_xml_.get() ? |
| - MetricsLogManager::PROVISIONAL_STORE : MetricsLogManager::NORMAL_STORE; |
| + |
| + MetricsLogManager::StoreType store_type; |
| + if (current_fetch_xml_.get() || current_fetch_proto_.get()) |
| + store_type = MetricsLogManager::PROVISIONAL_STORE; |
| + else |
| + store_type = MetricsLogManager::NORMAL_STORE; |
| log_manager_.StoreStagedLogAsUnsent(store_type); |
| } |
| DCHECK(!log_manager_.has_staged_log()); |
| @@ -1121,14 +1120,6 @@ void MetricsService::OnFinalLogInfoCollectionDone() { |
| } |
| MakeStagedLog(); |
| - |
| - // MakeStagedLog should have prepared log text; if it didn't, skip this |
| - // upload and hope things work out next time. |
| - if (log_manager_.staged_log_text().empty()) { |
| - scheduler_->UploadCancelled(); |
| - return; |
| - } |
| - |
| SendStagedLog(); |
| } |
| @@ -1139,7 +1130,7 @@ void MetricsService::MakeStagedLog() { |
| switch (state_) { |
| case INITIALIZED: |
| case INIT_TASK_SCHEDULED: // We should be further along by now. |
| - DCHECK(false); |
| + NOTREACHED(); |
| return; |
| case INIT_TASK_DONE: |
| @@ -1207,26 +1198,21 @@ void MetricsService::SendStagedLog() { |
| PrepareFetchWithStagedLog(); |
| - if (!current_fetch_xml_.get()) { |
| - DCHECK(!current_fetch_proto_.get()); |
| + if (!current_fetch_xml_.get() && !current_fetch_proto_.get()) { |
| // Compression failed, and log discarded :-/. |
| + // Skip this upload and hope things work out next time. |
| log_manager_.DiscardStagedLog(); |
| scheduler_->UploadCancelled(); |
| // TODO(jar): If compression failed, we should have created a tiny log and |
| // compressed that, so that we can signal that we're losing logs. |
| return; |
| } |
| - // Currently, the staged log for the protobuf version of the data is discarded |
| - // after we create the URL request, so that there is no chance for |
| - // re-transmission in case the corresponding XML request fails. We will |
| - // handle protobuf failures more carefully once that becomes the main |
| - // pipeline, i.e. once we switch away from the XML pipeline. |
| - DCHECK(current_fetch_proto_.get() || !log_manager_.has_staged_log_proto()); |
| DCHECK(!waiting_for_asynchronous_reporting_step_); |
| - |
| waiting_for_asynchronous_reporting_step_ = true; |
| - current_fetch_xml_->Start(); |
| + |
| + if (current_fetch_xml_.get()) |
| + current_fetch_xml_->Start(); |
| if (current_fetch_proto_.get()) |
| current_fetch_proto_->Start(); |
| @@ -1234,20 +1220,22 @@ void MetricsService::SendStagedLog() { |
| } |
| void MetricsService::PrepareFetchWithStagedLog() { |
| - DCHECK(!log_manager_.staged_log_text().empty()); |
| + DCHECK(log_manager_.has_staged_log()); |
| // Prepare the XML version. |
| DCHECK(!current_fetch_xml_.get()); |
| - current_fetch_xml_.reset(content::URLFetcher::Create( |
| - GURL(server_url_xml_), net::URLFetcher::POST, this)); |
| - current_fetch_xml_->SetRequestContext( |
| - g_browser_process->system_request_context()); |
| - current_fetch_xml_->SetUploadData(kMimeTypeXml, |
| - log_manager_.staged_log_text().xml); |
| - // We already drop cookies server-side, but we might as well strip them out |
| - // client-side as well. |
| - current_fetch_xml_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES | |
| - net::LOAD_DO_NOT_SEND_COOKIES); |
| + if (log_manager_.has_staged_log_xml()) { |
| + current_fetch_xml_.reset(content::URLFetcher::Create( |
| + GURL(server_url_xml_), net::URLFetcher::POST, this)); |
| + current_fetch_xml_->SetRequestContext( |
| + g_browser_process->system_request_context()); |
| + current_fetch_xml_->SetUploadData(kMimeTypeXml, |
| + log_manager_.staged_log_text().xml); |
| + // We already drop cookies server-side, but we might as well strip them out |
| + // client-side as well. |
| + current_fetch_xml_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES | |
| + net::LOAD_DO_NOT_SEND_COOKIES); |
| + } |
| // Prepare the protobuf version. |
| DCHECK(!current_fetch_proto_.get()); |
| @@ -1262,12 +1250,6 @@ void MetricsService::PrepareFetchWithStagedLog() { |
| // client-side as well. |
| current_fetch_proto_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES | |
| net::LOAD_DO_NOT_SEND_COOKIES); |
| - |
| - // Discard the protobuf version of the staged log, so that we will avoid |
| - // re-uploading it even if we need to re-upload the XML version. |
| - // TODO(isherman): Handle protobuf upload failures more gracefully once we |
| - // transition away from the XML-based pipeline. |
| - log_manager_.DiscardStagedLogProto(); |
| } |
| } |
| @@ -1295,22 +1277,9 @@ static const char* StatusToString(const net::URLRequestStatus& status) { |
| } |
| // We need to wait for two responses: the response to the XML upload, and the |
| -// response to the protobuf upload. For now, only the XML upload's response |
| -// affects decisions like whether to retry the upload, whether to abandon the |
| -// upload because it is too large, etc. However, we still need to wait for the |
| -// protobuf upload, as we cannot reset |current_fetch_proto_| until we have |
| -// confirmation that the network request was sent; and the easiest way to do |
| -// that is to wait for the response. In case the XML upload's response arrives |
| -// first, we cache that response until the protobuf upload's response also |
| -// arrives. |
| -// |
| -// Note that if the XML upload succeeds but the protobuf upload fails, we will |
| -// not retry the protobuf upload. If the XML upload fails while the protobuf |
| -// upload succeeds, we will still avoid re-uploading the protobuf data because |
| -// we "zap" the data after the first upload attempt. This means that we might |
| -// lose protobuf uploads when XML ones succeed; but we will never duplicate any |
| -// protobuf uploads. Protobuf failures should be rare enough to where this |
| -// should be ok while we have the two pipelines running in parallel. |
| +// response to the protobuf upload. In the case that exactly one of the uploads |
| +// fails and needs to be retried, we "zap" the other pipeline's staged log to |
| +// avoid incorrectly re-uploading it as well. |
| void MetricsService::OnURLFetchComplete(const net::URLFetcher* source) { |
| DCHECK(waiting_for_asynchronous_reporting_step_); |
| @@ -1318,72 +1287,76 @@ void MetricsService::OnURLFetchComplete(const net::URLFetcher* source) { |
| // Note however that |source| is aliased to one of these, so we should be |
| // careful not to delete it too early. |
| scoped_ptr<net::URLFetcher> s; |
| + bool is_xml; |
| if (source == current_fetch_xml_.get()) { |
| s.reset(current_fetch_xml_.release()); |
| - |
| - // Cache the XML responses, in case we still need to wait for the protobuf |
| - // response. |
| - response_code_ = source->GetResponseCode(); |
| - response_status_ = StatusToString(source->GetStatus()); |
| - source->GetResponseAsString(&response_data_); |
| - |
| - // Log a histogram to track response success vs. failure rates. |
| - UMA_HISTOGRAM_ENUMERATION("UMA.UploadResponseStatus.XML", |
| - ResponseCodeToStatus(response_code_), |
| - NUM_RESPONSE_STATUSES); |
| + is_xml = true; |
| } else if (source == current_fetch_proto_.get()) { |
| s.reset(current_fetch_proto_.release()); |
| - |
| - // Log a histogram to track response success vs. failure rates. |
| - UMA_HISTOGRAM_ENUMERATION("UMA.UploadResponseStatus.Protobuf", |
| - ResponseCodeToStatus(source->GetResponseCode()), |
| - NUM_RESPONSE_STATUSES); |
| + is_xml = false; |
| } else { |
| NOTREACHED(); |
| return; |
| } |
| - // If we're still waiting for one of the responses, keep waiting... |
| - if (current_fetch_xml_.get() || current_fetch_proto_.get()) |
| - return; |
| + int response_code = source->GetResponseCode(); |
| - // We should only be able to reach here once we've received responses to both |
| - // the XML and the protobuf requests. We should always have the response code |
| - // available. |
| - DCHECK_NE(response_code_, kNoResponseCode); |
| - waiting_for_asynchronous_reporting_step_ = false; |
| + // Log a histogram to track response success vs. failure rates. |
| + if (is_xml) { |
| + UMA_HISTOGRAM_ENUMERATION("UMA.UploadResponseStatus.XML", |
| + ResponseCodeToStatus(response_code), |
| + NUM_RESPONSE_STATUSES); |
| + } else { |
| + UMA_HISTOGRAM_ENUMERATION("UMA.UploadResponseStatus.Protobuf", |
| + ResponseCodeToStatus(response_code), |
| + NUM_RESPONSE_STATUSES); |
| + } |
| // If the upload was provisionally stored, drop it now that the upload is |
| // known to have gone through. |
| log_manager_.DiscardLastProvisionalStore(); |
| // Confirm send so that we can move on. |
| - VLOG(1) << "Metrics response code: " << response_code_ |
| - << " status=" << response_status_; |
| + VLOG(1) << "Metrics response code: " << response_code |
|
jar (doing other things)
2012/06/11 21:19:35
nit: I think there was a move to reduce the verbosity of these logs [comment truncated in source]
|
| + << " status=" << StatusToString(source->GetStatus()) << " for " |
| + << (is_xml ? "xml" : "protobuf") << " upload."; |
| - bool upload_succeeded = response_code_ == 200; |
| + bool upload_succeeded = response_code == 200; |
| // Provide boolean for error recovery (allow us to ignore response_code). |
| bool discard_log = false; |
| - |
| - if (!upload_succeeded && |
| - log_manager_.staged_log_text().xml.length() > |
| - kUploadLogAvoidRetransmitSize) { |
| - UMA_HISTOGRAM_COUNTS( |
| - "UMA.Large Rejected Log was Discarded", |
| - static_cast<int>(log_manager_.staged_log_text().xml.length())); |
| + const size_t log_size = is_xml ? |
| + log_manager_.staged_log_text().xml.length() : |
| + log_manager_.staged_log_text().proto.length(); |
| + if (!upload_succeeded && log_size > kUploadLogAvoidRetransmitSize) { |
| + UMA_HISTOGRAM_COUNTS("UMA.Large Rejected Log was Discarded", |
| + static_cast<int>(log_size)); |
| discard_log = true; |
| - } else if (response_code_ == 400) { |
| + } else if (response_code == 400) { |
| // Bad syntax. Retransmission won't work. |
| discard_log = true; |
| } |
| if (!upload_succeeded && !discard_log) { |
| - VLOG(1) << "Metrics: transmission attempt returned a failure code: " |
| - << response_code_ << ". Verify network connectivity"; |
| - LogBadResponseCode(); |
| + LogBadResponseCode(response_code, is_xml); |
| } else { // Successful receipt (or we are discarding log). |
| - VLOG(1) << "Metrics response data: " << response_data_; |
| + std::string response_data; |
| + source->GetResponseAsString(&response_data); |
| + VLOG(1) << "Metrics response data: " << response_data; |
|
jar (doing other things)
2012/06/11 21:19:35
nit: can/should this be debug-only logging?
|
| + |
| + if (is_xml) |
| + log_manager_.DiscardStagedLogXml(); |
| + else |
| + log_manager_.DiscardStagedLogProto(); |
| + } |
| + |
| + // If we're still waiting for one of the responses, keep waiting... |
| + if (current_fetch_xml_.get() || current_fetch_proto_.get()) |
| + return; |
| + |
| + waiting_for_asynchronous_reporting_step_ = false; |
| + |
| + if (!log_manager_.has_staged_log()) { |
| switch (state_) { |
| case INITIAL_LOG_READY: |
| state_ = SENDING_OLD_LOGS; |
| @@ -1402,18 +1375,23 @@ void MetricsService::OnURLFetchComplete(const net::URLFetcher* source) { |
| break; |
| } |
| - log_manager_.DiscardStagedLog(); |
| - |
| if (log_manager_.has_unsent_logs()) |
| DCHECK_LT(state_, SENDING_CURRENT_LOGS); |
| } |
| // Error 400 indicates a problem with the log, not with the server, so |
| // don't consider that a sign that the server is in trouble. |
| - bool server_is_healthy = upload_succeeded || response_code_ == 400; |
| - |
| - scheduler_->UploadFinished(server_is_healthy, |
| - log_manager_.has_unsent_logs()); |
| + bool server_is_healthy = upload_succeeded || response_code == 400; |
| + |
| + // Note that we are essentially randomly choosing to report either the XML or |
| + // the protobuf server's health status, but this is ok: In the case that the |
| + // two statuses are not the same, one of the uploads succeeded but the other |
| + // did not. In this case, we'll re-try only the upload that failed. This first |
| + // re-try might ignore the server's unhealthiness; but the response to the |
| + // re-tried upload will correctly propagate the server's health status for any |
| + // subsequent re-tries. Hence, we'll at most delay slowing the upload rate by |
| + // one re-try, which is fine. |
| + scheduler_->UploadFinished(server_is_healthy, log_manager_.has_unsent_logs()); |
| // Collect network stats if UMA upload succeeded. |
| IOThread* io_thread = g_browser_process->io_thread(); |
| @@ -1422,21 +1400,25 @@ void MetricsService::OnURLFetchComplete(const net::URLFetcher* source) { |
| chrome_browser_net::CollectPipeliningCapabilityStatsOnUIThread( |
| http_pipelining_test_server_, io_thread); |
| } |
| - |
| - // Reset the cached response data. |
| - response_code_ = kNoResponseCode; |
| - response_data_ = std::string(); |
| - response_status_ = std::string(); |
| } |
| -void MetricsService::LogBadResponseCode() { |
| - VLOG(1) << "Verify your metrics logs are formatted correctly. Verify server " |
| - "is active at " << server_url_xml_; |
| +void MetricsService::LogBadResponseCode(int response_code, bool is_xml) { |
| + VLOG(1) << "Metrics: transmission attempt returned a failure code: " |
|
jar (doing other things)
2012/06/11 21:19:35
nit: debug only? Will folks ever notice this in output? [comment truncated in source]
|
| + << response_code << ". Verify network connectivity and that your " |
| + << "metrics logs are formatted correctly. Verify server is active at " |
| + << (is_xml ? server_url_xml_ : server_url_proto_) << "."; |
| + |
| if (!log_manager_.has_staged_log()) { |
| - VLOG(1) << "METRICS: Recorder shutdown during log transmission."; |
| + // TODO(isherman): I don't think this code is reachable... |
| + VLOG(1) << "Metrics: Recorder shutdown during log transmission."; |
| } else { |
| - VLOG(1) << "METRICS: transmission retry being scheduled for " |
| - << log_manager_.staged_log_text().xml; |
| + if (is_xml) { |
| + VLOG(1) << "Metrics: transmission retry being scheduled for \n" |
| + << log_manager_.staged_log_text().xml; |
| + } else { |
| + VLOG(1) |
| + << "Metrics: transmission retry being scheduled for protobuf upload."; |
|
jar (doing other things)
2012/06/11 21:19:35
nit: vlogs throughout.
|
| + } |
| } |
| } |