chrome/browser/safe_browsing/safe_browsing_database.cc - Issue 10802098: Cleanup histograms tracking prefix-set issues in safe-browsing.

Unified Diff: chrome/browser/safe_browsing/safe_browsing_database.cc

Issue 10802098: Cleanup histograms tracking prefix-set issues in safe-browsing. (Closed) Base URL: http://git.chromium.org/chromium/src.git@master

Patch Set: Created 8 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: chrome/browser/safe_browsing/safe_browsing_database.cc

diff --git a/chrome/browser/safe_browsing/safe_browsing_database.cc b/chrome/browser/safe_browsing/safe_browsing_database.cc

index 4234ed0d9a06a58a32a69735e386ba3df694ceff..edbcd961b6a5e731407fe3ca229dcd87af4dcf8a 100644

--- a/chrome/browser/safe_browsing/safe_browsing_database.cc

+++ b/chrome/browser/safe_browsing/safe_browsing_database.cc

@@ -236,19 +236,26 @@ bool SBAddFullHashPrefixLess(const SBAddFullHash& a, const SBAddFullHash& b) {

// - Bloom filter false positives are prefix set misses.

// The following is to log actual performance to verify this.

enum PrefixSetEvent {

- PREFIX_SET_EVENT_HIT,

- PREFIX_SET_EVENT_BLOOM_HIT,

- PREFIX_SET_EVENT_BLOOM_MISS_PREFIX_HIT,

- PREFIX_SET_EVENT_BLOOM_MISS_PREFIX_HIT_INVALID,

+ // Hits to prefix set and bloom filter.

+ PREFIX_SET_HIT,

+ PREFIX_SET_BLOOM_HIT,

+ // These were to track bloom misses which hit the prefix set, with

+ // _INVALID to track where the item didn't appear to actually be in

+ // the prefix set. _INVALID was never hit.

+ PREFIX_SET_BLOOM_MISS_PREFIX_SET_HIT,

+ PREFIX_SET_BLOOM_MISS_PREFIX_SET_HIT_INVALID_OBSOLETE,

+ // GetPrefixes() after creation failed to get the same prefixes.

PREFIX_SET_GETPREFIXES_BROKEN,

- PREFIX_SET_GETPREFIXES_BROKEN_SIZE,

- PREFIX_SET_GETPREFIXES_FIRST_BROKEN,

- PREFIX_SET_SBPREFIX_WAS_BROKEN,

- PREFIX_SET_GETPREFIXES_BROKEN_SORTING,

- PREFIX_SET_GETPREFIXES_BROKEN_DUPLICATION,

- PREFIX_SET_GETPREFIX_UNSORTED_IS_DELTA,

- PREFIX_SET_GETPREFIX_UNSORTED_IS_INDEX,

- PREFIX_SET_GETPREFIX_CHECKSUM_MISMATCH,

+ // Fine-grained tests which didn't provide any good direction.

+ PREFIX_SET_GETPREFIXES_BROKEN_SIZE_OBSOLETE,

+ PREFIX_SET_GETPREFIXES_FIRST_BROKEN_OBSOLETE,

+ PREFIX_SET_SBPREFIX_WAS_BROKEN_OBSOLETE,

+ PREFIX_SET_GETPREFIXES_BROKEN_SORTING_OBSOLETE,

+ PREFIX_SET_GETPREFIXES_BROKEN_DUPLICATION_OBSOLETE,

+ PREFIX_SET_GETPREFIXES_UNSORTED_IS_DELTA_OBSOLETE,

+ PREFIX_SET_GETPREFIXES_UNSORTED_IS_INDEX_OBSOLETE,

+ // Failed checksum when creating prefix set.

+ PREFIX_SET_CREATE_PREFIX_SET_CHECKSUM,

// Memory space for histograms is determined by the max. ALWAYS ADD

// NEW VALUES BEFORE THIS ONE.

@@ -260,10 +267,14 @@ void RecordPrefixSetInfo(PrefixSetEvent event_type) {

PREFIX_SET_EVENT_MAX);

}

+// Helper to reduce code duplication.

+safe_browsing::PrefixSet* CreateEmptyPrefixSet() {

+ return new safe_browsing::PrefixSet(std::vector<SBPrefix>());

// Generate a |PrefixSet| instance from the contents of

// |add_prefixes|. Additionally performs various checks to make sure

-// that the resulting prefix set is valid, so that the

-// PREFIX_SET_EVENT_BLOOM_MISS_PREFIX_HIT_INVALID histogram in

+// that the resulting prefix set is valid, so that histograms in

// ContainsBrowseUrl() can be trustworthy.

safe_browsing::PrefixSet* PrefixSetFromAddPrefixes(

const SBAddPrefixes& add_prefixes) {

@@ -293,107 +304,35 @@ safe_browsing::PrefixSet* PrefixSetFromAddPrefixes(

std::equal(prefixes.begin(), prefixes.end(), restored.begin()))

return prefix_set.release();

- // Log BROKEN for continuity with previous release, and SIZE to

- // distinguish which test failed.

+ // NOTE(shess): Past histograms have indicated that in a given day,

+ // about 1 in 100,000 updates result in a PrefixSet which was

+ // inconsistent relative to the BloomFilter. Windows is about 5x

+ // more likely to build an inconsistent PrefixSet than Mac. A

+ // number of developers have reviewed the code, and I ran extensive

+ // fuzzing with random data, so at this point I'm trying to

+ // demonstrate memory corruption.

+ //

+ // Other findings from past instrumentation:

+ // - half of one percent of brokenness cases implied duplicate items

+ // in |prefixes|. Per the code above, this should not be

+ // possible.

+ // - about 1/20 of broken cases happened more than once for a given

+ // user. Note that empty updates generally don't hit this code at

+ // all, so that may not imply a specific input pattern breaking things.

+ // - about 1/3 of broken cases show a checksum mismatch between the

+ // checksum calculated while creating |prefix_set| and the

+ // checksum calculated immediately after creation. This is almost

+ // certainly memory corruption.

NOTREACHED();

RecordPrefixSetInfo(PREFIX_SET_GETPREFIXES_BROKEN);

- if (restored.size() != prefixes.size())

- RecordPrefixSetInfo(PREFIX_SET_GETPREFIXES_BROKEN_SIZE);

- // Try to distinguish between updates from one broken user and a

- // distributed problem.

- static bool logged_broken = false;

- if (!logged_broken) {

- RecordPrefixSetInfo(PREFIX_SET_GETPREFIXES_FIRST_BROKEN);

- logged_broken = true;

- }

- // This seems so very very unlikely. But if it ever were true, then

- // it could explain why GetPrefixes() seemed broken.

- if (sizeof(int) != sizeof(int32))

- RecordPrefixSetInfo(PREFIX_SET_SBPREFIX_WAS_BROKEN);

- // Check if memory was corrupted during construction.

+ // Broken because internal memory was corrupted during construction.

if (!prefix_set->CheckChecksum())

- RecordPrefixSetInfo(PREFIX_SET_GETPREFIX_CHECKSUM_MISMATCH);

- // Check whether |restored| is unsorted, or has duplication.

- if (restored.size()) {

- size_t unsorted_count = 0;

- bool duplicates = false;

- SBPrefix prev = restored[0];

- for (size_t i = 0; i < restored.size(); prev = restored[i], ++i) {

- if (prev > restored[i]) {

- unsorted_count++;

- UMA_HISTOGRAM_COUNTS("SB2.PrefixSetUnsortedDifference",

- prev - restored[i]);

- // When unsorted, how big is the set, and how far are we into

- // it. If the set is very small or large, that might inform

- // pursuit of a degenerate case. If the percentage is close

- // to 0%, 100%, or 50%, then there might be an interesting

- // degenerate case to explore.

- UMA_HISTOGRAM_COUNTS("SB2.PrefixSetUnsortedSize", restored.size());

- UMA_HISTOGRAM_PERCENTAGE("SB2.PrefixSetUnsortedPercent",

- i * 100 / restored.size());

- if (prefix_set->IsDeltaAt(i)) {

- RecordPrefixSetInfo(PREFIX_SET_GETPREFIX_UNSORTED_IS_DELTA);

- // Histograms require memory on the order of the number of

- // buckets, making high-precision logging expensive. For

- // now aim for a sense of the range of the problem.

- UMA_HISTOGRAM_CUSTOM_COUNTS("SB2.PrefixSetUnsortedDelta",

- prefix_set->DeltaAt(i), 1, 0xFFFF, 50);

- } else {

- RecordPrefixSetInfo(PREFIX_SET_GETPREFIX_UNSORTED_IS_INDEX);

- }

- if (prev == restored[i])

- duplicates = true;

- }

+ RecordPrefixSetInfo(PREFIX_SET_CREATE_PREFIX_SET_CHECKSUM);

- // Record findings.

- if (unsorted_count) {

- RecordPrefixSetInfo(PREFIX_SET_GETPREFIXES_BROKEN_SORTING);

- UMA_HISTOGRAM_COUNTS_100("SB2.PrefixSetUnsorted", unsorted_count);

- }

- if (duplicates)

- RecordPrefixSetInfo(PREFIX_SET_GETPREFIXES_BROKEN_DUPLICATION);

- // Fix the problems noted. If |restored| was unsorted, then

- // |duplicates| may give a false negative.

- if (unsorted_count)

- std::sort(restored.begin(), restored.end());

- if (unsorted_count || duplicates)

- restored.erase(std::unique(restored.begin(), restored.end()),

- restored.end());

- }

- // NOTE(shess): The following could be done using a single

- // uber-loop, but it's complicated by needing multiple parallel

- // iterators. Didn't seem worthwhile for something that will only

- // live for a short period and only fires for one in a million

- // updates.

+ // TODO(shess): Test whether |prefixes| changed during construction.

- // Find elements in |restored| which are not in |prefixes|.

- std::vector<SBPrefix> difference;

- std::set_difference(restored.begin(), restored.end(),

- prefixes.begin(), prefixes.end(),

- std::back_inserter(difference));

- if (difference.size())

- UMA_HISTOGRAM_COUNTS_100("SB2.PrefixSetRestoredExcess", difference.size());

- // Find elements in |prefixes| which are not in |restored|.

- difference.clear();

- std::set_difference(prefixes.begin(), prefixes.end(),

- restored.begin(), restored.end(),

- std::back_inserter(difference));

- if (difference.size())

- UMA_HISTOGRAM_COUNTS_100("SB2.PrefixSetRestoredShortfall",

- difference.size());

- return prefix_set.release();

+ return CreateEmptyPrefixSet();

}

} // namespace

@@ -616,7 +555,7 @@ bool SafeBrowsingDatabaseNew::ResetDatabase() {

BloomFilter::kBloomFilterSizeRatio);

// TODO(shess): It is simpler for the code to assume that presence

// of a bloom filter always implies presence of a prefix set.

- prefix_set_.reset(new safe_browsing::PrefixSet(std::vector<SBPrefix>()));

+ prefix_set_.reset(CreateEmptyPrefixSet());

}

// Wants to acquire the lock itself.

WhitelistEverything(&csd_whitelist_);

@@ -654,44 +593,26 @@ bool SafeBrowsingDatabaseNew::ContainsBrowseUrl(

// not empty.

const bool prefix_set_empty = !prefix_set_->GetSize();

- // Used to double-check in case of a hit mis-match.

- std::vector<SBPrefix> restored;

size_t miss_count = 0;

for (size_t i = 0; i < full_hashes.size(); ++i) {

bool found = prefix_set_->Exists(full_hashes[i].prefix);

if (browse_bloom_filter_->Exists(full_hashes[i].prefix)) {

if (!prefix_set_empty) {

- RecordPrefixSetInfo(PREFIX_SET_EVENT_BLOOM_HIT);

+ RecordPrefixSetInfo(PREFIX_SET_BLOOM_HIT);

+ // This should be less than PREFIX_SET_BLOOM_HIT by the

+ // false positive rate.

if (found)

- RecordPrefixSetInfo(PREFIX_SET_EVENT_HIT);

+ RecordPrefixSetInfo(PREFIX_SET_HIT);

}

prefix_hits->push_back(full_hashes[i].prefix);

if (prefix_miss_cache_.count(full_hashes[i].prefix) > 0)

++miss_count;

} else {

- // Bloom filter misses should never be in prefix set. Re-create

- // the original prefixes and manually search for it, to check if

- // there's a bug with how |Exists()| is implemented.

- // |UpdateBrowseStore()| previously verified that

- // |GetPrefixes()| returns the same prefixes as were passed to

- // the constructor.

+ // Bloom filter misses should never be in prefix set.

DCHECK(!found);

- if (found && !prefix_set_empty) {

- if (restored.empty())

- prefix_set_->GetPrefixes(&restored);

- // If the item is not in the re-created list, then there is an

- // error in |PrefixSet::Exists()|. If the item is in the

- // re-created list, then the bloom filter was wrong.

- if (std::binary_search(restored.begin(), restored.end(),

- full_hashes[i].prefix)) {

- RecordPrefixSetInfo(PREFIX_SET_EVENT_BLOOM_MISS_PREFIX_HIT);

- } else {

- RecordPrefixSetInfo(PREFIX_SET_EVENT_BLOOM_MISS_PREFIX_HIT_INVALID);

- }

+ if (found)

+ RecordPrefixSetInfo(PREFIX_SET_BLOOM_MISS_PREFIX_SET_HIT);

}

@@ -1386,7 +1307,7 @@ void SafeBrowsingDatabaseNew::LoadBloomFilter() {

RecordFailure(FAILURE_DATABASE_FILTER_READ);

// Use an empty prefix set until the first update.

- prefix_set_.reset(new safe_browsing::PrefixSet(std::vector<SBPrefix>()));

+ prefix_set_.reset(CreateEmptyPrefixSet());

}

bool SafeBrowsingDatabaseNew::Delete() {

« no previous file with comments | « no previous file | no next file » | no next file with comments »