chrome/common/metrics/entropy_provider_unittest.cc - Issue 10830318: Use a different algorithm with the low entropy source for field trials.

Unified Diff: chrome/common/metrics/entropy_provider_unittest.cc

Issue 10830318: Use a different algorithm with the low entropy source for field trials. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/

Patch Set: Created 8 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: chrome/common/metrics/entropy_provider_unittest.cc

===================================================================

--- chrome/common/metrics/entropy_provider_unittest.cc (revision 0)

+++ chrome/common/metrics/entropy_provider_unittest.cc (revision 0)

@@ -0,0 +1,328 @@

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

+#include <cmath>

+#include <limits>

+#include <numeric>

+#include "base/basictypes.h"

+#include "base/guid.h"

+#include "base/memory/scoped_ptr.h"

+#include "base/rand_util.h"

+#include "base/string_number_conversions.h"

+#include "chrome/common/metrics/entropy_provider.h"

+#include "testing/gtest/include/gtest/gtest.h"

+namespace metrics {

+namespace {

+// Computes the Chi-Square statistic for |values| assuming they follow a uniform

+// distribution, where each entry has expected value |expected_value|.

+//

+// The Chi-Square statistic is defined as Sum((O-E)^2/E) where O is the observed

+// value and E is the expected value.

+double ComputeChiSquare(const std::vector<int>& values,

+ double expected_value) {

+ double sum = 0;

+ for (size_t i = 0; i < values.size(); ++i) {

+ const double delta = values[i] - expected_value;

+ sum += (delta * delta) / expected_value;

+ }

+ return sum;

+} // namespace

+class EntropyProviderTest : public testing::Test {

+ public:

+ // Computes SHA1-based entropy for the given |trial_name| based on

+ // |entropy_source|.

+ double GenerateSHA1Entropy(const std::string& entropy_source,

+ const std::string& trial_name) {

+ SHA1EntropyProvider sha1_provider(entropy_source);

+ return sha1_provider.GetEntropyForTrial(trial_name);

+ }

+ // Generates permutation-based entropy for the given |trial_name| based on

+ // |entropy_source| which must be in the range [0, entropy_max).

+ double GeneratePermutedEntropy(uint16 entropy_source,

+ size_t entropy_max,

+ const std::string& trial_name) {

+ PermutedEntropyProvider permuted_provider(entropy_source, entropy_max);

+ return permuted_provider.GetEntropyForTrial(trial_name);

+ }

+};

+TEST_F(EntropyProviderTest, UseOneTimeRandomizationSHA1) {

+ // Simply asserts that two trials using one-time randomization

+ // that have different names normally generate different results.

+ //

+ // Note that depending on the one-time random initialization, they

+ // _might_ actually give the same result, but we know that given

+ // the particular client_id we use for unit tests they won't.

+ base::FieldTrialList field_trial_list(new SHA1EntropyProvider("client_id"));

+ scoped_refptr<base::FieldTrial> trials[] = {

+ base::FieldTrialList::FactoryGetFieldTrial("one", 100, "default",

+ base::FieldTrialList::kExpirationYearInFuture, 1, 1, NULL),

+ base::FieldTrialList::FactoryGetFieldTrial("two", 100, "default",

+ base::FieldTrialList::kExpirationYearInFuture, 1, 1, NULL),

+ };

+ for (size_t i = 0; i < arraysize(trials); ++i) {

+ trials[i]->UseOneTimeRandomization();

+ for (int j = 0; j < 100; ++j)

+ trials[i]->AppendGroup("", 1);

+ }

+ // The trials are most likely to give different results since they have

+ // different names.

+ ASSERT_NE(trials[0]->group(), trials[1]->group());

+ ASSERT_NE(trials[0]->group_name(), trials[1]->group_name());

+TEST_F(EntropyProviderTest, UseOneTimeRandomizationPermuted) {

+ // Simply asserts that two trials using one-time randomization

+ // that have different names normally generate different results.

+ //

+ // Note that depending on the one-time random initialization, they

+ // _might_ actually give the same result, but we know that given the

+ // particular low entropy source (1234) used in this test they won't.

+ const size_t kMaxLowEntropySize = (1 << 13);

+ base::FieldTrialList field_trial_list(

+ new PermutedEntropyProvider(1234, kMaxLowEntropySize));

+ scoped_refptr<base::FieldTrial> trials[] = {

+ base::FieldTrialList::FactoryGetFieldTrial("one", 100, "default",

+ base::FieldTrialList::kExpirationYearInFuture, 1, 1, NULL),

+ base::FieldTrialList::FactoryGetFieldTrial("two", 100, "default",

+ base::FieldTrialList::kExpirationYearInFuture, 1, 1, NULL),

+ };

+ for (size_t i = 0; i < arraysize(trials); ++i) {

+ trials[i]->UseOneTimeRandomization();

+ for (int j = 0; j < 100; ++j)

+ trials[i]->AppendGroup("", 1);

+ }

+ // The trials are most likely to give different results since they have

+ // different names.

+ ASSERT_NE(trials[0]->group(), trials[1]->group());

+ ASSERT_NE(trials[0]->group_name(), trials[1]->group_name());

+TEST_F(EntropyProviderTest, SHA1Entropy) {

+ const double results[] = {

+ GenerateSHA1Entropy("hi", "1"),

+ GenerateSHA1Entropy("there", "1"),

+ };

+ ASSERT_NE(results[0], results[1]);

+ for (size_t i = 0; i < arraysize(results); ++i) {

+ ASSERT_LE(0.0, results[i]);

+ ASSERT_GT(1.0, results[i]);

+ }

+ ASSERT_EQ(GenerateSHA1Entropy("yo", "1"),

+ GenerateSHA1Entropy("yo", "1"));

+ ASSERT_NE(GenerateSHA1Entropy("yo", "something"),

+ GenerateSHA1Entropy("yo", "else"));

+TEST_F(EntropyProviderTest, PermutedEntropy) {

+ const size_t kMaxLowEntropySize = (1 << 13);

+ const double results[] = {

+ GeneratePermutedEntropy(1234, kMaxLowEntropySize, "1"),

+ GeneratePermutedEntropy(4321, kMaxLowEntropySize, "1"),

+ };

+ ASSERT_NE(results[0], results[1]);

+ for (size_t i = 0; i < arraysize(results); ++i) {

+ ASSERT_LE(0.0, results[i]);

+ ASSERT_GT(1.0, results[i]);

+ }

+ ASSERT_EQ(GeneratePermutedEntropy(1234, kMaxLowEntropySize, "1"),

+ GeneratePermutedEntropy(1234, kMaxLowEntropySize, "1"));

+ ASSERT_NE(GeneratePermutedEntropy(1234, kMaxLowEntropySize, "something"),

+ GeneratePermutedEntropy(1234, kMaxLowEntropySize, "else"));

+TEST_F(EntropyProviderTest, SHA1EntropyIsUniform) {

+ // Size of the low entropy source to append to the high entropy source for

+ // input to the SHA1 entropy provider.

+ const size_t kMaxLowEntropySize = (1 << 13);

+ // Number of buckets in the simulated field trials.

+ const size_t kBucketCount = 20;

+ // Max number of iterations to perform before giving up and failing.

+ const size_t kMaxIterationCount = 100000;

+ // The number of iterations to perform before each time the statistical

+ // significance of the results is checked.

+ const size_t kCheckIterationCount = 10000;

Ilya Sherman 2012/08/22 04:07:42 Is it really important to check every 10000 iterat

Alexei Svitkine (slow) 2012/08/22 16:53:27 The test verifies whether the observed distributio

+ // This is the Chi-Square threshold from the Chi-Square statistic table for

+ // 19 degrees of freedom (based on |kBucketCount|) with a 99.9% confidence

+ // level. See: http://www.medcalc.org/manual/chi-square-table.php

+ const double kChiSquareThreshold = 43.82;

+ const std::string trial_names[] = {

+ "TestTrial",

+ "AnotherTestTrial",

+ "NewTabButton",

+ };

+ for (size_t i = 0; i < arraysize(trial_names); ++i) {

+ std::vector<int> distribution(kBucketCount);

+ for (size_t j = 0; j < kMaxIterationCount; ++j) {

Ilya Sherman 2012/08/22 04:07:42 nit: Perhaps this loop should start from j = 1, so

Alexei Svitkine (slow) 2012/08/22 16:53:27 Done.

+ // Use a random GUID + 13 additional bits of entropy to match how the

+ // SHA1EntropyProvider is used in metrics_service.cc.

+ const int low_entropy_source =

+ static_cast<uint16>(base::RandInt(0, kMaxLowEntropySize - 1));

+ const std::string high_entropy_source =

+ base::GenerateGUID() + base::IntToString(low_entropy_source);

+ const double entropy_value =

+ GenerateSHA1Entropy(high_entropy_source, trial_names[i]);

+ const size_t bucket = static_cast<size_t>(kBucketCount * entropy_value);

+ ASSERT_LT(bucket, kBucketCount);

+ distribution[bucket] += 1;

+ // Every |kCheckIterationCount| iterations, compute the Chi-Square

+ // statistic of the distribution. If the resulting statistic is greater

+ // than |kChiSquareThreshold|, we can conclude with 99.9% confidence

+ // that the observed samples do not follow a uniform distribution.

+ //

+ // However, at a 99.9% confidence level a truly uniform distribution

+ // would still exceed the threshold about once every 1000 checks, so do

+ // not treat a single exceedance as a failure (else the test will be

+ // flaky). Instead, perform additional iterations to determine if the

+ // distribution will converge, up to |kMaxIterationCount|.

Ilya Sherman 2012/08/22 04:07:42 Perhaps instead of running the test multiple times

Alexei Svitkine (slow) 2012/08/22 16:53:27 Using a threshold that has a higher confidence doe

+ if (((j + 1) % kCheckIterationCount) == 0) {

+ const double expected_value_per_bucket =

+ static_cast<double>(j + 1) / kBucketCount;

+ const double chi_square =

+ ComputeChiSquare(distribution, expected_value_per_bucket);

+ if (chi_square < kChiSquareThreshold)

+ break;

+ // If |j == kMaxIterationCount - 1|, the Chi-Square statistic did not

+ // converge after |kMaxIterationCount| iterations.

+ ASSERT_NE(j, kMaxIterationCount - 1) << "Failed for trial " <<

+ trial_names[i] << " with chi_square = " << chi_square <<

+ " after " << kMaxIterationCount << " iterations.";

Ilya Sherman 2012/08/22 04:07:42 This test should choose a random seed for the rand

Alexei Svitkine (slow) 2012/08/22 16:53:27 The random number generator used for the original

+ }

+TEST_F(EntropyProviderTest, PermutedEntropyIsUniform) {

+ // Size of the low entropy source to use for the permuted entropy provider.

+ const size_t kMaxLowEntropySize = (1 << 13);

+ // Number of buckets in the simulated field trials.

+ const size_t kBucketCount = 20;

+ // Max number of iterations to perform before giving up and failing.

+ const size_t kMaxIterationCount = 100000;

+ // The number of iterations to perform before each time the statistical

+ // significance of the results is checked.

+ const size_t kCheckIterationCount = 10000;

+ // This is the Chi-Square threshold from the Chi-Square statistic table for

+ // 19 degrees of freedom (based on |kBucketCount|) with a 99.9% confidence

+ // level. See: http://www.medcalc.org/manual/chi-square-table.php

+ const double kChiSquareThreshold = 43.82;

Ilya Sherman 2012/08/22 04:07:42 nit: Almost all of these constants are repeated fr

Alexei Svitkine (slow) 2012/08/22 16:53:27 Refactored the two tests to share a majority of th

+ const std::string trial_names[] = {

+ "TestTrial",

+ "AnotherTestTrial",

+ "NewTabButton",

+ };

+ for (size_t i = 0; i < arraysize(trial_names); ++i) {

+ std::vector<int> distribution(kBucketCount);

+ // Note: Given a trial name, the computed mapping will be the same.

+ // As a performance optimization, pre-compute the mapping once per trial

+ // name and index into it each iteration.

+ std::vector<uint16> mapping(kMaxLowEntropySize);

+ internal::PermuteMappingUsingTrialName(trial_names[i], &mapping);

+ for (size_t j = 0; j < kMaxIterationCount; ++j) {

+ const int low_entropy_source =

+ static_cast<uint16>(base::RandInt(0, kMaxLowEntropySize - 1));

+ const double entropy_value =

+ mapping[low_entropy_source] / static_cast<double>(kMaxLowEntropySize);

+ const size_t bucket = static_cast<size_t>(kBucketCount * entropy_value);

+ ASSERT_LT(bucket, kBucketCount);

+ distribution[bucket] += 1;

+ // Every |kCheckIterationCount| iterations, compute the Chi-Square

+ // statistic of the distribution. If the resulting statistic is greater

+ // than |kChiSquareThreshold|, we can conclude with 99.9% confidence

+ // that the observed samples do not follow a uniform distribution.

+ //

+ // However, at a 99.9% confidence level a truly uniform distribution

+ // would still exceed the threshold about once every 1000 checks, so do

+ // not treat a single exceedance as a failure (else the test will be

+ // flaky). Instead, perform additional iterations to determine if the

+ // distribution will converge, up to |kMaxIterationCount|.

+ if (((j + 1) % kCheckIterationCount) == 0) {

+ const double expected_value_per_bucket =

+ static_cast<double>(j + 1) / kBucketCount;

+ const double chi_square =

+ ComputeChiSquare(distribution, expected_value_per_bucket);

+ if (chi_square < kChiSquareThreshold)

+ break;

+ // If |j == kMaxIterationCount - 1|, the Chi-Square statistic did not

+ // converge after |kMaxIterationCount| iterations.

+ ASSERT_NE(j, kMaxIterationCount - 1) << "Failed for trial " <<

+ trial_names[i] << " with chi_square = " << chi_square <<

+ " after " << kMaxIterationCount << " iterations.";

+ }

Ilya Sherman 2012/08/22 04:07:42 nit: There is a lot of repeated code with the abov

Alexei Svitkine (slow) 2012/08/22 16:53:27 Done.

+TEST_F(EntropyProviderTest, SeededRandGeneratorIsUniform) {

+ // Verifies that SeededRandGenerator has a uniform distribution.

+ //

+ // Mirrors RandUtilTest.RandGeneratorIsUniform in base/rand_util_unittest.cc.

+ const uint32 kTopOfRange = (std::numeric_limits<uint32>::max() / 4ULL) * 3ULL;

+ const uint32 kExpectedAverage = kTopOfRange / 2ULL;

+ const uint32 kAllowedVariance = kExpectedAverage / 50ULL; // +/- 2%

+ const int kMinAttempts = 1000;

+ const int kMaxAttempts = 1000000;

+ const std::string trial_names[] = {

+ "TestTrial",

+ "AnotherTestTrial",

+ "NewTabButton",

+ };

+ for (size_t i = 0; i < arraysize(trial_names); ++i) {

+ const uint32 seed = internal::HashName(trial_names[i]);

+ internal::SeededRandGenerator rand_generator(seed);

+ double cumulative_average = 0.0;

+ int count = 0;

+ while (count < kMaxAttempts) {

+ uint32 value = rand_generator(kTopOfRange);

+ cumulative_average = (count * cumulative_average + value) / (count + 1);

+ // Don't stop until more than |kMinAttempts| values have been averaged;

+ // an early average that happens to fall in range would be a false

+ // positive.

+ if (count > kMinAttempts &&

+ kExpectedAverage - kAllowedVariance < cumulative_average &&

+ cumulative_average < kExpectedAverage + kAllowedVariance) {

+ break;

+ }

+ ++count;

+ }

+ ASSERT_LT(count, kMaxAttempts) << "Expected average was " <<

+ kExpectedAverage << ", average ended at " << cumulative_average <<

+ ", for trial " << trial_names[i];

+ }

+} // namespace metrics

« no previous file with comments | « chrome/common/metrics/entropy_provider.cc ('k') | chrome/common/metrics/variations/variations_util_unittest.cc » ('j') | no next file with comments »