chrome/common/metrics/entropy_provider_unittest.cc - Issue 10830318: Use a different algorithm with the low entropy source for field trials.

Side by Side Diff: chrome/common/metrics/entropy_provider_unittest.cc

Issue 10830318: Use a different algorithm with the low entropy source for field trials. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/

Patch Set: Created 8 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include <cmath>

	6 #include <limits>

	7 #include <numeric>

	8

	9 #include "base/basictypes.h"

	10 #include "base/guid.h"

	11 #include "base/memory/scoped_ptr.h"

	12 #include "base/rand_util.h"

	13 #include "base/string_number_conversions.h"

	14 #include "chrome/common/metrics/entropy_provider.h"

	15 #include "testing/gtest/include/gtest/gtest.h"

	16

	17 namespace {

	18

	19 // Computes the standard deviation of distribution \|values\|.

	20 double ComputeStandardDeviation(const std::vector<int>& values) {

	21 const int sum = std::accumulate(values.begin(), values.end(), 0.0);

	22 const int mean = sum / values.size();
	Ilya Sherman 2012/08/21 03:42:27 nit: Should this be a double rather than an int? nit: Should this be a double rather than an int? Alexei Svitkine (slow) 2012/08/21 14:25:49 Changed to use doubles throughout the function. ( Show quoted text On 2012/08/21 03:42:27, Ilya Sherman wrote: > nit: Should this be a double rather than an int? Changed to use doubles throughout the function. (The previous code was correct but only because the mean was always a full integer - i.e. for 100000 iterations and 20 buckets, it would always be 5000).
	23 const int sum_of_squares = std::inner_product(values.begin(), values.end(),

	24 values.begin(), 0.0);

	25 const double variance = static_cast<double>(sum_of_squares) /

	26 values.size() - (mean * mean);

	27 return std::sqrt(variance);

	28 }

	29

	30 } // namespace

	31

	32

	33 class EntropyProviderTest : public testing::Test {

	34 public:

	35 // Computes SHA1-based entropy for the given \|trial_name\| based on

	36 // \|entropy_source\|

	37 double GenerateSHA1Entropy(const std::string& entropy_source,

	38 const std::string& trial_name) {

	39 SHA1EntropyProvider sha1_provider(entropy_source);

	40 return sha1_provider.GetEntropyForTrial(trial_name);

	41 }

	42

	43 // Generates permutation-based entropy for the given \|trial_name\| based on

	44 // \|entropy_source\| which must be in the range [0, entropy_max).

	45 double GeneratePermutedEntropy(uint16 entropy_source,

	46 size_t entropy_max,

	47 const std::string& trial_name) {

	48 PermutedEntropyProvider permuted_provider(entropy_source, entropy_max);

	49 return permuted_provider.GetEntropyForTrial(trial_name);

	50 }

	51 };

	52

	53

	54 TEST_F(EntropyProviderTest, UseOneTimeRandomizationSHA1) {

	55 // Simply asserts that two trials using one-time randomization

	56 // that have different names, normally generate different results.

	57 //

	58 // Note that depending on the one-time random initialization, they

	59 // _might_ actually give the same result, but we know that given

	60 // the particular client_id we use for unit tests they won't.

	61 base::FieldTrialList field_trial_list(new SHA1EntropyProvider("client_id"));

	62 scoped_refptr<base::FieldTrial> trials[] = {

	63 base::FieldTrialList::FactoryGetFieldTrial("one", 100, "default",

	64 base::FieldTrialList::kExpirationYearInFuture, 1, 1, NULL),

	65 base::FieldTrialList::FactoryGetFieldTrial("two", 100, "default",

	66 base::FieldTrialList::kExpirationYearInFuture, 1, 1, NULL),

	67 };

	68

	69 for (size_t i = 0; i < arraysize(trials); ++i) {

	70 trials[i]->UseOneTimeRandomization();

	71

	72 for (int j = 0; j < 100; ++j)

	73 trials[i]->AppendGroup("", 1);

	74 }

	75

	76 // The trials are most likely to give different results since they have

	77 // different names.

	78 ASSERT_NE(trials[0]->group(), trials[1]->group());

	79 ASSERT_NE(trials[0]->group_name(), trials[1]->group_name());

	80 }

	81

	82 TEST_F(EntropyProviderTest, UseOneTimeRandomizationPermuted) {

	83 // Simply asserts that two trials using one-time randomization

	84 // that have different names, normally generate different results.

	85 //

	86 // Note that depending on the one-time random initialization, they

	87 // _might_ actually give the same result, but we know that given

	88 // the particular client_id we use for unit tests they won't.

	89 const size_t kMaxLowEntropySize = (1 << 13);

	90 base::FieldTrialList field_trial_list(

	91 new PermutedEntropyProvider(1234, kMaxLowEntropySize));

	92 scoped_refptr<base::FieldTrial> trials[] = {

	93 base::FieldTrialList::FactoryGetFieldTrial("one", 100, "default",

	94 base::FieldTrialList::kExpirationYearInFuture, 1, 1, NULL),

	95 base::FieldTrialList::FactoryGetFieldTrial("two", 100, "default",

	96 base::FieldTrialList::kExpirationYearInFuture, 1, 1, NULL),

	97 };

	98

	99 for (size_t i = 0; i < arraysize(trials); ++i) {

	100 trials[i]->UseOneTimeRandomization();

	101

	102 for (int j = 0; j < 100; ++j)

	103 trials[i]->AppendGroup("", 1);

	104 }

	105

	106 // The trials are most likely to give different results since they have

	107 // different names.

	108 ASSERT_NE(trials[0]->group(), trials[1]->group());

	109 ASSERT_NE(trials[0]->group_name(), trials[1]->group_name());

	110 }

	111

	112 TEST_F(EntropyProviderTest, SHA1Entropy) {

	113 const double results[] = {

	114 GenerateSHA1Entropy("hi", "1"),

	115 GenerateSHA1Entropy("there", "1"),

	116 };

	117 ASSERT_NE(results[0], results[1]);

	118 for (size_t i = 0; i < arraysize(results); ++i) {

	119 ASSERT_LE(0.0, results[i]);

	120 ASSERT_GT(1.0, results[i]);

	121 }

	122

	123 ASSERT_EQ(GenerateSHA1Entropy("yo", "1"),

	124 GenerateSHA1Entropy("yo", "1"));

	125 ASSERT_NE(GenerateSHA1Entropy("yo", "something"),

	126 GenerateSHA1Entropy("yo", "else"));

	127 }

	128

	129 TEST_F(EntropyProviderTest, PermutedEntropy) {

	130 const size_t kMaxLowEntropySize = (1 << 13);

	131 const double results[] = {

	132 GeneratePermutedEntropy(1234, kMaxLowEntropySize, "1"),

	133 GeneratePermutedEntropy(4321, kMaxLowEntropySize, "1"),

	134 };

	135 ASSERT_NE(results[0], results[1]);

	136 for (size_t i = 0; i < arraysize(results); ++i) {

	137 ASSERT_LE(0.0, results[i]);

	138 ASSERT_GT(1.0, results[i]);

	139 }

	140

	141 ASSERT_EQ(GeneratePermutedEntropy(1234, kMaxLowEntropySize, "1"),

	142 GeneratePermutedEntropy(1234, kMaxLowEntropySize, "1"));

	143 ASSERT_NE(GeneratePermutedEntropy(1234, kMaxLowEntropySize, "something"),

	144 GeneratePermutedEntropy(1234, kMaxLowEntropySize, "else"));

	145 }

	146

	147 TEST_F(EntropyProviderTest, SHA1EntropyIsUniform) {

	148 const size_t kMaxLowEntropySize = (1 << 13);

	149 const int kBucketCount = 20;

	150 const int kIterationCount = 100000;
	Ilya Sherman 2012/08/21 03:42:27 nit: size_t throughout? nit: size_t throughout? Alexei Svitkine (slow) 2012/08/21 14:25:49 Done. Show quoted text On 2012/08/21 03:42:27, Ilya Sherman wrote: > nit: size_t throughout? Done.
	151

	152 const std::string trial_names[] = {

	153 "TestTrial",

	154 "AnotherTestTrial",

	155 "NewTabButton",

	156 };

	157

	158 for (size_t i = 0; i < arraysize(trial_names); ++i) {

	159 std::vector<int> distribution(kBucketCount);

	160

	161 for (int j = 0; j < kIterationCount; ++j) {

	162 // Use a random GUID + 13 bits of entropy to match how the

	163 // SHA1EntropyProvider is used in metrics_service.cc.

	164 const int low_entropy_source =

	165 static_cast<uint16>(base::RandInt(0, kMaxLowEntropySize - 1));

	166 const std::string high_entropy_source =

	167 base::GenerateGUID() + base::IntToString(low_entropy_source);

	168 const double entropy_value =

	169 GenerateSHA1Entropy(high_entropy_source, trial_names[i]);

	170 const int bucket = static_cast<int>(kBucketCount * entropy_value);

	171 ASSERT_LT(bucket, kBucketCount);

	172 distribution[bucket] += 1;

	173 }

	174

	175 int min_value = std::numeric_limits<int>::max();

	176 int max_value = std::numeric_limits<int>::min();

	177 for (int j = 0; j < kBucketCount; ++j) {

	178 min_value = std::min(min_value, distribution[j]);

	179 max_value = std::max(max_value, distribution[j]);

	180 }

	181

	182 // TODO(asvitkine): Figure out pass / fail criteria.

	183 printf("min = %d, max = %d, stddev = %f\n", min_value, max_value,

	184 ComputeStandardDeviation(distribution));

	185 }

	186 }

	187

	188 TEST_F(EntropyProviderTest, PermutedEntropyIsUniform) {

	189 const size_t kMaxLowEntropySize = (1 << 13);

	190 const int kBucketCount = 20;

	191 const int kIterationCount = 100000;

	192

	193 const std::string trial_names[] = {

	194 "TestTrial",

	195 "AnotherTestTrial",

	196 "NewTabButton",

	197 };

	198

	199 for (size_t i = 0; i < arraysize(trial_names); ++i) {

	200 std::vector<int> distribution(kBucketCount);

	201

	202 // Note: Given a trial name, the computed mapping will be the same.

	203 // As a performance optimization, pre-compute the mapping once per trial

	204 // name and index into it each iteration.
	Ilya Sherman 2012/08/21 03:42:27 Is this performance optimization needed for the te Is this performance optimization needed for the test to run smoothly? If not, I'd prefer to test the actual code, so that there's no chance for a bug to get fixed there and not in the test. If the optimization is necessary, though, well, so be it. Alexei Svitkine (slow) 2012/08/21 14:25:49 Unfortunately, yes it is needed. Without it, this Show quoted text On 2012/08/21 03:42:27, Ilya Sherman wrote: > Is this performance optimization needed for the test to run smoothly? If not, > I'd prefer to test the actual code, so that there's no chance for a bug to get > fixed there and not in the test. If the optimization is necessary, though, > well, so be it. Unfortunately, yes it is needed. Without it, this test case takes 15s to run on my machine. With the optimization, it runs in 0.5s. (Note: That 15s on my beefy machine may very well mean a timeout on our much slower bots.)
	205 std::vector<uint16> mapping(kMaxLowEntropySize);

	206 metrics_internal::PermuteMappingUsingTrialName(trial_names[i], &mapping);

	207

	208 for (int j = 0; j < kIterationCount; ++j) {

	209 const int low_entropy_source =

	210 static_cast<uint16>(base::RandInt(0, kMaxLowEntropySize - 1));

	211 const double entropy_value =

	212 mapping[low_entropy_source] / static_cast<double>(kMaxLowEntropySize);

	213 const int bucket = static_cast<int>(kBucketCount * entropy_value);

	214 ASSERT_LT(bucket, kBucketCount);

	215 distribution[bucket] += 1;

	216 }

	217

	218 int min_value = std::numeric_limits<int>::max();

	219 int max_value = std::numeric_limits<int>::min();

	220 for (int j = 0; j < kBucketCount; ++j) {

	221 min_value = std::min(min_value, distribution[j]);

	222 max_value = std::max(max_value, distribution[j]);

	223 }

	224

	225 // TODO(asvitkine): Figure out pass / fail criteria.

	226 printf("min = %d, max = %d, stddev = %f\n", min_value, max_value,

	227 ComputeStandardDeviation(distribution));

	228 }

	229 }

	230

	231 TEST_F(EntropyProviderTest, SeededRandGeneratorIsUniform) {

	232 // Verifies that SeededRandGenerator has a uniform distribution.

	233 //

	234 // Mirrors RandUtilTest.RandGeneratorIsUniform in base/rand_util_unittest.cc.

	235

	236 const uint32 kTopOfRange = (std::numeric_limits<uint32>::max() / 4ULL) * 3ULL;

	237 const uint32 kExpectedAverage = kTopOfRange / 2ULL;

	238 const uint32 kAllowedVariance = kExpectedAverage / 50ULL; // +/- 2%

	239 const int kMinAttempts = 1000;

	240 const int kMaxAttempts = 1000000;

	241

	242 const std::string trial_names[] = {

	243 "TestTrial",

	244 "AnotherTestTrial",

	245 "NewTabButton",

	246 };

	247

	248 for (size_t i = 0; i < arraysize(trial_names); ++i) {

	249 const uint32 seed = metrics_internal::HashName(trial_names[i]);

	250 metrics_internal::SeededRandGenerator rand_generator(seed);

	251

	252 double cumulative_average = 0.0;

	253 int count = 0;

	254 while (count < kMaxAttempts) {

	255 uint32 value = rand_generator(kTopOfRange);

	256 cumulative_average = (count * cumulative_average + value) / (count + 1);

	257

	258 // Don't quit too quickly for things to start converging, or we may have

	259 // a false positive.

	260 if (count > kMinAttempts &&

	261 kExpectedAverage - kAllowedVariance < cumulative_average &&

	262 cumulative_average < kExpectedAverage + kAllowedVariance) {

	263 break;

	264 }

	265

	266 ++count;

	267 }

	268

	269 ASSERT_LT(count, kMaxAttempts) << "Expected average was " <<

	270 kExpectedAverage << ", average ended at " << cumulative_average <<

	271 ", for trial " << trial_names[i];

	272 }

	273 }

	274

OLD	NEW