Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(419)

Side by Side Diff: chrome/common/metrics/entropy_provider_unittest.cc

Issue 10830318: Use a different algorithm with the low entropy source for field trials. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/
Patch Set: Created 8 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include <cmath>
6 #include <limits>
7 #include <numeric>
8
9 #include "base/basictypes.h"
10 #include "base/guid.h"
11 #include "base/memory/scoped_ptr.h"
12 #include "base/rand_util.h"
13 #include "base/string_number_conversions.h"
14 #include "chrome/common/metrics/entropy_provider.h"
15 #include "testing/gtest/include/gtest/gtest.h"
16
17 namespace metrics {
18
19 namespace {
20
21 // Computes the Chi-Square statistic for |values| assuming they follow a uniform
22 // distribution, where each entry has expected value |expected_value|.
23 //
24 // The Chi-Square statistic is defined as Sum((O-E)^2/E) where O is the observed
25 // value and E is the expected value.
26 double ComputeChiSquare(const std::vector<int>& values,
27 double expected_value) {
28 double sum = 0;
29 for (size_t i = 0; i < values.size(); ++i) {
30 const double delta = values[i] - expected_value;
31 sum += (delta * delta) / expected_value;
32 }
33 return sum;
34 }
35
36 } // namespace
37
38
39 class EntropyProviderTest : public testing::Test {
40 public:
41 // Computes SHA1-based entropy for the given |trial_name| based on
42 // |entropy_source|
43 double GenerateSHA1Entropy(const std::string& entropy_source,
44 const std::string& trial_name) {
45 SHA1EntropyProvider sha1_provider(entropy_source);
46 return sha1_provider.GetEntropyForTrial(trial_name);
47 }
48
49 // Generates permutation-based entropy for the given |trial_name| based on
50 // |entropy_source| which must be in the range [0, entropy_max).
51 double GeneratePermutedEntropy(uint16 entropy_source,
52 size_t entropy_max,
53 const std::string& trial_name) {
54 PermutedEntropyProvider permuted_provider(entropy_source, entropy_max);
55 return permuted_provider.GetEntropyForTrial(trial_name);
56 }
57 };
58
59
60 TEST_F(EntropyProviderTest, UseOneTimeRandomizationSHA1) {
61 // Simply asserts that two trials using one-time randomization
62 // that have different names, normally generate different results.
63 //
64 // Note that depending on the one-time random initialization, they
65 // _might_ actually give the same result, but we know that given
66 // the particular client_id we use for unit tests they won't.
67 base::FieldTrialList field_trial_list(new SHA1EntropyProvider("client_id"));
68 scoped_refptr<base::FieldTrial> trials[] = {
69 base::FieldTrialList::FactoryGetFieldTrial("one", 100, "default",
70 base::FieldTrialList::kExpirationYearInFuture, 1, 1, NULL),
71 base::FieldTrialList::FactoryGetFieldTrial("two", 100, "default",
72 base::FieldTrialList::kExpirationYearInFuture, 1, 1, NULL),
73 };
74
75 for (size_t i = 0; i < arraysize(trials); ++i) {
76 trials[i]->UseOneTimeRandomization();
77
78 for (int j = 0; j < 100; ++j)
79 trials[i]->AppendGroup("", 1);
80 }
81
82 // The trials are most likely to give different results since they have
83 // different names.
84 ASSERT_NE(trials[0]->group(), trials[1]->group());
85 ASSERT_NE(trials[0]->group_name(), trials[1]->group_name());
86 }
87
88 TEST_F(EntropyProviderTest, UseOneTimeRandomizationPermuted) {
89 // Simply asserts that two trials using one-time randomization
90 // that have different names, normally generate different results.
91 //
92 // Note that depending on the one-time random initialization, they
93 // _might_ actually give the same result, but we know that given
94 // the particular client_id we use for unit tests they won't.
95 const size_t kMaxLowEntropySize = (1 << 13);
96 base::FieldTrialList field_trial_list(
97 new PermutedEntropyProvider(1234, kMaxLowEntropySize));
98 scoped_refptr<base::FieldTrial> trials[] = {
99 base::FieldTrialList::FactoryGetFieldTrial("one", 100, "default",
100 base::FieldTrialList::kExpirationYearInFuture, 1, 1, NULL),
101 base::FieldTrialList::FactoryGetFieldTrial("two", 100, "default",
102 base::FieldTrialList::kExpirationYearInFuture, 1, 1, NULL),
103 };
104
105 for (size_t i = 0; i < arraysize(trials); ++i) {
106 trials[i]->UseOneTimeRandomization();
107
108 for (int j = 0; j < 100; ++j)
109 trials[i]->AppendGroup("", 1);
110 }
111
112 // The trials are most likely to give different results since they have
113 // different names.
114 ASSERT_NE(trials[0]->group(), trials[1]->group());
115 ASSERT_NE(trials[0]->group_name(), trials[1]->group_name());
116 }
117
118 TEST_F(EntropyProviderTest, SHA1Entropy) {
119 const double results[] = {
120 GenerateSHA1Entropy("hi", "1"),
121 GenerateSHA1Entropy("there", "1"),
122 };
123 ASSERT_NE(results[0], results[1]);
124 for (size_t i = 0; i < arraysize(results); ++i) {
125 ASSERT_LE(0.0, results[i]);
126 ASSERT_GT(1.0, results[i]);
127 }
128
129 ASSERT_EQ(GenerateSHA1Entropy("yo", "1"),
130 GenerateSHA1Entropy("yo", "1"));
131 ASSERT_NE(GenerateSHA1Entropy("yo", "something"),
132 GenerateSHA1Entropy("yo", "else"));
133 }
134
135 TEST_F(EntropyProviderTest, PermutedEntropy) {
136 const size_t kMaxLowEntropySize = (1 << 13);
137 const double results[] = {
138 GeneratePermutedEntropy(1234, kMaxLowEntropySize, "1"),
139 GeneratePermutedEntropy(4321, kMaxLowEntropySize, "1"),
140 };
141 ASSERT_NE(results[0], results[1]);
142 for (size_t i = 0; i < arraysize(results); ++i) {
143 ASSERT_LE(0.0, results[i]);
144 ASSERT_GT(1.0, results[i]);
145 }
146
147 ASSERT_EQ(GeneratePermutedEntropy(1234, kMaxLowEntropySize, "1"),
148 GeneratePermutedEntropy(1234, kMaxLowEntropySize, "1"));
149 ASSERT_NE(GeneratePermutedEntropy(1234, kMaxLowEntropySize, "something"),
150 GeneratePermutedEntropy(1234, kMaxLowEntropySize, "else"));
151 }
152
153 TEST_F(EntropyProviderTest, SHA1EntropyIsUniform) {
154 // Size of the low entropy source to append to the high entropy source for
155 // input to the SHA1 entropy provider.
156 const size_t kMaxLowEntropySize = (1 << 13);
157 // Number of buckets in the simulated field trials.
158 const size_t kBucketCount = 20;
159 // Max number of iterations to perform before giving up and failing.
160 const size_t kMaxIterationCount = 100000;
161 // The number of iterations to perform before each time the statistical
162 // significance of the results is checked.
163 const size_t kCheckIterationCount = 10000;
Ilya Sherman 2012/08/22 04:07:42 Is it really important to check every 10000 iterat
Alexei Svitkine (slow) 2012/08/22 16:53:27 The test verifies whether the observed distributio
164 // This is the Chi-Square threshold from the Chi-Square statistic table for
165 // 19 degrees of freedom (based on |kBucketCount|) with a 99.9% confidence
166 // level. See: http://www.medcalc.org/manual/chi-square-table.php
167 const double kChiSquareThreshold = 43.82;
168
169 const std::string trial_names[] = {
170 "TestTrial",
171 "AnotherTestTrial",
172 "NewTabButton",
173 };
174
175 for (size_t i = 0; i < arraysize(trial_names); ++i) {
176 std::vector<int> distribution(kBucketCount);
177
178 for (size_t j = 0; j < kMaxIterationCount; ++j) {
Ilya Sherman 2012/08/22 04:07:42 nit: Perhaps this loop should start from j = 1, so
Alexei Svitkine (slow) 2012/08/22 16:53:27 Done.
179 // Use a random GUID + 13 additional bits of entropy to match how the
180 // SHA1EntropyProvider is used in metrics_service.cc.
181 const int low_entropy_source =
182 static_cast<uint16>(base::RandInt(0, kMaxLowEntropySize - 1));
183 const std::string high_entropy_source =
184 base::GenerateGUID() + base::IntToString(low_entropy_source);
185 const double entropy_value =
186 GenerateSHA1Entropy(high_entropy_source, trial_names[i]);
187 const size_t bucket = static_cast<size_t>(kBucketCount * entropy_value);
188 ASSERT_LT(bucket, kBucketCount);
189 distribution[bucket] += 1;
190
191 // After |kCheckIterationCount| iterations, compute the Chi-Square
192 // statistic of the distribution. If the resulting statistic is greater
193 // than |kChiSquareThreshold|, we can conclude with 99.9% confidence
194 // that the observed samples do not follow a uniform distribution.
195 //
196 // However, since 99.9% would still result in a false negative every
197 // 1000 runs of the test, do not treat it as a failure (else the test
198 // will be flaky). Instead, perform additional iterations to determine
199 // if the distribution will converge, up to |kMaxIterationCount|.
Ilya Sherman 2012/08/22 04:07:42 Perhaps instead of running the test multiple times
Alexei Svitkine (slow) 2012/08/22 16:53:27 Using a threshold that has a higher confidence doe
200 if (((j + 1) % kCheckIterationCount) == 0) {
201 const double expected_value_per_bucket =
202 static_cast<double>(j + 1) / kBucketCount;
203 const double chi_square =
204 ComputeChiSquare(distribution, expected_value_per_bucket);
205 if (chi_square < kChiSquareThreshold)
206 break;
207
208 // If |j == kMaxIterationCount - 1|, the Chi-Square statistic did not
209 // converge after |kMaxIterationCount|.
210 ASSERT_NE(j, kMaxIterationCount - 1) << "Failed for trial " <<
211 trial_names[i] << " with chi_square = " << chi_square <<
212 " after " << kMaxIterationCount << " iterations.";
Ilya Sherman 2012/08/22 04:07:42 This test should choose a random seed for the rand
Alexei Svitkine (slow) 2012/08/22 16:53:27 The random number generator used for the original
213 }
214 }
215 }
216 }
217
218 TEST_F(EntropyProviderTest, PermutedEntropyIsUniform) {
219 // Size of the low entropy source to use for the permuted entropy provider.
220 const size_t kMaxLowEntropySize = (1 << 13);
221 // Number of buckets in the simulated field trials.
222 const size_t kBucketCount = 20;
223 // Max number of iterations to perform before giving up and failing.
224 const size_t kMaxIterationCount = 100000;
225 // The number of iterations to perform before each time the statistical
226 // significance of the results is checked.
227 const size_t kCheckIterationCount = 10000;
228 // This is the Chi-Square threshold from the Chi-Square statistic table for
229 // 19 degrees of freedom (based on |kBucketCount|) with a 99.9% confidence
230 // level. See: http://www.medcalc.org/manual/chi-square-table.php
231 const double kChiSquareThreshold = 43.82;
Ilya Sherman 2012/08/22 04:07:42 nit: Almost all of these constants are repeated fr
Alexei Svitkine (slow) 2012/08/22 16:53:27 Refactored the two tests to share a majority of th
232
233 const std::string trial_names[] = {
234 "TestTrial",
235 "AnotherTestTrial",
236 "NewTabButton",
237 };
238
239 for (size_t i = 0; i < arraysize(trial_names); ++i) {
240 std::vector<int> distribution(kBucketCount);
241
242 // Note: Given a trial name, the computed mapping will be the same.
243 // As a performance optimization, pre-compute the mapping once per trial
244 // name and index into it each iteration.
245 std::vector<uint16> mapping(kMaxLowEntropySize);
246 internal::PermuteMappingUsingTrialName(trial_names[i], &mapping);
247
248 for (size_t j = 0; j < kMaxIterationCount; ++j) {
249 const int low_entropy_source =
250 static_cast<uint16>(base::RandInt(0, kMaxLowEntropySize - 1));
251 const double entropy_value =
252 mapping[low_entropy_source] / static_cast<double>(kMaxLowEntropySize);
253 const size_t bucket = static_cast<size_t>(kBucketCount * entropy_value);
254 ASSERT_LT(bucket, kBucketCount);
255 distribution[bucket] += 1;
256
257 // After |kCheckIterationCount| iterations, compute the Chi-Square
258 // statistic of the distribution. If the resulting statistic is greater
259 // than |kChiSquareThreshold|, we can conclude with 99.9% confidence
260 // that the observed samples do not follow a uniform distribution.
261 //
262 // However, since 99.9% would still result in a false negative every
263 // 1000 runs of the test, do not treat it as a failure (else the test
264 // will be flaky). Instead, perform additional iterations to determine
265 // if the distribution will converge, up to |kMaxIterationCount|.
266 if (((j + 1) % kCheckIterationCount) == 0) {
267 const double expected_value_per_bucket =
268 static_cast<double>(j + 1) / kBucketCount;
269 const double chi_square =
270 ComputeChiSquare(distribution, expected_value_per_bucket);
271 if (chi_square < kChiSquareThreshold)
272 break;
273
274 // If |j == kMaxIterationCount - 1|, the Chi-Square statistic did not
275 // converge after |kMaxIterationCount|.
276 ASSERT_NE(j, kMaxIterationCount - 1) << "Failed for trial " <<
277 trial_names[i] << " with chi_square = " << chi_square <<
278 " after " << kMaxIterationCount << " iterations.";
279 }
280 }
281 }
Ilya Sherman 2012/08/22 04:07:42 nit: There is a lot of repeated code with the abov
Alexei Svitkine (slow) 2012/08/22 16:53:27 Done.
282 }
283
284 TEST_F(EntropyProviderTest, SeededRandGeneratorIsUniform) {
285 // Verifies that SeededRandGenerator has a uniform distribution.
286 //
287 // Mirrors RandUtilTest.RandGeneratorIsUniform in base/rand_util_unittest.cc.
288
289 const uint32 kTopOfRange = (std::numeric_limits<uint32>::max() / 4ULL) * 3ULL;
290 const uint32 kExpectedAverage = kTopOfRange / 2ULL;
291 const uint32 kAllowedVariance = kExpectedAverage / 50ULL; // +/- 2%
292 const int kMinAttempts = 1000;
293 const int kMaxAttempts = 1000000;
294
295 const std::string trial_names[] = {
296 "TestTrial",
297 "AnotherTestTrial",
298 "NewTabButton",
299 };
300
301 for (size_t i = 0; i < arraysize(trial_names); ++i) {
302 const uint32 seed = internal::HashName(trial_names[i]);
303 internal::SeededRandGenerator rand_generator(seed);
304
305 double cumulative_average = 0.0;
306 int count = 0;
307 while (count < kMaxAttempts) {
308 uint32 value = rand_generator(kTopOfRange);
309 cumulative_average = (count * cumulative_average + value) / (count + 1);
310
311 // Don't quit too quickly for things to start converging, or we may have
312 // a false positive.
313 if (count > kMinAttempts &&
314 kExpectedAverage - kAllowedVariance < cumulative_average &&
315 cumulative_average < kExpectedAverage + kAllowedVariance) {
316 break;
317 }
318
319 ++count;
320 }
321
322 ASSERT_LT(count, kMaxAttempts) << "Expected average was " <<
323 kExpectedAverage << ", average ended at " << cumulative_average <<
324 ", for trial " << trial_names[i];
325 }
326 }
327
328 } // namespace metrics
OLDNEW
« no previous file with comments | « chrome/common/metrics/entropy_provider.cc ('k') | chrome/common/metrics/variations/variations_util_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698