OLD | NEW |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/browser/safe_browsing/prefix_set.h" | 5 #include "chrome/browser/safe_browsing/prefix_set.h" |
6 | 6 |
7 #include <algorithm> | 7 #include <algorithm> |
8 #include <math.h> | 8 #include <math.h> |
9 | 9 |
10 #include "base/file_util.h" | 10 #include "base/file_util.h" |
(...skipping 21 matching lines...) Expand all Loading... |
32 // For |std::upper_bound()| to find a prefix w/in a vector of pairs. | 32 // For |std::upper_bound()| to find a prefix w/in a vector of pairs. |
33 bool PrefixLess(const std::pair<SBPrefix,size_t>& a, | 33 bool PrefixLess(const std::pair<SBPrefix,size_t>& a, |
34 const std::pair<SBPrefix,size_t>& b) { | 34 const std::pair<SBPrefix,size_t>& b) { |
35 return a.first < b.first; | 35 return a.first < b.first; |
36 } | 36 } |
37 | 37 |
38 } // namespace | 38 } // namespace |
39 | 39 |
40 namespace safe_browsing { | 40 namespace safe_browsing { |
41 | 41 |
42 PrefixSet::PrefixSet(const std::vector<SBPrefix>& sorted_prefixes) | 42 PrefixSet::PrefixSet(const std::vector<SBPrefix>& sorted_prefixes) { |
43 : checksum_(0) { | |
44 if (sorted_prefixes.size()) { | 43 if (sorted_prefixes.size()) { |
45 // Estimate the resulting vector sizes. There will be strictly | 44 // Estimate the resulting vector sizes. There will be strictly |
46 // more than |min_runs| entries in |index_|, but there generally | 45 // more than |min_runs| entries in |index_|, but there generally |
47 // aren't many forced breaks. | 46 // aren't many forced breaks. |
48 const size_t min_runs = sorted_prefixes.size() / kMaxRun; | 47 const size_t min_runs = sorted_prefixes.size() / kMaxRun; |
49 index_.reserve(min_runs); | 48 index_.reserve(min_runs); |
50 deltas_.reserve(sorted_prefixes.size() - min_runs); | 49 deltas_.reserve(sorted_prefixes.size() - min_runs); |
51 | 50 |
52 // Lead with the first prefix. | 51 // Lead with the first prefix. |
53 SBPrefix prev_prefix = sorted_prefixes[0]; | 52 SBPrefix prev_prefix = sorted_prefixes[0]; |
54 size_t run_length = 0; | 53 size_t run_length = 0; |
55 index_.push_back(std::make_pair(prev_prefix, deltas_.size())); | 54 index_.push_back(std::make_pair(prev_prefix, deltas_.size())); |
56 | 55 |
57 // Used to build a checksum from the data used to construct the | |
58 // structures. Since the data is a bunch of uniform hashes, it | |
59 // seems reasonable to just xor most of it in, rather than trying | |
60 // to use a more complicated algorithm. | |
61 uint32 checksum = static_cast<uint32>(sorted_prefixes[0]); | |
62 checksum ^= static_cast<uint32>(deltas_.size()); | |
63 | |
64 for (size_t i = 1; i < sorted_prefixes.size(); ++i) { | 56 for (size_t i = 1; i < sorted_prefixes.size(); ++i) { |
65 // Skip duplicates. | 57 // Skip duplicates. |
66 if (sorted_prefixes[i] == prev_prefix) | 58 if (sorted_prefixes[i] == prev_prefix) |
67 continue; | 59 continue; |
68 | 60 |
69 // Calculate the delta. |unsigned| is mandatory, because the | 61 // Calculate the delta. |unsigned| is mandatory, because the |
70 // sorted_prefixes could be more than INT_MAX apart. | 62 // sorted_prefixes could be more than INT_MAX apart. |
71 DCHECK_GT(sorted_prefixes[i], prev_prefix); | 63 DCHECK_GT(sorted_prefixes[i], prev_prefix); |
72 const unsigned delta = sorted_prefixes[i] - prev_prefix; | 64 const unsigned delta = sorted_prefixes[i] - prev_prefix; |
73 const uint16 delta16 = static_cast<uint16>(delta); | 65 const uint16 delta16 = static_cast<uint16>(delta); |
74 | 66 |
75 // New index ref if the delta doesn't fit, or if too many | 67 // New index ref if the delta doesn't fit, or if too many |
76 // consecutive deltas have been encoded. | 68 // consecutive deltas have been encoded. |
77 if (delta != static_cast<unsigned>(delta16) || run_length >= kMaxRun) { | 69 if (delta != static_cast<unsigned>(delta16) || run_length >= kMaxRun) { |
78 checksum ^= static_cast<uint32>(sorted_prefixes[i]); | |
79 checksum ^= static_cast<uint32>(deltas_.size()); | |
80 index_.push_back(std::make_pair(sorted_prefixes[i], deltas_.size())); | 70 index_.push_back(std::make_pair(sorted_prefixes[i], deltas_.size())); |
81 run_length = 0; | 71 run_length = 0; |
82 } else { | 72 } else { |
83 checksum ^= static_cast<uint32>(delta16); | |
84 // Continue the run of deltas. | 73 // Continue the run of deltas. |
85 deltas_.push_back(delta16); | 74 deltas_.push_back(delta16); |
86 DCHECK_EQ(static_cast<unsigned>(deltas_.back()), delta); | 75 DCHECK_EQ(static_cast<unsigned>(deltas_.back()), delta); |
87 ++run_length; | 76 ++run_length; |
88 } | 77 } |
89 | 78 |
90 prev_prefix = sorted_prefixes[i]; | 79 prev_prefix = sorted_prefixes[i]; |
91 } | 80 } |
92 checksum_ = checksum; | |
93 DCHECK(CheckChecksum()); | |
94 | 81 |
95 // Send up some memory-usage stats. Bits because fractional bytes | 82 // Send up some memory-usage stats. Bits because fractional bytes |
96 // are weird. | 83 // are weird. |
97 const size_t bits_used = index_.size() * sizeof(index_[0]) * CHAR_BIT + | 84 const size_t bits_used = index_.size() * sizeof(index_[0]) * CHAR_BIT + |
98 deltas_.size() * sizeof(deltas_[0]) * CHAR_BIT; | 85 deltas_.size() * sizeof(deltas_[0]) * CHAR_BIT; |
99 const size_t unique_prefixes = index_.size() + deltas_.size(); | 86 const size_t unique_prefixes = index_.size() + deltas_.size(); |
100 static const size_t kMaxBitsPerPrefix = sizeof(SBPrefix) * CHAR_BIT; | 87 static const size_t kMaxBitsPerPrefix = sizeof(SBPrefix) * CHAR_BIT; |
101 UMA_HISTOGRAM_ENUMERATION("SB2.PrefixSetBitsPerPrefix", | 88 UMA_HISTOGRAM_ENUMERATION("SB2.PrefixSetBitsPerPrefix", |
102 bits_used / unique_prefixes, | 89 bits_used / unique_prefixes, |
103 kMaxBitsPerPrefix); | 90 kMaxBitsPerPrefix); |
104 } | 91 } |
105 } | 92 } |
106 | 93 |
107 PrefixSet::PrefixSet(std::vector<std::pair<SBPrefix,size_t> > *index, | 94 PrefixSet::PrefixSet(std::vector<std::pair<SBPrefix,size_t> > *index, |
108 std::vector<uint16> *deltas) | 95 std::vector<uint16> *deltas) { |
109 : checksum_(0) { | |
110 DCHECK(index && deltas); | 96 DCHECK(index && deltas); |
111 index_.swap(*index); | 97 index_.swap(*index); |
112 deltas_.swap(*deltas); | 98 deltas_.swap(*deltas); |
113 } | 99 } |
114 | 100 |
115 PrefixSet::~PrefixSet() {} | 101 PrefixSet::~PrefixSet() {} |
116 | 102 |
117 bool PrefixSet::Exists(SBPrefix prefix) const { | 103 bool PrefixSet::Exists(SBPrefix prefix) const { |
118 if (index_.empty()) | 104 if (index_.empty()) |
119 return false; | 105 return false; |
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
158 | 144 |
159 SBPrefix current = index_[ii].first; | 145 SBPrefix current = index_[ii].first; |
160 prefixes->push_back(current); | 146 prefixes->push_back(current); |
161 for (size_t di = index_[ii].second; di < deltas_end; ++di) { | 147 for (size_t di = index_[ii].second; di < deltas_end; ++di) { |
162 current += deltas_[di]; | 148 current += deltas_[di]; |
163 prefixes->push_back(current); | 149 prefixes->push_back(current); |
164 } | 150 } |
165 } | 151 } |
166 } | 152 } |
167 | 153 |
168 // NOTE(shess): For debugging potential memory corruption. I wanted | |
169 // to test that the output of GetPrefixes() matched the checksum over | |
170 // the unique elements in the input to the constructor, but the buffer | |
171 // for GetPrefixes() to write to could itself be corrupted. That | |
172 // would make it look like GetPrefixes() itself was broken. At that | |
173 // point my head exploded, so I wrote this. | |
174 SBPrefix PrefixSet::GetPrefixesChecksum() const { | |
175 SBPrefix checksum = 0; | |
176 | |
177 for (size_t ii = 0; ii < index_.size(); ++ii) { | |
178 // The deltas for this |index_| entry run to the next index entry, | |
179 // or the end of the deltas. | |
180 const size_t deltas_end = | |
181 (ii + 1 < index_.size()) ? index_[ii + 1].second : deltas_.size(); | |
182 | |
183 SBPrefix current = index_[ii].first; | |
184 checksum ^= current; | |
185 for (size_t di = index_[ii].second; di < deltas_end; ++di) { | |
186 current += deltas_[di]; | |
187 checksum ^= current; | |
188 } | |
189 } | |
190 | |
191 return checksum; | |
192 } | |
193 | |
194 // static | 154 // static |
195 PrefixSet* PrefixSet::LoadFile(const FilePath& filter_name) { | 155 PrefixSet* PrefixSet::LoadFile(const FilePath& filter_name) { |
196 int64 size_64; | 156 int64 size_64; |
197 if (!file_util::GetFileSize(filter_name, &size_64)) | 157 if (!file_util::GetFileSize(filter_name, &size_64)) |
198 return NULL; | 158 return NULL; |
199 using base::MD5Digest; | 159 using base::MD5Digest; |
200 if (size_64 < static_cast<int64>(sizeof(FileHeader) + sizeof(MD5Digest))) | 160 if (size_64 < static_cast<int64>(sizeof(FileHeader) + sizeof(MD5Digest))) |
201 return NULL; | 161 return NULL; |
202 | 162 |
203 file_util::ScopedFILE file(file_util::OpenFile(filter_name, "rb")); | 163 file_util::ScopedFILE file(file_util::OpenFile(filter_name, "rb")); |
(...skipping 115 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
319 written = fwrite(&digest, sizeof(digest), 1, file.get()); | 279 written = fwrite(&digest, sizeof(digest), 1, file.get()); |
320 if (written != 1) | 280 if (written != 1) |
321 return false; | 281 return false; |
322 | 282 |
323 // TODO(shess): Can this code check that the close was successful? | 283 // TODO(shess): Can this code check that the close was successful? |
324 file.reset(); | 284 file.reset(); |
325 | 285 |
326 return true; | 286 return true; |
327 } | 287 } |
328 | 288 |
329 size_t PrefixSet::IndexBinFor(size_t target_index) const { | |
330 // The items in |index_| have the logical index of each previous | |
331 // item in |index_| plus the count of deltas between the items. | |
332 // Since the indices into |deltas_| are absolute, the logical index | |
333 // is then the sum of the two indices. | |
334 size_t lo = 0; | |
335 size_t hi = index_.size(); | |
336 | |
337 // Binary search because linear search was too slow (really, the | |
338 // unit test sucked). Inline because the elements can't be compared | |
339 // in isolation (their position matters). | |
340 while (hi - lo > 1) { | |
341 const size_t i = (lo + hi) / 2; | |
342 | |
343 if (target_index < i + index_[i].second) { | |
344 DCHECK_LT(i, hi); // Always making progress. | |
345 hi = i; | |
346 } else { | |
347 DCHECK_GT(i, lo); // Always making progress. | |
348 lo = i; | |
349 } | |
350 } | |
351 return lo; | |
352 } | |
353 | |
354 size_t PrefixSet::GetSize() const { | |
355 return index_.size() + deltas_.size(); | |
356 } | |
357 | |
358 bool PrefixSet::IsDeltaAt(size_t target_index) const { | |
359 CHECK_LT(target_index, GetSize()); | |
360 | |
361 const size_t i = IndexBinFor(target_index); | |
362 return target_index > i + index_[i].second; | |
363 } | |
364 | |
365 uint16 PrefixSet::DeltaAt(size_t target_index) const { | |
366 CHECK_LT(target_index, GetSize()); | |
367 | |
368 // Find the |index_| entry which contains |target_index|. | |
369 const size_t i = IndexBinFor(target_index); | |
370 | |
371 // Exactly on the |index_| entry means no delta. | |
372 CHECK_GT(target_index, i + index_[i].second); | |
373 | |
374 // -i backs out the |index_| entries, -1 gets the delta that lead to | |
375 // the value at |target_index|. | |
376 CHECK_LT(target_index - i - 1, deltas_.size()); | |
377 return deltas_[target_index - i - 1]; | |
378 } | |
379 | |
380 bool PrefixSet::CheckChecksum() const { | |
381 uint32 checksum = 0; | |
382 | |
383 for (size_t ii = 0; ii < index_.size(); ++ii) { | |
384 checksum ^= static_cast<uint32>(index_[ii].first); | |
385 checksum ^= static_cast<uint32>(index_[ii].second); | |
386 } | |
387 | |
388 for (size_t di = 0; di < deltas_.size(); ++di) { | |
389 checksum ^= static_cast<uint32>(deltas_[di]); | |
390 } | |
391 | |
392 return checksum == checksum_; | |
393 } | |
394 | |
395 } // namespace safe_browsing | 289 } // namespace safe_browsing |
OLD | NEW |