OLD | NEW |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/browser/history/url_index_private_data.h" | 5 #include "chrome/browser/history/url_index_private_data.h" |
6 | 6 |
7 #include <algorithm> | 7 #include <algorithm> |
8 #include <functional> | 8 #include <functional> |
9 #include <iterator> | 9 #include <iterator> |
10 #include <limits> | 10 #include <limits> |
11 #include <numeric> | 11 #include <numeric> |
12 #include <vector> | 12 #include <vector> |
13 | 13 |
14 #include "base/basictypes.h" | 14 #include "base/basictypes.h" |
15 #include "base/file_util.h" | 15 #include "base/file_util.h" |
16 #include "base/i18n/case_conversion.h" | 16 #include "base/i18n/case_conversion.h" |
| 17 #include "base/metrics/histogram.h" |
17 #include "base/string_util.h" | 18 #include "base/string_util.h" |
| 19 #include "base/time.h" |
18 #include "base/utf_string_conversions.h" | 20 #include "base/utf_string_conversions.h" |
19 #include "chrome/browser/autocomplete/autocomplete_provider.h" | 21 #include "chrome/browser/autocomplete/autocomplete_provider.h" |
| 22 #include "chrome/browser/autocomplete/url_prefix.h" |
20 #include "chrome/browser/history/history_database.h" | 23 #include "chrome/browser/history/history_database.h" |
21 #include "chrome/browser/history/in_memory_url_cache_database.h" | 24 #include "chrome/browser/history/in_memory_url_index.h" |
22 #include "chrome/common/chrome_constants.h" | |
23 #include "chrome/common/url_constants.h" | |
24 #include "content/public/browser/browser_thread.h" | 25 #include "content/public/browser/browser_thread.h" |
25 #include "content/public/browser/notification_details.h" | 26 #include "content/public/browser/notification_details.h" |
26 #include "content/public/browser/notification_service.h" | 27 #include "content/public/browser/notification_service.h" |
27 #include "content/public/browser/notification_source.h" | 28 #include "content/public/browser/notification_source.h" |
28 #include "net/base/net_util.h" | 29 #include "net/base/net_util.h" |
| 30 #include "third_party/protobuf/src/google/protobuf/repeated_field.h" |
29 | 31 |
30 using content::BrowserThread; | 32 using google::protobuf::RepeatedField; |
| 33 using google::protobuf::RepeatedPtrField; |
| 34 using in_memory_url_index::InMemoryURLIndexCacheItem; |
31 | 35 |
32 namespace history { | 36 namespace history { |
33 | 37 |
34 // Comparison function for sorting search terms by descending length. | 38 typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem; |
35 bool LengthGreater(const string16& string_a, const string16& string_b) { | 39 typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry; |
36 return string_a.length() > string_b.length(); | 40 typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem; |
37 } | 41 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem; |
38 | 42 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry |
39 // Initializes a whitelist of URL schemes. | 43 CharWordMapEntry; |
40 void InitializeSchemeWhitelist(std::set<std::string>* whitelist) { | 44 typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem |
41 DCHECK(whitelist); | 45 WordIDHistoryMapItem; |
42 if (!whitelist->empty()) | 46 typedef imui:: |
43 return; // Nothing to do, already initialized. | 47 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry |
44 whitelist->insert(std::string(chrome::kAboutScheme)); | 48 WordIDHistoryMapEntry; |
45 whitelist->insert(std::string(chrome::kChromeUIScheme)); | 49 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem; |
46 whitelist->insert(std::string(chrome::kFileScheme)); | 50 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry |
47 whitelist->insert(std::string(chrome::kFtpScheme)); | 51 HistoryInfoMapEntry; |
48 whitelist->insert(std::string(chrome::kHttpScheme)); | 52 typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem WordStartsMapItem; |
49 whitelist->insert(std::string(chrome::kHttpsScheme)); | 53 typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem_WordStartsMapEntry |
50 whitelist->insert(std::string(chrome::kMailToScheme)); | 54 WordStartsMapEntry; |
51 } | |
52 | |
53 // CacheTransaction ------------------------------------------------------------ | |
54 | |
55 // Simple automatic helper class encapsulating calls to BeginTransaction/ | |
56 // CommitTransaction. | |
57 class CacheTransaction { | |
58 public: | |
59 explicit CacheTransaction(InMemoryURLCacheDatabase* db); | |
60 ~CacheTransaction(); | |
61 | |
62 private: | |
63 InMemoryURLCacheDatabase* db_; | |
64 | |
65 DISALLOW_COPY_AND_ASSIGN(CacheTransaction); | |
66 }; | |
67 | |
68 CacheTransaction::CacheTransaction(InMemoryURLCacheDatabase* db) | |
69 : db_(db) { | |
70 if (db_) | |
71 db_->BeginTransaction(); | |
72 } | |
73 | |
74 CacheTransaction::~CacheTransaction() { | |
75 if (db_) | |
76 db_->CommitTransaction(); | |
77 } | |
78 | |
79 // URLIndexPrivateData Public (but only to InMemoryURLIndex) Functions --------- | |
80 | |
81 URLIndexPrivateData::URLIndexPrivateData(const FilePath& history_dir, | |
82 const std::string& languages) | |
83 : history_dir_(history_dir), | |
84 languages_(languages), | |
85 cache_enabled_(true), | |
86 shutdown_(false), | |
87 lock_(), | |
88 pre_filter_item_count_(0), | |
89 post_filter_item_count_(0), | |
90 post_scoring_item_count_(0) { | |
91 InitializeSchemeWhitelist(&scheme_whitelist_); | |
92 } | |
93 | |
94 bool URLIndexPrivateData::Init( | |
95 base::SequencedWorkerPool::SequenceToken sequence_token) { | |
96 // Since init occurs on the DB thread, it is possible that the profile was | |
97 // told to shutdown before this task had a chance to kick off. | |
98 base::AutoLock lock(lock_); | |
99 if (shutdown_) | |
100 return false; | |
101 cache_db_ = new InMemoryURLCacheDatabase(); | |
102 FilePath dir_path; | |
103 set_cache_enabled(GetCacheDBPath(&dir_path) && | |
104 cache_db_->Init(dir_path, sequence_token)); | |
105 return cache_enabled(); | |
106 } | |
107 | |
108 void URLIndexPrivateData::Reset() { | |
109 Clear(); | |
110 if (cache_enabled()) | |
111 cache_db_->Reset(); | |
112 } | |
113 | |
114 bool URLIndexPrivateData::Empty() const { | |
115 return history_info_map_.empty(); | |
116 } | |
117 | |
118 URLIndexPrivateData* URLIndexPrivateData::Snapshot() const { | |
119 return new URLIndexPrivateData(*this); | |
120 } | |
121 | |
122 void URLIndexPrivateData::Shutdown() { | |
123 base::AutoLock lock(lock_); | |
124 shutdown_ = true; | |
125 if (cache_enabled()) | |
126 cache_db_->Shutdown(); | |
127 } | |
128 | |
129 // Helper predicates for ValidateConsistency. | |
130 template <typename Pair> | |
131 struct SelectFirst : std::unary_function<Pair, typename Pair::first_type> { | |
132 const typename Pair::first_type& operator()(const Pair& x) const { | |
133 return x.first; | |
134 } | |
135 }; | |
136 | |
137 template <typename Pair> | |
138 struct SelectSecond : std::unary_function<Pair, typename Pair::second_type> { | |
139 const typename Pair::second_type& operator()(const Pair& x) const { | |
140 return x.second; | |
141 } | |
142 }; | |
143 | |
144 template <typename SourcePair, typename Target> | |
145 struct TargetInserter : std::unary_function<SourcePair, void> { | |
146 explicit TargetInserter(Target* target) : target_(target) {} | |
147 | |
148 void operator()(const SourcePair& source) { | |
149 target_->insert(source.second.begin(), source.second.end()); | |
150 } | |
151 | |
152 Target* target_; | |
153 }; | |
154 | |
155 bool URLIndexPrivateData::ValidateConsistency() const { | |
156 // Scope things so that a large data set doesn't unnecessarily accumulate and | |
157 // hang around. | |
158 { | |
159 // Make a set of WordIDs from word_map_. | |
160 WordIDSet word_id_set_a; | |
161 std::transform(word_map_.begin(), word_map_.end(), | |
162 std::inserter(word_id_set_a, word_id_set_a.begin()), | |
163 SelectSecond<WordMap::value_type>()); | |
164 | |
165 // Compare word_map_'s word set to the words from word_id_history_map_. | |
166 { | |
167 WordIDSet word_id_set_b; | |
168 std::transform(word_id_history_map_.begin(), word_id_history_map_.end(), | |
169 std::inserter(word_id_set_b, word_id_set_b.begin()), | |
170 SelectFirst<WordIDHistoryMap::value_type>()); | |
171 WordIDSet leftovers; | |
172 std::set_symmetric_difference( | |
173 word_id_set_a.begin(), word_id_set_a.end(), | |
174 word_id_set_b.begin(), word_id_set_b.end(), | |
175 std::inserter(leftovers, leftovers.begin())); | |
176 if (!leftovers.empty()) | |
177 return false; | |
178 } | |
179 | |
180 // Compare word_map_'s word set to the words from history_id_word_map_. | |
181 { | |
182 WordIDSet word_id_set_b; | |
183 std::for_each(history_id_word_map_.begin(), history_id_word_map_.end(), | |
184 TargetInserter<HistoryIDWordMap::value_type, | |
185 WordIDSet>(&word_id_set_b)); | |
186 WordIDSet leftovers; | |
187 std::set_symmetric_difference( | |
188 word_id_set_a.begin(), word_id_set_a.end(), | |
189 word_id_set_b.begin(), word_id_set_b.end(), | |
190 std::inserter(leftovers, leftovers.begin())); | |
191 if (!leftovers.empty()) | |
192 return false; | |
193 } | |
194 | |
195 // Compare word_map_'s word set to the words from char_word_map_. | |
196 { | |
197 WordIDSet word_id_set_b; | |
198 std::for_each(char_word_map_.begin(), char_word_map_.end(), | |
199 TargetInserter<CharWordIDMap::value_type, | |
200 WordIDSet>(&word_id_set_b)); | |
201 WordIDSet leftovers; | |
202 std::set_symmetric_difference( | |
203 word_id_set_a.begin(), word_id_set_a.end(), | |
204 word_id_set_b.begin(), word_id_set_b.end(), | |
205 std::inserter(leftovers, leftovers.begin())); | |
206 if (!leftovers.empty()) | |
207 return false; | |
208 } | |
209 | |
210 // Intersect available_words_ with set of WordIDs (created above from | |
211 // word_map_) and test for no common WordIDs. | |
212 { | |
213 WordIDSet leftovers; | |
214 std::set_intersection(word_id_set_a.begin(), word_id_set_a.end(), | |
215 available_words_.begin(), available_words_.end(), | |
216 std::inserter(leftovers, leftovers.begin())); | |
217 if (!leftovers.empty()) | |
218 return false; | |
219 } | |
220 } | |
221 | |
222 { | |
223 // Make a set of HistoryIDs from history_info_map_. | |
224 HistoryIDSet history_id_set_a; | |
225 std::transform(history_info_map_.begin(), history_info_map_.end(), | |
226 std::inserter(history_id_set_a, history_id_set_a.begin()), | |
227 SelectFirst<HistoryInfoMap::value_type>()); | |
228 | |
229 // Compare history_info_map_'s set to word_id_history_map_'s HistoryIDs. | |
230 { | |
231 HistoryIDSet history_id_set_b; | |
232 std::transform(history_info_map_.begin(), history_info_map_.end(), | |
233 std::inserter(history_id_set_b, history_id_set_b.begin()), | |
234 SelectFirst<HistoryInfoMap::value_type>()); | |
235 HistoryIDSet leftovers; | |
236 std::set_symmetric_difference( | |
237 history_id_set_a.begin(), history_id_set_a.end(), | |
238 history_id_set_b.begin(), history_id_set_b.end(), | |
239 std::inserter(leftovers, leftovers.begin())); | |
240 if (!leftovers.empty()) | |
241 return false; | |
242 } | |
243 | |
244 // Compare history_info_map_'s set to word_id_history_map_'s HistoryIDs. | |
245 { | |
246 HistoryIDSet history_id_set_b; | |
247 std::for_each(word_id_history_map_.begin(), word_id_history_map_.end(), | |
248 TargetInserter<WordIDHistoryMap::value_type, | |
249 HistoryIDSet>(&history_id_set_b)); | |
250 HistoryIDSet leftovers; | |
251 std::set_symmetric_difference( | |
252 history_id_set_a.begin(), history_id_set_a.end(), | |
253 history_id_set_b.begin(), history_id_set_b.end(), | |
254 std::inserter(leftovers, leftovers.begin())); | |
255 if (!leftovers.empty()) | |
256 return false; | |
257 } | |
258 | |
259 // Compare history_info_map_'s set to word_starts_map_'s HistoryIDs. | |
260 { | |
261 HistoryIDSet history_id_set_b; | |
262 std::transform(word_starts_map_.begin(), word_starts_map_.end(), | |
263 std::inserter(history_id_set_b, history_id_set_b.begin()), | |
264 SelectFirst<WordStartsMap::value_type>()); | |
265 HistoryIDSet leftovers; | |
266 std::set_symmetric_difference( | |
267 history_id_set_a.begin(), history_id_set_a.end(), | |
268 history_id_set_b.begin(), history_id_set_b.end(), | |
269 std::inserter(leftovers, leftovers.begin())); | |
270 if (!leftovers.empty()) | |
271 return false; | |
272 } | |
273 } | |
274 | |
275 // Make sets of words from word_list_ and word_map_ and test for equality. | |
276 { | |
277 String16Set word_list_set; | |
278 std::copy(word_list_.begin(), word_list_.end(), | |
279 std::inserter(word_list_set, word_list_set.end())); | |
280 String16Set word_map_set; | |
281 std::transform(word_map_.begin(), word_map_.end(), | |
282 std::inserter(word_map_set, word_map_set.begin()), | |
283 SelectFirst<WordMap::value_type>()); | |
284 if (word_list_set != word_map_set) | |
285 return false; | |
286 } | |
287 | |
288 return true; | |
289 } | |
290 | |
291 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms( | |
292 const string16& search_string) { | |
293 pre_filter_item_count_ = 0; | |
294 post_filter_item_count_ = 0; | |
295 post_scoring_item_count_ = 0; | |
296 // The search string we receive may contain escaped characters. For reducing | |
297 // the index we need individual, lower-cased words, ignoring escapings. For | |
298 // the final filtering we need whitespace separated substrings possibly | |
299 // containing escaped characters. | |
300 string16 lower_raw_string(base::i18n::ToLower(search_string)); | |
301 string16 lower_unescaped_string = | |
302 net::UnescapeURLComponent(lower_raw_string, | |
303 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS); | |
304 // Extract individual 'words' (as opposed to 'terms'; see below) from the | |
305 // search string. When the user types "colspec=ID%20Mstone Release" we get | |
306 // four 'words': "colspec", "id", "mstone" and "release". | |
307 String16Vector lower_words( | |
308 history::String16VectorFromString16(lower_unescaped_string, false, NULL)); | |
309 ScoredHistoryMatches scored_items; | |
310 | |
311 // Do nothing if we have indexed no words (probably because we've not been | |
312 // initialized yet) or the search string has no words. | |
313 if (word_list_.empty() || lower_words.empty()) { | |
314 search_term_cache_.clear(); // Invalidate the term cache. | |
315 return scored_items; | |
316 } | |
317 | |
318 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep | |
319 // approach. | |
320 ResetSearchTermCache(); | |
321 | |
322 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words); | |
323 | |
324 // Trim the candidate pool if it is large. Note that we do not filter out | |
325 // items that do not contain the search terms as proper substrings -- doing | |
326 // so is the performance-costly operation we are trying to avoid in order | |
327 // to maintain omnibox responsiveness. | |
328 const size_t kItemsToScoreLimit = 500; | |
329 pre_filter_item_count_ = history_id_set.size(); | |
330 // If we trim the results set we do not want to cache the results for next | |
331 // time as the user's ultimately desired result could easily be eliminated | |
332 // in this early rough filter. | |
333 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit); | |
334 if (was_trimmed) { | |
335 HistoryIDVector history_ids; | |
336 std::copy(history_id_set.begin(), history_id_set.end(), | |
337 std::back_inserter(history_ids)); | |
338 // Trim down the set by sorting by typed-count, visit-count, and last | |
339 // visit. | |
340 HistoryItemFactorGreater | |
341 item_factor_functor(history_info_map_); | |
342 std::partial_sort(history_ids.begin(), | |
343 history_ids.begin() + kItemsToScoreLimit, | |
344 history_ids.end(), | |
345 item_factor_functor); | |
346 history_id_set.clear(); | |
347 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit, | |
348 std::inserter(history_id_set, history_id_set.end())); | |
349 post_filter_item_count_ = history_id_set.size(); | |
350 } | |
351 | |
352 // Pass over all of the candidates filtering out any without a proper | |
353 // substring match, inserting those which pass in order by score. Note that | |
354 // in this step we are using the raw search string complete with escaped | |
355 // URL elements. When the user has specifically typed something akin to | |
356 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that | |
357 // specific substring appears in the URL or page title. | |
358 | |
359 // We call these 'terms' (as opposed to 'words'; see above) as in this case | |
360 // we only want to break up the search string on 'true' whitespace rather than | |
361 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we | |
362 // get two 'terms': "colspec=id%20mstone" and "release". | |
363 history::String16Vector lower_raw_terms; | |
364 Tokenize(lower_raw_string, kWhitespaceUTF16, &lower_raw_terms); | |
365 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(), | |
366 AddHistoryMatch(*this, lower_raw_string, | |
367 lower_raw_terms, base::Time::Now())).ScoredMatches(); | |
368 | |
369 // Select and sort only the top kMaxMatches results. | |
370 if (scored_items.size() > AutocompleteProvider::kMaxMatches) { | |
371 std::partial_sort(scored_items.begin(), | |
372 scored_items.begin() + | |
373 AutocompleteProvider::kMaxMatches, | |
374 scored_items.end(), | |
375 ScoredHistoryMatch::MatchScoreGreater); | |
376 scored_items.resize(AutocompleteProvider::kMaxMatches); | |
377 } else { | |
378 std::sort(scored_items.begin(), scored_items.end(), | |
379 ScoredHistoryMatch::MatchScoreGreater); | |
380 } | |
381 post_scoring_item_count_ = scored_items.size(); | |
382 | |
383 if (was_trimmed) { | |
384 search_term_cache_.clear(); // Invalidate the term cache. | |
385 } else { | |
386 // Remove any stale SearchTermCacheItems. | |
387 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); | |
388 cache_iter != search_term_cache_.end(); ) { | |
389 if (!cache_iter->second.used_) | |
390 search_term_cache_.erase(cache_iter++); | |
391 else | |
392 ++cache_iter; | |
393 } | |
394 } | |
395 | |
396 return scored_items; | |
397 } | |
398 | |
399 bool URLIndexPrivateData::UpdateURL(const URLRow& row) { | |
400 // The row may or may not already be in our index. If it is not already | |
401 // indexed and it qualifies then it gets indexed. If it is already | |
402 // indexed and still qualifies then it gets updated, otherwise it | |
403 // is deleted from the index. | |
404 bool row_was_updated = false; | |
405 URLID row_id = row.id(); | |
406 HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id); | |
407 if (row_pos == history_info_map_.end()) { | |
408 // This new row should be indexed if it qualifies. | |
409 URLRow new_row(row); | |
410 new_row.set_id(row_id); | |
411 row_was_updated = RowQualifiesAsSignificant(new_row, base::Time()) && | |
412 IndexRow(new_row); | |
413 } else if (RowQualifiesAsSignificant(row, base::Time())) { | |
414 // This indexed row still qualifies and will be re-indexed. | |
415 // The url won't have changed but the title, visit count, etc. | |
416 // might have changed. | |
417 URLRow& row_to_update = row_pos->second; | |
418 bool title_updated = row_to_update.title() != row.title(); | |
419 if (row_to_update.visit_count() != row.visit_count() || | |
420 row_to_update.typed_count() != row.typed_count() || | |
421 row_to_update.last_visit() != row.last_visit() || title_updated) { | |
422 CacheTransaction cache_transaction(cache_enabled() ? | |
423 cache_db_.get() : NULL); | |
424 row_to_update.set_visit_count(row.visit_count()); | |
425 row_to_update.set_typed_count(row.typed_count()); | |
426 row_to_update.set_last_visit(row.last_visit()); | |
427 row_to_update.set_hidden(row.hidden()); | |
428 // While the URL is guaranteed to remain stable, the title may have | |
429 // changed. If so, then update the index with the changed words. | |
430 if (title_updated) { | |
431 // Clear all words associated with this row and re-index both the | |
432 // URL and title. | |
433 RemoveRowWordsFromIndex(row_to_update); | |
434 row_to_update.set_title(row.title()); | |
435 RowWordStarts word_starts; | |
436 AddRowWordsToIndex(row_to_update, &word_starts); | |
437 word_starts_map_[row_id] = word_starts; | |
438 // Replace all word_starts for the row. | |
439 if (cache_enabled()) { | |
440 cache_db_->RemoveWordStarts(row_id); | |
441 cache_db_->AddRowWordStarts(row_id, word_starts); | |
442 } | |
443 } | |
444 if (cache_enabled()) { | |
445 cache_db_->RemoveHistoryIDFromURLs(row_id); | |
446 cache_db_->AddHistoryToURLs(row_id, row); | |
447 } | |
448 row_was_updated = true; | |
449 } | |
450 } else { | |
451 // This indexed row no longer qualifies and will be de-indexed by | |
452 // clearing all words associated with this row. | |
453 RemoveRowFromIndex(row); | |
454 row_was_updated = true; | |
455 } | |
456 if (row_was_updated) | |
457 search_term_cache_.clear(); // This invalidates the cache. | |
458 return row_was_updated; | |
459 } | |
460 | |
461 // Helper functor for DeleteURL. | |
462 class HistoryInfoMapItemHasURL { | |
463 public: | |
464 explicit HistoryInfoMapItemHasURL(const GURL& url): url_(url) {} | |
465 | |
466 bool operator()(const std::pair<const HistoryID, URLRow>& item) { | |
467 return item.second.url() == url_; | |
468 } | |
469 | |
470 private: | |
471 const GURL& url_; | |
472 }; | |
473 | |
474 bool URLIndexPrivateData::DeleteURL(const GURL& url) { | |
475 DCHECK(!BrowserThread::IsWellKnownThread(BrowserThread::UI) || | |
476 BrowserThread::CurrentlyOn(BrowserThread::UI)); | |
477 // Find the matching entry in the history_info_map_. | |
478 HistoryInfoMap::iterator pos = std::find_if( | |
479 history_info_map_.begin(), | |
480 history_info_map_.end(), | |
481 HistoryInfoMapItemHasURL(url)); | |
482 if (pos == history_info_map_.end()) | |
483 return false; | |
484 RemoveRowFromIndex(pos->second); | |
485 search_term_cache_.clear(); // This invalidates the cache. | |
486 return true; | |
487 } | |
488 | |
489 bool URLIndexPrivateData::RestoreFromCacheTask() { | |
490 DCHECK(!BrowserThread::IsWellKnownThread(BrowserThread::UI) || | |
491 !BrowserThread::CurrentlyOn(BrowserThread::UI)); | |
492 DCHECK(cache_db_); | |
493 if (IsShutdown()) | |
494 return false; | |
495 Clear(); | |
496 if (RestoreFromCache(cache_db_)) | |
497 return true; | |
498 // If there was no SQLite-based cache then there might be an old, | |
499 // deprecated, protobuf-based cache file laying around. Get rid of it. | |
500 DeleteOldVersionCacheFile(); | |
501 return false; | |
502 } | |
503 | |
504 // static | |
505 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RebuildFromHistory( | |
506 HistoryDatabase* history_db, | |
507 scoped_refptr<URLIndexPrivateData> old_data) { | |
508 DCHECK(!BrowserThread::IsWellKnownThread(BrowserThread::UI) || | |
509 !BrowserThread::CurrentlyOn(BrowserThread::UI)); | |
510 if (!history_db) | |
511 return NULL; | |
512 | |
513 scoped_refptr<URLIndexPrivateData> rebuilt_data( | |
514 new URLIndexPrivateData(*(old_data.get()))); | |
515 | |
516 // NOTE: We disable the cache database until after the replacement private | |
517 // data has been created and then we re-enable the database. Once the private | |
518 // data has been swapped in the database is refreshed with the new data. | |
519 URLDatabase::URLEnumerator history_enum; | |
520 if (!history_db->InitURLEnumeratorForSignificant(&history_enum)) | |
521 return NULL; | |
522 | |
523 for (URLRow row; !old_data->IsShutdown() && history_enum.GetNextURL(&row); ) | |
524 rebuilt_data->IndexRow(row); | |
525 | |
526 if (old_data->IsShutdown()) { | |
527 rebuilt_data.release(); | |
528 } | |
529 | |
530 return rebuilt_data; | |
531 } | |
532 | |
533 void URLIndexPrivateData::RefreshCacheTask() { | |
534 DCHECK(!BrowserThread::IsWellKnownThread(BrowserThread::UI) || | |
535 !BrowserThread::CurrentlyOn(BrowserThread::UI)); | |
536 base::AutoLock lock(lock_); | |
537 if (shutdown_ || !cache_enabled()) | |
538 return; | |
539 // Should a refresh fail for any reason we consider the database to be | |
540 // corrupt and unavailable; no further remedial action is possible. | |
541 set_cache_enabled(cache_db_->Reset() && cache_db_->Refresh(*this)); | |
542 } | |
543 | |
544 // static | |
545 void URLIndexPrivateData::InitializeSchemeWhitelistForTesting( | |
546 std::set<std::string>* whitelist) { | |
547 InitializeSchemeWhitelist(whitelist); | |
548 } | |
549 | 55 |
550 // SearchTermCacheItem --------------------------------------------------------- | 56 // SearchTermCacheItem --------------------------------------------------------- |
551 | 57 |
552 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem( | 58 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem( |
553 const WordIDSet& word_id_set, | 59 const WordIDSet& word_id_set, |
554 const HistoryIDSet& history_id_set) | 60 const HistoryIDSet& history_id_set) |
555 : word_id_set_(word_id_set), | 61 : word_id_set_(word_id_set), |
556 history_id_set_(history_id_set), | 62 history_id_set_(history_id_set), |
557 used_(true) {} | 63 used_(true) {} |
558 | 64 |
559 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() | 65 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() |
560 : used_(true) {} | 66 : used_(true) {} |
561 | 67 |
562 URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() {} | 68 URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() {} |
563 | 69 |
564 // URLIndexPrivateData::AddHistoryMatch ---------------------------------------- | 70 // Algorithm Functions --------------------------------------------------------- |
565 | 71 |
566 URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch( | 72 // Comparison function for sorting search terms by descending length. |
567 const URLIndexPrivateData& private_data, | 73 bool LengthGreater(const string16& string_a, const string16& string_b) { |
568 const string16& lower_string, | 74 return string_a.length() > string_b.length(); |
569 const String16Vector& lower_terms, | |
570 const base::Time now) | |
571 : private_data_(private_data), | |
572 lower_string_(lower_string), | |
573 lower_terms_(lower_terms), | |
574 now_(now) {} | |
575 | |
576 URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() {} | |
577 | |
578 void URLIndexPrivateData::AddHistoryMatch::operator()( | |
579 const HistoryID history_id) { | |
580 HistoryInfoMap::const_iterator hist_pos = | |
581 private_data_.history_info_map_.find(history_id); | |
582 if (hist_pos != private_data_.history_info_map_.end()) { | |
583 const URLRow& hist_item = hist_pos->second; | |
584 WordStartsMap::const_iterator starts_pos = | |
585 private_data_.word_starts_map_.find(history_id); | |
586 DCHECK(starts_pos != private_data_.word_starts_map_.end()); | |
587 ScoredHistoryMatch match(hist_item, lower_string_, lower_terms_, | |
588 starts_pos->second, now_); | |
589 if (match.raw_score > 0) | |
590 scored_matches_.push_back(match); | |
591 } | |
592 } | 75 } |
593 | 76 |
594 // URLIndexPrivateData::HistoryItemFactorGreater ------------------------------- | 77 // InMemoryURLIndex's Private Data --------------------------------------------- |
595 | 78 |
596 URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater( | 79 URLIndexPrivateData::URLIndexPrivateData() |
597 const HistoryInfoMap& history_info_map) | 80 : restored_cache_version_(0), |
598 : history_info_map_(history_info_map) { | 81 saved_cache_version_(kCurrentCacheFileVersion), |
599 } | |
600 | |
601 URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() {} | |
602 | |
603 bool URLIndexPrivateData::HistoryItemFactorGreater::operator()( | |
604 const HistoryID h1, | |
605 const HistoryID h2) { | |
606 HistoryInfoMap::const_iterator entry1(history_info_map_.find(h1)); | |
607 if (entry1 == history_info_map_.end()) | |
608 return false; | |
609 HistoryInfoMap::const_iterator entry2(history_info_map_.find(h2)); | |
610 if (entry2 == history_info_map_.end()) | |
611 return true; | |
612 const URLRow& r1(entry1->second); | |
613 const URLRow& r2(entry2->second); | |
614 // First cut: typed count, visit count, recency. | |
615 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks | |
616 // recently visited (within the last 12/24 hours) as highly important. Get | |
617 // input from mpearson. | |
618 if (r1.typed_count() != r2.typed_count()) | |
619 return (r1.typed_count() > r2.typed_count()); | |
620 if (r1.visit_count() != r2.visit_count()) | |
621 return (r1.visit_count() > r2.visit_count()); | |
622 return (r1.last_visit() > r2.last_visit()); | |
623 } | |
624 | |
625 // URLIndexPrivateData Private Functions --------------------------------------- | |
626 | |
627 URLIndexPrivateData::URLIndexPrivateData(const URLIndexPrivateData& old_data) | |
628 : history_dir_(old_data.history_dir_), | |
629 languages_(old_data.languages_), | |
630 cache_enabled_(old_data.cache_enabled_), | |
631 shutdown_(old_data.IsShutdown()), | |
632 lock_(), | |
633 scheme_whitelist_(old_data.scheme_whitelist_), | |
634 pre_filter_item_count_(0), | 82 pre_filter_item_count_(0), |
635 post_filter_item_count_(0), | 83 post_filter_item_count_(0), |
636 post_scoring_item_count_(0) { | 84 post_scoring_item_count_(0) { |
637 cache_db_ = old_data.cache_db_; | |
638 } | |
639 | |
640 // Called only by unit tests. | |
641 URLIndexPrivateData::URLIndexPrivateData() | |
642 : cache_enabled_(true), | |
643 shutdown_(false), | |
644 lock_(), | |
645 pre_filter_item_count_(0), | |
646 post_filter_item_count_(0), | |
647 post_scoring_item_count_(0) { | |
648 InitializeSchemeWhitelist(&scheme_whitelist_); | |
649 } | 85 } |
650 | 86 |
651 URLIndexPrivateData::~URLIndexPrivateData() {} | 87 URLIndexPrivateData::~URLIndexPrivateData() {} |
652 | 88 |
653 bool URLIndexPrivateData::IsShutdown() const { | |
654 base::AutoLock lock(lock_); | |
655 return shutdown_; | |
656 } | |
657 | |
658 void URLIndexPrivateData::Clear() { | 89 void URLIndexPrivateData::Clear() { |
659 word_list_.clear(); | 90 word_list_.clear(); |
660 available_words_.clear(); | 91 available_words_.clear(); |
661 word_map_.clear(); | 92 word_map_.clear(); |
662 char_word_map_.clear(); | 93 char_word_map_.clear(); |
663 word_id_history_map_.clear(); | 94 word_id_history_map_.clear(); |
664 history_id_word_map_.clear(); | 95 history_id_word_map_.clear(); |
665 history_info_map_.clear(); | 96 history_info_map_.clear(); |
666 word_starts_map_.clear(); | 97 word_starts_map_.clear(); |
667 } | 98 } |
668 | 99 |
| 100 bool URLIndexPrivateData::Empty() const { |
| 101 return history_info_map_.empty(); |
| 102 } |
| 103 |
| 104 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::Duplicate() const { |
| 105 scoped_refptr<URLIndexPrivateData> data_copy = new URLIndexPrivateData; |
| 106 data_copy->word_list_ = word_list_; |
| 107 data_copy->available_words_ = available_words_; |
| 108 data_copy->word_map_ = word_map_; |
| 109 data_copy->char_word_map_ = char_word_map_; |
| 110 data_copy->word_id_history_map_ = word_id_history_map_; |
| 111 data_copy->history_id_word_map_ = history_id_word_map_; |
| 112 data_copy->history_info_map_ = history_info_map_; |
| 113 data_copy->word_starts_map_ = word_starts_map_; |
| 114 return data_copy; |
| 115 // Not copied: |
| 116 // search_term_cache_ |
| 117 // pre_filter_item_count_ |
| 118 // post_filter_item_count_ |
| 119 // post_scoring_item_count_ |
| 120 }; |
| 121 |
669 // Cache Updating -------------------------------------------------------------- | 122 // Cache Updating -------------------------------------------------------------- |
670 | 123 |
671 bool URLIndexPrivateData::IndexRow(const URLRow& row) { | 124 bool URLIndexPrivateData::IndexRow( |
| 125 const URLRow& row, |
| 126 const std::string& languages, |
| 127 const std::set<std::string>& scheme_whitelist) { |
672 const GURL& gurl(row.url()); | 128 const GURL& gurl(row.url()); |
673 | 129 |
674 // Index only URLs with a whitelisted scheme. | 130 // Index only URLs with a whitelisted scheme. |
675 if (!URLSchemeIsWhitelisted(gurl, scheme_whitelist_)) | 131 if (!URLSchemeIsWhitelisted(gurl, scheme_whitelist)) |
676 return false; | 132 return false; |
677 | 133 |
| 134 URLID row_id = row.id(); |
678 // Strip out username and password before saving and indexing. | 135 // Strip out username and password before saving and indexing. |
679 string16 url(net::FormatUrl(gurl, languages_, | 136 string16 url(net::FormatUrl(gurl, languages, |
680 net::kFormatUrlOmitUsernamePassword, | 137 net::kFormatUrlOmitUsernamePassword, |
681 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS, | 138 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS, |
682 NULL, NULL, NULL)); | 139 NULL, NULL, NULL)); |
683 | 140 |
684 URLID row_id = row.id(); | |
685 HistoryID history_id = static_cast<HistoryID>(row_id); | 141 HistoryID history_id = static_cast<HistoryID>(row_id); |
686 DCHECK(history_info_map_.find(history_id) == history_info_map_.end()); | |
687 DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max()); | 142 DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max()); |
688 | 143 |
689 // Add the row for quick lookup in the history info store. | 144 // Add the row for quick lookup in the history info store. |
690 URLRow new_row(GURL(url), row_id); | 145 URLRow new_row(GURL(url), row_id); |
691 new_row.set_visit_count(row.visit_count()); | 146 new_row.set_visit_count(row.visit_count()); |
692 new_row.set_typed_count(row.typed_count()); | 147 new_row.set_typed_count(row.typed_count()); |
693 new_row.set_last_visit(row.last_visit()); | 148 new_row.set_last_visit(row.last_visit()); |
694 new_row.set_title(row.title()); | 149 new_row.set_title(row.title()); |
695 | |
696 CacheTransaction cache_transaction(cache_enabled() ? cache_db_.get() : NULL); | |
697 history_info_map_[history_id] = new_row; | 150 history_info_map_[history_id] = new_row; |
698 if (cache_enabled()) | |
699 cache_db_->AddHistoryToURLs(history_id, row); | |
700 | 151 |
701 // Index the words contained in the URL and title of the row. | 152 // Index the words contained in the URL and title of the row. |
702 RowWordStarts word_starts; | 153 RowWordStarts word_starts; |
703 AddRowWordsToIndex(new_row, &word_starts); | 154 AddRowWordsToIndex(new_row, &word_starts, languages); |
704 word_starts_map_[history_id] = word_starts; | 155 word_starts_map_[history_id] = word_starts; |
705 if (cache_enabled()) | |
706 cache_db_->AddRowWordStarts(history_id, word_starts); | |
707 return true; | 156 return true; |
708 } | 157 } |
709 | 158 |
710 void URLIndexPrivateData::AddRowWordsToIndex(const URLRow& row, | 159 void URLIndexPrivateData::AddRowWordsToIndex(const URLRow& row, |
711 RowWordStarts* word_starts) { | 160 RowWordStarts* word_starts, |
| 161 const std::string& languages) { |
712 HistoryID history_id = static_cast<HistoryID>(row.id()); | 162 HistoryID history_id = static_cast<HistoryID>(row.id()); |
713 // Split URL into individual, unique words then add in the title words. | 163 // Split URL into individual, unique words then add in the title words. |
714 const GURL& gurl(row.url()); | 164 const GURL& gurl(row.url()); |
715 string16 url(net::FormatUrl(gurl, languages_, | 165 string16 url(net::FormatUrl(gurl, languages, |
716 net::kFormatUrlOmitUsernamePassword, | 166 net::kFormatUrlOmitUsernamePassword, |
717 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS, | 167 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS, |
718 NULL, NULL, NULL)); | 168 NULL, NULL, NULL)); |
719 url = base::i18n::ToLower(url); | 169 url = base::i18n::ToLower(url); |
720 String16Set url_words = String16SetFromString16(url, | 170 String16Set url_words = String16SetFromString16(url, |
721 word_starts ? &word_starts->url_word_starts_ : NULL); | 171 word_starts ? &word_starts->url_word_starts_ : NULL); |
722 String16Set title_words = String16SetFromString16(row.title(), | 172 String16Set title_words = String16SetFromString16(row.title(), |
723 word_starts ? &word_starts->title_word_starts_ : NULL); | 173 word_starts ? &word_starts->title_word_starts_ : NULL); |
724 String16Set words; | 174 String16Set words; |
725 std::set_union(url_words.begin(), url_words.end(), | 175 std::set_union(url_words.begin(), url_words.end(), |
(...skipping 12 matching lines...) Expand all Loading... |
738 if (word_pos != word_map_.end()) | 188 if (word_pos != word_map_.end()) |
739 UpdateWordHistory(word_pos->second, history_id); | 189 UpdateWordHistory(word_pos->second, history_id); |
740 else | 190 else |
741 AddWordHistory(term, history_id); | 191 AddWordHistory(term, history_id); |
742 } | 192 } |
743 | 193 |
744 void URLIndexPrivateData::UpdateWordHistory(WordID word_id, | 194 void URLIndexPrivateData::UpdateWordHistory(WordID word_id, |
745 HistoryID history_id) { | 195 HistoryID history_id) { |
746 WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id); | 196 WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id); |
747 DCHECK(history_pos != word_id_history_map_.end()); | 197 DCHECK(history_pos != word_id_history_map_.end()); |
748 // There is no need to record changes to the word_id_history_map_ in the cache | |
749 // because it can be rebuilt from the history_id_word_map_'s table. | |
750 HistoryIDSet& history_id_set(history_pos->second); | 198 HistoryIDSet& history_id_set(history_pos->second); |
751 history_id_set.insert(history_id); | 199 history_id_set.insert(history_id); |
752 AddToHistoryIDWordMap(history_id, word_id); | 200 AddToHistoryIDWordMap(history_id, word_id); |
753 | |
754 // Add word_id/history_id entry to the word_history table | |
755 if (cache_enabled()) | |
756 cache_db_->AddHistoryToWordHistory(word_id, history_id); | |
757 } | 201 } |
758 | 202 |
759 void URLIndexPrivateData::AddWordHistory(const string16& term, | 203 void URLIndexPrivateData::AddWordHistory(const string16& term, |
760 HistoryID history_id) { | 204 HistoryID history_id) { |
761 WordID word_id = word_list_.size(); | 205 WordID word_id = word_list_.size(); |
762 if (available_words_.empty()) { | 206 if (available_words_.empty()) { |
763 word_list_.push_back(term); | 207 word_list_.push_back(term); |
764 } else { | 208 } else { |
765 word_id = *(available_words_.begin()); | 209 word_id = *(available_words_.begin()); |
766 word_list_[word_id] = term; | 210 word_list_[word_id] = term; |
767 available_words_.erase(word_id); | 211 available_words_.erase(word_id); |
768 } | 212 } |
769 word_map_[term] = word_id; | 213 word_map_[term] = word_id; |
770 | 214 |
771 // Add word_id/term entry to the words table; | |
772 if (cache_enabled()) | |
773 cache_db_->AddWordToWords(word_id, term); | |
774 | |
775 HistoryIDSet history_id_set; | 215 HistoryIDSet history_id_set; |
776 history_id_set.insert(history_id); | 216 history_id_set.insert(history_id); |
777 word_id_history_map_[word_id] = history_id_set; | 217 word_id_history_map_[word_id] = history_id_set; |
778 AddToHistoryIDWordMap(history_id, word_id); | 218 AddToHistoryIDWordMap(history_id, word_id); |
779 | 219 |
780 // Add word_id/history_id entry to the word_history table | |
781 if (cache_enabled()) | |
782 cache_db_->AddHistoryToWordHistory(word_id, history_id); | |
783 | |
784 // For each character in the newly added word (i.e. a word that is not | 220 // For each character in the newly added word (i.e. a word that is not |
785 // already in the word index), add the word to the character index. | 221 // already in the word index), add the word to the character index. |
786 Char16Set characters = Char16SetFromString16(term); | 222 Char16Set characters = Char16SetFromString16(term); |
787 for (Char16Set::iterator uni_char_iter = characters.begin(); | 223 for (Char16Set::iterator uni_char_iter = characters.begin(); |
788 uni_char_iter != characters.end(); ++uni_char_iter) { | 224 uni_char_iter != characters.end(); ++uni_char_iter) { |
789 char16 uni_char = *uni_char_iter; | 225 char16 uni_char = *uni_char_iter; |
790 CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char); | 226 CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char); |
791 if (char_iter != char_word_map_.end()) { | 227 if (char_iter != char_word_map_.end()) { |
792 // Update existing entry in the char/word index. | 228 // Update existing entry in the char/word index. |
793 WordIDSet& word_id_set(char_iter->second); | 229 WordIDSet& word_id_set(char_iter->second); |
794 word_id_set.insert(word_id); | 230 word_id_set.insert(word_id); |
795 } else { | 231 } else { |
796 // Create a new entry in the char/word index. | 232 // Create a new entry in the char/word index. |
797 WordIDSet word_id_set; | 233 WordIDSet word_id_set; |
798 word_id_set.insert(word_id); | 234 word_id_set.insert(word_id); |
799 char_word_map_[uni_char] = word_id_set; | 235 char_word_map_[uni_char] = word_id_set; |
800 } | 236 } |
801 // Add uni_char/word_id entry to the char_words database table. | |
802 if (cache_enabled()) | |
803 cache_db_->AddWordToCharWords(uni_char, word_id); | |
804 } | 237 } |
805 } | 238 } |
806 | 239 |
807 void URLIndexPrivateData::RemoveRowFromIndex(const URLRow& row) { | 240 void URLIndexPrivateData::RemoveRowFromIndex(const URLRow& row) { |
808 CacheTransaction cache_transaction(cache_enabled() ? cache_db_.get() : NULL); | |
809 RemoveRowWordsFromIndex(row); | 241 RemoveRowWordsFromIndex(row); |
810 HistoryID history_id = static_cast<HistoryID>(row.id()); | 242 HistoryID history_id = static_cast<HistoryID>(row.id()); |
811 history_info_map_.erase(history_id); | 243 history_info_map_.erase(history_id); |
812 word_starts_map_.erase(history_id); | 244 word_starts_map_.erase(history_id); |
813 if (cache_enabled()) | |
814 cache_db_->RemoveHistoryIDFromURLs(history_id); | |
815 } | 245 } |
816 | 246 |
817 void URLIndexPrivateData::RemoveRowWordsFromIndex(const URLRow& row) { | 247 void URLIndexPrivateData::RemoveRowWordsFromIndex(const URLRow& row) { |
818 // Remove the entries in history_id_word_map_ and word_id_history_map_ for | 248 // Remove the entries in history_id_word_map_ and word_id_history_map_ for |
819 // this row. | 249 // this row. |
820 HistoryID history_id = static_cast<HistoryID>(row.id()); | 250 HistoryID history_id = static_cast<HistoryID>(row.id()); |
821 WordIDSet word_id_set = history_id_word_map_[history_id]; | 251 WordIDSet word_id_set = history_id_word_map_[history_id]; |
822 history_id_word_map_.erase(history_id); | 252 history_id_word_map_.erase(history_id); |
823 | 253 |
824 // Reconcile any changes to word usage. | 254 // Reconcile any changes to word usage. |
(...skipping 13 matching lines...) Expand all Loading... |
838 char_word_map_[uni_char].erase(word_id); | 268 char_word_map_[uni_char].erase(word_id); |
839 if (char_word_map_[uni_char].empty()) | 269 if (char_word_map_[uni_char].empty()) |
840 char_word_map_.erase(uni_char); // No longer in use. | 270 char_word_map_.erase(uni_char); // No longer in use. |
841 } | 271 } |
842 | 272 |
843 // Complete the removal of references to the word. | 273 // Complete the removal of references to the word. |
844 word_id_history_map_.erase(word_id); | 274 word_id_history_map_.erase(word_id); |
845 word_map_.erase(word); | 275 word_map_.erase(word); |
846 word_list_[word_id] = string16(); | 276 word_list_[word_id] = string16(); |
847 available_words_.insert(word_id); | 277 available_words_.insert(word_id); |
848 if (cache_enabled()) | |
849 cache_db_->RemoveWordFromWords(word_id); | |
850 } | 278 } |
851 if (cache_enabled()) | |
852 cache_db_->RemoveHistoryIDFromWordHistory(history_id); | |
853 } | 279 } |
854 | 280 |
855 void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id, | 281 void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id, |
856 WordID word_id) { | 282 WordID word_id) { |
857 HistoryIDWordMap::iterator iter = history_id_word_map_.find(history_id); | 283 HistoryIDWordMap::iterator iter = history_id_word_map_.find(history_id); |
858 if (iter != history_id_word_map_.end()) { | 284 if (iter != history_id_word_map_.end()) { |
859 WordIDSet& word_id_set(iter->second); | 285 WordIDSet& word_id_set(iter->second); |
860 word_id_set.insert(word_id); | 286 word_id_set.insert(word_id); |
861 } else { | 287 } else { |
862 WordIDSet word_id_set; | 288 WordIDSet word_id_set; |
863 word_id_set.insert(word_id); | 289 word_id_set.insert(word_id); |
864 history_id_word_map_[history_id] = word_id_set; | 290 history_id_word_map_[history_id] = word_id_set; |
865 } | 291 } |
866 } | 292 } |
867 | 293 |
| 294 bool URLIndexPrivateData::UpdateURL( |
| 295 const URLRow& row, |
| 296 const std::string& languages, |
| 297 const std::set<std::string>& scheme_whitelist) { |
| 298 // The row may or may not already be in our index. If it is not already |
| 299 // indexed and it qualifies then it gets indexed. If it is already |
| 300 // indexed and still qualifies then it gets updated, otherwise it |
| 301 // is deleted from the index. |
| 302 bool row_was_updated = false; |
| 303 URLID row_id = row.id(); |
| 304 HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id); |
| 305 if (row_pos == history_info_map_.end()) { |
| 306 // This new row should be indexed if it qualifies. |
| 307 URLRow new_row(row); |
| 308 new_row.set_id(row_id); |
| 309 row_was_updated = RowQualifiesAsSignificant(new_row, base::Time()) && |
| 310 IndexRow(new_row, languages, scheme_whitelist); |
| 311 } else if (RowQualifiesAsSignificant(row, base::Time())) { |
| 312 // This indexed row still qualifies and will be re-indexed. |
| 313 // The url won't have changed but the title, visit count, etc. |
| 314 // might have changed. |
| 315 URLRow& row_to_update = row_pos->second; |
| 316 bool title_updated = row_to_update.title() != row.title(); |
| 317 if (row_to_update.visit_count() != row.visit_count() || |
| 318 row_to_update.typed_count() != row.typed_count() || |
| 319 row_to_update.last_visit() != row.last_visit() || title_updated) { |
| 320 row_to_update.set_visit_count(row.visit_count()); |
| 321 row_to_update.set_typed_count(row.typed_count()); |
| 322 row_to_update.set_last_visit(row.last_visit()); |
| 323 // While the URL is guaranteed to remain stable, the title may have |
| 324 // changed. If so, then update the index with the changed words. |
| 325 if (title_updated) { |
| 326 // Clear all words associated with this row and re-index both the |
| 327 // URL and title. |
| 328 RemoveRowWordsFromIndex(row_to_update); |
| 329 row_to_update.set_title(row.title()); |
| 330 RowWordStarts word_starts; |
| 331 AddRowWordsToIndex(row_to_update, &word_starts, languages); |
| 332 word_starts_map_[row_id] = word_starts; |
| 333 } |
| 334 row_was_updated = true; |
| 335 } |
| 336 } else { |
| 337 // This indexed row no longer qualifies and will be de-indexed by |
| 338 // clearing all words associated with this row. |
| 339 RemoveRowFromIndex(row); |
| 340 row_was_updated = true; |
| 341 } |
| 342 if (row_was_updated) |
| 343 search_term_cache_.clear(); // This invalidates the cache. |
| 344 return row_was_updated; |
| 345 } |
| 346 |
| 347 // Helper functor for DeleteURL. |
| 348 class HistoryInfoMapItemHasURL { |
| 349 public: |
| 350 explicit HistoryInfoMapItemHasURL(const GURL& url): url_(url) {} |
| 351 |
| 352 bool operator()(const std::pair<const HistoryID, URLRow>& item) { |
| 353 return item.second.url() == url_; |
| 354 } |
| 355 |
| 356 private: |
| 357 const GURL& url_; |
| 358 }; |
| 359 |
| 360 bool URLIndexPrivateData::DeleteURL(const GURL& url) { |
| 361 // Find the matching entry in the history_info_map_. |
| 362 HistoryInfoMap::iterator pos = std::find_if( |
| 363 history_info_map_.begin(), |
| 364 history_info_map_.end(), |
| 365 HistoryInfoMapItemHasURL(url)); |
| 366 if (pos == history_info_map_.end()) |
| 367 return false; |
| 368 RemoveRowFromIndex(pos->second); |
| 369 search_term_cache_.clear(); // This invalidates the cache. |
| 370 return true; |
| 371 } |
| 372 |
| 373 // URLIndexPrivateData::HistoryItemFactorGreater ------------------------------- |
| 374 |
| 375 URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater( |
| 376 const HistoryInfoMap& history_info_map) |
| 377 : history_info_map_(history_info_map) { |
| 378 } |
| 379 |
| 380 URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() {} |
| 381 |
| 382 bool URLIndexPrivateData::HistoryItemFactorGreater::operator()( |
| 383 const HistoryID h1, |
| 384 const HistoryID h2) { |
| 385 HistoryInfoMap::const_iterator entry1(history_info_map_.find(h1)); |
| 386 if (entry1 == history_info_map_.end()) |
| 387 return false; |
| 388 HistoryInfoMap::const_iterator entry2(history_info_map_.find(h2)); |
| 389 if (entry2 == history_info_map_.end()) |
| 390 return true; |
| 391 const URLRow& r1(entry1->second); |
| 392 const URLRow& r2(entry2->second); |
| 393 // First cut: typed count, visit count, recency. |
| 394 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks |
| 395 // recently visited (within the last 12/24 hours) as highly important. Get |
| 396 // input from mpearson. |
| 397 if (r1.typed_count() != r2.typed_count()) |
| 398 return (r1.typed_count() > r2.typed_count()); |
| 399 if (r1.visit_count() != r2.visit_count()) |
| 400 return (r1.visit_count() > r2.visit_count()); |
| 401 return (r1.last_visit() > r2.last_visit()); |
| 402 } |
| 403 |
| 404 // Cache Searching ------------------------------------------------------------- |
| 405 |
| 406 // NOTE: This is the main public search function. |
| 407 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms( |
| 408 const string16& search_string) { |
| 409 pre_filter_item_count_ = 0; |
| 410 post_filter_item_count_ = 0; |
| 411 post_scoring_item_count_ = 0; |
| 412 // The search string we receive may contain escaped characters. For reducing |
| 413 // the index we need individual, lower-cased words, ignoring escapings. For |
| 414 // the final filtering we need whitespace separated substrings possibly |
| 415 // containing escaped characters. |
| 416 string16 lower_raw_string(base::i18n::ToLower(search_string)); |
| 417 string16 lower_unescaped_string = |
| 418 net::UnescapeURLComponent(lower_raw_string, |
| 419 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS); |
| 420 // Extract individual 'words' (as opposed to 'terms'; see below) from the |
| 421 // search string. When the user types "colspec=ID%20Mstone Release" we get |
| 422 // four 'words': "colspec", "id", "mstone" and "release". |
| 423 String16Vector lower_words( |
| 424 history::String16VectorFromString16(lower_unescaped_string, false, NULL)); |
| 425 ScoredHistoryMatches scored_items; |
| 426 |
| 427 // Do nothing if we have indexed no words (probably because we've not been |
| 428 // initialized yet) or the search string has no words. |
| 429 if (word_list_.empty() || lower_words.empty()) { |
| 430 search_term_cache_.clear(); // Invalidate the term cache. |
| 431 return scored_items; |
| 432 } |
| 433 |
| 434 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep |
| 435 // approach. |
| 436 ResetSearchTermCache(); |
| 437 |
| 438 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words); |
| 439 |
| 440 // Trim the candidate pool if it is large. Note that we do not filter out |
| 441 // items that do not contain the search terms as proper substrings -- doing |
| 442 // so is the performance-costly operation we are trying to avoid in order |
| 443 // to maintain omnibox responsiveness. |
| 444 const size_t kItemsToScoreLimit = 500; |
| 445 pre_filter_item_count_ = history_id_set.size(); |
| 446 // If we trim the results set we do not want to cache the results for next |
| 447 // time as the user's ultimately desired result could easily be eliminated |
| 448 // in this early rough filter. |
| 449 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit); |
| 450 if (was_trimmed) { |
| 451 HistoryIDVector history_ids; |
| 452 std::copy(history_id_set.begin(), history_id_set.end(), |
| 453 std::back_inserter(history_ids)); |
| 454 // Trim down the set by sorting by typed-count, visit-count, and last |
| 455 // visit. |
| 456 HistoryItemFactorGreater |
| 457 item_factor_functor(history_info_map_); |
| 458 std::partial_sort(history_ids.begin(), |
| 459 history_ids.begin() + kItemsToScoreLimit, |
| 460 history_ids.end(), |
| 461 item_factor_functor); |
| 462 history_id_set.clear(); |
| 463 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit, |
| 464 std::inserter(history_id_set, history_id_set.end())); |
| 465 post_filter_item_count_ = history_id_set.size(); |
| 466 } |
| 467 |
| 468 // Pass over all of the candidates filtering out any without a proper |
| 469 // substring match, inserting those which pass in order by score. Note that |
| 470 // in this step we are using the raw search string complete with escaped |
| 471 // URL elements. When the user has specifically typed something akin to |
| 472 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that |
| 473 // specific substring appears in the URL or page title. |
| 474 |
| 475 // We call these 'terms' (as opposed to 'words'; see above) as in this case |
| 476 // we only want to break up the search string on 'true' whitespace rather than |
| 477 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we |
| 478 // get two 'terms': "colspec=id%20mstone" and "release". |
| 479 history::String16Vector lower_raw_terms; |
| 480 Tokenize(lower_raw_string, kWhitespaceUTF16, &lower_raw_terms); |
| 481 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(), |
| 482 AddHistoryMatch(*this, lower_raw_string, |
| 483 lower_raw_terms, base::Time::Now())).ScoredMatches(); |
| 484 |
| 485 // Select and sort only the top kMaxMatches results. |
| 486 if (scored_items.size() > AutocompleteProvider::kMaxMatches) { |
| 487 std::partial_sort(scored_items.begin(), |
| 488 scored_items.begin() + |
| 489 AutocompleteProvider::kMaxMatches, |
| 490 scored_items.end(), |
| 491 ScoredHistoryMatch::MatchScoreGreater); |
| 492 scored_items.resize(AutocompleteProvider::kMaxMatches); |
| 493 } else { |
| 494 std::sort(scored_items.begin(), scored_items.end(), |
| 495 ScoredHistoryMatch::MatchScoreGreater); |
| 496 } |
| 497 post_scoring_item_count_ = scored_items.size(); |
| 498 |
| 499 if (was_trimmed) { |
| 500 search_term_cache_.clear(); // Invalidate the term cache. |
| 501 } else { |
| 502 // Remove any stale SearchTermCacheItems. |
| 503 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); |
| 504 cache_iter != search_term_cache_.end(); ) { |
| 505 if (!cache_iter->second.used_) |
| 506 search_term_cache_.erase(cache_iter++); |
| 507 else |
| 508 ++cache_iter; |
| 509 } |
| 510 } |
| 511 |
| 512 return scored_items; |
| 513 } |
| 514 |
| 515 // URLIndexPrivateData::AddHistoryMatch ---------------------------------------- |
| 516 |
| 517 URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch( |
| 518 const URLIndexPrivateData& private_data, |
| 519 const string16& lower_string, |
| 520 const String16Vector& lower_terms, |
| 521 const base::Time now) |
| 522 : private_data_(private_data), |
| 523 lower_string_(lower_string), |
| 524 lower_terms_(lower_terms), |
| 525 now_(now) {} |
| 526 |
| 527 URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() {} |
| 528 |
| 529 void URLIndexPrivateData::AddHistoryMatch::operator()( |
| 530 const HistoryID history_id) { |
| 531 HistoryInfoMap::const_iterator hist_pos = |
| 532 private_data_.history_info_map_.find(history_id); |
| 533 if (hist_pos != private_data_.history_info_map_.end()) { |
| 534 const URLRow& hist_item = hist_pos->second; |
| 535 WordStartsMap::const_iterator starts_pos = |
| 536 private_data_.word_starts_map_.find(history_id); |
| 537 DCHECK(starts_pos != private_data_.word_starts_map_.end()); |
| 538 ScoredHistoryMatch match(hist_item, lower_string_, lower_terms_, |
| 539 starts_pos->second, now_); |
| 540 if (match.raw_score > 0) |
| 541 scored_matches_.push_back(match); |
| 542 } |
| 543 } |
| 544 |
868 void URLIndexPrivateData::ResetSearchTermCache() { | 545 void URLIndexPrivateData::ResetSearchTermCache() { |
869 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin(); | 546 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin(); |
870 iter != search_term_cache_.end(); ++iter) | 547 iter != search_term_cache_.end(); ++iter) |
871 iter->second.used_ = false; | 548 iter->second.used_ = false; |
872 } | 549 } |
873 | 550 |
874 HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords( | 551 HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords( |
875 const String16Vector& unsorted_words) { | 552 const String16Vector& unsorted_words) { |
876 // Break the terms down into individual terms (words), get the candidate | 553 // Break the terms down into individual terms (words), get the candidate |
877 // set for each term, and intersect each to get a final candidate list. | 554 // set for each term, and intersect each to get a final candidate list. |
(...skipping 168 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1046 std::set_intersection(word_id_set.begin(), word_id_set.end(), | 723 std::set_intersection(word_id_set.begin(), word_id_set.end(), |
1047 char_word_id_set.begin(), char_word_id_set.end(), | 724 char_word_id_set.begin(), char_word_id_set.end(), |
1048 std::inserter(new_word_id_set, | 725 std::inserter(new_word_id_set, |
1049 new_word_id_set.begin())); | 726 new_word_id_set.begin())); |
1050 word_id_set.swap(new_word_id_set); | 727 word_id_set.swap(new_word_id_set); |
1051 } | 728 } |
1052 } | 729 } |
1053 return word_id_set; | 730 return word_id_set; |
1054 } | 731 } |
1055 | 732 |
1056 bool URLIndexPrivateData::RestoreFromCache(InMemoryURLCacheDatabase* cache_db) { | 733 // Cache Saving ---------------------------------------------------------------- |
1057 DCHECK(cache_db); | 734 |
1058 if (!cache_db->RestorePrivateData(this)) { | 735 // static |
1059 cache_db->Reset(); | 736 void URLIndexPrivateData::WritePrivateDataToCacheFileTask( |
1060 return false; | 737 scoped_refptr<URLIndexPrivateData> private_data, |
1061 } | 738 const FilePath& file_path, |
1062 return !Empty() && ValidateConsistency(); | 739 scoped_refptr<RefCountedBool> succeeded) { |
1063 } | 740 DCHECK(private_data.get()); |
1064 | 741 DCHECK(!file_path.empty()); |
1065 void URLIndexPrivateData::DeleteOldVersionCacheFile() const { | 742 succeeded->set_value(private_data->SaveToFile(file_path)); |
1066 if (history_dir_.empty()) | 743 } |
1067 return; | 744 |
1068 FilePath path = history_dir_.Append(chrome::kHQPCacheFilename); | 745 bool URLIndexPrivateData::SaveToFile(const FilePath& file_path) { |
1069 file_util::Delete(path, false); | 746 base::TimeTicks beginning_time = base::TimeTicks::Now(); |
1070 } | 747 InMemoryURLIndexCacheItem index_cache; |
1071 | 748 SavePrivateData(&index_cache); |
1072 bool URLIndexPrivateData::GetCacheDBPath(FilePath* file_path) { | 749 std::string data; |
1073 DCHECK(file_path); | 750 if (!index_cache.SerializeToString(&data)) { |
1074 if (history_dir_.empty()) | 751 LOG(WARNING) << "Failed to serialize the InMemoryURLIndex cache."; |
1075 return false; | 752 return false; |
1076 *file_path = history_dir_.Append(chrome::kHQPCacheDBFilename); | 753 } |
1077 return true; | 754 |
1078 } | 755 int size = data.size(); |
1079 | 756 if (file_util::WriteFile(file_path, data.c_str(), size) != size) { |
1080 void URLIndexPrivateData::SetCacheDatabaseForTesting( | 757 LOG(WARNING) << "Failed to write " << file_path.value(); |
1081 InMemoryURLCacheDatabase* test_db) { | 758 return false; |
1082 cache_db_ = test_db; | 759 } |
| 760 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime", |
| 761 base::TimeTicks::Now() - beginning_time); |
| 762 return true; |
| 763 } |
| 764 |
| 765 void URLIndexPrivateData::SavePrivateData( |
| 766 InMemoryURLIndexCacheItem* cache) const { |
| 767 DCHECK(cache); |
| 768 cache->set_timestamp(base::Time::Now().ToInternalValue()); |
| 769 cache->set_version(saved_cache_version_); |
| 770 // history_item_count_ is no longer used but rather than change the protobuf |
| 771 // definition use a placeholder. This will go away with the switch to SQLite. |
| 772 cache->set_history_item_count(0); |
| 773 SaveWordList(cache); |
| 774 SaveWordMap(cache); |
| 775 SaveCharWordMap(cache); |
| 776 SaveWordIDHistoryMap(cache); |
| 777 SaveHistoryInfoMap(cache); |
| 778 SaveWordStartsMap(cache); |
| 779 } |
| 780 |
| 781 void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem* cache) const { |
| 782 if (word_list_.empty()) |
| 783 return; |
| 784 WordListItem* list_item = cache->mutable_word_list(); |
| 785 list_item->set_word_count(word_list_.size()); |
| 786 for (String16Vector::const_iterator iter = word_list_.begin(); |
| 787 iter != word_list_.end(); ++iter) |
| 788 list_item->add_word(UTF16ToUTF8(*iter)); |
| 789 } |
| 790 |
| 791 void URLIndexPrivateData::SaveWordMap(InMemoryURLIndexCacheItem* cache) const { |
| 792 if (word_map_.empty()) |
| 793 return; |
| 794 WordMapItem* map_item = cache->mutable_word_map(); |
| 795 map_item->set_item_count(word_map_.size()); |
| 796 for (WordMap::const_iterator iter = word_map_.begin(); |
| 797 iter != word_map_.end(); ++iter) { |
| 798 WordMapEntry* map_entry = map_item->add_word_map_entry(); |
| 799 map_entry->set_word(UTF16ToUTF8(iter->first)); |
| 800 map_entry->set_word_id(iter->second); |
| 801 } |
| 802 } |
| 803 |
| 804 void URLIndexPrivateData::SaveCharWordMap( |
| 805 InMemoryURLIndexCacheItem* cache) const { |
| 806 if (char_word_map_.empty()) |
| 807 return; |
| 808 CharWordMapItem* map_item = cache->mutable_char_word_map(); |
| 809 map_item->set_item_count(char_word_map_.size()); |
| 810 for (CharWordIDMap::const_iterator iter = char_word_map_.begin(); |
| 811 iter != char_word_map_.end(); ++iter) { |
| 812 CharWordMapEntry* map_entry = map_item->add_char_word_map_entry(); |
| 813 map_entry->set_char_16(iter->first); |
| 814 const WordIDSet& word_id_set(iter->second); |
| 815 map_entry->set_item_count(word_id_set.size()); |
| 816 for (WordIDSet::const_iterator set_iter = word_id_set.begin(); |
| 817 set_iter != word_id_set.end(); ++set_iter) |
| 818 map_entry->add_word_id(*set_iter); |
| 819 } |
| 820 } |
| 821 |
| 822 void URLIndexPrivateData::SaveWordIDHistoryMap( |
| 823 InMemoryURLIndexCacheItem* cache) const { |
| 824 if (word_id_history_map_.empty()) |
| 825 return; |
| 826 WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map(); |
| 827 map_item->set_item_count(word_id_history_map_.size()); |
| 828 for (WordIDHistoryMap::const_iterator iter = word_id_history_map_.begin(); |
| 829 iter != word_id_history_map_.end(); ++iter) { |
| 830 WordIDHistoryMapEntry* map_entry = |
| 831 map_item->add_word_id_history_map_entry(); |
| 832 map_entry->set_word_id(iter->first); |
| 833 const HistoryIDSet& history_id_set(iter->second); |
| 834 map_entry->set_item_count(history_id_set.size()); |
| 835 for (HistoryIDSet::const_iterator set_iter = history_id_set.begin(); |
| 836 set_iter != history_id_set.end(); ++set_iter) |
| 837 map_entry->add_history_id(*set_iter); |
| 838 } |
| 839 } |
| 840 |
| 841 void URLIndexPrivateData::SaveHistoryInfoMap( |
| 842 InMemoryURLIndexCacheItem* cache) const { |
| 843 if (history_info_map_.empty()) |
| 844 return; |
| 845 HistoryInfoMapItem* map_item = cache->mutable_history_info_map(); |
| 846 map_item->set_item_count(history_info_map_.size()); |
| 847 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin(); |
| 848 iter != history_info_map_.end(); ++iter) { |
| 849 HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry(); |
| 850 map_entry->set_history_id(iter->first); |
| 851 const URLRow& url_row(iter->second); |
| 852 // Note: We only save information that contributes to the index so there |
| 853 // is no need to save search_term_cache_ (not persistent). |
| 854 map_entry->set_visit_count(url_row.visit_count()); |
| 855 map_entry->set_typed_count(url_row.typed_count()); |
| 856 map_entry->set_last_visit(url_row.last_visit().ToInternalValue()); |
| 857 map_entry->set_url(url_row.url().spec()); |
| 858 map_entry->set_title(UTF16ToUTF8(url_row.title())); |
| 859 } |
| 860 } |
| 861 |
| 862 void URLIndexPrivateData::SaveWordStartsMap( |
| 863 InMemoryURLIndexCacheItem* cache) const { |
| 864 if (word_starts_map_.empty()) |
| 865 return; |
| 866 // For unit testing: Enable saving of the cache as an earlier version to |
| 867 // allow testing of cache file upgrading in ReadFromFile(). |
| 868 // TODO(mrossetti): Instead of intruding on production code with this kind of |
| 869 // test harness, save a copy of an older version cache with known results. |
| 870 // Implement this when switching the caching over to SQLite. |
| 871 if (saved_cache_version_ < 1) |
| 872 return; |
| 873 |
| 874 WordStartsMapItem* map_item = cache->mutable_word_starts_map(); |
| 875 map_item->set_item_count(word_starts_map_.size()); |
| 876 for (WordStartsMap::const_iterator iter = word_starts_map_.begin(); |
| 877 iter != word_starts_map_.end(); ++iter) { |
| 878 WordStartsMapEntry* map_entry = map_item->add_word_starts_map_entry(); |
| 879 map_entry->set_history_id(iter->first); |
| 880 const RowWordStarts& word_starts(iter->second); |
| 881 for (WordStarts::const_iterator i = word_starts.url_word_starts_.begin(); |
| 882 i != word_starts.url_word_starts_.end(); ++i) |
| 883 map_entry->add_url_word_starts(*i); |
| 884 for (WordStarts::const_iterator i = word_starts.title_word_starts_.begin(); |
| 885 i != word_starts.title_word_starts_.end(); ++i) |
| 886 map_entry->add_title_word_starts(*i); |
| 887 } |
| 888 } |
| 889 |
| 890 // Cache Restoring ------------------------------------------------------------- |
| 891 |
| 892 // static |
| 893 void URLIndexPrivateData::RestoreFromFileTask( |
| 894 const FilePath& file_path, |
| 895 scoped_refptr<URLIndexPrivateData> private_data, |
| 896 const std::string& languages) { |
| 897 private_data = URLIndexPrivateData::RestoreFromFile(file_path, languages); |
| 898 } |
| 899 |
| 900 // static |
| 901 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RestoreFromFile( |
| 902 const FilePath& file_path, |
| 903 const std::string& languages) { |
| 904 base::TimeTicks beginning_time = base::TimeTicks::Now(); |
| 905 if (!file_util::PathExists(file_path)) |
| 906 return NULL; |
| 907 std::string data; |
| 908 // If there is no cache file then simply give up. This will cause us to |
| 909 // attempt to rebuild from the history database. |
| 910 if (!file_util::ReadFileToString(file_path, &data)) |
| 911 return NULL; |
| 912 |
| 913 scoped_refptr<URLIndexPrivateData> restored_data(new URLIndexPrivateData); |
| 914 InMemoryURLIndexCacheItem index_cache; |
| 915 if (!index_cache.ParseFromArray(data.c_str(), data.size())) { |
| 916 LOG(WARNING) << "Failed to parse URLIndexPrivateData cache data read from " |
| 917 << file_path.value(); |
| 918 return restored_data; |
| 919 } |
| 920 |
| 921 if (!restored_data->RestorePrivateData(index_cache, languages)) |
| 922 return NULL; |
| 923 |
| 924 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime", |
| 925 base::TimeTicks::Now() - beginning_time); |
| 926 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems", |
| 927 restored_data->history_id_word_map_.size()); |
| 928 UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size()); |
| 929 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords", |
| 930 restored_data->word_map_.size()); |
| 931 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars", |
| 932 restored_data->char_word_map_.size()); |
| 933 if (restored_data->Empty()) |
| 934 return NULL; // 'No data' is the same as a failed reload. |
| 935 return restored_data; |
| 936 } |
| 937 |
| 938 // static |
| 939 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RebuildFromHistory( |
| 940 HistoryDatabase* history_db, |
| 941 const std::string& languages, |
| 942 const std::set<std::string>& scheme_whitelist) { |
| 943 if (!history_db) |
| 944 return NULL; |
| 945 |
| 946 base::TimeTicks beginning_time = base::TimeTicks::Now(); |
| 947 |
| 948 scoped_refptr<URLIndexPrivateData> rebuilt_data(new URLIndexPrivateData); |
| 949 URLDatabase::URLEnumerator history_enum; |
| 950 if (!history_db->InitURLEnumeratorForSignificant(&history_enum)) |
| 951 return NULL; |
| 952 for (URLRow row; history_enum.GetNextURL(&row); ) |
| 953 rebuilt_data->IndexRow(row, languages, scheme_whitelist); |
| 954 |
| 955 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime", |
| 956 base::TimeTicks::Now() - beginning_time); |
| 957 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems", |
| 958 rebuilt_data->history_id_word_map_.size()); |
| 959 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords", |
| 960 rebuilt_data->word_map_.size()); |
| 961 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars", |
| 962 rebuilt_data->char_word_map_.size()); |
| 963 return rebuilt_data; |
| 964 } |
| 965 |
| 966 bool URLIndexPrivateData::RestorePrivateData( |
| 967 const InMemoryURLIndexCacheItem& cache, |
| 968 const std::string& languages) { |
| 969 if (cache.has_version()) |
| 970 restored_cache_version_ = cache.version(); |
| 971 return RestoreWordList(cache) && RestoreWordMap(cache) && |
| 972 RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) && |
| 973 RestoreHistoryInfoMap(cache) && RestoreWordStartsMap(cache, languages); |
| 974 } |
| 975 |
| 976 bool URLIndexPrivateData::RestoreWordList( |
| 977 const InMemoryURLIndexCacheItem& cache) { |
| 978 if (!cache.has_word_list()) |
| 979 return false; |
| 980 const WordListItem& list_item(cache.word_list()); |
| 981 uint32 expected_item_count = list_item.word_count(); |
| 982 uint32 actual_item_count = list_item.word_size(); |
| 983 if (actual_item_count == 0 || actual_item_count != expected_item_count) |
| 984 return false; |
| 985 const RepeatedPtrField<std::string>& words(list_item.word()); |
| 986 for (RepeatedPtrField<std::string>::const_iterator iter = words.begin(); |
| 987 iter != words.end(); ++iter) |
| 988 word_list_.push_back(UTF8ToUTF16(*iter)); |
| 989 return true; |
| 990 } |
| 991 |
| 992 bool URLIndexPrivateData::RestoreWordMap( |
| 993 const InMemoryURLIndexCacheItem& cache) { |
| 994 if (!cache.has_word_map()) |
| 995 return false; |
| 996 const WordMapItem& list_item(cache.word_map()); |
| 997 uint32 expected_item_count = list_item.item_count(); |
| 998 uint32 actual_item_count = list_item.word_map_entry_size(); |
| 999 if (actual_item_count == 0 || actual_item_count != expected_item_count) |
| 1000 return false; |
| 1001 const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry()); |
| 1002 for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin(); |
| 1003 iter != entries.end(); ++iter) |
| 1004 word_map_[UTF8ToUTF16(iter->word())] = iter->word_id(); |
| 1005 return true; |
| 1006 } |
| 1007 |
| 1008 bool URLIndexPrivateData::RestoreCharWordMap( |
| 1009 const InMemoryURLIndexCacheItem& cache) { |
| 1010 if (!cache.has_char_word_map()) |
| 1011 return false; |
| 1012 const CharWordMapItem& list_item(cache.char_word_map()); |
| 1013 uint32 expected_item_count = list_item.item_count(); |
| 1014 uint32 actual_item_count = list_item.char_word_map_entry_size(); |
| 1015 if (actual_item_count == 0 || actual_item_count != expected_item_count) |
| 1016 return false; |
| 1017 const RepeatedPtrField<CharWordMapEntry>& |
| 1018 entries(list_item.char_word_map_entry()); |
| 1019 for (RepeatedPtrField<CharWordMapEntry>::const_iterator iter = |
| 1020 entries.begin(); iter != entries.end(); ++iter) { |
| 1021 expected_item_count = iter->item_count(); |
| 1022 actual_item_count = iter->word_id_size(); |
| 1023 if (actual_item_count == 0 || actual_item_count != expected_item_count) |
| 1024 return false; |
| 1025 char16 uni_char = static_cast<char16>(iter->char_16()); |
| 1026 WordIDSet word_id_set; |
| 1027 const RepeatedField<int32>& word_ids(iter->word_id()); |
| 1028 for (RepeatedField<int32>::const_iterator jiter = word_ids.begin(); |
| 1029 jiter != word_ids.end(); ++jiter) |
| 1030 word_id_set.insert(*jiter); |
| 1031 char_word_map_[uni_char] = word_id_set; |
| 1032 } |
| 1033 return true; |
| 1034 } |
| 1035 |
| 1036 bool URLIndexPrivateData::RestoreWordIDHistoryMap( |
| 1037 const InMemoryURLIndexCacheItem& cache) { |
| 1038 if (!cache.has_word_id_history_map()) |
| 1039 return false; |
| 1040 const WordIDHistoryMapItem& list_item(cache.word_id_history_map()); |
| 1041 uint32 expected_item_count = list_item.item_count(); |
| 1042 uint32 actual_item_count = list_item.word_id_history_map_entry_size(); |
| 1043 if (actual_item_count == 0 || actual_item_count != expected_item_count) |
| 1044 return false; |
| 1045 const RepeatedPtrField<WordIDHistoryMapEntry>& |
| 1046 entries(list_item.word_id_history_map_entry()); |
| 1047 for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter = |
| 1048 entries.begin(); iter != entries.end(); ++iter) { |
| 1049 expected_item_count = iter->item_count(); |
| 1050 actual_item_count = iter->history_id_size(); |
| 1051 if (actual_item_count == 0 || actual_item_count != expected_item_count) |
| 1052 return false; |
| 1053 WordID word_id = iter->word_id(); |
| 1054 HistoryIDSet history_id_set; |
| 1055 const RepeatedField<int64>& history_ids(iter->history_id()); |
| 1056 for (RepeatedField<int64>::const_iterator jiter = history_ids.begin(); |
| 1057 jiter != history_ids.end(); ++jiter) { |
| 1058 history_id_set.insert(*jiter); |
| 1059 AddToHistoryIDWordMap(*jiter, word_id); |
| 1060 } |
| 1061 word_id_history_map_[word_id] = history_id_set; |
| 1062 } |
| 1063 return true; |
| 1064 } |
| 1065 |
| 1066 bool URLIndexPrivateData::RestoreHistoryInfoMap( |
| 1067 const InMemoryURLIndexCacheItem& cache) { |
| 1068 if (!cache.has_history_info_map()) |
| 1069 return false; |
| 1070 const HistoryInfoMapItem& list_item(cache.history_info_map()); |
| 1071 uint32 expected_item_count = list_item.item_count(); |
| 1072 uint32 actual_item_count = list_item.history_info_map_entry_size(); |
| 1073 if (actual_item_count == 0 || actual_item_count != expected_item_count) |
| 1074 return false; |
| 1075 const RepeatedPtrField<HistoryInfoMapEntry>& |
| 1076 entries(list_item.history_info_map_entry()); |
| 1077 for (RepeatedPtrField<HistoryInfoMapEntry>::const_iterator iter = |
| 1078 entries.begin(); iter != entries.end(); ++iter) { |
| 1079 HistoryID history_id = iter->history_id(); |
| 1080 GURL url(iter->url()); |
| 1081 URLRow url_row(url, history_id); |
| 1082 url_row.set_visit_count(iter->visit_count()); |
| 1083 url_row.set_typed_count(iter->typed_count()); |
| 1084 url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit())); |
| 1085 if (iter->has_title()) { |
| 1086 string16 title(UTF8ToUTF16(iter->title())); |
| 1087 url_row.set_title(title); |
| 1088 } |
| 1089 history_info_map_[history_id] = url_row; |
| 1090 } |
| 1091 return true; |
| 1092 } |
| 1093 |
| 1094 bool URLIndexPrivateData::RestoreWordStartsMap( |
| 1095 const InMemoryURLIndexCacheItem& cache, |
| 1096 const std::string& languages) { |
| 1097 // Note that this function must be called after RestoreHistoryInfoMap() has |
| 1098 // been run as the word starts may have to be recalculated from the urls and |
| 1099 // page titles. |
| 1100 if (cache.has_word_starts_map()) { |
| 1101 const WordStartsMapItem& list_item(cache.word_starts_map()); |
| 1102 uint32 expected_item_count = list_item.item_count(); |
| 1103 uint32 actual_item_count = list_item.word_starts_map_entry_size(); |
| 1104 if (actual_item_count == 0 || actual_item_count != expected_item_count) |
| 1105 return false; |
| 1106 const RepeatedPtrField<WordStartsMapEntry>& |
| 1107 entries(list_item.word_starts_map_entry()); |
| 1108 for (RepeatedPtrField<WordStartsMapEntry>::const_iterator iter = |
| 1109 entries.begin(); iter != entries.end(); ++iter) { |
| 1110 HistoryID history_id = iter->history_id(); |
| 1111 RowWordStarts word_starts; |
| 1112 // Restore the URL word starts. |
| 1113 const RepeatedField<int32>& url_starts(iter->url_word_starts()); |
| 1114 for (RepeatedField<int32>::const_iterator jiter = url_starts.begin(); |
| 1115 jiter != url_starts.end(); ++jiter) |
| 1116 word_starts.url_word_starts_.push_back(*jiter); |
| 1117 // Restore the page title word starts. |
| 1118 const RepeatedField<int32>& title_starts(iter->title_word_starts()); |
| 1119 for (RepeatedField<int32>::const_iterator jiter = title_starts.begin(); |
| 1120 jiter != title_starts.end(); ++jiter) |
| 1121 word_starts.title_word_starts_.push_back(*jiter); |
| 1122 word_starts_map_[history_id] = word_starts; |
| 1123 } |
| 1124 } else { |
| 1125 // Since the cache did not contain any word starts we must rebuild then from |
| 1126 // the URL and page titles. |
| 1127 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin(); |
| 1128 iter != history_info_map_.end(); ++iter) { |
| 1129 RowWordStarts word_starts; |
| 1130 const URLRow& row(iter->second); |
| 1131 string16 url(net::FormatUrl(row.url(), languages, |
| 1132 net::kFormatUrlOmitUsernamePassword, |
| 1133 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS, |
| 1134 NULL, NULL, NULL)); |
| 1135 url = base::i18n::ToLower(url); |
| 1136 String16VectorFromString16(url, false, &word_starts.url_word_starts_); |
| 1137 String16VectorFromString16( |
| 1138 row.title(), false, &word_starts.title_word_starts_); |
| 1139 word_starts_map_[iter->first] = word_starts; |
| 1140 } |
| 1141 } |
| 1142 return true; |
1083 } | 1143 } |
1084 | 1144 |
1085 // static | 1145 // static |
1086 bool URLIndexPrivateData::URLSchemeIsWhitelisted( | 1146 bool URLIndexPrivateData::URLSchemeIsWhitelisted( |
1087 const GURL& gurl, | 1147 const GURL& gurl, |
1088 const std::set<std::string>& whitelist) { | 1148 const std::set<std::string>& whitelist) { |
1089 return whitelist.find(gurl.scheme()) != whitelist.end(); | 1149 return whitelist.find(gurl.scheme()) != whitelist.end(); |
1090 } | 1150 } |
1091 | 1151 |
1092 } // namespace history | 1152 } // namespace history |
OLD | NEW |