OLD | NEW |
---|---|
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/browser/history/url_index_private_data.h" | 5 #include "chrome/browser/history/url_index_private_data.h" |
6 | 6 |
7 #include <algorithm> | 7 #include <algorithm> |
8 #include <functional> | 8 #include <functional> |
9 #include <iterator> | 9 #include <iterator> |
10 #include <limits> | 10 #include <limits> |
11 #include <numeric> | 11 #include <numeric> |
12 #include <vector> | 12 #include <vector> |
13 | 13 |
14 #include "base/basictypes.h" | 14 #include "base/basictypes.h" |
15 #include "base/file_util.h" | 15 #include "base/file_util.h" |
16 #include "base/i18n/case_conversion.h" | 16 #include "base/i18n/case_conversion.h" |
17 #include "base/metrics/histogram.h" | |
18 #include "base/string_util.h" | 17 #include "base/string_util.h" |
19 #include "base/time.h" | |
20 #include "base/utf_string_conversions.h" | 18 #include "base/utf_string_conversions.h" |
21 #include "chrome/browser/autocomplete/autocomplete_provider.h" | 19 #include "chrome/browser/autocomplete/autocomplete_provider.h" |
22 #include "chrome/browser/autocomplete/url_prefix.h" | |
23 #include "chrome/browser/history/history_database.h" | 20 #include "chrome/browser/history/history_database.h" |
24 #include "chrome/browser/history/in_memory_url_index.h" | 21 #include "chrome/browser/history/in_memory_url_cache_database.h" |
22 #include "chrome/common/chrome_constants.h" | |
23 #include "chrome/common/url_constants.h" | |
25 #include "content/public/browser/browser_thread.h" | 24 #include "content/public/browser/browser_thread.h" |
26 #include "content/public/browser/notification_details.h" | 25 #include "content/public/browser/notification_details.h" |
27 #include "content/public/browser/notification_service.h" | 26 #include "content/public/browser/notification_service.h" |
28 #include "content/public/browser/notification_source.h" | 27 #include "content/public/browser/notification_source.h" |
29 #include "net/base/net_util.h" | 28 #include "net/base/net_util.h" |
30 #include "third_party/protobuf/src/google/protobuf/repeated_field.h" | 29 |
31 | 30 using content::BrowserThread; |
32 using google::protobuf::RepeatedField; | |
33 using google::protobuf::RepeatedPtrField; | |
34 using in_memory_url_index::InMemoryURLIndexCacheItem; | |
35 | 31 |
36 namespace history { | 32 namespace history { |
37 | 33 |
38 typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem; | 34 // Comparison function for sorting search terms by descending length. |
39 typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry; | 35 bool LengthGreater(const string16& string_a, const string16& string_b) { |
40 typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem; | 36 return string_a.length() > string_b.length(); |
41 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem; | 37 } |
42 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry | 38 |
43 CharWordMapEntry; | 39 // Initializes a whitelist of URL schemes. |
44 typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem | 40 void InitializeSchemeWhitelist(std::set<std::string>* whitelist) { |
45 WordIDHistoryMapItem; | 41 DCHECK(whitelist); |
46 typedef imui:: | 42 if (!whitelist->empty()) |
47 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry | 43 return; // Nothing to do, already initialized. |
48 WordIDHistoryMapEntry; | 44 whitelist->insert(std::string(chrome::kAboutScheme)); |
49 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem; | 45 whitelist->insert(std::string(chrome::kChromeUIScheme)); |
50 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry | 46 whitelist->insert(std::string(chrome::kFileScheme)); |
51 HistoryInfoMapEntry; | 47 whitelist->insert(std::string(chrome::kFtpScheme)); |
52 typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem WordStartsMapItem; | 48 whitelist->insert(std::string(chrome::kHttpScheme)); |
53 typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem_WordStartsMapEntry | 49 whitelist->insert(std::string(chrome::kHttpsScheme)); |
54 WordStartsMapEntry; | 50 whitelist->insert(std::string(chrome::kMailToScheme)); |
51 } | |
52 | |
53 // CacheTransaction ------------------------------------------------------------ | |
54 | |
55 // Simple automatic helper class encapsulating calls to BeginTransaction/ | |
56 // CommitTransaction. | |
57 class CacheTransaction { | |
58 public: | |
59 explicit CacheTransaction(InMemoryURLCacheDatabase* db); | |
60 ~CacheTransaction(); | |
61 | |
62 private: | |
63 InMemoryURLCacheDatabase* db_; | |
64 | |
65 DISALLOW_COPY_AND_ASSIGN(CacheTransaction); | |
66 }; | |
67 | |
68 CacheTransaction::CacheTransaction(InMemoryURLCacheDatabase* db) | |
69 : db_(db) { | |
70 if (db_) | |
71 db_->BeginTransaction(); | |
72 } | |
73 | |
74 CacheTransaction::~CacheTransaction() { | |
75 if (db_) | |
76 db_->CommitTransaction(); | |
77 } | |
78 | |
79 // URLIndexPrivateData Public (but only to InMemoryURLIndex) Functions --------- | |
80 | |
81 URLIndexPrivateData::URLIndexPrivateData(const FilePath& history_dir, | |
82 const std::string& languages) | |
83 : history_dir_(history_dir), | |
84 languages_(languages), | |
85 cache_enabled_(true), | |
86 shutdown_(false), | |
87 lock_(), | |
88 pre_filter_item_count_(0), | |
89 post_filter_item_count_(0), | |
90 post_scoring_item_count_(0) { | |
91 InitializeSchemeWhitelist(&scheme_whitelist_); | |
92 } | |
93 | |
94 bool URLIndexPrivateData::Init( | |
95 base::SequencedWorkerPool::SequenceToken sequence_token) { | |
96 // Since init occurs on the DB thread, it is possible that the profile was | |
97 // told to shutdown before this task had a chance to kick off. | |
98 base::AutoLock lock(lock_); | |
99 if (shutdown_) | |
100 return false; | |
101 cache_db_ = new InMemoryURLCacheDatabase(); | |
102 FilePath dir_path; | |
103 set_cache_enabled(GetCacheDBPath(&dir_path) && | |
104 cache_db_->Init(dir_path, sequence_token)); | |
105 return cache_enabled(); | |
106 } | |
107 | |
108 void URLIndexPrivateData::Reset() { | |
109 Clear(); | |
110 if (cache_enabled()) | |
111 cache_db_->Reset(); | |
112 } | |
113 | |
114 bool URLIndexPrivateData::Empty() const { | |
115 return history_info_map_.empty(); | |
116 } | |
117 | |
118 URLIndexPrivateData* URLIndexPrivateData::Snapshot() const { | |
119 return new URLIndexPrivateData(*this); | |
120 } | |
121 | |
122 void URLIndexPrivateData::Shutdown() { | |
123 base::AutoLock lock(lock_); | |
124 shutdown_ = true; | |
125 if (cache_enabled()) | |
126 cache_db_->Shutdown(); | |
127 } | |
128 | |
129 // Helper predicates for ValidateConsistency. | |
130 template <typename Pair> | |
131 struct SelectFirst : std::unary_function<Pair, typename Pair::first_type> { | |
132 const typename Pair::first_type& operator()(const Pair& x) const { | |
133 return x.first; | |
134 } | |
135 }; | |
136 | |
137 template <typename Pair> | |
138 struct SelectSecond : std::unary_function<Pair, typename Pair::second_type> { | |
139 const typename Pair::second_type& operator()(const Pair& x) const { | |
140 return x.second; | |
141 } | |
142 }; | |
143 | |
144 template <typename SourcePair, typename Target> | |
145 struct TargetInserter : std::unary_function<SourcePair, void> { | |
146 explicit TargetInserter(Target* target) : target_(target) {} | |
147 | |
148 void operator()(const SourcePair& source) { | |
149 target_->insert(source.second.begin(), source.second.end()); | |
150 } | |
151 | |
152 Target* target_; | |
153 }; | |
154 | |
155 bool URLIndexPrivateData::ValidateConsistency() const { | |
156 // Scope things so that a large data set doesn't unnecessarily accumulate and | |
157 // hang around. | |
158 { | |
159 // Make a set of WordIDs from word_map_. | |
160 WordIDSet word_id_set_a; | |
161 std::transform(word_map_.begin(), word_map_.end(), | |
162 std::inserter(word_id_set_a, word_id_set_a.begin()), | |
163 SelectSecond<WordMap::value_type>()); | |
164 | |
165 // Compare word_map_'s word set to the words from word_id_history_map_. | |
166 { | |
167 WordIDSet word_id_set_b; | |
168 std::transform(word_id_history_map_.begin(), word_id_history_map_.end(), | |
169 std::inserter(word_id_set_b, word_id_set_b.begin()), | |
170 SelectFirst<WordIDHistoryMap::value_type>()); | |
171 WordIDSet leftovers; | |
172 std::set_symmetric_difference( | |
173 word_id_set_a.begin(), word_id_set_a.end(), | |
174 word_id_set_b.begin(), word_id_set_b.end(), | |
175 std::inserter(leftovers, leftovers.begin())); | |
176 if (!leftovers.empty()) | |
177 return false; | |
178 } | |
179 | |
180 // Compare word_map_'s word set to the words from history_id_word_map_. | |
181 { | |
182 WordIDSet word_id_set_b; | |
183 std::for_each(history_id_word_map_.begin(), history_id_word_map_.end(), | |
184 TargetInserter<HistoryIDWordMap::value_type, | |
185 WordIDSet>(&word_id_set_b)); | |
186 WordIDSet leftovers; | |
187 std::set_symmetric_difference( | |
188 word_id_set_a.begin(), word_id_set_a.end(), | |
189 word_id_set_b.begin(), word_id_set_b.end(), | |
190 std::inserter(leftovers, leftovers.begin())); | |
191 if (!leftovers.empty()) | |
192 return false; | |
193 } | |
194 | |
195 // Compare word_map_'s word set to the words from char_word_map_. | |
196 { | |
197 WordIDSet word_id_set_b; | |
198 std::for_each(char_word_map_.begin(), char_word_map_.end(), | |
199 TargetInserter<CharWordIDMap::value_type, | |
200 WordIDSet>(&word_id_set_b)); | |
201 WordIDSet leftovers; | |
202 std::set_symmetric_difference( | |
203 word_id_set_a.begin(), word_id_set_a.end(), | |
204 word_id_set_b.begin(), word_id_set_b.end(), | |
205 std::inserter(leftovers, leftovers.begin())); | |
206 if (!leftovers.empty()) | |
207 return false; | |
208 } | |
209 | |
210 // Intersect available_words_ with set of WordIDs (created above from | |
211 // word_map_) and test for no common WordIDs. | |
212 { | |
213 WordIDSet leftovers; | |
214 std::set_intersection(word_id_set_a.begin(), word_id_set_a.end(), | |
215 available_words_.begin(), available_words_.end(), | |
216 std::inserter(leftovers, leftovers.begin())); | |
217 if (!leftovers.empty()) | |
218 return false; | |
219 } | |
220 } | |
221 | |
222 { | |
223 // Make a set of HistoryIDs from history_info_map_. | |
224 HistoryIDSet history_id_set_a; | |
225 std::transform(history_info_map_.begin(), history_info_map_.end(), | |
226 std::inserter(history_id_set_a, history_id_set_a.begin()), | |
227 SelectFirst<HistoryInfoMap::value_type>()); | |
228 | |
229 // Compare history_info_map_'s set to word_id_history_map_'s HistoryIDs. | |
230 { | |
231 HistoryIDSet history_id_set_b; | |
232 std::transform(history_info_map_.begin(), history_info_map_.end(), | |
233 std::inserter(history_id_set_b, history_id_set_b.begin()), | |
234 SelectFirst<HistoryInfoMap::value_type>()); | |
235 HistoryIDSet leftovers; | |
236 std::set_symmetric_difference( | |
237 history_id_set_a.begin(), history_id_set_a.end(), | |
238 history_id_set_b.begin(), history_id_set_b.end(), | |
239 std::inserter(leftovers, leftovers.begin())); | |
240 if (!leftovers.empty()) | |
241 return false; | |
242 } | |
243 | |
244 // Compare history_info_map_'s set to word_id_history_map_'s HistoryIDs. | |
245 { | |
246 HistoryIDSet history_id_set_b; | |
247 std::for_each(word_id_history_map_.begin(), word_id_history_map_.end(), | |
248 TargetInserter<WordIDHistoryMap::value_type, | |
249 HistoryIDSet>(&history_id_set_b)); | |
250 HistoryIDSet leftovers; | |
251 std::set_symmetric_difference( | |
252 history_id_set_a.begin(), history_id_set_a.end(), | |
253 history_id_set_b.begin(), history_id_set_b.end(), | |
254 std::inserter(leftovers, leftovers.begin())); | |
255 if (!leftovers.empty()) | |
256 return false; | |
257 } | |
258 | |
259 // Compare history_info_map_'s set to word_starts_map_'s HistoryIDs. | |
260 { | |
261 HistoryIDSet history_id_set_b; | |
262 std::transform(word_starts_map_.begin(), word_starts_map_.end(), | |
263 std::inserter(history_id_set_b, history_id_set_b.begin()), | |
264 SelectFirst<WordStartsMap::value_type>()); | |
265 HistoryIDSet leftovers; | |
266 std::set_symmetric_difference( | |
267 history_id_set_a.begin(), history_id_set_a.end(), | |
268 history_id_set_b.begin(), history_id_set_b.end(), | |
269 std::inserter(leftovers, leftovers.begin())); | |
270 if (!leftovers.empty()) | |
271 return false; | |
272 } | |
273 } | |
274 | |
275 // Make sets of words from word_list_ and word_map_ and test for equality. | |
276 { | |
277 String16Set word_list_set; | |
278 std::copy(word_list_.begin(), word_list_.end(), | |
279 std::inserter(word_list_set, word_list_set.end())); | |
280 String16Set word_map_set; | |
281 std::transform(word_map_.begin(), word_map_.end(), | |
282 std::inserter(word_map_set, word_map_set.begin()), | |
283 SelectFirst<WordMap::value_type>()); | |
284 if (word_list_set != word_map_set) | |
285 return false; | |
286 } | |
287 | |
288 return true; | |
289 } | |
290 | |
291 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms( | |
292 const string16& search_string) { | |
293 pre_filter_item_count_ = 0; | |
294 post_filter_item_count_ = 0; | |
295 post_scoring_item_count_ = 0; | |
296 // The search string we receive may contain escaped characters. For reducing | |
297 // the index we need individual, lower-cased words, ignoring escapings. For | |
298 // the final filtering we need whitespace separated substrings possibly | |
299 // containing escaped characters. | |
300 string16 lower_raw_string(base::i18n::ToLower(search_string)); | |
301 string16 lower_unescaped_string = | |
302 net::UnescapeURLComponent(lower_raw_string, | |
303 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS); | |
304 // Extract individual 'words' (as opposed to 'terms'; see below) from the | |
305 // search string. When the user types "colspec=ID%20Mstone Release" we get | |
306 // four 'words': "colspec", "id", "mstone" and "release". | |
307 String16Vector lower_words( | |
308 history::String16VectorFromString16(lower_unescaped_string, false, NULL)); | |
309 ScoredHistoryMatches scored_items; | |
310 | |
311 // Do nothing if we have indexed no words (probably because we've not been | |
312 // initialized yet) or the search string has no words. | |
313 if (word_list_.empty() || lower_words.empty()) { | |
314 search_term_cache_.clear(); // Invalidate the term cache. | |
315 return scored_items; | |
316 } | |
317 | |
318 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep | |
319 // approach. | |
320 ResetSearchTermCache(); | |
321 | |
322 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words); | |
323 | |
324 // Trim the candidate pool if it is large. Note that we do not filter out | |
325 // items that do not contain the search terms as proper substrings -- doing | |
326 // so is the performance-costly operation we are trying to avoid in order | |
327 // to maintain omnibox responsiveness. | |
328 const size_t kItemsToScoreLimit = 500; | |
329 pre_filter_item_count_ = history_id_set.size(); | |
330 // If we trim the results set we do not want to cache the results for next | |
331 // time as the user's ultimately desired result could easily be eliminated | |
332 // in this early rough filter. | |
333 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit); | |
334 if (was_trimmed) { | |
335 HistoryIDVector history_ids; | |
336 std::copy(history_id_set.begin(), history_id_set.end(), | |
337 std::back_inserter(history_ids)); | |
338 // Trim down the set by sorting by typed-count, visit-count, and last | |
339 // visit. | |
340 HistoryItemFactorGreater | |
341 item_factor_functor(history_info_map_); | |
342 std::partial_sort(history_ids.begin(), | |
343 history_ids.begin() + kItemsToScoreLimit, | |
344 history_ids.end(), | |
345 item_factor_functor); | |
346 history_id_set.clear(); | |
347 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit, | |
348 std::inserter(history_id_set, history_id_set.end())); | |
349 post_filter_item_count_ = history_id_set.size(); | |
350 } | |
351 | |
352 // Pass over all of the candidates filtering out any without a proper | |
353 // substring match, inserting those which pass in order by score. Note that | |
354 // in this step we are using the raw search string complete with escaped | |
355 // URL elements. When the user has specifically typed something akin to | |
356 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that | |
357 // specific substring appears in the URL or page title. | |
358 | |
359 // We call these 'terms' (as opposed to 'words'; see above) as in this case | |
360 // we only want to break up the search string on 'true' whitespace rather than | |
361 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we | |
362 // get two 'terms': "colspec=id%20mstone" and "release". | |
363 history::String16Vector lower_raw_terms; | |
364 Tokenize(lower_raw_string, kWhitespaceUTF16, &lower_raw_terms); | |
365 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(), | |
366 AddHistoryMatch(*this, lower_raw_string, | |
367 lower_raw_terms, base::Time::Now())).ScoredMatches(); | |
368 | |
369 // Select and sort only the top kMaxMatches results. | |
370 if (scored_items.size() > AutocompleteProvider::kMaxMatches) { | |
371 std::partial_sort(scored_items.begin(), | |
372 scored_items.begin() + | |
373 AutocompleteProvider::kMaxMatches, | |
374 scored_items.end(), | |
375 ScoredHistoryMatch::MatchScoreGreater); | |
376 scored_items.resize(AutocompleteProvider::kMaxMatches); | |
377 } else { | |
378 std::sort(scored_items.begin(), scored_items.end(), | |
379 ScoredHistoryMatch::MatchScoreGreater); | |
380 } | |
381 post_scoring_item_count_ = scored_items.size(); | |
382 | |
383 if (was_trimmed) { | |
384 search_term_cache_.clear(); // Invalidate the term cache. | |
385 } else { | |
386 // Remove any stale SearchTermCacheItems. | |
387 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); | |
388 cache_iter != search_term_cache_.end(); ) { | |
389 if (!cache_iter->second.used_) | |
390 search_term_cache_.erase(cache_iter++); | |
391 else | |
392 ++cache_iter; | |
393 } | |
394 } | |
395 | |
396 return scored_items; | |
397 } | |
398 | |
399 bool URLIndexPrivateData::UpdateURL(const URLRow& row) { | |
400 // The row may or may not already be in our index. If it is not already | |
401 // indexed and it qualifies then it gets indexed. If it is already | |
402 // indexed and still qualifies then it gets updated, otherwise it | |
403 // is deleted from the index. | |
404 bool row_was_updated = false; | |
405 URLID row_id = row.id(); | |
406 HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id); | |
407 if (row_pos == history_info_map_.end()) { | |
408 // This new row should be indexed if it qualifies. | |
409 URLRow new_row(row); | |
410 new_row.set_id(row_id); | |
411 row_was_updated = RowQualifiesAsSignificant(new_row, base::Time()) && | |
412 IndexRow(new_row); | |
413 } else if (RowQualifiesAsSignificant(row, base::Time())) { | |
414 // This indexed row still qualifies and will be re-indexed. | |
415 // The url won't have changed but the title, visit count, etc. | |
416 // might have changed. | |
417 URLRow& row_to_update = row_pos->second; | |
418 bool title_updated = row_to_update.title() != row.title(); | |
419 if (row_to_update.visit_count() != row.visit_count() || | |
420 row_to_update.typed_count() != row.typed_count() || | |
421 row_to_update.last_visit() != row.last_visit() || title_updated) { | |
422 CacheTransaction cache_transaction(cache_enabled() ? | |
423 cache_db_.get() : NULL); | |
424 row_to_update.set_visit_count(row.visit_count()); | |
425 row_to_update.set_typed_count(row.typed_count()); | |
426 row_to_update.set_last_visit(row.last_visit()); | |
427 row_to_update.set_hidden(row.hidden()); | |
428 // While the URL is guaranteed to remain stable, the title may have | |
429 // changed. If so, then update the index with the changed words. | |
430 if (title_updated) { | |
431 // Clear all words associated with this row and re-index both the | |
432 // URL and title. | |
433 RemoveRowWordsFromIndex(row_to_update); | |
434 row_to_update.set_title(row.title()); | |
435 RowWordStarts word_starts; | |
436 AddRowWordsToIndex(row_to_update, &word_starts); | |
437 word_starts_map_[row_id] = word_starts; | |
438 // Replace all word_starts for the row. | |
439 if (cache_enabled()) { | |
440 cache_db_->RemoveWordStarts(row_id); | |
441 cache_db_->AddRowWordStarts(row_id, word_starts); | |
442 } | |
443 } | |
444 if (cache_enabled()) { | |
445 cache_db_->RemoveHistoryIDFromURLs(row_id); | |
446 cache_db_->AddHistoryToURLs(row_id, row); | |
447 } | |
448 row_was_updated = true; | |
449 } | |
450 } else { | |
451 // This indexed row no longer qualifies and will be de-indexed by | |
452 // clearing all words associated with this row. | |
453 RemoveRowFromIndex(row); | |
454 row_was_updated = true; | |
455 } | |
456 if (row_was_updated) | |
457 search_term_cache_.clear(); // This invalidates the cache. | |
458 return row_was_updated; | |
459 } | |
460 | |
461 // Helper functor for DeleteURL. | |
462 class HistoryInfoMapItemHasURL { | |
463 public: | |
464 explicit HistoryInfoMapItemHasURL(const GURL& url): url_(url) {} | |
465 | |
466 bool operator()(const std::pair<const HistoryID, URLRow>& item) { | |
467 return item.second.url() == url_; | |
468 } | |
469 | |
470 private: | |
471 const GURL& url_; | |
472 }; | |
473 | |
474 bool URLIndexPrivateData::DeleteURL(const GURL& url) { | |
475 DCHECK(!BrowserThread::IsWellKnownThread(BrowserThread::UI) || | |
476 BrowserThread::CurrentlyOn(BrowserThread::UI)); | |
477 // Find the matching entry in the history_info_map_. | |
478 HistoryInfoMap::iterator pos = std::find_if( | |
479 history_info_map_.begin(), | |
480 history_info_map_.end(), | |
481 HistoryInfoMapItemHasURL(url)); | |
482 if (pos == history_info_map_.end()) | |
483 return false; | |
484 RemoveRowFromIndex(pos->second); | |
485 search_term_cache_.clear(); // This invalidates the cache. | |
486 return true; | |
487 } | |
488 | |
489 bool URLIndexPrivateData::RestoreFromCacheTask() { | |
490 DCHECK(!BrowserThread::IsWellKnownThread(BrowserThread::UI) || | |
491 !BrowserThread::CurrentlyOn(BrowserThread::UI)); | |
492 DCHECK(cache_db_); | |
493 if (IsShutdown()) | |
494 return false; | |
495 Clear(); | |
496 if (RestoreFromCache(cache_db_)) | |
497 return true; | |
498 // If there was no SQLite-based cache then there might be an old, | |
499 // deprecated, protobuf-based cache file laying around. Get rid of it. | |
500 DeleteOldVersionCacheFile(); | |
501 return false; | |
502 } | |
503 | |
504 // static | |
505 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RebuildFromHistory( | |
506 HistoryDatabase* history_db, | |
507 scoped_refptr<URLIndexPrivateData> old_data) { | |
508 DCHECK(!BrowserThread::IsWellKnownThread(BrowserThread::UI) || | |
509 !BrowserThread::CurrentlyOn(BrowserThread::UI)); | |
510 if (!history_db) | |
511 return NULL; | |
512 | |
513 scoped_refptr<URLIndexPrivateData> rebuilt_data( | |
514 new URLIndexPrivateData(*(old_data.get()))); | |
515 | |
516 // NOTE: We disable the cache database until after the replacement private | |
517 // data has been created and then we re-enable the database. Once the private | |
518 // data has been swapped in the database is refreshed with the new data. | |
519 URLDatabase::URLEnumerator history_enum; | |
520 if (!history_db->InitURLEnumeratorForSignificant(&history_enum)) | |
521 return NULL; | |
522 | |
523 for (URLRow row; !old_data->IsShutdown() && history_enum.GetNextURL(&row); ) | |
524 rebuilt_data->IndexRow(row); | |
525 | |
526 if (old_data->IsShutdown()) { | |
527 rebuilt_data.release(); | |
528 } | |
529 | |
530 return rebuilt_data; | |
531 } | |
532 | |
533 void URLIndexPrivateData::RefreshCacheTask() { | |
534 DCHECK(!BrowserThread::IsWellKnownThread(BrowserThread::UI) || | |
535 !BrowserThread::CurrentlyOn(BrowserThread::UI)); | |
536 base::AutoLock lock(lock_); | |
537 if (shutdown_ || !cache_enabled()) | |
538 return; | |
539 // Should a refresh fail for any reason we consider the database to be | |
540 // corrupt and unavailable; no further remedial action is possible. | |
541 set_cache_enabled(cache_db_->Reset() && cache_db_->Refresh(*this)); | |
542 } | |
543 | |
544 // static | |
545 void URLIndexPrivateData::InitializeSchemeWhitelistForTesting( | |
546 std::set<std::string>* whitelist) { | |
547 InitializeSchemeWhitelist(whitelist); | |
548 } | |
55 | 549 |
56 // SearchTermCacheItem --------------------------------------------------------- | 550 // SearchTermCacheItem --------------------------------------------------------- |
57 | 551 |
58 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem( | 552 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem( |
59 const WordIDSet& word_id_set, | 553 const WordIDSet& word_id_set, |
60 const HistoryIDSet& history_id_set) | 554 const HistoryIDSet& history_id_set) |
61 : word_id_set_(word_id_set), | 555 : word_id_set_(word_id_set), |
62 history_id_set_(history_id_set), | 556 history_id_set_(history_id_set), |
63 used_(true) {} | 557 used_(true) {} |
64 | 558 |
65 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() | 559 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() |
66 : used_(true) {} | 560 : used_(true) {} |
67 | 561 |
68 URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() {} | 562 URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() {} |
69 | 563 |
70 // Algorithm Functions --------------------------------------------------------- | 564 // URLIndexPrivateData::AddHistoryMatch ---------------------------------------- |
71 | 565 |
72 // Comparison function for sorting search terms by descending length. | 566 URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch( |
73 bool LengthGreater(const string16& string_a, const string16& string_b) { | 567 const URLIndexPrivateData& private_data, |
74 return string_a.length() > string_b.length(); | 568 const string16& lower_string, |
569 const String16Vector& lower_terms, | |
570 const base::Time now) | |
571 : private_data_(private_data), | |
572 lower_string_(lower_string), | |
573 lower_terms_(lower_terms), | |
574 now_(now) {} | |
575 | |
576 URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() {} | |
577 | |
578 void URLIndexPrivateData::AddHistoryMatch::operator()( | |
579 const HistoryID history_id) { | |
580 HistoryInfoMap::const_iterator hist_pos = | |
581 private_data_.history_info_map_.find(history_id); | |
582 if (hist_pos != private_data_.history_info_map_.end()) { | |
583 const URLRow& hist_item = hist_pos->second; | |
584 WordStartsMap::const_iterator starts_pos = | |
585 private_data_.word_starts_map_.find(history_id); | |
586 DCHECK(starts_pos != private_data_.word_starts_map_.end()); | |
587 ScoredHistoryMatch match(hist_item, lower_string_, lower_terms_, | |
588 starts_pos->second, now_); | |
589 if (match.raw_score > 0) | |
590 scored_matches_.push_back(match); | |
591 } | |
75 } | 592 } |
76 | 593 |
77 // InMemoryURLIndex's Private Data --------------------------------------------- | 594 // URLIndexPrivateData::HistoryItemFactorGreater ------------------------------- |
78 | 595 |
79 URLIndexPrivateData::URLIndexPrivateData() | 596 URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater( |
80 : restored_cache_version_(0), | 597 const HistoryInfoMap& history_info_map) |
81 saved_cache_version_(kCurrentCacheFileVersion), | 598 : history_info_map_(history_info_map) { |
599 } | |
600 | |
601 URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() {} | |
602 | |
603 bool URLIndexPrivateData::HistoryItemFactorGreater::operator()( | |
604 const HistoryID h1, | |
605 const HistoryID h2) { | |
606 HistoryInfoMap::const_iterator entry1(history_info_map_.find(h1)); | |
607 if (entry1 == history_info_map_.end()) | |
608 return false; | |
609 HistoryInfoMap::const_iterator entry2(history_info_map_.find(h2)); | |
610 if (entry2 == history_info_map_.end()) | |
611 return true; | |
612 const URLRow& r1(entry1->second); | |
613 const URLRow& r2(entry2->second); | |
614 // First cut: typed count, visit count, recency. | |
615 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks | |
616 // recently visited (within the last 12/24 hours) as highly important. Get | |
617 // input from mpearson. | |
618 if (r1.typed_count() != r2.typed_count()) | |
619 return (r1.typed_count() > r2.typed_count()); | |
620 if (r1.visit_count() != r2.visit_count()) | |
621 return (r1.visit_count() > r2.visit_count()); | |
622 return (r1.last_visit() > r2.last_visit()); | |
623 } | |
624 | |
625 // URLIndexPrivateData Private Functions --------------------------------------- | |
626 | |
627 URLIndexPrivateData::URLIndexPrivateData(const URLIndexPrivateData& old_data) | |
628 : history_dir_(old_data.history_dir_), | |
629 languages_(old_data.languages_), | |
630 cache_enabled_(old_data.cache_enabled_), | |
631 shutdown_(old_data.IsShutdown()), | |
632 lock_(), | |
633 scheme_whitelist_(old_data.scheme_whitelist_), | |
82 pre_filter_item_count_(0), | 634 pre_filter_item_count_(0), |
83 post_filter_item_count_(0), | 635 post_filter_item_count_(0), |
84 post_scoring_item_count_(0) { | 636 post_scoring_item_count_(0) { |
637 cache_db_ = old_data.cache_db_; | |
638 } | |
639 | |
640 // Called only by unit tests. | |
641 URLIndexPrivateData::URLIndexPrivateData() | |
642 : cache_enabled_(true), | |
643 shutdown_(false), | |
644 lock_(), | |
645 pre_filter_item_count_(0), | |
646 post_filter_item_count_(0), | |
647 post_scoring_item_count_(0) { | |
648 InitializeSchemeWhitelist(&scheme_whitelist_); | |
85 } | 649 } |
86 | 650 |
87 URLIndexPrivateData::~URLIndexPrivateData() {} | 651 URLIndexPrivateData::~URLIndexPrivateData() {} |
88 | 652 |
653 bool URLIndexPrivateData::IsShutdown() const { | |
654 base::AutoLock lock(lock_); | |
655 return shutdown_; | |
sky
2012/08/22 19:42:34
nit: spacing.
| |
656 } | |
657 | |
89 void URLIndexPrivateData::Clear() { | 658 void URLIndexPrivateData::Clear() { |
90 word_list_.clear(); | 659 word_list_.clear(); |
91 available_words_.clear(); | 660 available_words_.clear(); |
92 word_map_.clear(); | 661 word_map_.clear(); |
93 char_word_map_.clear(); | 662 char_word_map_.clear(); |
94 word_id_history_map_.clear(); | 663 word_id_history_map_.clear(); |
95 history_id_word_map_.clear(); | 664 history_id_word_map_.clear(); |
96 history_info_map_.clear(); | 665 history_info_map_.clear(); |
97 word_starts_map_.clear(); | 666 word_starts_map_.clear(); |
98 } | 667 } |
99 | 668 |
100 bool URLIndexPrivateData::Empty() const { | |
101 return history_info_map_.empty(); | |
102 } | |
103 | |
104 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::Duplicate() const { | |
105 scoped_refptr<URLIndexPrivateData> data_copy = new URLIndexPrivateData; | |
106 data_copy->word_list_ = word_list_; | |
107 data_copy->available_words_ = available_words_; | |
108 data_copy->word_map_ = word_map_; | |
109 data_copy->char_word_map_ = char_word_map_; | |
110 data_copy->word_id_history_map_ = word_id_history_map_; | |
111 data_copy->history_id_word_map_ = history_id_word_map_; | |
112 data_copy->history_info_map_ = history_info_map_; | |
113 data_copy->word_starts_map_ = word_starts_map_; | |
114 return data_copy; | |
115 // Not copied: | |
116 // search_term_cache_ | |
117 // pre_filter_item_count_ | |
118 // post_filter_item_count_ | |
119 // post_scoring_item_count_ | |
120 }; | |
121 | |
122 // Cache Updating -------------------------------------------------------------- | 669 // Cache Updating -------------------------------------------------------------- |
123 | 670 |
124 bool URLIndexPrivateData::IndexRow( | 671 bool URLIndexPrivateData::IndexRow(const URLRow& row) { |
125 const URLRow& row, | |
126 const std::string& languages, | |
127 const std::set<std::string>& scheme_whitelist) { | |
128 const GURL& gurl(row.url()); | 672 const GURL& gurl(row.url()); |
129 | 673 |
130 // Index only URLs with a whitelisted scheme. | 674 // Index only URLs with a whitelisted scheme. |
131 if (!URLSchemeIsWhitelisted(gurl, scheme_whitelist)) | 675 if (!URLSchemeIsWhitelisted(gurl, scheme_whitelist_)) |
132 return false; | 676 return false; |
133 | 677 |
134 URLID row_id = row.id(); | |
135 // Strip out username and password before saving and indexing. | 678 // Strip out username and password before saving and indexing. |
136 string16 url(net::FormatUrl(gurl, languages, | 679 string16 url(net::FormatUrl(gurl, languages_, |
137 net::kFormatUrlOmitUsernamePassword, | 680 net::kFormatUrlOmitUsernamePassword, |
138 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS, | 681 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS, |
139 NULL, NULL, NULL)); | 682 NULL, NULL, NULL)); |
140 | 683 |
684 URLID row_id = row.id(); | |
141 HistoryID history_id = static_cast<HistoryID>(row_id); | 685 HistoryID history_id = static_cast<HistoryID>(row_id); |
686 DCHECK(history_info_map_.find(history_id) == history_info_map_.end()); | |
142 DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max()); | 687 DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max()); |
143 | 688 |
144 // Add the row for quick lookup in the history info store. | 689 // Add the row for quick lookup in the history info store. |
145 URLRow new_row(GURL(url), row_id); | 690 URLRow new_row(GURL(url), row_id); |
146 new_row.set_visit_count(row.visit_count()); | 691 new_row.set_visit_count(row.visit_count()); |
147 new_row.set_typed_count(row.typed_count()); | 692 new_row.set_typed_count(row.typed_count()); |
148 new_row.set_last_visit(row.last_visit()); | 693 new_row.set_last_visit(row.last_visit()); |
149 new_row.set_title(row.title()); | 694 new_row.set_title(row.title()); |
695 | |
696 CacheTransaction cache_transaction(cache_enabled() ? cache_db_.get() : NULL); | |
150 history_info_map_[history_id] = new_row; | 697 history_info_map_[history_id] = new_row; |
698 if (cache_enabled()) | |
699 cache_db_->AddHistoryToURLs(history_id, row); | |
151 | 700 |
152 // Index the words contained in the URL and title of the row. | 701 // Index the words contained in the URL and title of the row. |
153 RowWordStarts word_starts; | 702 RowWordStarts word_starts; |
154 AddRowWordsToIndex(new_row, &word_starts, languages); | 703 AddRowWordsToIndex(new_row, &word_starts); |
155 word_starts_map_[history_id] = word_starts; | 704 word_starts_map_[history_id] = word_starts; |
705 if (cache_enabled()) | |
706 cache_db_->AddRowWordStarts(history_id, word_starts); | |
156 return true; | 707 return true; |
157 } | 708 } |
158 | 709 |
159 void URLIndexPrivateData::AddRowWordsToIndex(const URLRow& row, | 710 void URLIndexPrivateData::AddRowWordsToIndex(const URLRow& row, |
160 RowWordStarts* word_starts, | 711 RowWordStarts* word_starts) { |
161 const std::string& languages) { | |
162 HistoryID history_id = static_cast<HistoryID>(row.id()); | 712 HistoryID history_id = static_cast<HistoryID>(row.id()); |
163 // Split URL into individual, unique words then add in the title words. | 713 // Split URL into individual, unique words then add in the title words. |
164 const GURL& gurl(row.url()); | 714 const GURL& gurl(row.url()); |
165 string16 url(net::FormatUrl(gurl, languages, | 715 string16 url(net::FormatUrl(gurl, languages_, |
166 net::kFormatUrlOmitUsernamePassword, | 716 net::kFormatUrlOmitUsernamePassword, |
167 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS, | 717 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS, |
168 NULL, NULL, NULL)); | 718 NULL, NULL, NULL)); |
169 url = base::i18n::ToLower(url); | 719 url = base::i18n::ToLower(url); |
170 String16Set url_words = String16SetFromString16(url, | 720 String16Set url_words = String16SetFromString16(url, |
171 word_starts ? &word_starts->url_word_starts_ : NULL); | 721 word_starts ? &word_starts->url_word_starts_ : NULL); |
172 String16Set title_words = String16SetFromString16(row.title(), | 722 String16Set title_words = String16SetFromString16(row.title(), |
173 word_starts ? &word_starts->title_word_starts_ : NULL); | 723 word_starts ? &word_starts->title_word_starts_ : NULL); |
174 String16Set words; | 724 String16Set words; |
175 std::set_union(url_words.begin(), url_words.end(), | 725 std::set_union(url_words.begin(), url_words.end(), |
(...skipping 12 matching lines...) Expand all Loading... | |
188 if (word_pos != word_map_.end()) | 738 if (word_pos != word_map_.end()) |
189 UpdateWordHistory(word_pos->second, history_id); | 739 UpdateWordHistory(word_pos->second, history_id); |
190 else | 740 else |
191 AddWordHistory(term, history_id); | 741 AddWordHistory(term, history_id); |
192 } | 742 } |
193 | 743 |
194 void URLIndexPrivateData::UpdateWordHistory(WordID word_id, | 744 void URLIndexPrivateData::UpdateWordHistory(WordID word_id, |
195 HistoryID history_id) { | 745 HistoryID history_id) { |
196 WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id); | 746 WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id); |
197 DCHECK(history_pos != word_id_history_map_.end()); | 747 DCHECK(history_pos != word_id_history_map_.end()); |
748 // There is no need to record changes to the word_id_history_map_ in the cache | |
749 // because it can be rebuilt from the history_id_word_map_'s table. | |
198 HistoryIDSet& history_id_set(history_pos->second); | 750 HistoryIDSet& history_id_set(history_pos->second); |
199 history_id_set.insert(history_id); | 751 history_id_set.insert(history_id); |
200 AddToHistoryIDWordMap(history_id, word_id); | 752 AddToHistoryIDWordMap(history_id, word_id); |
753 | |
754 // Add word_id/history_id entry to the word_history table | |
755 if (cache_enabled()) | |
756 cache_db_->AddHistoryToWordHistory(word_id, history_id); | |
201 } | 757 } |
202 | 758 |
203 void URLIndexPrivateData::AddWordHistory(const string16& term, | 759 void URLIndexPrivateData::AddWordHistory(const string16& term, |
204 HistoryID history_id) { | 760 HistoryID history_id) { |
205 WordID word_id = word_list_.size(); | 761 WordID word_id = word_list_.size(); |
206 if (available_words_.empty()) { | 762 if (available_words_.empty()) { |
207 word_list_.push_back(term); | 763 word_list_.push_back(term); |
208 } else { | 764 } else { |
209 word_id = *(available_words_.begin()); | 765 word_id = *(available_words_.begin()); |
210 word_list_[word_id] = term; | 766 word_list_[word_id] = term; |
211 available_words_.erase(word_id); | 767 available_words_.erase(word_id); |
212 } | 768 } |
213 word_map_[term] = word_id; | 769 word_map_[term] = word_id; |
214 | 770 |
771 // Add word_id/term entry to the words table; | |
772 if (cache_enabled()) | |
773 cache_db_->AddWordToWords(word_id, term); | |
774 | |
215 HistoryIDSet history_id_set; | 775 HistoryIDSet history_id_set; |
216 history_id_set.insert(history_id); | 776 history_id_set.insert(history_id); |
217 word_id_history_map_[word_id] = history_id_set; | 777 word_id_history_map_[word_id] = history_id_set; |
218 AddToHistoryIDWordMap(history_id, word_id); | 778 AddToHistoryIDWordMap(history_id, word_id); |
219 | 779 |
780 // Add word_id/history_id entry to the word_history table | |
781 if (cache_enabled()) | |
782 cache_db_->AddHistoryToWordHistory(word_id, history_id); | |
783 | |
220 // For each character in the newly added word (i.e. a word that is not | 784 // For each character in the newly added word (i.e. a word that is not |
221 // already in the word index), add the word to the character index. | 785 // already in the word index), add the word to the character index. |
222 Char16Set characters = Char16SetFromString16(term); | 786 Char16Set characters = Char16SetFromString16(term); |
223 for (Char16Set::iterator uni_char_iter = characters.begin(); | 787 for (Char16Set::iterator uni_char_iter = characters.begin(); |
224 uni_char_iter != characters.end(); ++uni_char_iter) { | 788 uni_char_iter != characters.end(); ++uni_char_iter) { |
225 char16 uni_char = *uni_char_iter; | 789 char16 uni_char = *uni_char_iter; |
226 CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char); | 790 CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char); |
227 if (char_iter != char_word_map_.end()) { | 791 if (char_iter != char_word_map_.end()) { |
228 // Update existing entry in the char/word index. | 792 // Update existing entry in the char/word index. |
229 WordIDSet& word_id_set(char_iter->second); | 793 WordIDSet& word_id_set(char_iter->second); |
230 word_id_set.insert(word_id); | 794 word_id_set.insert(word_id); |
231 } else { | 795 } else { |
232 // Create a new entry in the char/word index. | 796 // Create a new entry in the char/word index. |
233 WordIDSet word_id_set; | 797 WordIDSet word_id_set; |
234 word_id_set.insert(word_id); | 798 word_id_set.insert(word_id); |
235 char_word_map_[uni_char] = word_id_set; | 799 char_word_map_[uni_char] = word_id_set; |
236 } | 800 } |
801 // Add uni_char/word_id entry to the char_words database table. | |
802 if (cache_enabled()) | |
803 cache_db_->AddWordToCharWords(uni_char, word_id); | |
237 } | 804 } |
238 } | 805 } |
239 | 806 |
240 void URLIndexPrivateData::RemoveRowFromIndex(const URLRow& row) { | 807 void URLIndexPrivateData::RemoveRowFromIndex(const URLRow& row) { |
808 CacheTransaction cache_transaction(cache_enabled() ? cache_db_.get() : NULL); | |
241 RemoveRowWordsFromIndex(row); | 809 RemoveRowWordsFromIndex(row); |
242 HistoryID history_id = static_cast<HistoryID>(row.id()); | 810 HistoryID history_id = static_cast<HistoryID>(row.id()); |
243 history_info_map_.erase(history_id); | 811 history_info_map_.erase(history_id); |
244 word_starts_map_.erase(history_id); | 812 word_starts_map_.erase(history_id); |
813 if (cache_enabled()) | |
814 cache_db_->RemoveHistoryIDFromURLs(history_id); | |
245 } | 815 } |
246 | 816 |
247 void URLIndexPrivateData::RemoveRowWordsFromIndex(const URLRow& row) { | 817 void URLIndexPrivateData::RemoveRowWordsFromIndex(const URLRow& row) { |
248 // Remove the entries in history_id_word_map_ and word_id_history_map_ for | 818 // Remove the entries in history_id_word_map_ and word_id_history_map_ for |
249 // this row. | 819 // this row. |
250 HistoryID history_id = static_cast<HistoryID>(row.id()); | 820 HistoryID history_id = static_cast<HistoryID>(row.id()); |
251 WordIDSet word_id_set = history_id_word_map_[history_id]; | 821 WordIDSet word_id_set = history_id_word_map_[history_id]; |
252 history_id_word_map_.erase(history_id); | 822 history_id_word_map_.erase(history_id); |
253 | 823 |
254 // Reconcile any changes to word usage. | 824 // Reconcile any changes to word usage. |
(...skipping 13 matching lines...) Expand all Loading... | |
268 char_word_map_[uni_char].erase(word_id); | 838 char_word_map_[uni_char].erase(word_id); |
269 if (char_word_map_[uni_char].empty()) | 839 if (char_word_map_[uni_char].empty()) |
270 char_word_map_.erase(uni_char); // No longer in use. | 840 char_word_map_.erase(uni_char); // No longer in use. |
271 } | 841 } |
272 | 842 |
273 // Complete the removal of references to the word. | 843 // Complete the removal of references to the word. |
274 word_id_history_map_.erase(word_id); | 844 word_id_history_map_.erase(word_id); |
275 word_map_.erase(word); | 845 word_map_.erase(word); |
276 word_list_[word_id] = string16(); | 846 word_list_[word_id] = string16(); |
277 available_words_.insert(word_id); | 847 available_words_.insert(word_id); |
848 if (cache_enabled()) | |
849 cache_db_->RemoveWordFromWords(word_id); | |
278 } | 850 } |
851 if (cache_enabled()) | |
852 cache_db_->RemoveHistoryIDFromWordHistory(history_id); | |
279 } | 853 } |
280 | 854 |
281 void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id, | 855 void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id, |
282 WordID word_id) { | 856 WordID word_id) { |
283 HistoryIDWordMap::iterator iter = history_id_word_map_.find(history_id); | 857 HistoryIDWordMap::iterator iter = history_id_word_map_.find(history_id); |
284 if (iter != history_id_word_map_.end()) { | 858 if (iter != history_id_word_map_.end()) { |
285 WordIDSet& word_id_set(iter->second); | 859 WordIDSet& word_id_set(iter->second); |
286 word_id_set.insert(word_id); | 860 word_id_set.insert(word_id); |
287 } else { | 861 } else { |
288 WordIDSet word_id_set; | 862 WordIDSet word_id_set; |
289 word_id_set.insert(word_id); | 863 word_id_set.insert(word_id); |
290 history_id_word_map_[history_id] = word_id_set; | 864 history_id_word_map_[history_id] = word_id_set; |
291 } | 865 } |
292 } | 866 } |
293 | 867 |
294 bool URLIndexPrivateData::UpdateURL( | |
295 const URLRow& row, | |
296 const std::string& languages, | |
297 const std::set<std::string>& scheme_whitelist) { | |
298 // The row may or may not already be in our index. If it is not already | |
299 // indexed and it qualifies then it gets indexed. If it is already | |
300 // indexed and still qualifies then it gets updated, otherwise it | |
301 // is deleted from the index. | |
302 bool row_was_updated = false; | |
303 URLID row_id = row.id(); | |
304 HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id); | |
305 if (row_pos == history_info_map_.end()) { | |
306 // This new row should be indexed if it qualifies. | |
307 URLRow new_row(row); | |
308 new_row.set_id(row_id); | |
309 row_was_updated = RowQualifiesAsSignificant(new_row, base::Time()) && | |
310 IndexRow(new_row, languages, scheme_whitelist); | |
311 } else if (RowQualifiesAsSignificant(row, base::Time())) { | |
312 // This indexed row still qualifies and will be re-indexed. | |
313 // The url won't have changed but the title, visit count, etc. | |
314 // might have changed. | |
315 URLRow& row_to_update = row_pos->second; | |
316 bool title_updated = row_to_update.title() != row.title(); | |
317 if (row_to_update.visit_count() != row.visit_count() || | |
318 row_to_update.typed_count() != row.typed_count() || | |
319 row_to_update.last_visit() != row.last_visit() || title_updated) { | |
320 row_to_update.set_visit_count(row.visit_count()); | |
321 row_to_update.set_typed_count(row.typed_count()); | |
322 row_to_update.set_last_visit(row.last_visit()); | |
323 // While the URL is guaranteed to remain stable, the title may have | |
324 // changed. If so, then update the index with the changed words. | |
325 if (title_updated) { | |
326 // Clear all words associated with this row and re-index both the | |
327 // URL and title. | |
328 RemoveRowWordsFromIndex(row_to_update); | |
329 row_to_update.set_title(row.title()); | |
330 RowWordStarts word_starts; | |
331 AddRowWordsToIndex(row_to_update, &word_starts, languages); | |
332 word_starts_map_[row_id] = word_starts; | |
333 } | |
334 row_was_updated = true; | |
335 } | |
336 } else { | |
337 // This indexed row no longer qualifies and will be de-indexed by | |
338 // clearing all words associated with this row. | |
339 RemoveRowFromIndex(row); | |
340 row_was_updated = true; | |
341 } | |
342 if (row_was_updated) | |
343 search_term_cache_.clear(); // This invalidates the cache. | |
344 return row_was_updated; | |
345 } | |
346 | |
347 // Helper functor for DeleteURL. | |
348 class HistoryInfoMapItemHasURL { | |
349 public: | |
350 explicit HistoryInfoMapItemHasURL(const GURL& url): url_(url) {} | |
351 | |
352 bool operator()(const std::pair<const HistoryID, URLRow>& item) { | |
353 return item.second.url() == url_; | |
354 } | |
355 | |
356 private: | |
357 const GURL& url_; | |
358 }; | |
359 | |
360 bool URLIndexPrivateData::DeleteURL(const GURL& url) { | |
361 // Find the matching entry in the history_info_map_. | |
362 HistoryInfoMap::iterator pos = std::find_if( | |
363 history_info_map_.begin(), | |
364 history_info_map_.end(), | |
365 HistoryInfoMapItemHasURL(url)); | |
366 if (pos == history_info_map_.end()) | |
367 return false; | |
368 RemoveRowFromIndex(pos->second); | |
369 search_term_cache_.clear(); // This invalidates the cache. | |
370 return true; | |
371 } | |
372 | |
373 // URLIndexPrivateData::HistoryItemFactorGreater ------------------------------- | |
374 | |
375 URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater( | |
376 const HistoryInfoMap& history_info_map) | |
377 : history_info_map_(history_info_map) { | |
378 } | |
379 | |
380 URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() {} | |
381 | |
382 bool URLIndexPrivateData::HistoryItemFactorGreater::operator()( | |
383 const HistoryID h1, | |
384 const HistoryID h2) { | |
385 HistoryInfoMap::const_iterator entry1(history_info_map_.find(h1)); | |
386 if (entry1 == history_info_map_.end()) | |
387 return false; | |
388 HistoryInfoMap::const_iterator entry2(history_info_map_.find(h2)); | |
389 if (entry2 == history_info_map_.end()) | |
390 return true; | |
391 const URLRow& r1(entry1->second); | |
392 const URLRow& r2(entry2->second); | |
393 // First cut: typed count, visit count, recency. | |
394 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks | |
395 // recently visited (within the last 12/24 hours) as highly important. Get | |
396 // input from mpearson. | |
397 if (r1.typed_count() != r2.typed_count()) | |
398 return (r1.typed_count() > r2.typed_count()); | |
399 if (r1.visit_count() != r2.visit_count()) | |
400 return (r1.visit_count() > r2.visit_count()); | |
401 return (r1.last_visit() > r2.last_visit()); | |
402 } | |
403 | |
404 // Cache Searching ------------------------------------------------------------- | |
405 | |
406 // NOTE: This is the main public search function. | |
407 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms( | |
408 const string16& search_string) { | |
409 pre_filter_item_count_ = 0; | |
410 post_filter_item_count_ = 0; | |
411 post_scoring_item_count_ = 0; | |
412 // The search string we receive may contain escaped characters. For reducing | |
413 // the index we need individual, lower-cased words, ignoring escapings. For | |
414 // the final filtering we need whitespace separated substrings possibly | |
415 // containing escaped characters. | |
416 string16 lower_raw_string(base::i18n::ToLower(search_string)); | |
417 string16 lower_unescaped_string = | |
418 net::UnescapeURLComponent(lower_raw_string, | |
419 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS); | |
420 // Extract individual 'words' (as opposed to 'terms'; see below) from the | |
421 // search string. When the user types "colspec=ID%20Mstone Release" we get | |
422 // four 'words': "colspec", "id", "mstone" and "release". | |
423 String16Vector lower_words( | |
424 history::String16VectorFromString16(lower_unescaped_string, false, NULL)); | |
425 ScoredHistoryMatches scored_items; | |
426 | |
427 // Do nothing if we have indexed no words (probably because we've not been | |
428 // initialized yet) or the search string has no words. | |
429 if (word_list_.empty() || lower_words.empty()) { | |
430 search_term_cache_.clear(); // Invalidate the term cache. | |
431 return scored_items; | |
432 } | |
433 | |
434 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep | |
435 // approach. | |
436 ResetSearchTermCache(); | |
437 | |
438 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words); | |
439 | |
440 // Trim the candidate pool if it is large. Note that we do not filter out | |
441 // items that do not contain the search terms as proper substrings -- doing | |
442 // so is the performance-costly operation we are trying to avoid in order | |
443 // to maintain omnibox responsiveness. | |
444 const size_t kItemsToScoreLimit = 500; | |
445 pre_filter_item_count_ = history_id_set.size(); | |
446 // If we trim the results set we do not want to cache the results for next | |
447 // time as the user's ultimately desired result could easily be eliminated | |
448 // in this early rough filter. | |
449 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit); | |
450 if (was_trimmed) { | |
451 HistoryIDVector history_ids; | |
452 std::copy(history_id_set.begin(), history_id_set.end(), | |
453 std::back_inserter(history_ids)); | |
454 // Trim down the set by sorting by typed-count, visit-count, and last | |
455 // visit. | |
456 HistoryItemFactorGreater | |
457 item_factor_functor(history_info_map_); | |
458 std::partial_sort(history_ids.begin(), | |
459 history_ids.begin() + kItemsToScoreLimit, | |
460 history_ids.end(), | |
461 item_factor_functor); | |
462 history_id_set.clear(); | |
463 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit, | |
464 std::inserter(history_id_set, history_id_set.end())); | |
465 post_filter_item_count_ = history_id_set.size(); | |
466 } | |
467 | |
468 // Pass over all of the candidates filtering out any without a proper | |
469 // substring match, inserting those which pass in order by score. Note that | |
470 // in this step we are using the raw search string complete with escaped | |
471 // URL elements. When the user has specifically typed something akin to | |
472 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that | |
473 // specific substring appears in the URL or page title. | |
474 | |
475 // We call these 'terms' (as opposed to 'words'; see above) as in this case | |
476 // we only want to break up the search string on 'true' whitespace rather than | |
477 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we | |
478 // get two 'terms': "colspec=id%20mstone" and "release". | |
479 history::String16Vector lower_raw_terms; | |
480 Tokenize(lower_raw_string, kWhitespaceUTF16, &lower_raw_terms); | |
481 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(), | |
482 AddHistoryMatch(*this, lower_raw_string, | |
483 lower_raw_terms, base::Time::Now())).ScoredMatches(); | |
484 | |
485 // Select and sort only the top kMaxMatches results. | |
486 if (scored_items.size() > AutocompleteProvider::kMaxMatches) { | |
487 std::partial_sort(scored_items.begin(), | |
488 scored_items.begin() + | |
489 AutocompleteProvider::kMaxMatches, | |
490 scored_items.end(), | |
491 ScoredHistoryMatch::MatchScoreGreater); | |
492 scored_items.resize(AutocompleteProvider::kMaxMatches); | |
493 } else { | |
494 std::sort(scored_items.begin(), scored_items.end(), | |
495 ScoredHistoryMatch::MatchScoreGreater); | |
496 } | |
497 post_scoring_item_count_ = scored_items.size(); | |
498 | |
499 if (was_trimmed) { | |
500 search_term_cache_.clear(); // Invalidate the term cache. | |
501 } else { | |
502 // Remove any stale SearchTermCacheItems. | |
503 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); | |
504 cache_iter != search_term_cache_.end(); ) { | |
505 if (!cache_iter->second.used_) | |
506 search_term_cache_.erase(cache_iter++); | |
507 else | |
508 ++cache_iter; | |
509 } | |
510 } | |
511 | |
512 return scored_items; | |
513 } | |
514 | |
515 // URLIndexPrivateData::AddHistoryMatch ---------------------------------------- | |
516 | |
517 URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch( | |
518 const URLIndexPrivateData& private_data, | |
519 const string16& lower_string, | |
520 const String16Vector& lower_terms, | |
521 const base::Time now) | |
522 : private_data_(private_data), | |
523 lower_string_(lower_string), | |
524 lower_terms_(lower_terms), | |
525 now_(now) {} | |
526 | |
527 URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() {} | |
528 | |
529 void URLIndexPrivateData::AddHistoryMatch::operator()( | |
530 const HistoryID history_id) { | |
531 HistoryInfoMap::const_iterator hist_pos = | |
532 private_data_.history_info_map_.find(history_id); | |
533 if (hist_pos != private_data_.history_info_map_.end()) { | |
534 const URLRow& hist_item = hist_pos->second; | |
535 WordStartsMap::const_iterator starts_pos = | |
536 private_data_.word_starts_map_.find(history_id); | |
537 DCHECK(starts_pos != private_data_.word_starts_map_.end()); | |
538 ScoredHistoryMatch match(hist_item, lower_string_, lower_terms_, | |
539 starts_pos->second, now_); | |
540 if (match.raw_score > 0) | |
541 scored_matches_.push_back(match); | |
542 } | |
543 } | |
544 | |
545 void URLIndexPrivateData::ResetSearchTermCache() { | 868 void URLIndexPrivateData::ResetSearchTermCache() { |
546 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin(); | 869 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin(); |
547 iter != search_term_cache_.end(); ++iter) | 870 iter != search_term_cache_.end(); ++iter) |
548 iter->second.used_ = false; | 871 iter->second.used_ = false; |
549 } | 872 } |
550 | 873 |
551 HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords( | 874 HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords( |
552 const String16Vector& unsorted_words) { | 875 const String16Vector& unsorted_words) { |
553 // Break the terms down into individual terms (words), get the candidate | 876 // Break the terms down into individual terms (words), get the candidate |
554 // set for each term, and intersect each to get a final candidate list. | 877 // set for each term, and intersect each to get a final candidate list. |
(...skipping 168 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
723 std::set_intersection(word_id_set.begin(), word_id_set.end(), | 1046 std::set_intersection(word_id_set.begin(), word_id_set.end(), |
724 char_word_id_set.begin(), char_word_id_set.end(), | 1047 char_word_id_set.begin(), char_word_id_set.end(), |
725 std::inserter(new_word_id_set, | 1048 std::inserter(new_word_id_set, |
726 new_word_id_set.begin())); | 1049 new_word_id_set.begin())); |
727 word_id_set.swap(new_word_id_set); | 1050 word_id_set.swap(new_word_id_set); |
728 } | 1051 } |
729 } | 1052 } |
730 return word_id_set; | 1053 return word_id_set; |
731 } | 1054 } |
732 | 1055 |
733 // Cache Saving ---------------------------------------------------------------- | 1056 bool URLIndexPrivateData::RestoreFromCache(InMemoryURLCacheDatabase* cache_db) { |
734 | 1057 DCHECK(cache_db); |
735 // static | 1058 if (!cache_db->RestorePrivateData(this)) { |
736 void URLIndexPrivateData::WritePrivateDataToCacheFileTask( | 1059 cache_db->Reset(); |
737 scoped_refptr<URLIndexPrivateData> private_data, | 1060 return false; |
738 const FilePath& file_path, | 1061 } |
739 scoped_refptr<RefCountedBool> succeeded) { | 1062 return !Empty() && ValidateConsistency(); |
740 DCHECK(private_data.get()); | |
741 DCHECK(!file_path.empty()); | |
742 succeeded->set_value(private_data->SaveToFile(file_path)); | |
743 } | 1063 } |
744 | 1064 |
745 bool URLIndexPrivateData::SaveToFile(const FilePath& file_path) { | 1065 void URLIndexPrivateData::DeleteOldVersionCacheFile() const { |
746 base::TimeTicks beginning_time = base::TimeTicks::Now(); | 1066 if (history_dir_.empty()) |
747 InMemoryURLIndexCacheItem index_cache; | 1067 return; |
748 SavePrivateData(&index_cache); | 1068 FilePath path = history_dir_.Append(chrome::kHQPCacheFilename); |
749 std::string data; | 1069 file_util::Delete(path, false); |
750 if (!index_cache.SerializeToString(&data)) { | 1070 } |
751 LOG(WARNING) << "Failed to serialize the InMemoryURLIndex cache."; | 1071 |
1072 bool URLIndexPrivateData::GetCacheDBPath(FilePath* file_path) { | |
1073 DCHECK(file_path); | |
1074 if (history_dir_.empty()) | |
752 return false; | 1075 return false; |
753 } | 1076 *file_path = history_dir_.Append(chrome::kHQPCacheDBFilename); |
754 | |
755 int size = data.size(); | |
756 if (file_util::WriteFile(file_path, data.c_str(), size) != size) { | |
757 LOG(WARNING) << "Failed to write " << file_path.value(); | |
758 return false; | |
759 } | |
760 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime", | |
761 base::TimeTicks::Now() - beginning_time); | |
762 return true; | 1077 return true; |
763 } | 1078 } |
764 | 1079 |
765 void URLIndexPrivateData::SavePrivateData( | 1080 void URLIndexPrivateData::SetCacheDatabaseForTesting( |
766 InMemoryURLIndexCacheItem* cache) const { | 1081 InMemoryURLCacheDatabase* test_db) { |
767 DCHECK(cache); | 1082 cache_db_ = test_db; |
768 cache->set_timestamp(base::Time::Now().ToInternalValue()); | |
769 cache->set_version(saved_cache_version_); | |
770 // history_item_count_ is no longer used but rather than change the protobuf | |
771 // definition use a placeholder. This will go away with the switch to SQLite. | |
772 cache->set_history_item_count(0); | |
773 SaveWordList(cache); | |
774 SaveWordMap(cache); | |
775 SaveCharWordMap(cache); | |
776 SaveWordIDHistoryMap(cache); | |
777 SaveHistoryInfoMap(cache); | |
778 SaveWordStartsMap(cache); | |
779 } | |
780 | |
781 void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem* cache) const { | |
782 if (word_list_.empty()) | |
783 return; | |
784 WordListItem* list_item = cache->mutable_word_list(); | |
785 list_item->set_word_count(word_list_.size()); | |
786 for (String16Vector::const_iterator iter = word_list_.begin(); | |
787 iter != word_list_.end(); ++iter) | |
788 list_item->add_word(UTF16ToUTF8(*iter)); | |
789 } | |
790 | |
791 void URLIndexPrivateData::SaveWordMap(InMemoryURLIndexCacheItem* cache) const { | |
792 if (word_map_.empty()) | |
793 return; | |
794 WordMapItem* map_item = cache->mutable_word_map(); | |
795 map_item->set_item_count(word_map_.size()); | |
796 for (WordMap::const_iterator iter = word_map_.begin(); | |
797 iter != word_map_.end(); ++iter) { | |
798 WordMapEntry* map_entry = map_item->add_word_map_entry(); | |
799 map_entry->set_word(UTF16ToUTF8(iter->first)); | |
800 map_entry->set_word_id(iter->second); | |
801 } | |
802 } | |
803 | |
804 void URLIndexPrivateData::SaveCharWordMap( | |
805 InMemoryURLIndexCacheItem* cache) const { | |
806 if (char_word_map_.empty()) | |
807 return; | |
808 CharWordMapItem* map_item = cache->mutable_char_word_map(); | |
809 map_item->set_item_count(char_word_map_.size()); | |
810 for (CharWordIDMap::const_iterator iter = char_word_map_.begin(); | |
811 iter != char_word_map_.end(); ++iter) { | |
812 CharWordMapEntry* map_entry = map_item->add_char_word_map_entry(); | |
813 map_entry->set_char_16(iter->first); | |
814 const WordIDSet& word_id_set(iter->second); | |
815 map_entry->set_item_count(word_id_set.size()); | |
816 for (WordIDSet::const_iterator set_iter = word_id_set.begin(); | |
817 set_iter != word_id_set.end(); ++set_iter) | |
818 map_entry->add_word_id(*set_iter); | |
819 } | |
820 } | |
821 | |
822 void URLIndexPrivateData::SaveWordIDHistoryMap( | |
823 InMemoryURLIndexCacheItem* cache) const { | |
824 if (word_id_history_map_.empty()) | |
825 return; | |
826 WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map(); | |
827 map_item->set_item_count(word_id_history_map_.size()); | |
828 for (WordIDHistoryMap::const_iterator iter = word_id_history_map_.begin(); | |
829 iter != word_id_history_map_.end(); ++iter) { | |
830 WordIDHistoryMapEntry* map_entry = | |
831 map_item->add_word_id_history_map_entry(); | |
832 map_entry->set_word_id(iter->first); | |
833 const HistoryIDSet& history_id_set(iter->second); | |
834 map_entry->set_item_count(history_id_set.size()); | |
835 for (HistoryIDSet::const_iterator set_iter = history_id_set.begin(); | |
836 set_iter != history_id_set.end(); ++set_iter) | |
837 map_entry->add_history_id(*set_iter); | |
838 } | |
839 } | |
840 | |
841 void URLIndexPrivateData::SaveHistoryInfoMap( | |
842 InMemoryURLIndexCacheItem* cache) const { | |
843 if (history_info_map_.empty()) | |
844 return; | |
845 HistoryInfoMapItem* map_item = cache->mutable_history_info_map(); | |
846 map_item->set_item_count(history_info_map_.size()); | |
847 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin(); | |
848 iter != history_info_map_.end(); ++iter) { | |
849 HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry(); | |
850 map_entry->set_history_id(iter->first); | |
851 const URLRow& url_row(iter->second); | |
852 // Note: We only save information that contributes to the index so there | |
853 // is no need to save search_term_cache_ (not persistent). | |
854 map_entry->set_visit_count(url_row.visit_count()); | |
855 map_entry->set_typed_count(url_row.typed_count()); | |
856 map_entry->set_last_visit(url_row.last_visit().ToInternalValue()); | |
857 map_entry->set_url(url_row.url().spec()); | |
858 map_entry->set_title(UTF16ToUTF8(url_row.title())); | |
859 } | |
860 } | |
861 | |
862 void URLIndexPrivateData::SaveWordStartsMap( | |
863 InMemoryURLIndexCacheItem* cache) const { | |
864 if (word_starts_map_.empty()) | |
865 return; | |
866 // For unit testing: Enable saving of the cache as an earlier version to | |
867 // allow testing of cache file upgrading in ReadFromFile(). | |
868 // TODO(mrossetti): Instead of intruding on production code with this kind of | |
869 // test harness, save a copy of an older version cache with known results. | |
870 // Implement this when switching the caching over to SQLite. | |
871 if (saved_cache_version_ < 1) | |
872 return; | |
873 | |
874 WordStartsMapItem* map_item = cache->mutable_word_starts_map(); | |
875 map_item->set_item_count(word_starts_map_.size()); | |
876 for (WordStartsMap::const_iterator iter = word_starts_map_.begin(); | |
877 iter != word_starts_map_.end(); ++iter) { | |
878 WordStartsMapEntry* map_entry = map_item->add_word_starts_map_entry(); | |
879 map_entry->set_history_id(iter->first); | |
880 const RowWordStarts& word_starts(iter->second); | |
881 for (WordStarts::const_iterator i = word_starts.url_word_starts_.begin(); | |
882 i != word_starts.url_word_starts_.end(); ++i) | |
883 map_entry->add_url_word_starts(*i); | |
884 for (WordStarts::const_iterator i = word_starts.title_word_starts_.begin(); | |
885 i != word_starts.title_word_starts_.end(); ++i) | |
886 map_entry->add_title_word_starts(*i); | |
887 } | |
888 } | |
889 | |
890 // Cache Restoring ------------------------------------------------------------- | |
891 | |
892 // static | |
893 void URLIndexPrivateData::RestoreFromFileTask( | |
894 const FilePath& file_path, | |
895 scoped_refptr<URLIndexPrivateData> private_data, | |
896 const std::string& languages) { | |
897 private_data = URLIndexPrivateData::RestoreFromFile(file_path, languages); | |
898 } | |
899 | |
900 // static | |
901 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RestoreFromFile( | |
902 const FilePath& file_path, | |
903 const std::string& languages) { | |
904 base::TimeTicks beginning_time = base::TimeTicks::Now(); | |
905 if (!file_util::PathExists(file_path)) | |
906 return NULL; | |
907 std::string data; | |
908 // If there is no cache file then simply give up. This will cause us to | |
909 // attempt to rebuild from the history database. | |
910 if (!file_util::ReadFileToString(file_path, &data)) | |
911 return NULL; | |
912 | |
913 scoped_refptr<URLIndexPrivateData> restored_data(new URLIndexPrivateData); | |
914 InMemoryURLIndexCacheItem index_cache; | |
915 if (!index_cache.ParseFromArray(data.c_str(), data.size())) { | |
916 LOG(WARNING) << "Failed to parse URLIndexPrivateData cache data read from " | |
917 << file_path.value(); | |
918 return restored_data; | |
919 } | |
920 | |
921 if (!restored_data->RestorePrivateData(index_cache, languages)) | |
922 return NULL; | |
923 | |
924 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime", | |
925 base::TimeTicks::Now() - beginning_time); | |
926 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems", | |
927 restored_data->history_id_word_map_.size()); | |
928 UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size()); | |
929 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords", | |
930 restored_data->word_map_.size()); | |
931 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars", | |
932 restored_data->char_word_map_.size()); | |
933 if (restored_data->Empty()) | |
934 return NULL; // 'No data' is the same as a failed reload. | |
935 return restored_data; | |
936 } | |
937 | |
938 // static | |
939 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RebuildFromHistory( | |
940 HistoryDatabase* history_db, | |
941 const std::string& languages, | |
942 const std::set<std::string>& scheme_whitelist) { | |
943 if (!history_db) | |
944 return NULL; | |
945 | |
946 base::TimeTicks beginning_time = base::TimeTicks::Now(); | |
947 | |
948 scoped_refptr<URLIndexPrivateData> rebuilt_data(new URLIndexPrivateData); | |
949 URLDatabase::URLEnumerator history_enum; | |
950 if (!history_db->InitURLEnumeratorForSignificant(&history_enum)) | |
951 return NULL; | |
952 for (URLRow row; history_enum.GetNextURL(&row); ) | |
953 rebuilt_data->IndexRow(row, languages, scheme_whitelist); | |
954 | |
955 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime", | |
956 base::TimeTicks::Now() - beginning_time); | |
957 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems", | |
958 rebuilt_data->history_id_word_map_.size()); | |
959 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords", | |
960 rebuilt_data->word_map_.size()); | |
961 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars", | |
962 rebuilt_data->char_word_map_.size()); | |
963 return rebuilt_data; | |
964 } | |
965 | |
966 bool URLIndexPrivateData::RestorePrivateData( | |
967 const InMemoryURLIndexCacheItem& cache, | |
968 const std::string& languages) { | |
969 if (cache.has_version()) | |
970 restored_cache_version_ = cache.version(); | |
971 return RestoreWordList(cache) && RestoreWordMap(cache) && | |
972 RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) && | |
973 RestoreHistoryInfoMap(cache) && RestoreWordStartsMap(cache, languages); | |
974 } | |
975 | |
976 bool URLIndexPrivateData::RestoreWordList( | |
977 const InMemoryURLIndexCacheItem& cache) { | |
978 if (!cache.has_word_list()) | |
979 return false; | |
980 const WordListItem& list_item(cache.word_list()); | |
981 uint32 expected_item_count = list_item.word_count(); | |
982 uint32 actual_item_count = list_item.word_size(); | |
983 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
984 return false; | |
985 const RepeatedPtrField<std::string>& words(list_item.word()); | |
986 for (RepeatedPtrField<std::string>::const_iterator iter = words.begin(); | |
987 iter != words.end(); ++iter) | |
988 word_list_.push_back(UTF8ToUTF16(*iter)); | |
989 return true; | |
990 } | |
991 | |
992 bool URLIndexPrivateData::RestoreWordMap( | |
993 const InMemoryURLIndexCacheItem& cache) { | |
994 if (!cache.has_word_map()) | |
995 return false; | |
996 const WordMapItem& list_item(cache.word_map()); | |
997 uint32 expected_item_count = list_item.item_count(); | |
998 uint32 actual_item_count = list_item.word_map_entry_size(); | |
999 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
1000 return false; | |
1001 const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry()); | |
1002 for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin(); | |
1003 iter != entries.end(); ++iter) | |
1004 word_map_[UTF8ToUTF16(iter->word())] = iter->word_id(); | |
1005 return true; | |
1006 } | |
1007 | |
1008 bool URLIndexPrivateData::RestoreCharWordMap( | |
1009 const InMemoryURLIndexCacheItem& cache) { | |
1010 if (!cache.has_char_word_map()) | |
1011 return false; | |
1012 const CharWordMapItem& list_item(cache.char_word_map()); | |
1013 uint32 expected_item_count = list_item.item_count(); | |
1014 uint32 actual_item_count = list_item.char_word_map_entry_size(); | |
1015 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
1016 return false; | |
1017 const RepeatedPtrField<CharWordMapEntry>& | |
1018 entries(list_item.char_word_map_entry()); | |
1019 for (RepeatedPtrField<CharWordMapEntry>::const_iterator iter = | |
1020 entries.begin(); iter != entries.end(); ++iter) { | |
1021 expected_item_count = iter->item_count(); | |
1022 actual_item_count = iter->word_id_size(); | |
1023 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
1024 return false; | |
1025 char16 uni_char = static_cast<char16>(iter->char_16()); | |
1026 WordIDSet word_id_set; | |
1027 const RepeatedField<int32>& word_ids(iter->word_id()); | |
1028 for (RepeatedField<int32>::const_iterator jiter = word_ids.begin(); | |
1029 jiter != word_ids.end(); ++jiter) | |
1030 word_id_set.insert(*jiter); | |
1031 char_word_map_[uni_char] = word_id_set; | |
1032 } | |
1033 return true; | |
1034 } | |
1035 | |
1036 bool URLIndexPrivateData::RestoreWordIDHistoryMap( | |
1037 const InMemoryURLIndexCacheItem& cache) { | |
1038 if (!cache.has_word_id_history_map()) | |
1039 return false; | |
1040 const WordIDHistoryMapItem& list_item(cache.word_id_history_map()); | |
1041 uint32 expected_item_count = list_item.item_count(); | |
1042 uint32 actual_item_count = list_item.word_id_history_map_entry_size(); | |
1043 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
1044 return false; | |
1045 const RepeatedPtrField<WordIDHistoryMapEntry>& | |
1046 entries(list_item.word_id_history_map_entry()); | |
1047 for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter = | |
1048 entries.begin(); iter != entries.end(); ++iter) { | |
1049 expected_item_count = iter->item_count(); | |
1050 actual_item_count = iter->history_id_size(); | |
1051 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
1052 return false; | |
1053 WordID word_id = iter->word_id(); | |
1054 HistoryIDSet history_id_set; | |
1055 const RepeatedField<int64>& history_ids(iter->history_id()); | |
1056 for (RepeatedField<int64>::const_iterator jiter = history_ids.begin(); | |
1057 jiter != history_ids.end(); ++jiter) { | |
1058 history_id_set.insert(*jiter); | |
1059 AddToHistoryIDWordMap(*jiter, word_id); | |
1060 } | |
1061 word_id_history_map_[word_id] = history_id_set; | |
1062 } | |
1063 return true; | |
1064 } | |
1065 | |
1066 bool URLIndexPrivateData::RestoreHistoryInfoMap( | |
1067 const InMemoryURLIndexCacheItem& cache) { | |
1068 if (!cache.has_history_info_map()) | |
1069 return false; | |
1070 const HistoryInfoMapItem& list_item(cache.history_info_map()); | |
1071 uint32 expected_item_count = list_item.item_count(); | |
1072 uint32 actual_item_count = list_item.history_info_map_entry_size(); | |
1073 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
1074 return false; | |
1075 const RepeatedPtrField<HistoryInfoMapEntry>& | |
1076 entries(list_item.history_info_map_entry()); | |
1077 for (RepeatedPtrField<HistoryInfoMapEntry>::const_iterator iter = | |
1078 entries.begin(); iter != entries.end(); ++iter) { | |
1079 HistoryID history_id = iter->history_id(); | |
1080 GURL url(iter->url()); | |
1081 URLRow url_row(url, history_id); | |
1082 url_row.set_visit_count(iter->visit_count()); | |
1083 url_row.set_typed_count(iter->typed_count()); | |
1084 url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit())); | |
1085 if (iter->has_title()) { | |
1086 string16 title(UTF8ToUTF16(iter->title())); | |
1087 url_row.set_title(title); | |
1088 } | |
1089 history_info_map_[history_id] = url_row; | |
1090 } | |
1091 return true; | |
1092 } | |
1093 | |
1094 bool URLIndexPrivateData::RestoreWordStartsMap( | |
1095 const InMemoryURLIndexCacheItem& cache, | |
1096 const std::string& languages) { | |
1097 // Note that this function must be called after RestoreHistoryInfoMap() has | |
1098 // been run as the word starts may have to be recalculated from the urls and | |
1099 // page titles. | |
1100 if (cache.has_word_starts_map()) { | |
1101 const WordStartsMapItem& list_item(cache.word_starts_map()); | |
1102 uint32 expected_item_count = list_item.item_count(); | |
1103 uint32 actual_item_count = list_item.word_starts_map_entry_size(); | |
1104 if (actual_item_count == 0 || actual_item_count != expected_item_count) | |
1105 return false; | |
1106 const RepeatedPtrField<WordStartsMapEntry>& | |
1107 entries(list_item.word_starts_map_entry()); | |
1108 for (RepeatedPtrField<WordStartsMapEntry>::const_iterator iter = | |
1109 entries.begin(); iter != entries.end(); ++iter) { | |
1110 HistoryID history_id = iter->history_id(); | |
1111 RowWordStarts word_starts; | |
1112 // Restore the URL word starts. | |
1113 const RepeatedField<int32>& url_starts(iter->url_word_starts()); | |
1114 for (RepeatedField<int32>::const_iterator jiter = url_starts.begin(); | |
1115 jiter != url_starts.end(); ++jiter) | |
1116 word_starts.url_word_starts_.push_back(*jiter); | |
1117 // Restore the page title word starts. | |
1118 const RepeatedField<int32>& title_starts(iter->title_word_starts()); | |
1119 for (RepeatedField<int32>::const_iterator jiter = title_starts.begin(); | |
1120 jiter != title_starts.end(); ++jiter) | |
1121 word_starts.title_word_starts_.push_back(*jiter); | |
1122 word_starts_map_[history_id] = word_starts; | |
1123 } | |
1124 } else { | |
1125 // Since the cache did not contain any word starts we must rebuild then from | |
1126 // the URL and page titles. | |
1127 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin(); | |
1128 iter != history_info_map_.end(); ++iter) { | |
1129 RowWordStarts word_starts; | |
1130 const URLRow& row(iter->second); | |
1131 string16 url(net::FormatUrl(row.url(), languages, | |
1132 net::kFormatUrlOmitUsernamePassword, | |
1133 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS, | |
1134 NULL, NULL, NULL)); | |
1135 url = base::i18n::ToLower(url); | |
1136 String16VectorFromString16(url, false, &word_starts.url_word_starts_); | |
1137 String16VectorFromString16( | |
1138 row.title(), false, &word_starts.title_word_starts_); | |
1139 word_starts_map_[iter->first] = word_starts; | |
1140 } | |
1141 } | |
1142 return true; | |
1143 } | 1083 } |
1144 | 1084 |
1145 // static | 1085 // static |
1146 bool URLIndexPrivateData::URLSchemeIsWhitelisted( | 1086 bool URLIndexPrivateData::URLSchemeIsWhitelisted( |
1147 const GURL& gurl, | 1087 const GURL& gurl, |
1148 const std::set<std::string>& whitelist) { | 1088 const std::set<std::string>& whitelist) { |
1149 return whitelist.find(gurl.scheme()) != whitelist.end(); | 1089 return whitelist.find(gurl.scheme()) != whitelist.end(); |
1150 } | 1090 } |
1151 | 1091 |
1152 } // namespace history | 1092 } // namespace history |
OLD | NEW |