Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(79)

Side by Side Diff: chrome/browser/history/url_index_private_data.cc

Issue 9655003: Gather word-start Information to Aid in Scoring. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/
Patch Set: Created 8 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "chrome/browser/history/url_index_private_data.h" 5 #include "chrome/browser/history/url_index_private_data.h"
6 6
7 #include <algorithm> 7 #include <algorithm>
8 #include <functional> 8 #include <functional>
9 #include <iterator> 9 #include <iterator>
10 #include <limits> 10 #include <limits>
(...skipping 24 matching lines...) Expand all
35 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry 35 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry
36 CharWordMapEntry; 36 CharWordMapEntry;
37 typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem 37 typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem
38 WordIDHistoryMapItem; 38 WordIDHistoryMapItem;
39 typedef imui:: 39 typedef imui::
40 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry 40 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry
41 WordIDHistoryMapEntry; 41 WordIDHistoryMapEntry;
42 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem; 42 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem;
43 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry 43 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry
44 HistoryInfoMapEntry; 44 HistoryInfoMapEntry;
45 typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem WordStartsMapItem;
46 typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem_WordStartsMapEntry
47 WordStartsMapEntry;
45 48
46 // The maximum score any candidate result can achieve. 49 // The maximum score any candidate result can achieve.
47 const int kMaxTotalScore = 1425; 50 const int kMaxTotalScore = 1425;
48 51
49 // Score ranges used to get a 'base' score for each of the scoring factors 52 // Score ranges used to get a 'base' score for each of the scoring factors
50 // (such as recency of last visit, times visited, times the URL was typed, 53 // (such as recency of last visit, times visited, times the URL was typed,
51 // and the quality of the string match). There is a matching value range for 54 // and the quality of the string match). There is a matching value range for
52 // each of these scores for each factor. Note that the top score is greater 55 // each of these scores for each factor. Note that the top score is greater
53 // than |kMaxTotalScore|. The score for each candidate will be capped in the 56 // than |kMaxTotalScore|. The score for each candidate will be capped in the
54 // final calculation. 57 // final calculation.
(...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after
108 score += (value - value_ranks[i]) * 111 score += (value - value_ranks[i]) *
109 (kScoreRank[i - 1] - kScoreRank[i]) / 112 (kScoreRank[i - 1] - kScoreRank[i]) /
110 (value_ranks[i - 1] - value_ranks[i]); 113 (value_ranks[i - 1] - value_ranks[i]);
111 } 114 }
112 return score; 115 return score;
113 } 116 }
114 117
115 // InMemoryURLIndex's Private Data --------------------------------------------- 118 // InMemoryURLIndex's Private Data ---------------------------------------------
116 119
117 URLIndexPrivateData::URLIndexPrivateData() 120 URLIndexPrivateData::URLIndexPrivateData()
118 : pre_filter_item_count_(0), 121 : restored_cache_version_(0),
122 saved_cache_version_(kCurrentCacheFileVersion),
123 pre_filter_item_count_(0),
119 post_filter_item_count_(0), 124 post_filter_item_count_(0),
120 post_scoring_item_count_(0) { 125 post_scoring_item_count_(0) {
121 URLIndexPrivateData::InitializeSchemeWhitelist(&scheme_whitelist_); 126 URLIndexPrivateData::InitializeSchemeWhitelist(&scheme_whitelist_);
122 } 127 }
123 128
124 URLIndexPrivateData::~URLIndexPrivateData() {} 129 URLIndexPrivateData::~URLIndexPrivateData() {}
125 130
126 void URLIndexPrivateData::Clear() { 131 void URLIndexPrivateData::Clear() {
127 word_list_.clear(); 132 word_list_.clear();
128 available_words_.clear(); 133 available_words_.clear();
129 word_map_.clear(); 134 word_map_.clear();
130 char_word_map_.clear(); 135 char_word_map_.clear();
131 word_id_history_map_.clear(); 136 word_id_history_map_.clear();
132 history_id_word_map_.clear(); 137 history_id_word_map_.clear();
133 history_info_map_.clear(); 138 history_info_map_.clear();
139 word_starts_map_.clear();
134 } 140 }
135 141
136 // Cache Updating -------------------------------------------------------------- 142 // Cache Updating --------------------------------------------------------------
137 143
138 bool URLIndexPrivateData::IndexRow(const URLRow& row) { 144 bool URLIndexPrivateData::IndexRow(const URLRow& row) {
139 const GURL& gurl(row.url()); 145 const GURL& gurl(row.url());
140 146
141 // Index only URLs with a whitelisted scheme. 147 // Index only URLs with a whitelisted scheme.
142 if (!URLIndexPrivateData::URLSchemeIsWhitelisted(gurl)) 148 if (!URLIndexPrivateData::URLSchemeIsWhitelisted(gurl))
143 return false; 149 return false;
(...skipping 10 matching lines...) Expand all
154 160
155 // Add the row for quick lookup in the history info store. 161 // Add the row for quick lookup in the history info store.
156 URLRow new_row(GURL(url), row_id); 162 URLRow new_row(GURL(url), row_id);
157 new_row.set_visit_count(row.visit_count()); 163 new_row.set_visit_count(row.visit_count());
158 new_row.set_typed_count(row.typed_count()); 164 new_row.set_typed_count(row.typed_count());
159 new_row.set_last_visit(row.last_visit()); 165 new_row.set_last_visit(row.last_visit());
160 new_row.set_title(row.title()); 166 new_row.set_title(row.title());
161 history_info_map_[history_id] = new_row; 167 history_info_map_[history_id] = new_row;
162 168
163 // Index the words contained in the URL and title of the row. 169 // Index the words contained in the URL and title of the row.
164 AddRowWordsToIndex(new_row); 170 WordStarts word_starts;
171 AddRowWordsToIndex(new_row, &word_starts);
172 word_starts_map_[history_id] = word_starts;
165 return true; 173 return true;
166 } 174 }
167 175
168 void URLIndexPrivateData::AddRowWordsToIndex(const URLRow& row) { 176 void URLIndexPrivateData::AddRowWordsToIndex(const URLRow& row,
177 WordStarts* word_starts) {
169 HistoryID history_id = static_cast<HistoryID>(row.id()); 178 HistoryID history_id = static_cast<HistoryID>(row.id());
170 // Split URL into individual, unique words then add in the title words. 179 // Split URL into individual, unique words then add in the title words.
171 const GURL& gurl(row.url()); 180 const GURL& gurl(row.url());
172 string16 url(net::FormatUrl(gurl, languages_, 181 string16 url(net::FormatUrl(gurl, languages_,
173 net::kFormatUrlOmitUsernamePassword, 182 net::kFormatUrlOmitUsernamePassword,
174 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS, 183 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS,
175 NULL, NULL, NULL)); 184 NULL, NULL, NULL));
176 url = base::i18n::ToLower(url); 185 url = base::i18n::ToLower(url);
177 String16Set url_words = String16SetFromString16(url); 186 String16Set url_words = String16SetFromString16(url,
178 String16Set title_words = String16SetFromString16(row.title()); 187 word_starts ? &word_starts->url_word_starts_ : NULL);
188 String16Set title_words = String16SetFromString16(row.title(),
189 word_starts ? &word_starts->title_word_starts_ : NULL);
179 String16Set words; 190 String16Set words;
180 std::set_union(url_words.begin(), url_words.end(), 191 std::set_union(url_words.begin(), url_words.end(),
181 title_words.begin(), title_words.end(), 192 title_words.begin(), title_words.end(),
182 std::insert_iterator<String16Set>(words, words.begin())); 193 std::insert_iterator<String16Set>(words, words.begin()));
183 for (String16Set::iterator word_iter = words.begin(); 194 for (String16Set::iterator word_iter = words.begin();
184 word_iter != words.end(); ++word_iter) 195 word_iter != words.end(); ++word_iter)
185 AddWordToIndex(*word_iter, history_id); 196 AddWordToIndex(*word_iter, history_id);
186 197
187 search_term_cache_.clear(); // Invalidate the term cache. 198 search_term_cache_.clear(); // Invalidate the term cache.
188 } 199 }
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after
239 word_id_set.insert(word_id); 250 word_id_set.insert(word_id);
240 char_word_map_[uni_char] = word_id_set; 251 char_word_map_[uni_char] = word_id_set;
241 } 252 }
242 } 253 }
243 } 254 }
244 255
245 void URLIndexPrivateData::RemoveRowFromIndex(const URLRow& row) { 256 void URLIndexPrivateData::RemoveRowFromIndex(const URLRow& row) {
246 RemoveRowWordsFromIndex(row); 257 RemoveRowWordsFromIndex(row);
247 HistoryID history_id = static_cast<HistoryID>(row.id()); 258 HistoryID history_id = static_cast<HistoryID>(row.id());
248 history_info_map_.erase(history_id); 259 history_info_map_.erase(history_id);
260 word_starts_map_.erase(history_id);
249 } 261 }
250 262
251 void URLIndexPrivateData::RemoveRowWordsFromIndex(const URLRow& row) { 263 void URLIndexPrivateData::RemoveRowWordsFromIndex(const URLRow& row) {
252 // Remove the entries in history_id_word_map_ and word_id_history_map_ for 264 // Remove the entries in history_id_word_map_ and word_id_history_map_ for
253 // this row. 265 // this row.
254 HistoryID history_id = static_cast<HistoryID>(row.id()); 266 HistoryID history_id = static_cast<HistoryID>(row.id());
255 WordIDSet word_id_set = history_id_word_map_[history_id]; 267 WordIDSet word_id_set = history_id_word_map_[history_id];
256 history_id_word_map_.erase(history_id); 268 history_id_word_map_.erase(history_id);
257 269
258 // Reconcile any changes to word usage. 270 // Reconcile any changes to word usage.
(...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after
321 row_to_update.set_visit_count(row.visit_count()); 333 row_to_update.set_visit_count(row.visit_count());
322 row_to_update.set_typed_count(row.typed_count()); 334 row_to_update.set_typed_count(row.typed_count());
323 row_to_update.set_last_visit(row.last_visit()); 335 row_to_update.set_last_visit(row.last_visit());
324 // While the URL is guaranteed to remain stable, the title may have 336 // While the URL is guaranteed to remain stable, the title may have
325 // changed. If so, then update the index with the changed words. 337 // changed. If so, then update the index with the changed words.
326 if (title_updated) { 338 if (title_updated) {
327 // Clear all words associated with this row and re-index both the 339 // Clear all words associated with this row and re-index both the
328 // URL and title. 340 // URL and title.
329 RemoveRowWordsFromIndex(row_to_update); 341 RemoveRowWordsFromIndex(row_to_update);
330 row_to_update.set_title(row.title()); 342 row_to_update.set_title(row.title());
331 AddRowWordsToIndex(row_to_update); 343 WordStarts word_starts;
344 AddRowWordsToIndex(row_to_update, &word_starts);
345 word_starts_map_[row_id] = word_starts;
332 } 346 }
333 row_was_updated = true; 347 row_was_updated = true;
334 } 348 }
335 } else { 349 } else {
336 // This indexed row no longer qualifies and will be de-indexed by 350 // This indexed row no longer qualifies and will be de-indexed by
337 // clearing all words associated with this row. 351 // clearing all words associated with this row.
338 RemoveRowFromIndex(row); 352 RemoveRowFromIndex(row);
339 row_was_updated = true; 353 row_was_updated = true;
340 } 354 }
341 if (row_was_updated) 355 if (row_was_updated)
(...skipping 75 matching lines...) Expand 10 before | Expand all | Expand 10 after
417 // the final filtering we need whitespace separated substrings possibly 431 // the final filtering we need whitespace separated substrings possibly
418 // containing escaped characters. 432 // containing escaped characters.
419 string16 lower_raw_string(base::i18n::ToLower(search_string)); 433 string16 lower_raw_string(base::i18n::ToLower(search_string));
420 string16 lower_unescaped_string = 434 string16 lower_unescaped_string =
421 net::UnescapeURLComponent(lower_raw_string, 435 net::UnescapeURLComponent(lower_raw_string,
422 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS); 436 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
423 // Extract individual 'words' (as opposed to 'terms'; see below) from the 437 // Extract individual 'words' (as opposed to 'terms'; see below) from the
424 // search string. When the user types "colspec=ID%20Mstone Release" we get 438 // search string. When the user types "colspec=ID%20Mstone Release" we get
425 // four 'words': "colspec", "id", "mstone" and "release". 439 // four 'words': "colspec", "id", "mstone" and "release".
426 String16Vector lower_words( 440 String16Vector lower_words(
427 history::String16VectorFromString16(lower_unescaped_string, false)); 441 history::String16VectorFromString16(lower_unescaped_string, false, NULL));
428 ScoredHistoryMatches scored_items; 442 ScoredHistoryMatches scored_items;
429 443
430 // Do nothing if we have indexed no words (probably because we've not been 444 // Do nothing if we have indexed no words (probably because we've not been
431 // initialized yet) or the search string has no words. 445 // initialized yet) or the search string has no words.
432 if (word_list_.empty() || lower_words.empty()) { 446 if (word_list_.empty() || lower_words.empty()) {
433 search_term_cache_.clear(); // Invalidate the term cache. 447 search_term_cache_.clear(); // Invalidate the term cache.
434 return scored_items; 448 return scored_items;
435 } 449 }
436 450
437 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep 451 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
(...skipping 91 matching lines...) Expand 10 before | Expand all | Expand 10 after
529 543
530 void URLIndexPrivateData::AddHistoryMatch::operator()( 544 void URLIndexPrivateData::AddHistoryMatch::operator()(
531 const HistoryID history_id) { 545 const HistoryID history_id) {
532 HistoryInfoMap::const_iterator hist_pos = 546 HistoryInfoMap::const_iterator hist_pos =
533 private_data_.history_info_map_.find(history_id); 547 private_data_.history_info_map_.find(history_id);
534 // Note that a history_id may be present in the word_id_history_map_ yet not 548 // Note that a history_id may be present in the word_id_history_map_ yet not
535 // be found in the history_info_map_. This occurs when an item has been 549 // be found in the history_info_map_. This occurs when an item has been
536 // deleted by the user or the item no longer qualifies as a quick result. 550 // deleted by the user or the item no longer qualifies as a quick result.
537 if (hist_pos != private_data_.history_info_map_.end()) { 551 if (hist_pos != private_data_.history_info_map_.end()) {
538 const URLRow& hist_item = hist_pos->second; 552 const URLRow& hist_item = hist_pos->second;
539 ScoredHistoryMatch match( 553 WordStartsMap::const_iterator starts_pos =
540 ScoredMatchForURL(hist_item, lower_string_, lower_terms_)); 554 private_data_.word_starts_map_.find(history_id);
555 DCHECK(starts_pos != private_data_.word_starts_map_.end());
556 ScoredHistoryMatch match(ScoredMatchForURL(hist_item, lower_string_,
557 lower_terms_,
558 starts_pos->second));
541 if (match.raw_score > 0) 559 if (match.raw_score > 0)
542 scored_matches_.push_back(match); 560 scored_matches_.push_back(match);
543 } 561 }
544 } 562 }
545 563
546 // static 564 // static
547 // TODO(mrossetti): This can be made a ctor for ScoredHistoryMatch. 565 // TODO(mrossetti): This can be made a ctor for ScoredHistoryMatch.
548 ScoredHistoryMatch URLIndexPrivateData::ScoredMatchForURL( 566 ScoredHistoryMatch URLIndexPrivateData::ScoredMatchForURL(
549 const URLRow& row, 567 const URLRow& row,
550 const string16& lower_string, 568 const string16& lower_string,
551 const String16Vector& terms) { 569 const String16Vector& terms,
570 const WordStarts& word_starts) {
552 ScoredHistoryMatch match(row); 571 ScoredHistoryMatch match(row);
553 GURL gurl = row.url(); 572 GURL gurl = row.url();
554 if (!gurl.is_valid()) 573 if (!gurl.is_valid())
555 return match; 574 return match;
556 575
557 // Figure out where each search term appears in the URL and/or page title 576 // Figure out where each search term appears in the URL and/or page title
558 // so that we can score as well as provide autocomplete highlighting. 577 // so that we can score as well as provide autocomplete highlighting.
559 string16 url = base::i18n::ToLower(UTF8ToUTF16(gurl.spec())); 578 string16 url = base::i18n::ToLower(UTF8ToUTF16(gurl.spec()));
560 string16 title = base::i18n::ToLower(row.title()); 579 string16 title = base::i18n::ToLower(row.title());
561 int term_num = 0; 580 int term_num = 0;
(...skipping 95 matching lines...) Expand 10 before | Expand all | Expand 10 after
657 for (size_t i = 1; i < matches.size(); ++i) { 676 for (size_t i = 1; i < matches.size(); ++i) {
658 if (matches[i - 1].term_num > matches[i].term_num) 677 if (matches[i - 1].term_num > matches[i].term_num)
659 ++out_of_order; 678 ++out_of_order;
660 } 679 }
661 order_value = (max_possible_out_of_order - out_of_order) * kOrderMaxValue / 680 order_value = (max_possible_out_of_order - out_of_order) * kOrderMaxValue /
662 max_possible_out_of_order; 681 max_possible_out_of_order;
663 } 682 }
664 683
665 // Score component for how early in the match string the first search term 684 // Score component for how early in the match string the first search term
666 // appears. Start with kStartMaxValue points and discount by 685 // appears. Start with kStartMaxValue points and discount by
667 // kStartMaxValue/kMaxSignificantStart points for each character later than 686 // kStartMaxValue/kMaxSignificantChars points for each character later than
668 // the first at which the term begins. No points are earned if the start of 687 // the first at which the term begins. No points are earned if the start of
669 // the match occurs at or after kMaxSignificantStart. 688 // the match occurs at or after kMaxSignificantChars.
670 const size_t kMaxSignificantStart = 50;
671 const int kStartMaxValue = 1000; 689 const int kStartMaxValue = 1000;
672 int start_value = (kMaxSignificantStart - 690 int start_value = (kMaxSignificantChars -
673 std::min(kMaxSignificantStart, matches[0].offset)) * kStartMaxValue / 691 std::min(kMaxSignificantChars, matches[0].offset)) * kStartMaxValue /
674 kMaxSignificantStart; 692 kMaxSignificantChars;
675 693
676 // Score component for how much of the matched string the input terms cover. 694 // Score component for how much of the matched string the input terms cover.
677 // kCompleteMaxValue points times the fraction of the URL/page title string 695 // kCompleteMaxValue points times the fraction of the URL/page title string
678 // that was matched. 696 // that was matched.
679 size_t term_length_total = std::accumulate(matches.begin(), matches.end(), 697 size_t term_length_total = std::accumulate(matches.begin(), matches.end(),
680 0, AccumulateMatchLength); 698 0, AccumulateMatchLength);
681 const size_t kMaxSignificantLength = 50; 699 const size_t kMaxSignificantLength = 50;
682 size_t max_significant_length = 700 size_t max_significant_length =
683 std::min(max_length, std::max(term_length_total, kMaxSignificantLength)); 701 std::min(max_length, std::max(term_length_total, kMaxSignificantLength));
684 const int kCompleteMaxValue = 1000; 702 const int kCompleteMaxValue = 1000;
(...skipping 236 matching lines...) Expand 10 before | Expand all | Expand 10 after
921 } 939 }
922 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime", 940 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime",
923 base::TimeTicks::Now() - beginning_time); 941 base::TimeTicks::Now() - beginning_time);
924 return true; 942 return true;
925 } 943 }
926 944
927 void URLIndexPrivateData::SavePrivateData( 945 void URLIndexPrivateData::SavePrivateData(
928 InMemoryURLIndexCacheItem* cache) const { 946 InMemoryURLIndexCacheItem* cache) const {
929 DCHECK(cache); 947 DCHECK(cache);
930 cache->set_timestamp(base::Time::Now().ToInternalValue()); 948 cache->set_timestamp(base::Time::Now().ToInternalValue());
949 cache->set_version(saved_cache_version_);
931 // history_item_count_ is no longer used but rather than change the protobuf 950 // history_item_count_ is no longer used but rather than change the protobuf
932 // definition use a placeholder. This will go away with the switch to SQLite. 951 // definition use a placeholder. This will go away with the switch to SQLite.
933 cache->set_history_item_count(0); 952 cache->set_history_item_count(0);
934 SaveWordList(cache); 953 SaveWordList(cache);
935 SaveWordMap(cache); 954 SaveWordMap(cache);
936 SaveCharWordMap(cache); 955 SaveCharWordMap(cache);
937 SaveWordIDHistoryMap(cache); 956 SaveWordIDHistoryMap(cache);
938 SaveHistoryInfoMap(cache); 957 SaveHistoryInfoMap(cache);
958 SaveWordStartsMap(cache);
939 } 959 }
940 960
941 void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem* cache) const { 961 void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem* cache) const {
942 if (word_list_.empty()) 962 if (word_list_.empty())
943 return; 963 return;
944 WordListItem* list_item = cache->mutable_word_list(); 964 WordListItem* list_item = cache->mutable_word_list();
945 list_item->set_word_count(word_list_.size()); 965 list_item->set_word_count(word_list_.size());
946 for (String16Vector::const_iterator iter = word_list_.begin(); 966 for (String16Vector::const_iterator iter = word_list_.begin();
947 iter != word_list_.end(); ++iter) 967 iter != word_list_.end(); ++iter)
948 list_item->add_word(UTF16ToUTF8(*iter)); 968 list_item->add_word(UTF16ToUTF8(*iter));
(...skipping 64 matching lines...) Expand 10 before | Expand all | Expand 10 after
1013 // is no need to save search_term_cache_ (not persistent), 1033 // is no need to save search_term_cache_ (not persistent),
1014 // languages_, etc. 1034 // languages_, etc.
1015 map_entry->set_visit_count(url_row.visit_count()); 1035 map_entry->set_visit_count(url_row.visit_count());
1016 map_entry->set_typed_count(url_row.typed_count()); 1036 map_entry->set_typed_count(url_row.typed_count());
1017 map_entry->set_last_visit(url_row.last_visit().ToInternalValue()); 1037 map_entry->set_last_visit(url_row.last_visit().ToInternalValue());
1018 map_entry->set_url(url_row.url().spec()); 1038 map_entry->set_url(url_row.url().spec());
1019 map_entry->set_title(UTF16ToUTF8(url_row.title())); 1039 map_entry->set_title(UTF16ToUTF8(url_row.title()));
1020 } 1040 }
1021 } 1041 }
1022 1042
1043 void URLIndexPrivateData::SaveWordStartsMap(
1044 InMemoryURLIndexCacheItem* cache) const {
1045 if (word_starts_map_.empty())
1046 return;
1047 // For unit testing: Enable saving of the cache as an earlier version to
1048 // allow testing of cache file upgrading in ReadFromFile().
Peter Kasting 2012/03/09 02:37:43 Nit: Is this the best way of doing this? For most
mrossetti 2012/03/14 23:23:49 I hear you, and understand. Since saving using pro
1049 if (saved_cache_version_ < 1)
1050 return;
1051
1052 WordStartsMapItem* map_item = cache->mutable_word_starts_map();
1053 map_item->set_item_count(word_starts_map_.size());
1054 for (WordStartsMap::const_iterator iter = word_starts_map_.begin();
1055 iter != word_starts_map_.end(); ++iter) {
1056 WordStartsMapEntry* map_entry = map_item->add_word_starts_map_entry();
1057 map_entry->set_history_id(iter->first);
1058 const WordStarts& word_starts(iter->second);
1059 for (std::vector<int>::const_iterator siter =
1060 word_starts.url_word_starts_.begin();
1061 siter != word_starts.url_word_starts_.end(); ++siter)
1062 map_entry->add_url_word_starts(*siter);
1063 for (std::vector<int>::const_iterator siter =
1064 word_starts.title_word_starts_.begin();
1065 siter != word_starts.title_word_starts_.end(); ++siter)
1066 map_entry->add_title_word_starts(*siter);
1067 }
1068 }
1069
1023 // Cache Restoring ------------------------------------------------------------- 1070 // Cache Restoring -------------------------------------------------------------
1024 1071
1025 bool URLIndexPrivateData::RestoreFromFile(const FilePath& file_path) { 1072 bool URLIndexPrivateData::RestoreFromFile(const FilePath& file_path) {
1026 // TODO(mrossetti): Figure out how to determine if the cache is up-to-date. 1073 // TODO(mrossetti): Figure out how to determine if the cache is up-to-date.
1027 // That is: ensure that the database has not been modified since the cache 1074 // That is: ensure that the database has not been modified since the cache
1028 // was last saved. DB file modification date is inadequate. There are no 1075 // was last saved. DB file modification date is inadequate. There are no
1029 // SQLite table checksums automatically stored. 1076 // SQLite table checksums automatically stored.
1030 Clear(); // Start with a clean slate. 1077 Clear(); // Start with a clean slate.
1031 1078
1032 // FIXME(mrossetti): Move File IO to another thread. 1079 // FIXME(mrossetti): Move File IO to another thread.
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after
1083 rebuilt_data->history_id_word_map_.size()); 1130 rebuilt_data->history_id_word_map_.size());
1084 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords", 1131 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
1085 rebuilt_data->word_map_.size()); 1132 rebuilt_data->word_map_.size());
1086 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars", 1133 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
1087 rebuilt_data->char_word_map_.size()); 1134 rebuilt_data->char_word_map_.size());
1088 return rebuilt_data.release(); 1135 return rebuilt_data.release();
1089 } 1136 }
1090 1137
1091 bool URLIndexPrivateData::RestorePrivateData( 1138 bool URLIndexPrivateData::RestorePrivateData(
1092 const InMemoryURLIndexCacheItem& cache) { 1139 const InMemoryURLIndexCacheItem& cache) {
1140 if (cache.has_version())
1141 restored_cache_version_ = cache.version();
1093 return RestoreWordList(cache) && RestoreWordMap(cache) && 1142 return RestoreWordList(cache) && RestoreWordMap(cache) &&
1094 RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) && 1143 RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) &&
1095 RestoreHistoryInfoMap(cache); 1144 RestoreHistoryInfoMap(cache) && RestoreWordStartsMap(cache);
1096 } 1145 }
1097 1146
1098 bool URLIndexPrivateData::RestoreWordList( 1147 bool URLIndexPrivateData::RestoreWordList(
1099 const InMemoryURLIndexCacheItem& cache) { 1148 const InMemoryURLIndexCacheItem& cache) {
1100 if (!cache.has_word_list()) 1149 if (!cache.has_word_list())
1101 return false; 1150 return false;
1102 const WordListItem& list_item(cache.word_list()); 1151 const WordListItem& list_item(cache.word_list());
1103 uint32 expected_item_count = list_item.word_count(); 1152 uint32 expected_item_count = list_item.word_count();
1104 uint32 actual_item_count = list_item.word_size(); 1153 uint32 actual_item_count = list_item.word_size();
1105 if (actual_item_count == 0 || actual_item_count != expected_item_count) 1154 if (actual_item_count == 0 || actual_item_count != expected_item_count)
(...skipping 100 matching lines...) Expand 10 before | Expand all | Expand 10 after
1206 url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit())); 1255 url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit()));
1207 if (iter->has_title()) { 1256 if (iter->has_title()) {
1208 string16 title(UTF8ToUTF16(iter->title())); 1257 string16 title(UTF8ToUTF16(iter->title()));
1209 url_row.set_title(title); 1258 url_row.set_title(title);
1210 } 1259 }
1211 history_info_map_[history_id] = url_row; 1260 history_info_map_[history_id] = url_row;
1212 } 1261 }
1213 return true; 1262 return true;
1214 } 1263 }
1215 1264
1265 bool URLIndexPrivateData::RestoreWordStartsMap(
1266 const InMemoryURLIndexCacheItem& cache) {
1267 // Note that this function must be called after RestoreHistoryInfoMap() has
1268 // been run as the word starts may have to be recalculated from the urls and
1269 // page titles.
1270 if (cache.has_word_starts_map()) {
1271 const WordStartsMapItem& list_item(cache.word_starts_map());
1272 uint32 expected_item_count = list_item.item_count();
1273 uint32 actual_item_count = list_item.word_starts_map_entry_size();
1274 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1275 return false;
1276 const RepeatedPtrField<WordStartsMapEntry>&
1277 entries(list_item.word_starts_map_entry());
1278 for (RepeatedPtrField<WordStartsMapEntry>::const_iterator iter =
1279 entries.begin(); iter != entries.end(); ++iter) {
1280 HistoryID history_id = iter->history_id();
1281 WordStarts word_starts;
1282 // Restore the URL word starts.
1283 const RepeatedField<int32>& url_starts(iter->url_word_starts());
1284 for (RepeatedField<int32>::const_iterator jiter = url_starts.begin();
Peter Kasting 2012/03/09 02:37:43 Nit: Argh... use |i| and |j| rather than |iter| an
1285 jiter != url_starts.end(); ++jiter)
1286 word_starts.url_word_starts_.push_back(*jiter);
1287 // Restore the page title word starts.
1288 const RepeatedField<int32>& title_starts(iter->title_word_starts());
1289 for (RepeatedField<int32>::const_iterator jiter = title_starts.begin();
1290 jiter != title_starts.end(); ++jiter)
1291 word_starts.title_word_starts_.push_back(*jiter);
1292 word_starts_map_[history_id] = word_starts;
1293 }
1294 } else {
1295 // Since the cache did not contain any word starts we must rebuild then from
1296 // the URL and page titles.
1297 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
1298 iter != history_info_map_.end(); ++iter) {
1299 WordStarts word_starts;
1300 const URLRow& row(iter->second);
1301 string16 url(net::FormatUrl(row.url(), languages_,
1302 net::kFormatUrlOmitUsernamePassword,
1303 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS,
1304 NULL, NULL, NULL));
1305 url = base::i18n::ToLower(url);
1306 String16VectorFromString16(url, false, &word_starts.url_word_starts_);
1307 String16VectorFromString16(
1308 row.title(), false, &word_starts.title_word_starts_);
1309 word_starts_map_[iter->first] = word_starts;
1310 }
1311 }
1312 return true;
1313 }
1314
1216 } // namespace history 1315 } // namespace history
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698