| Index: chrome/browser/history/url_index_private_data.cc
|
| ===================================================================
|
| --- chrome/browser/history/url_index_private_data.cc (revision 152962)
|
| +++ chrome/browser/history/url_index_private_data.cc (working copy)
|
| @@ -14,539 +14,45 @@
|
| #include "base/basictypes.h"
|
| #include "base/file_util.h"
|
| #include "base/i18n/case_conversion.h"
|
| +#include "base/metrics/histogram.h"
|
| #include "base/string_util.h"
|
| +#include "base/time.h"
|
| #include "base/utf_string_conversions.h"
|
| #include "chrome/browser/autocomplete/autocomplete_provider.h"
|
| +#include "chrome/browser/autocomplete/url_prefix.h"
|
| #include "chrome/browser/history/history_database.h"
|
| -#include "chrome/browser/history/in_memory_url_cache_database.h"
|
| -#include "chrome/common/chrome_constants.h"
|
| -#include "chrome/common/url_constants.h"
|
| +#include "chrome/browser/history/in_memory_url_index.h"
|
| #include "content/public/browser/browser_thread.h"
|
| #include "content/public/browser/notification_details.h"
|
| #include "content/public/browser/notification_service.h"
|
| #include "content/public/browser/notification_source.h"
|
| #include "net/base/net_util.h"
|
| +#include "third_party/protobuf/src/google/protobuf/repeated_field.h"
|
|
|
| -using content::BrowserThread;
|
| +using google::protobuf::RepeatedField;
|
| +using google::protobuf::RepeatedPtrField;
|
| +using in_memory_url_index::InMemoryURLIndexCacheItem;
|
|
|
| namespace history {
|
|
|
| -// Comparison function for sorting search terms by descending length.
|
| -bool LengthGreater(const string16& string_a, const string16& string_b) {
|
| - return string_a.length() > string_b.length();
|
| -}
|
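| +// Shorthand typedefs for the nested protobuf message types that make up the
|
| +// InMemoryURLIndexCacheItem; these are used by the cache save and restore
|
| +// functions below.
|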
| +typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem;
|
| +typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry;
|
| +typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem;
|
| +typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem;
|
| +typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry
|
| + CharWordMapEntry;
|
| +typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem
|
| + WordIDHistoryMapItem;
|
| +typedef imui::
|
| + InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry
|
| + WordIDHistoryMapEntry;
|
| +typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem;
|
| +typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry
|
| + HistoryInfoMapEntry;
|
| +typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem WordStartsMapItem;
|
| +typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem_WordStartsMapEntry
|
| + WordStartsMapEntry;
|
|
|
| -// Initializes a whitelist of URL schemes.
|
| -void InitializeSchemeWhitelist(std::set<std::string>* whitelist) {
|
| - DCHECK(whitelist);
|
| - if (!whitelist->empty())
|
| - return; // Nothing to do, already initialized.
|
| - whitelist->insert(std::string(chrome::kAboutScheme));
|
| - whitelist->insert(std::string(chrome::kChromeUIScheme));
|
| - whitelist->insert(std::string(chrome::kFileScheme));
|
| - whitelist->insert(std::string(chrome::kFtpScheme));
|
| - whitelist->insert(std::string(chrome::kHttpScheme));
|
| - whitelist->insert(std::string(chrome::kHttpsScheme));
|
| - whitelist->insert(std::string(chrome::kMailToScheme));
|
| -}
|
| -
|
| -// CacheTransaction ------------------------------------------------------------
|
| -
|
| -// Simple automatic helper class encapsulating calls to BeginTransaction/
|
| -// CommitTransaction.
|
| -class CacheTransaction {
|
| - public:
|
| - explicit CacheTransaction(InMemoryURLCacheDatabase* db);
|
| - ~CacheTransaction();
|
| -
|
| - private:
|
| - InMemoryURLCacheDatabase* db_;
|
| -
|
| - DISALLOW_COPY_AND_ASSIGN(CacheTransaction);
|
| -};
|
| -
|
| -CacheTransaction::CacheTransaction(InMemoryURLCacheDatabase* db)
|
| - : db_(db) {
|
| - if (db_)
|
| - db_->BeginTransaction();
|
| -}
|
| -
|
| -CacheTransaction::~CacheTransaction() {
|
| - if (db_)
|
| - db_->CommitTransaction();
|
| -}
|
| -
|
| -// URLIndexPrivateData Public (but only to InMemoryURLIndex) Functions ---------
|
| -
|
| -URLIndexPrivateData::URLIndexPrivateData(const FilePath& history_dir,
|
| - const std::string& languages)
|
| - : history_dir_(history_dir),
|
| - languages_(languages),
|
| - cache_enabled_(true),
|
| - shutdown_(false),
|
| - lock_(),
|
| - pre_filter_item_count_(0),
|
| - post_filter_item_count_(0),
|
| - post_scoring_item_count_(0) {
|
| - InitializeSchemeWhitelist(&scheme_whitelist_);
|
| -}
|
| -
|
| -bool URLIndexPrivateData::Init(
|
| - base::SequencedWorkerPool::SequenceToken sequence_token) {
|
| - // Since init occurs on the DB thread, it is possible that the profile was
|
| - // told to shutdown before this task had a chance to kick off.
|
| - base::AutoLock lock(lock_);
|
| - if (shutdown_)
|
| - return false;
|
| - cache_db_ = new InMemoryURLCacheDatabase();
|
| - FilePath dir_path;
|
| - set_cache_enabled(GetCacheDBPath(&dir_path) &&
|
| - cache_db_->Init(dir_path, sequence_token));
|
| - return cache_enabled();
|
| -}
|
| -
|
| -void URLIndexPrivateData::Reset() {
|
| - Clear();
|
| - if (cache_enabled())
|
| - cache_db_->Reset();
|
| -}
|
| -
|
| -bool URLIndexPrivateData::Empty() const {
|
| - return history_info_map_.empty();
|
| -}
|
| -
|
| -URLIndexPrivateData* URLIndexPrivateData::Snapshot() const {
|
| - return new URLIndexPrivateData(*this);
|
| -}
|
| -
|
| -void URLIndexPrivateData::Shutdown() {
|
| - base::AutoLock lock(lock_);
|
| - shutdown_ = true;
|
| - if (cache_enabled())
|
| - cache_db_->Shutdown();
|
| -}
|
| -
|
| -// Helper predicates for ValidateConsistency.
|
| -template <typename Pair>
|
| -struct SelectFirst : std::unary_function<Pair, typename Pair::first_type> {
|
| - const typename Pair::first_type& operator()(const Pair& x) const {
|
| - return x.first;
|
| - }
|
| -};
|
| -
|
| -template <typename Pair>
|
| -struct SelectSecond : std::unary_function<Pair, typename Pair::second_type> {
|
| - const typename Pair::second_type& operator()(const Pair& x) const {
|
| - return x.second;
|
| - }
|
| -};
|
| -
|
| -template <typename SourcePair, typename Target>
|
| -struct TargetInserter : std::unary_function<SourcePair, void> {
|
| - explicit TargetInserter(Target* target) : target_(target) {}
|
| -
|
| - void operator()(const SourcePair& source) {
|
| - target_->insert(source.second.begin(), source.second.end());
|
| - }
|
| -
|
| - Target* target_;
|
| -};
|
| -
|
| -bool URLIndexPrivateData::ValidateConsistency() const {
|
| - // Scope things so that a large data set doesn't unnecessarily accumulate and
|
| - // hang around.
|
| - {
|
| - // Make a set of WordIDs from word_map_.
|
| - WordIDSet word_id_set_a;
|
| - std::transform(word_map_.begin(), word_map_.end(),
|
| - std::inserter(word_id_set_a, word_id_set_a.begin()),
|
| - SelectSecond<WordMap::value_type>());
|
| -
|
| - // Compare word_map_'s word set to the words from word_id_history_map_.
|
| - {
|
| - WordIDSet word_id_set_b;
|
| - std::transform(word_id_history_map_.begin(), word_id_history_map_.end(),
|
| - std::inserter(word_id_set_b, word_id_set_b.begin()),
|
| - SelectFirst<WordIDHistoryMap::value_type>());
|
| - WordIDSet leftovers;
|
| - std::set_symmetric_difference(
|
| - word_id_set_a.begin(), word_id_set_a.end(),
|
| - word_id_set_b.begin(), word_id_set_b.end(),
|
| - std::inserter(leftovers, leftovers.begin()));
|
| - if (!leftovers.empty())
|
| - return false;
|
| - }
|
| -
|
| - // Compare word_map_'s word set to the words from history_id_word_map_.
|
| - {
|
| - WordIDSet word_id_set_b;
|
| - std::for_each(history_id_word_map_.begin(), history_id_word_map_.end(),
|
| - TargetInserter<HistoryIDWordMap::value_type,
|
| - WordIDSet>(&word_id_set_b));
|
| - WordIDSet leftovers;
|
| - std::set_symmetric_difference(
|
| - word_id_set_a.begin(), word_id_set_a.end(),
|
| - word_id_set_b.begin(), word_id_set_b.end(),
|
| - std::inserter(leftovers, leftovers.begin()));
|
| - if (!leftovers.empty())
|
| - return false;
|
| - }
|
| -
|
| - // Compare word_map_'s word set to the words from char_word_map_.
|
| - {
|
| - WordIDSet word_id_set_b;
|
| - std::for_each(char_word_map_.begin(), char_word_map_.end(),
|
| - TargetInserter<CharWordIDMap::value_type,
|
| - WordIDSet>(&word_id_set_b));
|
| - WordIDSet leftovers;
|
| - std::set_symmetric_difference(
|
| - word_id_set_a.begin(), word_id_set_a.end(),
|
| - word_id_set_b.begin(), word_id_set_b.end(),
|
| - std::inserter(leftovers, leftovers.begin()));
|
| - if (!leftovers.empty())
|
| - return false;
|
| - }
|
| -
|
| - // Intersect available_words_ with set of WordIDs (created above from
|
| - // word_map_) and test for no common WordIDs.
|
| - {
|
| - WordIDSet leftovers;
|
| - std::set_intersection(word_id_set_a.begin(), word_id_set_a.end(),
|
| - available_words_.begin(), available_words_.end(),
|
| - std::inserter(leftovers, leftovers.begin()));
|
| - if (!leftovers.empty())
|
| - return false;
|
| - }
|
| - }
|
| -
|
| - {
|
| - // Make a set of HistoryIDs from history_info_map_.
|
| - HistoryIDSet history_id_set_a;
|
| - std::transform(history_info_map_.begin(), history_info_map_.end(),
|
| - std::inserter(history_id_set_a, history_id_set_a.begin()),
|
| - SelectFirst<HistoryInfoMap::value_type>());
|
| -
|
| - // Compare history_info_map_'s set to word_id_history_map_'s HistoryIDs.
|
| - {
|
| - HistoryIDSet history_id_set_b;
|
| - std::transform(history_info_map_.begin(), history_info_map_.end(),
|
| - std::inserter(history_id_set_b, history_id_set_b.begin()),
|
| - SelectFirst<HistoryInfoMap::value_type>());
|
| - HistoryIDSet leftovers;
|
| - std::set_symmetric_difference(
|
| - history_id_set_a.begin(), history_id_set_a.end(),
|
| - history_id_set_b.begin(), history_id_set_b.end(),
|
| - std::inserter(leftovers, leftovers.begin()));
|
| - if (!leftovers.empty())
|
| - return false;
|
| - }
|
| -
|
| - // Compare history_info_map_'s set to word_id_history_map_'s HistoryIDs.
|
| - {
|
| - HistoryIDSet history_id_set_b;
|
| - std::for_each(word_id_history_map_.begin(), word_id_history_map_.end(),
|
| - TargetInserter<WordIDHistoryMap::value_type,
|
| - HistoryIDSet>(&history_id_set_b));
|
| - HistoryIDSet leftovers;
|
| - std::set_symmetric_difference(
|
| - history_id_set_a.begin(), history_id_set_a.end(),
|
| - history_id_set_b.begin(), history_id_set_b.end(),
|
| - std::inserter(leftovers, leftovers.begin()));
|
| - if (!leftovers.empty())
|
| - return false;
|
| - }
|
| -
|
| - // Compare history_info_map_'s set to word_starts_map_'s HistoryIDs.
|
| - {
|
| - HistoryIDSet history_id_set_b;
|
| - std::transform(word_starts_map_.begin(), word_starts_map_.end(),
|
| - std::inserter(history_id_set_b, history_id_set_b.begin()),
|
| - SelectFirst<WordStartsMap::value_type>());
|
| - HistoryIDSet leftovers;
|
| - std::set_symmetric_difference(
|
| - history_id_set_a.begin(), history_id_set_a.end(),
|
| - history_id_set_b.begin(), history_id_set_b.end(),
|
| - std::inserter(leftovers, leftovers.begin()));
|
| - if (!leftovers.empty())
|
| - return false;
|
| - }
|
| - }
|
| -
|
| - // Make sets of words from word_list_ and word_map_ and test for equality.
|
| - {
|
| - String16Set word_list_set;
|
| - std::copy(word_list_.begin(), word_list_.end(),
|
| - std::inserter(word_list_set, word_list_set.end()));
|
| - String16Set word_map_set;
|
| - std::transform(word_map_.begin(), word_map_.end(),
|
| - std::inserter(word_map_set, word_map_set.begin()),
|
| - SelectFirst<WordMap::value_type>());
|
| - if (word_list_set != word_map_set)
|
| - return false;
|
| - }
|
| -
|
| - return true;
|
| -}
|
| -
|
| -ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(
|
| - const string16& search_string) {
|
| - pre_filter_item_count_ = 0;
|
| - post_filter_item_count_ = 0;
|
| - post_scoring_item_count_ = 0;
|
| - // The search string we receive may contain escaped characters. For reducing
|
| - // the index we need individual, lower-cased words, ignoring escapings. For
|
| - // the final filtering we need whitespace separated substrings possibly
|
| - // containing escaped characters.
|
| - string16 lower_raw_string(base::i18n::ToLower(search_string));
|
| - string16 lower_unescaped_string =
|
| - net::UnescapeURLComponent(lower_raw_string,
|
| - net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
|
| - // Extract individual 'words' (as opposed to 'terms'; see below) from the
|
| - // search string. When the user types "colspec=ID%20Mstone Release" we get
|
| - // four 'words': "colspec", "id", "mstone" and "release".
|
| - String16Vector lower_words(
|
| - history::String16VectorFromString16(lower_unescaped_string, false, NULL));
|
| - ScoredHistoryMatches scored_items;
|
| -
|
| - // Do nothing if we have indexed no words (probably because we've not been
|
| - // initialized yet) or the search string has no words.
|
| - if (word_list_.empty() || lower_words.empty()) {
|
| - search_term_cache_.clear(); // Invalidate the term cache.
|
| - return scored_items;
|
| - }
|
| -
|
| - // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
|
| - // approach.
|
| - ResetSearchTermCache();
|
| -
|
| - HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);
|
| -
|
| - // Trim the candidate pool if it is large. Note that we do not filter out
|
| - // items that do not contain the search terms as proper substrings -- doing
|
| - // so is the performance-costly operation we are trying to avoid in order
|
| - // to maintain omnibox responsiveness.
|
| - const size_t kItemsToScoreLimit = 500;
|
| - pre_filter_item_count_ = history_id_set.size();
|
| - // If we trim the results set we do not want to cache the results for next
|
| - // time as the user's ultimately desired result could easily be eliminated
|
| - // in this early rough filter.
|
| - bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);
|
| - if (was_trimmed) {
|
| - HistoryIDVector history_ids;
|
| - std::copy(history_id_set.begin(), history_id_set.end(),
|
| - std::back_inserter(history_ids));
|
| - // Trim down the set by sorting by typed-count, visit-count, and last
|
| - // visit.
|
| - HistoryItemFactorGreater
|
| - item_factor_functor(history_info_map_);
|
| - std::partial_sort(history_ids.begin(),
|
| - history_ids.begin() + kItemsToScoreLimit,
|
| - history_ids.end(),
|
| - item_factor_functor);
|
| - history_id_set.clear();
|
| - std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,
|
| - std::inserter(history_id_set, history_id_set.end()));
|
| - post_filter_item_count_ = history_id_set.size();
|
| - }
|
| -
|
| - // Pass over all of the candidates filtering out any without a proper
|
| - // substring match, inserting those which pass in order by score. Note that
|
| - // in this step we are using the raw search string complete with escaped
|
| - // URL elements. When the user has specifically typed something akin to
|
| - // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that
|
| - // specific substring appears in the URL or page title.
|
| -
|
| - // We call these 'terms' (as opposed to 'words'; see above) as in this case
|
| - // we only want to break up the search string on 'true' whitespace rather than
|
| - // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we
|
| - // get two 'terms': "colspec=id%20mstone" and "release".
|
| - history::String16Vector lower_raw_terms;
|
| - Tokenize(lower_raw_string, kWhitespaceUTF16, &lower_raw_terms);
|
| - scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),
|
| - AddHistoryMatch(*this, lower_raw_string,
|
| - lower_raw_terms, base::Time::Now())).ScoredMatches();
|
| -
|
| - // Select and sort only the top kMaxMatches results.
|
| - if (scored_items.size() > AutocompleteProvider::kMaxMatches) {
|
| - std::partial_sort(scored_items.begin(),
|
| - scored_items.begin() +
|
| - AutocompleteProvider::kMaxMatches,
|
| - scored_items.end(),
|
| - ScoredHistoryMatch::MatchScoreGreater);
|
| - scored_items.resize(AutocompleteProvider::kMaxMatches);
|
| - } else {
|
| - std::sort(scored_items.begin(), scored_items.end(),
|
| - ScoredHistoryMatch::MatchScoreGreater);
|
| - }
|
| - post_scoring_item_count_ = scored_items.size();
|
| -
|
| - if (was_trimmed) {
|
| - search_term_cache_.clear(); // Invalidate the term cache.
|
| - } else {
|
| - // Remove any stale SearchTermCacheItems.
|
| - for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
|
| - cache_iter != search_term_cache_.end(); ) {
|
| - if (!cache_iter->second.used_)
|
| - search_term_cache_.erase(cache_iter++);
|
| - else
|
| - ++cache_iter;
|
| - }
|
| - }
|
| -
|
| - return scored_items;
|
| -}
|
| -
|
| -bool URLIndexPrivateData::UpdateURL(const URLRow& row) {
|
| - // The row may or may not already be in our index. If it is not already
|
| - // indexed and it qualifies then it gets indexed. If it is already
|
| - // indexed and still qualifies then it gets updated, otherwise it
|
| - // is deleted from the index.
|
| - bool row_was_updated = false;
|
| - URLID row_id = row.id();
|
| - HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id);
|
| - if (row_pos == history_info_map_.end()) {
|
| - // This new row should be indexed if it qualifies.
|
| - URLRow new_row(row);
|
| - new_row.set_id(row_id);
|
| - row_was_updated = RowQualifiesAsSignificant(new_row, base::Time()) &&
|
| - IndexRow(new_row);
|
| - } else if (RowQualifiesAsSignificant(row, base::Time())) {
|
| - // This indexed row still qualifies and will be re-indexed.
|
| - // The url won't have changed but the title, visit count, etc.
|
| - // might have changed.
|
| - URLRow& row_to_update = row_pos->second;
|
| - bool title_updated = row_to_update.title() != row.title();
|
| - if (row_to_update.visit_count() != row.visit_count() ||
|
| - row_to_update.typed_count() != row.typed_count() ||
|
| - row_to_update.last_visit() != row.last_visit() || title_updated) {
|
| - CacheTransaction cache_transaction(cache_enabled() ?
|
| - cache_db_.get() : NULL);
|
| - row_to_update.set_visit_count(row.visit_count());
|
| - row_to_update.set_typed_count(row.typed_count());
|
| - row_to_update.set_last_visit(row.last_visit());
|
| - row_to_update.set_hidden(row.hidden());
|
| - // While the URL is guaranteed to remain stable, the title may have
|
| - // changed. If so, then update the index with the changed words.
|
| - if (title_updated) {
|
| - // Clear all words associated with this row and re-index both the
|
| - // URL and title.
|
| - RemoveRowWordsFromIndex(row_to_update);
|
| - row_to_update.set_title(row.title());
|
| - RowWordStarts word_starts;
|
| - AddRowWordsToIndex(row_to_update, &word_starts);
|
| - word_starts_map_[row_id] = word_starts;
|
| - // Replace all word_starts for the row.
|
| - if (cache_enabled()) {
|
| - cache_db_->RemoveWordStarts(row_id);
|
| - cache_db_->AddRowWordStarts(row_id, word_starts);
|
| - }
|
| - }
|
| - if (cache_enabled()) {
|
| - cache_db_->RemoveHistoryIDFromURLs(row_id);
|
| - cache_db_->AddHistoryToURLs(row_id, row);
|
| - }
|
| - row_was_updated = true;
|
| - }
|
| - } else {
|
| - // This indexed row no longer qualifies and will be de-indexed by
|
| - // clearing all words associated with this row.
|
| - RemoveRowFromIndex(row);
|
| - row_was_updated = true;
|
| - }
|
| - if (row_was_updated)
|
| - search_term_cache_.clear(); // This invalidates the cache.
|
| - return row_was_updated;
|
| -}
|
| -
|
| -// Helper functor for DeleteURL.
|
| -class HistoryInfoMapItemHasURL {
|
| - public:
|
| - explicit HistoryInfoMapItemHasURL(const GURL& url): url_(url) {}
|
| -
|
| - bool operator()(const std::pair<const HistoryID, URLRow>& item) {
|
| - return item.second.url() == url_;
|
| - }
|
| -
|
| - private:
|
| - const GURL& url_;
|
| -};
|
| -
|
| -bool URLIndexPrivateData::DeleteURL(const GURL& url) {
|
| - DCHECK(!BrowserThread::IsWellKnownThread(BrowserThread::UI) ||
|
| - BrowserThread::CurrentlyOn(BrowserThread::UI));
|
| - // Find the matching entry in the history_info_map_.
|
| - HistoryInfoMap::iterator pos = std::find_if(
|
| - history_info_map_.begin(),
|
| - history_info_map_.end(),
|
| - HistoryInfoMapItemHasURL(url));
|
| - if (pos == history_info_map_.end())
|
| - return false;
|
| - RemoveRowFromIndex(pos->second);
|
| - search_term_cache_.clear(); // This invalidates the cache.
|
| - return true;
|
| -}
|
| -
|
| -bool URLIndexPrivateData::RestoreFromCacheTask() {
|
| - DCHECK(!BrowserThread::IsWellKnownThread(BrowserThread::UI) ||
|
| - !BrowserThread::CurrentlyOn(BrowserThread::UI));
|
| - DCHECK(cache_db_);
|
| - if (IsShutdown())
|
| - return false;
|
| - Clear();
|
| - if (RestoreFromCache(cache_db_))
|
| - return true;
|
| - // If there was no SQLite-based cache then there might be an old,
|
| - // deprecated, protobuf-based cache file laying around. Get rid of it.
|
| - DeleteOldVersionCacheFile();
|
| - return false;
|
| -}
|
| -
|
| -// static
|
| -scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RebuildFromHistory(
|
| - HistoryDatabase* history_db,
|
| - scoped_refptr<URLIndexPrivateData> old_data) {
|
| - DCHECK(!BrowserThread::IsWellKnownThread(BrowserThread::UI) ||
|
| - !BrowserThread::CurrentlyOn(BrowserThread::UI));
|
| - if (!history_db)
|
| - return NULL;
|
| -
|
| - scoped_refptr<URLIndexPrivateData> rebuilt_data(
|
| - new URLIndexPrivateData(*(old_data.get())));
|
| -
|
| - // NOTE: We disable the cache database until after the replacement private
|
| - // data has been created and then we re-enable the database. Once the private
|
| - // data has been swapped in the database is refreshed with the new data.
|
| - URLDatabase::URLEnumerator history_enum;
|
| - if (!history_db->InitURLEnumeratorForSignificant(&history_enum))
|
| - return NULL;
|
| -
|
| - for (URLRow row; !old_data->IsShutdown() && history_enum.GetNextURL(&row); )
|
| - rebuilt_data->IndexRow(row);
|
| -
|
| - if (old_data->IsShutdown()) {
|
| - rebuilt_data.release();
|
| - }
|
| -
|
| - return rebuilt_data;
|
| -}
|
| -
|
| -void URLIndexPrivateData::RefreshCacheTask() {
|
| - DCHECK(!BrowserThread::IsWellKnownThread(BrowserThread::UI) ||
|
| - !BrowserThread::CurrentlyOn(BrowserThread::UI));
|
| - base::AutoLock lock(lock_);
|
| - if (shutdown_ || !cache_enabled())
|
| - return;
|
| - // Should a refresh fail for any reason we consider the database to be
|
| - // corrupt and unavailable; no further remedial action is possible.
|
| - set_cache_enabled(cache_db_->Reset() && cache_db_->Refresh(*this));
|
| -}
|
| -
|
| -// static
|
| -void URLIndexPrivateData::InitializeSchemeWhitelistForTesting(
|
| - std::set<std::string>* whitelist) {
|
| - InitializeSchemeWhitelist(whitelist);
|
| -}
|
| -
|
| // SearchTermCacheItem ---------------------------------------------------------
|
|
|
| URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem(
|
| @@ -561,100 +67,25 @@
|
|
|
| URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() {}
|
|
|
| -// URLIndexPrivateData::AddHistoryMatch ----------------------------------------
|
| +// Algorithm Functions ---------------------------------------------------------
|
|
|
| -URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch(
|
| - const URLIndexPrivateData& private_data,
|
| - const string16& lower_string,
|
| - const String16Vector& lower_terms,
|
| - const base::Time now)
|
| - : private_data_(private_data),
|
| - lower_string_(lower_string),
|
| - lower_terms_(lower_terms),
|
| - now_(now) {}
|
| -
|
| -URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() {}
|
| -
|
| -void URLIndexPrivateData::AddHistoryMatch::operator()(
|
| - const HistoryID history_id) {
|
| - HistoryInfoMap::const_iterator hist_pos =
|
| - private_data_.history_info_map_.find(history_id);
|
| - if (hist_pos != private_data_.history_info_map_.end()) {
|
| - const URLRow& hist_item = hist_pos->second;
|
| - WordStartsMap::const_iterator starts_pos =
|
| - private_data_.word_starts_map_.find(history_id);
|
| - DCHECK(starts_pos != private_data_.word_starts_map_.end());
|
| - ScoredHistoryMatch match(hist_item, lower_string_, lower_terms_,
|
| - starts_pos->second, now_);
|
| - if (match.raw_score > 0)
|
| - scored_matches_.push_back(match);
|
| - }
|
| +// Comparison function for sorting search terms by descending length.
|
| +bool LengthGreater(const string16& string_a, const string16& string_b) {
|
| + return string_a.length() > string_b.length();
|
| }
|
|
|
| -// URLIndexPrivateData::HistoryItemFactorGreater -------------------------------
|
| +// InMemoryURLIndex's Private Data ---------------------------------------------
|
|
|
| -URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater(
|
| - const HistoryInfoMap& history_info_map)
|
| - : history_info_map_(history_info_map) {
|
| -}
|
| -
|
| -URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() {}
|
| -
|
| -bool URLIndexPrivateData::HistoryItemFactorGreater::operator()(
|
| - const HistoryID h1,
|
| - const HistoryID h2) {
|
| - HistoryInfoMap::const_iterator entry1(history_info_map_.find(h1));
|
| - if (entry1 == history_info_map_.end())
|
| - return false;
|
| - HistoryInfoMap::const_iterator entry2(history_info_map_.find(h2));
|
| - if (entry2 == history_info_map_.end())
|
| - return true;
|
| - const URLRow& r1(entry1->second);
|
| - const URLRow& r2(entry2->second);
|
| - // First cut: typed count, visit count, recency.
|
| - // TODO(mrossetti): This is too simplistic. Consider an approach which ranks
|
| - // recently visited (within the last 12/24 hours) as highly important. Get
|
| - // input from mpearson.
|
| - if (r1.typed_count() != r2.typed_count())
|
| - return (r1.typed_count() > r2.typed_count());
|
| - if (r1.visit_count() != r2.visit_count())
|
| - return (r1.visit_count() > r2.visit_count());
|
| - return (r1.last_visit() > r2.last_visit());
|
| -}
|
| -
|
| -// URLIndexPrivateData Private Functions ---------------------------------------
|
| -
|
| -URLIndexPrivateData::URLIndexPrivateData(const URLIndexPrivateData& old_data)
|
| - : history_dir_(old_data.history_dir_),
|
| - languages_(old_data.languages_),
|
| - cache_enabled_(old_data.cache_enabled_),
|
| - shutdown_(old_data.IsShutdown()),
|
| - lock_(),
|
| - scheme_whitelist_(old_data.scheme_whitelist_),
|
| - pre_filter_item_count_(0),
|
| - post_filter_item_count_(0),
|
| - post_scoring_item_count_(0) {
|
| - cache_db_ = old_data.cache_db_;
|
| -}
|
| -
|
| -// Called only by unit tests.
|
| URLIndexPrivateData::URLIndexPrivateData()
|
| - : cache_enabled_(true),
|
| - shutdown_(false),
|
| - lock_(),
|
| + : restored_cache_version_(0),
|
| + saved_cache_version_(kCurrentCacheFileVersion),
|
| pre_filter_item_count_(0),
|
| post_filter_item_count_(0),
|
| post_scoring_item_count_(0) {
|
| - InitializeSchemeWhitelist(&scheme_whitelist_);
|
| }
|
|
|
| URLIndexPrivateData::~URLIndexPrivateData() {}
|
|
|
| -bool URLIndexPrivateData::IsShutdown() const {
|
| - base::AutoLock lock(lock_);
|
| - return shutdown_;
|
| -}
|
| -
|
| void URLIndexPrivateData::Clear() {
|
| word_list_.clear();
|
| available_words_.clear();
|
| @@ -666,24 +97,48 @@
|
| word_starts_map_.clear();
|
| }
|
|
|
| +bool URLIndexPrivateData::Empty() const {
|
| + return history_info_map_.empty();
|
| +}
|
| +
|
| +scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::Duplicate() const {
|
| + scoped_refptr<URLIndexPrivateData> data_copy = new URLIndexPrivateData;
|
| + data_copy->word_list_ = word_list_;
|
| + data_copy->available_words_ = available_words_;
|
| + data_copy->word_map_ = word_map_;
|
| + data_copy->char_word_map_ = char_word_map_;
|
| + data_copy->word_id_history_map_ = word_id_history_map_;
|
| + data_copy->history_id_word_map_ = history_id_word_map_;
|
| + data_copy->history_info_map_ = history_info_map_;
|
| + data_copy->word_starts_map_ = word_starts_map_;
|
| + return data_copy;
|
| + // Not copied:
|
| + // search_term_cache_
|
| + // pre_filter_item_count_
|
| + // post_filter_item_count_
|
| + // post_scoring_item_count_
|
| +}
|
| +
|
| // Cache Updating --------------------------------------------------------------
|
|
|
| -bool URLIndexPrivateData::IndexRow(const URLRow& row) {
|
| +bool URLIndexPrivateData::IndexRow(
|
| + const URLRow& row,
|
| + const std::string& languages,
|
| + const std::set<std::string>& scheme_whitelist) {
|
| const GURL& gurl(row.url());
|
|
|
| // Index only URLs with a whitelisted scheme.
|
| - if (!URLSchemeIsWhitelisted(gurl, scheme_whitelist_))
|
| + if (!URLSchemeIsWhitelisted(gurl, scheme_whitelist))
|
| return false;
|
|
|
| + URLID row_id = row.id();
|
| // Strip out username and password before saving and indexing.
|
| - string16 url(net::FormatUrl(gurl, languages_,
|
| + string16 url(net::FormatUrl(gurl, languages,
|
| net::kFormatUrlOmitUsernamePassword,
|
| net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS,
|
| NULL, NULL, NULL));
|
|
|
| - URLID row_id = row.id();
|
| HistoryID history_id = static_cast<HistoryID>(row_id);
|
| - DCHECK(history_info_map_.find(history_id) == history_info_map_.end());
|
| DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max());
|
|
|
| // Add the row for quick lookup in the history info store.
|
| @@ -692,27 +147,22 @@
|
| new_row.set_typed_count(row.typed_count());
|
| new_row.set_last_visit(row.last_visit());
|
| new_row.set_title(row.title());
|
| -
|
| - CacheTransaction cache_transaction(cache_enabled() ? cache_db_.get() : NULL);
|
| history_info_map_[history_id] = new_row;
|
| - if (cache_enabled())
|
| - cache_db_->AddHistoryToURLs(history_id, row);
|
|
|
| // Index the words contained in the URL and title of the row.
|
| RowWordStarts word_starts;
|
| - AddRowWordsToIndex(new_row, &word_starts);
|
| + AddRowWordsToIndex(new_row, &word_starts, languages);
|
| word_starts_map_[history_id] = word_starts;
|
| - if (cache_enabled())
|
| - cache_db_->AddRowWordStarts(history_id, word_starts);
|
| return true;
|
| }
|
|
|
| void URLIndexPrivateData::AddRowWordsToIndex(const URLRow& row,
|
| - RowWordStarts* word_starts) {
|
| + RowWordStarts* word_starts,
|
| + const std::string& languages) {
|
| HistoryID history_id = static_cast<HistoryID>(row.id());
|
| // Split URL into individual, unique words then add in the title words.
|
| const GURL& gurl(row.url());
|
| - string16 url(net::FormatUrl(gurl, languages_,
|
| + string16 url(net::FormatUrl(gurl, languages,
|
| net::kFormatUrlOmitUsernamePassword,
|
| net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS,
|
| NULL, NULL, NULL));
|
| @@ -745,15 +195,9 @@
|
| HistoryID history_id) {
|
| WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id);
|
| DCHECK(history_pos != word_id_history_map_.end());
|
| - // There is no need to record changes to the word_id_history_map_ in the cache
|
| - // because it can be rebuilt from the history_id_word_map_'s table.
|
| HistoryIDSet& history_id_set(history_pos->second);
|
| history_id_set.insert(history_id);
|
| AddToHistoryIDWordMap(history_id, word_id);
|
| -
|
| - // Add word_id/history_id entry to the word_history table
|
| - if (cache_enabled())
|
| - cache_db_->AddHistoryToWordHistory(word_id, history_id);
|
| }
|
|
|
| void URLIndexPrivateData::AddWordHistory(const string16& term,
|
| @@ -768,19 +212,11 @@
|
| }
|
| word_map_[term] = word_id;
|
|
|
| - // Add word_id/term entry to the words table;
|
| - if (cache_enabled())
|
| - cache_db_->AddWordToWords(word_id, term);
|
| -
|
| HistoryIDSet history_id_set;
|
| history_id_set.insert(history_id);
|
| word_id_history_map_[word_id] = history_id_set;
|
| AddToHistoryIDWordMap(history_id, word_id);
|
|
|
| - // Add word_id/history_id entry to the word_history table
|
| - if (cache_enabled())
|
| - cache_db_->AddHistoryToWordHistory(word_id, history_id);
|
| -
|
| // For each character in the newly added word (i.e. a word that is not
|
| // already in the word index), add the word to the character index.
|
| Char16Set characters = Char16SetFromString16(term);
|
| @@ -798,20 +234,14 @@
|
| word_id_set.insert(word_id);
|
| char_word_map_[uni_char] = word_id_set;
|
| }
|
| - // Add uni_char/word_id entry to the char_words database table.
|
| - if (cache_enabled())
|
| - cache_db_->AddWordToCharWords(uni_char, word_id);
|
| }
|
| }
|
|
|
| void URLIndexPrivateData::RemoveRowFromIndex(const URLRow& row) {
|
| - CacheTransaction cache_transaction(cache_enabled() ? cache_db_.get() : NULL);
|
| RemoveRowWordsFromIndex(row);
|
| HistoryID history_id = static_cast<HistoryID>(row.id());
|
| history_info_map_.erase(history_id);
|
| word_starts_map_.erase(history_id);
|
| - if (cache_enabled())
|
| - cache_db_->RemoveHistoryIDFromURLs(history_id);
|
| }
|
|
|
| void URLIndexPrivateData::RemoveRowWordsFromIndex(const URLRow& row) {
|
| @@ -845,11 +275,7 @@
|
| word_map_.erase(word);
|
| word_list_[word_id] = string16();
|
| available_words_.insert(word_id);
|
| - if (cache_enabled())
|
| - cache_db_->RemoveWordFromWords(word_id);
|
| }
|
| - if (cache_enabled())
|
| - cache_db_->RemoveHistoryIDFromWordHistory(history_id);
|
| }
|
|
|
| void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id,
|
| @@ -865,6 +291,257 @@
|
| }
|
| }
|
|
|
| +bool URLIndexPrivateData::UpdateURL(
|
| + const URLRow& row,
|
| + const std::string& languages,
|
| + const std::set<std::string>& scheme_whitelist) {
|
| + // The row may or may not already be in our index. If it is not already
|
| + // indexed and it qualifies then it gets indexed. If it is already
|
| + // indexed and still qualifies then it gets updated, otherwise it
|
| + // is deleted from the index.
|
| + bool row_was_updated = false;
|
| + URLID row_id = row.id();
|
| + HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id);
|
| + if (row_pos == history_info_map_.end()) {
|
| + // This new row should be indexed if it qualifies.
|
| + URLRow new_row(row);
|
| + new_row.set_id(row_id);
|
| + row_was_updated = RowQualifiesAsSignificant(new_row, base::Time()) &&
|
| + IndexRow(new_row, languages, scheme_whitelist);
|
| + } else if (RowQualifiesAsSignificant(row, base::Time())) {
|
| + // This indexed row still qualifies and will be re-indexed.
|
| + // The url won't have changed but the title, visit count, etc.
|
| + // might have changed.
|
| + URLRow& row_to_update = row_pos->second;
|
| + bool title_updated = row_to_update.title() != row.title();
|
| + if (row_to_update.visit_count() != row.visit_count() ||
|
| + row_to_update.typed_count() != row.typed_count() ||
|
| + row_to_update.last_visit() != row.last_visit() || title_updated) {
|
| + row_to_update.set_visit_count(row.visit_count());
|
| + row_to_update.set_typed_count(row.typed_count());
|
| + row_to_update.set_last_visit(row.last_visit());
|
| + // While the URL is guaranteed to remain stable, the title may have
|
| + // changed. If so, then update the index with the changed words.
|
| + if (title_updated) {
|
| + // Clear all words associated with this row and re-index both the
|
| + // URL and title.
|
| + RemoveRowWordsFromIndex(row_to_update);
|
| + row_to_update.set_title(row.title());
|
| + RowWordStarts word_starts;
|
| + AddRowWordsToIndex(row_to_update, &word_starts, languages);
|
| + word_starts_map_[row_id] = word_starts;
|
| + }
|
| + row_was_updated = true;
|
| + }
|
| + } else {
|
| + // This indexed row no longer qualifies and will be de-indexed by
|
| + // clearing all words associated with this row.
|
| + RemoveRowFromIndex(row);
|
| + row_was_updated = true;
|
| + }
|
| + if (row_was_updated)
|
| + search_term_cache_.clear(); // This invalidates the cache.
|
| + return row_was_updated;
|
| +}
|
| +
|
| +// Helper functor for DeleteURL.
|
| +class HistoryInfoMapItemHasURL {
|
| + public:
|
| + explicit HistoryInfoMapItemHasURL(const GURL& url): url_(url) {}
|
| +
|
| + bool operator()(const std::pair<const HistoryID, URLRow>& item) {
|
| + return item.second.url() == url_;
|
| + }
|
| +
|
| + private:
|
| + const GURL& url_;
|
| +};
|
| +
|
| +bool URLIndexPrivateData::DeleteURL(const GURL& url) {
|
| + // Find the matching entry in the history_info_map_.
|
| + HistoryInfoMap::iterator pos = std::find_if(
|
| + history_info_map_.begin(),
|
| + history_info_map_.end(),
|
| + HistoryInfoMapItemHasURL(url));
|
| + if (pos == history_info_map_.end())
|
| + return false;
|
| + RemoveRowFromIndex(pos->second);
|
| + search_term_cache_.clear(); // This invalidates the cache.
|
| + return true;
|
| +}
|
| +
|
| +// URLIndexPrivateData::HistoryItemFactorGreater -------------------------------
|
| +
|
| +URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater(
|
| + const HistoryInfoMap& history_info_map)
|
| + : history_info_map_(history_info_map) {
|
| +}
|
| +
|
| +URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() {}
|
| +
|
| +bool URLIndexPrivateData::HistoryItemFactorGreater::operator()(
|
| + const HistoryID h1,
|
| + const HistoryID h2) {
|
| + HistoryInfoMap::const_iterator entry1(history_info_map_.find(h1));
|
| + if (entry1 == history_info_map_.end())
|
| + return false;
|
| + HistoryInfoMap::const_iterator entry2(history_info_map_.find(h2));
|
| + if (entry2 == history_info_map_.end())
|
| + return true;
|
| + const URLRow& r1(entry1->second);
|
| + const URLRow& r2(entry2->second);
|
| + // First cut: typed count, visit count, recency.
|
| + // TODO(mrossetti): This is too simplistic. Consider an approach which ranks
|
| + // recently visited (within the last 12/24 hours) as highly important. Get
|
| + // input from mpearson.
|
| + if (r1.typed_count() != r2.typed_count())
|
| + return (r1.typed_count() > r2.typed_count());
|
| + if (r1.visit_count() != r2.visit_count())
|
| + return (r1.visit_count() > r2.visit_count());
|
| + return (r1.last_visit() > r2.last_visit());
|
| +}
|
| +
|
| +// Cache Searching -------------------------------------------------------------
|
| +
|
| +// NOTE: This is the main public search function.
|
| +ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(
|
| + const string16& search_string) {
|
| + pre_filter_item_count_ = 0;
|
| + post_filter_item_count_ = 0;
|
| + post_scoring_item_count_ = 0;
|
| + // The search string we receive may contain escaped characters. For narrowing
|
| + // the index we need individual, lower-cased words with any escaping ignored.
|
| + // For the final filtering we need whitespace-separated substrings that may
|
| + // still contain escaped characters.
|
| + string16 lower_raw_string(base::i18n::ToLower(search_string));
|
| + string16 lower_unescaped_string =
|
| + net::UnescapeURLComponent(lower_raw_string,
|
| + net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
|
| + // Extract individual 'words' (as opposed to 'terms'; see below) from the
|
| + // search string. When the user types "colspec=ID%20Mstone Release" we get
|
| + // four 'words': "colspec", "id", "mstone" and "release".
|
| + String16Vector lower_words(
|
| + history::String16VectorFromString16(lower_unescaped_string, false, NULL));
|
| + ScoredHistoryMatches scored_items;
|
| +
|
| + // Do nothing if we have indexed no words (probably because we've not been
|
| + // initialized yet) or the search string has no words.
|
| + if (word_list_.empty() || lower_words.empty()) {
|
| + search_term_cache_.clear(); // Invalidate the term cache.
|
| + return scored_items;
|
| + }
|
| +
|
| + // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
|
| + // approach.
|
| + ResetSearchTermCache();
|
| +
|
| + HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);
|
| +
|
| + // Trim the candidate pool if it is large. Note that we do not filter out
|
| + // items that do not contain the search terms as proper substrings -- doing
|
| + // so is the performance-costly operation we are trying to avoid in order
|
| + // to maintain omnibox responsiveness.
|
| + const size_t kItemsToScoreLimit = 500;
|
| + pre_filter_item_count_ = history_id_set.size();
|
| + // If we trim the results set we do not want to cache the results for next
|
| + // time as the user's ultimately desired result could easily be eliminated
|
| + // in this early rough filter.
|
| + bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);
|
| + if (was_trimmed) {
|
| + HistoryIDVector history_ids;
|
| + std::copy(history_id_set.begin(), history_id_set.end(),
|
| + std::back_inserter(history_ids));
|
| + // Trim down the set by sorting by typed-count, visit-count, and last
|
| + // visit.
|
| + HistoryItemFactorGreater
|
| + item_factor_functor(history_info_map_);
|
| + std::partial_sort(history_ids.begin(),
|
| + history_ids.begin() + kItemsToScoreLimit,
|
| + history_ids.end(),
|
| + item_factor_functor);
|
| + history_id_set.clear();
|
| + std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,
|
| + std::inserter(history_id_set, history_id_set.end()));
|
| + post_filter_item_count_ = history_id_set.size();
|
| + }
|
| +
|
| + // Pass over all of the candidates filtering out any without a proper
|
| + // substring match, inserting those which pass in order by score. Note that
|
| + // in this step we are using the raw search string complete with escaped
|
| + // URL elements. When the user has specifically typed something akin to
|
| + // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that
|
| + // specific substring appears in the URL or page title.
|
| +
|
| + // We call these 'terms' (as opposed to 'words'; see above) as in this case
|
| + // we only want to break up the search string on 'true' whitespace rather than
|
| + // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we
|
| + // get two 'terms': "colspec=id%20mstone" and "release".
|
| + history::String16Vector lower_raw_terms;
|
| + Tokenize(lower_raw_string, kWhitespaceUTF16, &lower_raw_terms);
|
| + scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),
|
| + AddHistoryMatch(*this, lower_raw_string,
|
| + lower_raw_terms, base::Time::Now())).ScoredMatches();
|
| +
|
| + // Select and sort only the top kMaxMatches results.
|
| + if (scored_items.size() > AutocompleteProvider::kMaxMatches) {
|
| + std::partial_sort(scored_items.begin(),
|
| + scored_items.begin() +
|
| + AutocompleteProvider::kMaxMatches,
|
| + scored_items.end(),
|
| + ScoredHistoryMatch::MatchScoreGreater);
|
| + scored_items.resize(AutocompleteProvider::kMaxMatches);
|
| + } else {
|
| + std::sort(scored_items.begin(), scored_items.end(),
|
| + ScoredHistoryMatch::MatchScoreGreater);
|
| + }
|
| + post_scoring_item_count_ = scored_items.size();
|
| +
|
| + if (was_trimmed) {
|
| + search_term_cache_.clear(); // Invalidate the term cache.
|
| + } else {
|
| + // Remove any stale SearchTermCacheItems.
|
| + for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
|
| + cache_iter != search_term_cache_.end(); ) {
|
| + if (!cache_iter->second.used_)
|
| + search_term_cache_.erase(cache_iter++);
|
| + else
|
| + ++cache_iter;
|
| + }
|
| + }
|
| +
|
| + return scored_items;
|
| +}
|
| +
|
| +// URLIndexPrivateData::AddHistoryMatch ----------------------------------------
|
| +
|
| +URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch(
|
| + const URLIndexPrivateData& private_data,
|
| + const string16& lower_string,
|
| + const String16Vector& lower_terms,
|
| + const base::Time now)
|
| + : private_data_(private_data),
|
| + lower_string_(lower_string),
|
| + lower_terms_(lower_terms),
|
| + now_(now) {}
|
| +
|
| +URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() {}
|
| +
|
| +void URLIndexPrivateData::AddHistoryMatch::operator()(
|
| + const HistoryID history_id) {
|
| + HistoryInfoMap::const_iterator hist_pos =
|
| + private_data_.history_info_map_.find(history_id);
|
| + if (hist_pos != private_data_.history_info_map_.end()) {
|
| + const URLRow& hist_item = hist_pos->second;
|
| + WordStartsMap::const_iterator starts_pos =
|
| + private_data_.word_starts_map_.find(history_id);
|
| + DCHECK(starts_pos != private_data_.word_starts_map_.end());
|
| + ScoredHistoryMatch match(hist_item, lower_string_, lower_terms_,
|
| + starts_pos->second, now_);
|
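| + // Only keep candidates that actually scored; a raw score of zero means the
|
| + // search terms did not match this row's URL or title.
|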
| + if (match.raw_score > 0)
|
| + scored_matches_.push_back(match);
|
| + }
|
| +}
|
| +
|
| void URLIndexPrivateData::ResetSearchTermCache() {
|
| for (SearchTermCacheMap::iterator iter = search_term_cache_.begin();
|
| iter != search_term_cache_.end(); ++iter)
|
| @@ -1053,35 +730,418 @@
|
| return word_id_set;
|
| }
|
|
|
| -bool URLIndexPrivateData::RestoreFromCache(InMemoryURLCacheDatabase* cache_db) {
|
| - DCHECK(cache_db);
|
| - if (!cache_db->RestorePrivateData(this)) {
|
| - cache_db->Reset();
|
| +// Cache Saving ----------------------------------------------------------------
|
| +
|
| +// static
|
| +void URLIndexPrivateData::WritePrivateDataToCacheFileTask(
|
| + scoped_refptr<URLIndexPrivateData> private_data,
|
| + const FilePath& file_path,
|
| + scoped_refptr<RefCountedBool> succeeded) {
|
| + DCHECK(private_data.get());
|
| + DCHECK(!file_path.empty());
|
| + succeeded->set_value(private_data->SaveToFile(file_path));
|
| +}
|
| +
|
| +bool URLIndexPrivateData::SaveToFile(const FilePath& file_path) {
|
| + base::TimeTicks beginning_time = base::TimeTicks::Now();
|
| + InMemoryURLIndexCacheItem index_cache;
|
| + SavePrivateData(&index_cache);
|
| + std::string data;
|
| + if (!index_cache.SerializeToString(&data)) {
|
| + LOG(WARNING) << "Failed to serialize the InMemoryURLIndex cache.";
|
| return false;
|
| }
|
| - return !Empty() && ValidateConsistency();
|
| +
|
| + int size = data.size();
|
| + if (file_util::WriteFile(file_path, data.c_str(), size) != size) {
|
| + LOG(WARNING) << "Failed to write " << file_path.value();
|
| + return false;
|
| + }
|
| + UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime",
|
| + base::TimeTicks::Now() - beginning_time);
|
| + return true;
|
| }
|
|
|
| -void URLIndexPrivateData::DeleteOldVersionCacheFile() const {
|
| - if (history_dir_.empty())
|
| +void URLIndexPrivateData::SavePrivateData(
|
| + InMemoryURLIndexCacheItem* cache) const {
|
| + DCHECK(cache);
|
| + cache->set_timestamp(base::Time::Now().ToInternalValue());
|
| + cache->set_version(saved_cache_version_);
|
| + // history_item_count_ is no longer used, but rather than change the protobuf
|
| + // definition we use a placeholder. This will go away with the switch to
|
| + // SQLite.
|
| + cache->set_history_item_count(0);
|
| + SaveWordList(cache);
|
| + SaveWordMap(cache);
|
| + SaveCharWordMap(cache);
|
| + SaveWordIDHistoryMap(cache);
|
| + SaveHistoryInfoMap(cache);
|
| + SaveWordStartsMap(cache);
|
| +}
|
| +
|
| +void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem* cache) const {
|
| + if (word_list_.empty())
|
| return;
|
| - FilePath path = history_dir_.Append(chrome::kHQPCacheFilename);
|
| - file_util::Delete(path, false);
|
| + WordListItem* list_item = cache->mutable_word_list();
|
| + list_item->set_word_count(word_list_.size());
|
| + for (String16Vector::const_iterator iter = word_list_.begin();
|
| + iter != word_list_.end(); ++iter)
|
| + list_item->add_word(UTF16ToUTF8(*iter));
|
| }
|
|
|
| -bool URLIndexPrivateData::GetCacheDBPath(FilePath* file_path) {
|
| - DCHECK(file_path);
|
| - if (history_dir_.empty())
|
| +void URLIndexPrivateData::SaveWordMap(InMemoryURLIndexCacheItem* cache) const {
|
| + if (word_map_.empty())
|
| + return;
|
| + WordMapItem* map_item = cache->mutable_word_map();
|
| + map_item->set_item_count(word_map_.size());
|
| + for (WordMap::const_iterator iter = word_map_.begin();
|
| + iter != word_map_.end(); ++iter) {
|
| + WordMapEntry* map_entry = map_item->add_word_map_entry();
|
| + map_entry->set_word(UTF16ToUTF8(iter->first));
|
| + map_entry->set_word_id(iter->second);
|
| + }
|
| +}
|
| +
|
| +void URLIndexPrivateData::SaveCharWordMap(
|
| + InMemoryURLIndexCacheItem* cache) const {
|
| + if (char_word_map_.empty())
|
| + return;
|
| + CharWordMapItem* map_item = cache->mutable_char_word_map();
|
| + map_item->set_item_count(char_word_map_.size());
|
| + for (CharWordIDMap::const_iterator iter = char_word_map_.begin();
|
| + iter != char_word_map_.end(); ++iter) {
|
| + CharWordMapEntry* map_entry = map_item->add_char_word_map_entry();
|
| + map_entry->set_char_16(iter->first);
|
| + const WordIDSet& word_id_set(iter->second);
|
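| + // Record the expected set size so the restore path can validate the entry.
|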
| + map_entry->set_item_count(word_id_set.size());
|
| + for (WordIDSet::const_iterator set_iter = word_id_set.begin();
|
| + set_iter != word_id_set.end(); ++set_iter)
|
| + map_entry->add_word_id(*set_iter);
|
| + }
|
| +}
|
| +
|
| +void URLIndexPrivateData::SaveWordIDHistoryMap(
|
| + InMemoryURLIndexCacheItem* cache) const {
|
| + if (word_id_history_map_.empty())
|
| + return;
|
| + WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map();
|
| + map_item->set_item_count(word_id_history_map_.size());
|
| + for (WordIDHistoryMap::const_iterator iter = word_id_history_map_.begin();
|
| + iter != word_id_history_map_.end(); ++iter) {
|
| + WordIDHistoryMapEntry* map_entry =
|
| + map_item->add_word_id_history_map_entry();
|
| + map_entry->set_word_id(iter->first);
|
| + const HistoryIDSet& history_id_set(iter->second);
|
| + map_entry->set_item_count(history_id_set.size());
|
| + for (HistoryIDSet::const_iterator set_iter = history_id_set.begin();
|
| + set_iter != history_id_set.end(); ++set_iter)
|
| + map_entry->add_history_id(*set_iter);
|
| + }
|
| +}
|
| +
|
| +void URLIndexPrivateData::SaveHistoryInfoMap(
|
| + InMemoryURLIndexCacheItem* cache) const {
|
| + if (history_info_map_.empty())
|
| + return;
|
| + HistoryInfoMapItem* map_item = cache->mutable_history_info_map();
|
| + map_item->set_item_count(history_info_map_.size());
|
| + for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
|
| + iter != history_info_map_.end(); ++iter) {
|
| + HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry();
|
| + map_entry->set_history_id(iter->first);
|
| + const URLRow& url_row(iter->second);
|
| + // Note: We only save information that contributes to the index, so there
|
| + // is no need to save search_term_cache_ (it is not persistent).
|
| + map_entry->set_visit_count(url_row.visit_count());
|
| + map_entry->set_typed_count(url_row.typed_count());
|
| + map_entry->set_last_visit(url_row.last_visit().ToInternalValue());
|
| + map_entry->set_url(url_row.url().spec());
|
| + map_entry->set_title(UTF16ToUTF8(url_row.title()));
|
| + }
|
| +}
|
| +
|
| +void URLIndexPrivateData::SaveWordStartsMap(
|
| + InMemoryURLIndexCacheItem* cache) const {
|
| + if (word_starts_map_.empty())
|
| + return;
|
| + // For unit testing: Enable saving of the cache as an earlier version to
|
| + // allow testing of cache file upgrading in ReadFromFile().
|
| + // TODO(mrossetti): Instead of intruding on production code with this kind of
|
| + // test harness, save a copy of an older version cache with known results.
|
| + // Implement this when switching the caching over to SQLite.
|
| + if (saved_cache_version_ < 1)
|
| + return;
|
| +
|
| + WordStartsMapItem* map_item = cache->mutable_word_starts_map();
|
| + map_item->set_item_count(word_starts_map_.size());
|
| + for (WordStartsMap::const_iterator iter = word_starts_map_.begin();
|
| + iter != word_starts_map_.end(); ++iter) {
|
| + WordStartsMapEntry* map_entry = map_item->add_word_starts_map_entry();
|
| + map_entry->set_history_id(iter->first);
|
| + const RowWordStarts& word_starts(iter->second);
|
| + for (WordStarts::const_iterator i = word_starts.url_word_starts_.begin();
|
| + i != word_starts.url_word_starts_.end(); ++i)
|
| + map_entry->add_url_word_starts(*i);
|
| + for (WordStarts::const_iterator i = word_starts.title_word_starts_.begin();
|
| + i != word_starts.title_word_starts_.end(); ++i)
|
| + map_entry->add_title_word_starts(*i);
|
| + }
|
| +}
|
| +
|
| +// Cache Restoring -------------------------------------------------------------
|
| +
|
| +// static
|
| +void URLIndexPrivateData::RestoreFromFileTask(
|
| + const FilePath& file_path,
|
| + scoped_refptr<URLIndexPrivateData> private_data,
|
| + const std::string& languages) {
|
| + private_data = URLIndexPrivateData::RestoreFromFile(file_path, languages);
|
| +}
|
| +
|
| +// static
|
| +scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RestoreFromFile(
|
| + const FilePath& file_path,
|
| + const std::string& languages) {
|
| + base::TimeTicks beginning_time = base::TimeTicks::Now();
|
| + if (!file_util::PathExists(file_path))
|
| + return NULL;
|
| + std::string data;
|
| + // If the cache file cannot be read then simply give up. This will cause us
|
| + // to attempt to rebuild from the history database.
|
| + if (!file_util::ReadFileToString(file_path, &data))
|
| + return NULL;
|
| +
|
| + scoped_refptr<URLIndexPrivateData> restored_data(new URLIndexPrivateData);
|
| + InMemoryURLIndexCacheItem index_cache;
|
| + if (!index_cache.ParseFromArray(data.c_str(), data.size())) {
|
| + LOG(WARNING) << "Failed to parse URLIndexPrivateData cache data read from "
|
| + << file_path.value();
|
| + return restored_data;
|
| + }
|
| +
|
| + if (!restored_data->RestorePrivateData(index_cache, languages))
|
| + return NULL;
|
| +
|
| + UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime",
|
| + base::TimeTicks::Now() - beginning_time);
|
| + UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
|
| + restored_data->history_id_word_map_.size());
|
| + UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size());
|
| + UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
|
| + restored_data->word_map_.size());
|
| + UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
|
| + restored_data->char_word_map_.size());
|
| + if (restored_data->Empty())
|
| + return NULL; // 'No data' is the same as a failed reload.
|
| + return restored_data;
|
| +}
|
| +
|
| +// static
|
| +scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RebuildFromHistory(
|
| + HistoryDatabase* history_db,
|
| + const std::string& languages,
|
| + const std::set<std::string>& scheme_whitelist) {
|
| + if (!history_db)
|
| + return NULL;
|
| +
|
| + base::TimeTicks beginning_time = base::TimeTicks::Now();
|
| +
|
| + scoped_refptr<URLIndexPrivateData> rebuilt_data(new URLIndexPrivateData);
|
| + URLDatabase::URLEnumerator history_enum;
|
| + if (!history_db->InitURLEnumeratorForSignificant(&history_enum))
|
| + return NULL;
|
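| + // Index every URL row that the history database deems significant.
|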
| + for (URLRow row; history_enum.GetNextURL(&row); )
|
| + rebuilt_data->IndexRow(row, languages, scheme_whitelist);
|
| +
|
| + UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime",
|
| + base::TimeTicks::Now() - beginning_time);
|
| + UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
|
| + rebuilt_data->history_id_word_map_.size());
|
| + UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
|
| + rebuilt_data->word_map_.size());
|
| + UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
|
| + rebuilt_data->char_word_map_.size());
|
| + return rebuilt_data;
|
| +}
|
| +
|
| +bool URLIndexPrivateData::RestorePrivateData(
|
| + const InMemoryURLIndexCacheItem& cache,
|
| + const std::string& languages) {
|
| + if (cache.has_version())
|
| + restored_cache_version_ = cache.version();
|
| + return RestoreWordList(cache) && RestoreWordMap(cache) &&
|
| + RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) &&
|
| + RestoreHistoryInfoMap(cache) && RestoreWordStartsMap(cache, languages);
|
| +}
|
| +
|
| +bool URLIndexPrivateData::RestoreWordList(
|
| + const InMemoryURLIndexCacheItem& cache) {
|
| + if (!cache.has_word_list())
|
| return false;
|
| - *file_path = history_dir_.Append(chrome::kHQPCacheDBFilename);
|
| + const WordListItem& list_item(cache.word_list());
|
| + uint32 expected_item_count = list_item.word_count();
|
| + uint32 actual_item_count = list_item.word_size();
|
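| + // A count mismatch indicates a truncated or corrupt cache; abort the restore
|
| + // so that the index will be rebuilt from the history database.
|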
| + if (actual_item_count == 0 || actual_item_count != expected_item_count)
|
| + return false;
|
| + const RepeatedPtrField<std::string>& words(list_item.word());
|
| + for (RepeatedPtrField<std::string>::const_iterator iter = words.begin();
|
| + iter != words.end(); ++iter)
|
| + word_list_.push_back(UTF8ToUTF16(*iter));
|
| return true;
|
| }
|
|
|
| -void URLIndexPrivateData::SetCacheDatabaseForTesting(
|
| - InMemoryURLCacheDatabase* test_db) {
|
| - cache_db_ = test_db;
|
| +bool URLIndexPrivateData::RestoreWordMap(
|
| + const InMemoryURLIndexCacheItem& cache) {
|
| + if (!cache.has_word_map())
|
| + return false;
|
| + const WordMapItem& list_item(cache.word_map());
|
| + uint32 expected_item_count = list_item.item_count();
|
| + uint32 actual_item_count = list_item.word_map_entry_size();
|
| + if (actual_item_count == 0 || actual_item_count != expected_item_count)
|
| + return false;
|
| + const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry());
|
| + for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin();
|
| + iter != entries.end(); ++iter)
|
| + word_map_[UTF8ToUTF16(iter->word())] = iter->word_id();
|
| + return true;
|
| }
|
|
|
| +bool URLIndexPrivateData::RestoreCharWordMap(
|
| + const InMemoryURLIndexCacheItem& cache) {
|
| + if (!cache.has_char_word_map())
|
| + return false;
|
| + const CharWordMapItem& list_item(cache.char_word_map());
|
| + uint32 expected_item_count = list_item.item_count();
|
| + uint32 actual_item_count = list_item.char_word_map_entry_size();
|
| + if (actual_item_count == 0 || actual_item_count != expected_item_count)
|
| + return false;
|
| + const RepeatedPtrField<CharWordMapEntry>&
|
| + entries(list_item.char_word_map_entry());
|
| + for (RepeatedPtrField<CharWordMapEntry>::const_iterator iter =
|
| + entries.begin(); iter != entries.end(); ++iter) {
|
| + expected_item_count = iter->item_count();
|
| + actual_item_count = iter->word_id_size();
|
| + if (actual_item_count == 0 || actual_item_count != expected_item_count)
|
| + return false;
|
| + char16 uni_char = static_cast<char16>(iter->char_16());
|
| + WordIDSet word_id_set;
|
| + const RepeatedField<int32>& word_ids(iter->word_id());
|
| + for (RepeatedField<int32>::const_iterator jiter = word_ids.begin();
|
| + jiter != word_ids.end(); ++jiter)
|
| + word_id_set.insert(*jiter);
|
| + char_word_map_[uni_char] = word_id_set;
|
| + }
|
| + return true;
|
| +}
|
| +
|
| +bool URLIndexPrivateData::RestoreWordIDHistoryMap(
|
| + const InMemoryURLIndexCacheItem& cache) {
|
| + if (!cache.has_word_id_history_map())
|
| + return false;
|
| + const WordIDHistoryMapItem& list_item(cache.word_id_history_map());
|
| + uint32 expected_item_count = list_item.item_count();
|
| + uint32 actual_item_count = list_item.word_id_history_map_entry_size();
|
| + if (actual_item_count == 0 || actual_item_count != expected_item_count)
|
| + return false;
|
| + const RepeatedPtrField<WordIDHistoryMapEntry>&
|
| + entries(list_item.word_id_history_map_entry());
|
| + for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter =
|
| + entries.begin(); iter != entries.end(); ++iter) {
|
| + expected_item_count = iter->item_count();
|
| + actual_item_count = iter->history_id_size();
|
| + if (actual_item_count == 0 || actual_item_count != expected_item_count)
|
| + return false;
|
| + WordID word_id = iter->word_id();
|
| + HistoryIDSet history_id_set;
|
| + const RepeatedField<int64>& history_ids(iter->history_id());
|
| + for (RepeatedField<int64>::const_iterator jiter = history_ids.begin();
|
| + jiter != history_ids.end(); ++jiter) {
|
| + history_id_set.insert(*jiter);
|
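| + // Also rebuild the inverse history_id_word_map_ as we go; it is derivable
|
| + // from this map and is not persisted in the cache.
|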
| + AddToHistoryIDWordMap(*jiter, word_id);
|
| + }
|
| + word_id_history_map_[word_id] = history_id_set;
|
| + }
|
| + return true;
|
| +}
|
| +
|
| +bool URLIndexPrivateData::RestoreHistoryInfoMap(
|
| + const InMemoryURLIndexCacheItem& cache) {
|
| + if (!cache.has_history_info_map())
|
| + return false;
|
| + const HistoryInfoMapItem& list_item(cache.history_info_map());
|
| + uint32 expected_item_count = list_item.item_count();
|
| + uint32 actual_item_count = list_item.history_info_map_entry_size();
|
| + if (actual_item_count == 0 || actual_item_count != expected_item_count)
|
| + return false;
|
| + const RepeatedPtrField<HistoryInfoMapEntry>&
|
| + entries(list_item.history_info_map_entry());
|
| + for (RepeatedPtrField<HistoryInfoMapEntry>::const_iterator iter =
|
| + entries.begin(); iter != entries.end(); ++iter) {
|
| + HistoryID history_id = iter->history_id();
|
| + GURL url(iter->url());
|
| + URLRow url_row(url, history_id);
|
| + url_row.set_visit_count(iter->visit_count());
|
| + url_row.set_typed_count(iter->typed_count());
|
| + url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit()));
|
| + if (iter->has_title()) {
|
| + string16 title(UTF8ToUTF16(iter->title()));
|
| + url_row.set_title(title);
|
| + }
|
| + history_info_map_[history_id] = url_row;
|
| + }
|
| + return true;
|
| +}
|
| +
|
| +bool URLIndexPrivateData::RestoreWordStartsMap(
|
| + const InMemoryURLIndexCacheItem& cache,
|
| + const std::string& languages) {
|
| + // Note that this function must be called after RestoreHistoryInfoMap() has
|
| + // been run, as the word starts may have to be recalculated from the URLs
|
| + // and page titles.
|
| + if (cache.has_word_starts_map()) {
|
| + const WordStartsMapItem& list_item(cache.word_starts_map());
|
| + uint32 expected_item_count = list_item.item_count();
|
| + uint32 actual_item_count = list_item.word_starts_map_entry_size();
|
| + if (actual_item_count == 0 || actual_item_count != expected_item_count)
|
| + return false;
|
| + const RepeatedPtrField<WordStartsMapEntry>&
|
| + entries(list_item.word_starts_map_entry());
|
| + for (RepeatedPtrField<WordStartsMapEntry>::const_iterator iter =
|
| + entries.begin(); iter != entries.end(); ++iter) {
|
| + HistoryID history_id = iter->history_id();
|
| + RowWordStarts word_starts;
|
| + // Restore the URL word starts.
|
| + const RepeatedField<int32>& url_starts(iter->url_word_starts());
|
| + for (RepeatedField<int32>::const_iterator jiter = url_starts.begin();
|
| + jiter != url_starts.end(); ++jiter)
|
| + word_starts.url_word_starts_.push_back(*jiter);
|
| + // Restore the page title word starts.
|
| + const RepeatedField<int32>& title_starts(iter->title_word_starts());
|
| + for (RepeatedField<int32>::const_iterator jiter = title_starts.begin();
|
| + jiter != title_starts.end(); ++jiter)
|
| + word_starts.title_word_starts_.push_back(*jiter);
|
| + word_starts_map_[history_id] = word_starts;
|
| + }
|
| + } else {
|
| + // Since the cache did not contain any word starts we must rebuild them from
|
| + // the URLs and page titles.
|
| + for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
|
| + iter != history_info_map_.end(); ++iter) {
|
| + RowWordStarts word_starts;
|
| + const URLRow& row(iter->second);
|
| + string16 url(net::FormatUrl(row.url(), languages,
|
| + net::kFormatUrlOmitUsernamePassword,
|
| + net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS,
|
| + NULL, NULL, NULL));
|
| + url = base::i18n::ToLower(url);
|
| + String16VectorFromString16(url, false, &word_starts.url_word_starts_);
|
| + String16VectorFromString16(
|
| + row.title(), false, &word_starts.title_word_starts_);
|
| + word_starts_map_[iter->first] = word_starts;
|
| + }
|
| + }
|
| + return true;
|
| +}
|
| +
|
| // static
|
| bool URLIndexPrivateData::URLSchemeIsWhitelisted(
|
| const GURL& gurl,
|
|
|