chrome/browser/history/url_index_private_data.h - Issue 10837244: Replace HistoryQuickProvider protobuf-based caching with an SQLite-based database.

Unified Diff: chrome/browser/history/url_index_private_data.h

Issue 10837244: Replace HistoryQuickProvider protobuf-based caching with an SQLite-based database. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/

Patch Set: Tweak suppression. Created 8 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « chrome/browser/history/top_sites_unittest.cc ('k') | chrome/browser/history/url_index_private_data.cc » ('j') | chrome/browser/history/url_index_private_data.cc » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: chrome/browser/history/url_index_private_data.h

===================================================================

--- chrome/browser/history/url_index_private_data.h (revision 151481)

+++ chrome/browser/history/url_index_private_data.h (working copy)

@@ -11,8 +11,9 @@

#include "base/file_path.h"

#include "base/gtest_prod_util.h"

#include "base/memory/ref_counted.h"

+#include "base/synchronization/lock.h"

+#include "base/threading/sequenced_worker_pool.h"

#include "chrome/browser/history/in_memory_url_index_types.h"

-#include "chrome/browser/history/in_memory_url_index_cache.pb.h"

#include "chrome/browser/history/scored_history_match.h"

#include "content/public/browser/notification_details.h"

@@ -27,34 +28,142 @@

namespace imui = in_memory_url_index;

class HistoryDatabase;

+class InMemoryURLCacheDatabase;

class InMemoryURLIndex;

class RefCountedBool;

-// Current version of the cache file.

-static const int kCurrentCacheFileVersion = 1;

-// A structure describing the InMemoryURLIndex's internal data and providing for

-// restoring, rebuilding and updating that internal data.

+// A structure private to InMemoryURLIndex describing its internal data and

+// providing for restoring, rebuilding and updating that internal data. As

+// this class is for exclusive use by the InMemoryURLIndex class there should

+// be no calls from any other class.

+//

+// All public member functions are called on the main thread unless otherwise

+// annotated.

class URLIndexPrivateData

: public base::RefCountedThreadSafe<URLIndexPrivateData> {

public:

- URLIndexPrivateData();

+ // Creates a new instance of private data, creating or opening the cache

+ // database located in |history_dir|. |languages| is used to break down

+ // search terms, URLs, and page titles into words and characters.

+ URLIndexPrivateData(const FilePath& history_dir,

+ const std::string& languages);

+ // Initializes the private data and its cache database. Returns true if the

+ // database is successfully initialized. Any failures will mark the cache

+ // database as not enabled. |sequence_token| is used to coordinate all

+ // future database operations (not including those performed during this

+ // initialization). Called on the DB thread.

+ bool Init(base::SequencedWorkerPool::SequenceToken sequence_token);

+ // Performs a Clear() and then erases the cache database. Called on the

+ // worker pool sequenced by InMemoryURLIndex's |sequence_token_|.

+ void Reset();

+ // Returns true if there is no data in the index.

+ bool Empty() const;

+ // Returns a copy of the private data for archiving purposes.

+ URLIndexPrivateData* Snapshot() const;

+ // Closes the database.

+ void Shutdown();

+ // Verifies that the private data is consistent.

+ bool ValidateConsistency() const;

+ // Given a string16 in |search_string|, scans the history index and returns a

+ // vector with all scored, matching history items. The |search_string| is

+ // broken down into individual terms (words), each of which must occur in the

+ // candidate history item's URL or page title for the item to qualify;

+ // however, the terms do not necessarily have to be adjacent. Once we have

+ // a set of candidates, they are filtered to ensure that all |search_string|

+ // terms, as separated by whitespace, occur within the candidate's URL

+ // or page title. Scores are then calculated on no more than

+ // |kItemsToScoreLimit| candidates, as the scoring of such a large number of

+ // candidates may cause perceptible typing response delays in the omnibox.

+ // This is likely to occur for short omnibox terms such as 'h' and 'w' which

+ // will be found in nearly all history candidates. Results are sorted by

+ // descending score. The full results set (i.e. beyond the

+ // |kItemsToScoreLimit| limit) will be retained and used for subsequent calls

+ // to this function.

+ ScoredHistoryMatches HistoryItemsForTerms(const string16& search_string);

+ // Adds the history item in |row| to the index if it does not already

+ // exist and it meets the minimum 'quick' criteria. If the row already exists

+ // in the index then the index will be updated if the row still meets the

+ // criteria, otherwise the row will be removed from the index. Returns true

+ // if the index was actually updated. Posts updates to the cache database

+ // that are run on the worker pool sequenced by InMemoryURLIndex's

+ // |sequence_token_|.

+ bool UpdateURL(const URLRow& row);

+ // Deletes index data for the history item with the given |url|.

+ // The item may not have actually been indexed, which is the case if it did

+ // not previously meet minimum 'quick' criteria. Returns true if the index

+ // was actually updated. Posts updates to the cache database that are run on

+ // the worker pool sequenced by InMemoryURLIndex's |sequence_token_|.

+ bool DeleteURL(const GURL& url);

+ // Sets whether the cache database is enabled.

+ void set_cache_enabled(bool enabled) { cache_enabled_ = enabled; }

+ // Returns the cache database.

+ InMemoryURLCacheDatabase* cache_db() { return cache_db_.get(); }

+ // Restores the index data from the contents of the cache database. This is

+ // called on the DB thread during profile startup and returns true upon a

+ // successful restoration. Restoration will fail if there is no cache

+ // database or the cache database has been corrupted. All other database

+ // operations (i.e. updates from site visits, etc.) will be postponed while

+ // this task is being run.

+ bool RestoreFromCacheTask();

+ // Constructs a new private data object by rebuilding its contents from the

+ // history database in |history_db|. Returns the new URLIndexPrivateData which

+ // on success will contain the rebuilt data but upon failure will be empty.

+ // |history_dir| points to the directory in which the cache database will be

+ // created. |old_data| provides the cache database and the languages to be

+ // used for breaking down search terms, URLs and page titles. This is called

+ // on the DB thread during profile startup iff restoring from the cache

+ // database fails (see also RestoreFromCacheTask()). All other database

+ // operations (i.e. updates from site visits, etc.) will be postponed while

+ // this task is being run.

+ static scoped_refptr<URLIndexPrivateData> RebuildFromHistory(

+ HistoryDatabase* history_db,

+ scoped_refptr<URLIndexPrivateData> old_data);

+ // Completely refreshes the contents of the cache database using the contents

+ // of the in-memory index data. This task is performed on the sequenced

+ // blocking pool using the sequence_token with which this instance was

+ // Init'ed. A refresh will occur 1) during profile startup if a

+ // RebuildFromHistory(...) is required, or 2) at any time database corruption

+ // is detected while updating the database in an attempt to repair the

+ // corruption.

+ void RefreshCacheTask();

+ static void InitializeSchemeWhitelistForTesting(

+ std::set<std::string>* whitelist);

private:

friend class base::RefCountedThreadSafe<URLIndexPrivateData>;

- ~URLIndexPrivateData();

friend class AddHistoryMatch;

friend class ::HistoryQuickProviderTest;

- friend class InMemoryURLIndex;

+ friend class InMemoryURLCacheDatabase;

+ friend class InMemoryURLIndexCacheTest;

friend class InMemoryURLIndexTest;

- FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheSaveRestore);

+ friend class InMemoryURLIndexBaseTest;

+ friend class IntercessionaryIndexTest;

+ friend class URLIndexOldCacheTest;

+ friend class URLIndexPrivateDataTest;

FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, HugeResultSet);

- FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Scoring);

FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TitleSearch);

FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TypedCharacterCaching);

FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, WhitelistedURLs);

+ FRIEND_TEST_ALL_PREFIXES(IntercessionaryIndexTest, CacheDatabaseFailure);

+ FRIEND_TEST_ALL_PREFIXES(IntercessionaryIndexTest,

+ ShutdownDuringCacheRefresh);

FRIEND_TEST_ALL_PREFIXES(LimitedInMemoryURLIndexTest, Initialization);

+ FRIEND_TEST_ALL_PREFIXES(URLIndexPrivateDataTest, CacheFetch);

// Support caching of term results so that we can optimize searches which

// build upon a previous search. Each entry in this map represents one

@@ -123,118 +232,41 @@

const history::HistoryInfoMap& history_info_map_;

};

- // Given a string16 in |term_string|, scans the history index and returns a

- // vector with all scored, matching history items. The |term_string| is

- // broken down into individual terms (words), each of which must occur in the

- // candidate history item's URL or page title for the item to qualify;

- // however, the terms do not necessarily have to be adjacent. Once we have

- // a set of candidates, they are filtered to insure that all |term_string|

- // terms, as separated by whitespace, occur within the candidate's URL

- // or page title. Scores are then calculated on no more than

- // |kItemsToScoreLimit| candidates, as the scoring of such a large number of

- // candidates may cause perceptible typing response delays in the omnibox.

- // This is likely to occur for short omnibox terms such as 'h' and 'w' which

- // will be found in nearly all history candidates. Results are sorted by

- // descending score. The full results set (i.e. beyond the

- // |kItemsToScoreLimit| limit) will be retained and used for subsequent calls

- // to this function.

- ScoredHistoryMatches HistoryItemsForTerms(const string16& term_string);

+ // Creates a new instance of private data for purposes of rebuilding from

+ // the history database while simultaneously allowing continued use of an

+ // older private data |old_data|. The old data will still be used for

+ // providing search results. Any updates to the private data will be queued

+ // for application to the new data once it has been successfully rebuilt.

+ URLIndexPrivateData(const URLIndexPrivateData& old_data);

- // Creates a new URLIndexPrivateData object, populates it from the contents

- // of the cache file stored in |file_path|, and assigns it to |private_data|.

- // |languages| will be used to break URLs and page titles into words.

- static void RestoreFromFileTask(

- const FilePath& file_path,

- scoped_refptr<URLIndexPrivateData> private_data,

- const std::string& languages);

+ // The following constructor is for unit testing purposes only.

+ URLIndexPrivateData();

- // Constructs a new object by restoring its contents from the file at |path|.

- // Returns the new URLIndexPrivateData which on success will contain the

- // restored data but upon failure will be empty. |languages| will be used to

- // break URLs and page titles into words

- static scoped_refptr<URLIndexPrivateData> RestoreFromFile(

- const FilePath& path,

- const std::string& languages);

+ virtual ~URLIndexPrivateData();

- // Constructs a new object by rebuilding its contents from the history

- // database in |history_db|. Returns the new URLIndexPrivateData which on

- // success will contain the rebuilt data but upon failure will be empty.

- // |languages| gives a list of language encodings by which the URLs and page

- // titles are broken down into words and characters.

- static scoped_refptr<URLIndexPrivateData> RebuildFromHistory(

- HistoryDatabase* history_db,

- const std::string& languages,

- const std::set<std::string>& scheme_whitelist);

+ // Returns true if the profile is shutting down. Thread-safe.

+ bool IsShutdown() const;

- // Writes |private_data| as a cache file to |file_path| and returns success

- // via |succeeded|.

- static void WritePrivateDataToCacheFileTask(

- scoped_refptr<URLIndexPrivateData> private_data,

- const FilePath& file_path,

- scoped_refptr<RefCountedBool> succeeded);

+ // Returns true if the cache is enabled and a cache database exists.

+ bool cache_enabled() const { return cache_enabled_ && cache_db_; }

- // Caches the index private data and writes the cache file to the profile

- // directory. Called by WritePrivateDataToCacheFileTask.

- bool SaveToFile(const FilePath& file_path);

- // Initializes all index data members in preparation for restoring the index

- // from the cache or a complete rebuild from the history database.

+ // Initializes all index private data members in preparation for restoring,

+ // rebuilding or resetting the index.

void Clear();

- // Returns true if there is no data in the index.

- bool Empty() const;

- // Creates a copy of ourself.

- scoped_refptr<URLIndexPrivateData> Duplicate() const;

- // Adds |word_id| to |history_id|'s entry in the history/word map,

- // creating a new entry if one does not already exist.

- void AddToHistoryIDWordMap(HistoryID history_id, WordID word_id);

- // Given a set of Char16s, finds words containing those characters.

- WordIDSet WordIDSetForTermChars(const Char16Set& term_chars);

// URL History indexing support functions.

// Indexes one URL history item as described by |row|. Returns true if the

- // row was actually indexed. |languages| gives a list of language encodings by

- // which the URLs and page titles are broken down into words and characters.

- // |scheme_whitelist| is used to filter non-qualifying schemes.

- bool IndexRow(const URLRow& row,

- const std::string& languages,

- const std::set<std::string>& scheme_whitelist);

+ // row was actually indexed.

+ bool IndexRow(const URLRow& row);

- // Adds the history item in |row| to the index if it does not already already

- // exist and it meets the minimum 'quick' criteria. If the row already exists

- // in the index then the index will be updated if the row still meets the

- // criteria, otherwise the row will be removed from the index. Returns true

- // if the index was actually updated. |languages| gives a list of language

- // encodings by which the URLs and page titles are broken down into words and

- // characters. |scheme_whitelist| is used to filter non-qualifying schemes.

- bool UpdateURL(const URLRow& row,

- const std::string& languages,

- const std::set<std::string>& scheme_whitelist);

- // Deletes index data for the history item with the given |url|.

- // The item may not have actually been indexed, which is the case if it did

- // not previously meet minimum 'quick' criteria. Returns true if the index

- // was actually updated.

- bool DeleteURL(const GURL& url);

// Parses and indexes the words in the URL and page title of |row| and

// calculate the word starts in each, saving the starts in |word_starts|.

// |languages_| gives a list of language encodings by which the URLs and page

// titles are broken down into words and characters.

void AddRowWordsToIndex(const URLRow& row,

- RowWordStarts* word_starts,

- const std::string& languages);

+ RowWordStarts* word_starts);

- // Removes |row| and all associated words and characters from the index.

- void RemoveRowFromIndex(const URLRow& row);

- // Removes all words and characters associated with |row| from the index.

- void RemoveRowWordsFromIndex(const URLRow& row);

// Given a single word in |uni_word|, adds a reference for the containing

// history item identified by |history_id| to the index.

void AddWordToIndex(const string16& uni_word, HistoryID history_id);

@@ -247,6 +279,16 @@

// |history_id| as the initial element of the word's set.

void AddWordHistory(const string16& uni_word, HistoryID history_id);

+ // Removes |row| and all associated words and characters from the index.

+ void RemoveRowFromIndex(const URLRow& row);

+ // Removes all words and characters associated with |row| from the index.

+ void RemoveRowWordsFromIndex(const URLRow& row);

+ // Adds |word_id| to |history_id|'s entry in the history/word map,

+ // creating a new entry if one does not already exist.

+ void AddToHistoryIDWordMap(HistoryID history_id, WordID word_id);

// Clears |used_| for each item in the search term cache.

void ResetSearchTermCache();

@@ -258,42 +300,52 @@

// ids for the given term given in |term|.

HistoryIDSet HistoryIDsForTerm(const string16& term);

- // Encode a data structure into the protobuf |cache|.

- void SavePrivateData(imui::InMemoryURLIndexCacheItem* cache) const;

- void SaveWordList(imui::InMemoryURLIndexCacheItem* cache) const;

- void SaveWordMap(imui::InMemoryURLIndexCacheItem* cache) const;

- void SaveCharWordMap(imui::InMemoryURLIndexCacheItem* cache) const;

- void SaveWordIDHistoryMap(imui::InMemoryURLIndexCacheItem* cache) const;

- void SaveHistoryInfoMap(imui::InMemoryURLIndexCacheItem* cache) const;

- void SaveWordStartsMap(imui::InMemoryURLIndexCacheItem* cache) const;

+ // Given a set of Char16s, finds words containing those characters.

+ WordIDSet WordIDSetForTermChars(const Char16Set& term_chars);

- // Decode a data structure from the protobuf |cache|. Return false if there

- // is any kind of failure. |languages| will be used to break URLs and page

- // titles into words

- bool RestorePrivateData(const imui::InMemoryURLIndexCacheItem& cache,

- const std::string& languages);

- bool RestoreWordList(const imui::InMemoryURLIndexCacheItem& cache);

- bool RestoreWordMap(const imui::InMemoryURLIndexCacheItem& cache);

- bool RestoreCharWordMap(const imui::InMemoryURLIndexCacheItem& cache);

- bool RestoreWordIDHistoryMap(const imui::InMemoryURLIndexCacheItem& cache);

- bool RestoreHistoryInfoMap(const imui::InMemoryURLIndexCacheItem& cache);

- bool RestoreWordStartsMap(const imui::InMemoryURLIndexCacheItem& cache,

- const std::string& languages);

+ // Restores our contents from the cache database |cache_db|.

+ bool RestoreFromCache(InMemoryURLCacheDatabase* cache_db);

+ // Deletes any old style protobuf-based cache file.

+ void DeleteOldVersionCacheFile() const;

+ // Constructs a file path for the cache database within the same directory

+ // where the history database is kept and saves that path to |file_path|.

+ // Returns true if |file_path| can be successfully constructed.

+ bool GetCacheDBPath(FilePath* file_path);

+ // Sets the cache database for testing. Takes ownership of |test_db|.

+ void SetCacheDatabaseForTesting(InMemoryURLCacheDatabase* test_db);

// Determines if |gurl| has a whitelisted scheme and returns true if so.

static bool URLSchemeIsWhitelisted(const GURL& gurl,

const std::set<std::string>& whitelist);

+ // Directory where cache database (and older protobuf-based cache file)

+ // resides. Except when unit testing, this is the same directory in which

+ // the profile's history database is found.

+ FilePath history_dir_;

+ // Languages used during the word-breaking process during indexing.

+ std::string languages_;

// Cache of search terms.

SearchTermCacheMap search_term_cache_;

+ // The cache database.

+ scoped_refptr<InMemoryURLCacheDatabase> cache_db_;

+ // true if the cache is enabled.

+ bool cache_enabled_;

+ // true once the shutdown process has begun.

+ bool shutdown_;

+ // Guard that prevents simultaneous, cross-thread access to |shutdown_|.

+ mutable base::Lock lock_;

// Start of data members that are cached -------------------------------------

- // The version of the cache file most recently used to restore this instance

- // of the private data. If the private data was rebuilt from the history

- // database this will be 0.

- int restored_cache_version_;

// A list of all indexed words. The index of a word in this list is the

// ID of the word in the word_map_. It reduces the memory overhead by

// replacing a potentially long and repeated string with a simple index.

@@ -336,10 +388,8 @@

// End of data members that are cached ---------------------------------------

- // For unit testing only. Specifies the version of the cache file to be saved.

- // Used only for testing upgrading of an older version of the cache upon

- // restore.

- int saved_cache_version_;

+ // Only URLs with a whitelisted scheme are indexed.

+ std::set<std::string> scheme_whitelist_;

// Used for unit testing only. Records the number of candidate history items

// at three stages in the index searching process.