Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(161)

Side by Side Diff: chrome/browser/history/url_index_private_data.h

Issue 9655003: Gather word-start Information to Aid in Scoring. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/
Patch Set: Created 8 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #ifndef CHROME_BROWSER_HISTORY_URL_INDEX_PRIVATE_DATA_H_ 5 #ifndef CHROME_BROWSER_HISTORY_URL_INDEX_PRIVATE_DATA_H_
6 #define CHROME_BROWSER_HISTORY_URL_INDEX_PRIVATE_DATA_H_ 6 #define CHROME_BROWSER_HISTORY_URL_INDEX_PRIVATE_DATA_H_
7 #pragma once 7 #pragma once
8 8
9 #include "base/file_path.h" 9 #include "base/file_path.h"
10 #include "base/gtest_prod_util.h" 10 #include "base/gtest_prod_util.h"
11 #include "chrome/browser/history/in_memory_url_index_types.h" 11 #include "chrome/browser/history/in_memory_url_index_types.h"
12 #include "chrome/browser/history/in_memory_url_index_cache.pb.h" 12 #include "chrome/browser/history/in_memory_url_index_cache.pb.h"
13 13
14 class HistoryQuickProviderTest; 14 class HistoryQuickProviderTest;
15 15
16 namespace in_memory_url_index { 16 namespace in_memory_url_index {
17 class InMemoryURLIndexCacheItem; 17 class InMemoryURLIndexCacheItem;
18 } 18 }
19 19
20 namespace history { 20 namespace history {
21 21
22 namespace imui = in_memory_url_index; 22 namespace imui = in_memory_url_index;
23 23
24 class HistoryDatabase; 24 class HistoryDatabase;
25 25
26 // Current version of the cache file.
27 static const int kCurrentCacheFileVersion = 1;
28
26 // A structure describing the InMemoryURLIndex's internal data and providing for 29 // A structure describing the InMemoryURLIndex's internal data and providing for
27 // restoring, rebuilding and updating that internal data. 30 // restoring, rebuilding and updating that internal data.
28 class URLIndexPrivateData { 31 class URLIndexPrivateData {
29 public: 32 public:
30 URLIndexPrivateData(); 33 URLIndexPrivateData();
31 ~URLIndexPrivateData(); 34 ~URLIndexPrivateData();
32 35
33 private: 36 private:
34 friend class AddHistoryMatch; 37 friend class AddHistoryMatch;
35 friend class ::HistoryQuickProviderTest; 38 friend class ::HistoryQuickProviderTest;
(...skipping 134 matching lines...) Expand 10 before | Expand all | Expand 10 after
170 // criteria, otherwise the row will be removed from the index. Returns true 173 // criteria, otherwise the row will be removed from the index. Returns true
171 // if the index was actually updated. 174 // if the index was actually updated.
172 bool UpdateURL(const URLRow& row); 175 bool UpdateURL(const URLRow& row);
173 176
174 // Deletes indexing data for the history item with the URL given in |url|. 177 // Deletes indexing data for the history item with the URL given in |url|.
175 // The item may not have actually been indexed, which is the case if it did 178 // The item may not have actually been indexed, which is the case if it did
176 // not previously meet minimum 'quick' criteria. Returns true if the index 179 // not previously meet minimum 'quick' criteria. Returns true if the index
177 // was actually updated. 180 // was actually updated.
178 bool DeleteURL(const GURL& url); 181 bool DeleteURL(const GURL& url);
179 182
180 // Parses and indexes the words in the URL and page title of |row|. 183 // Parses and indexes the words in the URL and page title of |row| and
181 void AddRowWordsToIndex(const URLRow& row); 184 // calculates the word starts in each, saving the starts in |word_starts|.
185 void AddRowWordsToIndex(const URLRow& row, RowWordStarts* word_starts);
182 186
183 // Removes |row| and all associated words and characters from the index. 187 // Removes |row| and all associated words and characters from the index.
184 void RemoveRowFromIndex(const URLRow& row); 188 void RemoveRowFromIndex(const URLRow& row);
185 189
186 // Removes all words and characters associated with |row| from the index. 190 // Removes all words and characters associated with |row| from the index.
187 void RemoveRowWordsFromIndex(const URLRow& row); 191 void RemoveRowWordsFromIndex(const URLRow& row);
188 192
189 // Given a single word in |uni_word|, adds a reference for the containing 193 // Given a single word in |uni_word|, adds a reference for the containing
190 // history item identified by |history_id| to the index. 194 // history item identified by |history_id| to the index.
191 void AddWordToIndex(const string16& uni_word, HistoryID history_id); 195 void AddWordToIndex(const string16& uni_word, HistoryID history_id);
(...skipping 23 matching lines...) Expand all
215 // in the user input, 2) completeness of each term's match, 3) ordering 219 // in the user input, 2) completeness of each term's match, 3) ordering
216 // of the occurrence of each term (i.e. they appear in order), 4) last 220 // of the occurrence of each term (i.e. they appear in order), 4) last
217 // visit time, and 5) number of visits. 221 // visit time, and 5) number of visits.
218 // This raw score allows the results to be ordered and can be used 222 // This raw score allows the results to be ordered and can be used
219 // to influence the final score calculated by the client of this 223 // to influence the final score calculated by the client of this
220 // index. Returns a ScoredHistoryMatch structure with the raw score and 224 // index. Returns a ScoredHistoryMatch structure with the raw score and
221 // substring matching metrics. 225 // substring matching metrics.
222 static ScoredHistoryMatch ScoredMatchForURL( 226 static ScoredHistoryMatch ScoredMatchForURL(
223 const URLRow& row, 227 const URLRow& row,
224 const string16& lower_string, 228 const string16& lower_string,
225 const String16Vector& terms_vector); 229 const String16Vector& terms_vector,
230 const RowWordStarts& word_starts);
226 231
227 // Calculates a component score based on position, ordering and total 232 // Calculates a component score based on position, ordering and total
228 // substring match size using metrics recorded in |matches|. |max_length| 233 // substring match size using metrics recorded in |matches|. |max_length|
229 // is the length of the string against which the terms are being searched. 234 // is the length of the string against which the terms are being searched.
230 static int ScoreComponentForMatches(const TermMatches& matches, 235 static int ScoreComponentForMatches(const TermMatches& matches,
231 size_t max_length); 236 size_t max_length);
232 237
233 // Determines if |gurl| has a whitelisted scheme and returns true if so. 238 // Determines if |gurl| has a whitelisted scheme and returns true if so.
234 bool URLSchemeIsWhitelisted(const GURL& gurl) const; 239 bool URLSchemeIsWhitelisted(const GURL& gurl) const;
235 240
241 // Sets the version of the cache file that will be saved when calling
242 // SavePrivateData(). For unit testing only.
243 void set_saved_cache_version(int version) { saved_cache_version_ = version; }
244
236 // Encode a data structure into the protobuf |cache|. 245 // Encode a data structure into the protobuf |cache|.
237 void SavePrivateData(imui::InMemoryURLIndexCacheItem* cache) const; 246 void SavePrivateData(imui::InMemoryURLIndexCacheItem* cache) const;
238 void SaveWordList(imui::InMemoryURLIndexCacheItem* cache) const; 247 void SaveWordList(imui::InMemoryURLIndexCacheItem* cache) const;
239 void SaveWordMap(imui::InMemoryURLIndexCacheItem* cache) const; 248 void SaveWordMap(imui::InMemoryURLIndexCacheItem* cache) const;
240 void SaveCharWordMap(imui::InMemoryURLIndexCacheItem* cache) const; 249 void SaveCharWordMap(imui::InMemoryURLIndexCacheItem* cache) const;
241 void SaveWordIDHistoryMap(imui::InMemoryURLIndexCacheItem* cache) const; 250 void SaveWordIDHistoryMap(imui::InMemoryURLIndexCacheItem* cache) const;
242 void SaveHistoryInfoMap(imui::InMemoryURLIndexCacheItem* cache) const; 251 void SaveHistoryInfoMap(imui::InMemoryURLIndexCacheItem* cache) const;
252 void SaveWordStartsMap(imui::InMemoryURLIndexCacheItem* cache) const;
243 253
244 // Decode a data structure from the protobuf |cache|. Return false if there 254 // Decode a data structure from the protobuf |cache|. Return false if there
245 // is any kind of failure. 255 // is any kind of failure.
246 bool RestorePrivateData(const imui::InMemoryURLIndexCacheItem& cache); 256 bool RestorePrivateData(const imui::InMemoryURLIndexCacheItem& cache);
247 bool RestoreWordList(const imui::InMemoryURLIndexCacheItem& cache); 257 bool RestoreWordList(const imui::InMemoryURLIndexCacheItem& cache);
248 bool RestoreWordMap(const imui::InMemoryURLIndexCacheItem& cache); 258 bool RestoreWordMap(const imui::InMemoryURLIndexCacheItem& cache);
249 bool RestoreCharWordMap(const imui::InMemoryURLIndexCacheItem& cache); 259 bool RestoreCharWordMap(const imui::InMemoryURLIndexCacheItem& cache);
250 bool RestoreWordIDHistoryMap(const imui::InMemoryURLIndexCacheItem& cache); 260 bool RestoreWordIDHistoryMap(const imui::InMemoryURLIndexCacheItem& cache);
251 bool RestoreHistoryInfoMap(const imui::InMemoryURLIndexCacheItem& cache); 261 bool RestoreHistoryInfoMap(const imui::InMemoryURLIndexCacheItem& cache);
262 bool RestoreWordStartsMap(const imui::InMemoryURLIndexCacheItem& cache);
252 263
253 // Cache of search terms. 264 // Cache of search terms.
254 SearchTermCacheMap search_term_cache_; 265 SearchTermCacheMap search_term_cache_;
255 266
256 // Languages used during the word-breaking process during indexing. 267 // Languages used during the word-breaking process during indexing.
257 std::string languages_; 268 std::string languages_;
258 269
259 // Only URLs with a whitelisted scheme are indexed. 270 // Only URLs with a whitelisted scheme are indexed.
260 std::set<std::string> scheme_whitelist_; 271 std::set<std::string> scheme_whitelist_;
261 272
262 // Start of data members that are cached ------------------------------------- 273 // Start of data members that are cached -------------------------------------
263 274
275 // The version of the cache file most recently used to restore this instance
276 // of the private data. If the private data was rebuilt from the history
277 // database this will be 0.
278 int restored_cache_version_;
279
264 // A list of all indexed words. The index of a word in this list is the 280 // A list of all indexed words. The index of a word in this list is the
265 // ID of the word in the word_map_. It reduces the memory overhead by 281 // ID of the word in the word_map_. It reduces the memory overhead by
266 // replacing a potentially long and repeated string with a simple index. 282 // replacing a potentially long and repeated string with a simple index.
267 String16Vector word_list_; 283 String16Vector word_list_;
268 284
269 // A list of available word slots in |word_list_|. An available word slot 285 // A list of available word slots in |word_list_|. An available word slot
270 // is the index of an unused word in the word_list_ vector, also referred to as 286 // is the index of an unused word in the word_list_ vector, also referred to as
271 // a WordID. As URL visits are added or modified new words may be added to 287 // a WordID. As URL visits are added or modified new words may be added to
272 // the index, in which case any available words are used, if any, and then 288 // the index, in which case any available words are used, if any, and then
273 // words are added to the end of the word_list_. When URL visits are 289 // words are added to the end of the word_list_. When URL visits are
(...skipping 16 matching lines...) Expand all
290 306
291 // A one-to-many mapping from a HistoryID to all WordIDs of words that occur 307 // A one-to-many mapping from a HistoryID to all WordIDs of words that occur
292 // in the URL and/or page title of the history item referenced by that 308 // in the URL and/or page title of the history item referenced by that
293 // HistoryID. 309 // HistoryID.
294 HistoryIDWordMap history_id_word_map_; 310 HistoryIDWordMap history_id_word_map_;
295 311
296 // A one-to-one mapping from HistoryID to the history item data governing 312 // A one-to-one mapping from HistoryID to the history item data governing
297 // index inclusion and relevance scoring. 313 // index inclusion and relevance scoring.
298 HistoryInfoMap history_info_map_; 314 HistoryInfoMap history_info_map_;
299 315
316 // A one-to-one mapping from HistoryID to the word starts detected in each
317 // item's URL and page title.
318 WordStartsMap word_starts_map_;
319
300 // End of data members that are cached --------------------------------------- 320 // End of data members that are cached ---------------------------------------
301 321
322 // For unit testing only. Specifies the version of the cache file to be saved.
323 // Used only for testing upgrading of an older version of the cache upon
324 // restore.
325 int saved_cache_version_;
326
302 // Used for unit testing only. Records the number of candidate history items 327 // Used for unit testing only. Records the number of candidate history items
303 // at three stages in the index searching process. 328 // at three stages in the index searching process.
304 size_t pre_filter_item_count_; // After word index is queried. 329 size_t pre_filter_item_count_; // After word index is queried.
305 size_t post_filter_item_count_; // After trimming large result set. 330 size_t post_filter_item_count_; // After trimming large result set.
306 size_t post_scoring_item_count_; // After performing final filter/scoring. 331 size_t post_scoring_item_count_; // After performing final filter/scoring.
307 }; 332 };
308 333
309 } // namespace history 334 } // namespace history
310 335
311 #endif // CHROME_BROWSER_HISTORY_URL_INDEX_PRIVATE_DATA_H_ 336 #endif // CHROME_BROWSER_HISTORY_URL_INDEX_PRIVATE_DATA_H_
OLDNEW
« no previous file with comments | « chrome/browser/history/in_memory_url_index_unittest.cc ('k') | chrome/browser/history/url_index_private_data.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698