OLD | NEW |
| (Empty) |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "chrome/browser/history/text_database_manager.h" | |
6 | |
7 #include <algorithm> | |
8 #include <functional> | |
9 | |
10 #include "base/bind.h" | |
11 #include "base/compiler_specific.h" | |
12 #include "base/files/file_enumerator.h" | |
13 #include "base/logging.h" | |
14 #include "base/message_loop/message_loop.h" | |
15 #include "base/metrics/histogram.h" | |
16 #include "base/strings/string_util.h" | |
17 #include "base/strings/utf_string_conversions.h" | |
18 #include "chrome/browser/history/history_publisher.h" | |
19 #include "chrome/browser/history/visit_database.h" | |
20 | |
21 using base::Time; | |
22 using base::TimeDelta; | |
23 using base::TimeTicks; | |
24 | |
25 namespace history { | |
26 | |
namespace {

// The number of database files we will be attached to at once.
const int kCacheDBSize = 5;

// Normalizes |input| for the full-text indexer: collapses runs of whitespace
// (without trimming the ends) and converts the result to UTF-8.
std::string ConvertStringForIndexer(const string16& input) {
  // TODO(evanm): other transformations here?
  return UTF16ToUTF8(CollapseWhitespace(input, false));
}

// Data older than this will be committed to the full text index even if we
// haven't gotten a title and/or body.
const int kExpirationSeconds = 20;

}  // namespace
42 | |
// TextDatabaseManager::ChangeSet ----------------------------------------------

// ChangeSet records which monthly database files were touched by a deletion
// so they can later be passed to OptimizeChangedDatabases().
TextDatabaseManager::ChangeSet::ChangeSet() {}

TextDatabaseManager::ChangeSet::~ChangeSet() {}
48 | |
// TextDatabaseManager::PageInfo -----------------------------------------------

// Captures the identifying data for a visit whose title/body have not yet
// both arrived. |added_time_| marks when the entry was created so Expired()
// can force a flush after kExpirationSeconds.
TextDatabaseManager::PageInfo::PageInfo(URLID url_id,
                                        VisitID visit_id,
                                        Time visit_time)
    : url_id_(url_id),
      visit_id_(visit_id),
      visit_time_(visit_time) {
  added_time_ = TimeTicks::Now();
}

TextDatabaseManager::PageInfo::~PageInfo() {}
61 | |
62 void TextDatabaseManager::PageInfo::set_title(const string16& ttl) { | |
63 if (ttl.empty()) // Make the title nonempty when we set it for EverybodySet. | |
64 title_ = ASCIIToUTF16(" "); | |
65 else | |
66 title_ = ttl; | |
67 } | |
68 | |
69 void TextDatabaseManager::PageInfo::set_body(const string16& bdy) { | |
70 if (bdy.empty()) // Make the body nonempty when we set it for EverybodySet. | |
71 body_ = ASCIIToUTF16(" "); | |
72 else | |
73 body_ = bdy; | |
74 } | |
75 | |
// Returns true once this entry has been pending longer than
// kExpirationSeconds; such entries are flushed to the index even if the
// title or body never arrived.
bool TextDatabaseManager::PageInfo::Expired(TimeTicks now) const {
  return now - added_time_ > base::TimeDelta::FromSeconds(kExpirationSeconds);
}
79 | |
// TextDatabaseManager ---------------------------------------------------------

// |url_database| and |visit_database| are borrowed, not owned; they must
// outlive this object. Both caches are created with NO_AUTO_EVICT because
// eviction is managed explicitly (see CommitTransaction / GetDB).
TextDatabaseManager::TextDatabaseManager(const base::FilePath& dir,
                                         URLDatabase* url_database,
                                         VisitDatabase* visit_database)
    : dir_(dir),
      url_database_(url_database),
      visit_database_(visit_database),
      recent_changes_(RecentChangeList::NO_AUTO_EVICT),
      transaction_nesting_(0),
      db_cache_(DBCache::NO_AUTO_EVICT),
      present_databases_loaded_(false),
      weak_factory_(this),
      history_publisher_(NULL) {
}

TextDatabaseManager::~TextDatabaseManager() {
  // Don't leave an open transaction dangling on the attached databases.
  if (transaction_nesting_)
    CommitTransaction();
}
100 | |
101 // static | |
102 TextDatabase::DBIdent TextDatabaseManager::TimeToID(Time time) { | |
103 Time::Exploded exploded; | |
104 time.UTCExplode(&exploded); | |
105 | |
106 // We combine the month and year into a 6-digit number (200801 for | |
107 // January, 2008). The month is 1-based. | |
108 return exploded.year * 100 + exploded.month; | |
109 } | |
110 | |
// static
// Inverse of TimeToID(): reconstructs a Time at the identifier's year/month.
Time TextDatabaseManager::IDToTime(TextDatabase::DBIdent id) {
  Time::Exploded exploded;
  memset(&exploded, 0, sizeof(Time::Exploded));
  exploded.year = id / 100;
  exploded.month = id % 100;
  // NOTE(review): day_of_month is left at 0 by the memset; presumably
  // Time::FromUTCExploded tolerates this as "start of the month" — confirm.
  return Time::FromUTCExploded(exploded);
}
119 | |
// Stores the (possibly NULL) publisher and kicks off the periodic flushing
// of expired uncommitted entries. Always returns true.
bool TextDatabaseManager::Init(const HistoryPublisher* history_publisher) {
  history_publisher_ = history_publisher;

  // Start checking recent changes and committing them.
  ScheduleFlushOldChanges();
  return true;
}

// Transactions may nest; only the outermost Begin/Commit pair has an effect.
void TextDatabaseManager::BeginTransaction() {
  transaction_nesting_++;
}
131 | |
// Decrements the nesting count; only the outermost commit actually commits
// the per-file transactions accumulated in |open_transactions_|.
void TextDatabaseManager::CommitTransaction() {
  DCHECK(transaction_nesting_);
  transaction_nesting_--;
  if (transaction_nesting_)
    return;  // Still more nesting of transactions before committing.

  // Commit all databases with open transactions on them.
  for (DBIdentSet::const_iterator i = open_transactions_.begin();
       i != open_transactions_.end(); ++i) {
    DBCache::iterator iter = db_cache_.Get(*i);
    if (iter == db_cache_.end()) {
      NOTREACHED() << "All open transactions should be cached.";
      continue;
    }
    iter->second->CommitTransaction();
  }
  open_transactions_.clear();

  // Now that the transaction is over, we can expire old connections.
  // (Eviction is deferred until here so we never close a database that still
  // has an uncommitted transaction.)
  db_cache_.ShrinkToSize(kCacheDBSize);
}
153 | |
// Lazily populates |present_databases_| with the identifiers of every
// database file found on disk in |dir_|. Subsequent calls are no-ops.
void TextDatabaseManager::InitDBList() {
  if (present_databases_loaded_)
    return;

  present_databases_loaded_ = true;

  // Find files on disk matching our pattern so we can quickly test for them.
  base::FilePath::StringType filepattern(TextDatabase::file_base());
  filepattern.append(FILE_PATH_LITERAL("*"));
  base::FileEnumerator enumerator(
      dir_, false, base::FileEnumerator::FILES, filepattern);
  base::FilePath cur_file;
  while (!(cur_file = enumerator.Next()).empty()) {
    // Convert to the number representing this file.
    TextDatabase::DBIdent id = TextDatabase::FileNameToID(cur_file);
    if (id)  // Will be 0 on error.
      present_databases_.insert(id);
  }
}
173 | |
174 void TextDatabaseManager::AddPageURL(const GURL& url, | |
175 URLID url_id, | |
176 VisitID visit_id, | |
177 Time time) { | |
178 // Delete any existing page info. | |
179 RecentChangeList::iterator found = recent_changes_.Peek(url); | |
180 if (found != recent_changes_.end()) | |
181 recent_changes_.Erase(found); | |
182 | |
183 // Just save this info for later. We will save it when it expires or when all | |
184 // the data is complete. | |
185 recent_changes_.Put(url, PageInfo(url_id, visit_id, time)); | |
186 } | |
187 | |
// Attaches |title| to the pending entry for |url|. If the entry already has
// a body, the page is complete and is indexed immediately; otherwise the
// title is just stored. If there is no pending entry, falls back to indexing
// against the most recent visit in the visit database.
void TextDatabaseManager::AddPageTitle(const GURL& url,
                                       const string16& title) {
  RecentChangeList::iterator found = recent_changes_.Peek(url);
  if (found == recent_changes_.end()) {
    // This page is not in our cache of recent pages. This is very much an edge
    // case as normally a title will come in <20 seconds after the page commits,
    // and WebContents will avoid spamming us with >1 title per page. However,
    // it could come up if your connection is unhappy, and we don't want to
    // miss anything.
    //
    // To solve this problem, we'll just associate the most recent visit with
    // the new title and index that using the regular code path.
    URLRow url_row;
    if (!url_database_->GetRowForURL(url, &url_row))
      return;  // URL is unknown, give up.
    VisitRow visit;
    if (!visit_database_->GetMostRecentVisitForURL(url_row.id(), &visit))
      return;  // No recent visit, give up.

    if (visit.is_indexed) {
      // If this page was already indexed, we could have a body that came in
      // first and we don't want to overwrite it. We could go query for the
      // current body, or have a special setter for only the title, but this is
      // not worth it for this edge case.
      //
      // It will be almost impossible for the title to take longer than
      // kExpirationSeconds yet we got a body in less than that time, since
      // the title should always come in first.
      return;
    }

    AddPageData(url, url_row.id(), visit.visit_id, visit.visit_time,
                title, string16());
    return;  // Indexed via the most-recent-visit fallback above.
  }

  PageInfo& info = found->second;
  if (info.has_body()) {
    // This info is complete, write to the database.
    AddPageData(url, info.url_id(), info.visit_id(), info.visit_time(),
                title, info.body());
    recent_changes_.Erase(found);
    return;
  }

  info.set_title(title);
}
235 | |
// Attaches |body| to the pending entry for |url|. Mirror image of
// AddPageTitle(): if the entry already has a title the page is indexed
// immediately, otherwise the body is stored. If there is no pending entry,
// falls back to indexing against the most recent visit, using the title
// stored in the URL table.
void TextDatabaseManager::AddPageContents(const GURL& url,
                                          const string16& body) {
  RecentChangeList::iterator found = recent_changes_.Peek(url);
  if (found == recent_changes_.end()) {
    // This page is not in our cache of recent pages. This means that the page
    // took more than kExpirationSeconds to load. Often, this will be the result
    // of a very slow iframe or other resource on the page that makes us think
    // it's still loading.
    //
    // As a fallback, set the most recent visit's contents using the input, and
    // use the last set title in the URL table as the title to index.
    URLRow url_row;
    if (!url_database_->GetRowForURL(url, &url_row))
      return;  // URL is unknown, give up.
    VisitRow visit;
    if (!visit_database_->GetMostRecentVisitForURL(url_row.id(), &visit))
      return;  // No recent visit, give up.

    // Use the title from the URL row as the title for the indexing.
    AddPageData(url, url_row.id(), visit.visit_id, visit.visit_time,
                url_row.title(), body);
    return;
  }

  PageInfo& info = found->second;
  if (info.has_title()) {
    // This info is complete, write to the database.
    AddPageData(url, info.url_id(), info.visit_id(), info.visit_time(),
                info.title(), body);
    recent_changes_.Erase(found);
    return;
  }

  info.set_body(body);
}
271 | |
// Writes one page's title/body into the monthly database covering
// |visit_time|, de-indexing any previously-indexed visits for the same URL
// first. If |visit_id| is nonzero, the corresponding visit row is marked
// is_indexed. Returns false if the target database can't be opened, the
// visit row is missing, or the row update fails.
bool TextDatabaseManager::AddPageData(const GURL& url,
                                      URLID url_id,
                                      VisitID visit_id,
                                      Time visit_time,
                                      const string16& title,
                                      const string16& body) {
  TextDatabase* db = GetDBForTime(visit_time, true);
  if (!db)
    return false;

  TimeTicks beginning_time = TimeTicks::Now();

  // First delete any recently-indexed data for this page. This will delete
  // anything in the main database, but we don't bother looking through the
  // archived database.
  VisitVector visits;
  visit_database_->GetIndexedVisitsForURL(url_id, &visits);
  for (size_t i = 0; i < visits.size(); i++) {
    visits[i].is_indexed = false;
    visit_database_->UpdateVisitRow(visits[i]);
    DeletePageData(visits[i].visit_time, url, NULL);
  }

  if (visit_id) {
    // We're supposed to update the visit database, so load the visit.
    VisitRow row;
    if (!visit_database_->GetRowForVisit(visit_id, &row)) {
      // This situation can occur if Chrome's history is in the process of
      // being updated, and then the browsing history is deleted before all
      // updates have been completely performed. In this case, a stale update
      // to the database is attempted, leading to the warning below.
      DLOG(WARNING) << "Could not find requested visit #" << visit_id;
      return false;
    }

    DCHECK(visit_time == row.visit_time);

    // Update the visit database to reference our addition.
    row.is_indexed = true;
    if (!visit_database_->UpdateVisitRow(row))
      return false;
  }

  // Now index the data.
  std::string url_str = URLDatabase::GURLToDatabaseURL(url);
  bool success = db->AddPageData(visit_time, url_str,
                                 ConvertStringForIndexer(title),
                                 ConvertStringForIndexer(body));

  UMA_HISTOGRAM_TIMES("History.AddFTSData",
                      TimeTicks::Now() - beginning_time);

  // Notify an external indexer, if one is registered.
  if (history_publisher_)
    history_publisher_->PublishPageContent(visit_time, url, title, body);

  return success;
}
329 | |
// Removes the indexed data for |url| at |time| from the corresponding
// monthly database, if that database file exists. The touched database
// identifier is recorded in |change_set| (when non-NULL) so it can be
// optimized later.
void TextDatabaseManager::DeletePageData(Time time, const GURL& url,
                                         ChangeSet* change_set) {
  TextDatabase::DBIdent db_ident = TimeToID(time);

  // We want to open the database for writing, but only if it exists. To
  // achieve this, we check whether it exists by saying we're not going to
  // write to it (avoiding the autocreation code normally called when writing)
  // and then access it for writing only if it succeeds.
  TextDatabase* db = GetDB(db_ident, false);
  if (!db)
    return;
  db = GetDB(db_ident, true);

  if (change_set)
    change_set->Add(db_ident);

  db->DeletePageData(time, URLDatabase::GURLToDatabaseURL(url));
}
348 | |
// Drops uncommitted (not-yet-indexed) entries whose visit time falls in
// [begin, end). When |restrict_urls| is nonempty, only entries for those
// URLs are dropped. A null |begin|/|end| means unbounded on that side.
void TextDatabaseManager::DeleteFromUncommitted(
    const std::set<GURL>& restrict_urls, Time begin, Time end) {
  // First find the beginning of the range to delete. Recall that the list
  // has the most recent item at the beginning. There won't normally be very
  // many items, so a brute-force search is fine.
  RecentChangeList::iterator cur = recent_changes_.begin();
  if (!end.is_null()) {
    // Walk from the beginning of the list backwards in time to find the newest
    // entry that should be deleted.
    while (cur != recent_changes_.end() && cur->second.visit_time() >= end)
      ++cur;
  }

  // Now delete all visits up to the oldest one we were supposed to delete.
  // Note that if begin is_null, it will be less than or equal to any other
  // time.
  if (restrict_urls.empty()) {
    while (cur != recent_changes_.end() && cur->second.visit_time() >= begin)
      cur = recent_changes_.Erase(cur);
  } else {
    while (cur != recent_changes_.end() && cur->second.visit_time() >= begin) {
      if (restrict_urls.find(cur->first) != restrict_urls.end())
        cur = recent_changes_.Erase(cur);
      else
        ++cur;
    }
  }
}
377 | |
// Drops uncommitted entries whose visit time exactly matches one of |times|.
// Runs a linear merge over the two lists, so |times| must be sorted in
// reverse chronological order (matching |recent_changes_|).
void TextDatabaseManager::DeleteFromUncommittedForTimes(
    const std::vector<base::Time>& times) {
  // |times| must be in reverse chronological order, i.e. each member
  // must be earlier than or the same as the one before it.
  DCHECK(
      std::adjacent_find(
          times.begin(), times.end(), std::less<base::Time>()) ==
      times.end());

  // Both |recent_changes_| and |times| are in reverse chronological order.
  RecentChangeList::iterator it = recent_changes_.begin();
  std::vector<base::Time>::const_iterator time_it = times.begin();
  while (it != recent_changes_.end() && time_it != times.end()) {
    base::Time visit_time = it->second.visit_time();
    if (visit_time == *time_it) {
      it = recent_changes_.Erase(it);
    } else if (visit_time < *time_it) {
      // The change is older than the current target time; advance to the
      // next (older) target.
      ++time_it;
    } else /* if (visit_time > *time_it) */ {
      ++it;
    }
  }
}
401 | |
// Deletes every full-text database: pending entries, open connections, and
// the database files on disk. Must not be called inside a transaction.
void TextDatabaseManager::DeleteAll() {
  DCHECK_EQ(0, transaction_nesting_) << "Calling deleteAll in a transaction.";

  // Make sure |present_databases_| reflects every file on disk.
  InitDBList();

  // Delete uncommitted entries.
  recent_changes_.Clear();

  // Close all open databases (files must be closed before deletion).
  db_cache_.Clear();

  // Now go through and delete all the files.
  for (DBIdentSet::iterator i = present_databases_.begin();
       i != present_databases_.end(); ++i) {
    base::FilePath file_name = dir_.Append(TextDatabase::IDToFileName(*i));
    sql::Connection::Delete(file_name);
  }
}
420 | |
// Runs Optimize() on each existing database recorded in |change_set|
// (typically after deletions have left the FTS index fragmented).
void TextDatabaseManager::OptimizeChangedDatabases(
    const ChangeSet& change_set) {
  for (ChangeSet::DBSet::const_iterator i =
           change_set.changed_databases_.begin();
       i != change_set.changed_databases_.end(); ++i) {
    // We want to open the database for writing, but only if it exists. To
    // achieve this, we check whether it exists by saying we're not going to
    // write to it (avoiding the autocreation code normally called when writing)
    // and then access it for writing only if it succeeds.
    TextDatabase* db = GetDB(*i, false);
    if (!db)
      continue;
    db = GetDB(*i, true);
    if (!db)
      continue;  // The file may have changed or something.
    db->Optimize();
  }
}
439 | |
// Runs the full-text query against every monthly database overlapping
// [options.begin_time, options.end_time), newest first, accumulating into
// |results| until the max count is hit. |first_time_searched| is set to the
// oldest time actually covered, so callers can page further back.
void TextDatabaseManager::GetTextMatches(
    const string16& query,
    const QueryOptions& options,
    std::vector<TextDatabase::Match>* results,
    Time* first_time_searched) {
  results->clear();

  // Default: the whole requested range was searched.
  *first_time_searched = options.begin_time;

  InitDBList();
  if (present_databases_.empty())
    return;  // Nothing to search.

  // Get the query into the proper format for the individual DBs.
  string16 fts_query16;
  query_parser_.ParseQuery(query, &fts_query16);
  std::string fts_query = UTF16ToUTF8(fts_query16);

  // Need a copy of the options so we can modify the max count for each call
  // to the individual databases.
  QueryOptions cur_options(options);

  // Compute the minimum and maximum values for the identifiers that could
  // encompass the input time range.
  TextDatabase::DBIdent min_ident = options.begin_time.is_null() ?
      *present_databases_.begin() :
      TimeToID(options.begin_time);
  TextDatabase::DBIdent max_ident = options.end_time.is_null() ?
      *present_databases_.rbegin() :
      TimeToID(options.end_time);

  // Iterate over the databases from the most recent backwards.
  TextDatabase::URLSet found_urls;
  for (DBIdentSet::reverse_iterator i = present_databases_.rbegin();
       i != present_databases_.rend();
       ++i) {
    // TODO(brettw) allow canceling the query in the middle.
    // if (canceled_or_something)
    //   break;

    // This code is stupid, we just loop until we find the correct starting
    // time range rather than search in an intelligent way. Users will have a
    // few dozen files at most, so this should not be an issue.
    if (*i > max_ident)
      continue;  // Haven't gotten to the time range yet.
    if (*i < min_ident)
      break;  // Covered all the time range.

    // Read-only open: don't create a file just to search it.
    TextDatabase* cur_db = GetDB(*i, false);
    if (!cur_db)
      continue;

    // Adjust the max count according to how many results we've already got.
    if (options.max_count) {
      cur_options.max_count = options.max_count -
          static_cast<int>(results->size());
    }

    bool has_more_results = cur_db->GetTextMatches(
        fts_query, cur_options, results, &found_urls);

    DCHECK(static_cast<int>(results->size()) <= options.EffectiveMaxCount());

    if (has_more_results ||
        static_cast<int>(results->size()) == options.EffectiveMaxCount()) {
      // Since the search proceeds backwards in time, the last result we have
      // gives the first time searched.
      *first_time_searched = results->back().time;
      break;
    }
  }
}
512 | |
// Test-only accessor: number of pending (not yet indexed) page entries.
size_t TextDatabaseManager::GetUncommittedEntryCountForTest() const {
  return recent_changes_.size();
}
516 | |
// Returns the (possibly cached) database for |id|, opening or creating it as
// needed. |for_writing| controls whether a missing file is created, and
// whether the database is enrolled in any currently-open transaction.
// Returns NULL if the database could not be initialized. The returned
// pointer is owned by |db_cache_| and may be evicted later.
TextDatabase* TextDatabaseManager::GetDB(TextDatabase::DBIdent id,
                                         bool for_writing) {
  DBCache::iterator found_db = db_cache_.Get(id);
  if (found_db != db_cache_.end()) {
    if (transaction_nesting_ && for_writing &&
        open_transactions_.find(id) == open_transactions_.end()) {
      // If we currently have an open transaction, that database is not yet
      // part of the transaction, and the database will be written to, it needs
      // to be part of our transaction.
      found_db->second->BeginTransaction();
      open_transactions_.insert(id);
    }
    return found_db->second;
  }

  // Need to make the database.
  TextDatabase* new_db = new TextDatabase(dir_, id, for_writing);
  if (!new_db->Init()) {
    delete new_db;
    return NULL;
  }
  db_cache_.Put(id, new_db);
  present_databases_.insert(id);

  if (transaction_nesting_ && for_writing) {
    // If we currently have an open transaction and the new database will be
    // written to, it needs to be part of our transaction.
    new_db->BeginTransaction();
    open_transactions_.insert(id);
  }

  // When no transaction is open, allow this new one to kick out an old one.
  // (During a transaction, eviction waits until CommitTransaction so we
  // never close a database with pending writes.)
  if (!transaction_nesting_)
    db_cache_.ShrinkToSize(kCacheDBSize);

  return new_db;
}
554 | |
// Convenience wrapper: maps |time| to its monthly database identifier and
// fetches that database via GetDB().
TextDatabase* TextDatabaseManager::GetDBForTime(Time time,
                                                bool create_if_necessary) {
  return GetDB(TimeToID(time), create_if_necessary);
}
559 | |
// (Re-)arms the delayed task that flushes expired uncommitted entries.
// Invalidating outstanding weak pointers first guarantees at most one
// pending flush task at a time.
void TextDatabaseManager::ScheduleFlushOldChanges() {
  weak_factory_.InvalidateWeakPtrs();
  base::MessageLoop::current()->PostDelayedTask(
      FROM_HERE,
      base::Bind(&TextDatabaseManager::FlushOldChanges,
                 weak_factory_.GetWeakPtr()),
      base::TimeDelta::FromSeconds(kExpirationSeconds));
}

// Timer callback: flush everything that has expired as of now.
void TextDatabaseManager::FlushOldChanges() {
  FlushOldChangesForTime(TimeTicks::Now());
}
572 | |
// Commits every uncommitted entry that has expired relative to |now|
// (oldest first), then reschedules the next flush. Split out from
// FlushOldChanges() so tests can inject a time.
void TextDatabaseManager::FlushOldChangesForTime(TimeTicks now) {
  // The end of the list is the oldest, so we just start from there committing
  // things until we get something too new.
  RecentChangeList::reverse_iterator i = recent_changes_.rbegin();
  while (i != recent_changes_.rend() && i->second.Expired(now)) {
    AddPageData(i->first, i->second.url_id(), i->second.visit_id(),
                i->second.visit_time(), i->second.title(), i->second.body());
    i = recent_changes_.Erase(i);
  }

  ScheduleFlushOldChanges();
}
585 | |
586 } // namespace history | |
OLD | NEW |