OLD | NEW |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "components/omnibox/browser/scored_history_match.h" | 5 #include "components/omnibox/browser/scored_history_match.h" |
6 | 6 |
7 #include <math.h> | 7 #include <math.h> |
8 | 8 |
9 #include <algorithm> | 9 #include <algorithm> |
10 #include <vector> | 10 #include <vector> |
(...skipping 455 matching lines...) |
466 url_matches = FilterTermMatchesByWordStarts( | 466 url_matches = FilterTermMatchesByWordStarts( |
467 url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_, | 467 url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_, |
468 end_of_hostname_pos, std::string::npos); | 468 end_of_hostname_pos, std::string::npos); |
469 if (colon_pos != std::string::npos) { | 469 if (colon_pos != std::string::npos) { |
470 // Also filter matches not at a word boundary and in the scheme. | 470 // Also filter matches not at a word boundary and in the scheme. |
471 url_matches = FilterTermMatchesByWordStarts( | 471 url_matches = FilterTermMatchesByWordStarts( |
472 url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_, | 472 url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_, |
473 0, colon_pos); | 473 0, colon_pos); |
474 } | 474 } |
475 for (const auto& url_match : url_matches) { | 475 for (const auto& url_match : url_matches) { |
476 const size_t term_offset = terms_to_word_starts_offsets[url_match.term_num]; | 476 // Calculate the offset in the URL string where the meaningful (word) part |
| 477 // of the term starts. This takes into account times when a term starts |
| 478 // with punctuation such as "/foo". |
| 479 const size_t term_word_offset = |
| 480 url_match.offset + terms_to_word_starts_offsets[url_match.term_num]; |
477 // Advance next_word_starts until it's >= the position of the term we're | 481 // Advance next_word_starts until it's >= the position of the term we're |
478 // considering (adjusted for where the word begins within the term). | 482 // considering (adjusted for where the word begins within the term). |
479 while ((next_word_starts != end_word_starts) && | 483 while ((next_word_starts != end_word_starts) && |
480 (*next_word_starts < (url_match.offset + term_offset))) { | 484 (*next_word_starts < term_word_offset)) { |
481 ++next_word_starts; | 485 ++next_word_starts; |
482 } | 486 } |
483 const bool at_word_boundary = | 487 const bool at_word_boundary = (next_word_starts != end_word_starts) && |
484 (next_word_starts != end_word_starts) && | 488 (*next_word_starts == term_word_offset); |
485 (*next_word_starts == url_match.offset + term_offset); | |
486 if ((question_mark_pos != std::string::npos) && | 489 if ((question_mark_pos != std::string::npos) && |
487 (url_match.offset > question_mark_pos)) { | 490 (term_word_offset >= question_mark_pos)) { |
488 // The match is in a CGI ?... fragment. | 491 // The match is in a CGI ?... fragment. |
489 DCHECK(at_word_boundary); | 492 DCHECK(at_word_boundary); |
490 term_scores[url_match.term_num] += 5; | 493 term_scores[url_match.term_num] += 5; |
491 } else if ((end_of_hostname_pos != std::string::npos) && | 494 } else if ((end_of_hostname_pos != std::string::npos) && |
492 (url_match.offset > end_of_hostname_pos)) { | 495 (term_word_offset >= end_of_hostname_pos)) { |
493 // The match is in the path. | 496 // The match is in the path. |
494 DCHECK(at_word_boundary); | 497 DCHECK(at_word_boundary); |
495 term_scores[url_match.term_num] += 8; | 498 term_scores[url_match.term_num] += 8; |
496 } else if ((colon_pos == std::string::npos) || | 499 } else if ((colon_pos == std::string::npos) || |
497 (url_match.offset > colon_pos)) { | 500 (term_word_offset >= colon_pos)) { |
498 // The match is in the hostname. | 501 // The match is in the hostname. |
499 if ((last_part_of_hostname_pos == std::string::npos) || | 502 if ((last_part_of_hostname_pos == std::string::npos) || |
500 (url_match.offset < last_part_of_hostname_pos)) { | 503 (term_word_offset < last_part_of_hostname_pos)) { |
501 // Either there are no dots in the hostname or this match isn't | 504 // Either there are no dots in the hostname or this match isn't |
502 // the last dotted component. | 505 // the last dotted component. |
503 term_scores[url_match.term_num] += at_word_boundary ? 10 : 2; | 506 term_scores[url_match.term_num] += at_word_boundary ? 10 : 2; |
504 } else { | 507 } else { |
505 // The match is in the last part of a dotted hostname (usually this | 508 // The match is in the last part of a dotted hostname (usually this |
506 // is the top-level domain .com, .net, etc.). | 509 // is the top-level domain .com, .net, etc.). |
507 if (allow_tld_matches_) | 510 if (allow_tld_matches_) |
508 term_scores[url_match.term_num] += at_word_boundary ? 10 : 0; | 511 term_scores[url_match.term_num] += at_word_boundary ? 10 : 0; |
509 } | 512 } |
510 } else { | 513 } else { |
511 // The match is in the protocol (a.k.a. scheme). | 514 // The match is in the protocol (a.k.a. scheme). |
512 // Matches not at a word boundary should have been filtered already. | 515 // Matches not at a word boundary should have been filtered already. |
513 DCHECK(at_word_boundary); | 516 DCHECK(at_word_boundary); |
514 match_in_scheme = true; | 517 match_in_scheme = true; |
515 if (allow_scheme_matches_) | 518 if (allow_scheme_matches_) |
516 term_scores[url_match.term_num] += 10; | 519 term_scores[url_match.term_num] += 10; |
517 } | 520 } |
518 } | 521 } |
519 // Now do the analogous loop over all matches in the title. | 522 // Now do the analogous loop over all matches in the title. |
520 next_word_starts = word_starts.title_word_starts_.begin(); | 523 next_word_starts = word_starts.title_word_starts_.begin(); |
521 end_word_starts = word_starts.title_word_starts_.end(); | 524 end_word_starts = word_starts.title_word_starts_.end(); |
522 size_t word_num = 0; | 525 size_t word_num = 0; |
523 title_matches = FilterTermMatchesByWordStarts( | 526 title_matches = FilterTermMatchesByWordStarts( |
524 title_matches, terms_to_word_starts_offsets, | 527 title_matches, terms_to_word_starts_offsets, |
525 word_starts.title_word_starts_, 0, std::string::npos); | 528 word_starts.title_word_starts_, 0, std::string::npos); |
526 for (const auto& title_match : title_matches) { | 529 for (const auto& title_match : title_matches) { |
527 const size_t term_offset = | 530 // Calculate the offset in the title string where the meaningful (word) part |
528 terms_to_word_starts_offsets[title_match.term_num]; | 531 // of the term starts. This takes into account times when a term starts |
| 532 // with punctuation such as "/foo". |
| 533 const size_t term_word_offset = |
| 534 title_match.offset + terms_to_word_starts_offsets[title_match.term_num]; |
529 // Advance next_word_starts until it's >= the position of the term we're | 535 // Advance next_word_starts until it's >= the position of the term we're |
530 // considering (adjusted for where the word begins within the term). | 536 // considering (adjusted for where the word begins within the term). |
531 while ((next_word_starts != end_word_starts) && | 537 while ((next_word_starts != end_word_starts) && |
532 (*next_word_starts < (title_match.offset + term_offset))) { | 538 (*next_word_starts < term_word_offset)) { |
533 ++next_word_starts; | 539 ++next_word_starts; |
534 ++word_num; | 540 ++word_num; |
535 } | 541 } |
536 if (word_num >= num_title_words_to_allow_) | 542 if (word_num >= num_title_words_to_allow_) |
537 break; // only count the first ten words | 543 break; // only count the first ten words |
538 DCHECK(next_word_starts != end_word_starts); | 544 DCHECK(next_word_starts != end_word_starts); |
539 DCHECK_EQ(*next_word_starts, title_match.offset + term_offset) | 545 DCHECK_EQ(*next_word_starts, term_word_offset) << "not at word boundary"; |
540 << "not at word boundary"; | |
541 term_scores[title_match.term_num] += 8; | 546 term_scores[title_match.term_num] += 8; |
542 } | 547 } |
543 // TODO(mpearson): Restore logic for penalizing out-of-order matches. | 548 // TODO(mpearson): Restore logic for penalizing out-of-order matches. |
544 // (Perhaps discount them by 0.8?) | 549 // (Perhaps discount them by 0.8?) |
545 // TODO(mpearson): Consider: if the earliest match occurs late in the string, | 550 // TODO(mpearson): Consider: if the earliest match occurs late in the string, |
546 // should we discount it? | 551 // should we discount it? |
547 // TODO(mpearson): Consider: do we want to score based on how much of the | 552 // TODO(mpearson): Consider: do we want to score based on how much of the |
548 // input string the input covers? (I'm leaning toward no.) | 553 // input string the input covers? (I'm leaning toward no.) |
549 | 554 |
550 // Compute the topicality_score as the sum of transformed term_scores. | 555 // Compute the topicality_score as the sum of transformed term_scores. |
(...skipping 164 matching lines...) |
715 base::StringToDouble(it->first, &bucket.first); | 720 base::StringToDouble(it->first, &bucket.first); |
716 DCHECK(is_valid_intermediate_score); | 721 DCHECK(is_valid_intermediate_score); |
717 bool is_valid_hqp_score = base::StringToInt(it->second, &bucket.second); | 722 bool is_valid_hqp_score = base::StringToInt(it->second, &bucket.second); |
718 DCHECK(is_valid_hqp_score); | 723 DCHECK(is_valid_hqp_score); |
719 hqp_buckets->push_back(bucket); | 724 hqp_buckets->push_back(bucket); |
720 } | 725 } |
721 return true; | 726 return true; |
722 } | 727 } |
723 return false; | 728 return false; |
724 } | 729 } |
OLD | NEW |