Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(393)

Side by Side Diff: components/omnibox/browser/scored_history_match.cc

Issue 2421373003: Omnibox: Improve HQP Scoring for Terms that Start with Punctuation (Closed)
Patch Set: Peter's comments, plus `git cl format` applied to the rest of the changelist. Created 4 years, 2 months ago.
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | components/omnibox/browser/scored_history_match_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/omnibox/browser/scored_history_match.h" 5 #include "components/omnibox/browser/scored_history_match.h"
6 6
7 #include <math.h> 7 #include <math.h>
8 8
9 #include <algorithm> 9 #include <algorithm>
10 #include <vector> 10 #include <vector>
(...skipping 455 matching lines...) Expand 10 before | Expand all | Expand 10 after
466 url_matches = FilterTermMatchesByWordStarts( 466 url_matches = FilterTermMatchesByWordStarts(
467 url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_, 467 url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_,
468 end_of_hostname_pos, std::string::npos); 468 end_of_hostname_pos, std::string::npos);
469 if (colon_pos != std::string::npos) { 469 if (colon_pos != std::string::npos) {
470 // Also filter matches not at a word boundary and in the scheme. 470 // Also filter matches not at a word boundary and in the scheme.
471 url_matches = FilterTermMatchesByWordStarts( 471 url_matches = FilterTermMatchesByWordStarts(
472 url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_, 472 url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_,
473 0, colon_pos); 473 0, colon_pos);
474 } 474 }
475 for (const auto& url_match : url_matches) { 475 for (const auto& url_match : url_matches) {
476 const size_t term_offset = terms_to_word_starts_offsets[url_match.term_num]; 476 // Calculate the offset in the URL string where the meaningful (word) part
477 // of the term starts. This takes into account times when a term starts
478 // with punctuation such as "/foo".
479 const size_t term_word_offset =
480 url_match.offset + terms_to_word_starts_offsets[url_match.term_num];
477 // Advance next_word_starts until it's >= the position of the term we're 481 // Advance next_word_starts until it's >= the position of the term we're
478 // considering (adjusted for where the word begins within the term). 482 // considering (adjusted for where the word begins within the term).
479 while ((next_word_starts != end_word_starts) && 483 while ((next_word_starts != end_word_starts) &&
480 (*next_word_starts < (url_match.offset + term_offset))) { 484 (*next_word_starts < term_word_offset)) {
481 ++next_word_starts; 485 ++next_word_starts;
482 } 486 }
483 const bool at_word_boundary = 487 const bool at_word_boundary = (next_word_starts != end_word_starts) &&
484 (next_word_starts != end_word_starts) && 488 (*next_word_starts == term_word_offset);
485 (*next_word_starts == url_match.offset + term_offset);
486 if ((question_mark_pos != std::string::npos) && 489 if ((question_mark_pos != std::string::npos) &&
487 (url_match.offset > question_mark_pos)) { 490 (term_word_offset >= question_mark_pos)) {
488 // The match is in a CGI ?... fragment. 491 // The match is in a CGI ?... fragment.
489 DCHECK(at_word_boundary); 492 DCHECK(at_word_boundary);
490 term_scores[url_match.term_num] += 5; 493 term_scores[url_match.term_num] += 5;
491 } else if ((end_of_hostname_pos != std::string::npos) && 494 } else if ((end_of_hostname_pos != std::string::npos) &&
492 (url_match.offset > end_of_hostname_pos)) { 495 (term_word_offset >= end_of_hostname_pos)) {
493 // The match is in the path. 496 // The match is in the path.
494 DCHECK(at_word_boundary); 497 DCHECK(at_word_boundary);
495 term_scores[url_match.term_num] += 8; 498 term_scores[url_match.term_num] += 8;
496 } else if ((colon_pos == std::string::npos) || 499 } else if ((colon_pos == std::string::npos) ||
497 (url_match.offset > colon_pos)) { 500 (term_word_offset >= colon_pos)) {
498 // The match is in the hostname. 501 // The match is in the hostname.
499 if ((last_part_of_hostname_pos == std::string::npos) || 502 if ((last_part_of_hostname_pos == std::string::npos) ||
500 (url_match.offset < last_part_of_hostname_pos)) { 503 (term_word_offset < last_part_of_hostname_pos)) {
501 // Either there are no dots in the hostname or this match isn't 504 // Either there are no dots in the hostname or this match isn't
502 // the last dotted component. 505 // the last dotted component.
503 term_scores[url_match.term_num] += at_word_boundary ? 10 : 2; 506 term_scores[url_match.term_num] += at_word_boundary ? 10 : 2;
504 } else { 507 } else {
505 // The match is in the last part of a dotted hostname (usually this 508 // The match is in the last part of a dotted hostname (usually this
506 // is the top-level domain .com, .net, etc.). 509 // is the top-level domain .com, .net, etc.).
507 if (allow_tld_matches_) 510 if (allow_tld_matches_)
508 term_scores[url_match.term_num] += at_word_boundary ? 10 : 0; 511 term_scores[url_match.term_num] += at_word_boundary ? 10 : 0;
509 } 512 }
510 } else { 513 } else {
511 // The match is in the protocol (a.k.a. scheme). 514 // The match is in the protocol (a.k.a. scheme).
512 // Matches not at a word boundary should have been filtered already. 515 // Matches not at a word boundary should have been filtered already.
513 DCHECK(at_word_boundary); 516 DCHECK(at_word_boundary);
514 match_in_scheme = true; 517 match_in_scheme = true;
515 if (allow_scheme_matches_) 518 if (allow_scheme_matches_)
516 term_scores[url_match.term_num] += 10; 519 term_scores[url_match.term_num] += 10;
517 } 520 }
518 } 521 }
519 // Now do the analogous loop over all matches in the title. 522 // Now do the analogous loop over all matches in the title.
520 next_word_starts = word_starts.title_word_starts_.begin(); 523 next_word_starts = word_starts.title_word_starts_.begin();
521 end_word_starts = word_starts.title_word_starts_.end(); 524 end_word_starts = word_starts.title_word_starts_.end();
522 size_t word_num = 0; 525 size_t word_num = 0;
523 title_matches = FilterTermMatchesByWordStarts( 526 title_matches = FilterTermMatchesByWordStarts(
524 title_matches, terms_to_word_starts_offsets, 527 title_matches, terms_to_word_starts_offsets,
525 word_starts.title_word_starts_, 0, std::string::npos); 528 word_starts.title_word_starts_, 0, std::string::npos);
526 for (const auto& title_match : title_matches) { 529 for (const auto& title_match : title_matches) {
527 const size_t term_offset = 530 // Calculate the offset in the title string where the meaningful (word) part
528 terms_to_word_starts_offsets[title_match.term_num]; 531 // of the term starts. This takes into account times when a term starts
532 // with punctuation such as "/foo".
533 const size_t term_word_offset =
534 title_match.offset + terms_to_word_starts_offsets[title_match.term_num];
529 // Advance next_word_starts until it's >= the position of the term we're 535 // Advance next_word_starts until it's >= the position of the term we're
530 // considering (adjusted for where the word begins within the term). 536 // considering (adjusted for where the word begins within the term).
531 while ((next_word_starts != end_word_starts) && 537 while ((next_word_starts != end_word_starts) &&
532 (*next_word_starts < (title_match.offset + term_offset))) { 538 (*next_word_starts < term_word_offset)) {
533 ++next_word_starts; 539 ++next_word_starts;
534 ++word_num; 540 ++word_num;
535 } 541 }
536 if (word_num >= num_title_words_to_allow_) 542 if (word_num >= num_title_words_to_allow_)
537 break; // only count the first ten words 543 break; // only count the first ten words
538 DCHECK(next_word_starts != end_word_starts); 544 DCHECK(next_word_starts != end_word_starts);
539 DCHECK_EQ(*next_word_starts, title_match.offset + term_offset) 545 DCHECK_EQ(*next_word_starts, term_word_offset) << "not at word boundary";
540 << "not at word boundary";
541 term_scores[title_match.term_num] += 8; 546 term_scores[title_match.term_num] += 8;
542 } 547 }
543 // TODO(mpearson): Restore logic for penalizing out-of-order matches. 548 // TODO(mpearson): Restore logic for penalizing out-of-order matches.
544 // (Perhaps discount them by 0.8?) 549 // (Perhaps discount them by 0.8?)
545 // TODO(mpearson): Consider: if the earliest match occurs late in the string, 550 // TODO(mpearson): Consider: if the earliest match occurs late in the string,
546 // should we discount it? 551 // should we discount it?
547 // TODO(mpearson): Consider: do we want to score based on how much of the 552 // TODO(mpearson): Consider: do we want to score based on how much of the
548 // input string the input covers? (I'm leaning toward no.) 553 // input string the input covers? (I'm leaning toward no.)
549 554
550 // Compute the topicality_score as the sum of transformed term_scores. 555 // Compute the topicality_score as the sum of transformed term_scores.
(...skipping 164 matching lines...) Expand 10 before | Expand all | Expand 10 after
715 base::StringToDouble(it->first, &bucket.first); 720 base::StringToDouble(it->first, &bucket.first);
716 DCHECK(is_valid_intermediate_score); 721 DCHECK(is_valid_intermediate_score);
717 bool is_valid_hqp_score = base::StringToInt(it->second, &bucket.second); 722 bool is_valid_hqp_score = base::StringToInt(it->second, &bucket.second);
718 DCHECK(is_valid_hqp_score); 723 DCHECK(is_valid_hqp_score);
719 hqp_buckets->push_back(bucket); 724 hqp_buckets->push_back(bucket);
720 } 725 }
721 return true; 726 return true;
722 } 727 }
723 return false; 728 return false;
724 } 729 }
OLDNEW
« no previous file with comments | « no previous file | components/omnibox/browser/scored_history_match_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698