components/omnibox/browser/scored_history_match.cc - Issue 2421373003: Omnibox: Improve HQP Scoring for Terms that Start with Punctuation

Side by Side Diff: components/omnibox/browser/scored_history_match.cc

Issue 2421373003: Omnibox: Improve HQP Scoring for Terms that Start with Punctuation (Closed)

Patch Set: peter's comments, plus git cl format the rest of the changelist. Created 4 years, 2 months ago.

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/omnibox/browser/scored_history_match.h"	5 #include "components/omnibox/browser/scored_history_match.h"

6	6

7 #include <math.h>	7 #include <math.h>

8	8

9 #include <algorithm>	9 #include <algorithm>

10 #include <vector>	10 #include <vector>

(...skipping 455 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
466 url_matches = FilterTermMatchesByWordStarts(	466 url_matches = FilterTermMatchesByWordStarts(

467 url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_,	467 url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_,

468 end_of_hostname_pos, std::string::npos);	468 end_of_hostname_pos, std::string::npos);

469 if (colon_pos != std::string::npos) {	469 if (colon_pos != std::string::npos) {

470 // Also filter matches not at a word boundary and in the scheme.	470 // Also filter matches not at a word boundary and in the scheme.

471 url_matches = FilterTermMatchesByWordStarts(	471 url_matches = FilterTermMatchesByWordStarts(

472 url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_,	472 url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_,

473 0, colon_pos);	473 0, colon_pos);

474 }	474 }

475 for (const auto& url_match : url_matches) {	475 for (const auto& url_match : url_matches) {

476 const size_t term_offset = terms_to_word_starts_offsets[url_match.term_num];	476 // Calculate the offset in the URL string where the meaningful (word) part

	477 // of the term starts. This takes into account times when a term starts

	478 // with punctuation such as "/foo".

	479 const size_t term_word_offset =

	480 url_match.offset + terms_to_word_starts_offsets[url_match.term_num];

477 // Advance next_word_starts until it's >= the position of the term we're	481 // Advance next_word_starts until it's >= the position of the term we're

478 // considering (adjusted for where the word begins within the term).	482 // considering (adjusted for where the word begins within the term).

479 while ((next_word_starts != end_word_starts) &&	483 while ((next_word_starts != end_word_starts) &&

480 (*next_word_starts < (url_match.offset + term_offset))) {	484 (*next_word_starts < term_word_offset)) {

481 ++next_word_starts;	485 ++next_word_starts;

482 }	486 }

483 const bool at_word_boundary =	487 const bool at_word_boundary = (next_word_starts != end_word_starts) &&

484 (next_word_starts != end_word_starts) &&	488 (*next_word_starts == term_word_offset);

485 (*next_word_starts == url_match.offset + term_offset);

486 if ((question_mark_pos != std::string::npos) &&	489 if ((question_mark_pos != std::string::npos) &&

487 (url_match.offset > question_mark_pos)) {	490 (term_word_offset >= question_mark_pos)) {

488 // The match is in a CGI ?... fragment.	491 // The match is in a CGI ?... fragment.

489 DCHECK(at_word_boundary);	492 DCHECK(at_word_boundary);

490 term_scores[url_match.term_num] += 5;	493 term_scores[url_match.term_num] += 5;

491 } else if ((end_of_hostname_pos != std::string::npos) &&	494 } else if ((end_of_hostname_pos != std::string::npos) &&

492 (url_match.offset > end_of_hostname_pos)) {	495 (term_word_offset >= end_of_hostname_pos)) {

493 // The match is in the path.	496 // The match is in the path.

494 DCHECK(at_word_boundary);	497 DCHECK(at_word_boundary);

495 term_scores[url_match.term_num] += 8;	498 term_scores[url_match.term_num] += 8;

496 } else if ((colon_pos == std::string::npos) \|\|	499 } else if ((colon_pos == std::string::npos) \|\|

497 (url_match.offset > colon_pos)) {	500 (term_word_offset >= colon_pos)) {

498 // The match is in the hostname.	501 // The match is in the hostname.

499 if ((last_part_of_hostname_pos == std::string::npos) \|\|	502 if ((last_part_of_hostname_pos == std::string::npos) \|\|

500 (url_match.offset < last_part_of_hostname_pos)) {	503 (term_word_offset < last_part_of_hostname_pos)) {

501 // Either there are no dots in the hostname or this match isn't	504 // Either there are no dots in the hostname or this match isn't

502 // the last dotted component.	505 // the last dotted component.

503 term_scores[url_match.term_num] += at_word_boundary ? 10 : 2;	506 term_scores[url_match.term_num] += at_word_boundary ? 10 : 2;

504 } else {	507 } else {

505 // The match is in the last part of a dotted hostname (usually this	508 // The match is in the last part of a dotted hostname (usually this

506 // is the top-level domain .com, .net, etc.).	509 // is the top-level domain .com, .net, etc.).

507 if (allow_tld_matches_)	510 if (allow_tld_matches_)

508 term_scores[url_match.term_num] += at_word_boundary ? 10 : 0;	511 term_scores[url_match.term_num] += at_word_boundary ? 10 : 0;

509 }	512 }

510 } else {	513 } else {

511 // The match is in the protocol (a.k.a. scheme).	514 // The match is in the protocol (a.k.a. scheme).

512 // Matches not at a word boundary should have been filtered already.	515 // Matches not at a word boundary should have been filtered already.

513 DCHECK(at_word_boundary);	516 DCHECK(at_word_boundary);

514 match_in_scheme = true;	517 match_in_scheme = true;

515 if (allow_scheme_matches_)	518 if (allow_scheme_matches_)

516 term_scores[url_match.term_num] += 10;	519 term_scores[url_match.term_num] += 10;

517 }	520 }

518 }	521 }

519 // Now do the analogous loop over all matches in the title.	522 // Now do the analogous loop over all matches in the title.

520 next_word_starts = word_starts.title_word_starts_.begin();	523 next_word_starts = word_starts.title_word_starts_.begin();

521 end_word_starts = word_starts.title_word_starts_.end();	524 end_word_starts = word_starts.title_word_starts_.end();

522 size_t word_num = 0;	525 size_t word_num = 0;

523 title_matches = FilterTermMatchesByWordStarts(	526 title_matches = FilterTermMatchesByWordStarts(

524 title_matches, terms_to_word_starts_offsets,	527 title_matches, terms_to_word_starts_offsets,

525 word_starts.title_word_starts_, 0, std::string::npos);	528 word_starts.title_word_starts_, 0, std::string::npos);

526 for (const auto& title_match : title_matches) {	529 for (const auto& title_match : title_matches) {

527 const size_t term_offset =	530 // Calculate the offset in the title string where the meaningful (word) part

528 terms_to_word_starts_offsets[title_match.term_num];	531 // of the term starts. This takes into account times when a term starts

	532 // with punctuation such as "/foo".

	533 const size_t term_word_offset =

	534 title_match.offset + terms_to_word_starts_offsets[title_match.term_num];

529 // Advance next_word_starts until it's >= the position of the term we're	535 // Advance next_word_starts until it's >= the position of the term we're

530 // considering (adjusted for where the word begins within the term).	536 // considering (adjusted for where the word begins within the term).

531 while ((next_word_starts != end_word_starts) &&	537 while ((next_word_starts != end_word_starts) &&

532 (*next_word_starts < (title_match.offset + term_offset))) {	538 (*next_word_starts < term_word_offset)) {

533 ++next_word_starts;	539 ++next_word_starts;

534 ++word_num;	540 ++word_num;

535 }	541 }

536 if (word_num >= num_title_words_to_allow_)	542 if (word_num >= num_title_words_to_allow_)

537 break; // only count the first ten words	543 break; // only count the first ten words

538 DCHECK(next_word_starts != end_word_starts);	544 DCHECK(next_word_starts != end_word_starts);

539 DCHECK_EQ(*next_word_starts, title_match.offset + term_offset)	545 DCHECK_EQ(*next_word_starts, term_word_offset) << "not at word boundary";

540 << "not at word boundary";

541 term_scores[title_match.term_num] += 8;	546 term_scores[title_match.term_num] += 8;

542 }	547 }

543 // TODO(mpearson): Restore logic for penalizing out-of-order matches.	548 // TODO(mpearson): Restore logic for penalizing out-of-order matches.

544 // (Perhaps discount them by 0.8?)	549 // (Perhaps discount them by 0.8?)

545 // TODO(mpearson): Consider: if the earliest match occurs late in the string,	550 // TODO(mpearson): Consider: if the earliest match occurs late in the string,

546 // should we discount it?	551 // should we discount it?

547 // TODO(mpearson): Consider: do we want to score based on how much of the	552 // TODO(mpearson): Consider: do we want to score based on how much of the

548 // input string the input covers? (I'm leaning toward no.)	553 // input string the input covers? (I'm leaning toward no.)

549	554

550 // Compute the topicality_score as the sum of transformed term_scores.	555 // Compute the topicality_score as the sum of transformed term_scores.

(...skipping 164 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
715 base::StringToDouble(it->first, &bucket.first);	720 base::StringToDouble(it->first, &bucket.first);

716 DCHECK(is_valid_intermediate_score);	721 DCHECK(is_valid_intermediate_score);

717 bool is_valid_hqp_score = base::StringToInt(it->second, &bucket.second);	722 bool is_valid_hqp_score = base::StringToInt(it->second, &bucket.second);

718 DCHECK(is_valid_hqp_score);	723 DCHECK(is_valid_hqp_score);

719 hqp_buckets->push_back(bucket);	724 hqp_buckets->push_back(bucket);

720 }	725 }

721 return true;	726 return true;

722 }	727 }

723 return false;	728 return false;

724 }	729 }

OLD	NEW

« no previous file with comments | « no previous file | components/omnibox/browser/scored_history_match_unittest.cc » ('j') | no next file with comments »