Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(3293)

Unified Diff: chrome/renderer/translate/translate_helper.cc

Issue 18911002: Move language detection to chrome/common/. (Closed) Base URL: http://git.chromium.org/chromium/src.git@master
Patch Set: Rebase Created 7 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « chrome/renderer/translate/translate_helper.h ('k') | chrome/renderer/translate/translate_helper_metrics.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: chrome/renderer/translate/translate_helper.cc
diff --git a/chrome/renderer/translate/translate_helper.cc b/chrome/renderer/translate/translate_helper.cc
index 07417573712850bf9ac6a89373a64de32f918078..8d3c6624707048d155323697e813b565b1b55af8 100644
--- a/chrome/renderer/translate/translate_helper.cc
+++ b/chrome/renderer/translate/translate_helper.cc
@@ -9,13 +9,12 @@
#include "base/logging.h"
#include "base/message_loop.h"
#include "base/strings/string16.h"
-#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "chrome/common/chrome_constants.h"
#include "chrome/common/render_messages.h"
-#include "chrome/common/translate/translate_util.h"
-#include "chrome/renderer/translate/translate_helper_metrics.h"
+#include "chrome/common/translate/language_detection_util.h"
+#include "chrome/common/translate/translate_common_metrics.h"
#include "content/public/renderer/render_view.h"
#include "third_party/WebKit/public/web/WebDocument.h"
#include "third_party/WebKit/public/web/WebElement.h"
@@ -26,10 +25,6 @@
#include "third_party/WebKit/public/web/WebView.h"
#include "v8/include/v8.h"
-#if defined(ENABLE_LANGUAGE_DETECTION)
-#include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h"
-#endif
-
using WebKit::WebDocument;
using WebKit::WebElement;
using WebKit::WebFrame;
@@ -56,38 +51,6 @@ const int kTranslateStatusCheckDelayMs = 400;
// Language name passed to the Translate element for it to detect the language.
const char kAutoDetectionLanguage[] = "auto";
-// Similar language code list. Some languages are very similar and difficult
-// for CLD to distinguish.
-struct SimilarLanguageCode {
- const char* const code;
- int group;
-};
-
-const SimilarLanguageCode kSimilarLanguageCodes[] = {
- {"bs", 1},
- {"hr", 1},
- {"hi", 2},
- {"ne", 2},
-};
-
-// Checks |kSimilarLanguageCodes| and returns group code.
-int GetSimilarLanguageGroupCode(const std::string& language) {
- for (size_t i = 0; i < arraysize(kSimilarLanguageCodes); ++i) {
- if (language.find(kSimilarLanguageCodes[i].code) != 0)
- continue;
- return kSimilarLanguageCodes[i].group;
- }
- return 0;
-}
-
-// Well-known languages which often have wrong server configuration of
-// Content-Language: en.
-// TODO(toyoshim): Remove these static tables and caller functions to
-// chrome/common/translate, and implement them as std::set<>.
-const char* kWellKnownCodesOnWrongConfiguration[] = {
- "es", "pt", "ja", "ru", "de", "zh-CN", "zh-TW", "ar", "id", "fr", "it", "th"
-};
-
} // namespace
////////////////////////////////////////////////////////////////////////////////
@@ -128,7 +91,7 @@ void TranslateHelper::PageCaptured(int page_id, const string16& contents) {
html_lang = html_element.getAttribute("lang").utf8();
std::string cld_language;
bool is_cld_reliable;
- std::string language = DeterminePageLanguage(
+ std::string language = LanguageDetectionUtil::DeterminePageLanguage(
content_language, html_lang, contents, &cld_language, &is_cld_reliable);
if (language.empty())
@@ -163,42 +126,6 @@ void TranslateHelper::CancelPendingTranslation() {
target_lang_.clear();
}
-#if defined(ENABLE_LANGUAGE_DETECTION)
-// static
-std::string TranslateHelper::DetermineTextLanguage(const string16& text,
- bool* is_cld_reliable) {
- std::string language = chrome::kUnknownLanguageCode;
- int num_languages = 0;
- int text_bytes = 0;
- bool is_reliable = false;
- Language cld_language =
- DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable,
- &num_languages, NULL, &text_bytes);
- if (is_cld_reliable != NULL)
- *is_cld_reliable = is_reliable;
-
- // We don't trust the result if the CLD reports that the detection is not
- // reliable, or if the actual text used to detect the language was less than
- // 100 bytes (short texts can often lead to wrong results).
- // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that
- // the determined language code is correct with 50% confidence. Chrome should
- // handle the real confidence value to judge.
- if (is_reliable && text_bytes >= 100 && cld_language != NUM_LANGUAGES &&
- cld_language != UNKNOWN_LANGUAGE && cld_language != TG_UNKNOWN_LANGUAGE) {
- // We should not use LanguageCode_ISO_639_1 because it does not cover all
- // the languages CLD can detect. As a result, it'll return the invalid
- // language code for traditional Chinese among others.
- // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and
- // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN
- // for Simplified Chinese.
- language = LanguageCodeWithDialects(cld_language);
- }
- VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text
- << "\n*************************************\n";
- return language;
-}
-#endif // defined(ENABLE_LANGUAGE_DETECTION)
-
////////////////////////////////////////////////////////////////////////////////
// TranslateHelper, protected:
//
@@ -303,218 +230,6 @@ double TranslateHelper::ExecuteScriptAndGetDoubleResult(
////////////////////////////////////////////////////////////////////////////////
// TranslateHelper, private:
//
-// static
-void TranslateHelper::CorrectLanguageCodeTypo(std::string* code) {
- DCHECK(code);
-
- size_t coma_index = code->find(',');
- if (coma_index != std::string::npos) {
- // More than one language is specified; just keep the first one.
- *code = code->substr(0, coma_index);
- }
- TrimWhitespaceASCII(*code, TRIM_ALL, code);
-
- // An underscore instead of a dash is a frequent mistake.
- size_t underscore_index = code->find('_');
- if (underscore_index != std::string::npos)
- (*code)[underscore_index] = '-';
-
- // Change everything up to a dash to lower-case and everything after to upper.
- size_t dash_index = code->find('-');
- if (dash_index != std::string::npos) {
- *code = StringToLowerASCII(code->substr(0, dash_index)) +
- StringToUpperASCII(code->substr(dash_index));
- } else {
- *code = StringToLowerASCII(*code);
- }
-}
-
-// static
-bool TranslateHelper::IsValidLanguageCode(const std::string& code) {
- // Roughly check if the language code follows /[a-zA-Z]{2,3}(-[a-zA-Z]{2})?/.
- // TODO(hajimehoshi): How about es-419, which is used as an Accept language?
- std::vector<std::string> chunks;
- base::SplitString(code, '-', &chunks);
-
- if (chunks.size() < 1 || 2 < chunks.size())
- return false;
-
- const std::string& main_code = chunks[0];
-
- if (main_code.size() < 1 || 3 < main_code.size())
- return false;
-
- for (std::string::const_iterator it = main_code.begin();
- it != main_code.end(); ++it) {
- if (!IsAsciiAlpha(*it))
- return false;
- }
-
- if (chunks.size() == 1)
- return true;
-
- const std::string& sub_code = chunks[1];
-
- if (sub_code.size() != 2)
- return false;
-
- for (std::string::const_iterator it = sub_code.begin();
- it != sub_code.end(); ++it) {
- if (!IsAsciiAlpha(*it))
- return false;
- }
-
- return true;
-}
-
-// static
-void TranslateHelper::ApplyLanguageCodeCorrection(std::string* code) {
- // Correct well-known format errors.
- CorrectLanguageCodeTypo(code);
-
- if (!IsValidLanguageCode(*code)) {
- *code = std::string();
- return;
- }
-
- TranslateUtil::ToTranslateLanguageSynonym(code);
-}
-
-// static
-bool TranslateHelper::IsSameOrSimilarLanguages(
- const std::string& page_language, const std::string& cld_language) {
- // Language code part of |page_language| is matched to one of |cld_language|.
- // Country code is ignored here.
- if (page_language.size() >= 2 &&
- cld_language.find(page_language.c_str(), 0, 2) == 0) {
- // Languages are matched strictly. Reports false to metrics, but returns
- // true.
- TranslateHelperMetrics::ReportSimilarLanguageMatch(false);
- return true;
- }
-
- // Check if |page_language| and |cld_language| are in the similar language
- // list and belong to the same language group.
- int page_code = GetSimilarLanguageGroupCode(page_language);
- bool match = page_code != 0 &&
- page_code == GetSimilarLanguageGroupCode(cld_language);
-
- TranslateHelperMetrics::ReportSimilarLanguageMatch(match);
- return match;
-}
-
-// static
-bool TranslateHelper::MaybeServerWrongConfiguration(
- const std::string& page_language, const std::string& cld_language) {
- // If |page_language| is not "en-*", respect it and just return false here.
- if (!StartsWithASCII(page_language, "en", false))
- return false;
-
- // A server provides a language meta information representing "en-*". But it
- // might be just a default value due to missing user configuration.
- // Let's trust |cld_language| if the determined language is not difficult to
- // distinguish from English, and the language is one of well-known languages
- // which often provide "en-*" meta information mistakenly.
- for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) {
- if (cld_language == kWellKnownCodesOnWrongConfiguration[i])
- return true;
- }
- return false;
-}
-
-// static
-bool TranslateHelper::CanCLDComplementSubCode(
- const std::string& page_language, const std::string& cld_language) {
- // Translate server cannot treat general Chinese. If Content-Language and
- // CLD agree that the language is Chinese and Content-Language doesn't know
- // which dialect is used, CLD language has priority.
- // TODO(hajimehoshi): How about the other dialects like zh-MO?
- return page_language == "zh" && StartsWithASCII(cld_language, "zh-", false);
-}
-
-// static
-std::string TranslateHelper::DeterminePageLanguage(const std::string& code,
- const std::string& html_lang,
- const string16& contents,
- std::string* cld_language_p,
- bool* is_cld_reliable_p) {
-#if defined(ENABLE_LANGUAGE_DETECTION)
- base::TimeTicks begin_time = base::TimeTicks::Now();
- bool is_cld_reliable;
- std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable);
- TranslateHelperMetrics::ReportLanguageDetectionTime(begin_time,
- base::TimeTicks::Now());
-
- if (cld_language_p != NULL)
- *cld_language_p = cld_language;
- if (is_cld_reliable_p != NULL)
- *is_cld_reliable_p = is_cld_reliable;
- TranslateUtil::ToTranslateLanguageSynonym(&cld_language);
-#endif // defined(ENABLE_LANGUAGE_DETECTION)
-
- // Check if html lang attribute is valid.
- std::string modified_html_lang;
- if (!html_lang.empty()) {
- modified_html_lang = html_lang;
- ApplyLanguageCodeCorrection(&modified_html_lang);
- TranslateHelperMetrics::ReportHtmlLang(html_lang, modified_html_lang);
- VLOG(9) << "html lang based language code: " << modified_html_lang;
- }
-
- // Check if Content-Language is valid.
- std::string modified_code;
- if (!code.empty()) {
- modified_code = code;
- ApplyLanguageCodeCorrection(&modified_code);
- TranslateHelperMetrics::ReportContentLanguage(code, modified_code);
- }
-
- // Adopt |modified_html_lang| if it is valid. Otherwise, adopt
- // |modified_code|.
- std::string language = modified_html_lang.empty() ? modified_code :
- modified_html_lang;
-
-#if defined(ENABLE_LANGUAGE_DETECTION)
- // If |language| is empty, just use CLD result even though it might be
- // chrome::kUnknownLanguageCode.
- if (language.empty()) {
- TranslateHelperMetrics::ReportLanguageVerification(
- TranslateHelperMetrics::LANGUAGE_VERIFICATION_CLD_ONLY);
- return cld_language;
- }
-
- if (cld_language == chrome::kUnknownLanguageCode) {
- TranslateHelperMetrics::ReportLanguageVerification(
- TranslateHelperMetrics::LANGUAGE_VERIFICATION_UNKNOWN);
- return language;
- } else if (IsSameOrSimilarLanguages(language, cld_language)) {
- TranslateHelperMetrics::ReportLanguageVerification(
- TranslateHelperMetrics::LANGUAGE_VERIFICATION_CLD_AGREE);
- return language;
- } else if (MaybeServerWrongConfiguration(language, cld_language)) {
- TranslateHelperMetrics::ReportLanguageVerification(
- TranslateHelperMetrics::LANGUAGE_VERIFICATION_TRUST_CLD);
- return cld_language;
- } else if (CanCLDComplementSubCode(language, cld_language)) {
- TranslateHelperMetrics::ReportLanguageVerification(
- TranslateHelperMetrics::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE);
- return cld_language;
- } else {
- TranslateHelperMetrics::ReportLanguageVerification(
- TranslateHelperMetrics::LANGUAGE_VERIFICATION_CLD_DISAGREE);
- // Content-Language value might be wrong because CLD says that this page
- // is written in another language with confidence.
- // In this case, Chrome doesn't rely on any of the language codes, and
- // gives up suggesting a translation.
- return std::string(chrome::kUnknownLanguageCode);
- }
-#else // defined(ENABLE_LANGUAGE_DETECTION)
- TranslateHelperMetrics::ReportLanguageVerification(
- TranslateHelperMetrics::LANGUAGE_VERIFICATION_CLD_DISABLED);
-#endif // defined(ENABLE_LANGUAGE_DETECTION)
-
- return language;
-}
// static
bool TranslateHelper::IsTranslationAllowed(WebDocument* document) {
@@ -589,11 +304,11 @@ void TranslateHelper::OnTranslatePage(int page_id,
source_lang : kAutoDetectionLanguage;
target_lang_ = target_lang;
- TranslateHelperMetrics::ReportUserActionDuration(language_determined_time_,
+ TranslateCommonMetrics::ReportUserActionDuration(language_determined_time_,
base::TimeTicks::Now());
GURL url(main_frame->document().url());
- TranslateHelperMetrics::ReportPageScheme(url.scheme());
+ TranslateCommonMetrics::ReportPageScheme(url.scheme());
if (!IsTranslateLibAvailable()) {
// Evaluate the script to add the translation related method to the global
@@ -656,7 +371,7 @@ void TranslateHelper::CheckTranslateStatus() {
translation_pending_ = false;
// Check JavaScript performance counters for UMA reports.
- TranslateHelperMetrics::ReportTimeToTranslate(
+ TranslateCommonMetrics::ReportTimeToTranslate(
ExecuteScriptAndGetDoubleResult("cr.googleTranslate.translationTime"));
// Notify the browser we are done.
@@ -697,9 +412,9 @@ void TranslateHelper::TranslatePageImpl(int count) {
// The library is loaded, and ready for translation now.
// Check JavaScript performance counters for UMA reports.
- TranslateHelperMetrics::ReportTimeToBeReady(
+ TranslateCommonMetrics::ReportTimeToBeReady(
ExecuteScriptAndGetDoubleResult("cr.googleTranslate.readyTime"));
- TranslateHelperMetrics::ReportTimeToLoad(
+ TranslateCommonMetrics::ReportTimeToLoad(
ExecuteScriptAndGetDoubleResult("cr.googleTranslate.loadTime"));
if (!StartTranslation()) {
« no previous file with comments | « chrome/renderer/translate/translate_helper.h ('k') | chrome/renderer/translate/translate_helper_metrics.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698