| Index: chrome/renderer/translate/translate_helper.cc
|
| diff --git a/chrome/renderer/translate/translate_helper.cc b/chrome/renderer/translate/translate_helper.cc
|
| index 07417573712850bf9ac6a89373a64de32f918078..8d3c6624707048d155323697e813b565b1b55af8 100644
|
| --- a/chrome/renderer/translate/translate_helper.cc
|
| +++ b/chrome/renderer/translate/translate_helper.cc
|
| @@ -9,13 +9,12 @@
|
| #include "base/logging.h"
|
| #include "base/message_loop.h"
|
| #include "base/strings/string16.h"
|
| -#include "base/strings/string_split.h"
|
| #include "base/strings/string_util.h"
|
| #include "base/strings/utf_string_conversions.h"
|
| #include "chrome/common/chrome_constants.h"
|
| #include "chrome/common/render_messages.h"
|
| -#include "chrome/common/translate/translate_util.h"
|
| -#include "chrome/renderer/translate/translate_helper_metrics.h"
|
| +#include "chrome/common/translate/language_detection_util.h"
|
| +#include "chrome/common/translate/translate_common_metrics.h"
|
| #include "content/public/renderer/render_view.h"
|
| #include "third_party/WebKit/public/web/WebDocument.h"
|
| #include "third_party/WebKit/public/web/WebElement.h"
|
| @@ -26,10 +25,6 @@
|
| #include "third_party/WebKit/public/web/WebView.h"
|
| #include "v8/include/v8.h"
|
|
|
| -#if defined(ENABLE_LANGUAGE_DETECTION)
|
| -#include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h"
|
| -#endif
|
| -
|
| using WebKit::WebDocument;
|
| using WebKit::WebElement;
|
| using WebKit::WebFrame;
|
| @@ -56,38 +51,6 @@ const int kTranslateStatusCheckDelayMs = 400;
|
| // Language name passed to the Translate element for it to detect the language.
|
| const char kAutoDetectionLanguage[] = "auto";
|
|
|
| -// Similar language code list. Some languages are very similar and difficult
|
| -// for CLD to distinguish.
|
| -struct SimilarLanguageCode {
|
| - const char* const code;
|
| - int group;
|
| -};
|
| -
|
| -const SimilarLanguageCode kSimilarLanguageCodes[] = {
|
| - {"bs", 1},
|
| - {"hr", 1},
|
| - {"hi", 2},
|
| - {"ne", 2},
|
| -};
|
| -
|
| -// Checks |kSimilarLanguageCodes| and returns group code.
|
| -int GetSimilarLanguageGroupCode(const std::string& language) {
|
| - for (size_t i = 0; i < arraysize(kSimilarLanguageCodes); ++i) {
|
| - if (language.find(kSimilarLanguageCodes[i].code) != 0)
|
| - continue;
|
| - return kSimilarLanguageCodes[i].group;
|
| - }
|
| - return 0;
|
| -}
|
| -
|
| -// Well-known languages which often have wrong server configuration of
|
| -// Content-Language: en.
|
| -// TODO(toyoshim): Remove these static tables and caller functions to
|
| -// chrome/common/translate, and implement them as std::set<>.
|
| -const char* kWellKnownCodesOnWrongConfiguration[] = {
|
| - "es", "pt", "ja", "ru", "de", "zh-CN", "zh-TW", "ar", "id", "fr", "it", "th"
|
| -};
|
| -
|
| } // namespace
|
|
|
| ////////////////////////////////////////////////////////////////////////////////
|
| @@ -128,7 +91,7 @@ void TranslateHelper::PageCaptured(int page_id, const string16& contents) {
|
| html_lang = html_element.getAttribute("lang").utf8();
|
| std::string cld_language;
|
| bool is_cld_reliable;
|
| - std::string language = DeterminePageLanguage(
|
| + std::string language = LanguageDetectionUtil::DeterminePageLanguage(
|
| content_language, html_lang, contents, &cld_language, &is_cld_reliable);
|
|
|
| if (language.empty())
|
| @@ -163,42 +126,6 @@ void TranslateHelper::CancelPendingTranslation() {
|
| target_lang_.clear();
|
| }
|
|
|
| -#if defined(ENABLE_LANGUAGE_DETECTION)
|
| -// static
|
| -std::string TranslateHelper::DetermineTextLanguage(const string16& text,
|
| - bool* is_cld_reliable) {
|
| - std::string language = chrome::kUnknownLanguageCode;
|
| - int num_languages = 0;
|
| - int text_bytes = 0;
|
| - bool is_reliable = false;
|
| - Language cld_language =
|
| - DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable,
|
| - &num_languages, NULL, &text_bytes);
|
| - if (is_cld_reliable != NULL)
|
| - *is_cld_reliable = is_reliable;
|
| -
|
| - // We don't trust the result if the CLD reports that the detection is not
|
| - // reliable, or if the actual text used to detect the language was less than
|
| - // 100 bytes (short texts can often lead to wrong results).
|
| - // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that
|
| - // the determined language code is correct with 50% confidence. Chrome should
|
| - // handle the real confidence value to judge.
|
| - if (is_reliable && text_bytes >= 100 && cld_language != NUM_LANGUAGES &&
|
| - cld_language != UNKNOWN_LANGUAGE && cld_language != TG_UNKNOWN_LANGUAGE) {
|
| - // We should not use LanguageCode_ISO_639_1 because it does not cover all
|
| - // the languages CLD can detect. As a result, it'll return the invalid
|
| - // language code for tradtional Chinese among others.
|
| - // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and
|
| - // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN
|
| - // for Simplified Chinese.
|
| - language = LanguageCodeWithDialects(cld_language);
|
| - }
|
| - VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text
|
| - << "\n*************************************\n";
|
| - return language;
|
| -}
|
| -#endif // defined(ENABLE_LANGUAGE_DETECTION)
|
| -
|
| ////////////////////////////////////////////////////////////////////////////////
|
| // TranslateHelper, protected:
|
| //
|
| @@ -303,218 +230,6 @@ double TranslateHelper::ExecuteScriptAndGetDoubleResult(
|
| ////////////////////////////////////////////////////////////////////////////////
|
| // TranslateHelper, private:
|
| //
|
| -// static
|
| -void TranslateHelper::CorrectLanguageCodeTypo(std::string* code) {
|
| - DCHECK(code);
|
| -
|
| - size_t coma_index = code->find(',');
|
| - if (coma_index != std::string::npos) {
|
| - // There are more than 1 language specified, just keep the first one.
|
| - *code = code->substr(0, coma_index);
|
| - }
|
| - TrimWhitespaceASCII(*code, TRIM_ALL, code);
|
| -
|
| - // An underscore instead of a dash is a frequent mistake.
|
| - size_t underscore_index = code->find('_');
|
| - if (underscore_index != std::string::npos)
|
| - (*code)[underscore_index] = '-';
|
| -
|
| - // Change everything up to a dash to lower-case and everything after to upper.
|
| - size_t dash_index = code->find('-');
|
| - if (dash_index != std::string::npos) {
|
| - *code = StringToLowerASCII(code->substr(0, dash_index)) +
|
| - StringToUpperASCII(code->substr(dash_index));
|
| - } else {
|
| - *code = StringToLowerASCII(*code);
|
| - }
|
| -}
|
| -
|
| -// static
|
| -bool TranslateHelper::IsValidLanguageCode(const std::string& code) {
|
| - // Roughly check if the language code follows /[a-zA-Z]{2,3}(-[a-zA-Z]{2})?/.
|
| - // TODO(hajimehoshi): How about es-419, which is used as an Accept language?
|
| - std::vector<std::string> chunks;
|
| - base::SplitString(code, '-', &chunks);
|
| -
|
| - if (chunks.size() < 1 || 2 < chunks.size())
|
| - return false;
|
| -
|
| - const std::string& main_code = chunks[0];
|
| -
|
| - if (main_code.size() < 1 || 3 < main_code.size())
|
| - return false;
|
| -
|
| - for (std::string::const_iterator it = main_code.begin();
|
| - it != main_code.end(); ++it) {
|
| - if (!IsAsciiAlpha(*it))
|
| - return false;
|
| - }
|
| -
|
| - if (chunks.size() == 1)
|
| - return true;
|
| -
|
| - const std::string& sub_code = chunks[1];
|
| -
|
| - if (sub_code.size() != 2)
|
| - return false;
|
| -
|
| - for (std::string::const_iterator it = sub_code.begin();
|
| - it != sub_code.end(); ++it) {
|
| - if (!IsAsciiAlpha(*it))
|
| - return false;
|
| - }
|
| -
|
| - return true;
|
| -}
|
| -
|
| -// static
|
| -void TranslateHelper::ApplyLanguageCodeCorrection(std::string* code) {
|
| - // Correct well-known format errors.
|
| - CorrectLanguageCodeTypo(code);
|
| -
|
| - if (!IsValidLanguageCode(*code)) {
|
| - *code = std::string();
|
| - return;
|
| - }
|
| -
|
| - TranslateUtil::ToTranslateLanguageSynonym(code);
|
| -}
|
| -
|
| -// static
|
| -bool TranslateHelper::IsSameOrSimilarLanguages(
|
| - const std::string& page_language, const std::string& cld_language) {
|
| - // Language code part of |page_language| is matched to one of |cld_language|.
|
| - // Country code is ignored here.
|
| - if (page_language.size() >= 2 &&
|
| - cld_language.find(page_language.c_str(), 0, 2) == 0) {
|
| - // Languages are matched strictly. Reports false to metrics, but returns
|
| - // true.
|
| - TranslateHelperMetrics::ReportSimilarLanguageMatch(false);
|
| - return true;
|
| - }
|
| -
|
| - // Check if |page_language| and |cld_language| are in the similar language
|
| - // list and belong to the same language group.
|
| - int page_code = GetSimilarLanguageGroupCode(page_language);
|
| - bool match = page_code != 0 &&
|
| - page_code == GetSimilarLanguageGroupCode(cld_language);
|
| -
|
| - TranslateHelperMetrics::ReportSimilarLanguageMatch(match);
|
| - return match;
|
| -}
|
| -
|
| -// static
|
| -bool TranslateHelper::MaybeServerWrongConfiguration(
|
| - const std::string& page_language, const std::string& cld_language) {
|
| - // If |page_language| is not "en-*", respect it and just return false here.
|
| - if (!StartsWithASCII(page_language, "en", false))
|
| - return false;
|
| -
|
| - // A server provides a language meta information representing "en-*". But it
|
| - // might be just a default value due to missing user configuration.
|
| - // Let's trust |cld_language| if the determined language is not difficult to
|
| - // distinguish from English, and the language is one of well-known languages
|
| - // which often provide "en-*" meta information mistakenly.
|
| - for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) {
|
| - if (cld_language == kWellKnownCodesOnWrongConfiguration[i])
|
| - return true;
|
| - }
|
| - return false;
|
| -}
|
| -
|
| -// static
|
| -bool TranslateHelper::CanCLDComplementSubCode(
|
| - const std::string& page_language, const std::string& cld_language) {
|
| - // Translate server cannot treat general Chinese. If Content-Language and
|
| - // CLD agree that the language is Chinese and Content-Language doesn't know
|
| - // which dialect is used, CLD language has priority.
|
| - // TODO(hajimehoshi): How about the other dialects like zh-MO?
|
| - return page_language == "zh" && StartsWithASCII(cld_language, "zh-", false);
|
| -}
|
| -
|
| -// static
|
| -std::string TranslateHelper::DeterminePageLanguage(const std::string& code,
|
| - const std::string& html_lang,
|
| - const string16& contents,
|
| - std::string* cld_language_p,
|
| - bool* is_cld_reliable_p) {
|
| -#if defined(ENABLE_LANGUAGE_DETECTION)
|
| - base::TimeTicks begin_time = base::TimeTicks::Now();
|
| - bool is_cld_reliable;
|
| - std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable);
|
| - TranslateHelperMetrics::ReportLanguageDetectionTime(begin_time,
|
| - base::TimeTicks::Now());
|
| -
|
| - if (cld_language_p != NULL)
|
| - *cld_language_p = cld_language;
|
| - if (is_cld_reliable_p != NULL)
|
| - *is_cld_reliable_p = is_cld_reliable;
|
| - TranslateUtil::ToTranslateLanguageSynonym(&cld_language);
|
| -#endif // defined(ENABLE_LANGUAGE_DETECTION)
|
| -
|
| - // Check if html lang attribute is valid.
|
| - std::string modified_html_lang;
|
| - if (!html_lang.empty()) {
|
| - modified_html_lang = html_lang;
|
| - ApplyLanguageCodeCorrection(&modified_html_lang);
|
| - TranslateHelperMetrics::ReportHtmlLang(html_lang, modified_html_lang);
|
| - VLOG(9) << "html lang based language code: " << modified_html_lang;
|
| - }
|
| -
|
| - // Check if Content-Language is valid.
|
| - std::string modified_code;
|
| - if (!code.empty()) {
|
| - modified_code = code;
|
| - ApplyLanguageCodeCorrection(&modified_code);
|
| - TranslateHelperMetrics::ReportContentLanguage(code, modified_code);
|
| - }
|
| -
|
| - // Adopt |modified_html_lang| if it is valid. Otherwise, adopt
|
| - // |modified_code|.
|
| - std::string language = modified_html_lang.empty() ? modified_code :
|
| - modified_html_lang;
|
| -
|
| -#if defined(ENABLE_LANGUAGE_DETECTION)
|
| - // If |language| is empty, just use CLD result even though it might be
|
| - // chrome::kUnknownLanguageCode.
|
| - if (language.empty()) {
|
| - TranslateHelperMetrics::ReportLanguageVerification(
|
| - TranslateHelperMetrics::LANGUAGE_VERIFICATION_CLD_ONLY);
|
| - return cld_language;
|
| - }
|
| -
|
| - if (cld_language == chrome::kUnknownLanguageCode) {
|
| - TranslateHelperMetrics::ReportLanguageVerification(
|
| - TranslateHelperMetrics::LANGUAGE_VERIFICATION_UNKNOWN);
|
| - return language;
|
| - } else if (IsSameOrSimilarLanguages(language, cld_language)) {
|
| - TranslateHelperMetrics::ReportLanguageVerification(
|
| - TranslateHelperMetrics::LANGUAGE_VERIFICATION_CLD_AGREE);
|
| - return language;
|
| - } else if (MaybeServerWrongConfiguration(language, cld_language)) {
|
| - TranslateHelperMetrics::ReportLanguageVerification(
|
| - TranslateHelperMetrics::LANGUAGE_VERIFICATION_TRUST_CLD);
|
| - return cld_language;
|
| - } else if (CanCLDComplementSubCode(language, cld_language)) {
|
| - TranslateHelperMetrics::ReportLanguageVerification(
|
| - TranslateHelperMetrics::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE);
|
| - return cld_language;
|
| - } else {
|
| - TranslateHelperMetrics::ReportLanguageVerification(
|
| - TranslateHelperMetrics::LANGUAGE_VERIFICATION_CLD_DISAGREE);
|
| - // Content-Language value might be wrong because CLD says that this page
|
| - // is written in another language with confidence.
|
| - // In this case, Chrome doesn't rely on any of the language codes, and
|
| - // gives up suggesting a translation.
|
| - return std::string(chrome::kUnknownLanguageCode);
|
| - }
|
| -#else // defined(ENABLE_LANGUAGE_DETECTION)
|
| - TranslateHelperMetrics::ReportLanguageVerification(
|
| - TranslateHelperMetrics::LANGUAGE_VERIFICATION_CLD_DISABLED);
|
| -#endif // defined(ENABLE_LANGUAGE_DETECTION)
|
| -
|
| - return language;
|
| -}
|
|
|
| // static
|
| bool TranslateHelper::IsTranslationAllowed(WebDocument* document) {
|
| @@ -589,11 +304,11 @@ void TranslateHelper::OnTranslatePage(int page_id,
|
| source_lang : kAutoDetectionLanguage;
|
| target_lang_ = target_lang;
|
|
|
| - TranslateHelperMetrics::ReportUserActionDuration(language_determined_time_,
|
| + TranslateCommonMetrics::ReportUserActionDuration(language_determined_time_,
|
| base::TimeTicks::Now());
|
|
|
| GURL url(main_frame->document().url());
|
| - TranslateHelperMetrics::ReportPageScheme(url.scheme());
|
| + TranslateCommonMetrics::ReportPageScheme(url.scheme());
|
|
|
| if (!IsTranslateLibAvailable()) {
|
| // Evaluate the script to add the translation related method to the global
|
| @@ -656,7 +371,7 @@ void TranslateHelper::CheckTranslateStatus() {
|
| translation_pending_ = false;
|
|
|
| // Check JavaScript performance counters for UMA reports.
|
| - TranslateHelperMetrics::ReportTimeToTranslate(
|
| + TranslateCommonMetrics::ReportTimeToTranslate(
|
| ExecuteScriptAndGetDoubleResult("cr.googleTranslate.translationTime"));
|
|
|
| // Notify the browser we are done.
|
| @@ -697,9 +412,9 @@ void TranslateHelper::TranslatePageImpl(int count) {
|
|
|
| // The library is loaded, and ready for translation now.
|
| // Check JavaScript performance counters for UMA reports.
|
| - TranslateHelperMetrics::ReportTimeToBeReady(
|
| + TranslateCommonMetrics::ReportTimeToBeReady(
|
| ExecuteScriptAndGetDoubleResult("cr.googleTranslate.readyTime"));
|
| - TranslateHelperMetrics::ReportTimeToLoad(
|
| + TranslateCommonMetrics::ReportTimeToLoad(
|
| ExecuteScriptAndGetDoubleResult("cr.googleTranslate.loadTime"));
|
|
|
| if (!StartTranslation()) {
|
|
|