| OLD | NEW |
| 1 // Copyright 2013 The Chromium Authors. All rights reserved. | 1 // Copyright 2013 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "chrome/common/translate/language_detection_util.h" | 5 #include "chrome/common/translate/language_detection_util.h" |
| 6 | 6 |
| 7 #include "base/logging.h" | 7 #include "base/logging.h" |
| 8 #include "base/strings/string_split.h" | 8 #include "base/strings/string_split.h" |
| 9 #include "base/strings/string_util.h" | 9 #include "base/strings/string_util.h" |
| 10 #include "base/time/time.h" | 10 #include "base/time/time.h" |
| 11 #include "chrome/common/chrome_constants.h" | 11 #include "chrome/common/chrome_constants.h" |
| 12 #include "chrome/common/translate/translate_common_metrics.h" | 12 #include "chrome/common/translate/translate_common_metrics.h" |
| 13 #include "chrome/common/translate/translate_util.h" | 13 #include "chrome/common/translate/translate_util.h" |
| 14 | |
| 15 #if defined(ENABLE_LANGUAGE_DETECTION) | |
| 16 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" | 14 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" |
| 17 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" | 15 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" |
| 18 #endif | |
| 19 | 16 |
| 20 namespace { | 17 namespace { |
| 21 | 18 |
| 22 // Similar language code list. Some languages are very similar and difficult | 19 // Similar language code list. Some languages are very similar and difficult |
| 23 // for CLD to distinguish. | 20 // for CLD to distinguish. |
| 24 struct SimilarLanguageCode { | 21 struct SimilarLanguageCode { |
| 25 const char* const code; | 22 const char* const code; |
| 26 int group; | 23 int group; |
| 27 }; | 24 }; |
| 28 | 25 |
| (...skipping 28 matching lines...) Expand all Loading... |
| 57 LanguageDetectionUtil::CorrectLanguageCodeTypo(code); | 54 LanguageDetectionUtil::CorrectLanguageCodeTypo(code); |
| 58 | 55 |
| 59 if (!LanguageDetectionUtil::IsValidLanguageCode(*code)) { | 56 if (!LanguageDetectionUtil::IsValidLanguageCode(*code)) { |
| 60 *code = std::string(); | 57 *code = std::string(); |
| 61 return; | 58 return; |
| 62 } | 59 } |
| 63 | 60 |
| 64 TranslateUtil::ToTranslateLanguageSynonym(code); | 61 TranslateUtil::ToTranslateLanguageSynonym(code); |
| 65 } | 62 } |
| 66 | 63 |
| 67 #if defined(ENABLE_LANGUAGE_DETECTION) | |
| 68 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it | 64 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it |
| 69 // failed. | 65 // failed. |
| 70 // |is_cld_reliable| will be set as true if CLD says the detection is reliable. | 66 // |is_cld_reliable| will be set as true if CLD says the detection is reliable. |
| 71 std::string DetermineTextLanguage(const base::string16& text, | 67 std::string DetermineTextLanguage(const base::string16& text, |
| 72 bool* is_cld_reliable) { | 68 bool* is_cld_reliable) { |
| 73 std::string language = chrome::kUnknownLanguageCode; | 69 std::string language = chrome::kUnknownLanguageCode; |
| 74 int num_languages = 0; | 70 int num_languages = 0; |
| 75 int text_bytes = 0; | 71 int text_bytes = 0; |
| 76 bool is_reliable = false; | 72 bool is_reliable = false; |
| 77 Language cld_language = | 73 Language cld_language = |
| (...skipping 15 matching lines...) Expand all Loading... |
| 93 // language code for traditional Chinese among others. | 89 // language code for traditional Chinese among others. |
| 94 // |LanguageCodeWithDialects| will go through ISO 639-1, ISO-639-2 and | 90 // |LanguageCodeWithDialects| will go through ISO 639-1, ISO-639-2 and |
| 95 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN | 91 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN |
| 96 // for Simplified Chinese. | 92 // for Simplified Chinese. |
| 97 language = LanguageCodeWithDialects(cld_language); | 93 language = LanguageCodeWithDialects(cld_language); |
| 98 } | 94 } |
| 99 VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text | 95 VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text |
| 100 << "\n*************************************\n"; | 96 << "\n*************************************\n"; |
| 101 return language; | 97 return language; |
| 102 } | 98 } |
| 103 #endif // defined(ENABLE_LANGUAGE_DETECTION) | |
| 104 | 99 |
| 105 // Checks if CLD can complement a sub code when the page language doesn't know | 100 // Checks if CLD can complement a sub code when the page language doesn't know |
| 106 // the sub code. | 101 // the sub code. |
| 107 bool CanCLDComplementSubCode( | 102 bool CanCLDComplementSubCode( |
| 108 const std::string& page_language, const std::string& cld_language) { | 103 const std::string& page_language, const std::string& cld_language) { |
| 109 // Translate server cannot treat general Chinese. If Content-Language and | 104 // Translate server cannot treat general Chinese. If Content-Language and |
| 110 // CLD agree that the language is Chinese and Content-Language doesn't know | 105 // CLD agree that the language is Chinese and Content-Language doesn't know |
| 111 // which dialect is used, CLD language has priority. | 106 // which dialect is used, CLD language has priority. |
| 112 // TODO(hajimehoshi): How about the other dialects like zh-MO? | 107 // TODO(hajimehoshi): How about the other dialects like zh-MO? |
| 113 return page_language == "zh" && StartsWithASCII(cld_language, "zh-", false); | 108 return page_language == "zh" && StartsWithASCII(cld_language, "zh-", false); |
| 114 } | 109 } |
| 115 | 110 |
| 116 } // namespace | 111 } // namespace |
| 117 | 112 |
| 118 namespace LanguageDetectionUtil { | 113 namespace LanguageDetectionUtil { |
| 119 | 114 |
| 120 std::string DeterminePageLanguage(const std::string& code, | 115 std::string DeterminePageLanguage(const std::string& code, |
| 121 const std::string& html_lang, | 116 const std::string& html_lang, |
| 122 const base::string16& contents, | 117 const base::string16& contents, |
| 123 std::string* cld_language_p, | 118 std::string* cld_language_p, |
| 124 bool* is_cld_reliable_p) { | 119 bool* is_cld_reliable_p) { |
| 125 #if defined(ENABLE_LANGUAGE_DETECTION) | |
| 126 base::TimeTicks begin_time = base::TimeTicks::Now(); | 120 base::TimeTicks begin_time = base::TimeTicks::Now(); |
| 127 bool is_cld_reliable; | 121 bool is_cld_reliable; |
| 128 std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable); | 122 std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable); |
| 129 TranslateCommonMetrics::ReportLanguageDetectionTime(begin_time, | 123 TranslateCommonMetrics::ReportLanguageDetectionTime(begin_time, |
| 130 base::TimeTicks::Now()); | 124 base::TimeTicks::Now()); |
| 131 | 125 |
| 132 if (cld_language_p != NULL) | 126 if (cld_language_p != NULL) |
| 133 *cld_language_p = cld_language; | 127 *cld_language_p = cld_language; |
| 134 if (is_cld_reliable_p != NULL) | 128 if (is_cld_reliable_p != NULL) |
| 135 *is_cld_reliable_p = is_cld_reliable; | 129 *is_cld_reliable_p = is_cld_reliable; |
| 136 TranslateUtil::ToTranslateLanguageSynonym(&cld_language); | 130 TranslateUtil::ToTranslateLanguageSynonym(&cld_language); |
| 137 #endif // defined(ENABLE_LANGUAGE_DETECTION) | |
| 138 | 131 |
| 139 // Check if html lang attribute is valid. | 132 // Check if html lang attribute is valid. |
| 140 std::string modified_html_lang; | 133 std::string modified_html_lang; |
| 141 if (!html_lang.empty()) { | 134 if (!html_lang.empty()) { |
| 142 modified_html_lang = html_lang; | 135 modified_html_lang = html_lang; |
| 143 ApplyLanguageCodeCorrection(&modified_html_lang); | 136 ApplyLanguageCodeCorrection(&modified_html_lang); |
| 144 TranslateCommonMetrics::ReportHtmlLang(html_lang, modified_html_lang); | 137 TranslateCommonMetrics::ReportHtmlLang(html_lang, modified_html_lang); |
| 145 VLOG(9) << "html lang based language code: " << modified_html_lang; | 138 VLOG(9) << "html lang based language code: " << modified_html_lang; |
| 146 } | 139 } |
| 147 | 140 |
| 148 // Check if Content-Language is valid. | 141 // Check if Content-Language is valid. |
| 149 std::string modified_code; | 142 std::string modified_code; |
| 150 if (!code.empty()) { | 143 if (!code.empty()) { |
| 151 modified_code = code; | 144 modified_code = code; |
| 152 ApplyLanguageCodeCorrection(&modified_code); | 145 ApplyLanguageCodeCorrection(&modified_code); |
| 153 TranslateCommonMetrics::ReportContentLanguage(code, modified_code); | 146 TranslateCommonMetrics::ReportContentLanguage(code, modified_code); |
| 154 } | 147 } |
| 155 | 148 |
| 156 // Adopt |modified_html_lang| if it is valid. Otherwise, adopt | 149 // Adopt |modified_html_lang| if it is valid. Otherwise, adopt |
| 157 // |modified_code|. | 150 // |modified_code|. |
| 158 std::string language = modified_html_lang.empty() ? modified_code : | 151 std::string language = modified_html_lang.empty() ? modified_code : |
| 159 modified_html_lang; | 152 modified_html_lang; |
| 160 | 153 |
| 161 #if defined(ENABLE_LANGUAGE_DETECTION) | |
| 162 // If |language| is empty, just use CLD result even though it might be | 154 // If |language| is empty, just use CLD result even though it might be |
| 163 // chrome::kUnknownLanguageCode. | 155 // chrome::kUnknownLanguageCode. |
| 164 if (language.empty()) { | 156 if (language.empty()) { |
| 165 TranslateCommonMetrics::ReportLanguageVerification( | 157 TranslateCommonMetrics::ReportLanguageVerification( |
| 166 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_ONLY); | 158 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_ONLY); |
| 167 return cld_language; | 159 return cld_language; |
| 168 } | 160 } |
| 169 | 161 |
| 170 if (cld_language == chrome::kUnknownLanguageCode) { | 162 if (cld_language == chrome::kUnknownLanguageCode) { |
| 171 TranslateCommonMetrics::ReportLanguageVerification( | 163 TranslateCommonMetrics::ReportLanguageVerification( |
| (...skipping 13 matching lines...) Expand all Loading... |
| 185 return cld_language; | 177 return cld_language; |
| 186 } else { | 178 } else { |
| 187 TranslateCommonMetrics::ReportLanguageVerification( | 179 TranslateCommonMetrics::ReportLanguageVerification( |
| 188 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_DISAGREE); | 180 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_DISAGREE); |
| 189 // Content-Language value might be wrong because CLD says that this page | 181 // Content-Language value might be wrong because CLD says that this page |
| 190 // is written in another language with confidence. | 182 // is written in another language with confidence. |
| 191 // In this case, Chrome doesn't rely on any of the language codes, and | 183 // In this case, Chrome doesn't rely on any of the language codes, and |
| 192 // gives up suggesting a translation. | 184 // gives up suggesting a translation. |
| 193 return std::string(chrome::kUnknownLanguageCode); | 185 return std::string(chrome::kUnknownLanguageCode); |
| 194 } | 186 } |
| 195 #else // defined(ENABLE_LANGUAGE_DETECTION) | |
| 196 TranslateCommonMetrics::ReportLanguageVerification( | |
| 197 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_DISABLED); | |
| 198 #endif // defined(ENABLE_LANGUAGE_DETECTION) | |
| 199 | 187 |
| 200 return language; | 188 return language; |
| 201 } | 189 } |
| 202 | 190 |
| 203 void CorrectLanguageCodeTypo(std::string* code) { | 191 void CorrectLanguageCodeTypo(std::string* code) { |
| 204 DCHECK(code); | 192 DCHECK(code); |
| 205 | 193 |
| 206 size_t coma_index = code->find(','); | 194 size_t coma_index = code->find(','); |
| 207 if (coma_index != std::string::npos) { | 195 if (coma_index != std::string::npos) { |
| 208 // More than one language is specified; just keep the first one. | 196 // More than one language is specified; just keep the first one. |
| (...skipping 87 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 296 // distinguish from English, and the language is one of well-known languages | 284 // distinguish from English, and the language is one of well-known languages |
| 297 // which often provide "en-*" meta information mistakenly. | 285 // which often provide "en-*" meta information mistakenly. |
| 298 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { | 286 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { |
| 299 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) | 287 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) |
| 300 return true; | 288 return true; |
| 301 } | 289 } |
| 302 return false; | 290 return false; |
| 303 } | 291 } |
| 304 | 292 |
| 305 std::string GetCLDVersion() { | 293 std::string GetCLDVersion() { |
| 306 #if defined(ENABLE_LANGUAGE_DETECTION) | |
| 307 return CompactLangDet::DetectLanguageVersion(); | 294 return CompactLangDet::DetectLanguageVersion(); |
| 308 #else | |
| 309 return "" | |
| 310 #endif | |
| 311 } | 295 } |
| 312 | 296 |
| 313 } // namespace LanguageDetectionUtil | 297 } // namespace LanguageDetectionUtil |
| OLD | NEW |