OLD | NEW |
1 // Copyright 2013 The Chromium Authors. All rights reserved. | 1 // Copyright 2013 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/common/translate/language_detection_util.h" | 5 #include "chrome/common/translate/language_detection_util.h" |
6 | 6 |
7 #include "base/logging.h" | 7 #include "base/logging.h" |
8 #include "base/strings/string_split.h" | 8 #include "base/strings/string_split.h" |
9 #include "base/strings/string_util.h" | 9 #include "base/strings/string_util.h" |
10 #include "base/time/time.h" | 10 #include "base/time/time.h" |
11 #include "chrome/common/chrome_constants.h" | 11 #include "chrome/common/chrome_constants.h" |
12 #include "chrome/common/translate/translate_common_metrics.h" | 12 #include "chrome/common/translate/translate_common_metrics.h" |
13 #include "chrome/common/translate/translate_util.h" | 13 #include "chrome/common/translate/translate_util.h" |
14 | |
15 #if defined(ENABLE_LANGUAGE_DETECTION) | |
16 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" | 14 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" |
17 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" | 15 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" |
18 #endif | |
19 | 16 |
20 namespace { | 17 namespace { |
21 | 18 |
22 // Similar language code list. Some languages are very similar and difficult | 19 // Similar language code list. Some languages are very similar and difficult |
23 // for CLD to distinguish. | 20 // for CLD to distinguish. |
24 struct SimilarLanguageCode { | 21 struct SimilarLanguageCode { |
25 const char* const code; | 22 const char* const code; |
26 int group; | 23 int group; |
27 }; | 24 }; |
28 | 25 |
(...skipping 28 matching lines...) Expand all Loading... |
57 LanguageDetectionUtil::CorrectLanguageCodeTypo(code); | 54 LanguageDetectionUtil::CorrectLanguageCodeTypo(code); |
58 | 55 |
59 if (!LanguageDetectionUtil::IsValidLanguageCode(*code)) { | 56 if (!LanguageDetectionUtil::IsValidLanguageCode(*code)) { |
60 *code = std::string(); | 57 *code = std::string(); |
61 return; | 58 return; |
62 } | 59 } |
63 | 60 |
64 TranslateUtil::ToTranslateLanguageSynonym(code); | 61 TranslateUtil::ToTranslateLanguageSynonym(code); |
65 } | 62 } |
66 | 63 |
67 #if defined(ENABLE_LANGUAGE_DETECTION) | |
68 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it | 64 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it |
69 // failed. | 65 // failed. |
70 // |is_cld_reliable| is set to true if CLD says the detection is reliable. | 66 // |is_cld_reliable| is set to true if CLD says the detection is reliable. |
71 std::string DetermineTextLanguage(const base::string16& text, | 67 std::string DetermineTextLanguage(const base::string16& text, |
72 bool* is_cld_reliable) { | 68 bool* is_cld_reliable) { |
73 std::string language = chrome::kUnknownLanguageCode; | 69 std::string language = chrome::kUnknownLanguageCode; |
74 int num_languages = 0; | 70 int num_languages = 0; |
75 int text_bytes = 0; | 71 int text_bytes = 0; |
76 bool is_reliable = false; | 72 bool is_reliable = false; |
77 Language cld_language = | 73 Language cld_language = |
(...skipping 15 matching lines...) Expand all Loading... |
93 // language code for traditional Chinese among others. | 89 // language code for traditional Chinese among others. |
94 // |LanguageCodeWithDialects| will go through ISO 639-1, ISO 639-2 and | 90 // |LanguageCodeWithDialects| will go through ISO 639-1, ISO 639-2 and |
95 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN | 91 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN |
96 // for Simplified Chinese. | 92 // for Simplified Chinese. |
97 language = LanguageCodeWithDialects(cld_language); | 93 language = LanguageCodeWithDialects(cld_language); |
98 } | 94 } |
99 VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text | 95 VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text |
100 << "\n*************************************\n"; | 96 << "\n*************************************\n"; |
101 return language; | 97 return language; |
102 } | 98 } |
103 #endif // defined(ENABLE_LANGUAGE_DETECTION) | |
104 | 99 |
105 // Checks if CLD can complement a sub code when the page language doesn't know | 100 // Checks if CLD can complement a sub code when the page language doesn't know |
106 // the sub code. | 101 // the sub code. |
107 bool CanCLDComplementSubCode( | 102 bool CanCLDComplementSubCode( |
108 const std::string& page_language, const std::string& cld_language) { | 103 const std::string& page_language, const std::string& cld_language) { |
109 // Translate server cannot treat general Chinese. If Content-Language and | 104 // Translate server cannot treat general Chinese. If Content-Language and |
110 // CLD agree that the language is Chinese and Content-Language doesn't know | 105 // CLD agree that the language is Chinese and Content-Language doesn't know |
111 // which dialect is used, CLD language has priority. | 106 // which dialect is used, CLD language has priority. |
112 // TODO(hajimehoshi): How about the other dialects like zh-MO? | 107 // TODO(hajimehoshi): How about the other dialects like zh-MO? |
113 return page_language == "zh" && StartsWithASCII(cld_language, "zh-", false); | 108 return page_language == "zh" && StartsWithASCII(cld_language, "zh-", false); |
114 } | 109 } |
115 | 110 |
116 } // namespace | 111 } // namespace |
117 | 112 |
118 namespace LanguageDetectionUtil { | 113 namespace LanguageDetectionUtil { |
119 | 114 |
120 std::string DeterminePageLanguage(const std::string& code, | 115 std::string DeterminePageLanguage(const std::string& code, |
121 const std::string& html_lang, | 116 const std::string& html_lang, |
122 const base::string16& contents, | 117 const base::string16& contents, |
123 std::string* cld_language_p, | 118 std::string* cld_language_p, |
124 bool* is_cld_reliable_p) { | 119 bool* is_cld_reliable_p) { |
125 #if defined(ENABLE_LANGUAGE_DETECTION) | |
126 base::TimeTicks begin_time = base::TimeTicks::Now(); | 120 base::TimeTicks begin_time = base::TimeTicks::Now(); |
127 bool is_cld_reliable; | 121 bool is_cld_reliable; |
128 std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable); | 122 std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable); |
129 TranslateCommonMetrics::ReportLanguageDetectionTime(begin_time, | 123 TranslateCommonMetrics::ReportLanguageDetectionTime(begin_time, |
130 base::TimeTicks::Now()); | 124 base::TimeTicks::Now()); |
131 | 125 |
132 if (cld_language_p != NULL) | 126 if (cld_language_p != NULL) |
133 *cld_language_p = cld_language; | 127 *cld_language_p = cld_language; |
134 if (is_cld_reliable_p != NULL) | 128 if (is_cld_reliable_p != NULL) |
135 *is_cld_reliable_p = is_cld_reliable; | 129 *is_cld_reliable_p = is_cld_reliable; |
136 TranslateUtil::ToTranslateLanguageSynonym(&cld_language); | 130 TranslateUtil::ToTranslateLanguageSynonym(&cld_language); |
137 #endif // defined(ENABLE_LANGUAGE_DETECTION) | |
138 | 131 |
139 // Check if html lang attribute is valid. | 132 // Check if html lang attribute is valid. |
140 std::string modified_html_lang; | 133 std::string modified_html_lang; |
141 if (!html_lang.empty()) { | 134 if (!html_lang.empty()) { |
142 modified_html_lang = html_lang; | 135 modified_html_lang = html_lang; |
143 ApplyLanguageCodeCorrection(&modified_html_lang); | 136 ApplyLanguageCodeCorrection(&modified_html_lang); |
144 TranslateCommonMetrics::ReportHtmlLang(html_lang, modified_html_lang); | 137 TranslateCommonMetrics::ReportHtmlLang(html_lang, modified_html_lang); |
145 VLOG(9) << "html lang based language code: " << modified_html_lang; | 138 VLOG(9) << "html lang based language code: " << modified_html_lang; |
146 } | 139 } |
147 | 140 |
148 // Check if Content-Language is valid. | 141 // Check if Content-Language is valid. |
149 std::string modified_code; | 142 std::string modified_code; |
150 if (!code.empty()) { | 143 if (!code.empty()) { |
151 modified_code = code; | 144 modified_code = code; |
152 ApplyLanguageCodeCorrection(&modified_code); | 145 ApplyLanguageCodeCorrection(&modified_code); |
153 TranslateCommonMetrics::ReportContentLanguage(code, modified_code); | 146 TranslateCommonMetrics::ReportContentLanguage(code, modified_code); |
154 } | 147 } |
155 | 148 |
156 // Adopt |modified_html_lang| if it is valid. Otherwise, adopt | 149 // Adopt |modified_html_lang| if it is valid. Otherwise, adopt |
157 // |modified_code|. | 150 // |modified_code|. |
158 std::string language = modified_html_lang.empty() ? modified_code : | 151 std::string language = modified_html_lang.empty() ? modified_code : |
159 modified_html_lang; | 152 modified_html_lang; |
160 | 153 |
161 #if defined(ENABLE_LANGUAGE_DETECTION) | |
162 // If |language| is empty, just use CLD result even though it might be | 154 // If |language| is empty, just use CLD result even though it might be |
163 // chrome::kUnknownLanguageCode. | 155 // chrome::kUnknownLanguageCode. |
164 if (language.empty()) { | 156 if (language.empty()) { |
165 TranslateCommonMetrics::ReportLanguageVerification( | 157 TranslateCommonMetrics::ReportLanguageVerification( |
166 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_ONLY); | 158 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_ONLY); |
167 return cld_language; | 159 return cld_language; |
168 } | 160 } |
169 | 161 |
170 if (cld_language == chrome::kUnknownLanguageCode) { | 162 if (cld_language == chrome::kUnknownLanguageCode) { |
171 TranslateCommonMetrics::ReportLanguageVerification( | 163 TranslateCommonMetrics::ReportLanguageVerification( |
(...skipping 13 matching lines...) Expand all Loading... |
185 return cld_language; | 177 return cld_language; |
186 } else { | 178 } else { |
187 TranslateCommonMetrics::ReportLanguageVerification( | 179 TranslateCommonMetrics::ReportLanguageVerification( |
188 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_DISAGREE); | 180 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_DISAGREE); |
189 // Content-Language value might be wrong because CLD says that this page | 181 // Content-Language value might be wrong because CLD says that this page |
190 // is written in another language with confidence. | 182 // is written in another language with confidence. |
191 // In this case, Chrome doesn't rely on any of the language codes, and | 183 // In this case, Chrome doesn't rely on any of the language codes, and |
192 // gives up suggesting a translation. | 184 // gives up suggesting a translation. |
193 return std::string(chrome::kUnknownLanguageCode); | 185 return std::string(chrome::kUnknownLanguageCode); |
194 } | 186 } |
195 #else // defined(ENABLE_LANGUAGE_DETECTION) | |
196 TranslateCommonMetrics::ReportLanguageVerification( | |
197 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_DISABLED); | |
198 #endif // defined(ENABLE_LANGUAGE_DETECTION) | |
199 | 187 |
200 return language; | 188 return language; |
201 } | 189 } |
202 | 190 |
203 void CorrectLanguageCodeTypo(std::string* code) { | 191 void CorrectLanguageCodeTypo(std::string* code) { |
204 DCHECK(code); | 192 DCHECK(code); |
205 | 193 |
206 size_t coma_index = code->find(','); | 194 size_t coma_index = code->find(','); |
207 if (coma_index != std::string::npos) { | 195 if (coma_index != std::string::npos) { |
208 // More than one language is specified; just keep the first one. | 196 // More than one language is specified; just keep the first one. |
(...skipping 87 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
296 // distinguish from English, and the language is one of well-known languages | 284 // distinguish from English, and the language is one of well-known languages |
297 // which often provide "en-*" meta information mistakenly. | 285 // which often provide "en-*" meta information mistakenly. |
298 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { | 286 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { |
299 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) | 287 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) |
300 return true; | 288 return true; |
301 } | 289 } |
302 return false; | 290 return false; |
303 } | 291 } |
304 | 292 |
305 std::string GetCLDVersion() { | 293 std::string GetCLDVersion() { |
306 #if defined(ENABLE_LANGUAGE_DETECTION) | |
307 return CompactLangDet::DetectLanguageVersion(); | 294 return CompactLangDet::DetectLanguageVersion(); |
308 #else | |
309 return "" | |
310 #endif | |
311 } | 295 } |
312 | 296 |
313 } // namespace LanguageDetectionUtil | 297 } // namespace LanguageDetectionUtil |
OLD | NEW |