Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(337)

Side by Side Diff: chrome/common/translate/language_detection_util.cc

Issue 18820002: Remove ENABLE_LANGUAGE_DETECTION (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Bug fix: mis-rebased Created 7 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « chrome/chrome_tests_unit.gypi ('k') | chrome/common/translate/translate_common_metrics.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2013 The Chromium Authors. All rights reserved. 1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "chrome/common/translate/language_detection_util.h" 5 #include "chrome/common/translate/language_detection_util.h"
6 6
7 #include "base/logging.h" 7 #include "base/logging.h"
8 #include "base/strings/string_split.h" 8 #include "base/strings/string_split.h"
9 #include "base/strings/string_util.h" 9 #include "base/strings/string_util.h"
10 #include "base/time/time.h" 10 #include "base/time/time.h"
11 #include "chrome/common/chrome_constants.h" 11 #include "chrome/common/chrome_constants.h"
12 #include "chrome/common/translate/translate_common_metrics.h" 12 #include "chrome/common/translate/translate_common_metrics.h"
13 #include "chrome/common/translate/translate_util.h" 13 #include "chrome/common/translate/translate_util.h"
14
15 #if defined(ENABLE_LANGUAGE_DETECTION)
16 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" 14 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h"
17 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" 15 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h"
18 #endif
19 16
20 namespace { 17 namespace {
21 18
22 // Similar language code list. Some languages are very similar and difficult 19 // Similar language code list. Some languages are very similar and difficult
23 // for CLD to distinguish. 20 // for CLD to distinguish.
24 struct SimilarLanguageCode { 21 struct SimilarLanguageCode {
25 const char* const code; 22 const char* const code;
26 int group; 23 int group;
27 }; 24 };
28 25
(...skipping 28 matching lines...) Expand all
57 LanguageDetectionUtil::CorrectLanguageCodeTypo(code); 54 LanguageDetectionUtil::CorrectLanguageCodeTypo(code);
58 55
59 if (!LanguageDetectionUtil::IsValidLanguageCode(*code)) { 56 if (!LanguageDetectionUtil::IsValidLanguageCode(*code)) {
60 *code = std::string(); 57 *code = std::string();
61 return; 58 return;
62 } 59 }
63 60
64 TranslateUtil::ToTranslateLanguageSynonym(code); 61 TranslateUtil::ToTranslateLanguageSynonym(code);
65 } 62 }
66 63
67 #if defined(ENABLE_LANGUAGE_DETECTION)
68 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it 64 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it
69 // failed. 65 // failed.
70 // |is_cld_reliable| will be set as true if CLD says the detection is reliable. 66 // |is_cld_reliable| will be set as true if CLD says the detection is reliable.
71 std::string DetermineTextLanguage(const base::string16& text, 67 std::string DetermineTextLanguage(const base::string16& text,
72 bool* is_cld_reliable) { 68 bool* is_cld_reliable) {
73 std::string language = chrome::kUnknownLanguageCode; 69 std::string language = chrome::kUnknownLanguageCode;
74 int num_languages = 0; 70 int num_languages = 0;
75 int text_bytes = 0; 71 int text_bytes = 0;
76 bool is_reliable = false; 72 bool is_reliable = false;
77 Language cld_language = 73 Language cld_language =
(...skipping 15 matching lines...) Expand all
93 // language code for traditional Chinese among others. 89 // language code for traditional Chinese among others.
94 // |LanguageCodeWithDialects| will go through ISO 639-1, ISO 639-2 and 90 // |LanguageCodeWithDialects| will go through ISO 639-1, ISO 639-2 and
95 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN 91 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN
96 // for Simplified Chinese. 92 // for Simplified Chinese.
97 language = LanguageCodeWithDialects(cld_language); 93 language = LanguageCodeWithDialects(cld_language);
98 } 94 }
99 VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text 95 VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text
100 << "\n*************************************\n"; 96 << "\n*************************************\n";
101 return language; 97 return language;
102 } 98 }
103 #endif // defined(ENABLE_LANGUAGE_DETECTION)
104 99
105 // Checks if CLD can complement a sub code when the page language doesn't know 100 // Checks if CLD can complement a sub code when the page language doesn't know
106 // the sub code. 101 // the sub code.
107 bool CanCLDComplementSubCode( 102 bool CanCLDComplementSubCode(
108 const std::string& page_language, const std::string& cld_language) { 103 const std::string& page_language, const std::string& cld_language) {
109 // Translate server cannot treat general Chinese. If Content-Language and 104 // Translate server cannot treat general Chinese. If Content-Language and
110 // CLD agree that the language is Chinese and Content-Language doesn't know 105 // CLD agree that the language is Chinese and Content-Language doesn't know
111 // which dialect is used, CLD language has priority. 106 // which dialect is used, CLD language has priority.
112 // TODO(hajimehoshi): How about the other dialects like zh-MO? 107 // TODO(hajimehoshi): How about the other dialects like zh-MO?
113 return page_language == "zh" && StartsWithASCII(cld_language, "zh-", false); 108 return page_language == "zh" && StartsWithASCII(cld_language, "zh-", false);
114 } 109 }
115 110
116 } // namespace 111 } // namespace
117 112
118 namespace LanguageDetectionUtil { 113 namespace LanguageDetectionUtil {
119 114
120 std::string DeterminePageLanguage(const std::string& code, 115 std::string DeterminePageLanguage(const std::string& code,
121 const std::string& html_lang, 116 const std::string& html_lang,
122 const base::string16& contents, 117 const base::string16& contents,
123 std::string* cld_language_p, 118 std::string* cld_language_p,
124 bool* is_cld_reliable_p) { 119 bool* is_cld_reliable_p) {
125 #if defined(ENABLE_LANGUAGE_DETECTION)
126 base::TimeTicks begin_time = base::TimeTicks::Now(); 120 base::TimeTicks begin_time = base::TimeTicks::Now();
127 bool is_cld_reliable; 121 bool is_cld_reliable;
128 std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable); 122 std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable);
129 TranslateCommonMetrics::ReportLanguageDetectionTime(begin_time, 123 TranslateCommonMetrics::ReportLanguageDetectionTime(begin_time,
130 base::TimeTicks::Now()); 124 base::TimeTicks::Now());
131 125
132 if (cld_language_p != NULL) 126 if (cld_language_p != NULL)
133 *cld_language_p = cld_language; 127 *cld_language_p = cld_language;
134 if (is_cld_reliable_p != NULL) 128 if (is_cld_reliable_p != NULL)
135 *is_cld_reliable_p = is_cld_reliable; 129 *is_cld_reliable_p = is_cld_reliable;
136 TranslateUtil::ToTranslateLanguageSynonym(&cld_language); 130 TranslateUtil::ToTranslateLanguageSynonym(&cld_language);
137 #endif // defined(ENABLE_LANGUAGE_DETECTION)
138 131
139 // Check if html lang attribute is valid. 132 // Check if html lang attribute is valid.
140 std::string modified_html_lang; 133 std::string modified_html_lang;
141 if (!html_lang.empty()) { 134 if (!html_lang.empty()) {
142 modified_html_lang = html_lang; 135 modified_html_lang = html_lang;
143 ApplyLanguageCodeCorrection(&modified_html_lang); 136 ApplyLanguageCodeCorrection(&modified_html_lang);
144 TranslateCommonMetrics::ReportHtmlLang(html_lang, modified_html_lang); 137 TranslateCommonMetrics::ReportHtmlLang(html_lang, modified_html_lang);
145 VLOG(9) << "html lang based language code: " << modified_html_lang; 138 VLOG(9) << "html lang based language code: " << modified_html_lang;
146 } 139 }
147 140
148 // Check if Content-Language is valid. 141 // Check if Content-Language is valid.
149 std::string modified_code; 142 std::string modified_code;
150 if (!code.empty()) { 143 if (!code.empty()) {
151 modified_code = code; 144 modified_code = code;
152 ApplyLanguageCodeCorrection(&modified_code); 145 ApplyLanguageCodeCorrection(&modified_code);
153 TranslateCommonMetrics::ReportContentLanguage(code, modified_code); 146 TranslateCommonMetrics::ReportContentLanguage(code, modified_code);
154 } 147 }
155 148
156 // Adopt |modified_html_lang| if it is valid. Otherwise, adopt 149 // Adopt |modified_html_lang| if it is valid. Otherwise, adopt
157 // |modified_code|. 150 // |modified_code|.
158 std::string language = modified_html_lang.empty() ? modified_code : 151 std::string language = modified_html_lang.empty() ? modified_code :
159 modified_html_lang; 152 modified_html_lang;
160 153
161 #if defined(ENABLE_LANGUAGE_DETECTION)
162 // If |language| is empty, just use CLD result even though it might be 154 // If |language| is empty, just use CLD result even though it might be
163 // chrome::kUnknownLanguageCode. 155 // chrome::kUnknownLanguageCode.
164 if (language.empty()) { 156 if (language.empty()) {
165 TranslateCommonMetrics::ReportLanguageVerification( 157 TranslateCommonMetrics::ReportLanguageVerification(
166 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_ONLY); 158 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_ONLY);
167 return cld_language; 159 return cld_language;
168 } 160 }
169 161
170 if (cld_language == chrome::kUnknownLanguageCode) { 162 if (cld_language == chrome::kUnknownLanguageCode) {
171 TranslateCommonMetrics::ReportLanguageVerification( 163 TranslateCommonMetrics::ReportLanguageVerification(
(...skipping 13 matching lines...) Expand all
185 return cld_language; 177 return cld_language;
186 } else { 178 } else {
187 TranslateCommonMetrics::ReportLanguageVerification( 179 TranslateCommonMetrics::ReportLanguageVerification(
188 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_DISAGREE); 180 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_DISAGREE);
189 // Content-Language value might be wrong because CLD says that this page 181 // Content-Language value might be wrong because CLD says that this page
190 // is written in another language with confidence. 182 // is written in another language with confidence.
191 // In this case, Chrome doesn't rely on any of the language codes, and 183 // In this case, Chrome doesn't rely on any of the language codes, and
192 // gives up suggesting a translation. 184 // gives up suggesting a translation.
193 return std::string(chrome::kUnknownLanguageCode); 185 return std::string(chrome::kUnknownLanguageCode);
194 } 186 }
195 #else // defined(ENABLE_LANGUAGE_DETECTION)
196 TranslateCommonMetrics::ReportLanguageVerification(
197 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_DISABLED);
198 #endif // defined(ENABLE_LANGUAGE_DETECTION)
199 187
200 return language; 188 return language;
201 } 189 }
202 190
203 void CorrectLanguageCodeTypo(std::string* code) { 191 void CorrectLanguageCodeTypo(std::string* code) {
204 DCHECK(code); 192 DCHECK(code);
205 193
206 size_t coma_index = code->find(','); 194 size_t coma_index = code->find(',');
207 if (coma_index != std::string::npos) { 195 if (coma_index != std::string::npos) {
208 // There is more than one language specified; just keep the first one. 196 // There is more than one language specified; just keep the first one.
(...skipping 87 matching lines...) Expand 10 before | Expand all | Expand 10 after
296 // distinguish from English, and the language is one of well-known languages 284 // distinguish from English, and the language is one of well-known languages
297 // which often provide "en-*" meta information mistakenly. 285 // which often provide "en-*" meta information mistakenly.
298 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { 286 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) {
299 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) 287 if (cld_language == kWellKnownCodesOnWrongConfiguration[i])
300 return true; 288 return true;
301 } 289 }
302 return false; 290 return false;
303 } 291 }
304 292
305 std::string GetCLDVersion() { 293 std::string GetCLDVersion() {
306 #if defined(ENABLE_LANGUAGE_DETECTION)
307 return CompactLangDet::DetectLanguageVersion(); 294 return CompactLangDet::DetectLanguageVersion();
308 #else
309 return ""
310 #endif
311 } 295 }
312 296
313 } // namespace LanguageDetectionUtil 297 } // namespace LanguageDetectionUtil
OLDNEW
« no previous file with comments | « chrome/chrome_tests_unit.gypi ('k') | chrome/common/translate/translate_common_metrics.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698