OLD | NEW |
1 // Copyright 2013 The Chromium Authors. All rights reserved. | 1 // Copyright 2013 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/common/translate/language_detection_util.h" | 5 #include "chrome/common/translate/language_detection_util.h" |
6 | 6 |
7 #include "base/logging.h" | 7 #include "base/logging.h" |
| 8 #include "base/metrics/field_trial.h" |
8 #include "base/strings/string_split.h" | 9 #include "base/strings/string_split.h" |
9 #include "base/strings/string_util.h" | 10 #include "base/strings/string_util.h" |
| 11 #include "base/strings/utf_string_conversions.h" |
10 #include "base/time/time.h" | 12 #include "base/time/time.h" |
11 #include "chrome/common/chrome_constants.h" | 13 #include "chrome/common/chrome_constants.h" |
12 #include "chrome/common/translate/translate_common_metrics.h" | 14 #include "chrome/common/translate/translate_common_metrics.h" |
13 #include "chrome/common/translate/translate_util.h" | 15 #include "chrome/common/translate/translate_util.h" |
| 16 |
| 17 #if !defined(CLD_VERSION) || CLD_VERSION==1 |
14 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" | 18 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" |
15 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" | 19 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" |
| 20 #endif |
| 21 |
| 22 #if !defined(CLD_VERSION) || CLD_VERSION==2 |
| 23 #include "third_party/cld_2/src/public/compact_lang_det.h" |
| 24 #endif |
16 | 25 |
17 namespace { | 26 namespace { |
18 | 27 |
19 // Similar language code list. Some languages are very similar and difficult | 28 // Similar language code list. Some languages are very similar and difficult |
20 // for CLD to distinguish. | 29 // for CLD to distinguish. |
21 struct SimilarLanguageCode { | 30 struct SimilarLanguageCode { |
22 const char* const code; | 31 const char* const code; |
23 int group; | 32 int group; |
24 }; | 33 }; |
25 | 34 |
(...skipping 28 matching lines...) Expand all Loading... |
54 LanguageDetectionUtil::CorrectLanguageCodeTypo(code); | 63 LanguageDetectionUtil::CorrectLanguageCodeTypo(code); |
55 | 64 |
56 if (!LanguageDetectionUtil::IsValidLanguageCode(*code)) { | 65 if (!LanguageDetectionUtil::IsValidLanguageCode(*code)) { |
57 *code = std::string(); | 66 *code = std::string(); |
58 return; | 67 return; |
59 } | 68 } |
60 | 69 |
61 TranslateUtil::ToTranslateLanguageSynonym(code); | 70 TranslateUtil::ToTranslateLanguageSynonym(code); |
62 } | 71 } |
63 | 72 |
| 73 int GetCLDMajorVersion() { |
| 74 #if !defined(CLD_VERSION) |
| 75 std::string group_name = base::FieldTrialList::FindFullName("CLD1VsCLD2"); |
| 76 if (group_name == "CLD2") |
| 77 return 2; |
| 78 else |
| 79 return 1; |
| 80 #else |
| 81 return CLD_VERSION; |
| 82 #endif |
| 83 } |
| 84 |
64 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it | 85 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it |
65 // failed. | 86 // failed. |
66 // |is_cld_reliable| will be set as true if CLD says the detection is reliable. | 87 // |is_cld_reliable| will be set as true if CLD says the detection is reliable. |
67 std::string DetermineTextLanguage(const base::string16& text, | 88 std::string DetermineTextLanguage(const base::string16& text, |
68 bool* is_cld_reliable) { | 89 bool* is_cld_reliable) { |
69 std::string language = chrome::kUnknownLanguageCode; | 90 std::string language = chrome::kUnknownLanguageCode; |
70 int num_languages = 0; | |
71 int text_bytes = 0; | 91 int text_bytes = 0; |
72 bool is_reliable = false; | 92 bool is_reliable = false; |
73 Language cld_language = | 93 |
74 DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable, | 94 // Language or CLD2::Language |
75 &num_languages, NULL, &text_bytes); | 95 int cld_language = 0; |
| 96 bool is_valid_language = false; |
| 97 |
| 98 switch (GetCLDMajorVersion()) { |
| 99 #if !defined(CLD_VERSION) || CLD_VERSION==1 |
| 100 case 1: { |
| 101 int num_languages = 0; |
| 102 cld_language = |
| 103 DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable, |
| 104 &num_languages, NULL, &text_bytes); |
| 105 is_valid_language = cld_language != NUM_LANGUAGES && |
| 106 cld_language != UNKNOWN_LANGUAGE && |
| 107 cld_language != TG_UNKNOWN_LANGUAGE; |
| 108 break; |
| 109 } |
| 110 #endif |
| 111 #if !defined(CLD_VERSION) || CLD_VERSION==2 |
| 112 case 2: { |
| 113 std::string utf8_text(UTF16ToUTF8(text)); |
| 114 CLD2::Language language3[3]; |
| 115 int percent3[3]; |
| 116 cld_language = |
| 117 CLD2::DetectLanguageSummary(utf8_text.c_str(), utf8_text.size(), true, |
| 118 language3, percent3, |
| 119 &text_bytes, &is_reliable); |
| 120 is_valid_language = cld_language != CLD2::NUM_LANGUAGES && |
| 121 cld_language != CLD2::UNKNOWN_LANGUAGE && |
| 122 cld_language != CLD2::TG_UNKNOWN_LANGUAGE; |
| 123 break; |
| 124 } |
| 125 #endif |
| 126 default: |
| 127 NOTREACHED(); |
| 128 } |
| 129 |
76 if (is_cld_reliable != NULL) | 130 if (is_cld_reliable != NULL) |
77 *is_cld_reliable = is_reliable; | 131 *is_cld_reliable = is_reliable; |
78 | 132 |
79 // We don't trust the result if the CLD reports that the detection is not | 133 // We don't trust the result if the CLD reports that the detection is not |
80 // reliable, or if the actual text used to detect the language was less than | 134 // reliable, or if the actual text used to detect the language was less than |
81 // 100 bytes (short texts can often lead to wrong results). | 135 // 100 bytes (short texts can often lead to wrong results). |
82 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that | 136 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that |
83 // the determined language code is correct with 50% confidence. Chrome should | 137 // the determined language code is correct with 50% confidence. Chrome should |
84 // handle the real confidence value to judge. | 138 // handle the real confidence value to judge. |
85 if (is_reliable && text_bytes >= 100 && cld_language != NUM_LANGUAGES && | 139 if (is_reliable && text_bytes >= 100 && is_valid_language) { |
86 cld_language != UNKNOWN_LANGUAGE && cld_language != TG_UNKNOWN_LANGUAGE) { | |
87 // We should not use LanguageCode_ISO_639_1 because it does not cover all | 140 // We should not use LanguageCode_ISO_639_1 because it does not cover all |
88 // the languages CLD can detect. As a result, it'll return the invalid | 141 // the languages CLD can detect. As a result, it'll return the invalid |
89 // language code for traditional Chinese among others. | 142 // language code for traditional Chinese among others. |
90 // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and | 143 // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and |
91 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN | 144 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN |
92 // for Simplified Chinese. | 145 // for Simplified Chinese. |
93 language = LanguageCodeWithDialects(cld_language); | 146 switch (GetCLDMajorVersion()) { |
| 147 #if !defined(CLD_VERSION) || CLD_VERSION==1 |
| 148 case 1: |
| 149 language = |
| 150 LanguageCodeWithDialects(static_cast<Language>(cld_language)); |
| 151 break; |
| 152 #endif |
| 153 #if !defined(CLD_VERSION) || CLD_VERSION==2 |
| 154 case 2: |
| 155 if (cld_language == CLD2::CHINESE) { |
| 156 language = "zh-CN"; |
| 157 } else { |
| 158 language = |
| 159 CLD2::LanguageCode(static_cast<CLD2::Language>(cld_language)); |
| 160 } |
| 161 break; |
| 162 #endif |
| 163 default: |
| 164 NOTREACHED(); |
| 165 } |
94 } | 166 } |
95 VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text | 167 VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text |
96 << "\n*************************************\n"; | 168 << "\n*************************************\n"; |
97 return language; | 169 return language; |
98 } | 170 } |
99 | 171 |
100 // Checks if CLD can complement a sub code when the page language doesn't know | 172 // Checks if CLD can complement a sub code when the page language doesn't know |
101 // the sub code. | 173 // the sub code. |
102 bool CanCLDComplementSubCode( | 174 bool CanCLDComplementSubCode( |
103 const std::string& page_language, const std::string& cld_language) { | 175 const std::string& page_language, const std::string& cld_language) { |
(...skipping 180 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
284 // distinguish from English, and the language is one of well-known languages | 356 // distinguish from English, and the language is one of well-known languages |
285 // which often provide "en-*" meta information mistakenly. | 357 // which often provide "en-*" meta information mistakenly. |
286 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { | 358 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { |
287 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) | 359 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) |
288 return true; | 360 return true; |
289 } | 361 } |
290 return false; | 362 return false; |
291 } | 363 } |
292 | 364 |
293 std::string GetCLDVersion() { | 365 std::string GetCLDVersion() { |
294 return CompactLangDet::DetectLanguageVersion(); | 366 switch (GetCLDMajorVersion()) { |
| 367 #if !defined(CLD_VERSION) || CLD_VERSION==1 |
| 368 case 1: |
| 369 return CompactLangDet::DetectLanguageVersion(); |
| 370 #endif |
| 371 #if !defined(CLD_VERSION) || CLD_VERSION==2 |
| 372 case 2: |
| 373 return CLD2::DetectLanguageVersion(); |
| 374 #endif |
| 375 default: |
| 376 NOTREACHED(); |
| 377 } |
| 378 return ""; |
295 } | 379 } |
296 | 380 |
297 } // namespace LanguageDetectionUtil | 381 } // namespace LanguageDetectionUtil |
OLD | NEW |