Index: chrome/common/translate/language_detection_util.cc |
diff --git a/chrome/common/translate/language_detection_util.cc b/chrome/common/translate/language_detection_util.cc |
index 73e48237d3174ccd5cb2879cba71d4215b238e10..ccb6154e8a38dc206f91ac77dccfe09d4edb5aab 100644 |
--- a/chrome/common/translate/language_detection_util.cc |
+++ b/chrome/common/translate/language_detection_util.cc |
@@ -5,14 +5,23 @@ |
#include "chrome/common/translate/language_detection_util.h" |
#include "base/logging.h" |
+#include "base/metrics/field_trial.h" |
#include "base/strings/string_split.h" |
#include "base/strings/string_util.h" |
+#include "base/strings/utf_string_conversions.h" |
#include "base/time/time.h" |
#include "chrome/common/chrome_constants.h" |
#include "chrome/common/translate/translate_common_metrics.h" |
#include "chrome/common/translate/translate_util.h" |
+ |
+#if !defined(CLD_VERSION) || CLD_VERSION==1 |
#include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" |
#include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" |
+#endif |
+ |
+#if !defined(CLD_VERSION) || CLD_VERSION==2 |
+#include "third_party/cld_2/src/public/compact_lang_det.h" |
+#endif |
namespace { |
@@ -61,18 +70,63 @@ void ApplyLanguageCodeCorrection(std::string* code) { |
TranslateUtil::ToTranslateLanguageSynonym(code); |
} |
+// Returns the CLD major version to use: the build-time CLD_VERSION if it is |
+// defined; otherwise 2 when the "CLD1VsCLD2" field trial group is "CLD2", else 1. |
+int GetCLDMajorVersion() { |
+#if defined(CLD_VERSION) |
+  return CLD_VERSION; |
+#else |
+  const std::string trial_group = |
+      base::FieldTrialList::FindFullName("CLD1VsCLD2"); |
+  return trial_group == "CLD2" ? 2 : 1; |
+#endif |
+} |
+ |
// Returns the ISO 639 language code of the specified |text|, or 'unknown' if it |
// failed. |
// |is_cld_reliable| will be set as true if CLD says the detection is reliable. |
std::string DetermineTextLanguage(const base::string16& text, |
bool* is_cld_reliable) { |
std::string language = chrome::kUnknownLanguageCode; |
- int num_languages = 0; |
int text_bytes = 0; |
bool is_reliable = false; |
- Language cld_language = |
- DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable, |
- &num_languages, NULL, &text_bytes); |
+ |
+  // Stores a CLD1 Language or a CLD2::Language enum value as a plain int. |
+ int cld_language = 0; |
+ bool is_valid_language = false; |
+ |
+ switch (GetCLDMajorVersion()) { |
+#if !defined(CLD_VERSION) || CLD_VERSION==1 |
+ case 1: { |
+ int num_languages = 0; |
+ cld_language = |
+ DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable, |
+ &num_languages, NULL, &text_bytes); |
+ is_valid_language = cld_language != NUM_LANGUAGES && |
+ cld_language != UNKNOWN_LANGUAGE && |
+ cld_language != TG_UNKNOWN_LANGUAGE; |
+ break; |
+ } |
+#endif |
+#if !defined(CLD_VERSION) || CLD_VERSION==2 |
+ case 2: { |
+ std::string utf8_text(UTF16ToUTF8(text)); |
+ CLD2::Language language3[3]; |
+ int percent3[3]; |
+ cld_language = |
+ CLD2::DetectLanguageSummary(utf8_text.c_str(), utf8_text.size(), true, |
+ language3, percent3, |
+ &text_bytes, &is_reliable); |
+ is_valid_language = cld_language != CLD2::NUM_LANGUAGES && |
+ cld_language != CLD2::UNKNOWN_LANGUAGE && |
+ cld_language != CLD2::TG_UNKNOWN_LANGUAGE; |
+ break; |
+ } |
+#endif |
+ default: |
+ NOTREACHED(); |
+ } |
+ |
if (is_cld_reliable != NULL) |
*is_cld_reliable = is_reliable; |
@@ -82,15 +136,33 @@ std::string DetermineTextLanguage(const base::string16& text, |
// TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that |
// the determined language code is correct with 50% confidence. Chrome should |
// handle the real confidence value to judge. |
- if (is_reliable && text_bytes >= 100 && cld_language != NUM_LANGUAGES && |
- cld_language != UNKNOWN_LANGUAGE && cld_language != TG_UNKNOWN_LANGUAGE) { |
+ if (is_reliable && text_bytes >= 100 && is_valid_language) { |
// We should not use LanguageCode_ISO_639_1 because it does not cover all |
// the languages CLD can detect. As a result, it'll return the invalid |
// language code for tradtional Chinese among others. |
// |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and |
// 'other' tables to do the 'right' thing. In addition, it'll return zh-CN |
// for Simplified Chinese. |
- language = LanguageCodeWithDialects(cld_language); |
+ switch (GetCLDMajorVersion()) { |
+#if !defined(CLD_VERSION) || CLD_VERSION==1 |
+ case 1: |
+ language = |
+ LanguageCodeWithDialects(static_cast<Language>(cld_language)); |
+ break; |
+#endif |
+#if !defined(CLD_VERSION) || CLD_VERSION==2 |
+ case 2: |
+ if (cld_language == CLD2::CHINESE) { |
+ language = "zh-CN"; |
+ } else { |
+ language = |
+ CLD2::LanguageCode(static_cast<CLD2::Language>(cld_language)); |
+ } |
+ break; |
+#endif |
+ default: |
+ NOTREACHED(); |
+ } |
} |
VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text |
<< "\n*************************************\n"; |
@@ -291,7 +363,19 @@ bool MaybeServerWrongConfiguration(const std::string& page_language, |
} |
+// Returns the version string reported by the CLD library selected by |
+// GetCLDMajorVersion(), or an empty string if no compiled-in library matches. |
std::string GetCLDVersion() { |
-  return CompactLangDet::DetectLanguageVersion(); |
+  // Stays empty unless one of the compiled-in cases below fills it in. |
+  std::string version; |
+  switch (GetCLDMajorVersion()) { |
+#if !defined(CLD_VERSION) || CLD_VERSION==1 |
+    case 1: |
+      version = CompactLangDet::DetectLanguageVersion(); |
+      break; |
+#endif |
+#if !defined(CLD_VERSION) || CLD_VERSION==2 |
+    case 2: |
+      version = CLD2::DetectLanguageVersion(); |
+      break; |
+#endif |
+    default: |
+      NOTREACHED(); |
+      break; |
+  } |
+  return version; |
} |
} // namespace LanguageDetectionUtil |