Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(6)

Side by Side Diff: third_party/WebKit/Source/platform/text/LocaleToScriptMapping.cpp

Issue 2192703002: More LayoutLocale refactor with additional Chinese support (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Comment updated as per drott review Created 4 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * Copyright (C) 2011 Google Inc. All rights reserved. 2 * Copyright (C) 2011 Google Inc. All rights reserved.
3 * 3 *
4 * Redistribution and use in source and binary forms, with or without 4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are 5 * modification, are permitted provided that the following conditions are
6 * met: 6 * met:
7 * 7 *
8 * * Redistributions of source code must retain the above copyright 8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer. 9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above 10 * * Redistributions in binary form must reproduce the above
(...skipping 18 matching lines...) Expand all
29 */ 29 */
30 30
31 #include "platform/text/LocaleToScriptMapping.h" 31 #include "platform/text/LocaleToScriptMapping.h"
32 32
33 #include "wtf/HashMap.h" 33 #include "wtf/HashMap.h"
34 #include "wtf/HashSet.h" 34 #include "wtf/HashSet.h"
35 #include "wtf/text/StringHash.h" 35 #include "wtf/text/StringHash.h"
36 36
37 namespace blink { 37 namespace blink {
38 38
39 struct SubtagScript {
40 const char* subtag;
41 UScriptCode script;
42 };
43
44 using SubtagScriptMap = HashMap<String, UScriptCode, CaseFoldingHash>;
45
46 static SubtagScriptMap createSubtagScriptMap(const SubtagScript list[], size_t size)
47 {
48 SubtagScriptMap map;
49 for (size_t i = 0; i < size; ++i)
50 map.set(list[i].subtag, list[i].script);
51 return map;
52 }
53
39 UScriptCode scriptNameToCode(const String& scriptName) 54 UScriptCode scriptNameToCode(const String& scriptName)
40 { 55 {
41 struct ScriptNameCode {
42 const char* name;
43 UScriptCode code;
44 };
45
46 // This generally maps an ISO 15924 script code to its UScriptCode, but certain families of script codes are 56 // This generally maps an ISO 15924 script code to its UScriptCode, but certain families of script codes are
47 // treated as a single script for assigning a per-script font in Settings. For example, "hira" is mapped to 57 // treated as a single script for assigning a per-script font in Settings. For example, "hira" is mapped to
48 // USCRIPT_KATAKANA_OR_HIRAGANA instead of USCRIPT_HIRAGANA, since we want all Japanese scripts to be rendered 58 // USCRIPT_KATAKANA_OR_HIRAGANA instead of USCRIPT_HIRAGANA, since we want all Japanese scripts to be rendered
49 // using the same font setting. 59 // using the same font setting.
50 static const ScriptNameCode scriptNameCodeList[] = { 60 static const SubtagScript scriptNameCodeList[] = {
51 { "zyyy", USCRIPT_COMMON }, 61 { "zyyy", USCRIPT_COMMON },
52 { "qaai", USCRIPT_INHERITED }, 62 { "qaai", USCRIPT_INHERITED },
53 { "arab", USCRIPT_ARABIC }, 63 { "arab", USCRIPT_ARABIC },
54 { "armn", USCRIPT_ARMENIAN }, 64 { "armn", USCRIPT_ARMENIAN },
55 { "beng", USCRIPT_BENGALI }, 65 { "beng", USCRIPT_BENGALI },
56 { "bopo", USCRIPT_BOPOMOFO }, 66 { "bopo", USCRIPT_BOPOMOFO },
57 { "cher", USCRIPT_CHEROKEE }, 67 { "cher", USCRIPT_CHEROKEE },
58 { "copt", USCRIPT_COPTIC }, 68 { "copt", USCRIPT_COPTIC },
59 { "cyrl", USCRIPT_CYRILLIC }, 69 { "cyrl", USCRIPT_CYRILLIC },
60 { "dsrt", USCRIPT_DESERET }, 70 { "dsrt", USCRIPT_DESERET },
(...skipping 87 matching lines...) Expand 10 before | Expand all | Expand 10 after
148 { "syrn", USCRIPT_EASTERN_SYRIAC }, 158 { "syrn", USCRIPT_EASTERN_SYRIAC },
149 { "teng", USCRIPT_TENGWAR }, 159 { "teng", USCRIPT_TENGWAR },
150 { "vaii", USCRIPT_VAI }, 160 { "vaii", USCRIPT_VAI },
151 { "visp", USCRIPT_VISIBLE_SPEECH }, 161 { "visp", USCRIPT_VISIBLE_SPEECH },
152 { "xsux", USCRIPT_CUNEIFORM }, 162 { "xsux", USCRIPT_CUNEIFORM },
153 { "jpan", USCRIPT_KATAKANA_OR_HIRAGANA }, 163 { "jpan", USCRIPT_KATAKANA_OR_HIRAGANA },
154 { "kore", USCRIPT_HANGUL }, 164 { "kore", USCRIPT_HANGUL },
155 { "zxxx", USCRIPT_UNWRITTEN_LANGUAGES }, 165 { "zxxx", USCRIPT_UNWRITTEN_LANGUAGES },
156 { "zzzz", USCRIPT_UNKNOWN } 166 { "zzzz", USCRIPT_UNKNOWN }
157 }; 167 };
168 DEFINE_STATIC_LOCAL(SubtagScriptMap, scriptNameCodeMap,
169 (createSubtagScriptMap(scriptNameCodeList, WTF_ARRAY_LENGTH(scriptNameCodeList))));
158 170
159 typedef HashMap<String, UScriptCode> ScriptNameCodeMap; 171 const auto& it = scriptNameCodeMap.find(scriptName);
160 DEFINE_STATIC_LOCAL(ScriptNameCodeMap, scriptNameCodeMap, ());
161 if (scriptNameCodeMap.isEmpty()) {
162 for (size_t i = 0; i < sizeof(scriptNameCodeList) / sizeof(scriptNameCodeList[0]); ++i)
163 scriptNameCodeMap.set(scriptNameCodeList[i].name, scriptNameCodeList[i].code);
164 }
165
166 HashMap<String, UScriptCode>::iterator it = scriptNameCodeMap.find(scriptName.lower());
167 if (it != scriptNameCodeMap.end()) 172 if (it != scriptNameCodeMap.end())
168 return it->value; 173 return it->value;
169 return USCRIPT_INVALID_CODE; 174 return USCRIPT_INVALID_CODE;
170 } 175 }
171 176
172 UScriptCode localeToScriptCodeForFontSelection(const String& locale) 177 UScriptCode localeToScriptCodeForFontSelection(const String& locale)
173 { 178 {
174 struct LocaleScript { 179 static const SubtagScript localeScriptList[] = {
175 const char* locale;
176 UScriptCode script;
177 };
178
179 static const LocaleScript localeScriptList[] = {
180 { "aa", USCRIPT_LATIN }, 180 { "aa", USCRIPT_LATIN },
181 { "ab", USCRIPT_CYRILLIC }, 181 { "ab", USCRIPT_CYRILLIC },
182 { "ady", USCRIPT_CYRILLIC }, 182 { "ady", USCRIPT_CYRILLIC },
183 { "aeb", USCRIPT_ARABIC }, 183 { "aeb", USCRIPT_ARABIC },
184 { "af", USCRIPT_LATIN }, 184 { "af", USCRIPT_LATIN },
185 { "ak", USCRIPT_LATIN }, 185 { "ak", USCRIPT_LATIN },
186 { "am", USCRIPT_ETHIOPIC }, 186 { "am", USCRIPT_ETHIOPIC },
187 { "ar", USCRIPT_ARABIC }, 187 { "ar", USCRIPT_ARABIC },
188 { "arq", USCRIPT_ARABIC }, 188 { "arq", USCRIPT_ARABIC },
189 { "ary", USCRIPT_ARABIC }, 189 { "ary", USCRIPT_ARABIC },
(...skipping 218 matching lines...) Expand 10 before | Expand all | Expand 10 after
408 { "vi", USCRIPT_LATIN }, 408 { "vi", USCRIPT_LATIN },
409 { "wal", USCRIPT_ETHIOPIC }, 409 { "wal", USCRIPT_ETHIOPIC },
410 { "war", USCRIPT_LATIN }, 410 { "war", USCRIPT_LATIN },
411 { "wo", USCRIPT_LATIN }, 411 { "wo", USCRIPT_LATIN },
412 { "xh", USCRIPT_LATIN }, 412 { "xh", USCRIPT_LATIN },
413 { "yap", USCRIPT_LATIN }, 413 { "yap", USCRIPT_LATIN },
414 { "yo", USCRIPT_LATIN }, 414 { "yo", USCRIPT_LATIN },
415 { "za", USCRIPT_LATIN }, 415 { "za", USCRIPT_LATIN },
416 { "zdj", USCRIPT_ARABIC }, 416 { "zdj", USCRIPT_ARABIC },
417 { "zh", USCRIPT_SIMPLIFIED_HAN }, 417 { "zh", USCRIPT_SIMPLIFIED_HAN },
418 { "zh_hk", USCRIPT_TRADITIONAL_HAN }, 418 { "zu", USCRIPT_LATIN },
419 { "zh_tw", USCRIPT_TRADITIONAL_HAN }, 419 // Encompassed languages within the Chinese macrolanguage.
420 { "zu", USCRIPT_LATIN } 420 // http://www-01.sil.org/iso639-3/documentation.asp?id=zho
421 // http://lists.w3.org/Archives/Public/public-i18n-cjk/2016JulSep/0022.html
422 { "cdo", USCRIPT_SIMPLIFIED_HAN },
423 { "cjy", USCRIPT_SIMPLIFIED_HAN },
424 { "cmn", USCRIPT_SIMPLIFIED_HAN },
425 { "cpx", USCRIPT_SIMPLIFIED_HAN },
426 { "czh", USCRIPT_SIMPLIFIED_HAN },
427 { "czo", USCRIPT_SIMPLIFIED_HAN },
428 { "gan", USCRIPT_SIMPLIFIED_HAN },
429 { "hsn", USCRIPT_SIMPLIFIED_HAN },
430 { "mnp", USCRIPT_SIMPLIFIED_HAN },
431 { "wuu", USCRIPT_SIMPLIFIED_HAN },
432 { "hak", USCRIPT_TRADITIONAL_HAN },
433 { "lzh", USCRIPT_TRADITIONAL_HAN },
434 { "nan", USCRIPT_TRADITIONAL_HAN },
435 { "yue", USCRIPT_TRADITIONAL_HAN },
436 { "zh-cdo", USCRIPT_SIMPLIFIED_HAN },
437 { "zh-cjy", USCRIPT_SIMPLIFIED_HAN },
438 { "zh-cmn", USCRIPT_SIMPLIFIED_HAN },
439 { "zh-cpx", USCRIPT_SIMPLIFIED_HAN },
440 { "zh-czh", USCRIPT_SIMPLIFIED_HAN },
441 { "zh-czo", USCRIPT_SIMPLIFIED_HAN },
442 { "zh-gan", USCRIPT_SIMPLIFIED_HAN },
443 { "zh-hsn", USCRIPT_SIMPLIFIED_HAN },
444 { "zh-mnp", USCRIPT_SIMPLIFIED_HAN },
445 { "zh-wuu", USCRIPT_SIMPLIFIED_HAN },
446 { "zh-hak", USCRIPT_TRADITIONAL_HAN },
447 { "zh-lzh", USCRIPT_TRADITIONAL_HAN },
448 { "zh-nan", USCRIPT_TRADITIONAL_HAN },
449 { "zh-yue", USCRIPT_TRADITIONAL_HAN },
450 // Chinese with regions. Logically, regions should be handled
451 // separately, but this works for the current purposes.
452 { "zh-hk", USCRIPT_TRADITIONAL_HAN },
453 { "zh-mo", USCRIPT_TRADITIONAL_HAN },
454 { "zh-tw", USCRIPT_TRADITIONAL_HAN },
421 }; 455 };
456 DEFINE_STATIC_LOCAL(SubtagScriptMap, localeScriptMap,
457 (createSubtagScriptMap(localeScriptList, WTF_ARRAY_LENGTH(localeScriptList))));
422 458
423 typedef HashMap<String, UScriptCode> LocaleScriptMap; 459 // BCP 47 uses '-' as the delimiter but ICU uses '_'.
424 DEFINE_STATIC_LOCAL(LocaleScriptMap, localeScriptMap, ()); 460 // https://tools.ietf.org/html/bcp47
425 if (localeScriptMap.isEmpty()) { 461 String canonicalLocale = locale;
426 for (size_t i = 0; i < sizeof(localeScriptList) / sizeof(localeScriptList[0]); ++i) 462 canonicalLocale.replace('_', '-');
427 localeScriptMap.set(localeScriptList[i].locale, localeScriptList[i].script);
428 }
429
430 String canonicalLocale = locale.lower().replace('-', '_');
431 while (!canonicalLocale.isEmpty()) { 463 while (!canonicalLocale.isEmpty()) {
432 HashMap<String, UScriptCode>::iterator it = localeScriptMap.find(canonicalLocale); 464 const auto& it = localeScriptMap.find(canonicalLocale);
433 if (it != localeScriptMap.end()) 465 if (it != localeScriptMap.end())
434 return it->value; 466 return it->value;
435 size_t pos = canonicalLocale.reverseFind('_'); 467 size_t pos = canonicalLocale.reverseFind('-');
436 if (pos == kNotFound) 468 if (pos == kNotFound)
437 break; 469 break;
438 UScriptCode code = scriptNameToCode(canonicalLocale.substring(pos + 1)); 470 // script = 4ALPHA
439 if (code != USCRIPT_INVALID_CODE && code != USCRIPT_UNKNOWN) 471 if (canonicalLocale.length() - (pos + 1) == 4) {
440 return code; 472 UScriptCode code = scriptNameToCode(canonicalLocale.substring(pos + 1));
473 if (code != USCRIPT_INVALID_CODE && code != USCRIPT_UNKNOWN)
474 return code;
475 }
441 canonicalLocale = canonicalLocale.substring(0, pos); 476 canonicalLocale = canonicalLocale.substring(0, pos);
442 } 477 }
443 return USCRIPT_COMMON; 478 return USCRIPT_COMMON;
444 } 479 }
445 480
446 static bool isUnambiguousHanScript(UScriptCode script) 481 static UScriptCode scriptCodeForHanFromRegion(const String& region)
447 { 482 {
448 // localeToScriptCodeForFontSelection() does not return these values. 483 static const SubtagScript regionScriptList[] = {
449 ASSERT(script != USCRIPT_HIRAGANA && script != USCRIPT_KATAKANA);
450 return script == USCRIPT_KATAKANA_OR_HIRAGANA
451 || script == USCRIPT_SIMPLIFIED_HAN
452 || script == USCRIPT_TRADITIONAL_HAN
453 || script == USCRIPT_HANGUL;
454 }
455
456 static UScriptCode scriptCodeForHanFromSubtag(const String& subtag)
457 {
458 struct SubtagScript {
459 const char* subtag;
460 UScriptCode script;
461 };
462
463 static const SubtagScript subtagScriptList[] = {
464 { "cn", USCRIPT_SIMPLIFIED_HAN },
465 { "hans", USCRIPT_SIMPLIFIED_HAN },
466 { "hant", USCRIPT_TRADITIONAL_HAN },
467 { "hk", USCRIPT_TRADITIONAL_HAN }, 484 { "hk", USCRIPT_TRADITIONAL_HAN },
468 { "jp", USCRIPT_KATAKANA_OR_HIRAGANA }, 485 { "jp", USCRIPT_KATAKANA_OR_HIRAGANA },
469 { "kr", USCRIPT_HANGUL }, 486 { "kr", USCRIPT_HANGUL },
487 { "mo", USCRIPT_TRADITIONAL_HAN },
470 { "tw", USCRIPT_TRADITIONAL_HAN }, 488 { "tw", USCRIPT_TRADITIONAL_HAN },
471 }; 489 };
490 DEFINE_STATIC_LOCAL(SubtagScriptMap, regionScriptMap,
491 (createSubtagScriptMap(regionScriptList, WTF_ARRAY_LENGTH(regionScriptList))));
472 492
473 typedef HashMap<String, UScriptCode> SubtagScriptMap; 493 const auto& it = regionScriptMap.find(region);
474 DEFINE_STATIC_LOCAL(SubtagScriptMap, subtagScriptMap, ()); 494 return it != regionScriptMap.end() ? it->value : USCRIPT_COMMON;
475 if (subtagScriptMap.isEmpty()) {
476 for (size_t i = 0; i < WTF_ARRAY_LENGTH(subtagScriptList); ++i)
477 subtagScriptMap.set(subtagScriptList[i].subtag, subtagScriptList[i].script);
478 }
479
480 const auto& it = subtagScriptMap.find(subtag.lower());
481 return it != subtagScriptMap.end() ? it->value : USCRIPT_COMMON;
482 } 495 }
483 496
484 static UScriptCode scriptCodeForHanFromSubtags(const String& locale, char delimiter) 497 UScriptCode scriptCodeForHanFromSubtags(const String& locale, char delimiter)
485 { 498 {
486 // Some sites emit lang="en-JP" when English is set as the preferred 499 // Some sites emit lang="en-JP" when English is set as the preferred
487 // language. Use script/region subtags of the content locale to pick the 500 // language. Use script/region subtags of the content locale to pick the
488 // fallback font for unified Han ideographs. 501 // fallback font for unified Han ideographs.
489 for (size_t end = locale.find(delimiter); end != kNotFound; ) { 502 for (size_t end = locale.find(delimiter); end != kNotFound; ) {
490 size_t begin = end + 1; 503 size_t begin = end + 1;
491 end = locale.find(delimiter, begin); 504 end = locale.find(delimiter, begin);
492 UScriptCode script = scriptCodeForHanFromSubtag( 505 size_t len = (end == kNotFound ? locale.length() : end) - begin;
493 locale.substring(begin, 506 UScriptCode script;
494 end == kNotFound ? UINT_MAX : end - begin)); 507 switch (len) {
495 if (script != USCRIPT_COMMON) 508 case 2: // region = 2ALPHA / 3DIGIT
496 return script; 509 script = scriptCodeForHanFromRegion(locale.substring(begin, len));
510 if (script != USCRIPT_COMMON)
511 return script;
512 break;
513 case 4: // script = 4ALPHA
514 script = scriptNameToCode(locale.substring(begin, len));
515 if (script != USCRIPT_INVALID_CODE)
516 return script;
517 }
497 } 518 }
498 519
499 return USCRIPT_COMMON; 520 return USCRIPT_COMMON;
500 } 521 }
501 522
502 UScriptCode scriptCodeForHanFromLocale(UScriptCode script, const String& locale, char delimiter)
503 {
504 if (isUnambiguousHanScript(script))
505 return script;
506
507 // Identify the script for Han if the UScriptCode is ambiguous.
508 // Check subtags only, because the UScriptCode covers the language part.
509 return scriptCodeForHanFromSubtags(locale, delimiter);
510 }
511
512 UScriptCode scriptCodeForHanFromLocale(const String& locale, char delimiter)
513 {
514 UScriptCode script = localeToScriptCodeForFontSelection(locale);
515 return scriptCodeForHanFromLocale(script, locale, delimiter);
516 }
517
518 } // namespace blink 523 } // namespace blink
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698