OLD | NEW |
1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "components/url_formatter/url_formatter.h" | 5 #include "components/url_formatter/url_formatter.h" |
6 | 6 |
7 #include <algorithm> | 7 #include <algorithm> |
8 #include <utility> | 8 #include <utility> |
9 | 9 |
10 #include "base/lazy_instance.h" | 10 #include "base/lazy_instance.h" |
(...skipping 410 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
421 allowed_set.addAll(*inclusion_set); | 421 allowed_set.addAll(*inclusion_set); |
422 | 422 |
423 // Five aspirational scripts are taken from UTR 31 Table 6 at | 423 // Five aspirational scripts are taken from UTR 31 Table 6 at |
424 // http://www.unicode.org/reports/tr31/#Aspirational_Use_Scripts . | 424 // http://www.unicode.org/reports/tr31/#Aspirational_Use_Scripts . |
425 // Not all the characters of aspirational scripts are suitable for | 425 // Not all the characters of aspirational scripts are suitable for |
426 // identifiers. Therefore, only characters belonging to | 426 // identifiers. Therefore, only characters belonging to |
427 // [:Identifier_Type=Aspirational:] (listed in 'Status/Type=Aspirational' | 427 // [:Identifier_Type=Aspirational:] (listed in 'Status/Type=Aspirational' |
428 // section at | 428 // section at |
429 // http://www.unicode.org/Public/security/latest/xidmodifications.txt) are | 429 // http://www.unicode.org/Public/security/latest/xidmodifications.txt) are |
430 // added to the allowed set. The list has to be updated when a new | 430 // added to the allowed set. The list has to be updated when a new |
431 // version of Unicode is released. The current version is 8.0.0 and ICU 58 | 431 // version of Unicode is released. The current version is 9.0.0 and ICU 60 |
432 // will have Unicode 9.0 data. | 432 // will have Unicode 10.0 data. |
433 #if U_ICU_VERSION_MAJOR_NUM < 58 | 433 #if U_ICU_VERSION_MAJOR_NUM < 60 |
434 const icu::UnicodeSet aspirational_scripts( | 434 const icu::UnicodeSet aspirational_scripts( |
435 icu::UnicodeString( | 435 icu::UnicodeString( |
436 // Unified Canadian Syllabics | 436 // Unified Canadian Syllabics |
437 "[\\u1401-\\u166C\\u166F-\\u167F" | 437 "[\\u1401-\\u166C\\u166F-\\u167F" |
438 // Mongolian | 438 // Mongolian |
439 "\\u1810-\\u1819\\u1820-\\u1877\\u1880-\\u18AA" | 439 "\\u1810-\\u1819\\u1820-\\u1877\\u1880-\\u18AA" |
440 // Unified Canadian Syllabics | 440 // Unified Canadian Syllabics |
441 "\\u18B0-\\u18F5" | 441 "\\u18B0-\\u18F5" |
442 // Tifinagh | 442 // Tifinagh |
443 "\\u2D30-\\u2D67\\u2D7F" | 443 "\\u2D30-\\u2D67\\u2D7F" |
444 // Yi | 444 // Yi |
445 "\\uA000-\\uA48C" | 445 "\\uA000-\\uA48C" |
446 // Miao | 446 // Miao |
447 "\\U00016F00-\\U00016F44\\U00016F50-\\U00016F7F" | 447 "\\U00016F00-\\U00016F44\\U00016F50-\\U00016F7E" |
448 "\\U00016F8F-\\U00016F9F]", | 448 "\\U00016F8F-\\U00016F9F]", |
449 -1, US_INV), | 449 -1, US_INV), |
450 *status); | 450 *status); |
451 allowed_set.addAll(aspirational_scripts); | 451 allowed_set.addAll(aspirational_scripts); |
452 #else | 452 #else |
453 #error "Update aspirational_scripts per Unicode 9.0" | 453 #error "Update aspirational_scripts per Unicode 10.0" |
454 #endif | 454 #endif |
455 | 455 |
456 // U+0338 is included in the recommended set, while U+05F4 and U+2027 are in | 456 // U+0338 is included in the recommended set, while U+05F4 and U+2027 are in |
457 // the inclusion set. However, they are blacklisted as a part of Mozilla's | 457 // the inclusion set. However, they are blacklisted as a part of Mozilla's |
458 // IDN blacklist (http://kb.mozillazine.org/Network.IDN.blacklist_chars). | 458 // IDN blacklist (http://kb.mozillazine.org/Network.IDN.blacklist_chars). |
459 // U+0338 and U+2027 are dropped; the former can look like a slash when | 459 // U+0338 and U+2027 are dropped; the former can look like a slash when |
460 // rendered with a broken font, and the latter can be confused with U+30FB | 460 // rendered with a broken font, and the latter can be confused with U+30FB |
461 // (Katakana Middle Dot). U+05F4 (Hebrew Punctuation Gershayim) is kept, | 461 // (Katakana Middle Dot). U+05F4 (Hebrew Punctuation Gershayim) is kept, |
462 // even though it can look like a double quotation mark. Using it in Hebrew | 462 // even though it can look like a double quotation mark. Using it in Hebrew |
463 // should be safe. When used with a non-Hebrew script, it'd be filtered by | 463 // should be safe. When used with a non-Hebrew script, it'd be filtered by |
(...skipping 326 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
790 return base::StartsWith(text, www, base::CompareCase::SENSITIVE) | 790 return base::StartsWith(text, www, base::CompareCase::SENSITIVE) |
791 ? text.substr(www.length()) : text; | 791 ? text.substr(www.length()) : text; |
792 } | 792 } |
793 | 793 |
794 base::string16 StripWWWFromHost(const GURL& url) { | 794 base::string16 StripWWWFromHost(const GURL& url) { |
795 DCHECK(url.is_valid()); | 795 DCHECK(url.is_valid()); |
796 return StripWWW(base::ASCIIToUTF16(url.host_piece())); | 796 return StripWWW(base::ASCIIToUTF16(url.host_piece())); |
797 } | 797 } |
798 | 798 |
799 } // namespace url_formatter | 799 } // namespace url_formatter |
OLD | NEW |