Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(281)

Side by Side Diff: lib/i18n/bidi_utils.dart

Issue 10827227: Start moving non-platform libraries from lib/ to pkg/ . (Closed) Base URL: http://dart.googlecode.com/svn/branches/bleeding_edge/dart/
Patch Set: Start moving non-platform libraries from lib/ to pkg/ Created 8 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « lib/i18n/bidi_formatter.dart ('k') | lib/i18n/date_format.dart » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file.
4
5 /**
6 * Bidi stands for Bi-directional text.
7 * According to http://en.wikipedia.org/wiki/Bi-directional_text:
8 * Bi-directional text is text containing text in both text directionalities,
9 * both right-to-left (RTL) and left-to-right (LTR). It generally involves text
10 * containing different types of alphabets, but may also refer to boustrophedon,
11 * which is changing text directionality in each row.
12 *
13 * This file provides some utility classes for determining directionality of
14 * text, switching CSS layout from LTR to RTL, and other normalizing utilities
15 * needed when switching between RTL and LTR formatting.
16 *
17 * It defines the TextDirection class, which is used to represent the
18 * directionality of text.
19 * In most cases, it is preferable to use bidi_formatter.dart, which provides
20 * bidi functionality in the given directional context, instead of using
21 * bidi_utils.dart directly.
22 */
class TextDirection {
  /** Left-to-right direction; rendered as 'ltr' in a span tag. */
  static final LTR = const TextDirection._('LTR', 'ltr');

  /** Right-to-left direction; rendered as 'rtl' in a span tag. */
  static final RTL = const TextDirection._('RTL', 'rtl');

  /**
   * Direction that could not be determined. When the context direction is not
   * used (or is itself unknown), such text falls back on the more common
   * 'ltr' span direction.
   */
  static final UNKNOWN = const TextDirection._('UNKNOWN', 'ltr');

  /** Name of this direction: one of 'LTR', 'RTL', or 'UNKNOWN'. */
  final String value;

  /** Text used for this direction when it is written into a span tag. */
  final String spanText;

  const TextDirection._(this.value, this.spanText);

  /**
   * Reports whether [otherDirection] is a known direction that differs from
   * this one. An [otherDirection] of [UNKNOWN] is never considered a change.
   */
  bool isDirectionChange(TextDirection otherDirection) {
    if (otherDirection == TextDirection.UNKNOWN) {
      return false;
    }
    return this != otherDirection;
  }
}
50
/** Unicode "Left-To-Right Embedding" (LRE) character. */
final String LRE = '\u202A';

/** Unicode "Right-To-Left Embedding" (RLE) character. */
final String RLE = '\u202B';

/** Unicode "Pop Directional Formatting" (PDF) character. */
final String PDF = '\u202C';

/** Unicode "Left-To-Right Mark" (LRM) character. */
final String LRM = '\u200E';

/** Unicode "Right-To-Left Mark" (RLM) character. */
final String RLM = '\u200F';

/**
 * Threshold of RTL directionality: the fraction of strongly directional words
 * that must be exceeded by RTL words for text to be estimated as RTL.
 * Declared final because it is a constant and is never reassigned here.
 */
final num _RTL_DETECTION_THRESHOLD = 0.40;

/**
 * Practical patterns to identify strong LTR and RTL characters, respectively.
 * Each string is the body of a regular-expression character class (without
 * the surrounding brackets). These patterns are not completely correct
 * according to the Unicode standard; they are simplified for performance and
 * small code size.
 */
final String _LTR_CHARS =
    @'A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02B8\u0300-\u0590'
    @'\u0800-\u1FFF\u2C00-\uFB1C\uFDFE-\uFE6F\uFEFD-\uFFFF';
final String _RTL_CHARS = @'\u0591-\u07FF\uFB1D-\uFDFD\uFE70-\uFEFC';
78
/**
 * Returns [text] with every HTML tag and HTML escape replaced by a single
 * space, which is helpful when estimating text directionality.
 *
 * Note: this function should not be used in other contexts. It does not cope
 * with many things: comments, script elements, style elements, the dir
 * attribute, `>` inside quoted attribute values, etc. It does handle the most
 * common cases well enough, and the worst outcome of these shortcomings is a
 * wrong directionality estimate, so no effort has been invested in improving
 * it.
 */
String stripHtmlIfNeeded(String text) {
  // Deliberately simplified: either an opening/closing tag (`<...>`) or an
  // escape (`&...;`). Such spans are skipped for directionality estimation.
  final RegExp tagOrEscape = const RegExp(@'<[^>]*>|&[^;]+;');
  return text.replaceAll(tagOrEscape, ' ');
}
96
/**
 * Reports whether the first strongly directional character of [text] is LTR.
 * When [isHtml] is true, [text] is treated as HTML or HTML-escaped and is
 * passed through [stripHtmlIfNeeded] before matching.
 */
bool startsWithLtr(String text, [isHtml=false]) {
  var candidate = isHtml ? stripHtmlIfNeeded(text) : text;
  // Any run of non-RTL characters from the start, then one LTR character.
  return const RegExp('^[^$_RTL_CHARS]*[$_LTR_CHARS]').hasMatch(candidate);
}
105
/**
 * Reports whether the first strongly directional character of [text] is RTL.
 * When [isHtml] is true, [text] is treated as HTML or HTML-escaped and is
 * passed through [stripHtmlIfNeeded] before matching.
 */
bool startsWithRtl(String text, [isHtml=false]) {
  var candidate = isHtml ? stripHtmlIfNeeded(text) : text;
  // Any run of non-LTR characters from the start, then one RTL character.
  return const RegExp('^[^$_LTR_CHARS]*[$_RTL_CHARS]').hasMatch(candidate);
}
114
/**
 * Reports whether the exit directionality of [text] (i.e. its last strongly
 * directional character) is LTR. When [isHtml] is true, [text] is treated as
 * HTML or HTML-escaped and is passed through [stripHtmlIfNeeded] first.
 */
bool endsWithLtr(String text, [isHtml=false]) {
  var candidate = isHtml ? stripHtmlIfNeeded(text) : text;
  // One LTR character followed only by non-RTL characters up to the end.
  return const RegExp('[$_LTR_CHARS][^$_RTL_CHARS]*\$').hasMatch(candidate);
}
124
/**
 * Reports whether the exit directionality of [text] (i.e. its last strongly
 * directional character) is RTL. When [isHtml] is true, [text] is treated as
 * HTML or HTML-escaped and is passed through [stripHtmlIfNeeded] first.
 */
bool endsWithRtl(String text, [isHtml=false]) {
  var candidate = isHtml ? stripHtmlIfNeeded(text) : text;
  // One RTL character followed only by non-LTR characters up to the end.
  return const RegExp('[$_RTL_CHARS][^$_LTR_CHARS]*\$').hasMatch(candidate);
}
134
/**
 * Reports whether [text] contains at least one LTR character.
 * When [isHtml] is true, [text] is treated as HTML or HTML-escaped and is
 * passed through [stripHtmlIfNeeded] before matching.
 */
bool hasAnyLtr(String text, [isHtml=false]) {
  var candidate = isHtml ? stripHtmlIfNeeded(text) : text;
  return const RegExp(@'[' '$_LTR_CHARS' @']').hasMatch(candidate);
}
143
/**
 * Reports whether [text] contains at least one RTL character.
 * When [isHtml] is true, [text] is treated as HTML or HTML-escaped and is
 * passed through [stripHtmlIfNeeded] before matching.
 */
bool hasAnyRtl(String text, [isHtml=false]) {
  var candidate = isHtml ? stripHtmlIfNeeded(text) : text;
  return const RegExp(@'[' '$_RTL_CHARS' @']').hasMatch(candidate);
}
152
/**
 * Check if a BCP 47 / III [languageString] indicates an RTL language.
 *
 * That is, either:
 * - a language code explicitly specifying one of the right-to-left scripts,
 *   e.g. "az-Arab", or
 * - a language code specifying one of the languages normally written in a
 *   right-to-left script, e.g. "fa" (Farsi), except ones explicitly
 *   specifying Latin or Cyrillic script (which are the usual LTR
 *   alternatives).
 *
 * The list of right-to-left scripts appears in the 100-199 range in
 * http://www.unicode.org/iso15924/iso15924-num.html, of which Arabic and
 * Hebrew are by far the most widely used. Thaana, N'Ko, and Tifinagh are also
 * recognized because they have significant modern usage. The rest (Syriac,
 * Samaritan, Mandaic, etc.) seem to have extremely limited or no modern usage
 * and are not recognized.
 *
 * The languages usually written in a right-to-left script are taken as those
 * with Suppress-Script: Hebr|Arab|Thaa|Nkoo|Tfng in
 * http://www.iana.org/assignments/language-subtag-registry,
 * as well as Sindhi (sd) and Uyghur (ug).
 *
 * Matching is case-insensitive. The presence of other subtags of the language
 * code, e.g. regions like EG (Egypt), is ignored.
 */
bool isRtlLanguage(String languageString) {
  // Language (or explicit RTL script), not followed by a Latn/Cyrl script
  // subtag, and ending at a subtag boundary.
  final RegExp rtlLanguage = const RegExp(
      @'^(ar|dv|he|iw|fa|nqo|ps|sd|ug|ur|yi|.*[-_]'
      @'(Arab|Hebr|Thaa|Nkoo|Tfng))(?!.*[-_](Latn|Cyrl)($|-|_))'
      @'($|-|_)', ignoreCase : true);
  return rtlLanguage.hasMatch(languageString);
}
182
/**
 * Forces the [html] snippet into RTL directionality regardless of the overall
 * context. If the snippet begins with a tag, the direction is applied to that
 * existing tag; otherwise a span tag is added as a wrapper. For this reason,
 * if the snippet starts with a tag, that tag must enclose the whole piece.
 * If the tag already has a direction specified, the new one will override the
 * existing one in behavior (should work on Chrome, FF, and IE since this was
 * ported directly from the Closure version).
 */
String enforceRtlInHtml(String html) => _enforceInHtmlHelper(html, 'rtl');
195
/**
 * Forces RTL on the given [text] by placing the Unicode BiDi formatting
 * character RLE before it and PDF after it.
 */
String enforceRtlInText(String text) => '${RLE}${text}${PDF}';
203
/**
 * Forces the [html] snippet into LTR directionality regardless of the overall
 * context. If the snippet begins with a tag, the direction is applied to that
 * existing tag; otherwise a span tag is added as a wrapper. For this reason,
 * if the snippet starts with a tag, that tag must enclose the whole piece.
 * If the tag already has a direction specified, the new one will override the
 * existing one in behavior (tested on FF and IE).
 */
String enforceLtrInHtml(String html) => _enforceInHtmlHelper(html, 'ltr');
215
/**
 * Forces LTR on the given [text] by placing the Unicode BiDi formatting
 * character LRE before it and PDF after it.
 */
String enforceLtrInText(String text) => '${LRE}${text}${PDF}';
223
/**
 * Forces the [html] snippet into the desired [direction] regardless of the
 * overall context.
 *
 * If [html] does not begin with `<`, it is wrapped in a new span carrying the
 * dir attribute. If it does begin with `<`, the dir attribute is inserted
 * right after the first `<name` found in the snippet; for this reason a
 * snippet that starts with a tag must be entirely enclosed by that tag. If
 * the tag already has a direction specified, the new one will override the
 * existing one in behavior (tested on FF and IE).
 */
String _enforceInHtmlHelper(String html, String direction) {
  if (!html.startsWith('<')) {
    // '\n' is important for FF so that it won't incorrectly merge span groups.
    return '\n<span dir=$direction>$html</span>';
  }
  Match tagOpen = const RegExp('<\\w+').firstMatch(html);
  if (tagOpen == null) {
    // Starts with '<' but has no tag name to attach to: return it untouched.
    return html;
  }
  var cut = tagOpen.end();
  return '${html.substring(0, cut)} dir=$direction${html.substring(cut)}';
}
247
/**
 * Applies a bracket guard to [str] using html span tags. This addresses the
 * problem of messy bracket display that frequently happens in RTL layout.
 *
 * Every bracketed run, i.e. `(...)`, `[...]`, `{...}` or `&lt;...&gt;`, is
 * wrapped in a span with an explicit dir attribute. If [isRtlContext] is
 * given, it decides the span direction regardless of the estimated
 * directionality; if it is omitted, RTL is used when [str] contains any RTL
 * character and LTR otherwise.
 */
String guardBracketInHtml(String str, [bool isRtlContext]) {
  var useRtl;
  if (isRtlContext == null) {
    useRtl = hasAnyRtl(str);
  } else {
    useRtl = isRtlContext;
  }
  final String openingSpan = '<span dir=${useRtl? "rtl" : "ltr"}>';
  return _guardBracketHelper(
      str,
      const RegExp(@'(\(.*?\)+)|(\[.*?\]+)|(\{.*?\}+)|(&lt;.*?(&gt;)+)'),
      openingSpan,
      '</span>');
}
261
/**
 * Applies a bracket guard to [str] using the LRM and RLM marks. This
 * addresses the problem of messy bracket display that frequently happens in
 * RTL layout. This version works for both plain text and html, but in some
 * cases is not as good as guardBracketInHtml.
 *
 * Every bracketed run, i.e. `(...)`, `[...]`, `{...}` or `<...>`, gets the
 * same mark placed before and after it. If [isRtlContext] is given, it
 * decides which mark is used (RLM when true, LRM when false) regardless of
 * the estimated directionality; if it is omitted, RLM is used when [str]
 * contains any RTL character and LRM otherwise.
 */
String guardBracketInText(String str, [bool isRtlContext]) {
  var useRtl;
  if (isRtlContext == null) {
    useRtl = hasAnyRtl(str);
  } else {
    useRtl = isRtlContext;
  }
  var mark = useRtl ? RLM : LRM;
  final RegExp bracketed =
      const RegExp(@'(\(.*?\)+)|(\[.*?\]+)|(\{.*?\}+)|(<.*?>+)');
  return _guardBracketHelper(str, bracketed, mark, mark);
}
276
/**
 * (Mostly) reimplements the $& functionality of "replace" in JavaScript.
 * Given a [str] and the [regexp] to match with, optionally supply a string to
 * be inserted [before] each match and/or [after] it. The matched text itself
 * is kept. For example,
 * `_guardBracketHelper('firetruck', const RegExp('truck'), 'hydrant', '!')`
 * would return 'firehydranttruck!'.
 *
 * An omitted (null) [before] or [after] inserts nothing on that side. Text
 * outside the matches is copied through unchanged.
 */
// TODO(efortuna): Get rid of this once this is implemented in Dart.
// See Issue 2979.
String _guardBracketHelper(String str, RegExp regexp, [String before,
                           String after]) {
  // Both wrappers are optional; substitute the empty string so that a missing
  // one is never handed to the buffer as null.
  final String prefix = before == null ? '' : before;
  final String suffix = after == null ? '' : after;

  StringBuffer buffer = new StringBuffer();
  var startIndex = 0;
  for (Match match in regexp.allMatches(str)) {
    // Unmatched text since the previous match, then the guarded match.
    buffer.add(str.substring(startIndex, match.start())).add(prefix);
    buffer.add(str.substring(match.start(), match.end())).add(suffix);
    startIndex = match.end();
  }
  // Trailing text after the last match (or all of [str] if nothing matched).
  return buffer.add(str.substring(startIndex)).toString();
}
298
/**
 * Estimates the directionality of [text] using the best known
 * general-purpose method (relative word counts). A TextDirection.UNKNOWN
 * return value indicates completely neutral input. [isHtml] is true if
 * [text] is HTML or HTML-escaped, in which case it is passed through
 * [stripHtmlIfNeeded] first.
 *
 * The text is split on whitespace and each word is classified:
 * - a word whose first strong character is RTL counts as an RTL word;
 * - a word starting with 'http://' counts as weakly LTR;
 * - any other word containing an LTR character counts as strongly
 *   directional (LTR);
 * - any remaining word containing a digit counts as weakly LTR.
 *
 * If the number of RTL words is above a certain percentage of the total
 * number of strongly directional words, returns RTL. Otherwise, if any words
 * are strongly or weakly LTR, returns LTR. Otherwise, returns UNKNOWN, which
 * is used to mean `neutral`.
 */
TextDirection estimateDirectionOfText(String text, [bool isHtml=false]) {
  if (isHtml) {
    text = stripHtmlIfNeeded(text);
  }

  var rtlWords = 0;
  var strongWords = 0;
  var sawWeaklyLtr = false;

  // Split the string into 'words'; the estimate is based on relative word
  // counts. The order of the checks below matters: earlier ones win.
  for (String word in text.split(const RegExp(@'\s+'))) {
    if (startsWithRtl(word)) {
      rtlWords++;
      strongWords++;
      continue;
    }
    if (const RegExp(@'^http://').hasMatch(word)) {
      // Looks like something that must always be LTR even in RTL text, such
      // as a URL.
      sawWeaklyLtr = true;
      continue;
    }
    if (hasAnyLtr(word)) {
      strongWords++;
      continue;
    }
    if (const RegExp(@'\d').hasMatch(word)) {
      // Contains numerals.
      sawWeaklyLtr = true;
    }
  }

  if (strongWords == 0) {
    // No strongly directional words: weak LTR evidence decides, else neutral.
    if (sawWeaklyLtr) {
      return TextDirection.LTR;
    }
    return TextDirection.UNKNOWN;
  }
  if (rtlWords > _RTL_DETECTION_THRESHOLD * strongWords) {
    return TextDirection.RTL;
  }
  return TextDirection.LTR;
}
342
/**
 * Scans [str] for the first closing parenthesis that does not match an
 * opening parenthesis.
 *
 * Returns the position immediately after that parenthesis (i.e. its index
 * plus one), or -1 if [str] contains no unmatched closing parenthesis
 * (including when [str] is empty).
 *
 * NOTE(review): the found-case value is kept exactly as the previous
 * implementation produced it; the earlier description spoke of the index of
 * the parenthesis itself -- confirm which one callers expect.
 */
int _unmatchedParenIndex(String str) {
  final int openParen = '('.charCodeAt(0);
  final int closeParen = ')'.charCodeAt(0);
  // Number of '(' seen so far that have not been closed yet.
  int depth = 0;
  // The scan is bounded by the string length so that input without an
  // unmatched ')' terminates instead of reading past the end.
  for (int index = 0; index < str.length; index++) {
    int char = str.charCodeAt(index);
    if (char == openParen) {
      depth++;
    } else if (char == closeParen) {
      depth--;
      if (depth < 0) {
        return index + 1;
      }
    }
  }
  return -1;
}
358
/**
 * Replaces a double or single quote that directly follows a Hebrew character
 * in [str] with GERSHAYIM (U+05F4) or GERESH (U+05F3) respectively. This is
 * most likely the user's intention. All other characters are copied through
 * unchanged.
 */
String normalizeHebrewQuote(String str) {
  final RegExp hebrewChar = const RegExp('[\u0591-\u05f2]');
  StringBuffer out = new StringBuffer();
  for (int i = 0; i < str.length; i++) {
    String current = str.substring(i, i + 1);
    // The first character has no predecessor, so it is never replaced. The
    // predecessor is always taken from the original string.
    bool followsHebrew =
        i > 0 && hebrewChar.hasMatch(str.substring(i - 1, i));
    if (followsHebrew && current == '"') {
      out.add('\u05f4');
    } else if (followsHebrew && current == "'") {
      out.add('\u05f3');
    } else {
      out.add(current);
    }
  }
  return out.toString();
}
383
/**
 * Checks the estimated directionality of [str] and returns true if the piece
 * of text should be laid out in RTL direction, i.e. if
 * [estimateDirectionOfText] returns TextDirection.RTL for it.
 *
 * If [isHtml] is true, the string is HTML or HTML-escaped. It defaults to
 * false, matching [estimateDirectionOfText], so that an omitted argument is
 * not forwarded as null into a boolean condition.
 */
bool detectRtlDirectionality(String str, [bool isHtml=false]) {
  return estimateDirectionOfText(str, isHtml) == TextDirection.RTL;
}
OLDNEW
« no previous file with comments | « lib/i18n/bidi_formatter.dart ('k') | lib/i18n/date_format.dart » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698