OLD | NEW |
| (Empty) |
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | |
2 // for details. All rights reserved. Use of this source code is governed by a | |
3 // BSD-style license that can be found in the LICENSE file. | |
4 | |
5 /** | |
6 * Bidi stands for Bi-directional text. | |
7 * According to http://en.wikipedia.org/wiki/Bi-directional_text: | |
8 * Bi-directional text is text containing text in both text directionalities, | |
9 * both right-to-left (RTL) and left-to-right (LTR). It generally involves text | |
10 * containing different types of alphabets, but may also refer to boustrophedon, | |
11 * which is changing text directionality in each row. | |
12 * | |
13 * This file provides some utility classes for determining directionality of | |
14 * text, switching CSS layout from LTR to RTL, and other normalizing utilities | |
15 * needed when switching between RTL and LTR formatting. | |
16 * | |
17 * It defines the TextDirection class, which is used to represent the | |
18 * directionality of text. | |
19 * In most cases, it is preferable to use bidi_formatter.dart, which provides | |
20 * bidi functionality in the given directional context, instead of using | |
21 * bidi_utils.dart directly. | |
22 */ | |
class TextDirection {
  /** Left-to-right directionality. */
  static final LTR = const TextDirection._('LTR', 'ltr');

  /** Right-to-left directionality. */
  static final RTL = const TextDirection._('RTL', 'rtl');

  /**
   * Directionality that could not be determined. When the context direction
   * is not used (or is itself unknown), the text falls back on the more
   * common ltr direction, which is why [spanText] is 'ltr' here.
   */
  static final UNKNOWN = const TextDirection._('UNKNOWN', 'ltr');

  /**
   * Name of the directionality constant: one of 'LTR', 'RTL', or 'UNKNOWN'.
   */
  final String value;

  /** Value to use for this directionality in a span tag. */
  final String spanText;

  /** Private constructor; the only instances are the constants above. */
  const TextDirection._(this.value, this.spanText);

  /**
   * Returns true only when [otherDirection] is a known direction (i.e. not
   * [UNKNOWN]) and it differs from this direction.
   */
  bool isDirectionChange(TextDirection otherDirection) {
    if (otherDirection == TextDirection.UNKNOWN) {
      // An unknown direction is never treated as a change.
      return false;
    }
    return this != otherDirection;
  }
}
50 | |
/** Unicode "Left-To-Right Embedding" (LRE) character. */
final LRE = '\u202A';

/** Unicode "Right-To-Left Embedding" (RLE) character. */
final RLE = '\u202B';

/** Unicode "Pop Directional Formatting" (PDF) character. */
final PDF = '\u202C';

/** Unicode "Left-To-Right Mark" (LRM) character. */
final LRM = '\u200E';

/** Unicode "Right-To-Left Mark" (RLM) character. */
final RLM = '\u200F';

/**
 * Threshold of RTL directionality: the fraction of strongly directional
 * words that must be exceeded by RTL words for text to be estimated as RTL.
 * Declared final so that it cannot be reassigned by accident.
 */
final num _RTL_DETECTION_THRESHOLD = 0.40;

/**
 * Practical patterns to identify strong LTR and RTL characters, respectively.
 * These patterns are not completely correct according to the Unicode
 * standard. They are simplified for performance and small code size.
 *
 * Both are raw strings holding the inside of a regular expression character
 * class (without the surrounding brackets), so the `\uXXXX` escapes are
 * interpreted by the RegExp engine rather than by the Dart string parser.
 */
final String _LTR_CHARS =
    @'A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02B8\u0300-\u0590'
    @'\u0800-\u1FFF\u2C00-\uFB1C\uFDFE-\uFE6F\uFEFD-\uFFFF';
final String _RTL_CHARS = @'\u0591-\u07FF\uFB1D-\uFDFD\uFE70-\uFEFC';
78 | |
/**
 * Returns [text] with every HTML tag and HTML escape replaced by a single
 * space, which is helpful for text directionality estimation.
 *
 * Note: This function should not be used in other contexts. It does not deal
 * well with many things: comments, script elements, style elements, the dir
 * attribute, `>` in quoted attribute values, etc. It does handle the most
 * common use cases well enough. Since the worst that can happen as a result
 * of these shortcomings is that the wrong directionality will be estimated,
 * we have not invested in improving this.
 */
String stripHtmlIfNeeded(String text) {
  // Deliberately simplified: either an opening/closing tag (`<...>`) or an
  // escape (`&...;`). Such spans are skipped when estimating directionality.
  final RegExp tagOrEscape = const RegExp(@'<[^>]*>|&[^;]+;');
  return text.replaceAll(tagOrEscape, ' ');
}
96 | |
/**
 * Returns true if the first strongly directional character in [text] is LTR.
 * If [isHtml] is true, [text] is treated as HTML or HTML-escaped and is
 * passed through [stripHtmlIfNeeded] before matching.
 */
bool startsWithLtr(String text, [isHtml=false]) {
  var candidate = text;
  if (isHtml) {
    candidate = stripHtmlIfNeeded(text);
  }
  // Any run of non-RTL characters from the start, followed by an LTR one.
  final RegExp leadingLtr = const RegExp('^[^$_RTL_CHARS]*[$_LTR_CHARS]');
  return leadingLtr.hasMatch(candidate);
}
105 | |
/**
 * Returns true if the first strongly directional character in [text] is RTL.
 * If [isHtml] is true, [text] is treated as HTML or HTML-escaped and is
 * passed through [stripHtmlIfNeeded] before matching.
 */
bool startsWithRtl(String text, [isHtml=false]) {
  var candidate = text;
  if (isHtml) {
    candidate = stripHtmlIfNeeded(text);
  }
  // Any run of non-LTR characters from the start, followed by an RTL one.
  final RegExp leadingRtl = const RegExp('^[^$_LTR_CHARS]*[$_RTL_CHARS]');
  return leadingRtl.hasMatch(candidate);
}
114 | |
/**
 * Returns true if the exit directionality of [text] (i.e. the last strongly
 * directional character in it) is LTR. If [isHtml] is true, [text] is treated
 * as HTML or HTML-escaped and is passed through [stripHtmlIfNeeded] first.
 */
bool endsWithLtr(String text, [isHtml=false]) {
  var candidate = text;
  if (isHtml) {
    candidate = stripHtmlIfNeeded(text);
  }
  // An LTR character followed only by non-RTL characters up to the end.
  final RegExp trailingLtr = const RegExp('[$_LTR_CHARS][^$_RTL_CHARS]*\$');
  return trailingLtr.hasMatch(candidate);
}
124 | |
/**
 * Returns true if the exit directionality of [text] (i.e. the last strongly
 * directional character in it) is RTL. If [isHtml] is true, [text] is treated
 * as HTML or HTML-escaped and is passed through [stripHtmlIfNeeded] first.
 */
bool endsWithRtl(String text, [isHtml=false]) {
  var candidate = text;
  if (isHtml) {
    candidate = stripHtmlIfNeeded(text);
  }
  // An RTL character followed only by non-LTR characters up to the end.
  final RegExp trailingRtl = const RegExp('[$_RTL_CHARS][^$_LTR_CHARS]*\$');
  return trailingRtl.hasMatch(candidate);
}
134 | |
/**
 * Returns true if [text] contains at least one LTR character.
 * If [isHtml] is true, [text] is treated as HTML or HTML-escaped and is
 * passed through [stripHtmlIfNeeded] before matching.
 */
bool hasAnyLtr(String text, [isHtml=false]) {
  var candidate = text;
  if (isHtml) {
    candidate = stripHtmlIfNeeded(text);
  }
  final RegExp anyLtr = const RegExp(@'[' '$_LTR_CHARS' @']');
  return anyLtr.hasMatch(candidate);
}
143 | |
/**
 * Returns true if [text] contains at least one RTL character.
 * If [isHtml] is true, [text] is treated as HTML or HTML-escaped and is
 * passed through [stripHtmlIfNeeded] before matching.
 */
bool hasAnyRtl(String text, [isHtml=false]) {
  var candidate = text;
  if (isHtml) {
    candidate = stripHtmlIfNeeded(text);
  }
  final RegExp anyRtl = const RegExp(@'[' '$_RTL_CHARS' @']');
  return anyRtl.hasMatch(candidate);
}
152 | |
/**
 * Check if a BCP 47 / III [languageString] indicates an RTL language.
 *
 * i.e. either:
 * - a language code explicitly specifying one of the right-to-left scripts,
 *   e.g. "az-Arab", or
 * - a language code specifying one of the languages normally written in a
 *   right-to-left script, e.g. "fa" (Farsi), except ones explicitly
 *   specifying Latin or Cyrillic script (which are the usual LTR
 *   alternatives).
 *
 * The list of right-to-left scripts appears in the 100-199 range in
 * http://www.unicode.org/iso15924/iso15924-num.html, of which Arabic and
 * Hebrew are by far the most widely used. We also recognize Thaana, N'Ko, and
 * Tifinagh, which also have significant modern usage. The rest (Syriac,
 * Samaritan, Mandaic, etc.) seem to have extremely limited or no modern usage
 * and are not recognized.
 * The languages usually written in a right-to-left script are taken as those
 * with Suppress-Script: Hebr|Arab|Thaa|Nkoo|Tfng in
 * http://www.iana.org/assignments/language-subtag-registry,
 * as well as Sindhi (sd) and Uyghur (ug).
 * The presence of other subtags of the language code, e.g. regions like EG
 * (Egypt), is ignored.
 *
 * Matching is case-insensitive, and both '-' and '_' are accepted as subtag
 * separators.
 */
bool isRtlLanguage(String languageString) {
  final RegExp rtlLocale = const RegExp(
      @'^(ar|dv|he|iw|fa|nqo|ps|sd|ug|ur|yi|.*[-_]'
      @'(Arab|Hebr|Thaa|Nkoo|Tfng))(?!.*[-_](Latn|Cyrl)($|-|_))'
      @'($|-|_)', ignoreCase : true);
  return rtlLocale.hasMatch(languageString);
}
182 | |
/**
 * Enforce RTL directionality on the [html] snippet regardless of overall
 * context. If the snippet is enclosed by a tag, the direction is applied to
 * that existing tag; otherwise a span tag is added as a wrapper. For this
 * reason, if the snippet starts with a tag, that tag must enclose the whole
 * piece. If the tag already has a direction specified, the new one will
 * override the existing one in behavior (should work on Chrome, FF, and IE
 * since this was ported directly from the Closure version).
 */
String enforceRtlInHtml(String html) => _enforceInHtmlHelper(html, 'rtl');
195 | |
/**
 * Enforce RTL on the given [text] by surrounding it with the Unicode BiDi
 * formatting characters: RLE in front and PDF behind.
 */
String enforceRtlInText(String text) => '${RLE}${text}${PDF}';
203 | |
/**
 * Enforce LTR directionality on the [html] snippet regardless of overall
 * context. If the snippet is enclosed by a tag, the direction is applied to
 * that existing tag; otherwise a span tag is added as a wrapper. For this
 * reason, if the snippet starts with a tag, that tag must enclose the whole
 * piece. If the tag already has a direction specified, the new one will
 * override the existing one in behavior (tested on FF and IE).
 */
String enforceLtrInHtml(String html) => _enforceInHtmlHelper(html, 'ltr');
215 | |
/**
 * Enforce LTR on the given [text] by surrounding it with the Unicode BiDi
 * formatting characters: LRE in front and PDF behind.
 */
String enforceLtrInText(String text) => '${LRE}${text}${PDF}';
223 | |
/**
 * Enforce the desired [direction] on the [html] snippet regardless of overall
 * context. If the snippet is enclosed by a tag, the direction is applied to
 * that existing tag; otherwise a span tag is added as a wrapper. For this
 * reason, if the snippet starts with a tag, that tag must enclose the whole
 * piece. If the tag already has a direction specified, the new one will
 * override the existing one in behavior (tested on FF and IE).
 *
 * When [html] starts with '<', ` dir=<direction>` is inserted right after
 * the first `<` + word-characters occurrence found anywhere in [html]; if
 * there is no such occurrence, [html] is returned unchanged.
 */
String _enforceInHtmlHelper(String html, String direction) {
  if (!html.startsWith('<')) {
    // '\n' is important for FF so that it won't incorrectly merge span groups.
    return '\n<span dir=$direction>$html</span>';
  }

  Match tagStart = const RegExp('<\\w+').firstMatch(html);
  StringBuffer result = new StringBuffer();
  var copiedUpTo = 0;
  if (tagStart != null) {
    // Copy everything through the tag name, then inject the dir attribute.
    copiedUpTo = tagStart.end();
    result.add(html.substring(0, copiedUpTo));
    result.add(' dir=$direction');
  }
  result.add(html.substring(copiedUpTo));
  return result.toString();
}
247 | |
/**
 * Apply bracket guard to [str] using html span tags. This is to address the
 * problem of messy bracket display that frequently happens in RTL layout.
 *
 * Every bracketed run matched in [str] -- `(...)`, `[...]`, `{...}` or
 * `<...>` -- is wrapped in a span carrying a dir attribute. If [isRtlContext]
 * is given, it decides the direction (true for rtl, false for ltr)
 * regardless of the estimated directionality; if it is omitted (null), rtl is
 * used exactly when [str] contains any RTL character.
 */
String guardBracketInHtml(String str, [bool isRtlContext]) {
  var useRtl;
  if (isRtlContext == null) {
    useRtl = hasAnyRtl(str);
  } else {
    useRtl = isRtlContext;
  }
  var dir = useRtl ? 'rtl' : 'ltr';
  return _guardBracketHelper(
      str,
      const RegExp(@'(\(.*?\)+)|(\[.*?\]+)|(\{.*?\}+)|(<.*?(>)+)'),
      '<span dir=$dir>',
      '</span>');
}
261 | |
/**
 * Apply bracket guard to [str] using LRM and RLM. This is to address the
 * problem of messy bracket display that frequently happens in RTL layout.
 * This version works for both plain text and html, but in some cases is not
 * as good as guardBracketInHtml.
 *
 * Every bracketed run matched in [str] -- `(...)`, `[...]`, `{...}` or
 * `<...>` -- gets the same mark placed before and after it. If
 * [isRtlContext] is given, it decides the mark (true for RLM, false for LRM)
 * regardless of the estimated directionality; if it is omitted (null), RLM
 * is used exactly when [str] contains any RTL character.
 */
String guardBracketInText(String str, [bool isRtlContext]) {
  var useRtl;
  if (isRtlContext == null) {
    useRtl = hasAnyRtl(str);
  } else {
    useRtl = isRtlContext;
  }
  var mark = LRM;
  if (useRtl) {
    mark = RLM;
  }
  RegExp brackets =
      const RegExp(@'(\(.*?\)+)|(\[.*?\]+)|(\{.*?\}+)|(<.*?>+)');
  return _guardBracketHelper(str, brackets, mark, mark);
}
276 | |
/**
 * (Mostly) reimplements the $& functionality of "replace" in JavaScript.
 * Given a [str] and the [regexp] to match with, surrounds every match with
 * the string [before] (inserted in front of it) and the string [after]
 * (inserted behind it); the matched text itself is kept. For example,
 * `_guardBracketHelper('firetruck', const RegExp('truck'), 'hydrant', '!')`
 * returns 'firehydranttruck!'.
 */
// TODO(efortuna): Get rid of this once this is implemented in Dart.
// See Issue 2979.
String _guardBracketHelper(String str, RegExp regexp, [String before,
    String after]) {
  StringBuffer out = new StringBuffer();
  // Index in [str] up to which text has already been copied to [out].
  var cursor = 0;
  for (Match m in regexp.allMatches(str)) {
    var matchStart = m.start();
    var matchEnd = m.end();
    out.add(str.substring(cursor, matchStart));
    out.add(before);
    out.add(str.substring(matchStart, matchEnd));
    out.add(after);
    cursor = matchEnd;
  }
  // Copy whatever follows the last match (or all of [str] if none matched).
  out.add(str.substring(cursor));
  return out.toString();
}
298 | |
/**
 * Estimates the directionality of [text] using the best known
 * general-purpose method (using relative word counts). A
 * TextDirection.UNKNOWN return value indicates completely neutral input.
 * [isHtml] is true if [text] is HTML or HTML-escaped, in which case it is
 * passed through [stripHtmlIfNeeded] first.
 *
 * If the number of RTL words is above a certain percentage of the total
 * number of strongly directional words, returns RTL.
 * Otherwise, if any words are strongly or weakly LTR, returns LTR.
 * Otherwise, returns UNKNOWN, which is used to mean `neutral`.
 * Numbers and URLs are counted as weakly LTR; only tokens starting with
 * `http://` are recognized as URLs.
 */
TextDirection estimateDirectionOfText(String text, [bool isHtml=false]) {
  if (isHtml) {
    text = stripHtmlIfNeeded(text);
  }

  final RegExp whitespace = const RegExp(@'\s+');
  final RegExp urlPrefix = const RegExp(@'^http://');
  final RegExp digit = const RegExp(@'\d');

  var rtlWords = 0;
  var strongWords = 0;
  var sawWeakLtr = false;

  // Split the string into 'words' and classify each one; the estimate is
  // based on relative word counts.
  for (String word in text.split(whitespace)) {
    if (startsWithRtl(word)) {
      rtlWords++;
      strongWords++;
      continue;
    }
    if (urlPrefix.hasMatch(word)) {
      // Something that must always be LTR even in RTL text, such as a URL.
      sawWeakLtr = true;
      continue;
    }
    if (hasAnyLtr(word)) {
      strongWords++;
      continue;
    }
    if (digit.hasMatch(word)) {
      // The word contains a numeral.
      sawWeakLtr = true;
    }
  }

  if (strongWords == 0) {
    return sawWeakLtr ? TextDirection.LTR : TextDirection.UNKNOWN;
  }
  if (rtlWords > _RTL_DETECTION_THRESHOLD * strongWords) {
    return TextDirection.RTL;
  }
  return TextDirection.LTR;
}
342 | |
/**
 * Locates the first closing parenthesis in [str] that does not match an
 * earlier opening parenthesis.
 *
 * Returns the index just past that parenthesis (its position plus one), or
 * -1 if [str] contains no unmatched closing parenthesis (including when
 * [str] is empty).
 *
 * NOTE(review): the previous comment described the result as the index of
 * the parenthesis itself, but the code has always returned one past it. That
 * return value is kept for compatibility; confirm against callers.
 */
int _unmatchedParenIndex(String str) {
  final int openParen = '('.charCodeAt(0);
  final int closeParen = ')'.charCodeAt(0);
  // Number of '(' seen so far that have not yet been closed.
  int depth = 0;
  // The scan is bounded by the string length. The earlier loop condition
  // (`sum >= 0 || index > str.length`) never stopped at the end of the
  // string, so input without an unmatched ')' read past the end and threw.
  for (int index = 0; index < str.length; index++) {
    int char = str.charCodeAt(index);
    if (char == openParen) {
      depth++;
    } else if (char == closeParen) {
      depth--;
      if (depth < 0) {
        return index + 1;
      }
    }
  }
  return -1;
}
358 | |
/**
 * Replace a double or single quote that directly follows a Hebrew character
 * in [str] with GERSHAYIM (U+05F4) or GERESH (U+05F3) respectively. This is
 * most likely the user's intention.
 *
 * All other characters, including a quote at the very start of [str], are
 * copied unchanged.
 */
String normalizeHebrewQuote(String str) {
  final RegExp hebrewChar = const RegExp('[\u0591-\u05f2]');
  StringBuffer result = new StringBuffer();
  for (int i = 0; i < str.length; i++) {
    String current = str.substring(i, i + 1);
    // The first character has no predecessor, so it can never be replaced.
    bool afterHebrew = i > 0 && hebrewChar.hasMatch(str.substring(i - 1, i));
    if (afterHebrew && current == '"') {
      result.add('\u05f4');
    } else if (afterHebrew && current == "'") {
      result.add('\u05f3');
    } else {
      result.add(current);
    }
  }
  return result.toString();
}
383 | |
/**
 * Check the estimated directionality of [str]; returns true if the piece of
 * text should be laid out in RTL direction, i.e. if
 * [estimateDirectionOfText] yields TextDirection.RTL for it.
 *
 * If [isHtml] is true, the string is HTML or HTML-escaped. It defaults to
 * false, so that omitting it does not forward null into the boolean test
 * performed by [estimateDirectionOfText].
 */
bool detectRtlDirectionality(String str, [bool isHtml=false]) {
  return estimateDirectionOfText(str, isHtml) == TextDirection.RTL;
}
OLD | NEW |