lib/i18n/bidi_utils.dart - Issue 10827227: Start moving non-platform libraries from lib/ to pkg/ .

Side by Side Diff: lib/i18n/bidi_utils.dart

Issue 10827227: Start moving non-platform libraries from lib/ to pkg/ . (Closed) Base URL: http://dart.googlecode.com/svn/branches/bleeding_edge/dart/

Patch Set: Start moving non-platform libraries from lib/ to pkg/ Created 8 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.

4

/**
 * Bidi stands for Bi-directional text.
 * According to http://en.wikipedia.org/wiki/Bi-directional_text:
 * Bi-directional text is text containing text in both text directionalities,
 * both right-to-left (RTL) and left-to-right (LTR). It generally involves text
 * containing different types of alphabets, but may also refer to boustrophedon,
 * which is changing text directionality in each row.
 *
 * This file provides some utility classes for determining directionality of
 * text, switching CSS layout from LTR to RTL, and other normalizing utilities
 * needed when switching between RTL and LTR formatting.
 *
 * It defines the TextDirection class, which is used to represent the
 * directionality of text.
 * In most cases, it is preferable to use bidi_formatter.dart, which provides
 * bidi functionality in the given directional context, instead of using
 * bidi_utils.dart directly.
 */
class TextDirection {
  /** Left-to-right text. */
  static final TextDirection LTR = const TextDirection._('LTR', 'ltr');

  /** Right-to-left text. */
  static final TextDirection RTL = const TextDirection._('RTL', 'rtl');

  /**
   * Direction could not be determined. Its [spanText] is 'ltr': when neither
   * the text nor the context yields a direction, the more common LTR
   * direction is used as the fallback.
   */
  static final TextDirection UNKNOWN = const TextDirection._('UNKNOWN', 'ltr');

  /** Name of the direction constant: 'LTR', 'RTL', or 'UNKNOWN'. */
  final String value;

  /** The direction as written in a span tag's dir attribute. */
  final String spanText;

  const TextDirection._(this.value, this.spanText);

  /**
   * Returns true if [otherDirection] is known (not [UNKNOWN]) and differs
   * from this direction.
   */
  bool isDirectionChange(TextDirection otherDirection) {
    // An unknown direction is never reported as a change.
    if (otherDirection == TextDirection.UNKNOWN) return false;
    return this != otherDirection;
  }
}

50

/** Unicode "Left-To-Right Embedding" (LRE) character. */
const String LRE = '\u202A';

/** Unicode "Right-To-Left Embedding" (RLE) character. */
const String RLE = '\u202B';

/** Unicode "Pop Directional Formatting" (PDF) character. */
const String PDF = '\u202C';

/** Unicode "Left-To-Right Mark" (LRM) character. */
const String LRM = '\u200E';

/** Unicode "Right-To-Left Mark" (RLM) character. */
const String RLM = '\u200F';

/**
 * Threshold of RTL directionality: estimateDirectionOfText reports RTL when
 * the count of RTL words exceeds this fraction of the counted words.
 * Declared const so it cannot be reassigned at runtime.
 */
const num _RTL_DETECTION_THRESHOLD = 0.40;

/**
 * Practical patterns to identify strong LTR and RTL characters, respectively.
 * These patterns are not completely correct according to the Unicode
 * standard. They are simplified for performance and small code size.
 *
 * Both are raw strings holding the *contents* of a regular-expression
 * character class (no surrounding brackets); the `\uXXXX` escapes are
 * interpreted by the RegExp engine, not by the Dart string parser.
 */
const String _LTR_CHARS =
    r'A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02B8\u0300-\u0590'
    r'\u0800-\u1FFF\u2C00-\uFB1C\uFDFE-\uFE6F\uFEFD-\uFFFF';
const String _RTL_CHARS = r'\u0591-\u07FF\uFB1D-\uFDFD\uFE70-\uFEFC';

78

/**
 * Returns the input [text] with spaces instead of HTML tags or HTML escapes,
 * which is helpful for text directionality estimation.
 * Note: This function should not be used in other contexts.
 * It does not deal well with many things: comments, script elements,
 * style elements, dir attribute, `>` in quoted attribute values, etc.
 * But it does handle well enough the most common use cases.
 * Since the worst that can happen as a result of these shortcomings is that
 * the wrong directionality will be estimated, we have not invested in
 * improving this.
 */
String stripHtmlIfNeeded(String text) {
  // Simplified pattern for an HTML tag (opening or closing) OR an HTML escape
  // such as `&amp;`. The `|` must be a real alternation: written as `\|` it
  // would only match a literal pipe character and nothing would be stripped.
  return text.replaceAll(RegExp(r'<[^>]*>|&[^;]+;'), ' ');
}

96

/**
 * Determines if the first character in [text] with strong directionality is
 * LTR. If [isHtml] is true, the text is HTML or HTML-escaped and is passed
 * through stripHtmlIfNeeded before matching.
 */
bool startsWithLtr(String text, [bool isHtml = false]) {
  final candidate = isHtml ? stripHtmlIfNeeded(text) : text;
  // From the start: any run of non-RTL characters, then one strong LTR
  // character. RegExp has no const constructor, so it is built at call time.
  return RegExp('^[^$_RTL_CHARS]*[$_LTR_CHARS]').hasMatch(candidate);
}

105

/**
 * Determines if the first character in [text] with strong directionality is
 * RTL. If [isHtml] is true, the text is HTML or HTML-escaped and is passed
 * through stripHtmlIfNeeded before matching.
 */
bool startsWithRtl(String text, [bool isHtml = false]) {
  final candidate = isHtml ? stripHtmlIfNeeded(text) : text;
  // From the start: any run of non-LTR characters, then one strong RTL
  // character. RegExp has no const constructor, so it is built at call time.
  return RegExp('^[^$_LTR_CHARS]*[$_RTL_CHARS]').hasMatch(candidate);
}

114

/**
 * Determines if the exit directionality (i.e., the last strongly-directional
 * character in [text]) is LTR. If [isHtml] is true, the text is HTML or
 * HTML-escaped and is passed through stripHtmlIfNeeded before matching.
 */
bool endsWithLtr(String text, [bool isHtml = false]) {
  final candidate = isHtml ? stripHtmlIfNeeded(text) : text;
  // One strong LTR character followed only by non-RTL characters up to the
  // end of the string. `\$` is an escaped dollar sign, i.e. the regex anchor.
  return RegExp('[$_LTR_CHARS][^$_RTL_CHARS]*\$').hasMatch(candidate);
}

124

/**
 * Determines if the exit directionality (i.e., the last strongly-directional
 * character in [text]) is RTL. If [isHtml] is true, the text is HTML or
 * HTML-escaped and is passed through stripHtmlIfNeeded before matching.
 */
bool endsWithRtl(String text, [bool isHtml = false]) {
  final candidate = isHtml ? stripHtmlIfNeeded(text) : text;
  // One strong RTL character followed only by non-LTR characters up to the
  // end of the string. `\$` is an escaped dollar sign, i.e. the regex anchor.
  return RegExp('[$_RTL_CHARS][^$_LTR_CHARS]*\$').hasMatch(candidate);
}

134

/**
 * Determines if the given [text] has any LTR characters in it.
 * If [isHtml] is true, the text is HTML or HTML-escaped and is passed
 * through stripHtmlIfNeeded before matching.
 */
bool hasAnyLtr(String text, [bool isHtml = false]) {
  final candidate = isHtml ? stripHtmlIfNeeded(text) : text;
  // A single character class built from the LTR ranges.
  return RegExp('[$_LTR_CHARS]').hasMatch(candidate);
}

143

/**
 * Determines if the given [text] has any RTL characters in it.
 * If [isHtml] is true, the text is HTML or HTML-escaped and is passed
 * through stripHtmlIfNeeded before matching.
 */
bool hasAnyRtl(String text, [bool isHtml = false]) {
  final candidate = isHtml ? stripHtmlIfNeeded(text) : text;
  // A single character class built from the RTL ranges.
  return RegExp('[$_RTL_CHARS]').hasMatch(candidate);
}

152

/**
 * Check if a BCP 47 / III [languageString] indicates an RTL language.
 *
 * i.e. either:
 * - a language code explicitly specifying one of the right-to-left scripts,
 *   e.g. "az-Arab", or
 * - a language code specifying one of the languages normally written in a
 *   right-to-left script, e.g. "fa" (Farsi), except ones explicitly
 *   specifying Latin or Cyrillic script (which are the usual LTR
 *   alternatives).
 *
 * The list of right-to-left scripts appears in the 100-199 range in
 * http://www.unicode.org/iso15924/iso15924-num.html, of which Arabic and
 * Hebrew are by far the most widely used. We also recognize Thaana, N'Ko, and
 * Tifinagh, which also have significant modern usage. The rest (Syriac,
 * Samaritan, Mandaic, etc.) seem to have extremely limited or no modern usage
 * and are not recognized.
 * The languages usually written in a right-to-left script are taken as those
 * with Suppress-Script: Hebr|Arab|Thaa|Nkoo|Tfng in
 * http://www.iana.org/assignments/language-subtag-registry,
 * as well as Sindhi (sd) and Uyghur (ug).
 * The presence of other subtags of the language code, e.g. regions like EG
 * (Egypt), is ignored.
 *
 * Matching is case-insensitive, and both `-` and `_` are accepted as subtag
 * separators.
 */
bool isRtlLanguage(String languageString) {
  // Structure of the pattern:
  //   1. an RTL language code, or anything ending in an RTL script subtag;
  //   2. a negative lookahead rejecting a later Latn/Cyrl script subtag;
  //   3. the matched code must end at a subtag boundary (end, `-`, or `_`).
  return RegExp(
          r'^(ar|dv|he|iw|fa|nqo|ps|sd|ug|ur|yi|.*[-_]'
          r'(Arab|Hebr|Thaa|Nkoo|Tfng))(?!.*[-_](Latn|Cyrl)($|-|_))'
          r'($|-|_)',
          caseSensitive: false)
      .hasMatch(languageString);
}

182

/**
 * Forces the [html] snippet into RTL directionality regardless of overall
 * context. If the snippet is enclosed by a tag, the direction is applied to
 * that existing tag; otherwise a span tag is added as a wrapper.
 * For this reason, if the snippet starts with a tag, this tag must enclose
 * the whole piece. If the tag already has a direction specified, the new one
 * will override the existing one in behavior (should work on Chrome, FF, and
 * IE since this was ported directly from the Closure version).
 */
String enforceRtlInHtml(String html) => _enforceInHtmlHelper(html, 'rtl');

195

/**
 * Forces RTL on the given [text] by prefixing it with the Unicode BiDi
 * formatting character RLE and suffixing it with PDF.
 */
String enforceRtlInText(String text) => '$RLE$text$PDF';

203

/**
 * Forces the [html] snippet into LTR directionality regardless of overall
 * context. If the snippet is enclosed by a tag, the direction is applied to
 * that existing tag; otherwise a span tag is added as a wrapper.
 * For this reason, if the snippet starts with a tag, this tag must enclose
 * the whole piece. If the tag already has a direction specified, the new one
 * will override the existing one in behavior (tested on FF and IE).
 */
String enforceLtrInHtml(String html) => _enforceInHtmlHelper(html, 'ltr');

215

/**
 * Forces LTR on the given [text] by prefixing it with the Unicode BiDi
 * formatting character LRE and suffixing it with PDF.
 */
String enforceLtrInText(String text) => '$LRE$text$PDF';

223

/**
 * Enforce the [html] snippet in the desired [direction] regardless of overall
 * context. If the html piece was enclosed by a tag, the direction will be
 * applied to existing tag, otherwise a span tag will be added as wrapper.
 * For this reason, if the html snippet starts with a tag, this tag must
 * enclose the whole piece. If the tag already has a direction specified, this
 * new one will override existing one in behavior (tested on FF and IE).
 *
 * [direction] is inserted verbatim as the value of an unquoted `dir`
 * attribute. When [html] starts with `<` but contains no `<name` sequence,
 * it is returned unchanged.
 */
String _enforceInHtmlHelper(String html, String direction) {
  if (html.startsWith('<')) {
    // Locate the first `<tagname` in the snippet; the attribute is inserted
    // immediately after the tag name.
    final match = RegExp(r'<\w+').firstMatch(html);
    if (match == null) return html;
    final tagNameEnd = match.end;
    return '${html.substring(0, tagNameEnd)} dir=$direction'
        '${html.substring(tagNameEnd)}';
  }
  // '\n' is important for FF so that it won't incorrectly merge span groups.
  return '\n<span dir=$direction>$html</span>';
}

247

/**
 * Apply bracket guard to [str] using html span tag. This is to address the
 * problem of messy bracket display that frequently happens in RTL layout.
 * If [isRtlContext] is true, then we explicitly want to wrap in a span of RTL
 * directionality, regardless of the estimated directionality; if it is false
 * an LTR span is used. When it is omitted, RTL is used if [str] contains any
 * RTL character (see hasAnyRtl), otherwise LTR.
 */
String guardBracketInHtml(String str, [bool? isRtlContext]) {
  final useRtl = isRtlContext ?? hasAnyRtl(str);
  // Bracketed runs: (...), [...], {...} and HTML-escaped angle brackets
  // &lt;...&gt;, each allowing repeated closing brackets.
  // NOTE(review): this pattern was reconstructed from a garbled copy; the
  // escaped-entity form of the last alternative follows the Closure original
  // and should be confirmed against the upstream source.
  final matchingBrackets =
      RegExp(r'(\(.*?\)+)|(\[.*?\]+)|(\{.*?\}+)|(&lt;.*?(&gt;)+)');
  return _guardBracketHelper(str, matchingBrackets,
      '<span dir=${useRtl ? "rtl" : "ltr"}>', '</span>');
}

261

/**
 * Apply bracket guard to [str] using LRM and RLM. This is to address the
 * problem of messy bracket display that frequently happens in RTL layout.
 * This version works for both plain text and html, but in some cases is not
 * as good as guardBracketInHtml.
 * If [isRtlContext] is true, each bracketed run is surrounded by RLM marks,
 * regardless of the estimated directionality; if it is false LRM marks are
 * used. When it is omitted, RLM is used if [str] contains any RTL character
 * (see hasAnyRtl), otherwise LRM.
 */
String guardBracketInText(String str, [bool? isRtlContext]) {
  final useRtl = isRtlContext ?? hasAnyRtl(str);
  final mark = useRtl ? RLM : LRM;
  // Bracketed runs: (...), [...], {...} and <...>, each allowing repeated
  // closing brackets. The same mark is placed before and after every run.
  final matchingBrackets = RegExp(r'(\(.*?\)+)|(\[.*?\]+)|(\{.*?\}+)|(<.*?>+)');
  return _guardBracketHelper(str, matchingBrackets, mark, mark);
}

276

/**
 * (Mostly) reimplements the $& functionality of "replace" in JavaScript.
 * Given a [str] and the [regexp] to match with, optionally supply a string to
 * be inserted [before] the match and/or [after]; the matched text itself is
 * kept. For example,
 * `_guardBracketHelper('firetruck', RegExp('truck'), 'hydrant', '!')`
 * would return 'firehydranttruck!'.
 *
 * [before] and [after] default to the empty string, so omitting them leaves
 * [str] unchanged.
 */
String _guardBracketHelper(String str, RegExp regexp,
    [String before = '', String after = '']) {
  // replaceAllMapped visits every non-overlapping match and copies the text
  // between matches through untouched.
  return str.replaceAllMapped(
      regexp, (Match match) => '$before${match[0]}$after');
}

298

/**
 * Estimates the directionality of [text] using the best known
 * general-purpose method (using relative word counts). A
 * TextDirection.UNKNOWN return value indicates completely neutral input.
 * [isHtml] is true if [text] is HTML or HTML-escaped.
 *
 * If the number of RTL words is above a certain percentage of the total
 * number of strongly directional words, returns RTL.
 * Otherwise, if any words are strongly or weakly LTR, returns LTR.
 * Otherwise, returns UNKNOWN, which is used to mean `neutral`.
 * Numbers and URLs are counted as weakly LTR.
 */
TextDirection estimateDirectionOfText(String text, [bool isHtml = false]) {
  final content = isHtml ? stripHtmlIfNeeded(text) : text;
  // Patterns are built once per call rather than once per token.
  final whitespace = RegExp(r'\s+');
  final urlPrefix = RegExp(r'^http://');
  final digit = RegExp(r'\d');

  var rtlCount = 0;
  var total = 0;
  var hasWeaklyLtr = false;
  // Split a string into 'words' for directionality estimation based on
  // relative word counts.
  for (final token in content.split(whitespace)) {
    if (startsWithRtl(token)) {
      rtlCount++;
      total++;
    } else if (urlPrefix.hasMatch(token)) {
      // Token looks like something that must always be LTR even in RTL text,
      // such as a URL. It is not counted as a word.
      // NOTE(review): only the `http://` prefix is recognised here; an
      // `https://` token falls through to the next branch and is counted as
      // an ordinary LTR word.
      hasWeaklyLtr = true;
    } else if (hasAnyLtr(token)) {
      total++;
    } else if (digit.hasMatch(token)) {
      // Token contains a numeral but no strongly directional character.
      hasWeaklyLtr = true;
    }
  }

  if (total == 0) {
    return hasWeaklyLtr ? TextDirection.LTR : TextDirection.UNKNOWN;
  } else if (rtlCount > _RTL_DETECTION_THRESHOLD * total) {
    return TextDirection.RTL;
  } else {
    return TextDirection.LTR;
  }
}

342

/**
 * Scans [str] for the first closing parenthesis that does not match an
 * opening parenthesis.
 *
 * Returns the index just past that parenthesis (its position plus one), or
 * `str.length` when every closing parenthesis is matched.
 * NOTE(review): the original description speaks of the index *of* the
 * parenthesis, while the code has always returned one past it; the existing
 * return value is preserved here. Confirm which one callers expect.
 */
int _unmatchedParenIndex(String str) {
  const openParen = 0x28; // '('
  const closeParen = 0x29; // ')'
  var depth = 0;
  var index = 0;
  // Stop as soon as depth goes negative (unmatched ')') OR the string is
  // exhausted; without the bounds check the scan reads past the end.
  while (depth >= 0 && index < str.length) {
    final unit = str.codeUnitAt(index);
    if (unit == openParen) {
      depth++;
    } else if (unit == closeParen) {
      depth--;
    }
    index++;
  }
  return index;
}

358

/**
 * Replace the double and single quote directly after a Hebrew character in
 * [str] with GERSHAYIM (U+05F4) and GERESH (U+05F3) respectively. This is
 * most likely the user's intention.
 *
 * A quote at index 0 is never replaced, because nothing precedes it.
 */
String normalizeHebrewQuote(String str) {
  if (str.isEmpty) return '';
  const doubleQuote = 0x22; // '"'
  const singleQuote = 0x27; // "'"
  final buf = StringBuffer(str[0]);
  // Start at 1 because we're looking for the patterns [\u0591-\u05f2]" or
  // [\u0591-\u05f2]'.
  for (var i = 1; i < str.length; i++) {
    final previous = str.codeUnitAt(i - 1);
    final followsHebrew = previous >= 0x0591 && previous <= 0x05f2;
    final current = str.codeUnitAt(i);
    if (followsHebrew && current == doubleQuote) {
      buf.write('\u05f4');
    } else if (followsHebrew && current == singleQuote) {
      buf.write('\u05f3');
    } else {
      buf.write(str[i]);
    }
  }
  return buf.toString();
}

383

/**
 * Check the estimated directionality of [str], return true if the piece of
 * text should be laid out in RTL direction. If [isHtml] is true, the string
 * is HTML or HTML-escaped.
 *
 * [isHtml] defaults to false; without a default, an omitted argument would be
 * forwarded as null into a boolean condition in estimateDirectionOfText.
 */
bool detectRtlDirectionality(String str, [bool isHtml = false]) {
  return estimateDirectionOfText(str, isHtml) == TextDirection.RTL;
}

OLD	NEW

« no previous file with comments | « lib/i18n/bidi_formatter.dart ('k') | lib/i18n/date_format.dart » ('j') | no next file with comments »