| Index: icu51/source/data/brkitr/word_POSIX.txt
|
| ===================================================================
|
| --- icu51/source/data/brkitr/word_POSIX.txt (revision 0)
|
| +++ icu51/source/data/brkitr/word_POSIX.txt (revision 0)
|
| @@ -0,0 +1,244 @@
|
| +#
|
| +# Copyright (C) 2002-2013, International Business Machines Corporation
|
| +# and others. All Rights Reserved.
|
| +#
|
| +# file: word_POSIX.txt
|
| +#
|
| +# ICU Word Break Rules, POSIX locale.
|
| +# See Unicode Standard Annex #29.
|
| +# These rules are based on UAX #29 Revision 20 for Unicode Version 6.2
|
| +#
|
| +# Note: Updates to word.txt will usually need to be merged into
|
| +# word_POSIX.txt also.
|
| +
|
| +##############################################################################
|
| +#
|
| +# Character class definitions from TR 29
|
| +#
|
| +##############################################################################
|
| +
|
| +!!chain;
|
| +
|
| +
|
| +#
|
| +# Character Class Definitions.
|
| +#
|
| +
|
| +$CR = [\p{Word_Break = CR}];
|
| +$LF = [\p{Word_Break = LF}];
|
| +$Newline = [\p{Word_Break = Newline}];
|
| +$Extend = [\p{Word_Break = Extend}];
|
| +$Format = [\p{Word_Break = Format}];
|
| +$Hiragana = [:Hiragana:];
|
| +$Katakana = [\p{Word_Break = Katakana}];
|
| +$Han = [:Han:];
|
| +$ALetter = [\p{Word_Break = ALetter}];
|
| +$MidNumLet = [\p{Word_Break = MidNumLet} - [.]]; # '.' is taken out of this class and added to $MidNum below
|
| +$MidLetter = [\p{Word_Break = MidLetter} - [\:]]; # ':' is taken out, so rules 6/7 get no MidLetter match on it
|
| +$MidNum = [\p{Word_Break = MidNum} [.]]; # '.' joins this class, so it is matched via the numeric rules 11/12
|
| +$Numeric = [\p{Word_Break = Numeric}];
|
| +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
| +$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
|
| +
|
| +
|
| +# Dictionary character set, for triggering language-based break engines. Currently
|
| +# limited to LineBreak=Complex_Context plus $dictionaryCJK. Note that this set only works in Unicode
|
| +# 5.0 or later as the definition of Complex_Context was corrected to include all
|
| +# characters requiring dictionary break.
|
| +
|
| +$Control = [\p{Grapheme_Cluster_Break = Control}];
|
| +$HangulSyllable = [\uac00-\ud7a3];
|
| +$ComplexContext = [:LineBreak = Complex_Context:];
|
| +$KanaKanji = [$Han $Hiragana $Katakana];
|
| +$dictionaryCJK = [$KanaKanji $HangulSyllable];
|
| +$dictionary = [$ComplexContext $dictionaryCJK];
|
| +
|
| +# leave CJK scripts out of ALetterPlus
|
| +$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
|
| +
|
| +
|
| +#
|
| +# Rule 4: Ignore Format and Extend characters,
|
| +# except when they appear at the beginning of a region of text.
|
| +#
|
| +# TODO: check if handling of katakana in dictionary makes rules incorrect/void
|
| +$KatakanaEx = $Katakana ($Extend | $Format)*;
|
| +$ALetterEx = $ALetterPlus ($Extend | $Format)*;
|
| +$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
|
| +$MidLetterEx = $MidLetter ($Extend | $Format)*;
|
| +$MidNumEx = $MidNum ($Extend | $Format)*;
|
| +$NumericEx = $Numeric ($Extend | $Format)*;
|
| +$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
|
| +$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format)*;
|
| +
|
| +$Ideographic = [\p{Ideographic}];
|
| +$HiraganaEx = $Hiragana ($Extend | $Format)*;
|
| +$IdeographicEx = $Ideographic ($Extend | $Format)*;
|
| +
|
| +## -------------------------------------------------
|
| +
|
| +!!forward;
|
| +
|
| +
|
| +# Rule 3 - CR x LF
|
| +#
|
| +$CR $LF;
|
| +
|
| +# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
|
| +# of a region of Text. The rule here comes into play when the start of text
|
| +# begins with a group of Format chars, or with a "word" consisting of a single
|
| +# char that is not in any of the listed word break categories followed by
|
| +# format char(s), or is not a CJK dictionary character.
|
| +[^$CR $LF $Newline]? ($Extend | $Format)+;
|
| +
|
| +$NumericEx {100};
|
| +$ALetterEx {200};
|
| +$HangulSyllable {200};
|
| +$KatakanaEx {400}; # note: these status values override those from rule 5
|
| +$HiraganaEx {400}; # by virtue of being numerically larger.
|
| +$IdeographicEx {400}; #
|
| +
|
| +#
|
| +# rule 5
|
| +# Do not break between most letters.
|
| +#
|
| +$ALetterEx $ALetterEx {200};
|
| +
|
| +# rule 6 and 7
|
| +$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
|
| +
|
| +# rule 8
|
| +
|
| +$NumericEx $NumericEx {100};
|
| +
|
| +# rule 9
|
| +
|
| +$ALetterEx $NumericEx {200};
|
| +
|
| +# rule 10
|
| +
|
| +$NumericEx $ALetterEx {200};
|
| +
|
| +# rule 11 and 12
|
| +
|
| +$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
|
| +
|
| +# rule 13
|
| +# to be consistent with $KanaKanji $KanaKanji, changed
|
| +# from 300 to 400.
|
| +# See also TestRuleStatus in intltest/rbbiapts.cpp
|
| +$KatakanaEx $KatakanaEx {400};
|
| +
|
| +# rule 13a/b
|
| +
|
| +$ALetterEx $ExtendNumLetEx {200}; # (13a)
|
| +$NumericEx $ExtendNumLetEx {100}; # (13a)
|
| +$KatakanaEx $ExtendNumLetEx {400}; # (13a)
|
| +$ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)
|
| +
|
| +$ExtendNumLetEx $ALetterEx {200}; # (13b)
|
| +$ExtendNumLetEx $NumericEx {100}; # (13b)
|
| +$ExtendNumLetEx $KatakanaEx {400}; # (13b)
|
| +
|
| +# rule 13c
|
| +
|
| +$Regional_IndicatorEx $Regional_IndicatorEx;
|
| +
|
| +# special handling for CJK characters: chain for later dictionary segmentation
|
| +$HangulSyllable $HangulSyllable {200};
|
| +$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
|
| +
|
| +
|
| +## -------------------------------------------------
|
| +
|
| +!!reverse;
|
| +
|
| +$BackALetterEx = ($Format | $Extend)* $ALetterPlus;
|
| +$BackMidNumLetEx = ($Format | $Extend)* $MidNumLet;
|
| +$BackNumericEx = ($Format | $Extend)* $Numeric;
|
| +$BackMidNumEx = ($Format | $Extend)* $MidNum;
|
| +$BackMidLetterEx = ($Format | $Extend)* $MidLetter;
|
| +$BackKatakanaEx = ($Format | $Extend)* $Katakana;
|
| +$BackHiraganaEx = ($Format | $Extend)* $Hiragana;
|
| +$BackExtendNumLetEx = ($Format | $Extend)* $ExtendNumLet;
|
| +$BackRegional_IndicatorEx = ($Format | $Extend)* $Regional_Indicator;
|
| +
|
| +# rule 3
|
| +$LF $CR;
|
| +
|
| +# rule 4
|
| +($Format | $Extend)* [^$CR $LF $Newline]?;
|
| +
|
| +# rule 5
|
| +
|
| +$BackALetterEx $BackALetterEx;
|
| +
|
| +# rule 6 and 7
|
| +
|
| +$BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx;
|
| +
|
| +
|
| +# rule 8
|
| +
|
| +$BackNumericEx $BackNumericEx;
|
| +
|
| +# rule 9
|
| +
|
| +$BackNumericEx $BackALetterEx;
|
| +
|
| +# rule 10
|
| +
|
| +$BackALetterEx $BackNumericEx;
|
| +
|
| +# rule 11 and 12
|
| +
|
| +$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx) $BackNumericEx;
|
| +
|
| +# rule 13
|
| +
|
| +$BackKatakanaEx $BackKatakanaEx;
|
| +
|
| +# rules 13 a/b
|
| +#
|
| +$BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
|
| +($BackALetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;
|
| +
|
| +# rule 13c
|
| +
|
| +$BackRegional_IndicatorEx $BackRegional_IndicatorEx;
|
| +
|
| +# special handling for CJK characters: chain for later dictionary segmentation
|
| +$HangulSyllable $HangulSyllable;
|
| +$KanaKanji $KanaKanji; #different rule status if both kanji and kana found
|
| +
|
| +## -------------------------------------------------
|
| +
|
| +!!safe_reverse;
|
| +
|
| +# rule 4
|
| +($Extend | $Format)+ .?;
|
| +
|
| +# rule 6
|
| +($MidLetter | $MidNumLet) $BackALetterEx;
|
| +
|
| +# rule 11
|
| +($MidNum | $MidNumLet) $BackNumericEx;
|
| +
|
| +# For dictionary-based break
|
| +$dictionary $dictionary;
|
| +
|
| +## -------------------------------------------------
|
| +
|
| +!!safe_forward;
|
| +
|
| +# rule 4
|
| +($Extend | $Format)+ .?;
|
| +
|
| +# rule 6
|
| +($MidLetterEx | $MidNumLetEx) $ALetterEx;
|
| +
|
| +# rule 11
|
| +($MidNumEx | $MidNumLetEx) $NumericEx;
|
| +
|
| +# For dictionary-based break
|
| +$dictionary $dictionary;
|
|
|
| Property changes on: icu51/source/data/brkitr/word_POSIX.txt
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + LF
|
| Added: svn:mime-type
|
| + text/plain
|
|
|
|
|