| # |
| # Copyright (C) 2002-2003, International Business Machines Corporation and others. |
| # All Rights Reserved. |
| # |
| # file: count_word.txt |
| # |
| # ICU Word Break Rules |
| # See Unicode Standard Annex #29. |
| # These rules are based on Version 4.0.0, dated 2003-04-17 |
| # |
| |
| |
| |
| #################################################################################### |
| # |
| # Character class definitions from TR 29 |
| # |
| #################################################################################### |
| # $Katakana: every character whose Script property is KATAKANA, plus four |
| # sound / prolonged-sound marks that are added explicitly by character name. |
| $Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] |
| [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] |
| [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] |
| [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; |
| |
| # $ALetter: Alphabetic characters plus HEBREW PUNCTUATION GERESH and HYPHEN-MINUS, |
| # with $Katakana and the Thai, Lao and Hiragana scripts removed. |
| # Because HYPHEN-MINUS is a member, a hyphen is treated like a letter wherever |
| # $ALetter is used. |
| $ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:] [:name = HYPHEN-MINUS:] |
| - $Katakana |
| - [:Script = Thai:] |
| - [:Script = Lao:] |
| - [:Script = Hiragana:]]; |
| |
| # $MidLetter: punctuation that may sit between two letters without ending the word |
| # (see its use in $LetterSequence). Members are listed by character name: |
| # APOSTROPHE, MIDDLE DOT, HEBREW PUNCTUATION GERSHAYIM, RIGHT SINGLE QUOTATION MARK, |
| # HYPHENATION POINT and COLON. |
| $MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:] [:name = HEBREW PUNCTUATION GERSHAYIM:] |
| [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = COLON:]]; |
| |
| # $MidNumLet: FULL STOP only; it is accepted inside both number and letter sequences. |
| $MidNumLet = [:name = FULL STOP:]; |
| |
| # $MidNum: characters with LineBreak = Infix_Numeric, excluding $MidNumLet (FULL STOP). |
| $MidNum = [[:LineBreak = Infix_Numeric:] - $MidNumLet]; |
| # $Numeric: characters with LineBreak = Numeric. |
| $Numeric = [:LineBreak = Numeric:]; |
| |
| |
| # $TheZWSP: the single code point U+200B. It is subtracted from $Control and $Format |
| # below so that it is not handled as a control or format character. |
| $TheZWSP = \u200b; |
| |
| # |
| # Character Class Definitions. |
| # The names are those from TR29. |
| # |
| # $CR / $LF: U+000D and U+000A; paired by the "$CR $LF" forward rule. |
| $CR = \u000d; |
| $LF = \u000a; |
| # $Control: general categories Zl, Zp, Cc and Cf, with $TheZWSP removed. |
| $Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; |
| # $Extend: characters with Grapheme_Extend = TRUE (appended to a base character |
| # by the "...Ex" definitions). |
| $Extend = [[:Grapheme_Extend = TRUE:]]; |
| |
| |
| |
| |
| #################################################################################### |
| # |
| # Word Break Rules. Definitions and Rules specific to word break begin Here. |
| # |
| #################################################################################### |
| |
| # $Format: general category Cf with $TheZWSP removed. The rules below allow runs of |
| # these (as $FormatEx*) between the elements of a number or letter sequence. |
| $Format = [[:Cf:] - $TheZWSP]; |
| |
| |
| |
| # Rule 3: Treat a grapheme cluster as if it were a single character. |
| # Hangul Syllables are easier to deal with here than they are in Grapheme Clusters |
| # because we don't need to find the boundaries between adjacent syllables - |
| # they won't be word boundaries. |
| # |
| |
| |
| # |
| # "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. |
| # |
| # Each "...Ex" name is its base class followed by zero or more $Extend characters, |
| # so a base character and the characters that extend it are consumed as one unit. |
| # Format characters are not folded in here; the rules place $FormatEx* explicitly. |
| $ALetterEx = $ALetter $Extend*; |
| $NumericEx = $Numeric $Extend*; |
| $MidNumEx = $MidNum $Extend*; |
| $MidNumLetEx = $MidNumLet $Extend*; |
| $MidLetterEx = $MidLetter $Extend*; |
| $KatakanaEx = $Katakana $Extend*; |
| $FormatEx = $Format $Extend*; |
| |
| # $word_pad: union of the general categories P, S, Z and C. The number, word and |
| # "everything else" rules end with $word_pad*, so a trailing run of these characters |
| # is absorbed into the preceding match instead of producing separate boundaries. |
| $word_pad=[[:P:][:S:][:Z:][:C:]]; |
| |
| # |
| # Numbers. Rules 8, 11, 12 from the TR. |
| # |
| # $NumberSequence: one $NumericEx, then any number of further $NumericEx, each |
| # optionally preceded by a single $MidNumEx or $MidNumLetEx; $FormatEx* may appear |
| # on either side of that separator. |
| $NumberSequence = $NumericEx ($FormatEx* ($MidNumEx | $MidNumLetEx)? $FormatEx* $NumericEx)*; |
| # Rule: a number sequence plus any trailing $word_pad run; tagged with status 100. |
| $NumberSequence $word_pad* {100}; |
| |
| # |
| # Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 |
| # - must include at least one letter. |
| # - may include both letters and numbers. |
| # - may include MidLetter, MidNumber punctuation. |
| # |
| # $LetterSequence: one $ALetterEx, then any number of further $ALetterEx, each |
| # optionally preceded by a single $MidLetterEx or $MidNumLetEx; $FormatEx* may appear |
| # on either side of that separator. |
| $LetterSequence = $ALetterEx ($FormatEx* ($MidLetterEx | $MidNumLetEx)? $FormatEx* $ALetterEx)*; # rules #6, #7 |
| # Rule: an optional leading number sequence, one mandatory letter sequence, then any |
| # mix of number and letter sequences, plus any trailing $word_pad run; status 200. |
| ($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $word_pad* {200}; |
| |
| # |
| # Do not break between Katakana. Rule #13. |
| # |
| # A run of $KatakanaEx, optionally separated by $FormatEx*, is one unit; status 300. |
| # Unlike the number and word rules, these two rules do not absorb trailing $word_pad. |
| $KatakanaEx ($FormatEx* $KatakanaEx)* {300}; |
| # A single Hiragana character with its $Extend characters; also status 300. |
| [:Hiragana:] $Extend* {300}; |
| |
| # |
| # Ideographic Characters. Stand by themselves as words. |
| # Separated from the "Everything Else" rule, below, only so that they |
| # can be tagged with a return value. TODO: is this what we want? |
| # |
| # [:IDEOGRAPHIC:] $Extend* $word_pad* {400}; |
| |
| # |
| # Everything Else, with no tag. |
| # Non-Control chars combine with $Extend (combining) chars. |
| # Controls do not. |
| # |
| # Untagged fallback: any one character that is in neither $Control nor |
| # [:Ideographic:], followed by its $Extend characters and any trailing $word_pad run. |
| [^$Control [:Ideographic:]] $Extend* $word_pad*; |
| # Keep a CR LF pair together as a single unit. |
| $CR $LF; |
| |
| # |
| # Reverse Rules. Back up over any of the chars that can group together. |
| # (Reverse rules do not need to be exact; they can back up too far, |
| # but must back up at least enough, and must stop on a boundary.) |
| # |
| |
| # NonStarters are the set of all characters that can appear at the 2nd - nth position of |
| # a word. (They may also be the first.) The reverse rule skips over these, until it |
| # reaches something that can only be the start (and probably only) char in a "word". |
| # A space or punctuation meets the test. |
| # |
| # Union of every class that the forward number, word and Katakana rules can match |
| # after the first position. |
| # NOTE(review): $word_pad characters are also absorbed after the first position by |
| # the forward rules (via $word_pad*), but they are not included here - confirm that |
| # the reverse rule still backs up far enough when it starts inside a padding run. |
| $NonStarters = [$Numeric $ALetter $Katakana $MidLetter $MidNum $MidNumLet $Extend $Format]; |
| |
| # The line below is a disabled alternative reverse rule that would match everything. |
| #!.*; |
| # Reverse rule (leading '!'): skip any run of $NonStarters, or the two-character |
| # sequence LF CR (presumably a CR LF pair as seen when scanning backwards), and then |
| # consume one more character of any kind. |
| ! ($NonStarters* | \n \r) .; |
| |