| # |
| # Copyright (C) 2002-2003, International Business Machines Corporation and others. |
| # All Rights Reserved. |
| # |
| # file: count_word.txt |
| # |
| # ICU Word Break Rules |
| # See Unicode Standard Annex #29. |
| # These rules are based on Version 4.0.0, dated 2003-04-17 |
| # |
| |
| |
| |
| #################################################################################### |
| # Character class definitions, based on those in TR 29 (Unicode Standard Annex #29). |
| # $ALetter is built from Alphabetic, punctuation [:P:], symbol [:S:] and LineBreak = Numeric |
| # characters, with $dash, $Katakana and the Thai, Lao and Hiragana scripts removed. |
| #################################################################################### |
| $Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] |
| [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] |
| [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] |
| [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; |
| |
| |
| $dash = \u002d; |
| |
| $ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:] |
| [:P:] [:S:] [:LineBreak = Numeric:] |
| - $dash |
| - $Katakana |
| - [:Script = Thai:] |
| - [:Script = Lao:] |
| - [:Script = Hiragana:]]; |
| |
| $TheZWSP = \u200b; |
| |
| # Single-character and property-based classes used by the rules in this file. |
| # The names are those from TR29. |
| # $Control is the union of the Zl, Zp, Cc and Cf categories with $TheZWSP (U+200B) removed; |
| # $Extend is every character with Grapheme_Extend = TRUE. |
| $CR = \u000d; |
| $LF = \u000a; |
| $Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; |
| $Extend = [[:Grapheme_Extend = TRUE:]]; |
| |
| |
| |
| |
| #################################################################################### |
| # |
| # Word Break Rules. Definitions and rules specific to word break begin here. |
| # $Format (defined next) is the Cf category with $TheZWSP removed. |
| #################################################################################### |
| |
| $Format = [[:Cf:] - $TheZWSP]; |
| |
| |
| |
| # Rule 3: Treat a grapheme cluster as if it were a single character; in this file that is |
| # done by appending $Extend* to each base class in the "Ex" definitions below. |
| # No Hangul-specific rule appears in this file: we don't need to find the boundaries |
| # between adjacent Hangul syllables, because they won't be word boundaries. |
| # |
| |
| |
| # |
| # "Extended" definitions: a base class followed by any number of $Extend (combining) |
| # characters, so that the whole sequence is treated like the base char. |
| $ALetterEx = $ALetter $Extend*; |
| $KatakanaEx = $Katakana $Extend*; |
| $FormatEx = $Format $Extend*; |
| |
| # Numbers. Rules 8, 11, 12 from the TR. |
| # No separate number rules appear in this section: $ALetter already includes |
| # [:LineBreak = Numeric:] characters, so they are matched by the word rules below. |
| |
| # |
| # Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 |
| # - must include at least one letter. |
| # - may include both letters and numbers. |
| # - may include MidLetter, MidNumber punctuation. |
| # The two $dash rules accept a run of dashes after optional letters, and dash-joined letter sequences; all are tagged {200}. |
| $LetterSequence = $ALetterEx ($FormatEx* $ALetterEx)*; # rules #6, #7 |
| $LetterSequence {200}; |
| |
| $ALetterEx* $dash+ {200}; |
| $ALetterEx* ($dash $LetterSequence)+ $dash* {200}; |
| |
| # Do not break between Katakana (Rule #13); $Format characters may sit between them. |
| # A single Hiragana character, together with any following $Extend characters, |
| # is matched on its own. Both rules are tagged {300}. |
| $KatakanaEx ($FormatEx* $KatakanaEx)* {300}; |
| [:Hiragana:] $Extend* {300}; |
| |
| # |
| # Ideographic Characters. Stand by themselves as words. |
| # Separated from the "Everything Else" rule, below, only so that they |
| # can be tagged with a return value. TODO: is this what we want? |
| # NOTE: the rule is currently commented out, while "Everything Else" still excludes [:Ideographic:]. |
| # [:IDEOGRAPHIC:] $Extend* {400}; |
| |
| # |
| # Everything Else, with no tag. |
| # Non-Control chars combine with $Extend (combining) chars. |
| # Controls do not. A CR LF pair is kept together by the last rule. |
| # |
| [^$Control [:Ideographic:]] $Extend*; |
| $CR $LF; |
| |
| # |
| # Reverse Rules. Back up over any of the chars that can group together. |
| # (Reverse rules do not need to be exact; they can back up too far, |
| # but must back up at least enough, and must stop on a boundary.) |
| # The rule below carries the '!' prefix; its \n \r alternative is the forward $CR $LF pair in reverse order. |
| |
| # NonStarters are the set of all characters that can appear at the 2nd - nth position of |
| # a word. (They may also be the first.) The reverse rule skips over these, until it |
| # reaches something that can only be the start (and probably only) char in a "word". |
| # A space meets the test. (Punctuation does not here: [:P:] is part of $ALetter.) |
| # NOTE(review): $dash is not in $NonStarters although the forward rules allow it inside a word - confirm. |
| $NonStarters = [$ALetter $Katakana $Extend $Format]; |
| |
| #!.*; |
| ! ($NonStarters* | \n \r) .; |
| |