# blob: 5e1ed67596c070d17d4e7a45e90dcd39bbeee863 [file] [log] [blame]
#
# Copyright (C) 2002-2009, International Business Machines Corporation and others.
# All Rights Reserved.
#
# file: char.txt
#
# ICU Character Break Rules, also known as Grapheme Cluster Boundaries
# See Unicode Standard Annex #29.
# These rules are based on TR29 Revision 13, for Unicode Version 5.1
#
#
# Character Class Definitions.
#
# Classes selected by the Unicode Grapheme_Cluster_Break property.
# Each variable is the set of all characters carrying the named property
# value; the rule sections below refer to these sets by variable name.
$CR = [\p{Grapheme_Cluster_Break = CR}];
$LF = [\p{Grapheme_Cluster_Break = LF}];
# $Control, $CR and $LF are the only classes excluded by the negated sets
# ([^$Control $CR $LF]) used in the Extend / SpacingMark / Prepend rules.
$Control = [\p{Grapheme_Cluster_Break = Control}];
$Prepend = [\p{Grapheme_Cluster_Break = Prepend}];
$Extend = [\p{Grapheme_Cluster_Break = Extend}];
$SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];
# Per-script letter sets and virama signs for nine Indic scripts.
# Unlike the Grapheme_Cluster_Break classes, these are hard-coded code point
# ranges rather than property lookups.  Each Letter / SignVirama pair is used
# by exactly one forward rule and one reverse rule, which keep sequences of
# letters joined by a virama together as a single unit.
# NOTE(review): the letter ranges are contiguous spans, so they may also
# cover code points that are unassigned or are not letters -- confirm
# against the Unicode code charts if exact membership matters.
# Bengali
$BengaliLetter = [\u0985-\u09B9 \u09CE \u09DC-\u09E1 \u09F0-\u09F1];
$BengaliSignVirama = \u09CD;
# Gujarati
$GujaratiLetter = [\u0A85-\u0A8C \u0A8F-\u0A90 \u0A93-\u0AB9 \u0AE0-\u0AE1];
$GujaratiSignVirama = \u0ACD;
# Devanagari
$DevanagariLetter = [\u0904-\u0939 \u0958-\u0961 \u0972-\u097F];
$DevanagariSignVirama = \u094D;
# Kannada
$KannadaLetter = [\u0C85-\u0CB9 \u0CDE-\u0CE1];
$KannadaSignVirama = \u0CCD;
# Malayalam
$MalayalamLetter = [\u0D05-\u0D39 \u0D60-\u0D61 \u0D7A-\u0D7F];
$MalayalamSignVirama = \u0D4D;
# Oriya
$OriyaLetter = [\u0B05-\u0B39 \u0B5C-\u0B61 \u0B71];
$OriyaSignVirama = \u0B4D;
# Gurmukhi
$GurmukhiLetter = [\u0A05-\u0A39 \u0A59-\u0A5E];
$GurmukhiSignVirama = \u0A4D;
# Tamil
$TamilLetter = [\u0B85-\u0BB9];
$TamilSignVirama = \u0BCD;
# Telugu
$TeluguLetter = [\u0C05-\u0C39 \u0C58-\u0C61];
$TeluguSignVirama = \u0C4D;
#
# Korean Syllable Definitions
#
# Hangul classes, again taken from the Grapheme_Cluster_Break property.
# Per UAX #29: L = leading consonant jamo, V = vowel jamo, T = trailing
# consonant jamo, LV / LVT = precomposed syllables of those shapes.
$L = [\p{Grapheme_Cluster_Break = L}];
$V = [\p{Grapheme_Cluster_Break = V}];
$T = [\p{Grapheme_Cluster_Break = T}];
$LV = [\p{Grapheme_Cluster_Break = LV}];
$LVT = [\p{Grapheme_Cluster_Break = LVT}];
## -------------------------------------------------
# Turn on rule chaining.
# NOTE(review): per the ICU break-rule documentation, chaining lets a match
# continue from the last character of one rule into the start of another,
# which is what allows the two-character rules below to cover clusters of
# arbitrary length -- confirm against the ICU version in use.
!!chain;
# Rules applied when iterating forward through the text.  Text matched by a
# rule (or a chain of rules) is kept together with no boundary inside it.
!!forward;
# CR followed by LF is one unit (UAX #29 rule GB3).
$CR $LF;
# Indic sequences: a letter followed by one or more (virama, optional letter)
# groups stays together.  The trailing letter is optional, so a sequence that
# ends in a virama is also kept whole.  One rule per script, so letters and
# viramas of different scripts are never joined by these rules.
$BengaliLetter ($BengaliSignVirama $BengaliLetter?)+;
$GujaratiLetter ($GujaratiSignVirama $GujaratiLetter?)+;
$DevanagariLetter ($DevanagariSignVirama $DevanagariLetter?)+;
$KannadaLetter ($KannadaSignVirama $KannadaLetter?)+;
$MalayalamLetter ($MalayalamSignVirama $MalayalamLetter?)+;
$OriyaLetter ($OriyaSignVirama $OriyaLetter?)+;
$GurmukhiLetter ($GurmukhiSignVirama $GurmukhiLetter?)+;
$TamilLetter ($TamilSignVirama $TamilLetter?)+;
$TeluguLetter ($TeluguSignVirama $TeluguLetter?)+;
# Hangul syllable sequences (UAX #29 rules GB6, GB7, GB8).
$L ($L | $V | $LV | $LVT);
($LV | $V) ($V | $T);
($LVT | $T) $T;
# Extend and SpacingMark attach to the preceding character, and Prepend
# attaches to the following one (UAX #29 rules GB9, GB9a, GB9b).  The negated
# set keeps Control, CR and LF from ever being part of such a cluster.
[^$Control $CR $LF] $Extend;
[^$Control $CR $LF] $SpacingMark;
$Prepend [^$Control $CR $LF];
## -------------------------------------------------
# Rules applied when iterating backward through the text.  Each rule is the
# mirror image of the corresponding forward rule: the same classes, written
# in the order they are encountered when scanning from the end towards the
# start.  Keep the two sections in sync when editing either one.
!!reverse;
# LF then CR when read backward, i.e. the CR LF pair.
$LF $CR;
# Indic sequences read backward: one or more (optional letter, virama)
# groups followed by the initial letter of the sequence.  One rule per script.
($BengaliLetter? $BengaliSignVirama)+ $BengaliLetter;
($GujaratiLetter? $GujaratiSignVirama)+ $GujaratiLetter;
($DevanagariLetter? $DevanagariSignVirama)+ $DevanagariLetter;
($KannadaLetter? $KannadaSignVirama)+ $KannadaLetter;
($MalayalamLetter? $MalayalamSignVirama)+ $MalayalamLetter;
($OriyaLetter? $OriyaSignVirama)+ $OriyaLetter;
($GurmukhiLetter? $GurmukhiSignVirama)+ $GurmukhiLetter;
($TamilLetter? $TamilSignVirama)+ $TamilLetter;
($TeluguLetter? $TeluguSignVirama)+ $TeluguLetter;
# Hangul syllable sequences, operands reversed relative to the forward rules.
($L | $V | $LV | $LVT) $L;
($V | $T) ($LV | $V);
$T ($LVT | $T);
# Extend / SpacingMark are seen before their base when scanning backward;
# Prepend is seen after the character it attaches to.  Control, CR and LF
# are excluded from these clusters, as in the forward rules.
$Extend [^$Control $CR $LF];
$SpacingMark [^$Control $CR $LF];
[^$Control $CR $LF] $Prepend;
## -------------------------------------------------
# Safe-reverse section: intentionally contains no rules here.
# NOTE(review): presumably the rule builder's default behavior is relied on
# for locating a safe restart position -- confirm against the ICU
# break-rule documentation for the ICU version in use.
!!safe_reverse;
## -------------------------------------------------
# Safe-forward section: likewise declared with no rules.
!!safe_forward;