| package textseg |
| |
| import ( |
| "errors" |
| "unicode/utf8" |
| ) |
| |
| // Generated from grapheme_clusters.rl. DO NOT EDIT |
| %%{ |
| # (except you are actually in grapheme_clusters.rl here, so edit away!) |
| |
| machine graphclust; |
| write data; |
| }%% |
| |
| // Error is a package-level error value whose message is "invalid UTF8 text". |
| // NOTE(review): ScanGraphemeClusters in this file never returns it; confirm where it is used. |
| var Error = errors.New("invalid UTF8 text") |
| |
| // ScanGraphemeClusters is a split function for bufio.Scanner that splits on grapheme cluster boundaries. |
| // It returns (0, nil, nil) when data is empty, or when no cluster was emitted and atEOF is false; the error result is always nil. |
| func ScanGraphemeClusters(data []byte, atEOF bool) (int, []byte, error) { |
| if len(data) == 0 { |
| return 0, nil, nil |
| } |
| |
| // Ragel state. NOTE(review): eof is always pe, even when atEOF is false; confirm a cluster cannot be cut at a buffer boundary. |
| cs := 0 // Current State |
| p := 0 // "Pointer" into data |
| pe := len(data) // End-of-data "pointer" |
| ts := 0 |
| te := 0 |
| act := 0 |
| eof := pe |
| |
| // Blank-assign the Ragel variables so the Go compiler does not reject them as declared and not used. |
| _ = ts |
| _ = te |
| _ = act |
| _ = eof |
| |
| startPos := 0 |
| endPos := 0 |
| |
| %%{ |
| include GraphemeCluster "grapheme_clusters_table.rl"; |
| |
| action start { |
| startPos = p |
| } |
| |
| action end { |
| endPos = p |
| } |
| |
| action emit { |
| return endPos+1, data[startPos:endPos+1], nil |
| } |
| |
| ZWJGlue = ZWJ (Glue_After_Zwj | E_Base_GAZ Extend* E_Modifier?)?; |
| AnyExtender = Extend | ZWJGlue | SpacingMark; |
| Extension = AnyExtender*; |
| ReplacementChar = (0xEF 0xBF 0xBD); |
| |
| CRLFSeq = CR LF; |
| ControlSeq = Control | ReplacementChar; |
| HangulSeq = ( |
| L+ (((LV? V+ | LVT) T*)?|LV?) | |
| LV V* T* | |
| V+ T* | |
| LVT T* | |
| T+ |
| ) Extension; |
| EmojiSeq = (E_Base | E_Base_GAZ) Extend* E_Modifier? Extension; |
| ZWJSeq = ZWJGlue Extension; |
| EmojiFlagSeq = Regional_Indicator Regional_Indicator? Extension; |
| |
| UTF8Cont = 0x80 .. 0xBF; |
| AnyUTF8 = ( |
| 0x00..0x7F | |
| 0xC0..0xDF . UTF8Cont | |
| 0xE0..0xEF . UTF8Cont . UTF8Cont | |
| 0xF0..0xF7 . UTF8Cont . UTF8Cont . UTF8Cont |
| ); |
| |
| # OtherSeq is any character that isn't at the start of one of the extended sequences above, followed by extension |
| OtherSeq = (AnyUTF8 - (CR|LF|Control|ReplacementChar|L|LV|V|LVT|T|E_Base|E_Base_GAZ|ZWJ|Regional_Indicator|Prepend)) Extension; |
| |
| # PrependSeq is prepend followed by any of the other patterns above, except control characters which explicitly break |
| PrependSeq = Prepend+ (HangulSeq|EmojiSeq|ZWJSeq|EmojiFlagSeq|OtherSeq)?; |
| |
| CRLFTok = CRLFSeq >start @end; |
| ControlTok = ControlSeq >start @end; |
| HangulTok = HangulSeq >start @end; |
| EmojiTok = EmojiSeq >start @end; |
| ZWJTok = ZWJSeq >start @end; |
| EmojiFlagTok = EmojiFlagSeq >start @end; |
| OtherTok = OtherSeq >start @end; |
| PrependTok = PrependSeq >start @end; |
| |
| main := |* |
| CRLFTok => emit; |
| ControlTok => emit; |
| HangulTok => emit; |
| EmojiTok => emit; |
| ZWJTok => emit; |
| EmojiFlagTok => emit; |
| PrependTok => emit; |
| OtherTok => emit; |
| |
| # any single valid UTF-8 character would also be valid per spec, |
| # but we'll handle that separately after the loop so we can deal |
| # with requesting more bytes if we're not at EOF. |
| *|; |
| |
| write init; |
| write exec; |
| }%% |
| |
| // If we fall out here then we were unable to complete a sequence. |
| // If we weren't able to complete a sequence then either we've |
| // reached the end of a partial buffer (so there's more data to come) |
| // or we have an isolated symbol that would normally be part of a |
| // grapheme cluster but has appeared in isolation here. |
| |
| if !atEOF { |
| // Request more: a zero advance with a nil token asks bufio.Scanner to read more data and call again. |
| return 0, nil, nil |
| } |
| |
| // Just take the first UTF-8 sequence and return that. data is non-empty here, so seqLen is at least 1. |
| _, seqLen := utf8.DecodeRune(data) |
| return seqLen, data[:seqLen], nil |
| } |