| // Copyright (c) 2014 Couchbase, Inc. |
| // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file |
| // except in compliance with the License. You may obtain a copy of the License at |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // Unless required by applicable law or agreed to in writing, software distributed under the |
| // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, |
| // either express or implied. See the License for the specific language governing permissions |
| // and limitations under the License. |
| |
| // +build ignore |
| |
| package main |
| |
| import ( |
| "bufio" |
| "bytes" |
| "flag" |
| "fmt" |
| "io" |
| "log" |
| "net/http" |
| "os" |
| "os/exec" |
| "strconv" |
| "strings" |
| "unicode" |
| ) |
| |
// Command-line flags. They select where the Unicode test data is read from
// and where the generated tables are written.
var (
	// url is the base directory URL; data file names are appended to it.
	url = flag.String(
		"url",
		"http://www.unicode.org/Public/"+unicode.Version+"/ucd/auxiliary/",
		"URL of Unicode database directory",
	)
	// verbose is declared for command-line compatibility; nothing in the
	// visible code reads it.
	verbose = flag.Bool(
		"verbose",
		false,
		"write data to stdout as it is parsed",
	)
	// localFiles makes openReader read data files from disk instead of HTTP.
	localFiles = flag.Bool(
		"local",
		false,
		"data files have been copied to the current directory; for debugging only",
	)
	// outputFile names the destination file; empty means standard output.
	outputFile = flag.String(
		"output",
		"",
		"output file for generated tables; default stdout",
	)
)

// output is the buffered destination for all generated text. It is assigned
// by setupOutput and drained by flushOutput.
var output *bufio.Writer
| |
| func main() { |
| flag.Parse() |
| setupOutput() |
| |
| graphemeTests := make([]test, 0) |
| graphemeTests = loadUnicodeData("GraphemeBreakTest.txt", graphemeTests) |
| wordTests := make([]test, 0) |
| wordTests = loadUnicodeData("WordBreakTest.txt", wordTests) |
| sentenceTests := make([]test, 0) |
| sentenceTests = loadUnicodeData("SentenceBreakTest.txt", sentenceTests) |
| |
| fmt.Fprintf(output, fileHeader, *url) |
| generateTestTables("Grapheme", graphemeTests) |
| generateTestTables("Word", wordTests) |
| generateTestTables("Sentence", sentenceTests) |
| |
| flushOutput() |
| } |
| |
| // WordBreakProperty.txt has the form: |
| // 05F0..05F2 ; Hebrew_Letter # Lo [3] HEBREW LIGATURE YIDDISH DOUBLE VAV..HEBREW LIGATURE YIDDISH DOUBLE YOD |
| // FB1D ; Hebrew_Letter # Lo HEBREW LETTER YOD WITH HIRIQ |
| func openReader(file string) (input io.ReadCloser) { |
| if *localFiles { |
| f, err := os.Open(file) |
| if err != nil { |
| log.Fatal(err) |
| } |
| input = f |
| } else { |
| path := *url + file |
| resp, err := http.Get(path) |
| if err != nil { |
| log.Fatal(err) |
| } |
| if resp.StatusCode != 200 { |
| log.Fatal("bad GET status for "+file, resp.Status) |
| } |
| input = resp.Body |
| } |
| return |
| } |
| |
| func loadUnicodeData(filename string, tests []test) []test { |
| f := openReader(filename) |
| defer f.Close() |
| bufioReader := bufio.NewReader(f) |
| line, err := bufioReader.ReadString('\n') |
| for err == nil { |
| tests = parseLine(line, tests) |
| line, err = bufioReader.ReadString('\n') |
| } |
| // if the err was EOF still need to process last value |
| if err == io.EOF { |
| tests = parseLine(line, tests) |
| } |
| return tests |
| } |
| |
// Markers recognised by parseLine in the break-test data lines.
const (
	comment = "#" // everything from this marker to end of line is ignored
	brk     = "÷" // separates two segments
	nbrk    = "×" // joins code points inside one segment
)

// test is one parsed test case: the expected segments in order. Each segment
// is the UTF-8 encoding of its code points.
type test [][]byte

// parseLine parses a single data line and returns tests with the resulting
// test case appended.
//
// A data line is a sequence of hexadecimal code points separated by brk and
// nbrk markers, optionally followed by a comment. Blank lines and lines that
// contain only a comment (indented or not) are skipped and tests is returned
// unchanged. If a code point cannot be parsed, or is larger than
// unicode.MaxRune, the problem is logged and the whole line is dropped.
func parseLine(line string, tests []test) []test {
	// Trim first so that an indented comment is still found at index 0.
	line = strings.TrimSpace(line)
	if i := strings.Index(line, comment); i >= 0 {
		// The comment text itself contains brk/nbrk markers, so it must be
		// removed before splitting.
		line = strings.TrimSpace(line[:i])
	}
	if len(line) == 0 {
		return tests
	}

	t := make(test, 0)
	for _, piece := range strings.Split(line, brk) {
		piece = strings.TrimSpace(piece)
		if len(piece) == 0 {
			continue
		}
		codePoints := strings.Split(piece, nbrk)
		runes := make([]rune, 0, len(codePoints))
		for _, codePoint := range codePoints {
			codePoint = strings.TrimSpace(codePoint)
			v, err := strconv.ParseUint(codePoint, 16, 32)
			if err != nil {
				log.Printf("err: %v for '%s'", err, codePoint)
				return tests
			}
			if v > unicode.MaxRune {
				log.Printf("err: code point out of range for '%s'", codePoint)
				return tests
			}
			runes = append(runes, rune(v))
		}
		t = append(t, []byte(string(runes)))
	}
	return append(tests, t)
}
| |
| func generateTestTables(prefix string, tests []test) { |
| fmt.Fprintf(output, testHeader, prefix) |
| for _, t := range tests { |
| fmt.Fprintf(output, "\t\t{\n") |
| fmt.Fprintf(output, "\t\t\tinput: %#v,\n", bytes.Join(t, []byte{})) |
| fmt.Fprintf(output, "\t\t\toutput: %s,\n", generateTest(t)) |
| fmt.Fprintf(output, "\t\t},\n") |
| } |
| fmt.Fprintf(output, "}\n") |
| } |
| |
| func generateTest(t test) string { |
| rv := "[][]byte{" |
| for _, te := range t { |
| rv += fmt.Sprintf("%#v,", te) |
| } |
| rv += "}" |
| return rv |
| } |
| |
| const fileHeader = `// Generated by running |
| // maketesttables --url=%s |
| // DO NOT EDIT |
| |
| package textseg |
| ` |
| |
| const testHeader = `var unicode%sTests = []struct { |
| input []byte |
| output [][]byte |
| }{ |
| ` |
| |
| func setupOutput() { |
| output = bufio.NewWriter(startGofmt()) |
| } |
| |
| // startGofmt connects output to a gofmt process if -output is set. |
| func startGofmt() io.Writer { |
| if *outputFile == "" { |
| return os.Stdout |
| } |
| stdout, err := os.Create(*outputFile) |
| if err != nil { |
| log.Fatal(err) |
| } |
| // Pipe output to gofmt. |
| gofmt := exec.Command("gofmt") |
| fd, err := gofmt.StdinPipe() |
| if err != nil { |
| log.Fatal(err) |
| } |
| gofmt.Stdout = stdout |
| gofmt.Stderr = os.Stderr |
| err = gofmt.Start() |
| if err != nil { |
| log.Fatal(err) |
| } |
| return fd |
| } |
| |
| func flushOutput() { |
| err := output.Flush() |
| if err != nil { |
| log.Fatal(err) |
| } |
| } |