| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.ja.util; |
| |
| import java.io.IOException; |
| import java.nio.file.Path; |
| import java.nio.file.Paths; |
| import java.util.Locale; |
| |
| /** |
| * Tool to build dictionaries. Usage: |
| * |
| * <pre> |
| * java -cp [lucene classpath] org.apache.lucene.analysis.ja.util.DictionaryBuilder \ |
| * ${inputDir} ${outputDir} ${encoding} ${normalizeEntry} |
| * </pre> |
| * |
| * <p>The input directory is expected to include unk.def, matrix.def, plus any number of .csv files, |
| * roughly following the conventions of IPADIC. JapaneseTokenizer uses dictionaries built with this |
| * tool. Note that the input files required by this build generally must be generated from a corpus |
| * of real text using tools that are not part of Lucene. |
| * |
| * <p>The normalizeEntry option is a Boolean value.<br> |
| * If true, check a surface form (first column in csv) is <a |
| * href="https://unicode.org/reports/tr15/#Norm_Forms">NFC Normalized</a>. If it isn't, NFC |
| * normalized contents will be added to the TokenInfoDictionary in addition to the original form. |
| * <br> |
| * This option is false for pre-built dictionary in the Lucene. |
| * |
| * @lucene.experimental |
| */ |
| public class DictionaryBuilder { |
| |
| /** Format of the dictionary. */ |
| public enum DictionaryFormat { |
| /** IPADIC format */ |
| IPADIC, |
| /** UNIDIC format */ |
| UNIDIC |
| } |
| |
| private DictionaryBuilder() {} |
| |
| public static void build( |
| DictionaryFormat format, |
| Path inputDir, |
| Path outputDir, |
| String encoding, |
| boolean normalizeEntry) |
| throws IOException { |
| new TokenInfoDictionaryBuilder(format, encoding, normalizeEntry) |
| .build(inputDir) |
| .write(outputDir); |
| |
| new UnknownDictionaryBuilder(encoding).build(inputDir).write(outputDir); |
| |
| ConnectionCostsBuilder.build(inputDir.resolve("matrix.def")).write(outputDir); |
| } |
| |
| public static void main(String[] args) throws IOException { |
| DictionaryFormat format = DictionaryFormat.valueOf(args[0].toUpperCase(Locale.ROOT)); |
| String inputDirName = args[1]; |
| String outputDirName = args[2]; |
| String inputEncoding = args[3]; |
| boolean normalizeEntries = Boolean.parseBoolean(args[4]); |
| DictionaryBuilder.build( |
| format, Paths.get(inputDirName), Paths.get(outputDirName), inputEncoding, normalizeEntries); |
| } |
| } |