/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| package org.apache.lucene.analysis.ko.dict; |
| |
| import java.io.BufferedReader; |
| import java.io.IOException; |
| import java.io.Reader; |
| import java.util.ArrayList; |
| import java.util.Comparator; |
| import java.util.List; |
| import org.apache.lucene.analysis.ko.POS; |
| import org.apache.lucene.util.IntsRefBuilder; |
| import org.apache.lucene.util.fst.FST; |
| import org.apache.lucene.util.fst.FSTCompiler; |
| import org.apache.lucene.util.fst.PositiveIntOutputs; |
| |
| /** |
| * Class for building a User Dictionary. This class allows for adding custom nouns (세종) or compounds |
| * (세종시 세종 시). |
| */ |
| public final class UserDictionary implements Dictionary { |
| // text -> wordID |
| private final TokenInfoFST fst; |
| |
| private static final int WORD_COST = -100000; |
| |
| // NNG left |
| private static final short LEFT_ID = 1781; |
| |
| // NNG right |
| private static final short RIGHT_ID = 3533; |
| // NNG right with hangul and a coda on the last char |
| private static final short RIGHT_ID_T = 3535; |
| // NNG right with hangul and no coda on the last char |
| private static final short RIGHT_ID_F = 3534; |
| |
| // length, length... indexed by compound ID or null for simple noun |
| private final int[][] segmentations; |
| private final short[] rightIds; |
| |
| public static UserDictionary open(Reader reader) throws IOException { |
| |
| BufferedReader br = new BufferedReader(reader); |
| String line; |
| List<String> entries = new ArrayList<>(); |
| |
| // text + optional segmentations |
| while ((line = br.readLine()) != null) { |
| // Remove comments |
| line = line.replaceAll("#.*$", ""); |
| |
| // Skip empty lines or comment lines |
| if (line.trim().length() == 0) { |
| continue; |
| } |
| entries.add(line); |
| } |
| |
| if (entries.isEmpty()) { |
| return null; |
| } else { |
| return new UserDictionary(entries); |
| } |
| } |
| |
| private UserDictionary(List<String> entries) throws IOException { |
| final CharacterDefinition charDef = CharacterDefinition.getInstance(); |
| entries.sort(Comparator.comparing(e -> e.split("\\s+")[0])); |
| |
| PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(); |
| FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput); |
| IntsRefBuilder scratch = new IntsRefBuilder(); |
| |
| String lastToken = null; |
| List<int[]> segmentations = new ArrayList<>(entries.size()); |
| List<Short> rightIds = new ArrayList<>(entries.size()); |
| long ord = 0; |
| for (String entry : entries) { |
| String[] splits = entry.split("\\s+"); |
| String token = splits[0]; |
| if (token.equals(lastToken)) { |
| continue; |
| } |
| char lastChar = entry.charAt(entry.length() - 1); |
| if (charDef.isHangul(lastChar)) { |
| if (charDef.hasCoda(lastChar)) { |
| rightIds.add(RIGHT_ID_T); |
| } else { |
| rightIds.add(RIGHT_ID_F); |
| } |
| } else { |
| rightIds.add(RIGHT_ID); |
| } |
| |
| if (splits.length == 1) { |
| segmentations.add(null); |
| } else { |
| int[] length = new int[splits.length - 1]; |
| int offset = 0; |
| for (int i = 1; i < splits.length; i++) { |
| length[i - 1] = splits[i].length(); |
| offset += splits[i].length(); |
| } |
| if (offset > token.length()) { |
| throw new IllegalArgumentException( |
| "Illegal user dictionary entry " |
| + entry |
| + " - the segmentation is bigger than the surface form (" |
| + token |
| + ")"); |
| } |
| segmentations.add(length); |
| } |
| |
| // add mapping to FST |
| scratch.grow(token.length()); |
| scratch.setLength(token.length()); |
| for (int i = 0; i < token.length(); i++) { |
| scratch.setIntAt(i, token.charAt(i)); |
| } |
| fstCompiler.add(scratch.get(), ord); |
| lastToken = token; |
| ord++; |
| } |
| this.fst = new TokenInfoFST(fstCompiler.compile()); |
| this.segmentations = segmentations.toArray(new int[segmentations.size()][]); |
| this.rightIds = new short[rightIds.size()]; |
| for (int i = 0; i < rightIds.size(); i++) { |
| this.rightIds[i] = rightIds.get(i); |
| } |
| } |
| |
| public TokenInfoFST getFST() { |
| return fst; |
| } |
| |
| @Override |
| public int getLeftId(int wordId) { |
| return LEFT_ID; |
| } |
| |
| @Override |
| public int getRightId(int wordId) { |
| return rightIds[wordId]; |
| } |
| |
| @Override |
| public int getWordCost(int wordId) { |
| return WORD_COST; |
| } |
| |
| @Override |
| public POS.Type getPOSType(int wordId) { |
| if (segmentations[wordId] == null) { |
| return POS.Type.MORPHEME; |
| } else { |
| return POS.Type.COMPOUND; |
| } |
| } |
| |
| @Override |
| public POS.Tag getLeftPOS(int wordId) { |
| return POS.Tag.NNG; |
| } |
| |
| @Override |
| public POS.Tag getRightPOS(int wordId) { |
| return POS.Tag.NNG; |
| } |
| |
| @Override |
| public String getReading(int wordId) { |
| return null; |
| } |
| |
| @Override |
| public Morpheme[] getMorphemes(int wordId, char[] surfaceForm, int off, int len) { |
| int[] segs = segmentations[wordId]; |
| if (segs == null) { |
| return null; |
| } |
| int offset = 0; |
| Morpheme[] morphemes = new Morpheme[segs.length]; |
| for (int i = 0; i < segs.length; i++) { |
| morphemes[i] = new Morpheme(POS.Tag.NNG, new String(surfaceForm, off + offset, segs[i])); |
| offset += segs[i]; |
| } |
| return morphemes; |
| } |
| |
| /** |
| * Lookup words in text |
| * |
| * @param chars text |
| * @param off offset into text |
| * @param len length of text |
| * @return array of wordId |
| */ |
| public List<Integer> lookup(char[] chars, int off, int len) throws IOException { |
| List<Integer> result = new ArrayList<>(); |
| final FST.BytesReader fstReader = fst.getBytesReader(); |
| |
| FST.Arc<Long> arc = new FST.Arc<>(); |
| int end = off + len; |
| for (int startOffset = off; startOffset < end; startOffset++) { |
| arc = fst.getFirstArc(arc); |
| int output = 0; |
| int remaining = end - startOffset; |
| for (int i = 0; i < remaining; i++) { |
| int ch = chars[startOffset + i]; |
| if (fst.findTargetArc(ch, arc, arc, i == 0, fstReader) == null) { |
| break; // continue to next position |
| } |
| output += arc.output().intValue(); |
| if (arc.isFinal()) { |
| final int finalOutput = output + arc.nextFinalOutput().intValue(); |
| result.add(finalOutput); |
| } |
| } |
| } |
| return result; |
| } |
| } |