/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ko.dict;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.analysis.ko.POS;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.PositiveIntOutputs;

/**
 * A user dictionary for the Korean analyzer. Each entry is a custom noun such as {@code 세종}, or a
 * compound whose surface form is followed by its space-separated segmentation, for example
 * {@code 세종시 세종 시}. Every entry is handled as a general noun (NNG).
 */
public final class UserDictionary implements Dictionary {
// text -> wordID
private final TokenInfoFST fst;
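  // fixed word cost for user entries; strongly negative so user-defined words are preferred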
private static final int WORD_COST = -100000;
  // left id of a general noun (NNG)
  private static final short LEFT_ID = 1781;
  // right id of a general noun (NNG) whose last character is not Hangul
  private static final short RIGHT_ID = 3533;
  // right id of a general noun (NNG) ending in a Hangul character that has a coda
  private static final short RIGHT_ID_T = 3535;
  // right id of a general noun (NNG) ending in a Hangul character with no coda
  private static final short RIGHT_ID_F = 3534;
  // segment lengths of each compound entry, indexed by wordId; null for a simple noun
private final int[][] segmentations;
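  // right id of each entry, indexed by wordId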
private final short[] rightIds;
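
  /**
   * Reads user-dictionary entries from the given reader, one entry per line. Text after {@code #}
   * on a line is treated as a comment and blank lines are skipped. A minimal, illustrative example:
   *
   * <pre>{@code
   * UserDictionary dict = UserDictionary.open(new StringReader("세종\n세종시 세종 시\n"));
   * }</pre>
   *
   * @return the dictionary, or {@code null} if the reader contained no entries
   */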
public static UserDictionary open(Reader reader) throws IOException {
BufferedReader br = new BufferedReader(reader);
String line;
List<String> entries = new ArrayList<>();
// text + optional segmentations
while ((line = br.readLine()) != null) {
      // Strip trailing comments and surrounding whitespace
      line = line.replaceAll("#.*$", "").trim();
      // Skip lines that are empty after comment removal
      if (line.isEmpty()) {
        continue;
      }
      entries.add(line);
}
if (entries.isEmpty()) {
return null;
} else {
return new UserDictionary(entries);
}
}
private UserDictionary(List<String> entries) throws IOException {
final CharacterDefinition charDef = CharacterDefinition.getInstance();
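    // the FST compiler requires its input in sorted order, so sort entries by surface form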
entries.sort(Comparator.comparing(e -> e.split("\\s+")[0]));
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
IntsRefBuilder scratch = new IntsRefBuilder();
String lastToken = null;
List<int[]> segmentations = new ArrayList<>(entries.size());
List<Short> rightIds = new ArrayList<>(entries.size());
long ord = 0;
for (String entry : entries) {
String[] splits = entry.split("\\s+");
String token = splits[0];
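      // entries are sorted by surface form, so duplicates are adjacent; keep only the first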
if (token.equals(lastToken)) {
continue;
}
      // choose the right id from the last character of the surface form
      char lastChar = token.charAt(token.length() - 1);
      if (charDef.isHangul(lastChar)) {
        if (charDef.hasCoda(lastChar)) {
          rightIds.add(RIGHT_ID_T);
        } else {
          rightIds.add(RIGHT_ID_F);
        }
      } else {
        rightIds.add(RIGHT_ID);
      }
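      // a compound entry lists its segments after the surface form; record each segment's length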
if (splits.length == 1) {
segmentations.add(null);
} else {
int[] length = new int[splits.length - 1];
int offset = 0;
for (int i = 1; i < splits.length; i++) {
length[i - 1] = splits[i].length();
offset += splits[i].length();
}
if (offset > token.length()) {
throw new IllegalArgumentException(
"Illegal user dictionary entry "
+ entry
+ " - the segmentation is bigger than the surface form ("
+ token
+ ")");
}
segmentations.add(length);
}
// add mapping to FST
scratch.grow(token.length());
scratch.setLength(token.length());
for (int i = 0; i < token.length(); i++) {
scratch.setIntAt(i, token.charAt(i));
}
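      // the FST maps the surface form (UTF-16 code units) to its ordinal, which is used as the wordId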
fstCompiler.add(scratch.get(), ord);
lastToken = token;
ord++;
}
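    // freeze the FST and copy the per-entry data into parallel arrays indexed by wordId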
this.fst = new TokenInfoFST(fstCompiler.compile());
this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
this.rightIds = new short[rightIds.size()];
for (int i = 0; i < rightIds.size(); i++) {
this.rightIds[i] = rightIds.get(i);
}
}
public TokenInfoFST getFST() {
return fst;
}
@Override
public int getLeftId(int wordId) {
return LEFT_ID;
}
@Override
public int getRightId(int wordId) {
return rightIds[wordId];
}
@Override
public int getWordCost(int wordId) {
return WORD_COST;
}
@Override
public POS.Type getPOSType(int wordId) {
if (segmentations[wordId] == null) {
return POS.Type.MORPHEME;
} else {
return POS.Type.COMPOUND;
}
}
@Override
public POS.Tag getLeftPOS(int wordId) {
return POS.Tag.NNG;
}
@Override
public POS.Tag getRightPOS(int wordId) {
return POS.Tag.NNG;
}
@Override
public String getReading(int wordId) {
return null;
}
@Override
public Morpheme[] getMorphemes(int wordId, char[] surfaceForm, int off, int len) {
int[] segs = segmentations[wordId];
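    // a simple noun has no segmentation and therefore no morpheme decomposition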
if (segs == null) {
return null;
}
int offset = 0;
Morpheme[] morphemes = new Morpheme[segs.length];
for (int i = 0; i < segs.length; i++) {
morphemes[i] = new Morpheme(POS.Tag.NNG, new String(surfaceForm, off + offset, segs[i]));
offset += segs[i];
}
return morphemes;
}

  /**
   * Looks up user-dictionary entries in a region of text.
   *
   * @param chars text
   * @param off offset into the text
   * @param len length of the region to search
   * @return list of the wordIds of every entry that matches, starting at any position in the region
   */
public List<Integer> lookup(char[] chars, int off, int len) throws IOException {
List<Integer> result = new ArrayList<>();
final FST.BytesReader fstReader = fst.getBytesReader();
FST.Arc<Long> arc = new FST.Arc<>();
int end = off + len;
for (int startOffset = off; startOffset < end; startOffset++) {
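      // restart at the FST root and try to match entries beginning at this position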
arc = fst.getFirstArc(arc);
int output = 0;
int remaining = end - startOffset;
for (int i = 0; i < remaining; i++) {
int ch = chars[startOffset + i];
if (fst.findTargetArc(ch, arc, arc, i == 0, fstReader) == null) {
          break; // no further match from this start position; move on to the next one
}
output += arc.output().intValue();
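        // a final arc marks the end of a dictionary entry; its accumulated output is the wordId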
if (arc.isFinal()) {
final int finalOutput = output + arc.nextFinalOutput().intValue();
result.add(finalOutput);
}
}
}
return result;
}
}