/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
 */

package org.apache.lucene.analysis.ko.dict;

import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.ResourceScheme;

import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.lucene.analysis.ko.POS;
import org.apache.lucene.analysis.ko.util.DictionaryBuilder;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.IntsRefFSTEnum;

/** Tests of TokenInfoDictionary build tools; run using ant test-tools */
public class TestTokenInfoDictionary extends LuceneTestCase {
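  /**
   * Builds a tiny two-entry dictionary and verifies that left ids, right ids, and word costs
   * round-trip through the binary format.
   */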
public void testPut() throws Exception {
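    // entries are mecab-ko-dic CSV rows: surface,leftId,rightId,wordCost,POS,<feature columns>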
TokenInfoDictionary dict =
newDictionary(
"명사,1,1,2,NNG,*,*,*,*,*,*,*",
// "large" id
"일반,5000,5000,3,NNG,*,*,*,*,*,*,*");
IntsRef wordIdRef = new IntsRefBuilder().get();
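    // source ids follow the sorted order of the surface forms, so 명사 is entry 0 and 일반 is entry 1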
dict.lookupWordIds(0, wordIdRef);
int wordId = wordIdRef.ints[wordIdRef.offset];
assertEquals(1, dict.getLeftId(wordId));
assertEquals(1, dict.getRightId(wordId));
assertEquals(2, dict.getWordCost(wordId));
dict.lookupWordIds(1, wordIdRef);
wordId = wordIdRef.ints[wordIdRef.offset];
assertEquals(5000, dict.getLeftId(wordId));
assertEquals(5000, dict.getRightId(wordId));
assertEquals(3, dict.getWordCost(wordId));
}
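  /**
   * Writes the given entries to test.csv, creates the remaining inputs the builder expects (empty
   * unk.def and char.def plus a minimal matrix.def), compiles everything with DictionaryBuilder,
   * and loads the resulting binary dictionary back from the temp directory.
   */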
private TokenInfoDictionary newDictionary(String... entries) throws Exception {
Path dir = createTempDir();
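    // write the dictionary entries as a UTF-8 encoded CSV source file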
try (OutputStream out = Files.newOutputStream(dir.resolve("test.csv"));
PrintWriter printer =
new PrintWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8))) {
for (String entry : entries) {
printer.println(entry);
}
}
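    // the builder expects unk.def and char.def to be present; empty files are enough for this test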
Files.createFile(dir.resolve("unk.def"));
Files.createFile(dir.resolve("char.def"));
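    // minimal connection-cost matrix: the "1 1" header declares a 1x1 matrix with no cost rows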
try (OutputStream out = Files.newOutputStream(dir.resolve("matrix.def"));
PrintWriter printer =
new PrintWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8))) {
printer.println("1 1");
}
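    // compile the CSV sources into the binary dictionary format, writing into the same directory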
DictionaryBuilder.build(dir, dir, "utf-8", true);
String dictionaryPath = TokenInfoDictionary.class.getName().replace('.', '/');
// We must also load the other files (in BinaryDictionary) from the correct path
return new TokenInfoDictionary(ResourceScheme.FILE, dir.resolve(dictionaryPath).toString());
}
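  /** Malformed entries must be rejected at build time. */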
public void testPutException() {
    // too few CSV columns (10 instead of the expected 12)
expectThrows(IllegalArgumentException.class, () -> newDictionary("HANGUL,1,1,1,NNG,*,*,*,*,*"));
// id too large
expectThrows(
IllegalArgumentException.class,
() -> newDictionary("HANGUL,8192,8192,1,NNG,*,*,*,*,*,*,*"));
  }

  /** Enumerates the entire FST/lookup data and does basic sanity checks. */
@Slow
public void testEnumerateAll() throws Exception {
// just for debugging
int numTerms = 0;
int numWords = 0;
int lastWordId = -1;
int lastSourceId = -1;
CharacterDefinition charDef = CharacterDefinition.getInstance();
TokenInfoDictionary tid = TokenInfoDictionary.getInstance();
ConnectionCosts matrix = ConnectionCosts.getInstance();
FST<Long> fst = tid.getFST().getInternalFST();
IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<>(fst);
IntsRefFSTEnum.InputOutput<Long> mapping;
IntsRef scratch = new IntsRef();
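    // enumerate every surface form in the FST; IntsRefFSTEnum walks the entries in sorted order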
while ((mapping = fstEnum.next()) != null) {
numTerms++;
IntsRef input = mapping.input;
char[] chars = new char[input.length];
for (int i = 0; i < chars.length; i++) {
chars[i] = (char) input.ints[input.offset + i];
}
String surfaceForm = new String(chars);
assertFalse(surfaceForm.isEmpty());
assertEquals(surfaceForm.trim(), surfaceForm);
assertTrue(UnicodeUtil.validUTF16String(surfaceForm));
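      // the FST output is the source id used to look up this surface form's word ids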
Long output = mapping.output;
int sourceId = output.intValue();
      // we walk the FST in sorted order, so terms, source ids, and word ids should always be increasing
assertTrue(sourceId > lastSourceId);
lastSourceId = sourceId;
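      // a single surface form can map to several word ids (homonym entries)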
tid.lookupWordIds(sourceId, scratch);
for (int i = 0; i < scratch.length; i++) {
numWords++;
int wordId = scratch.ints[scratch.offset + i];
assertTrue(wordId > lastWordId);
lastWordId = wordId;
int leftId = tid.getLeftId(wordId);
int rightId = tid.getRightId(wordId);
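        // exercise the cost lookups; they must not throw for any valid word id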
matrix.get(rightId, leftId);
tid.getWordCost(wordId);
POS.Type type = tid.getPOSType(wordId);
POS.Tag leftPOS = tid.getLeftPOS(wordId);
POS.Tag rightPOS = tid.getRightPOS(wordId);
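        // simple morphemes have a single POS, and hanja surface forms must carry a Hangul reading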
if (type == POS.Type.MORPHEME) {
assertSame(leftPOS, rightPOS);
String reading = tid.getReading(wordId);
boolean isHanja = charDef.isHanja(surfaceForm.charAt(0));
if (isHanja) {
assertNotNull(reading);
for (int j = 0; j < reading.length(); j++) {
assertTrue(charDef.isHangul(reading.charAt(j)));
}
}
if (reading != null) {
assertTrue(UnicodeUtil.validUTF16String(reading));
}
} else {
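          // compound and inflected entries may decompose into individual morphemes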
if (type == POS.Type.COMPOUND) {
assertSame(leftPOS, rightPOS);
            assertTrue(leftPOS == POS.Tag.NNG || leftPOS == POS.Tag.NNP);
}
Dictionary.Morpheme[] decompound = tid.getMorphemes(wordId, chars, 0, chars.length);
if (decompound != null) {
int offset = 0;
for (Dictionary.Morpheme morph : decompound) {
assertTrue(UnicodeUtil.validUTF16String(morph.surfaceForm));
assertFalse(morph.surfaceForm.isEmpty());
assertEquals(morph.surfaceForm.trim(), morph.surfaceForm);
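              // INFLECT surface forms need not concatenate back to the original surface form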
if (type != POS.Type.INFLECT) {
assertEquals(
morph.surfaceForm,
surfaceForm.substring(offset, offset + morph.surfaceForm.length()));
offset += morph.surfaceForm.length();
}
}
assertTrue(offset <= surfaceForm.length());
}
}
}
}
if (VERBOSE) {
System.out.println("checked " + numTerms + " terms, " + numWords + " words.");
}
}
}