blob: 1b64c3a9d3199391a85e80246a6907dcd9b3901e [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import java.io.ByteArrayInputStream;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.Arrays;
import java.util.Collections;
import java.util.TreeMap;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
public class TestDictionary extends LuceneTestCase {
/**
 * Loads the "simple" dictionary and checks the sizes of one suffix and one prefix lookup, then
 * verifies that two known words each resolve to a single entry carrying a single flag.
 */
public void testSimpleDictionary() throws Exception {
  Dictionary dict = loadDictionary("simple.aff", "simple.dic");
  assertEquals(3, dict.lookupSuffix("e".toCharArray()).length);
  assertEquals(1, dict.lookupPrefix("s".toCharArray()).length);

  char[] olr = "olr".toCharArray();
  IntsRef olrEntries = dict.lookupWord(olr, 0, olr.length);
  assertNotNull(olrEntries);
  assertEquals(1, olrEntries.length);
  assertEquals('B', assertSingleFlag(dict, olrEntries));

  // Put the word behind a random amount of padding so a possibly non-zero offset is exercised.
  int padding = random().nextInt(10);
  char[] padded = (" ".repeat(padding) + "lucen").toCharArray();
  IntsRef lucenEntries = dict.lookupWord(padded, padding, 5);
  assertNotNull(lucenEntries);
  assertEquals(1, lucenEntries.length);
  assertEquals('A', assertSingleFlag(dict, lucenEntries));
}
/**
 * Asserts that the first entry referenced by {@code ordList} has exactly one flag and returns
 * that flag.
 *
 * @param dictionary dictionary whose {@code flagLookup} maps an entry id to its flags
 * @param ordList result of a word lookup; a {@code null} value fails the assertion
 * @return the only flag of the first entry
 */
private static char assertSingleFlag(Dictionary dictionary, IntsRef ordList) {
  // Report a missing word as an assertion failure rather than a NullPointerException.
  assertNotNull(ordList);
  // The first valid element of an IntsRef is located at 'offset', which need not be 0.
  int entryId = ordList.ints[ordList.offset];
  char[] flags = dictionary.flagLookup.getFlags(entryId);
  assertEquals(1, flags.length);
  return flags[0];
}
/**
 * Loads "compressed.aff" with "compressed.dic" and checks one suffix lookup, one prefix lookup
 * and that the word "olr" has a single flag.
 */
public void testCompressedDictionary() throws Exception {
  Dictionary dict = loadDictionary("compressed.aff", "compressed.dic");
  assertEquals(3, dict.lookupSuffix("e".toCharArray()).length);
  assertEquals(1, dict.lookupPrefix("s".toCharArray()).length);

  char[] word = "olr".toCharArray();
  assertSingleFlag(dict, dict.lookupWord(word, 0, word.length));
}
/**
 * Loads "compressed-before-set.aff" with "compressed.dic" and checks one suffix lookup, one
 * prefix lookup and that the word "olr" has a single flag.
 */
public void testCompressedBeforeSetDictionary() throws Exception {
  Dictionary dict = loadDictionary("compressed-before-set.aff", "compressed.dic");

  int suffixMatches = dict.lookupSuffix("e".toCharArray()).length;
  assertEquals(3, suffixMatches);
  int prefixMatches = dict.lookupPrefix("s".toCharArray()).length;
  assertEquals(1, prefixMatches);

  IntsRef olrEntries = dict.lookupWord("olr".toCharArray(), 0, 3);
  assertSingleFlag(dict, olrEntries);
}
/**
 * Loads "compressed-empty-alias.aff" with "compressed.dic" and checks one suffix lookup, one
 * prefix lookup and that the word "olr" has a single flag.
 */
public void testCompressedEmptyAliasDictionary() throws Exception {
  Dictionary dict = loadDictionary("compressed-empty-alias.aff", "compressed.dic");
  char[] suffix = {'e'};
  char[] prefix = {'s'};
  char[] word = {'o', 'l', 'r'};

  assertEquals(3, dict.lookupSuffix(suffix).length);
  assertEquals(1, dict.lookupPrefix(prefix).length);
  assertSingleFlag(dict, dict.lookupWord(word, 0, word.length));
}
/**
 * A malformed rule in "broken.aff" makes loading fail with a {@link ParseException} whose
 * message starts with "Invalid syntax" and whose error offset is 24.
 */
public void testInvalidData() {
  ParseException failure =
      expectThrows(ParseException.class, () -> loadDictionary("broken.aff", "simple.dic"));
  String message = failure.getMessage();
  assertTrue(message.startsWith("Invalid syntax"));
  assertEquals(24, failure.getErrorOffset());
}
/**
 * Builds a dictionary from an affix file in which {@code KEEPCASE 42} appears before the
 * {@code FLAG num} directive and checks that the keepcase flag is nevertheless read as 42.
 *
 * @throws IOException if reading the in-memory data or closing the directory fails
 * @throws ParseException if the affix data is rejected
 */
public void testUsingFlagsBeforeFlagDirective() throws IOException, ParseException {
  byte[] aff = "KEEPCASE 42\nFLAG num".getBytes(StandardCharsets.UTF_8);
  byte[] dic = "1\nfoo/42".getBytes(StandardCharsets.UTF_8);
  // The directory is only needed while the dictionary is being built; release it afterwards,
  // also when construction or the assertion fails.
  try (Directory tempDir = new ByteBuffersDirectory()) {
    Dictionary dictionary =
        new Dictionary(
            tempDir, "", new ByteArrayInputStream(aff), new ByteArrayInputStream(dic));
    assertEquals(42, dictionary.keepcase);
  }
}
/**
 * Loads the "forgivable-errors" dictionary and checks its replacement table size and compound
 * maximum, then loads two further affix files that only have to load without an exception.
 */
public void testForgivableErrors() throws Exception {
  Dictionary lenient = loadDictionary("forgivable-errors.aff", "forgivable-errors.dic");
  assertEquals(1, lenient.repTable.size());
  assertEquals(2, lenient.compoundMax);

  // No assertions on these: completing the load is the whole check.
  String[] loadOnly = {"forgivable-errors-long.aff", "forgivable-errors-num.aff"};
  for (String affixResource : loadOnly) {
    loadDictionary(affixResource, "single-word.dic");
  }
}
/**
 * Builds a {@link Dictionary} from two resources looked up through this test class. Both
 * streams and the temporary directory are closed before the dictionary is handed back.
 *
 * @param aff resource name of the affix file
 * @param dic resource name of the word list
 * @return the dictionary built from the two resources
 */
private Dictionary loadDictionary(String aff, String dic) throws IOException, ParseException {
  Class<?> owner = getClass();
  try (InputStream affixIn = owner.getResourceAsStream(aff);
      InputStream wordsIn = owner.getResourceAsStream(dic);
      Directory scratchDir = getDirectory()) {
    Dictionary loaded = new Dictionary(scratchDir, "dictionary", affixIn, wordsIn);
    return loaded;
  }
}
/** Stream wrapper that remembers whether {@link #close()} has been invoked on it. */
private static class CloseCheckInputStream extends FilterInputStream {
  /** Becomes {@code true} on the first call to {@link #close()} and never reverts. */
  private boolean closeCalled;

  public CloseCheckInputStream(InputStream delegate) {
    super(delegate);
  }

  /** Records the call first, then closes the wrapped stream. */
  @Override
  public void close() throws IOException {
    closeCalled = true;
    super.close();
  }

  /** Returns whether {@link #close()} has been called on this wrapper. */
  public boolean isClosed() {
    return closeCalled;
  }
}
/**
 * Verifies that constructing a {@link Dictionary} does not close the affix and dictionary
 * streams handed to it: both must still be open after construction and are only reported as
 * closed once this test closes them itself.
 */
public void testResourceCleanup() throws Exception {
  CloseCheckInputStream affixStream =
      new CloseCheckInputStream(getClass().getResourceAsStream("compressed.aff"));
  CloseCheckInputStream dictStream =
      new CloseCheckInputStream(getClass().getResourceAsStream("compressed.dic"));
  // try-with-resources releases both streams and the directory even when the constructor
  // throws or one of the assertions below fails.
  try (affixStream;
      dictStream;
      Directory tempDir = getDirectory()) {
    new Dictionary(tempDir, "dictionary", affixStream, dictStream);
    assertFalse(affixStream.isClosed());
    assertFalse(dictStream.isClosed());
  }
  assertTrue(affixStream.isClosed());
  assertTrue(dictStream.isClosed());
}
/**
 * Feeds several inputs through a {@link ConvTable} built from four mappings and compares each
 * result with the expected text.
 */
public void testReplacements() {
  TreeMap<String, String> mappings = new TreeMap<>();
  mappings.put("a", "b");
  mappings.put("ab", "c");
  mappings.put("c", "de");
  mappings.put("def", "gh");
  ConvTable conv = new ConvTable(mappings);

  // Each row is {input, expected text after applyMappings}.
  String[][] cases = {
    {"atestanother", "btestbnother"},
    {"abtestanother", "ctestbnother"},
    {"atestabnother", "btestcnother"},
    {"abtestabnother", "ctestcnother"},
    {"abtestabcnother", "ctestcdenother"},
    {"defdefdefc", "ghghghde"},
  };
  for (String[] testCase : cases) {
    StringBuilder buffer = new StringBuilder(testCase[0]);
    conv.applyMappings(buffer);
    assertEquals(testCase[1], buffer.toString());
  }
}
/**
 * Checks that a {@code SET UTF-8} line is recognised with tab or tab-plus-space separators,
 * with a leading BOM and with CRLF line endings, and that an empty affix file yields the
 * default charset.
 */
public void testSetWithCrazyWhitespaceAndBOMs() throws Exception {
  String[] utf8Headers = {
    "SET\tUTF-8\n", "SET\t UTF-8\n", "\uFEFFSET\tUTF-8\n", "\uFEFFSET\tUTF-8\r\n",
  };
  for (String header : utf8Headers) {
    assertEquals("UTF-8", getDictionaryEncoding(header));
  }

  String defaultName = Dictionary.DEFAULT_CHARSET.name();
  assertEquals(defaultName, getDictionaryEncoding(""));
}
/**
 * Builds a dictionary from the given affix file content and a fixed one-word word list, and
 * returns the name of the charset used by the dictionary's decoder.
 *
 * @param affFile complete text of the affix file; it is passed to the dictionary as UTF-8 bytes
 * @return the charset name reported by {@code dictionary.decoder}
 * @throws IOException if reading the in-memory data or closing the directory fails
 * @throws ParseException if the affix data is rejected
 */
private static String getDictionaryEncoding(String affFile) throws IOException, ParseException {
  // The directory is only needed during construction; close it instead of abandoning it.
  try (Directory tempDir = new ByteBuffersDirectory()) {
    Dictionary dictionary =
        new Dictionary(
            tempDir,
            "",
            new ByteArrayInputStream(affFile.getBytes(StandardCharsets.UTF_8)),
            new ByteArrayInputStream("1\nmock".getBytes(StandardCharsets.UTF_8)));
    return dictionary.decoder.charset().name();
  }
}
/** A {@code FLAG UTF-8} line must yield a parsing strategy with a tab or a space separator. */
public void testFlagWithCrazyWhitespace() {
  String[] flagLines = {"FLAG\tUTF-8", "FLAG UTF-8"};
  for (String flagLine : flagLines) {
    assertNotNull(Dictionary.getFlagParsingStrategy(flagLine, StandardCharsets.UTF_8));
  }
}
/**
 * Takes a Cyrillic string whose UTF-8 bytes were decoded with {@code Dictionary.DEFAULT_CHARSET}
 * and checks that the {@code FLAG UTF-8} strategy parses it back into the original text.
 */
@Test
public void testUtf8Flag() {
  Dictionary.FlagParsingStrategy utf8Flags =
      Dictionary.getFlagParsingStrategy("FLAG\tUTF-8", Dictionary.DEFAULT_CHARSET);

  String original = "привет";
  byte[] utf8Bytes = original.getBytes(StandardCharsets.UTF_8);
  String misdecoded = new String(utf8Bytes, Dictionary.DEFAULT_CHARSET);
  // Guard: decoding with the default charset must actually have changed the text.
  assertNotEquals(original, misdecoded);

  String roundTripped = new String(utf8Flags.parseFlags(misdecoded));
  assertEquals(original, roundTripped);
}
/**
 * Loads the "morphdata" dictionary and checks entry lookup, morphological values by key, and
 * the raw morphological data strings of several words.
 */
@Test
public void testCustomMorphologicalData() throws IOException, ParseException {
  Dictionary dict = loadDictionary("morphdata.aff", "morphdata.dic");
  assertNull(dict.lookupEntries("nonexistent"));

  DictEntries nounEntries = dict.lookupEntries("simplenoun");
  assertEquals(1, nounEntries.size());
  assertEquals(Collections.emptyList(), nounEntries.getMorphologicalValues(0, "aa:"));
  assertEquals(Collections.singletonList("42"), nounEntries.getMorphologicalValues(0, "fr:"));

  // Gather the data of the first three "lay" entries and sort it so that the comparison does
  // not depend on the order of the entries.
  DictEntries layEntries = dict.lookupEntries("lay");
  String[] layData = new String[3];
  for (int i = 0; i < layData.length; i++) {
    layData[i] = layEntries.getMorphologicalData(i);
  }
  Arrays.sort(layData);
  assertEquals(
      "is:past_2 po:verb st:lie; is:present po:verb; po:noun", String.join("; ", layData));

  DictEntries singEntries = dict.lookupEntries("sing");
  assertEquals(1, singEntries.size());
  assertEquals(Arrays.asList("sang", "sung"), singEntries.getMorphologicalValues(0, "al:"));

  String unsupported1 = dict.lookupEntries("unsupported1").getMorphologicalData(0);
  assertEquals("al:abaléar po:verbo ts:transitiva", unsupported1);

  String unsupported2 = dict.lookupEntries("unsupported2").getMorphologicalData(0);
  assertEquals("", unsupported2);
}
/** Returns the directory produced by {@code newDirectory()}; the caller has to close it. */
private Directory getDirectory() {
  Directory created = newDirectory();
  return created;
}
}