blob: a9c11b8a8b93c723b80c25a5271031d1bfcdb6a4 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.morfologik;
import java.io.IOException;
import java.util.TreeSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
 * Tests for {@link MorfologikAnalyzer}: lemma output, offsets and position increments,
 * analyzer reuse, case handling, morphosyntactic tags and keyword-marked tokens.
 *
 * <p>TODO: The tests below rely on the order of returned lemmas, which is probably not good.
 */
public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {

  /** Creates the analyzer under test; callers are responsible for closing it. */
  private Analyzer getTestAnalyzer() {
    return new MorfologikAnalyzer();
  }

  /**
   * Creates an analyzer whose chain is StandardTokenizer, then a {@link SetKeywordMarkerFilter}
   * configured with the single (case-sensitive) keyword "liście", then {@link MorfologikFilter}.
   * Callers are responsible for closing it.
   */
  private Analyzer getKeywordMarkingAnalyzer() {
    return new MorfologikAnalyzer() {
      @Override
      protected TokenStreamComponents createComponents(String field) {
        final CharArraySet keywords = new CharArraySet(1, false);
        keywords.add("liście");

        final Tokenizer src = new StandardTokenizer();
        TokenStream result = new SetKeywordMarkerFilter(src, keywords);
        result = new MorfologikFilter(result);

        return new TokenStreamComponents(src, result);
      }
    };
  }

  /** Test stemming of single tokens with Morfologik library. */
  public final void testSingleTokens() throws IOException {
    // try-with-resources so the analyzer is released even when an assertion fails.
    try (Analyzer a = getTestAnalyzer()) {
      assertAnalyzesTo(a, "a", new String[] { "a" });
      assertAnalyzesTo(a, "liście", new String[] { "liście", "liść", "list", "lista" });
      assertAnalyzesTo(a, "danych", new String[] { "dany", "dana", "dane", "dać" });
      assertAnalyzesTo(a, "ęóąśłżźćń", new String[] { "ęóąśłżźćń" });
    }
  }

  /** Test stemming of multiple tokens and proper term metrics. */
  public final void testMultipleTokens() throws IOException {
    try (Analyzer a = getTestAnalyzer()) {
      // Arguments after the expected terms: start offsets, end offsets, position increments.
      assertAnalyzesTo(
          a,
          "liście danych",
          new String[] { "liście", "liść", "list", "lista", "dany", "dana", "dane", "dać" },
          new int[] { 0, 0, 0, 0, 7, 7, 7, 7 },
          new int[] { 6, 6, 6, 6, 13, 13, 13, 13 },
          new int[] { 1, 0, 0, 0, 1, 0, 0, 0 });

      assertAnalyzesTo(
          a,
          "T. Gl\u00FCcksberg",
          new String[] { "tom", "tona", "Gl\u00FCcksberg" },
          new int[] { 0, 0, 3 },
          new int[] { 1, 1, 13 },
          new int[] { 1, 0, 1 });
    }
  }

  /** Debugging aid: prints every term produced for {@code input} together with its tags. */
  @SuppressWarnings("unused")
  private void dumpTokens(String input) throws IOException {
    try (Analyzer a = getTestAnalyzer();
        TokenStream ts = a.tokenStream("dummy", input)) {
      ts.reset();
      MorphosyntacticTagsAttribute attribute = ts.getAttribute(MorphosyntacticTagsAttribute.class);
      CharTermAttribute charTerm = ts.getAttribute(CharTermAttribute.class);
      while (ts.incrementToken()) {
        System.out.println(charTerm.toString() + " => " + attribute.getTags());
      }
      ts.end();
    }
  }

  /** Test reuse of MorfologikFilter with leftover stems. */
  public final void testLeftoverStems() throws IOException {
    try (Analyzer a = getTestAnalyzer()) {
      // Deliberately consume only the first token so the remaining lemmas are left unread
      // when the stream is ended and closed.
      try (TokenStream first = a.tokenStream("dummy", "liście")) {
        CharTermAttribute firstTerm = first.getAttribute(CharTermAttribute.class);
        first.reset();
        assertTrue("first stream produced no token", first.incrementToken());
        assertEquals("first stream", "liście", firstTerm.toString());
        first.end();
      }

      // A second stream from the same analyzer must start with the lemmas of its own input.
      try (TokenStream second = a.tokenStream("dummy", "danych")) {
        CharTermAttribute secondTerm = second.getAttribute(CharTermAttribute.class);
        second.reset();
        assertTrue("second stream produced no token", second.incrementToken());
        assertEquals("second stream", "dany", secondTerm.toString());
        second.end();
      }
    }
  }

  /** Test stemming of mixed-case tokens. */
  public final void testCase() throws IOException {
    try (Analyzer a = getTestAnalyzer()) {
      assertAnalyzesTo(a, "AGD", new String[] { "AGD", "artykuły gospodarstwa domowego" });
      assertAnalyzesTo(a, "agd", new String[] { "artykuły gospodarstwa domowego" });

      assertAnalyzesTo(a, "Poznania", new String[] { "Poznań" });
      assertAnalyzesTo(a, "poznania", new String[] { "poznanie", "poznać" });

      assertAnalyzesTo(a, "Aarona", new String[] { "Aaron" });
      assertAnalyzesTo(a, "aarona", new String[] { "aarona" });

      assertAnalyzesTo(a, "Liście", new String[] { "liście", "liść", "list", "lista" });
    }
  }

  /**
   * Advances {@code ts} by one token and checks both its term text and its set of
   * morphosyntactic tags. Tags are compared as sorted sets, so their order is irrelevant.
   *
   * @param ts a token stream that has already been reset
   * @param term the expected term text of the next token
   * @param tags the expected tags of the next token
   * @throws IOException if the stream fails while advancing
   */
  private void assertPOSToken(TokenStream ts, String term, String... tags) throws IOException {
    assertTrue("stream exhausted before expected term: " + term, ts.incrementToken());
    assertEquals(term, ts.getAttribute(CharTermAttribute.class).toString());

    TreeSet<String> actual = new TreeSet<>();
    for (StringBuilder tag : ts.getAttribute(MorphosyntacticTagsAttribute.class).getTags()) {
      actual.add(tag.toString());
    }

    TreeSet<String> expected = new TreeSet<>();
    for (String tag : tags) {
      expected.add(tag);
    }

    // The assertion failure message already carries both sets; no separate printing needed.
    assertEquals("morphosyntactic tags of term: " + term, expected, actual);
  }

  /** Test morphosyntactic annotations. */
  public final void testPOSAttribute() throws IOException {
    try (Analyzer a = getTestAnalyzer();
        TokenStream ts = a.tokenStream("dummy", "liście")) {
      ts.reset();
      assertPOSToken(ts, "liście",
          "subst:sg:acc:n2",
          "subst:sg:nom:n2",
          "subst:sg:voc:n2");

      assertPOSToken(ts, "liść",
          "subst:pl:acc:m3",
          "subst:pl:nom:m3",
          "subst:pl:voc:m3");

      assertPOSToken(ts, "list",
          "subst:sg:loc:m3",
          "subst:sg:voc:m3");

      assertPOSToken(ts, "lista",
          "subst:sg:dat:f",
          "subst:sg:loc:f");
      ts.end();
    }
  }

  /**
   * Test that a token marked as a keyword ("liście") is expected to come out as a single,
   * unchanged term, while the unmarked token ("danych") is still expanded into its lemmas.
   */
  public final void testKeywordAttrTokens() throws IOException {
    try (Analyzer a = getKeywordMarkingAnalyzer()) {
      assertAnalyzesTo(
          a,
          "liście danych",
          new String[] { "liście", "dany", "dana", "dane", "dać" },
          new int[] { 0, 7, 7, 7, 7 },
          new int[] { 6, 13, 13, 13, 13 },
          new int[] { 1, 1, 0, 0, 0 });
    }
  }

  /** blast some random strings through the analyzer */
  public void testRandom() throws Exception {
    try (Analyzer a = getTestAnalyzer()) {
      checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER);
    }
  }
}