| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.morfologik; |
| |
| |
| import java.io.IOException; |
| import java.util.TreeSet; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| import org.apache.lucene.analysis.CharArraySet; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.Tokenizer; |
| import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; |
| import org.apache.lucene.analysis.standard.StandardTokenizer; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| |
| /** |
| * TODO: The tests below rely on the order of returned lemmas, which is probably not good. |
| */ |
| public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase { |
| |
| private Analyzer getTestAnalyzer() { |
| return new MorfologikAnalyzer(); |
| } |
| |
| /** Test stemming of single tokens with Morfologik library. */ |
| public final void testSingleTokens() throws IOException { |
| Analyzer a = getTestAnalyzer(); |
| assertAnalyzesTo(a, "a", new String[] { "a" }); |
| assertAnalyzesTo(a, "liście", new String[] { "liście", "liść", "list", "lista" }); |
| assertAnalyzesTo(a, "danych", new String[] { "dany", "dana", "dane", "dać" }); |
| assertAnalyzesTo(a, "ęóąśłżźćń", new String[] { "ęóąśłżźćń" }); |
| a.close(); |
| } |
| |
| /** Test stemming of multiple tokens and proper term metrics. */ |
| public final void testMultipleTokens() throws IOException { |
| Analyzer a = getTestAnalyzer(); |
| assertAnalyzesTo( |
| a, |
| "liście danych", |
| new String[] { "liście", "liść", "list", "lista", "dany", "dana", "dane", "dać" }, |
| new int[] { 0, 0, 0, 0, 7, 7, 7, 7 }, |
| new int[] { 6, 6, 6, 6, 13, 13, 13, 13 }, |
| new int[] { 1, 0, 0, 0, 1, 0, 0, 0 }); |
| |
| assertAnalyzesTo( |
| a, |
| "T. Gl\u00FCcksberg", |
| new String[] { "tom", "tona", "Gl\u00FCcksberg" }, |
| new int[] { 0, 0, 3 }, |
| new int[] { 1, 1, 13 }, |
| new int[] { 1, 0, 1 }); |
| a.close(); |
| } |
| |
| @SuppressWarnings("unused") |
| private void dumpTokens(String input) throws IOException { |
| try (Analyzer a = getTestAnalyzer(); |
| TokenStream ts = a.tokenStream("dummy", input)) { |
| ts.reset(); |
| |
| MorphosyntacticTagsAttribute attribute = ts.getAttribute(MorphosyntacticTagsAttribute.class); |
| CharTermAttribute charTerm = ts.getAttribute(CharTermAttribute.class); |
| while (ts.incrementToken()) { |
| System.out.println(charTerm.toString() + " => " + attribute.getTags()); |
| } |
| ts.end(); |
| } |
| } |
| |
| /** Test reuse of MorfologikFilter with leftover stems. */ |
| public final void testLeftoverStems() throws IOException { |
| Analyzer a = getTestAnalyzer(); |
| try (TokenStream ts_1 = a.tokenStream("dummy", "liście")) { |
| CharTermAttribute termAtt_1 = ts_1.getAttribute(CharTermAttribute.class); |
| ts_1.reset(); |
| ts_1.incrementToken(); |
| assertEquals("first stream", "liście", termAtt_1.toString()); |
| ts_1.end(); |
| } |
| |
| try (TokenStream ts_2 = a.tokenStream("dummy", "danych")) { |
| CharTermAttribute termAtt_2 = ts_2.getAttribute(CharTermAttribute.class); |
| ts_2.reset(); |
| ts_2.incrementToken(); |
| assertEquals("second stream", "dany", termAtt_2.toString()); |
| ts_2.end(); |
| } |
| a.close(); |
| } |
| |
| /** Test stemming of mixed-case tokens. */ |
| public final void testCase() throws IOException { |
| Analyzer a = getTestAnalyzer(); |
| |
| assertAnalyzesTo(a, "AGD", new String[] { "AGD", "artykuły gospodarstwa domowego" }); |
| assertAnalyzesTo(a, "agd", new String[] { "artykuły gospodarstwa domowego" }); |
| |
| assertAnalyzesTo(a, "Poznania", new String[] { "Poznań" }); |
| assertAnalyzesTo(a, "poznania", new String[] { "poznanie", "poznać" }); |
| |
| assertAnalyzesTo(a, "Aarona", new String[] { "Aaron" }); |
| assertAnalyzesTo(a, "aarona", new String[] { "aarona" }); |
| |
| assertAnalyzesTo(a, "Liście", new String[] { "liście", "liść", "list", "lista" }); |
| a.close(); |
| } |
| |
| private void assertPOSToken(TokenStream ts, String term, String... tags) throws IOException { |
| ts.incrementToken(); |
| assertEquals(term, ts.getAttribute(CharTermAttribute.class).toString()); |
| |
| TreeSet<String> actual = new TreeSet<>(); |
| TreeSet<String> expected = new TreeSet<>(); |
| for (StringBuilder b : ts.getAttribute(MorphosyntacticTagsAttribute.class).getTags()) { |
| actual.add(b.toString()); |
| } |
| for (String s : tags) { |
| expected.add(s); |
| } |
| |
| if (!expected.equals(actual)) { |
| System.out.println("Expected:\n" + expected); |
| System.out.println("Actual:\n" + actual); |
| assertEquals(expected, actual); |
| } |
| } |
| |
| /** Test morphosyntactic annotations. */ |
| public final void testPOSAttribute() throws IOException { |
| try (Analyzer a = getTestAnalyzer(); |
| TokenStream ts = a.tokenStream("dummy", "liście")) { |
| ts.reset(); |
| assertPOSToken(ts, "liście", |
| "subst:sg:acc:n2", |
| "subst:sg:nom:n2", |
| "subst:sg:voc:n2"); |
| |
| assertPOSToken(ts, "liść", |
| "subst:pl:acc:m3", |
| "subst:pl:nom:m3", |
| "subst:pl:voc:m3"); |
| |
| assertPOSToken(ts, "list", |
| "subst:sg:loc:m3", |
| "subst:sg:voc:m3"); |
| |
| assertPOSToken(ts, "lista", |
| "subst:sg:dat:f", |
| "subst:sg:loc:f"); |
| ts.end(); |
| } |
| } |
| |
| /** */ |
| public final void testKeywordAttrTokens() throws IOException { |
| Analyzer a = new MorfologikAnalyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String field) { |
| final CharArraySet keywords = new CharArraySet(1, false); |
| keywords.add("liście"); |
| |
| final Tokenizer src = new StandardTokenizer(); |
| TokenStream result = new SetKeywordMarkerFilter(src, keywords); |
| result = new MorfologikFilter(result); |
| |
| return new TokenStreamComponents(src, result); |
| } |
| }; |
| |
| assertAnalyzesTo( |
| a, |
| "liście danych", |
| new String[] { "liście", "dany", "dana", "dane", "dać" }, |
| new int[] { 0, 7, 7, 7, 7 }, |
| new int[] { 6, 13, 13, 13, 13 }, |
| new int[] { 1, 1, 0, 0, 0 }); |
| a.close(); |
| } |
| |
| /** blast some random strings through the analyzer */ |
| public void testRandom() throws Exception { |
| Analyzer a = getTestAnalyzer(); |
| checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER); |
| a.close(); |
| } |
| } |