| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.morfologik; |
| |
| import java.io.IOException; |
| import java.util.TreeSet; |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| import org.apache.lucene.analysis.CharArraySet; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.Tokenizer; |
| import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; |
| import org.apache.lucene.analysis.standard.StandardTokenizer; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| |
| /** TODO: The tests below rely on the order of returned lemmas, which is probably not good. */ |
| public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase { |
| |
| private Analyzer getTestAnalyzer() { |
| return new MorfologikAnalyzer(); |
| } |
| |
| /** Test stemming of single tokens with Morfologik library. */ |
| public final void testSingleTokens() throws IOException { |
| Analyzer a = getTestAnalyzer(); |
| assertAnalyzesTo(a, "a", new String[] {"a"}); |
| assertAnalyzesTo(a, "liście", new String[] {"liście", "liść", "list", "lista"}); |
| assertAnalyzesTo(a, "danych", new String[] {"dany", "dana", "dane", "dać"}); |
| assertAnalyzesTo(a, "ęóąśłżźćń", new String[] {"ęóąśłżźćń"}); |
| a.close(); |
| } |
| |
| /** Test stemming of multiple tokens and proper term metrics. */ |
| public final void testMultipleTokens() throws IOException { |
| Analyzer a = getTestAnalyzer(); |
| assertAnalyzesTo( |
| a, |
| "liście danych", |
| new String[] {"liście", "liść", "list", "lista", "dany", "dana", "dane", "dać"}, |
| new int[] {0, 0, 0, 0, 7, 7, 7, 7}, |
| new int[] {6, 6, 6, 6, 13, 13, 13, 13}, |
| new int[] {1, 0, 0, 0, 1, 0, 0, 0}); |
| |
| assertAnalyzesTo( |
| a, |
| "T. Gl\u00FCcksberg", |
| new String[] {"tom", "tona", "Gl\u00FCcksberg"}, |
| new int[] {0, 0, 3}, |
| new int[] {1, 1, 13}, |
| new int[] {1, 0, 1}); |
| a.close(); |
| } |
| |
| @SuppressWarnings("unused") |
| private void dumpTokens(String input) throws IOException { |
| try (Analyzer a = getTestAnalyzer(); |
| TokenStream ts = a.tokenStream("dummy", input)) { |
| ts.reset(); |
| |
| MorphosyntacticTagsAttribute attribute = ts.getAttribute(MorphosyntacticTagsAttribute.class); |
| CharTermAttribute charTerm = ts.getAttribute(CharTermAttribute.class); |
| while (ts.incrementToken()) { |
| System.out.println(charTerm.toString() + " => " + attribute.getTags()); |
| } |
| ts.end(); |
| } |
| } |
| |
| /** Test reuse of MorfologikFilter with leftover stems. */ |
| public final void testLeftoverStems() throws IOException { |
| Analyzer a = getTestAnalyzer(); |
| try (TokenStream ts_1 = a.tokenStream("dummy", "liście")) { |
| CharTermAttribute termAtt_1 = ts_1.getAttribute(CharTermAttribute.class); |
| ts_1.reset(); |
| ts_1.incrementToken(); |
| assertEquals("first stream", "liście", termAtt_1.toString()); |
| ts_1.end(); |
| } |
| |
| try (TokenStream ts_2 = a.tokenStream("dummy", "danych")) { |
| CharTermAttribute termAtt_2 = ts_2.getAttribute(CharTermAttribute.class); |
| ts_2.reset(); |
| ts_2.incrementToken(); |
| assertEquals("second stream", "dany", termAtt_2.toString()); |
| ts_2.end(); |
| } |
| a.close(); |
| } |
| |
| /** Test stemming of mixed-case tokens. */ |
| public final void testCase() throws IOException { |
| Analyzer a = getTestAnalyzer(); |
| |
| assertAnalyzesTo(a, "AGD", new String[] {"AGD", "artykuły gospodarstwa domowego"}); |
| assertAnalyzesTo(a, "agd", new String[] {"artykuły gospodarstwa domowego"}); |
| |
| assertAnalyzesTo(a, "Poznania", new String[] {"Poznań"}); |
| assertAnalyzesTo(a, "poznania", new String[] {"poznanie", "poznać"}); |
| |
| assertAnalyzesTo(a, "Aarona", new String[] {"Aaron"}); |
| assertAnalyzesTo(a, "aarona", new String[] {"aarona"}); |
| |
| assertAnalyzesTo(a, "Liście", new String[] {"liście", "liść", "list", "lista"}); |
| a.close(); |
| } |
| |
| private void assertPOSToken(TokenStream ts, String term, String... tags) throws IOException { |
| ts.incrementToken(); |
| assertEquals(term, ts.getAttribute(CharTermAttribute.class).toString()); |
| |
| TreeSet<String> actual = new TreeSet<>(); |
| TreeSet<String> expected = new TreeSet<>(); |
| for (StringBuilder b : ts.getAttribute(MorphosyntacticTagsAttribute.class).getTags()) { |
| actual.add(b.toString()); |
| } |
| for (String s : tags) { |
| expected.add(s); |
| } |
| |
| if (!expected.equals(actual)) { |
| System.out.println("Expected:\n" + expected); |
| System.out.println("Actual:\n" + actual); |
| assertEquals(expected, actual); |
| } |
| } |
| |
| /** Test morphosyntactic annotations. */ |
| public final void testPOSAttribute() throws IOException { |
| try (Analyzer a = getTestAnalyzer(); |
| TokenStream ts = a.tokenStream("dummy", "liście")) { |
| ts.reset(); |
| assertPOSToken(ts, "liście", "subst:sg:acc:n2", "subst:sg:nom:n2", "subst:sg:voc:n2"); |
| assertPOSToken(ts, "liść", "subst:pl:acc:m3", "subst:pl:nom:m3", "subst:pl:voc:m3"); |
| assertPOSToken(ts, "list", "subst:sg:loc:m3", "subst:sg:voc:m3"); |
| assertPOSToken(ts, "lista", "subst:sg:dat:f", "subst:sg:loc:f"); |
| ts.end(); |
| } |
| } |
| |
| /** */ |
| public final void testKeywordAttrTokens() throws IOException { |
| Analyzer a = |
| new MorfologikAnalyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String field) { |
| final CharArraySet keywords = new CharArraySet(1, false); |
| keywords.add("liście"); |
| |
| final Tokenizer src = new StandardTokenizer(); |
| TokenStream result = new SetKeywordMarkerFilter(src, keywords); |
| result = new MorfologikFilter(result); |
| |
| return new TokenStreamComponents(src, result); |
| } |
| }; |
| |
| assertAnalyzesTo( |
| a, |
| "liście danych", |
| new String[] {"liście", "dany", "dana", "dane", "dać"}, |
| new int[] {0, 7, 7, 7, 7}, |
| new int[] {6, 13, 13, 13, 13}, |
| new int[] {1, 1, 0, 0, 0}); |
| a.close(); |
| } |
| |
| /** blast some random strings through the analyzer */ |
| public void testRandom() throws Exception { |
| Analyzer a = getTestAnalyzer(); |
| checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER); |
| a.close(); |
| } |
| } |