// blob: ad8b8765bf948f57a66583c911cc84bd9dd79f52 [file] [log] [blame]
// Lucene version compatibility level 8.2.0
using J2N.Text;
using Lucene.Net.Analysis.Miscellaneous;
using Lucene.Net.Analysis.Morfologik.TokenAttributes;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Support;
using NUnit.Framework;
using System;
using System.IO;
using System.Text;
using JCG = J2N.Collections.Generic;
using SCG = System.Collections.Generic;
namespace Lucene.Net.Analysis.Morfologik
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// TODO: The tests below rely on the order of returned lemmas, which is probably not good.
/// </summary>
public class TestMorfologikAnalyzer : BaseTokenStreamTestCase
{
/// <summary>
/// Builds the analyzer under test: a <see cref="MorfologikAnalyzer"/> constructed with
/// <c>TEST_VERSION_CURRENT</c>. Every call creates a fresh instance; the callers in this
/// class dispose it when they are done.
/// </summary>
private Analyzer getTestAnalyzer() => new MorfologikAnalyzer(TEST_VERSION_CURRENT);
/// <summary>
/// Test stemming of single tokens with Morfologik library.
/// </summary>
[Test]
public void TestSingleTokens()
{
    // The using block guarantees the analyzer is disposed even when an assertion throws.
    using (Analyzer a = getTestAnalyzer())
    {
        AssertAnalyzesTo(a, "a", new String[] { "a" });
        AssertAnalyzesTo(a, "liście", new String[] { "liście", "liść", "list", "lista" });
        AssertAnalyzesTo(a, "danych", new String[] { "dany", "dana", "dane", "dać" });
        AssertAnalyzesTo(a, "ęóąśłżźćń", new String[] { "ęóąśłżźćń" });
    }
}
/// <summary>
/// Test stemming of multiple tokens and proper term metrics.
/// </summary>
[Test]
public void TestMultipleTokens()
{
    // The using block guarantees the analyzer is disposed even when an assertion throws.
    using (Analyzer a = getTestAnalyzer())
    {
        // NOTE(review): the three int arrays look like start offsets, end offsets and
        // position increments (all lemmas of one input word share offsets and only the
        // first one advances the position) - confirm against the AssertAnalyzesTo overload.
        AssertAnalyzesTo(
            a,
            "liście danych",
            new String[] { "liście", "liść", "list", "lista", "dany", "dana", "dane", "dać" },
            new int[] { 0, 0, 0, 0, 7, 7, 7, 7 },
            new int[] { 6, 6, 6, 6, 13, 13, 13, 13 },
            new int[] { 1, 0, 0, 0, 1, 0, 0, 0 });

        AssertAnalyzesTo(
            a,
            "T. Gl\u00FCcksberg",
            new String[] { "tom", "tona", "Gl\u00FCcksberg" },
            new int[] { 0, 0, 3 },
            new int[] { 1, 1, 13 },
            new int[] { 1, 0, 1 });
    }
}
/// <summary>
/// Debugging helper: analyzes <paramref name="input"/> with a fresh test analyzer and
/// writes one console line per token in the form <c>term =&gt; tags</c>.
/// </summary>
/// <param name="input">Text to run through the analyzer.</param>
private void dumpTokens(String input)
{
    using (Analyzer analyzer = getTestAnalyzer())
    {
        using (TokenStream stream = analyzer.GetTokenStream("dummy", input))
        {
            stream.Reset();

            IMorphosyntacticTagsAttribute tagsAtt = stream.GetAttribute<IMorphosyntacticTagsAttribute>();
            ICharTermAttribute termAtt = stream.GetAttribute<ICharTermAttribute>();

            while (stream.IncrementToken())
            {
                string termText = termAtt.ToString();
                // Format the tag collection through StringFormatter rather than calling ToString() on it.
                string tagText = string.Format(StringFormatter.InvariantCulture, "{0}", tagsAtt.Tags);
                Console.WriteLine(termText + " => " + tagText);
            }

            stream.End();
        }
    }
}
/// <summary>
/// Test reuse of MorfologikFilter with leftover stems.
/// </summary>
[Test]
public void TestLeftoverStems()
{
    // The using block guarantees the analyzer is disposed even when an assertion throws.
    using (Analyzer a = getTestAnalyzer())
    {
        // Read only the first token, leaving any further stems of this input unconsumed.
        using (TokenStream ts_1 = a.GetTokenStream("dummy", "liście"))
        {
            ICharTermAttribute termAtt_1 = ts_1.GetAttribute<ICharTermAttribute>();
            ts_1.Reset();
            ts_1.IncrementToken();
            assertEquals("first stream", "liście", termAtt_1.ToString());
            ts_1.End();
        }

        // A second stream from the same analyzer must start with the first stem of its own input.
        using (TokenStream ts_2 = a.GetTokenStream("dummy", "danych"))
        {
            ICharTermAttribute termAtt_2 = ts_2.GetAttribute<ICharTermAttribute>();
            ts_2.Reset();
            ts_2.IncrementToken();
            assertEquals("second stream", "dany", termAtt_2.ToString());
            ts_2.End();
        }
    }
}
/// <summary>
/// Test stemming of mixed-case tokens.
/// </summary>
[Test]
public void TestCase()
{
    // The using block guarantees the analyzer is disposed even when an assertion throws.
    using (Analyzer a = getTestAnalyzer())
    {
        AssertAnalyzesTo(a, "AGD", new String[] { "AGD", "artykuły gospodarstwa domowego" });
        AssertAnalyzesTo(a, "agd", new String[] { "artykuły gospodarstwa domowego" });
        AssertAnalyzesTo(a, "Poznania", new String[] { "Poznań" });
        AssertAnalyzesTo(a, "poznania", new String[] { "poznanie", "poznać" });
        AssertAnalyzesTo(a, "Aarona", new String[] { "Aaron" });
        AssertAnalyzesTo(a, "aarona", new String[] { "aarona" });
        AssertAnalyzesTo(a, "Liście", new String[] { "liście", "liść", "list", "lista" });
    }
}
/// <summary>
/// Advances <paramref name="ts"/> by one token, then asserts that the token's term text
/// equals <paramref name="term"/> and that its morphosyntactic tags, compared as ordinal
/// sorted sets, equal <paramref name="tags"/>.
/// </summary>
/// <param name="ts">Token stream positioned before the token to check.</param>
/// <param name="term">Expected term text of the next token.</param>
/// <param name="tags">Expected tags of the next token; order and duplicates are irrelevant.</param>
private void assertPOSToken(TokenStream ts, String term, params String[] tags)
{
    ts.IncrementToken();

    ICharTermAttribute termAtt = ts.GetAttribute<ICharTermAttribute>();
    assertEquals(term, termAtt.ToString());

    SCG.ISet<String> actualTags = new JCG.SortedSet<String>(StringComparer.Ordinal);
    SCG.ISet<String> expectedTags = new JCG.SortedSet<String>(StringComparer.Ordinal);

    IMorphosyntacticTagsAttribute tagsAtt = ts.GetAttribute<IMorphosyntacticTagsAttribute>();
    foreach (StringBuilder tagBuffer in tagsAtt.Tags)
    {
        actualTags.Add(tagBuffer.ToString());
    }

    foreach (String expectedTag in tags)
    {
        expectedTags.Add(expectedTag);
    }

    // LUCENENET: the extra equality pre-check that printed both sets to the console
    // before asserting was dropped as unnecessary; the assertion alone is kept.
    assertEquals(expectedTags, actualTags, aggressive: false);
}
/// <summary>
/// Test morphosyntactic annotations.
/// </summary>
[Test]
public void TestPOSAttribute()
{
    using (Analyzer analyzer = getTestAnalyzer())
    {
        using (TokenStream stream = analyzer.GetTokenStream("dummy", "liście"))
        {
            stream.Reset();

            // One check per token emitted for the input, each with its expected tag set.
            assertPOSToken(stream, "liście", "subst:sg:acc:n2", "subst:sg:nom:n2", "subst:sg:voc:n2");
            assertPOSToken(stream, "liść", "subst:pl:acc:m3", "subst:pl:nom:m3", "subst:pl:voc:m3");
            assertPOSToken(stream, "list", "subst:sg:loc:m3", "subst:sg:voc:m3");
            assertPOSToken(stream, "lista", "subst:sg:dat:f", "subst:sg:loc:f");

            stream.End();
        }
    }
}
/// <summary>
/// A <see cref="MorfologikAnalyzer"/> whose token chain places a
/// <see cref="SetKeywordMarkerFilter"/> (with the single keyword "liście") between the
/// <see cref="StandardTokenizer"/> and the <see cref="MorfologikFilter"/>.
/// </summary>
private class MockMorfologikAnalyzer : MorfologikAnalyzer
{
    public MockMorfologikAnalyzer()
        : base(TEST_VERSION_CURRENT)
    { }

    /// <summary>
    /// Builds the chain: StandardTokenizer -> SetKeywordMarkerFilter -> MorfologikFilter.
    /// </summary>
    protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
    {
        // Case-sensitive keyword set (ignoreCase: false) holding the one protected term.
        CharArraySet keywords = new CharArraySet(TEST_VERSION_CURRENT, 1, false);
        keywords.Add("liście");

        Tokenizer src = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
        TokenStream result = new SetKeywordMarkerFilter(src, keywords);
        result = new MorfologikFilter(result);
        return new TokenStreamComponents(src, result);
    }
}
/// <summary>
/// Test that a token marked as a keyword ("liście" in <see cref="MockMorfologikAnalyzer"/>)
/// is emitted unchanged as a single token, while the non-keyword "danych" is still
/// expanded into its lemmas.
/// </summary>
[Test]
public void TestKeywordAttrTokens()
{
    // The using block guarantees the analyzer is disposed even when an assertion throws.
    using (Analyzer a = new MockMorfologikAnalyzer())
    {
        AssertAnalyzesTo(
            a,
            "liście danych",
            new String[] { "liście", "dany", "dana", "dane", "dać" },
            new int[] { 0, 7, 7, 7, 7 },
            new int[] { 6, 13, 13, 13, 13 },
            new int[] { 1, 1, 0, 0, 0 });
    }
}
/// <summary>
/// Blast some random strings through the analyzer.
/// </summary>
[Test]
public void TestRandom()
{
    // The using block guarantees the analyzer is disposed even when the random check throws.
    using (Analyzer a = getTestAnalyzer())
    {
        CheckRandomData(Random, a, 1000 * RandomMultiplier);
    }
}
}
}