src/Lucene.Net.Tests.Analysis.Morfologik/Morfologik/TestMorfologikAnalyzer.cs - lucenenet - Git at Google

 // Lucene version compatibility level 8.2.0
 using J2N.Text;
 using Lucene.Net.Analysis.Miscellaneous;
 using Lucene.Net.Analysis.Morfologik.TokenAttributes;
 using Lucene.Net.Analysis.Standard;
 using Lucene.Net.Analysis.TokenAttributes;
 using Lucene.Net.Analysis.Util;
 using Lucene.Net.Support;
 using NUnit.Framework;
 using System;
 using System.IO;
 using System.Text;
 using JCG = J2N.Collections.Generic;
 using SCG = System.Collections.Generic;

 namespace Lucene.Net.Analysis.Morfologik
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// TODO: The tests below rely on the order of returned lemmas, which is probably not good.
     /// </summary>
     public class TestMorfologikAnalyzer : BaseTokenStreamTestCase
     {
         /// <summary>
         /// Creates the analyzer under test: a <see cref="MorfologikAnalyzer"/> built with
         /// <c>TEST_VERSION_CURRENT</c>. Every caller in this class disposes the returned instance.
         /// </summary>
         /// <returns>A new analyzer instance.</returns>
         private Analyzer getTestAnalyzer()
         {
             Analyzer analyzer = new MorfologikAnalyzer(TEST_VERSION_CURRENT);
             return analyzer;
         }

         /// <summary>
         /// Test stemming of single tokens with Morfologik library.
         /// </summary>
         [Test]
         public void TestSingleTokens()
         {
             // "using" guarantees the analyzer is disposed even when an assertion below throws.
             using (Analyzer a = getTestAnalyzer())
             {
                 AssertAnalyzesTo(a, "a", new String[] { "a" });
                 AssertAnalyzesTo(a, "liście", new String[] { "liście", "liść", "list", "lista" });
                 AssertAnalyzesTo(a, "danych", new String[] { "dany", "dana", "dane", "dać" });
                 // A string with no known lemma is expected to come back unchanged.
                 AssertAnalyzesTo(a, "ęóąśłżźćń", new String[] { "ęóąśłżźćń" });
             }
         }

         /// <summary>
         /// Test stemming of multiple tokens and proper term metrics.
         /// </summary>
         [Test]
         public void TestMultipleTokens()
         {
             // "using" guarantees the analyzer is disposed even when an assertion below throws.
             using (Analyzer a = getTestAnalyzer())
             {
                 AssertAnalyzesTo(
                     a,
                     "liście danych",
                     new String[] { "liście", "liść", "list", "lista", "dany", "dana", "dane", "dać" },
                     new int[] { 0, 0, 0, 0, 7, 7, 7, 7 },       // start offsets
                     new int[] { 6, 6, 6, 6, 13, 13, 13, 13 },   // end offsets
                     new int[] { 1, 0, 0, 0, 1, 0, 0, 0 });      // position increments

                 AssertAnalyzesTo(
                     a,
                     "T. Gl\u00FCcksberg",
                     new String[] { "tom", "tona", "Gl\u00FCcksberg" },
                     new int[] { 0, 0, 3 },                      // start offsets
                     new int[] { 1, 1, 13 },                     // end offsets
                     new int[] { 1, 0, 1 });                     // position increments
             }
         }

         /// <summary>
         /// Debugging aid: analyzes <paramref name="input"/> and writes one console line per token
         /// in the form <c>term =&gt; tags</c>. It is not called from anywhere in the visible source.
         /// </summary>
         /// <param name="input">Text to run through the test analyzer.</param>
         private void dumpTokens(String input)
         {
             using (Analyzer analyzer = getTestAnalyzer())
             using (TokenStream stream = analyzer.GetTokenStream("dummy", input))
             {
                 stream.Reset();

                 IMorphosyntacticTagsAttribute tagsAtt = stream.GetAttribute<IMorphosyntacticTagsAttribute>();
                 ICharTermAttribute termAtt = stream.GetAttribute<ICharTermAttribute>();
                 while (stream.IncrementToken())
                 {
                     string termText = termAtt.ToString();
                     // The tag collection is rendered through the invariant-culture StringFormatter.
                     string tagText = string.Format(StringFormatter.InvariantCulture, "{0}", tagsAtt.Tags);
                     Console.WriteLine(termText + " => " + tagText);
                 }
                 stream.End();
             }
         }

         /// <summary>
         /// Test reuse of MorfologikFilter with leftover stems.
         /// </summary>
         [Test]
         public void TestLeftoverStems()
         {
             // "using" guarantees the analyzer is disposed even when an assertion below throws.
             using (Analyzer a = getTestAnalyzer())
             {
                 // Read only the first token so the remaining stems of "liście" are left unconsumed.
                 using (TokenStream ts_1 = a.GetTokenStream("dummy", "liście"))
                 {
                     ICharTermAttribute termAtt_1 = ts_1.GetAttribute<ICharTermAttribute>();
                     ts_1.Reset();
                     assertTrue("first stream produced no token", ts_1.IncrementToken());
                     assertEquals("first stream", "liście", termAtt_1.ToString());
                     ts_1.End();
                 }

                 // The first token of the second stream must come from the new input,
                 // not from stems left over by the first stream.
                 using (TokenStream ts_2 = a.GetTokenStream("dummy", "danych"))
                 {
                     ICharTermAttribute termAtt_2 = ts_2.GetAttribute<ICharTermAttribute>();
                     ts_2.Reset();
                     assertTrue("second stream produced no token", ts_2.IncrementToken());
                     assertEquals("second stream", "dany", termAtt_2.ToString());
                     ts_2.End();
                 }
             }
         }

         /// <summary>
         /// Test stemming of mixed-case tokens.
         /// </summary>
         [Test]
         public void TestCase()
         {
             // "using" guarantees the analyzer is disposed even when an assertion below throws.
             using (Analyzer a = getTestAnalyzer())
             {
                 // Each pair feeds the same word in two casings and pins the lemmas expected for each.
                 AssertAnalyzesTo(a, "AGD", new String[] { "AGD", "artykuły gospodarstwa domowego" });
                 AssertAnalyzesTo(a, "agd", new String[] { "artykuły gospodarstwa domowego" });

                 AssertAnalyzesTo(a, "Poznania", new String[] { "Poznań" });
                 AssertAnalyzesTo(a, "poznania", new String[] { "poznanie", "poznać" });

                 AssertAnalyzesTo(a, "Aarona", new String[] { "Aaron" });
                 AssertAnalyzesTo(a, "aarona", new String[] { "aarona" });

                 AssertAnalyzesTo(a, "Liście", new String[] { "liście", "liść", "list", "lista" });
             }
         }

         /// <summary>
         /// Advances <paramref name="ts"/> by one token and asserts that the token's term text equals
         /// <paramref name="term"/> and that its morphosyntactic tags equal <paramref name="tags"/>.
         /// Both tag collections are loaded into ordinally sorted sets before comparison, so the
         /// order in which tags are listed does not matter.
         /// </summary>
         /// <param name="ts">Token stream to advance; this method does not reset it.</param>
         /// <param name="term">Expected term text of the next token.</param>
         /// <param name="tags">Expected tags of the next token.</param>
         private void assertPOSToken(TokenStream ts, String term, params String[] tags)
         {
             // Fail with a clear message when the stream is exhausted instead of
             // comparing against whatever the attributes happen to contain.
             assertTrue("expected another token: " + term, ts.IncrementToken());
             assertEquals(term, ts.GetAttribute<ICharTermAttribute>().ToString());

             SCG.ISet<String> actual = new JCG.SortedSet<String>(StringComparer.Ordinal);
             SCG.ISet<String> expected = new JCG.SortedSet<String>(StringComparer.Ordinal);
             foreach (StringBuilder b in ts.GetAttribute<IMorphosyntacticTagsAttribute>().Tags)
             {
                 actual.Add(b.ToString());
             }
             foreach (String s in tags)
             {
                 expected.Add(s);
             }

             // LUCENENET: Removed the unnecessary extra equality check (and console dump)
             // that used to wrap this assertion.
             assertEquals(expected, actual, aggressive: false);
         }

         /// <summary>
         /// Test morphosyntactic annotations: checks the tags attached to each of the four
         /// tokens produced for the input "liście".
         /// </summary>
         [Test]
         public void TestPOSAttribute()
         {
             using (Analyzer analyzer = getTestAnalyzer())
             using (TokenStream stream = analyzer.GetTokenStream("dummy", "liście"))
             {
                 stream.Reset();

                 assertPOSToken(stream, "liście", "subst:sg:acc:n2", "subst:sg:nom:n2", "subst:sg:voc:n2");
                 assertPOSToken(stream, "liść", "subst:pl:acc:m3", "subst:pl:nom:m3", "subst:pl:voc:m3");
                 assertPOSToken(stream, "list", "subst:sg:loc:m3", "subst:sg:voc:m3");
                 assertPOSToken(stream, "lista", "subst:sg:dat:f", "subst:sg:loc:f");

                 stream.End();
             }
         }

         /// <summary>
         /// Analyzer variant that marks "liście" as a keyword before the Morfologik filter runs.
         /// The chain is StandardTokenizer -> SetKeywordMarkerFilter -> MorfologikFilter.
         /// </summary>
         private class MockMorfologikAnalyzer : MorfologikAnalyzer
         {
             public MockMorfologikAnalyzer()
                 : base(TEST_VERSION_CURRENT)
             {
             }

             /// <summary>
             /// Builds the tokenizer and filter chain described on the class.
             /// </summary>
             protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
             {
                 Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);

                 // Single-entry keyword set; the third argument is presumably ignoreCase — confirm against CharArraySet.
                 CharArraySet protectedTerms = new CharArraySet(TEST_VERSION_CURRENT, 1, false);
                 protectedTerms.add("liście");

                 TokenStream chain = new MorfologikFilter(new SetKeywordMarkerFilter(tokenizer, protectedTerms));
                 return new TokenStreamComponents(tokenizer, chain);
             }
         }

         /// <summary>
         /// Test that a token marked as a keyword ("liście", via <c>MockMorfologikAnalyzer</c>) is emitted
         /// unchanged, while the following token is still expanded into its lemmas.
         /// </summary>
         [Test]
         public void TestKeywordAttrTokens()
         {
             // "using" guarantees the analyzer is disposed even when the assertion below throws.
             using (Analyzer a = new MockMorfologikAnalyzer())
             {
                 AssertAnalyzesTo(
                     a,
                     "liście danych",
                     new String[] { "liście", "dany", "dana", "dane", "dać" },
                     new int[] { 0, 7, 7, 7, 7 },        // start offsets
                     new int[] { 6, 13, 13, 13, 13 },    // end offsets
                     new int[] { 1, 1, 0, 0, 0 });       // position increments
             }
         }

         /// <summary>
         /// Blast some random strings through the analyzer.
         /// </summary>
         [Test]
         public void TestRandom()
         {
             // "using" guarantees the analyzer is disposed even when the random check throws.
             using (Analyzer a = getTestAnalyzer())
             {
                 CheckRandomData(Random, a, 1000 * RandomMultiplier);
             }
         }
     }
 }
	// Lucene version compatibility level 8.2.0
	using J2N.Text;
	using Lucene.Net.Analysis.Miscellaneous;
	using Lucene.Net.Analysis.Morfologik.TokenAttributes;
	using Lucene.Net.Analysis.Standard;
	using Lucene.Net.Analysis.TokenAttributes;
	using Lucene.Net.Analysis.Util;
	using Lucene.Net.Support;
	using NUnit.Framework;
	using System;
	using System.IO;
	using System.Text;
	using JCG = J2N.Collections.Generic;
	using SCG = System.Collections.Generic;

	namespace Lucene.Net.Analysis.Morfologik
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// TODO: The tests below rely on the order of returned lemmas, which is probably not good.
	/// </summary>
	public class TestMorfologikAnalyzer : BaseTokenStreamTestCase
	{
	/// <summary>
	/// Creates the analyzer under test: a <see cref="MorfologikAnalyzer"/> built with
	/// <c>TEST_VERSION_CURRENT</c>. Every caller in this class disposes the returned instance.
	/// </summary>
	/// <returns>A new analyzer instance.</returns>
	private Analyzer getTestAnalyzer()
	{
	    Analyzer analyzer = new MorfologikAnalyzer(TEST_VERSION_CURRENT);
	    return analyzer;
	}

	/// <summary>
	/// Test stemming of single tokens with Morfologik library.
	/// </summary>
	[Test]
	public void TestSingleTokens()
	{
	    // "using" guarantees the analyzer is disposed even when an assertion below throws.
	    using (Analyzer a = getTestAnalyzer())
	    {
	        AssertAnalyzesTo(a, "a", new String[] { "a" });
	        AssertAnalyzesTo(a, "liście", new String[] { "liście", "liść", "list", "lista" });
	        AssertAnalyzesTo(a, "danych", new String[] { "dany", "dana", "dane", "dać" });
	        // A string with no known lemma is expected to come back unchanged.
	        AssertAnalyzesTo(a, "ęóąśłżźćń", new String[] { "ęóąśłżźćń" });
	    }
	}

	/// <summary>
	/// Test stemming of multiple tokens and proper term metrics.
	/// </summary>
	[Test]
	public void TestMultipleTokens()
	{
	    // "using" guarantees the analyzer is disposed even when an assertion below throws.
	    using (Analyzer a = getTestAnalyzer())
	    {
	        AssertAnalyzesTo(
	            a,
	            "liście danych",
	            new String[] { "liście", "liść", "list", "lista", "dany", "dana", "dane", "dać" },
	            new int[] { 0, 0, 0, 0, 7, 7, 7, 7 },       // start offsets
	            new int[] { 6, 6, 6, 6, 13, 13, 13, 13 },   // end offsets
	            new int[] { 1, 0, 0, 0, 1, 0, 0, 0 });      // position increments

	        AssertAnalyzesTo(
	            a,
	            "T. Gl\u00FCcksberg",
	            new String[] { "tom", "tona", "Gl\u00FCcksberg" },
	            new int[] { 0, 0, 3 },                      // start offsets
	            new int[] { 1, 1, 13 },                     // end offsets
	            new int[] { 1, 0, 1 });                     // position increments
	    }
	}

	/// <summary>
	/// Debugging aid: analyzes <paramref name="input"/> and writes one console line per token
	/// in the form <c>term =&gt; tags</c>. It is not called from anywhere in the visible source.
	/// </summary>
	/// <param name="input">Text to run through the test analyzer.</param>
	private void dumpTokens(String input)
	{
	    using (Analyzer analyzer = getTestAnalyzer())
	    using (TokenStream stream = analyzer.GetTokenStream("dummy", input))
	    {
	        stream.Reset();

	        IMorphosyntacticTagsAttribute tagsAtt = stream.GetAttribute<IMorphosyntacticTagsAttribute>();
	        ICharTermAttribute termAtt = stream.GetAttribute<ICharTermAttribute>();
	        while (stream.IncrementToken())
	        {
	            string termText = termAtt.ToString();
	            // The tag collection is rendered through the invariant-culture StringFormatter.
	            string tagText = string.Format(StringFormatter.InvariantCulture, "{0}", tagsAtt.Tags);
	            Console.WriteLine(termText + " => " + tagText);
	        }
	        stream.End();
	    }
	}

	/// <summary>
	/// Test reuse of MorfologikFilter with leftover stems.
	/// </summary>
	[Test]
	public void TestLeftoverStems()
	{
	    // "using" guarantees the analyzer is disposed even when an assertion below throws.
	    using (Analyzer a = getTestAnalyzer())
	    {
	        // Read only the first token so the remaining stems of "liście" are left unconsumed.
	        using (TokenStream ts_1 = a.GetTokenStream("dummy", "liście"))
	        {
	            ICharTermAttribute termAtt_1 = ts_1.GetAttribute<ICharTermAttribute>();
	            ts_1.Reset();
	            assertTrue("first stream produced no token", ts_1.IncrementToken());
	            assertEquals("first stream", "liście", termAtt_1.ToString());
	            ts_1.End();
	        }

	        // The first token of the second stream must come from the new input,
	        // not from stems left over by the first stream.
	        using (TokenStream ts_2 = a.GetTokenStream("dummy", "danych"))
	        {
	            ICharTermAttribute termAtt_2 = ts_2.GetAttribute<ICharTermAttribute>();
	            ts_2.Reset();
	            assertTrue("second stream produced no token", ts_2.IncrementToken());
	            assertEquals("second stream", "dany", termAtt_2.ToString());
	            ts_2.End();
	        }
	    }
	}

	/// <summary>
	/// Test stemming of mixed-case tokens.
	/// </summary>
	[Test]
	public void TestCase()
	{
	    // "using" guarantees the analyzer is disposed even when an assertion below throws.
	    using (Analyzer a = getTestAnalyzer())
	    {
	        // Each pair feeds the same word in two casings and pins the lemmas expected for each.
	        AssertAnalyzesTo(a, "AGD", new String[] { "AGD", "artykuły gospodarstwa domowego" });
	        AssertAnalyzesTo(a, "agd", new String[] { "artykuły gospodarstwa domowego" });

	        AssertAnalyzesTo(a, "Poznania", new String[] { "Poznań" });
	        AssertAnalyzesTo(a, "poznania", new String[] { "poznanie", "poznać" });

	        AssertAnalyzesTo(a, "Aarona", new String[] { "Aaron" });
	        AssertAnalyzesTo(a, "aarona", new String[] { "aarona" });

	        AssertAnalyzesTo(a, "Liście", new String[] { "liście", "liść", "list", "lista" });
	    }
	}

	/// <summary>
	/// Advances <paramref name="ts"/> by one token and asserts that the token's term text equals
	/// <paramref name="term"/> and that its morphosyntactic tags equal <paramref name="tags"/>.
	/// Both tag collections are loaded into ordinally sorted sets before comparison, so the
	/// order in which tags are listed does not matter.
	/// </summary>
	/// <param name="ts">Token stream to advance; this method does not reset it.</param>
	/// <param name="term">Expected term text of the next token.</param>
	/// <param name="tags">Expected tags of the next token.</param>
	private void assertPOSToken(TokenStream ts, String term, params String[] tags)
	{
	    // Fail with a clear message when the stream is exhausted instead of
	    // comparing against whatever the attributes happen to contain.
	    assertTrue("expected another token: " + term, ts.IncrementToken());
	    assertEquals(term, ts.GetAttribute<ICharTermAttribute>().ToString());

	    SCG.ISet<String> actual = new JCG.SortedSet<String>(StringComparer.Ordinal);
	    SCG.ISet<String> expected = new JCG.SortedSet<String>(StringComparer.Ordinal);
	    foreach (StringBuilder b in ts.GetAttribute<IMorphosyntacticTagsAttribute>().Tags)
	    {
	        actual.Add(b.ToString());
	    }
	    foreach (String s in tags)
	    {
	        expected.Add(s);
	    }

	    // LUCENENET: Removed the unnecessary extra equality check (and console dump)
	    // that used to wrap this assertion.
	    assertEquals(expected, actual, aggressive: false);
	}

	/// <summary>
	/// Test morphosyntactic annotations: checks the tags attached to each of the four
	/// tokens produced for the input "liście".
	/// </summary>
	[Test]
	public void TestPOSAttribute()
	{
	    using (Analyzer analyzer = getTestAnalyzer())
	    using (TokenStream stream = analyzer.GetTokenStream("dummy", "liście"))
	    {
	        stream.Reset();

	        assertPOSToken(stream, "liście", "subst:sg:acc:n2", "subst:sg:nom:n2", "subst:sg:voc:n2");
	        assertPOSToken(stream, "liść", "subst:pl:acc:m3", "subst:pl:nom:m3", "subst:pl:voc:m3");
	        assertPOSToken(stream, "list", "subst:sg:loc:m3", "subst:sg:voc:m3");
	        assertPOSToken(stream, "lista", "subst:sg:dat:f", "subst:sg:loc:f");

	        stream.End();
	    }
	}

	/// <summary>
	/// Analyzer variant that marks "liście" as a keyword before the Morfologik filter runs.
	/// The chain is StandardTokenizer -> SetKeywordMarkerFilter -> MorfologikFilter.
	/// </summary>
	private class MockMorfologikAnalyzer : MorfologikAnalyzer
	{
	    public MockMorfologikAnalyzer()
	        : base(TEST_VERSION_CURRENT)
	    {
	    }

	    /// <summary>
	    /// Builds the tokenizer and filter chain described on the class.
	    /// </summary>
	    protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
	    {
	        Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);

	        // Single-entry keyword set; the third argument is presumably ignoreCase — confirm against CharArraySet.
	        CharArraySet protectedTerms = new CharArraySet(TEST_VERSION_CURRENT, 1, false);
	        protectedTerms.add("liście");

	        TokenStream chain = new MorfologikFilter(new SetKeywordMarkerFilter(tokenizer, protectedTerms));
	        return new TokenStreamComponents(tokenizer, chain);
	    }
	}

	/// <summary>
	/// Test that a token marked as a keyword ("liście", via <c>MockMorfologikAnalyzer</c>) is emitted
	/// unchanged, while the following token is still expanded into its lemmas.
	/// </summary>
	[Test]
	public void TestKeywordAttrTokens()
	{
	    // "using" guarantees the analyzer is disposed even when the assertion below throws.
	    using (Analyzer a = new MockMorfologikAnalyzer())
	    {
	        AssertAnalyzesTo(
	            a,
	            "liście danych",
	            new String[] { "liście", "dany", "dana", "dane", "dać" },
	            new int[] { 0, 7, 7, 7, 7 },        // start offsets
	            new int[] { 6, 13, 13, 13, 13 },    // end offsets
	            new int[] { 1, 1, 0, 0, 0 });       // position increments
	    }
	}

	/// <summary>
	/// Blast some random strings through the analyzer.
	/// </summary>
	[Test]
	public void TestRandom()
	{
	    // "using" guarantees the analyzer is disposed even when the random check throws.
	    using (Analyzer a = getTestAnalyzer())
	    {
	        CheckRandomData(Random, a, 1000 * RandomMultiplier);
	    }
	}
	}
	}