// src/Lucene.Net.Tests.Analysis.Common/Analysis/Bg/TestBulgarianStemmer.cs - lucenenet - Git at Google

 using Lucene.Net.Analysis.Core;
 using Lucene.Net.Analysis.Miscellaneous;
 using Lucene.Net.Analysis.Util;
 using NUnit.Framework;
 using System.IO;

 namespace Lucene.Net.Analysis.Bg
 {
     /*
 	 * Licensed to the Apache Software Foundation (ASF) under one or more
 	 * contributor license agreements.  See the NOTICE file distributed with
 	 * this work for additional information regarding copyright ownership.
 	 * The ASF licenses this file to You under the Apache License, Version 2.0
 	 * (the "License"); you may not use this file except in compliance with
 	 * the License.  You may obtain a copy of the License at
 	 *
 	 *     http://www.apache.org/licenses/LICENSE-2.0
 	 *
 	 * Unless required by applicable law or agreed to in writing, software
 	 * distributed under the License is distributed on an "AS IS" BASIS,
 	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 	 * See the License for the specific language governing permissions and
 	 * limitations under the License.
 	 */

     /// <summary>
     /// Test the Bulgarian Stemmer
     /// </summary>
     public class TestBulgarianStemmer : BaseTokenStreamTestCase
     {
         /// <summary>
         /// Masculine nouns: each group below lists the inflected forms of one
         /// noun and the stem every form is expected to reduce to. One noun is
         /// given per common (and some rare) plural pattern.
         /// </summary>
         [Test]
         public virtual void TestMasculineNouns()
         {
             BulgarianAnalyzer analyzer = new BulgarianAnalyzer(TEST_VERSION_CURRENT);

             // -ове plural (град / градове)
             AssertStemsTo(analyzer, "град", "град");
             AssertStemsTo(analyzer, "града", "град");
             AssertStemsTo(analyzer, "градът", "град");
             AssertStemsTo(analyzer, "градове", "град");
             AssertStemsTo(analyzer, "градовете", "град");

             // -и plural (народ / народи)
             AssertStemsTo(analyzer, "народ", "народ");
             AssertStemsTo(analyzer, "народа", "народ");
             AssertStemsTo(analyzer, "народът", "народ");
             AssertStemsTo(analyzer, "народи", "народ");
             AssertStemsTo(analyzer, "народите", "народ");
             AssertStemsTo(analyzer, "народе", "народ");

             // -ища pattern
             AssertStemsTo(analyzer, "път", "път");
             AssertStemsTo(analyzer, "пътя", "път");
             AssertStemsTo(analyzer, "пътят", "път");
             AssertStemsTo(analyzer, "пътища", "път");
             AssertStemsTo(analyzer, "пътищата", "път");

             // -чета pattern
             AssertStemsTo(analyzer, "градец", "градец");
             AssertStemsTo(analyzer, "градеца", "градец");
             AssertStemsTo(analyzer, "градецът", "градец");
             // The next two forms share a stem with each other, but not with
             // the three forms above.
             AssertStemsTo(analyzer, "градовце", "градовц");
             AssertStemsTo(analyzer, "градовцете", "градовц");

             // -овци pattern
             AssertStemsTo(analyzer, "дядо", "дяд");
             AssertStemsTo(analyzer, "дядото", "дяд");
             AssertStemsTo(analyzer, "дядовци", "дяд");
             AssertStemsTo(analyzer, "дядовците", "дяд");

             // -е pattern
             AssertStemsTo(analyzer, "мъж", "мъж");
             AssertStemsTo(analyzer, "мъжа", "мъж");
             AssertStemsTo(analyzer, "мъже", "мъж");
             AssertStemsTo(analyzer, "мъжете", "мъж");
             AssertStemsTo(analyzer, "мъжо", "мъж");
             // Word is too short: the -ът suffix is expected to stay in place.
             AssertStemsTo(analyzer, "мъжът", "мъжът");

             // -а pattern
             AssertStemsTo(analyzer, "крак", "крак");
             AssertStemsTo(analyzer, "крака", "крак");
             AssertStemsTo(analyzer, "кракът", "крак");
             AssertStemsTo(analyzer, "краката", "крак");

             // брат
             AssertStemsTo(analyzer, "брат", "брат");
             AssertStemsTo(analyzer, "брата", "брат");
             AssertStemsTo(analyzer, "братът", "брат");
             AssertStemsTo(analyzer, "братя", "брат");
             AssertStemsTo(analyzer, "братята", "брат");
             AssertStemsTo(analyzer, "брате", "брат");
         }

         /// <summary>
         /// Feminine nouns: all listed forms of one noun are expected to
         /// reduce to the same stem.
         /// </summary>
         [Test]
         public virtual void TestFeminineNouns()
         {
             BulgarianAnalyzer analyzer = new BulgarianAnalyzer(TEST_VERSION_CURRENT);

             AssertStemsTo(analyzer, "вест", "вест");
             AssertStemsTo(analyzer, "вестта", "вест");
             AssertStemsTo(analyzer, "вести", "вест");
             AssertStemsTo(analyzer, "вестите", "вест");
         }

         /// <summary>
         /// Neuter nouns: one example noun per common plural pattern, with
         /// the stem each inflected form is expected to reduce to.
         /// </summary>
         [Test]
         public virtual void TestNeuterNouns()
         {
             BulgarianAnalyzer analyzer = new BulgarianAnalyzer(TEST_VERSION_CURRENT);

             // -а pattern
             AssertStemsTo(analyzer, "дърво", "дърв");
             AssertStemsTo(analyzer, "дървото", "дърв");
             AssertStemsTo(analyzer, "дърва", "дърв");
             AssertStemsTo(analyzer, "дървета", "дърв");
             AssertStemsTo(analyzer, "дървата", "дърв");
             AssertStemsTo(analyzer, "дърветата", "дърв");

             // -та pattern
             AssertStemsTo(analyzer, "море", "мор");
             AssertStemsTo(analyzer, "морето", "мор");
             AssertStemsTo(analyzer, "морета", "мор");
             AssertStemsTo(analyzer, "моретата", "мор");

             // -я pattern
             AssertStemsTo(analyzer, "изключение", "изключени");
             AssertStemsTo(analyzer, "изключението", "изключени");
             AssertStemsTo(analyzer, "изключенията", "изключени");
             // This form is expected to produce a different stem from the
             // three forms above.
             AssertStemsTo(analyzer, "изключения", "изключн");
         }

         /// <summary>
         /// Adjectives: every listed form of one adjective is expected to
         /// reduce to the same stem.
         /// </summary>
         [Test]
         public virtual void TestAdjectives()
         {
             BulgarianAnalyzer analyzer = new BulgarianAnalyzer(TEST_VERSION_CURRENT);

             AssertStemsTo(analyzer, "красив", "красив");
             AssertStemsTo(analyzer, "красивия", "красив");
             AssertStemsTo(analyzer, "красивият", "красив");
             AssertStemsTo(analyzer, "красива", "красив");
             AssertStemsTo(analyzer, "красивата", "красив");
             AssertStemsTo(analyzer, "красиво", "красив");
             AssertStemsTo(analyzer, "красивото", "красив");
             AssertStemsTo(analyzer, "красиви", "красив");
             AssertStemsTo(analyzer, "красивите", "красив");
         }

         /// <summary>
         /// Exceptional rules, implemented as rewrites: each group is labelled
         /// with the rewrite it exercises.
         /// </summary>
         [Test]
         public virtual void TestExceptions()
         {
             BulgarianAnalyzer analyzer = new BulgarianAnalyzer(TEST_VERSION_CURRENT);

             // ци -> к
             AssertStemsTo(analyzer, "собственик", "собственик");
             AssertStemsTo(analyzer, "собственика", "собственик");
             AssertStemsTo(analyzer, "собственикът", "собственик");
             AssertStemsTo(analyzer, "собственици", "собственик");
             AssertStemsTo(analyzer, "собствениците", "собственик");

             // зи -> г
             AssertStemsTo(analyzer, "подлог", "подлог");
             AssertStemsTo(analyzer, "подлога", "подлог");
             AssertStemsTo(analyzer, "подлогът", "подлог");
             AssertStemsTo(analyzer, "подлози", "подлог");
             AssertStemsTo(analyzer, "подлозите", "подлог");

             // си -> х
             AssertStemsTo(analyzer, "кожух", "кожух");
             AssertStemsTo(analyzer, "кожуха", "кожух");
             AssertStemsTo(analyzer, "кожухът", "кожух");
             AssertStemsTo(analyzer, "кожуси", "кожух");
             AssertStemsTo(analyzer, "кожусите", "кожух");

             // ъ deletion
             AssertStemsTo(analyzer, "център", "центр");
             AssertStemsTo(analyzer, "центъра", "центр");
             AssertStemsTo(analyzer, "центърът", "центр");
             AssertStemsTo(analyzer, "центрове", "центр");
             AssertStemsTo(analyzer, "центровете", "центр");

             // е*и -> я*
             AssertStemsTo(analyzer, "промяна", "промян");
             AssertStemsTo(analyzer, "промяната", "промян");
             AssertStemsTo(analyzer, "промени", "промян");
             AssertStemsTo(analyzer, "промените", "промян");

             // ен -> н
             AssertStemsTo(analyzer, "песен", "песн");
             AssertStemsTo(analyzer, "песента", "песн");
             AssertStemsTo(analyzer, "песни", "песн");
             AssertStemsTo(analyzer, "песните", "песн");

             // -еве -> й
             // note: this is the only word i think this rule works for.
             // most -еве pluralized nouns are monosyllabic,
             // and the stemmer requires length > 6...
             AssertStemsTo(analyzer, "строй", "строй");
             AssertStemsTo(analyzer, "строеве", "строй");
             AssertStemsTo(analyzer, "строевете", "строй");
             // The next two forms share a stem with each other, but not with
             // the three forms above.
             AssertStemsTo(analyzer, "строя", "стр");
             AssertStemsTo(analyzer, "строят", "стр");
         }

         /// <summary>
         /// Runs two whitespace-separated tokens through a
         /// SetKeywordMarkerFilter followed by a BulgarianStemFilter. The
         /// token that is present in the keyword set ("строеве") is expected
         /// to come out unchanged, while the other token is expected to be
         /// stemmed.
         /// </summary>
         [Test]
         public virtual void TestWithKeywordAttribute()
         {
             CharArraySet keywords = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
             keywords.add("строеве");

             StringReader input = new StringReader("строевете строеве");
             MockTokenizer source = new MockTokenizer(input, MockTokenizer.WHITESPACE, false);
             SetKeywordMarkerFilter marked = new SetKeywordMarkerFilter(source, keywords);
             BulgarianStemFilter stemmed = new BulgarianStemFilter(marked);

             AssertTokenStreamContents(stemmed, new string[] { "строй", "строеве" });
         }

         /// <summary>
         /// Feeds an empty string through KeywordTokenizer + BulgarianStemFilter
         /// and expects an empty term back.
         /// </summary>
         [Test]
         public virtual void TestEmptyTerm()
         {
             CheckOneTerm(new AnalyzerAnonymousInnerClassHelper(this), "", "");
         }

         /// <summary>
         /// Forwards to AssertAnalyzesTo with a single-element expected-output
         /// array containing <paramref name="stem"/>.
         /// </summary>
         /// <param name="analyzer">Analyzer under test.</param>
         /// <param name="word">Input text handed to the analyzer.</param>
         /// <param name="stem">The one expected output term.</param>
         private void AssertStemsTo(Analyzer analyzer, string word, string stem)
         {
             AssertAnalyzesTo(analyzer, word, new string[] { stem });
         }

         /// <summary>
         /// Analyzer used by <see cref="TestEmptyTerm"/>: a KeywordTokenizer
         /// whose output is wrapped in a BulgarianStemFilter.
         /// </summary>
         private class AnalyzerAnonymousInnerClassHelper : Analyzer
         {
             // Assigned in the constructor; not read anywhere in this class.
             private readonly TestBulgarianStemmer outerInstance;

             public AnalyzerAnonymousInnerClassHelper(TestBulgarianStemmer outerInstance)
             {
                 this.outerInstance = outerInstance;
             }

             /// <summary>
             /// Builds the tokenizer/filter chain over <paramref name="reader"/>.
             /// <paramref name="fieldName"/> is not used.
             /// </summary>
             protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
             {
                 Tokenizer source = new KeywordTokenizer(reader);
                 BulgarianStemFilter stemmed = new BulgarianStemFilter(source);
                 return new TokenStreamComponents(source, stemmed);
             }
         }
     }
 }
	using Lucene.Net.Analysis.Core;
	using Lucene.Net.Analysis.Miscellaneous;
	using Lucene.Net.Analysis.Util;
	using NUnit.Framework;
	using System.IO;

	namespace Lucene.Net.Analysis.Bg
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// Test the Bulgarian Stemmer
	/// </summary>
	public class TestBulgarianStemmer : BaseTokenStreamTestCase
	{
		/// <summary>
		/// Masculine nouns: each group below lists the inflected forms of one
		/// noun and the stem every form is expected to reduce to. One noun is
		/// given per common (and some rare) plural pattern.
		/// </summary>
		[Test]
		public virtual void TestMasculineNouns()
		{
			BulgarianAnalyzer analyzer = new BulgarianAnalyzer(TEST_VERSION_CURRENT);

			// -ове plural (град / градове)
			AssertStemsTo(analyzer, "град", "град");
			AssertStemsTo(analyzer, "града", "град");
			AssertStemsTo(analyzer, "градът", "град");
			AssertStemsTo(analyzer, "градове", "град");
			AssertStemsTo(analyzer, "градовете", "град");

			// -и plural (народ / народи)
			AssertStemsTo(analyzer, "народ", "народ");
			AssertStemsTo(analyzer, "народа", "народ");
			AssertStemsTo(analyzer, "народът", "народ");
			AssertStemsTo(analyzer, "народи", "народ");
			AssertStemsTo(analyzer, "народите", "народ");
			AssertStemsTo(analyzer, "народе", "народ");

			// -ища pattern
			AssertStemsTo(analyzer, "път", "път");
			AssertStemsTo(analyzer, "пътя", "път");
			AssertStemsTo(analyzer, "пътят", "път");
			AssertStemsTo(analyzer, "пътища", "път");
			AssertStemsTo(analyzer, "пътищата", "път");

			// -чета pattern
			AssertStemsTo(analyzer, "градец", "градец");
			AssertStemsTo(analyzer, "градеца", "градец");
			AssertStemsTo(analyzer, "градецът", "градец");
			// The next two forms share a stem with each other, but not with
			// the three forms above.
			AssertStemsTo(analyzer, "градовце", "градовц");
			AssertStemsTo(analyzer, "градовцете", "градовц");

			// -овци pattern
			AssertStemsTo(analyzer, "дядо", "дяд");
			AssertStemsTo(analyzer, "дядото", "дяд");
			AssertStemsTo(analyzer, "дядовци", "дяд");
			AssertStemsTo(analyzer, "дядовците", "дяд");

			// -е pattern
			AssertStemsTo(analyzer, "мъж", "мъж");
			AssertStemsTo(analyzer, "мъжа", "мъж");
			AssertStemsTo(analyzer, "мъже", "мъж");
			AssertStemsTo(analyzer, "мъжете", "мъж");
			AssertStemsTo(analyzer, "мъжо", "мъж");
			// Word is too short: the -ът suffix is expected to stay in place.
			AssertStemsTo(analyzer, "мъжът", "мъжът");

			// -а pattern
			AssertStemsTo(analyzer, "крак", "крак");
			AssertStemsTo(analyzer, "крака", "крак");
			AssertStemsTo(analyzer, "кракът", "крак");
			AssertStemsTo(analyzer, "краката", "крак");

			// брат
			AssertStemsTo(analyzer, "брат", "брат");
			AssertStemsTo(analyzer, "брата", "брат");
			AssertStemsTo(analyzer, "братът", "брат");
			AssertStemsTo(analyzer, "братя", "брат");
			AssertStemsTo(analyzer, "братята", "брат");
			AssertStemsTo(analyzer, "брате", "брат");
		}

		/// <summary>
		/// Feminine nouns: all listed forms of one noun are expected to
		/// reduce to the same stem.
		/// </summary>
		[Test]
		public virtual void TestFeminineNouns()
		{
			BulgarianAnalyzer analyzer = new BulgarianAnalyzer(TEST_VERSION_CURRENT);

			AssertStemsTo(analyzer, "вест", "вест");
			AssertStemsTo(analyzer, "вестта", "вест");
			AssertStemsTo(analyzer, "вести", "вест");
			AssertStemsTo(analyzer, "вестите", "вест");
		}

		/// <summary>
		/// Neuter nouns: one example noun per common plural pattern, with
		/// the stem each inflected form is expected to reduce to.
		/// </summary>
		[Test]
		public virtual void TestNeuterNouns()
		{
			BulgarianAnalyzer analyzer = new BulgarianAnalyzer(TEST_VERSION_CURRENT);

			// -а pattern
			AssertStemsTo(analyzer, "дърво", "дърв");
			AssertStemsTo(analyzer, "дървото", "дърв");
			AssertStemsTo(analyzer, "дърва", "дърв");
			AssertStemsTo(analyzer, "дървета", "дърв");
			AssertStemsTo(analyzer, "дървата", "дърв");
			AssertStemsTo(analyzer, "дърветата", "дърв");

			// -та pattern
			AssertStemsTo(analyzer, "море", "мор");
			AssertStemsTo(analyzer, "морето", "мор");
			AssertStemsTo(analyzer, "морета", "мор");
			AssertStemsTo(analyzer, "моретата", "мор");

			// -я pattern
			AssertStemsTo(analyzer, "изключение", "изключени");
			AssertStemsTo(analyzer, "изключението", "изключени");
			AssertStemsTo(analyzer, "изключенията", "изключени");
			// This form is expected to produce a different stem from the
			// three forms above.
			AssertStemsTo(analyzer, "изключения", "изключн");
		}

		/// <summary>
		/// Adjectives: every listed form of one adjective is expected to
		/// reduce to the same stem.
		/// </summary>
		[Test]
		public virtual void TestAdjectives()
		{
			BulgarianAnalyzer analyzer = new BulgarianAnalyzer(TEST_VERSION_CURRENT);

			AssertStemsTo(analyzer, "красив", "красив");
			AssertStemsTo(analyzer, "красивия", "красив");
			AssertStemsTo(analyzer, "красивият", "красив");
			AssertStemsTo(analyzer, "красива", "красив");
			AssertStemsTo(analyzer, "красивата", "красив");
			AssertStemsTo(analyzer, "красиво", "красив");
			AssertStemsTo(analyzer, "красивото", "красив");
			AssertStemsTo(analyzer, "красиви", "красив");
			AssertStemsTo(analyzer, "красивите", "красив");
		}

		/// <summary>
		/// Exceptional rules, implemented as rewrites: each group is labelled
		/// with the rewrite it exercises.
		/// </summary>
		[Test]
		public virtual void TestExceptions()
		{
			BulgarianAnalyzer analyzer = new BulgarianAnalyzer(TEST_VERSION_CURRENT);

			// ци -> к
			AssertStemsTo(analyzer, "собственик", "собственик");
			AssertStemsTo(analyzer, "собственика", "собственик");
			AssertStemsTo(analyzer, "собственикът", "собственик");
			AssertStemsTo(analyzer, "собственици", "собственик");
			AssertStemsTo(analyzer, "собствениците", "собственик");

			// зи -> г
			AssertStemsTo(analyzer, "подлог", "подлог");
			AssertStemsTo(analyzer, "подлога", "подлог");
			AssertStemsTo(analyzer, "подлогът", "подлог");
			AssertStemsTo(analyzer, "подлози", "подлог");
			AssertStemsTo(analyzer, "подлозите", "подлог");

			// си -> х
			AssertStemsTo(analyzer, "кожух", "кожух");
			AssertStemsTo(analyzer, "кожуха", "кожух");
			AssertStemsTo(analyzer, "кожухът", "кожух");
			AssertStemsTo(analyzer, "кожуси", "кожух");
			AssertStemsTo(analyzer, "кожусите", "кожух");

			// ъ deletion
			AssertStemsTo(analyzer, "център", "центр");
			AssertStemsTo(analyzer, "центъра", "центр");
			AssertStemsTo(analyzer, "центърът", "центр");
			AssertStemsTo(analyzer, "центрове", "центр");
			AssertStemsTo(analyzer, "центровете", "центр");

			// е*и -> я*
			AssertStemsTo(analyzer, "промяна", "промян");
			AssertStemsTo(analyzer, "промяната", "промян");
			AssertStemsTo(analyzer, "промени", "промян");
			AssertStemsTo(analyzer, "промените", "промян");

			// ен -> н
			AssertStemsTo(analyzer, "песен", "песн");
			AssertStemsTo(analyzer, "песента", "песн");
			AssertStemsTo(analyzer, "песни", "песн");
			AssertStemsTo(analyzer, "песните", "песн");

			// -еве -> й
			// note: this is the only word i think this rule works for.
			// most -еве pluralized nouns are monosyllabic,
			// and the stemmer requires length > 6...
			AssertStemsTo(analyzer, "строй", "строй");
			AssertStemsTo(analyzer, "строеве", "строй");
			AssertStemsTo(analyzer, "строевете", "строй");
			// The next two forms share a stem with each other, but not with
			// the three forms above.
			AssertStemsTo(analyzer, "строя", "стр");
			AssertStemsTo(analyzer, "строят", "стр");
		}

		/// <summary>
		/// Runs two whitespace-separated tokens through a
		/// SetKeywordMarkerFilter followed by a BulgarianStemFilter. The
		/// token that is present in the keyword set ("строеве") is expected
		/// to come out unchanged, while the other token is expected to be
		/// stemmed.
		/// </summary>
		[Test]
		public virtual void TestWithKeywordAttribute()
		{
			CharArraySet keywords = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
			keywords.add("строеве");

			StringReader input = new StringReader("строевете строеве");
			MockTokenizer source = new MockTokenizer(input, MockTokenizer.WHITESPACE, false);
			SetKeywordMarkerFilter marked = new SetKeywordMarkerFilter(source, keywords);
			BulgarianStemFilter stemmed = new BulgarianStemFilter(marked);

			AssertTokenStreamContents(stemmed, new string[] { "строй", "строеве" });
		}

		/// <summary>
		/// Feeds an empty string through KeywordTokenizer + BulgarianStemFilter
		/// and expects an empty term back.
		/// </summary>
		[Test]
		public virtual void TestEmptyTerm()
		{
			CheckOneTerm(new AnalyzerAnonymousInnerClassHelper(this), "", "");
		}

		/// <summary>
		/// Forwards to AssertAnalyzesTo with a single-element expected-output
		/// array containing <paramref name="stem"/>.
		/// </summary>
		/// <param name="analyzer">Analyzer under test.</param>
		/// <param name="word">Input text handed to the analyzer.</param>
		/// <param name="stem">The one expected output term.</param>
		private void AssertStemsTo(Analyzer analyzer, string word, string stem)
		{
			AssertAnalyzesTo(analyzer, word, new string[] { stem });
		}

		/// <summary>
		/// Analyzer used by <see cref="TestEmptyTerm"/>: a KeywordTokenizer
		/// whose output is wrapped in a BulgarianStemFilter.
		/// </summary>
		private class AnalyzerAnonymousInnerClassHelper : Analyzer
		{
			// Assigned in the constructor; not read anywhere in this class.
			private readonly TestBulgarianStemmer outerInstance;

			public AnalyzerAnonymousInnerClassHelper(TestBulgarianStemmer outerInstance)
			{
				this.outerInstance = outerInstance;
			}

			/// <summary>
			/// Builds the tokenizer/filter chain over <paramref name="reader"/>.
			/// <paramref name="fieldName"/> is not used.
			/// </summary>
			protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
			{
				Tokenizer source = new KeywordTokenizer(reader);
				BulgarianStemFilter stemmed = new BulgarianStemFilter(source);
				return new TokenStreamComponents(source, stemmed);
			}
		}
	}
	}