blob: 9374f71179b3823c0d69182a2c1f59881ace272a [file] [log] [blame]
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.Miscellaneous;
using Lucene.Net.Analysis.Util;
using NUnit.Framework;
using System.IO;
namespace Lucene.Net.Analysis.Bg
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Tests for the Bulgarian stemmer, exercised through <see cref="BulgarianAnalyzer"/>
/// and, for the keyword and empty-term cases, through <see cref="BulgarianStemFilter"/> directly.
/// </summary>
public class TestBulgarianStemmer : BaseTokenStreamTestCase
{
    /// <summary>
    /// Asserts, one surface form at a time and in the order given, that
    /// <c>AssertAnalyzesTo</c> accepts <paramref name="expectedStem"/> as the sole
    /// expected output term for each entry of <paramref name="surfaceForms"/>.
    /// </summary>
    /// <param name="analyzer">Analyzer under test.</param>
    /// <param name="expectedStem">The term every listed form is expected to be reduced to.</param>
    /// <param name="surfaceForms">Input words that should all conflate to <paramref name="expectedStem"/>.</param>
    private void AssertStemsTo(Analyzer analyzer, string expectedStem, params string[] surfaceForms)
    {
        foreach (string form in surfaceForms)
        {
            AssertAnalyzesTo(analyzer, form, new string[] { expectedStem });
        }
    }

    /// <summary>
    /// Test showing how masculine noun forms conflate. An example noun for each
    /// common (and some rare) plural pattern is listed.
    /// </summary>
    [Test]
    public virtual void TestMasculineNouns()
    {
        BulgarianAnalyzer a = new BulgarianAnalyzer(TEST_VERSION_CURRENT);

        // -ове pattern (град -> градове)
        AssertStemsTo(a, "град", "град", "града", "градът", "градове", "градовете");

        // -и pattern (народ -> народи)
        AssertStemsTo(a, "народ", "народ", "народа", "народът", "народи", "народите", "народе");

        // -ища pattern
        AssertStemsTo(a, "път", "път", "пътя", "пътят", "пътища", "пътищата");

        // -чета pattern
        // NOTE(review): the plural forms exercised below end in -овце, not -чета; confirm the label.
        AssertStemsTo(a, "градец", "градец", "градеца", "градецът");
        /* note the below forms conflate with each other, but not the rest */
        AssertStemsTo(a, "градовц", "градовце", "градовцете");

        // -овци pattern
        AssertStemsTo(a, "дяд", "дядо", "дядото", "дядовци", "дядовците");

        // -е pattern
        AssertStemsTo(a, "мъж", "мъж", "мъжа", "мъже", "мъжете", "мъжо");
        /* word is too short, will not remove -ът */
        AssertStemsTo(a, "мъжът", "мъжът");

        // -а pattern
        AssertStemsTo(a, "крак", "крак", "крака", "кракът", "краката");

        // брат
        AssertStemsTo(a, "брат", "брат", "брата", "братът", "братя", "братята", "брате");
    }

    /// <summary>
    /// Test showing how feminine noun forms conflate.
    /// </summary>
    [Test]
    public virtual void TestFeminineNouns()
    {
        BulgarianAnalyzer a = new BulgarianAnalyzer(TEST_VERSION_CURRENT);

        AssertStemsTo(a, "вест", "вест", "вестта", "вести", "вестите");
    }

    /// <summary>
    /// Test showing how neuter noun forms conflate. An example noun for each
    /// common plural pattern is listed.
    /// </summary>
    [Test]
    public virtual void TestNeuterNouns()
    {
        BulgarianAnalyzer a = new BulgarianAnalyzer(TEST_VERSION_CURRENT);

        // -а pattern
        AssertStemsTo(a, "дърв", "дърво", "дървото", "дърва", "дървета", "дървата", "дърветата");

        // -та pattern
        AssertStemsTo(a, "мор", "море", "морето", "морета", "моретата");

        // -я pattern
        AssertStemsTo(a, "изключени", "изключение", "изключението", "изключенията");
        /* note the below form in this example does not conflate with the rest */
        AssertStemsTo(a, "изключн", "изключения");
    }

    /// <summary>
    /// Test showing how adjectival forms conflate.
    /// </summary>
    [Test]
    public virtual void TestAdjectives()
    {
        BulgarianAnalyzer a = new BulgarianAnalyzer(TEST_VERSION_CURRENT);

        AssertStemsTo(
            a,
            "красив",
            "красив", "красивия", "красивият",
            "красива", "красивата",
            "красиво", "красивото",
            "красиви", "красивите");
    }

    /// <summary>
    /// Test some exceptional rules, implemented as rewrites.
    /// </summary>
    [Test]
    public virtual void TestExceptions()
    {
        BulgarianAnalyzer a = new BulgarianAnalyzer(TEST_VERSION_CURRENT);

        // ци -> к
        AssertStemsTo(a, "собственик", "собственик", "собственика", "собственикът", "собственици", "собствениците");

        // зи -> г
        AssertStemsTo(a, "подлог", "подлог", "подлога", "подлогът", "подлози", "подлозите");

        // си -> х
        AssertStemsTo(a, "кожух", "кожух", "кожуха", "кожухът", "кожуси", "кожусите");

        // ъ deletion
        AssertStemsTo(a, "центр", "център", "центъра", "центърът", "центрове", "центровете");

        // е*и -> я*
        AssertStemsTo(a, "промян", "промяна", "промяната", "промени", "промените");

        // ен -> н
        AssertStemsTo(a, "песн", "песен", "песента", "песни", "песните");

        // -еве -> й
        // note: this is the only word i think this rule works for.
        // most -еве pluralized nouns are monosyllabic,
        // and the stemmer requires length > 6...
        AssertStemsTo(a, "строй", "строй", "строеве", "строевете");
        /* note the below forms conflate with each other, but not the rest */
        AssertStemsTo(a, "стр", "строя", "строят");
    }

    /// <summary>
    /// Runs <see cref="BulgarianStemFilter"/> behind a <see cref="SetKeywordMarkerFilter"/>
    /// whose set contains "строеве". The expected output keeps that token verbatim
    /// while the other token, "строевете", is reduced to "строй".
    /// </summary>
    [Test]
    public virtual void TestWithKeywordAttribute()
    {
        CharArraySet keywords = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
        keywords.add("строеве");

        MockTokenizer tokenizer = new MockTokenizer(
            new StringReader("строевете строеве"),
            MockTokenizer.WHITESPACE,
            false);
        SetKeywordMarkerFilter keywordMarker = new SetKeywordMarkerFilter(tokenizer, keywords);
        BulgarianStemFilter filter = new BulgarianStemFilter(keywordMarker);

        AssertTokenStreamContents(filter, new string[] { "строй", "строеве" });
    }

    /// <summary>
    /// Checks, via <c>CheckOneTerm</c>, that an empty input term yields an empty term
    /// when a <see cref="KeywordTokenizer"/> feeds <see cref="BulgarianStemFilter"/>.
    /// </summary>
    [Test]
    public virtual void TestEmptyTerm()
    {
        Analyzer a = new KeywordStemAnalyzer();
        CheckOneTerm(a, "", "");
    }

    /// <summary>
    /// Analyzer whose components are a <see cref="KeywordTokenizer"/> over the reader,
    /// wrapped in a <see cref="BulgarianStemFilter"/>. It holds no state of its own.
    /// </summary>
    private sealed class KeywordStemAnalyzer : Analyzer
    {
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer tokenizer = new KeywordTokenizer(reader);
            return new TokenStreamComponents(tokenizer, new BulgarianStemFilter(tokenizer));
        }
    }
}
}