blob: 91b055bff3d972f2c2e4acf4f6a6edf1813f3539 [file] [log] [blame]
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.Miscellaneous;
using Lucene.Net.Analysis.Util;
using NUnit.Framework;
using System.IO;
namespace Lucene.Net.Analysis.Br
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Test the Brazilian Stem Filter, which only modifies the term text.
///
/// It is very similar to the Snowball Portuguese algorithm, but not exactly the same.
/// </summary>
public class TestBrazilianStemmer : BaseTokenStreamTestCase
{
/// <summary>
/// Checks the expected stem for a list of sample words, one <c>Check</c> call per word.
/// Trailing comments flag the entries whose expected stem differs from the
/// Snowball Portuguese stemmer.
/// </summary>
[Test]
public virtual void TestWithSnowballExamples()
{
    Check("boa", "boa");
    Check("boainain", "boainain");
    Check("boas", "boas");
    Check("bôas", "boas"); // removes diacritic: different from snowball portuguese
    Check("boassu", "boassu");
    Check("boataria", "boat");
    Check("boate", "boat");
    Check("boates", "boat");
    Check("boatos", "boat");
    Check("bob", "bob");
    Check("boba", "bob");
    Check("bobagem", "bobag");
    Check("bobagens", "bobagens");
    Check("bobalhões", "bobalho"); // removes diacritic: different from snowball portuguese
    Check("bobear", "bob");
    Check("bobeira", "bobeir");
    Check("bobinho", "bobinh");
    Check("bobinhos", "bobinh");
    Check("bobo", "bob");
    Check("bobs", "bobs");
    Check("boca", "boc");
    Check("bocadas", "boc");
    Check("bocadinho", "bocadinh");
    Check("bocado", "boc");
    Check("bocaiúva", "bocaiuv"); // removes diacritic: different from snowball portuguese
    Check("boçal", "bocal"); // removes diacritic: different from snowball portuguese
    Check("bocarra", "bocarr");
    Check("bocas", "boc");
    Check("bode", "bod");
    Check("bodoque", "bodoqu");
    Check("body", "body");
    Check("boeing", "boeing");
    Check("boem", "boem");
    Check("boemia", "boem");
    Check("boêmio", "boemi"); // removes diacritic: different from snowball portuguese
    Check("bogotá", "bogot");
    Check("boi", "boi");
    Check("bóia", "boi"); // removes diacritic: different from snowball portuguese
    Check("boiando", "boi");
    Check("quiabo", "quiab");
    Check("quicaram", "quic");
    Check("quickly", "quickly");
    Check("quieto", "quiet");
    Check("quietos", "quiet");
    Check("quilate", "quilat");
    Check("quilates", "quilat");
    Check("quilinhos", "quilinh");
    Check("quilo", "quil");
    Check("quilombo", "quilomb");
    Check("quilométricas", "quilometr"); // removes diacritic: different from snowball portuguese
    Check("quilométricos", "quilometr"); // removes diacritic: different from snowball portuguese
    Check("quilômetro", "quilometr"); // removes diacritic: different from snowball portuguese
    Check("quilômetros", "quilometr"); // removes diacritic: different from snowball portuguese
    Check("quilos", "quil");
    Check("quimica", "quimic");
    Check("quimicas", "quimic");
    Check("quimico", "quimic");
    Check("quimicos", "quimic");
    Check("quimioterapia", "quimioterap");
    Check("quimioterápicos", "quimioterap"); // removes diacritic: different from snowball portuguese
    Check("quimono", "quimon");
    Check("quincas", "quinc");
    Check("quinhão", "quinha"); // removes diacritic: different from snowball portuguese
    Check("quinhentos", "quinhent");
    Check("quinn", "quinn");
    Check("quino", "quin");
    Check("quinta", "quint");
    Check("quintal", "quintal");
    Check("quintana", "quintan");
    Check("quintanilha", "quintanilh");
    Check("quintão", "quinta"); // removes diacritic: different from snowball portuguese
    Check("quintessência", "quintessente"); // versus snowball portuguese 'quintessent'
    Check("quintino", "quintin");
    Check("quinto", "quint");
    Check("quintos", "quint");
    Check("quintuplicou", "quintuplic");
    Check("quinze", "quinz");
    Check("quinzena", "quinzen");
    Check("quiosque", "quiosqu");
}
/// <summary>
/// Exercises the normalization visible in the analyzer output: lowercasing and
/// diacritic removal, including the short-token case where diacritics are kept.
/// </summary>
[Test]
public virtual void TestNormalization()
{
    // Each row is { input, expected term }.
    string[][] cases =
    {
        new[] { "Brasil", "brasil" },                       // lowercase by default
        new[] { "Brasília", "brasil" },                     // remove diacritics
        new[] { "quimio5terápicos", "quimio5terapicos" },   // contains non-letter, diacritic will still be removed
        new[] { "áá", "áá" },                               // token is too short: diacritics are not removed
        new[] { "ááá", "aaa" },                             // normally, diacritics are removed
    };

    foreach (string[] pair in cases)
    {
        Check(pair[0], pair[1]);
    }
}
/// <summary>
/// Pushes several words through one shared <see cref="BrazilianAnalyzer"/> instance
/// and checks the resulting term for each.
/// </summary>
[Test]
public virtual void TestReusableTokenStream()
{
    Analyzer sharedAnalyzer = new BrazilianAnalyzer(TEST_VERSION_CURRENT);

    // Each row is { input, expected term }.
    string[][] cases =
    {
        new[] { "boa", "boa" },
        new[] { "boainain", "boainain" },
        new[] { "boas", "boas" },
        new[] { "bôas", "boas" },   // removes diacritic: different from snowball portuguese
    };

    foreach (string[] pair in cases)
    {
        checkReuse(sharedAnalyzer, pair[0], pair[1]);
    }
}
/// <summary>
/// Builds an analyzer with an empty first set and a second set containing
/// "quintessência", then checks that this word comes back untouched.
/// </summary>
[Test]
public virtual void TestStemExclusionTable()
{
    CharArraySet stemExclusions = new CharArraySet(TEST_VERSION_CURRENT, AsSet("quintessência"), false);
    BrazilianAnalyzer analyzer = new BrazilianAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, stemExclusions);

    // Excluded words will be completely unchanged.
    checkReuse(analyzer, "quintessência", "quintessência");
}
/// <summary>
/// Marks "Brasília" as a keyword via <see cref="SetKeywordMarkerFilter"/> and checks that
/// <see cref="BrazilianStemFilter"/> emits it as "brasília" (unstemmed), while the
/// unmarked "Brasilia" is emitted as "brasil".
/// </summary>
[Test]
public virtual void TestWithKeywordAttribute()
{
    // NOTE(review): the third constructor argument is presumably ignoreCase, which would let the
    // lowercased token match the mixed-case entry below — confirm against CharArraySet.
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);

    // Use the .NET-style Add rather than the Java-style lowercase add left over from the port.
    set.Add("Brasília");

    BrazilianStemFilter filter = new BrazilianStemFilter(
        new SetKeywordMarkerFilter(
            new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader("Brasília Brasilia")),
            set));

    AssertTokenStreamContents(filter, new string[] { "brasília", "brasil" });
}
/// <summary>
/// Creates a new <see cref="BrazilianAnalyzer"/> and delegates to CheckOneTerm to compare
/// the analysis of <paramref name="input"/> with <paramref name="expected"/>.
/// </summary>
/// <param name="input">Text handed to the analyzer.</param>
/// <param name="expected">Term text the analysis is expected to yield.</param>
private void Check(string input, string expected)
{
    BrazilianAnalyzer analyzer = new BrazilianAnalyzer(TEST_VERSION_CURRENT);
    CheckOneTerm(analyzer, input, expected);
}
/// <summary>
/// Delegates to CheckOneTerm with the caller-supplied analyzer. No analyzer is created
/// here, so callers can pass the same instance across several calls.
/// </summary>
/// <param name="a">Analyzer to run <paramref name="input"/> through.</param>
/// <param name="input">Text handed to the analyzer.</param>
/// <param name="expected">Term text the analysis is expected to yield.</param>
// NOTE(review): the camelCase name departs from the PascalCase used by the other methods in this class.
private void checkReuse(Analyzer a, string input, string expected)
{
CheckOneTerm(a, input, expected);
}
/// <summary>
/// Blasts some random strings through the analyzer; the iteration count is
/// 1000 scaled by RANDOM_MULTIPLIER.
/// </summary>
[Test]
public virtual void TestRandomStrings()
{
    // Locals are evaluated in the same order as the original argument list.
    var rng = Random();
    BrazilianAnalyzer analyzer = new BrazilianAnalyzer(TEST_VERSION_CURRENT);
    var iterations = 1000 * RANDOM_MULTIPLIER;

    CheckRandomData(rng, analyzer, iterations);
}
/// <summary>
/// Runs an empty string through the keyword-tokenizer-plus-stem-filter analyzer
/// defined by the nested helper class and expects an empty term back.
/// </summary>
[Test]
public virtual void TestEmptyTerm()
{
    Analyzer keywordAnalyzer = new AnalyzerAnonymousInnerClassHelper(this);
    CheckOneTerm(keywordAnalyzer, "", "");
}
/// <summary>
/// Analyzer whose components are a <see cref="KeywordTokenizer"/> over the reader,
/// wrapped in a <see cref="BrazilianStemFilter"/>.
/// </summary>
private class AnalyzerAnonymousInnerClassHelper : Analyzer
{
    // Reference to the enclosing test instance; stored by the constructor and not read in this class.
    private readonly TestBrazilianStemmer outerInstance;

    /// <param name="outerInstance">The enclosing test instance.</param>
    public AnalyzerAnonymousInnerClassHelper(TestBrazilianStemmer outerInstance)
    {
        this.outerInstance = outerInstance;
    }

    /// <summary>
    /// Builds the tokenizer/filter pair for a field; <paramref name="fieldName"/> is not consulted.
    /// </summary>
    public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
    {
        Tokenizer source = new KeywordTokenizer(reader);
        BrazilianStemFilter stemmed = new BrazilianStemFilter(source);
        return new TokenStreamComponents(source, stemmed);
    }
}
}
}