// blob: f8ef708f5c90a14f0c35c4f37cb1415d0ad76f51 [file] [log] [blame]
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;
using NUnit.Framework;
using System;
namespace Lucene.Net.Analysis.Fr
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Token-level tests for <see cref="FrenchAnalyzer"/>, covering the analyzer
/// built for the current test version as well as the analyzers built for
/// <c>LuceneVersion.LUCENE_30</c> and <c>LuceneVersion.LUCENE_31</c>.
/// </summary>
public class TestFrenchAnalyzer : BaseTokenStreamTestCase
{
    /// <summary>
    /// Runs a series of inputs through an analyzer created with
    /// <c>TEST_VERSION_CURRENT</c> and checks the exact terms produced.
    /// </summary>
    [Test]
    public virtual void TestAnalyzer()
    {
        var analyzer = new FrenchAnalyzer(TEST_VERSION_CURRENT);
        string[] animals = { "chien", "chat", "cheval" };

        // Empty input yields no terms.
        AssertAnalyzesTo(analyzer, "", new string[0]);

        // Plain words, mixed casing, and punctuation noise all reduce to the same three terms.
        AssertAnalyzesTo(analyzer, "chien chat cheval", animals);
        AssertAnalyzesTo(analyzer, "chien CHAT CHEVAL", animals);
        AssertAnalyzesTo(analyzer, " chien ,? + = - CHAT /: > CHEVAL", animals);
        AssertAnalyzesTo(analyzer, "chien++", new[] { "chien" });
        AssertAnalyzesTo(analyzer, "mot \"entreguillemet\"", new[] { "mot", "entreguilemet" });

        // French-specific cases from here on.

        // 1. Hyphenated compound: one might expect a single term, since French
        //    commonly composes words with a hyphen, but two terms are asserted.
        AssertAnalyzesTo(analyzer, "Jean-François", new[] { "jean", "francoi" });

        // 2. Stopwords are dropped.
        AssertAnalyzesTo(analyzer, "le la chien les aux chat du des à cheval", animals);

        // Nouns and adjectives.
        AssertAnalyzesTo(
            analyzer,
            "lances chismes habitable chiste éléments captifs",
            new[] { "lanc", "chism", "habitabl", "chist", "element", "captif" });

        // Verbs.
        AssertAnalyzesTo(
            analyzer,
            "finissions souffrirent rugissante",
            new[] { "finision", "soufrirent", "rugisant" });

        // Miscellaneous input; "aujourd'hui" is expected to remain a single term.
        AssertAnalyzesTo(
            analyzer,
            "C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ",
            new[] { "c3po", "aujourd'hui", "oeuf", "ïaöuaä", "anticonstitutionel", "java" });

        // More miscellaneous input; both "1940-1945" and "1940:1945" are
        // expected to be split into two terms each.
        AssertAnalyzesTo(
            analyzer,
            "33Bis 1940-1945 1940:1945 (---i+++)*",
            new[] { "33bi", "1940", "1945", "1940", "1945", "i" });
    }

    /// <summary>
    /// Same inputs as <see cref="TestAnalyzer"/>, but against an analyzer created
    /// with <c>LuceneVersion.LUCENE_30</c>, whose expected terms differ.
    /// Deprecated (3.1): remove this test for Lucene 5.0.
    /// </summary>
    [Test]
    [Obsolete("(3.1) remove this test for Lucene 5.0")]
    public virtual void TestAnalyzer30()
    {
        var legacyAnalyzer = new FrenchAnalyzer(LuceneVersion.LUCENE_30);
        string[] animals = { "chien", "chat", "cheval" };

        // Empty input yields no terms.
        AssertAnalyzesTo(legacyAnalyzer, "", new string[0]);

        // Plain words, mixed casing, and punctuation noise all reduce to the same three terms.
        AssertAnalyzesTo(legacyAnalyzer, "chien chat cheval", animals);
        AssertAnalyzesTo(legacyAnalyzer, "chien CHAT CHEVAL", animals);
        AssertAnalyzesTo(legacyAnalyzer, " chien ,? + = - CHAT /: > CHEVAL", animals);
        AssertAnalyzesTo(legacyAnalyzer, "chien++", new[] { "chien" });
        AssertAnalyzesTo(legacyAnalyzer, "mot \"entreguillemet\"", new[] { "mot", "entreguillemet" });

        // French-specific cases from here on.

        // 1. Hyphenated compound: one might expect a single term, since French
        //    commonly composes words with a hyphen, but two terms are asserted.
        AssertAnalyzesTo(legacyAnalyzer, "Jean-François", new[] { "jean", "françois" });

        // 2. Stopwords are dropped.
        AssertAnalyzesTo(legacyAnalyzer, "le la chien les aux chat du des à cheval", animals);

        // Nouns and adjectives.
        AssertAnalyzesTo(
            legacyAnalyzer,
            "lances chismes habitable chiste éléments captifs",
            new[] { "lanc", "chism", "habit", "chist", "élément", "captif" });

        // Verbs.
        AssertAnalyzesTo(
            legacyAnalyzer,
            "finissions souffrirent rugissante",
            new[] { "fin", "souffr", "rug" });

        // Miscellaneous input; "aujourd'hui" is expected to remain a single term.
        AssertAnalyzesTo(
            legacyAnalyzer,
            "C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ",
            new[] { "c3po", "aujourd'hui", "oeuf", "ïâöûàä", "anticonstitutionnel", "jav" });

        // More miscellaneous input; here "1940-1945" is expected to stay one
        // term while "1940:1945" is split in two.
        AssertAnalyzesTo(
            legacyAnalyzer,
            "33Bis 1940-1945 1940:1945 (---i+++)*",
            new[] { "33bis", "1940-1945", "1940", "1945", "i" });
    }

    /// <summary>
    /// Analyzes two inputs in succession with the same analyzer instance and
    /// checks the terms of each.
    /// </summary>
    [Test]
    public virtual void TestReusableTokenStream()
    {
        var analyzer = new FrenchAnalyzer(TEST_VERSION_CURRENT);

        // Stopwords are dropped.
        AssertAnalyzesTo(
            analyzer,
            "le la chien les aux chat du des à cheval",
            new[] { "chien", "chat", "cheval" });

        // Nouns and adjectives.
        AssertAnalyzesTo(
            analyzer,
            "lances chismes habitable chiste éléments captifs",
            new[] { "lanc", "chism", "habitabl", "chist", "element", "captif" });
    }

    /// <summary>
    /// Passes a set containing "habitable" as the third constructor argument and
    /// checks that "habitable" comes out unchanged while "chiste" does not.
    /// The check is performed on two separately constructed analyzers.
    /// </summary>
    [Test]
    public virtual void TestExclusionTableViaCtor()
    {
        var exclusions = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
        exclusions.add("habitable");

        const string input = "habitable chiste";

        var first = new FrenchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, exclusions);
        AssertAnalyzesTo(first, input, new[] { "habitable", "chist" });

        var second = new FrenchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, exclusions);
        AssertAnalyzesTo(second, input, new[] { "habitable", "chist" });
    }

    /// <summary>
    /// Checks that the elided article in "l'embrouille" is not part of the
    /// resulting term.
    /// </summary>
    [Test]
    public virtual void TestElision()
    {
        var analyzer = new FrenchAnalyzer(TEST_VERSION_CURRENT);

        AssertAnalyzesTo(
            analyzer,
            "voir l'embrouille",
            new[] { "voir", "embrouil" });
    }

    /// <summary>
    /// Prior to 3.1, this analyzer had no lowercase filter and stopwords were
    /// case sensitive. This behavior is preserved for backwards compatibility.
    /// Deprecated (3.1): remove this test in Lucene 5.0.
    /// </summary>
    [Test]
    [Obsolete("(3.1) Remove this test in Lucene 5.0")]
    public virtual void TestBuggyStopwordsCasing()
    {
        var legacyAnalyzer = new FrenchAnalyzer(LuceneVersion.LUCENE_30);

        // The capitalized word is not removed as a stopword here.
        AssertAnalyzesTo(legacyAnalyzer, "Votre", new[] { "votr" });
    }

    /// <summary>
    /// Checks that stopword matching is not case sensitive: the capitalized
    /// "Votre" must produce no terms.
    /// </summary>
    [Test]
    public virtual void TestStopwordsCasing()
    {
        var analyzer = new FrenchAnalyzer(LuceneVersion.LUCENE_31);

        AssertAnalyzesTo(analyzer, "Votre", new string[0]);
    }

    /// <summary>
    /// Feeds random strings through the analyzer via <c>CheckRandomData</c>.
    /// </summary>
    [Test]
    public virtual void TestRandomStrings()
    {
        // Arguments are obtained in the same order in which they are passed.
        var randomSource = Random();
        var analyzer = new FrenchAnalyzer(TEST_VERSION_CURRENT);
        var iterations = 1000 * RANDOM_MULTIPLIER;

        CheckRandomData(randomSource, analyzer, iterations);
    }

    /// <summary>
    /// Checks that accented and unaccented spellings of the same word produce
    /// the same single term.
    /// </summary>
    [Test]
    public virtual void TestAccentInsensitive()
    {
        Analyzer analyzer = new FrenchAnalyzer(TEST_VERSION_CURRENT);
        const string expectedTerm = "securitair";

        CheckOneTerm(analyzer, "sécuritaires", expectedTerm);
        CheckOneTerm(analyzer, "securitaires", expectedTerm);
    }
}
}