blob: 2e3a64e4e48d73889090a00aa1ae64a1671e34d6 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.fr;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
/**
 * Test case for FrenchAnalyzer.
 *
 * <p>Each test feeds text to a {@code FrenchAnalyzer} and compares the emitted terms against a
 * hand-written expected list. Analyzers are opened in try-with-resources blocks so that they are
 * closed even when an assertion fails part-way through a test.
 */
public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {

  /**
   * Exercises the default analyzer on a broad mix of inputs: empty text, case variations,
   * punctuation, stop words, nouns, adjectives, verbs and alphanumeric tokens.
   */
  public void testAnalyzer() throws Exception {
    try (FrenchAnalyzer fa = new FrenchAnalyzer()) {
      // Empty input yields no terms.
      assertAnalyzesTo(fa, "", new String[] {});

      // Plain words, upper-case words and surrounding punctuation all reduce to the same terms.
      assertAnalyzesTo(fa, "chien chat cheval", new String[] {"chien", "chat", "cheval"});
      assertAnalyzesTo(fa, "chien CHAT CHEVAL", new String[] {"chien", "chat", "cheval"});
      assertAnalyzesTo(
          fa, " chien ,? + = - CHAT /: > CHEVAL", new String[] {"chien", "chat", "cheval"});
      assertAnalyzesTo(fa, "chien++", new String[] {"chien"});
      assertAnalyzesTo(fa, "mot \"entreguillemet\"", new String[] {"mot", "entreguilemet"});

      // French-specific cases follow.

      // 1. Hyphenated compound: one might expect this to stay a single term, since the hyphen
      //    is commonly used to compose words in French, but the analyzer splits it in two.
      assertAnalyzesTo(fa, "Jean-François", new String[] {"jean", "francoi"});

      // 2. Stop words are dropped; only the content words remain.
      assertAnalyzesTo(
          fa,
          "le la chien les aux chat du des à cheval",
          new String[] {"chien", "chat", "cheval"});

      // Some nouns and adjectives.
      assertAnalyzesTo(
          fa,
          "lances chismes habitable chiste éléments captifs",
          new String[] {"lanc", "chism", "habitabl", "chist", "element", "captif"});

      // Some verbs.
      assertAnalyzesTo(
          fa,
          "finissions souffrirent rugissante",
          new String[] {"finision", "soufrirent", "rugisant"});

      // A mixed bag: "aujourd'hui" is expected to stay a single term, which is fine.
      assertAnalyzesTo(
          fa,
          "C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ",
          new String[] {"c3po", "aujourd'hui", "oeuf", "ïaöuaä", "anticonstitutionel", "java"});

      // More mixed input: both "1940-1945" and "1940:1945" are expected to be split into two
      // separate numeric terms each.
      assertAnalyzesTo(
          fa,
          "33Bis 1940-1945 1940:1945 (---i+++)*",
          new String[] {"33bi", "1940", "1945", "1940", "1945", "i"});
    }
  }

  /** Runs several inputs through one analyzer instance to check that it can be reused. */
  public void testReusableTokenStream() throws Exception {
    try (FrenchAnalyzer fa = new FrenchAnalyzer()) {
      // Stop words.
      assertAnalyzesTo(
          fa,
          "le la chien les aux chat du des à cheval",
          new String[] {"chien", "chat", "cheval"});

      // Some nouns and adjectives.
      assertAnalyzesTo(
          fa,
          "lances chismes habitable chiste éléments captifs",
          new String[] {"lanc", "chism", "habitabl", "chist", "element", "captif"});
    }
  }

  /**
   * Checks that a word placed in the set given as the second constructor argument is emitted
   * unchanged ("habitable"), while a word outside that set is still reduced ("chiste" to
   * "chist"). The check is repeated on a second, freshly constructed analyzer.
   */
  public void testExclusionTableViaCtor() throws Exception {
    CharArraySet set = new CharArraySet(1, true);
    set.add("habitable");

    try (FrenchAnalyzer fa = new FrenchAnalyzer(CharArraySet.EMPTY_SET, set)) {
      assertAnalyzesTo(fa, "habitable chiste", new String[] {"habitable", "chist"});
    }

    // A second analyzer built with the same arguments must behave identically.
    try (FrenchAnalyzer fa = new FrenchAnalyzer(CharArraySet.EMPTY_SET, set)) {
      assertAnalyzesTo(fa, "habitable chiste", new String[] {"habitable", "chist"});
    }
  }

  /** Checks that an elided article is stripped: "l'embrouille" is expected to yield "embrouil". */
  public void testElision() throws Exception {
    try (FrenchAnalyzer fa = new FrenchAnalyzer()) {
      assertAnalyzesTo(fa, "voir l'embrouille", new String[] {"voir", "embrouil"});
    }
  }

  /** Test that stopwords are not case sensitive: capitalized "Votre" must produce no terms. */
  public void testStopwordsCasing() throws IOException {
    try (FrenchAnalyzer a = new FrenchAnalyzer()) {
      assertAnalyzesTo(a, "Votre", new String[] {});
    }
  }

  /** Blast some random strings through the analyzer. */
  public void testRandomStrings() throws Exception {
    try (Analyzer a = new FrenchAnalyzer()) {
      checkRandomData(random(), a, 200 * RANDOM_MULTIPLIER);
    }
  }

  /** Test accent-insensitivity: accented and unaccented spellings must give the same term. */
  public void testAccentInsensitive() throws Exception {
    try (Analyzer a = new FrenchAnalyzer()) {
      checkOneTerm(a, "sécuritaires", "securitair");
      checkOneTerm(a, "securitaires", "securitair");
    }
  }
}