blob: 6c8365473ff72c1209fd8550c0e6405cd5c35013 [file] [log] [blame]
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java (revision 885098)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java (working copy)
@@ -26,7 +26,6 @@
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
import org.apache.lucene.util.Version;
import java.io.*;
@@ -36,19 +35,27 @@
import java.util.Collections;
/**
- * {@link Analyzer} for Czech language.
+ * {@link Analyzer} for Czech language.
* <p>
- * Supports an external list of stopwords (words that
- * will not be indexed at all).
- * A default set of stopwords is used unless an alternative list is specified.
+ * Supports an external list of stopwords (words that will not be indexed at
+ * all). A default set of stopwords is used unless an alternative list is
+ * specified.
* </p>
- *
- * <p><b>NOTE</b>: This class uses the same {@link Version}
- * dependent settings as {@link StandardAnalyzer}.</p>
+ *
+ * <a name="version"/>
+ * <p>
+ * You must specify the required {@link Version} compatibility when creating
+ * CzechAnalyzer:
+ * <ul>
+ * <li>As of 3.1, words are stemmed with {@link CzechStemFilter}
+ * <li>As of 2.9, StopFilter preserves position increments
+ * <li>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see
+ * <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
+ * </ul>
*/
public final class CzechAnalyzer extends Analyzer {
- /**
+ /**
* List of typical stopwords.
* @deprecated use {@link #getDefaultStopSet()} instead
*/
@@ -74,10 +81,11 @@
"jeho\u017e","j\u00ed\u017e","jeliko\u017e","je\u017e","jako\u017e","na\u010de\u017e",
};
- /**
- * Returns a set of default Czech-stopwords
- * @return a set of default Czech-stopwords
- */
+ /**
+ * Returns a set of default Czech-stopwords
+ *
+ * @return a set of default Czech-stopwords
+ */
public static final Set<?> getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_SET;
}
@@ -87,27 +95,29 @@
Arrays.asList(CZECH_STOP_WORDS), false));
}
- /**
- * Contains the stopwords used with the {@link StopFilter}.
- */
+ /**
+ * Contains the stopwords used with the {@link StopFilter}.
+ */
// TODO make this final in 3.1
private Set<?> stoptable;
private final Version matchVersion;
- /**
- * Builds an analyzer with the default stop words ({@link #CZECH_STOP_WORDS}).
- */
+ /**
+ * Builds an analyzer with the default stop words ({@link #CZECH_STOP_WORDS}).
+ *
+ * @param matchVersion Lucene version to match See
+ * {@link <a href="#version">above</a>}
+ */
public CzechAnalyzer(Version matchVersion) {
this(matchVersion, DefaultSetHolder.DEFAULT_SET);
}
- /**
- * Builds an analyzer with the given stop words and stemming exclusion words
+ /**
+ * Builds an analyzer with the given stop words.
*
- * @param matchVersion
- * lucene compatibility version
- * @param stopwords
- * a stopword set
+ * @param matchVersion Lucene version to match See
+ * {@link <a href="#version">above</a>}
+ * @param stopwords a stopword set
*/
public CzechAnalyzer(Version matchVersion, Set<?> stopwords) {
this.matchVersion = matchVersion;
@@ -115,10 +125,14 @@
}
- /**
- * Builds an analyzer with the given stop words.
- * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
- */
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param matchVersion Lucene version to match See
+ * {@link <a href="#version">above</a>}
+ * @param stopwords a stopword set
+ * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
+ */
public CzechAnalyzer(Version matchVersion, String... stopwords) {
this(matchVersion, StopFilter.makeStopSet( stopwords ));
}
@@ -126,16 +140,23 @@
/**
* Builds an analyzer with the given stop words.
*
+ * @param matchVersion Lucene version to match See
+ * {@link <a href="#version">above</a>}
+ * @param stopwords a stopword set
* @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
*/
public CzechAnalyzer(Version matchVersion, HashSet<?> stopwords) {
this(matchVersion, (Set<?>)stopwords);
}
- /**
- * Builds an analyzer with the given stop words.
- * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
- */
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param matchVersion Lucene version to match See
+ * {@link <a href="#version">above</a>}
+ * @param stopwords a file containing stopwords
+ * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
+ */
public CzechAnalyzer(Version matchVersion, File stopwords ) throws IOException {
this(matchVersion, (Set<?>)WordlistLoader.getWordSet( stopwords ));
}
@@ -171,19 +192,24 @@
}
}
- /**
- * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
- * {@link StandardFilter}, {@link LowerCaseFilter}, and {@link StopFilter}
- */
- @Override
+ /**
+ * Creates a {@link TokenStream} which tokenizes all the text in the provided
+ * {@link Reader}.
+ *
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+ * {@link StopFilter}, and {@link CzechStemFilter} (only if version is
+ * >= LUCENE_31)
+ */
+ @Override
public final TokenStream tokenStream( String fieldName, Reader reader ) {
TokenStream result = new StandardTokenizer( matchVersion, reader );
result = new StandardFilter( result );
result = new LowerCaseFilter( matchVersion, result );
result = new StopFilter( StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
result, stoptable );
+ if (matchVersion.onOrAfter(Version.LUCENE_31))
+ result = new CzechStemFilter(result);
return result;
}
@@ -192,13 +218,15 @@
TokenStream result;
};
- /**
- * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text in
- * the provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
- * {@link StandardFilter}, {@link LowerCaseFilter}, and {@link StopFilter}
- */
+ /**
+ * Returns a (possibly reused) {@link TokenStream} which tokenizes all the
+ * text in the provided {@link Reader}.
+ *
+ * @return A {@link TokenStream} built from a {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+ * {@link StopFilter}, and {@link CzechStemFilter} (only if version is
+ * >= LUCENE_31)
+ */
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader)
throws IOException {
@@ -210,6 +238,8 @@
streams.result = new LowerCaseFilter(matchVersion, streams.result);
streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
streams.result, stoptable);
+ if (matchVersion.onOrAfter(Version.LUCENE_31))
+ streams.result = new CzechStemFilter(streams.result);
setPreviousTokenStream(streams);
} else {
streams.source.reset(reader);
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilter.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilter.java (revision 0)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilter.java (revision 0)
@@ -0,0 +1,52 @@
+package org.apache.lucene.analysis.cz;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A {@link TokenFilter} that applies {@link CzechStemmer} to stem Czech words.
+ *
+ * <p><b>NOTE</b>: Input is expected to be in lowercase,
+ * but with diacritical marks</p>
+ */
+public final class CzechStemFilter extends TokenFilter {
+ private final CzechStemmer stemmer;
+ private final TermAttribute termAtt;
+
+ public CzechStemFilter(TokenStream input) {
+ super(input);
+ stemmer = new CzechStemmer();
+ termAtt = addAttribute(TermAttribute.class);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ int newlen = stemmer.stem(termAtt.termBuffer(), termAtt.termLength());
+ termAtt.setTermLength(newlen);
+ return true;
+ } else {
+ return false;
+ }
+ }
+}
Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\cz\CzechStemFilter.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java (revision 0)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java (revision 0)
@@ -0,0 +1,181 @@
+package org.apache.lucene.analysis.cz;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Light Stemmer for Czech.
+ * <p>
+ * Implements the algorithm described in:
+ * <i>
+ * Indexing and stemming approaches for the Czech language
+ * </i>
+ * http://portal.acm.org/citation.cfm?id=1598600
+ * </p>
+ */
+public class CzechStemmer {
+
+ /**
+ * Stem an input buffer of Czech text.
+ *
+ * @param s input buffer
+ * @param len length of input buffer
+ * @return length of input buffer after normalization
+ *
+ * <p><b>NOTE</b>: Input is expected to be in lowercase,
+ * but with diacritical marks</p>
+ */
+ public int stem(char s[], int len) {
+ len = removeCase(s, len);
+ len = removePossessives(s, len);
+ len = normalize(s, len);
+ return len;
+ }
+
+ private int removeCase(char s[], int len) {
+ if (len > 7 && endsWith(s, len, "atech"))
+ return len - 5;
+
+ if (len > 6 &&
+ (endsWith(s, len,"ětem") ||
+ endsWith(s, len,"etem") ||
+ endsWith(s, len,"atům")))
+ return len - 4;
+
+ if (len > 5 &&
+ (endsWith(s, len, "ech") ||
+ endsWith(s, len, "ich") ||
+ endsWith(s, len, "ích") ||
+ endsWith(s, len, "ého") ||
+ endsWith(s, len, "ěmi") ||
+ endsWith(s, len, "emi") ||
+ endsWith(s, len, "ému") ||
+ endsWith(s, len, "ěte") ||
+ endsWith(s, len, "ete") ||
+ endsWith(s, len, "ěti") ||
+ endsWith(s, len, "eti") ||
+ endsWith(s, len, "ího") ||
+ endsWith(s, len, "iho") ||
+ endsWith(s, len, "ími") ||
+ endsWith(s, len, "ímu") ||
+ endsWith(s, len, "imu") ||
+ endsWith(s, len, "ách") ||
+ endsWith(s, len, "ata") ||
+ endsWith(s, len, "aty") ||
+ endsWith(s, len, "ých") ||
+ endsWith(s, len, "ama") ||
+ endsWith(s, len, "ami") ||
+ endsWith(s, len, "ové") ||
+ endsWith(s, len, "ovi") ||
+ endsWith(s, len, "ými")))
+ return len - 3;
+
+ if (len > 4 &&
+ (endsWith(s, len, "em") ||
+ endsWith(s, len, "es") ||
+ endsWith(s, len, "ém") ||
+ endsWith(s, len, "ím") ||
+ endsWith(s, len, "ům") ||
+ endsWith(s, len, "at") ||
+ endsWith(s, len, "ám") ||
+ endsWith(s, len, "os") ||
+ endsWith(s, len, "us") ||
+ endsWith(s, len, "ým") ||
+ endsWith(s, len, "mi") ||
+ endsWith(s, len, "ou")))
+ return len - 2;
+
+ if (len > 3) {
+ switch (s[len - 1]) {
+ case 'a':
+ case 'e':
+ case 'i':
+ case 'o':
+ case 'u':
+ case 'ů':
+ case 'y':
+ case 'á':
+ case 'é':
+ case 'í':
+ case 'ý':
+ case 'ě':
+ return len - 1;
+ }
+ }
+
+ return len;
+ }
+
+ private int removePossessives(char s[], int len) {
+ if (len > 5 &&
+ (endsWith(s, len, "ov") ||
+ endsWith(s, len, "in") ||
+ endsWith(s, len, "ův")))
+ return len - 2;
+
+ return len;
+ }
+
+ private int normalize(char s[], int len) {
+ if (endsWith(s, len, "čt")) { // čt -> ck
+ s[len - 2] = 'c';
+ s[len - 1] = 'k';
+ return len;
+ }
+
+ if (endsWith(s, len, "št")) { // št -> sk
+ s[len - 2] = 's';
+ s[len - 1] = 'k';
+ return len;
+ }
+
+ switch(s[len - 1]) {
+ case 'c': // [cč] -> k
+ case 'č':
+ s[len - 1] = 'k';
+ return len;
+ case 'z': // [zž] -> h
+ case 'ž':
+ s[len - 1] = 'h';
+ return len;
+ }
+
+ if (len > 1 && s[len - 2] == 'e') {
+ s[len - 2] = s[len - 1]; // e* > *
+ return len - 1;
+ }
+
+ if (len > 2 && s[len - 2] == 'ů') {
+ s[len - 2] = 'o'; // *ů* -> *o*
+ return len;
+ }
+
+ return len;
+ }
+
+ private boolean endsWith(char s[], int len, String suffix) {
+ int suffixLen = suffix.length();
+ if (suffixLen > len)
+ return false;
+
+ for (int i = suffixLen - 1; i >= 0; i--)
+ if (s[len - (suffixLen - i)] != suffix.charAt(i))
+ return false;
+
+ return true;
+ }
+}
Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\cz\CzechStemmer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java (revision 885098)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java (working copy)
@@ -24,31 +24,50 @@
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;
/**
* Test the CzechAnalyzer
*
- * CzechAnalyzer is like a StandardAnalyzer with a custom stopword list.
+ * Before Lucene 3.1, CzechAnalyzer was a StandardAnalyzer with a custom
+ * stopword list. As of 3.1 it also includes a stemmer.
*
*/
public class TestCzechAnalyzer extends BaseTokenStreamTestCase {
File dataDir = new File(System.getProperty("dataDir", "./bin"));
File customStopFile = new File(dataDir, "org/apache/lucene/analysis/cz/customStopWordFile.txt");
+ /**
+ * @deprecated Remove this test when support for 3.0 indexes is no longer needed.
+ */
+ public void testStopWordLegacy() throws Exception {
+ assertAnalyzesTo(new CzechAnalyzer(Version.LUCENE_30), "Pokud mluvime o volnem",
+ new String[] { "mluvime", "volnem" });
+ }
+
public void testStopWord() throws Exception {
- assertAnalyzesTo(new CzechAnalyzer(Version.LUCENE_CURRENT), "Pokud mluvime o volnem", new String[] { "mluvime", "volnem" });
+ assertAnalyzesTo(new CzechAnalyzer(Version.LUCENE_CURRENT), "Pokud mluvime o volnem",
+ new String[] { "mluvim", "voln" });
}
-
- public void testReusableTokenStream() throws Exception {
- Analyzer analyzer = new CzechAnalyzer(Version.LUCENE_CURRENT);
+
+ /**
+ * @deprecated Remove this test when support for 3.0 indexes is no longer needed.
+ */
+ public void testReusableTokenStreamLegacy() throws Exception {
+ Analyzer analyzer = new CzechAnalyzer(Version.LUCENE_30);
assertAnalyzesToReuse(analyzer, "Pokud mluvime o volnem", new String[] { "mluvime", "volnem" });
assertAnalyzesToReuse(analyzer, "Česká Republika", new String[] { "česká", "republika" });
}
+
+ public void testReusableTokenStream() throws Exception {
+ Analyzer analyzer = new CzechAnalyzer(Version.LUCENE_CURRENT);
+ assertAnalyzesToReuse(analyzer, "Pokud mluvime o volnem", new String[] { "mluvim", "voln" });
+ assertAnalyzesToReuse(analyzer, "Česká Republika", new String[] { "česk", "republik" });
+ }
- /*
+ /**
* An input stream that always throws IOException for testing.
+ * @deprecated Remove this class when the loadStopWords method is removed.
*/
private class UnreliableInputStream extends InputStream {
@Override
@@ -57,24 +76,26 @@
}
}
- /*
+ /**
* The loadStopWords method does not throw IOException on error,
* instead previously it set the stoptable to null (versus empty)
* this would cause a NPE when it is time to create the StopFilter.
+ * @deprecated Remove this test when the loadStopWords method is removed.
*/
public void testInvalidStopWordFile() throws Exception {
- CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
+ CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_30);
cz.loadStopWords(new UnreliableInputStream(), "UTF-8");
assertAnalyzesTo(cz, "Pokud mluvime o volnem",
new String[] { "pokud", "mluvime", "o", "volnem" });
}
- /*
+ /**
* Test that changes to the stop table via loadStopWords are applied immediately
* when using reusable token streams.
+ * @deprecated Remove this test when the loadStopWords method is removed.
*/
public void testStopWordFileReuse() throws Exception {
- CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
+ CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_30);
assertAnalyzesToReuse(cz, "Česká Republika",
new String[] { "česká", "republika" });
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java (revision 0)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java (revision 0)
@@ -0,0 +1,273 @@
+package org.apache.lucene.analysis.cz;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
+
+/**
+ * Test the Czech Stemmer.
+ *
+ * Note: its algorithmic, so some stems are nonsense
+ *
+ */
+public class TestCzechStemmer extends BaseTokenStreamTestCase {
+
+ /**
+ * Test showing how masculine noun forms conflate
+ */
+ public void testMasculineNouns() throws IOException {
+ CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
+
+ /* animate ending with a hard consonant */
+ assertAnalyzesTo(cz, "pán", new String[] { "pán" });
+ assertAnalyzesTo(cz, "páni", new String[] { "pán" });
+ assertAnalyzesTo(cz, "pánové", new String[] { "pán" });
+ assertAnalyzesTo(cz, "pána", new String[] { "pán" });
+ assertAnalyzesTo(cz, "pánů", new String[] { "pán" });
+ assertAnalyzesTo(cz, "pánovi", new String[] { "pán" });
+ assertAnalyzesTo(cz, "pánům", new String[] { "pán" });
+ assertAnalyzesTo(cz, "pány", new String[] { "pán" });
+ assertAnalyzesTo(cz, "páne", new String[] { "pán" });
+ assertAnalyzesTo(cz, "pánech", new String[] { "pán" });
+ assertAnalyzesTo(cz, "pánem", new String[] { "pán" });
+
+ /* inanimate ending with hard consonant */
+ assertAnalyzesTo(cz, "hrad", new String[] { "hrad" });
+ assertAnalyzesTo(cz, "hradu", new String[] { "hrad" });
+ assertAnalyzesTo(cz, "hrade", new String[] { "hrad" });
+ assertAnalyzesTo(cz, "hradem", new String[] { "hrad" });
+ assertAnalyzesTo(cz, "hrady", new String[] { "hrad" });
+ assertAnalyzesTo(cz, "hradech", new String[] { "hrad" });
+ assertAnalyzesTo(cz, "hradům", new String[] { "hrad" });
+ assertAnalyzesTo(cz, "hradů", new String[] { "hrad" });
+
+ /* animate ending with a soft consonant */
+ assertAnalyzesTo(cz, "muž", new String[] { "muh" });
+ assertAnalyzesTo(cz, "muži", new String[] { "muh" });
+ assertAnalyzesTo(cz, "muže", new String[] { "muh" });
+ assertAnalyzesTo(cz, "mužů", new String[] { "muh" });
+ assertAnalyzesTo(cz, "mužům", new String[] { "muh" });
+ assertAnalyzesTo(cz, "mužích", new String[] { "muh" });
+ assertAnalyzesTo(cz, "mužem", new String[] { "muh" });
+
+ /* inanimate ending with a soft consonant */
+ assertAnalyzesTo(cz, "stroj", new String[] { "stroj" });
+ assertAnalyzesTo(cz, "stroje", new String[] { "stroj" });
+ assertAnalyzesTo(cz, "strojů", new String[] { "stroj" });
+ assertAnalyzesTo(cz, "stroji", new String[] { "stroj" });
+ assertAnalyzesTo(cz, "strojům", new String[] { "stroj" });
+ assertAnalyzesTo(cz, "strojích", new String[] { "stroj" });
+ assertAnalyzesTo(cz, "strojem", new String[] { "stroj" });
+
+ /* ending with a */
+ assertAnalyzesTo(cz, "předseda", new String[] { "předsd" });
+ assertAnalyzesTo(cz, "předsedové", new String[] { "předsd" });
+ assertAnalyzesTo(cz, "předsedy", new String[] { "předsd" });
+ assertAnalyzesTo(cz, "předsedů", new String[] { "předsd" });
+ assertAnalyzesTo(cz, "předsedovi", new String[] { "předsd" });
+ assertAnalyzesTo(cz, "předsedům", new String[] { "předsd" });
+ assertAnalyzesTo(cz, "předsedu", new String[] { "předsd" });
+ assertAnalyzesTo(cz, "předsedo", new String[] { "předsd" });
+ assertAnalyzesTo(cz, "předsedech", new String[] { "předsd" });
+ assertAnalyzesTo(cz, "předsedou", new String[] { "předsd" });
+
+ /* ending with e */
+ assertAnalyzesTo(cz, "soudce", new String[] { "soudk" });
+ assertAnalyzesTo(cz, "soudci", new String[] { "soudk" });
+ assertAnalyzesTo(cz, "soudců", new String[] { "soudk" });
+ assertAnalyzesTo(cz, "soudcům", new String[] { "soudk" });
+ assertAnalyzesTo(cz, "soudcích", new String[] { "soudk" });
+ assertAnalyzesTo(cz, "soudcem", new String[] { "soudk" });
+ }
+
+ /**
+ * Test showing how feminine noun forms conflate
+ */
+ public void testFeminineNouns() throws IOException {
+ CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
+
+ /* ending with hard consonant */
+ assertAnalyzesTo(cz, "kost", new String[] { "kost" });
+ assertAnalyzesTo(cz, "kosti", new String[] { "kost" });
+ assertAnalyzesTo(cz, "kostí", new String[] { "kost" });
+ assertAnalyzesTo(cz, "kostem", new String[] { "kost" });
+ assertAnalyzesTo(cz, "kostech", new String[] { "kost" });
+ assertAnalyzesTo(cz, "kostmi", new String[] { "kost" });
+
+ /* ending with a soft consonant */
+ // note: in this example sing nom. and sing acc. don't conflate w/ the rest
+ assertAnalyzesTo(cz, "píseň", new String[] { "písň" });
+ assertAnalyzesTo(cz, "písně", new String[] { "písn" });
+ assertAnalyzesTo(cz, "písni", new String[] { "písn" });
+ assertAnalyzesTo(cz, "písněmi", new String[] { "písn" });
+ assertAnalyzesTo(cz, "písních", new String[] { "písn" });
+ assertAnalyzesTo(cz, "písním", new String[] { "písn" });
+
+ /* ending with e */
+ assertAnalyzesTo(cz, "růže", new String[] { "růh" });
+ assertAnalyzesTo(cz, "růží", new String[] { "růh" });
+ assertAnalyzesTo(cz, "růžím", new String[] { "růh" });
+ assertAnalyzesTo(cz, "růžích", new String[] { "růh" });
+ assertAnalyzesTo(cz, "růžemi", new String[] { "růh" });
+ assertAnalyzesTo(cz, "růži", new String[] { "růh" });
+
+ /* ending with a */
+ assertAnalyzesTo(cz, "žena", new String[] { "žn" });
+ assertAnalyzesTo(cz, "ženy", new String[] { "žn" });
+ assertAnalyzesTo(cz, "žen", new String[] { "žn" });
+ assertAnalyzesTo(cz, "ženě", new String[] { "žn" });
+ assertAnalyzesTo(cz, "ženám", new String[] { "žn" });
+ assertAnalyzesTo(cz, "ženu", new String[] { "žn" });
+ assertAnalyzesTo(cz, "ženo", new String[] { "žn" });
+ assertAnalyzesTo(cz, "ženách", new String[] { "žn" });
+ assertAnalyzesTo(cz, "ženou", new String[] { "žn" });
+ assertAnalyzesTo(cz, "ženami", new String[] { "žn" });
+ }
+
+ /**
+ * Test showing how neuter noun forms conflate
+ */
+ public void testNeuterNouns() throws IOException {
+ CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
+
+ /* ending with o */
+ assertAnalyzesTo(cz, "město", new String[] { "měst" });
+ assertAnalyzesTo(cz, "města", new String[] { "měst" });
+ assertAnalyzesTo(cz, "měst", new String[] { "měst" });
+ assertAnalyzesTo(cz, "městu", new String[] { "měst" });
+ assertAnalyzesTo(cz, "městům", new String[] { "měst" });
+ assertAnalyzesTo(cz, "městě", new String[] { "měst" });
+ assertAnalyzesTo(cz, "městech", new String[] { "měst" });
+ assertAnalyzesTo(cz, "městem", new String[] { "měst" });
+ assertAnalyzesTo(cz, "městy", new String[] { "měst" });
+
+ /* ending with e */
+ assertAnalyzesTo(cz, "moře", new String[] { "moř" });
+ assertAnalyzesTo(cz, "moří", new String[] { "moř" });
+ assertAnalyzesTo(cz, "mořím", new String[] { "moř" });
+ assertAnalyzesTo(cz, "moři", new String[] { "moř" });
+ assertAnalyzesTo(cz, "mořích", new String[] { "moř" });
+ assertAnalyzesTo(cz, "mořem", new String[] { "moř" });
+
+ /* ending with ě */
+ assertAnalyzesTo(cz, "kuře", new String[] { "kuř" });
+ assertAnalyzesTo(cz, "kuřata", new String[] { "kuř" });
+ assertAnalyzesTo(cz, "kuřete", new String[] { "kuř" });
+ assertAnalyzesTo(cz, "kuřat", new String[] { "kuř" });
+ assertAnalyzesTo(cz, "kuřeti", new String[] { "kuř" });
+ assertAnalyzesTo(cz, "kuřatům", new String[] { "kuř" });
+ assertAnalyzesTo(cz, "kuřatech", new String[] { "kuř" });
+ assertAnalyzesTo(cz, "kuřetem", new String[] { "kuř" });
+ assertAnalyzesTo(cz, "kuřaty", new String[] { "kuř" });
+
+ /* ending with í */
+ assertAnalyzesTo(cz, "stavení", new String[] { "stavn" });
+ assertAnalyzesTo(cz, "stavením", new String[] { "stavn" });
+ assertAnalyzesTo(cz, "staveních", new String[] { "stavn" });
+ assertAnalyzesTo(cz, "staveními", new String[] { "stavn" });
+ }
+
+ /**
+ * Test showing how adjectival forms conflate
+ */
+ public void testAdjectives() throws IOException {
+ CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
+
+ /* ending with ý/á/é */
+ assertAnalyzesTo(cz, "mladý", new String[] { "mlad" });
+ assertAnalyzesTo(cz, "mladí", new String[] { "mlad" });
+ assertAnalyzesTo(cz, "mladého", new String[] { "mlad" });
+ assertAnalyzesTo(cz, "mladých", new String[] { "mlad" });
+ assertAnalyzesTo(cz, "mladému", new String[] { "mlad" });
+ assertAnalyzesTo(cz, "mladým", new String[] { "mlad" });
+ assertAnalyzesTo(cz, "mladé", new String[] { "mlad" });
+ assertAnalyzesTo(cz, "mladém", new String[] { "mlad" });
+ assertAnalyzesTo(cz, "mladými", new String[] { "mlad" });
+ assertAnalyzesTo(cz, "mladá", new String[] { "mlad" });
+ assertAnalyzesTo(cz, "mladou", new String[] { "mlad" });
+
+ /* ending with í */
+ assertAnalyzesTo(cz, "jarní", new String[] { "jarn" });
+ assertAnalyzesTo(cz, "jarního", new String[] { "jarn" });
+ assertAnalyzesTo(cz, "jarních", new String[] { "jarn" });
+ assertAnalyzesTo(cz, "jarnímu", new String[] { "jarn" });
+ assertAnalyzesTo(cz, "jarním", new String[] { "jarn" });
+ assertAnalyzesTo(cz, "jarními", new String[] { "jarn" });
+ }
+
+ /**
+ * Test some possessive suffixes
+ */
+ public void testPossessive() throws IOException {
+ CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
+ assertAnalyzesTo(cz, "Karlův", new String[] { "karl" });
+ assertAnalyzesTo(cz, "jazykový", new String[] { "jazyk" });
+ }
+
+ /**
+ * Test some exceptional rules, implemented as rewrites.
+ */
+ public void testExceptions() throws IOException {
+ CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
+
+ /* rewrite of št -> sk */
+ assertAnalyzesTo(cz, "český", new String[] { "česk" });
+ assertAnalyzesTo(cz, "čeští", new String[] { "česk" });
+
+ /* rewrite of čt -> ck */
+ assertAnalyzesTo(cz, "anglický", new String[] { "anglick" });
+ assertAnalyzesTo(cz, "angličtí", new String[] { "anglick" });
+
+ /* rewrite of z -> h */
+ assertAnalyzesTo(cz, "kniha", new String[] { "knih" });
+ assertAnalyzesTo(cz, "knize", new String[] { "knih" });
+
+ /* rewrite of ž -> h */
+ assertAnalyzesTo(cz, "mazat", new String[] { "mah" });
+ assertAnalyzesTo(cz, "mažu", new String[] { "mah" });
+
+ /* rewrite of c -> k */
+ assertAnalyzesTo(cz, "kluk", new String[] { "kluk" });
+ assertAnalyzesTo(cz, "kluci", new String[] { "kluk" });
+ assertAnalyzesTo(cz, "klucích", new String[] { "kluk" });
+
+ /* rewrite of č -> k */
+ assertAnalyzesTo(cz, "hezký", new String[] { "hezk" });
+ assertAnalyzesTo(cz, "hezčí", new String[] { "hezk" });
+
+ /* rewrite of *ů* -> *o* */
+ assertAnalyzesTo(cz, "hůl", new String[] { "hol" });
+ assertAnalyzesTo(cz, "hole", new String[] { "hol" });
+
+ /* rewrite of e* -> * */
+ assertAnalyzesTo(cz, "deska", new String[] { "desk" });
+ assertAnalyzesTo(cz, "desek", new String[] { "desk" });
+ }
+
+ /**
+ * Test that very short words are not stemmed.
+ */
+ public void testDontStem() throws IOException {
+ CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
+ assertAnalyzesTo(cz, "e", new String[] { "e" });
+ assertAnalyzesTo(cz, "zi", new String[] { "zi" });
+ }
+}
Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\cz\TestCzechStemmer.java
___________________________________________________________________
Added: svn:eol-style
+ native