| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java (revision 885098)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java (working copy)
|
| @@ -26,7 +26,6 @@
|
| import org.apache.lucene.analysis.WordlistLoader; |
| import org.apache.lucene.analysis.standard.StandardFilter; |
| import org.apache.lucene.analysis.standard.StandardTokenizer; |
| -import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc |
| import org.apache.lucene.util.Version; |
| |
| import java.io.*; |
| @@ -36,19 +35,27 @@
|
| import java.util.Collections; |
| |
| /** |
| - * {@link Analyzer} for Czech language. |
| + * {@link Analyzer} for Czech language. |
| * <p> |
| - * Supports an external list of stopwords (words that |
| - * will not be indexed at all). |
| - * A default set of stopwords is used unless an alternative list is specified. |
| + * Supports an external list of stopwords (words that will not be indexed at |
| + * all). A default set of stopwords is used unless an alternative list is |
| + * specified. |
| * </p> |
| - * |
| - * <p><b>NOTE</b>: This class uses the same {@link Version} |
| - * dependent settings as {@link StandardAnalyzer}.</p> |
| + * |
| + * <a name="version"></a> |
| + * <p> |
| + * You must specify the required {@link Version} compatibility when creating |
| + * CzechAnalyzer: |
| + * <ul> |
| + * <li>As of 3.1, words are stemmed with {@link CzechStemFilter} |
| + * <li>As of 2.9, StopFilter preserves position increments |
| + * <li>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see |
| + * <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>) |
| + * </ul> |
| */ |
| public final class CzechAnalyzer extends Analyzer { |
| |
| - /** |
| + /** |
| * List of typical stopwords. |
| * @deprecated use {@link #getDefaultStopSet()} instead |
| */ |
| @@ -74,10 +81,11 @@
|
| "jeho\u017e","j\u00ed\u017e","jeliko\u017e","je\u017e","jako\u017e","na\u010de\u017e", |
| }; |
| |
| - /** |
| - * Returns a set of default Czech-stopwords |
| - * @return a set of default Czech-stopwords |
| - */ |
| + /** |
| + * Returns a set of default Czech-stopwords |
| + * |
| + * @return a set of default Czech-stopwords |
| + */ |
| public static final Set<?> getDefaultStopSet(){ |
| return DefaultSetHolder.DEFAULT_SET; |
| } |
| @@ -87,27 +95,29 @@
|
| Arrays.asList(CZECH_STOP_WORDS), false)); |
| } |
| |
| - /** |
| - * Contains the stopwords used with the {@link StopFilter}. |
| - */ |
| + /** |
| + * Contains the stopwords used with the {@link StopFilter}. |
| + */ |
| // TODO make this final in 3.1 |
| private Set<?> stoptable; |
| private final Version matchVersion; |
| |
| - /** |
| - * Builds an analyzer with the default stop words ({@link #CZECH_STOP_WORDS}). |
| - */ |
| + /** |
| + * Builds an analyzer with the default stop words ({@link #CZECH_STOP_WORDS}). |
| + * |
| +   * @param matchVersion Lucene version to match. See |
| +   *          <a href="#version">above</a> |
| + */ |
| public CzechAnalyzer(Version matchVersion) { |
| this(matchVersion, DefaultSetHolder.DEFAULT_SET); |
| } |
| |
| - /** |
| - * Builds an analyzer with the given stop words and stemming exclusion words |
| + /** |
| + * Builds an analyzer with the given stop words. |
| * |
| - * @param matchVersion |
| - * lucene compatibility version |
| - * @param stopwords |
| - * a stopword set |
| +   * @param matchVersion Lucene version to match. See |
| +   *          <a href="#version">above</a> |
| + * @param stopwords a stopword set |
| */ |
| public CzechAnalyzer(Version matchVersion, Set<?> stopwords) { |
| this.matchVersion = matchVersion; |
| @@ -115,10 +125,14 @@
|
| } |
| |
| |
| - /** |
| - * Builds an analyzer with the given stop words. |
| - * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead |
| - */ |
| + /** |
| + * Builds an analyzer with the given stop words. |
| + * |
| +   * @param matchVersion Lucene version to match. See |
| +   *          <a href="#version">above</a> |
| +   * @param stopwords the stop words |
| + * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead |
| + */ |
| public CzechAnalyzer(Version matchVersion, String... stopwords) { |
| this(matchVersion, StopFilter.makeStopSet( stopwords )); |
| } |
| @@ -126,16 +140,23 @@
|
| /** |
| * Builds an analyzer with the given stop words. |
| * |
| +   * @param matchVersion Lucene version to match. See |
| +   *          <a href="#version">above</a> |
| + * @param stopwords a stopword set |
| * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead |
| */ |
| public CzechAnalyzer(Version matchVersion, HashSet<?> stopwords) { |
| this(matchVersion, (Set<?>)stopwords); |
| } |
| |
| - /** |
| - * Builds an analyzer with the given stop words. |
| - * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead |
| - */ |
| + /** |
| + * Builds an analyzer with the given stop words. |
| + * |
| +   * @param matchVersion Lucene version to match. See |
| +   *          <a href="#version">above</a> |
| + * @param stopwords a file containing stopwords |
| + * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead |
| + */ |
| public CzechAnalyzer(Version matchVersion, File stopwords ) throws IOException { |
| this(matchVersion, (Set<?>)WordlistLoader.getWordSet( stopwords )); |
| } |
| @@ -171,19 +192,24 @@
|
| } |
| } |
| |
| - /** |
| - * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}. |
| - * |
| - * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with |
| - * {@link StandardFilter}, {@link LowerCaseFilter}, and {@link StopFilter} |
| - */ |
| - @Override |
| + /** |
| + * Creates a {@link TokenStream} which tokenizes all the text in the provided |
| + * {@link Reader}. |
| + * |
| + * @return A {@link TokenStream} built from a {@link StandardTokenizer} |
| + * filtered with {@link StandardFilter}, {@link LowerCaseFilter}, |
| + * {@link StopFilter}, and {@link CzechStemFilter} (only if version is |
| +   *         &gt;= LUCENE_31) |
| + */ |
| + @Override |
| public final TokenStream tokenStream( String fieldName, Reader reader ) { |
| TokenStream result = new StandardTokenizer( matchVersion, reader ); |
| result = new StandardFilter( result ); |
| result = new LowerCaseFilter( matchVersion, result ); |
| result = new StopFilter( StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion), |
| result, stoptable ); |
| + if (matchVersion.onOrAfter(Version.LUCENE_31)) |
| + result = new CzechStemFilter(result); |
| return result; |
| } |
| |
| @@ -192,13 +218,15 @@
|
| TokenStream result; |
| }; |
| |
| - /** |
| - * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text in |
| - * the provided {@link Reader}. |
| - * |
| - * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with |
| - * {@link StandardFilter}, {@link LowerCaseFilter}, and {@link StopFilter} |
| - */ |
| + /** |
| + * Returns a (possibly reused) {@link TokenStream} which tokenizes all the |
| + * text in the provided {@link Reader}. |
| + * |
| + * @return A {@link TokenStream} built from a {@link StandardTokenizer} |
| + * filtered with {@link StandardFilter}, {@link LowerCaseFilter}, |
| + * {@link StopFilter}, and {@link CzechStemFilter} (only if version is |
| +   *         &gt;= LUCENE_31) |
| + */ |
| @Override |
| public TokenStream reusableTokenStream(String fieldName, Reader reader) |
| throws IOException { |
| @@ -210,6 +238,8 @@
|
| streams.result = new LowerCaseFilter(matchVersion, streams.result); |
| streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion), |
| streams.result, stoptable); |
| + if (matchVersion.onOrAfter(Version.LUCENE_31)) |
| + streams.result = new CzechStemFilter(streams.result); |
| setPreviousTokenStream(streams); |
| } else { |
| streams.source.reset(reader); |
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilter.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilter.java (revision 0)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilter.java (revision 0)
|
| @@ -0,0 +1,52 @@
|
| +package org.apache.lucene.analysis.cz; |
| + |
| +import java.io.IOException; |
| + |
| +import org.apache.lucene.analysis.TokenFilter; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +/** |
| + * A {@link TokenFilter} that applies {@link CzechStemmer} to stem Czech words. |
| + * <p> |
| + * <b>NOTE</b>: Input is expected to be in lowercase, but with diacritical |
| + * marks. |
| + */ |
| +public final class CzechStemFilter extends TokenFilter { |
| +  private final CzechStemmer stemmer = new CzechStemmer(); |
| +  private final TermAttribute termAtt; |
| + |
| +  /** Creates a filter that stems the terms produced by input. */ |
| +  public CzechStemFilter(TokenStream input) { |
| +    super(input); |
| +    termAtt = addAttribute(TermAttribute.class); |
| +  } |
| + |
| +  /** Advances the wrapped stream and stems the current term in place. */ |
| +  @Override |
| +  public boolean incrementToken() throws IOException { |
| +    if (!input.incrementToken()) |
| +      return false; |
| +    // the stemmer edits the term buffer directly and reports the stem length |
| +    final int stemLength = stemmer.stem(termAtt.termBuffer(), termAtt.termLength()); |
| +    termAtt.setTermLength(stemLength); |
| +    return true; |
| +  } |
| +} |
|
|
| Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\cz\CzechStemFilter.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java (revision 0)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java (revision 0)
|
| @@ -0,0 +1,181 @@
|
| +package org.apache.lucene.analysis.cz; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +/** |
| + * Light Stemmer for Czech. |
| + * <p> |
| + * Implements the algorithm described in: |
| + * <i> |
| + * Indexing and stemming approaches for the Czech language |
| + * </i> |
| + * http://portal.acm.org/citation.cfm?id=1598600 |
| + * </p> |
| + */ |
| +public class CzechStemmer { |
| + |
| +  /** |
| +   * Stem an input buffer of Czech text. |
| +   * <p><b>NOTE</b>: Input is expected to be in lowercase, |
| +   * but with diacritical marks</p> |
| +   * |
| +   * @param s input buffer, modified in place |
| +   * @param len length of input buffer |
| +   * @return length of the stem; a len of zero or less is returned unchanged |
| +   */ |
| +  public int stem(char s[], int len) { |
| +    if (len <= 0) // nothing to stem, and normalize() would read s[len - 1] |
| +      return len; |
| +    len = removeCase(s, len); |
| +    len = removePossessives(s, len); |
| +    return normalize(s, len); |
| +  } |
| +  /** Removes at most one case ending, longest suffixes first; at least 3 chars always remain. */ |
| +  private int removeCase(char s[], int len) { |
| +    if (len > 7 && endsWith(s, len, "atech")) |
| +      return len - 5; |
| +    // four-letter endings need a word of at least 7 chars |
| +    if (len > 6 && |
| +        (endsWith(s, len,"ětem") || |
| +        endsWith(s, len,"etem") || |
| +        endsWith(s, len,"atům"))) |
| +      return len - 4; |
| +    // three-letter endings need a word of at least 6 chars |
| +    if (len > 5 && |
| +        (endsWith(s, len, "ech") || |
| +        endsWith(s, len, "ich") || |
| +        endsWith(s, len, "ích") || |
| +        endsWith(s, len, "ého") || |
| +        endsWith(s, len, "ěmi") || |
| +        endsWith(s, len, "emi") || |
| +        endsWith(s, len, "ému") || |
| +        endsWith(s, len, "ěte") || |
| +        endsWith(s, len, "ete") || |
| +        endsWith(s, len, "ěti") || |
| +        endsWith(s, len, "eti") || |
| +        endsWith(s, len, "ího") || |
| +        endsWith(s, len, "iho") || |
| +        endsWith(s, len, "ími") || |
| +        endsWith(s, len, "ímu") || |
| +        endsWith(s, len, "imu") || |
| +        endsWith(s, len, "ách") || |
| +        endsWith(s, len, "ata") || |
| +        endsWith(s, len, "aty") || |
| +        endsWith(s, len, "ých") || |
| +        endsWith(s, len, "ama") || |
| +        endsWith(s, len, "ami") || |
| +        endsWith(s, len, "ové") || |
| +        endsWith(s, len, "ovi") || |
| +        endsWith(s, len, "ými"))) |
| +      return len - 3; |
| +    // two-letter endings need a word of at least 5 chars |
| +    if (len > 4 && |
| +        (endsWith(s, len, "em") || |
| +        endsWith(s, len, "es") || |
| +        endsWith(s, len, "ém") || |
| +        endsWith(s, len, "ím") || |
| +        endsWith(s, len, "ům") || |
| +        endsWith(s, len, "at") || |
| +        endsWith(s, len, "ám") || |
| +        endsWith(s, len, "os") || |
| +        endsWith(s, len, "us") || |
| +        endsWith(s, len, "ým") || |
| +        endsWith(s, len, "mi") || |
| +        endsWith(s, len, "ou"))) |
| +      return len - 2; |
| +    // a single trailing vowel is dropped from words of at least 4 chars |
| +    if (len > 3) { |
| +      switch (s[len - 1]) { |
| +        case 'a': |
| +        case 'e': |
| +        case 'i': |
| +        case 'o': |
| +        case 'u': |
| +        case 'ů': |
| +        case 'y': |
| +        case 'á': |
| +        case 'é': |
| +        case 'í': |
| +        case 'ý': |
| +        case 'ě': |
| +          return len - 1; |
| +      } |
| +    } |
| +    // no case ending matched |
| +    return len; |
| +  } |
| +  /** Strips a possessive ending (ov, in, ův) from words longer than 5 chars. */ |
| +  private int removePossessives(char s[], int len) { |
| +    if (len <= 5) |
| +      return len; |
| +    boolean possessive = endsWith(s, len, "ov") |
| +        || endsWith(s, len, "in") |
| +        || endsWith(s, len, "ův"); |
| +    // all three endings are two chars long |
| +    return possessive ? len - 2 : len; |
| +  } |
| +  /** Rewrites the stem ending in place; reads s[len - 1] unconditionally, so len must be >= 1. */ |
| +  private int normalize(char s[], int len) { |
| +    if (endsWith(s, len, "čt")) { // čt -> ck |
| +      s[len - 2] = 'c'; |
| +      s[len - 1] = 'k'; |
| +      return len; |
| +    } |
| +    // second two-letter rewrite, checked only when čt did not match |
| +    if (endsWith(s, len, "št")) { // št -> sk |
| +      s[len - 2] = 's'; |
| +      s[len - 1] = 'k'; |
| +      return len; |
| +    } |
| +    // single-consonant rewrites on the last character |
| +    switch(s[len - 1]) { |
| +      case 'c': // [cč] -> k |
| +      case 'č': |
| +        s[len - 1] = 'k'; |
| +        return len; |
| +      case 'z': // [zž] -> h |
| +      case 'ž': |
| +        s[len - 1] = 'h'; |
| +        return len; |
| +    } |
| +    // drop an 'e' in second-to-last position by shifting the last char left |
| +    if (len > 1 && s[len - 2] == 'e') { |
| +      s[len - 2] = s[len - 1]; // e* > * |
| +      return len - 1; |
| +    } |
| +    // ů in second-to-last position becomes o (words of 3+ chars only) |
| +    if (len > 2 && s[len - 2] == 'ů') { |
| +      s[len - 2] = 'o'; // *ů* -> *o* |
| +      return len; |
| +    } |
| +    // nothing to normalize |
| +    return len; |
| +  } |
| +  /** Returns true if the first len chars of s end with suffix. */ |
| +  private boolean endsWith(char s[], int len, String suffix) { |
| +    final int n = suffix.length(); |
| +    if (n > len) |
| +      return false; |
| +    // walk the suffix and the tail of the buffer backwards in lock step |
| +    for (int i = n - 1, j = len - 1; i >= 0; i--, j--) |
| +      if (s[j] != suffix.charAt(i)) |
| +        return false; |
| + |
| +    return true; |
| +  } |
| +} |
|
|
| Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\cz\CzechStemmer.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java (revision 885098)
|
| +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java (working copy)
|
| @@ -24,31 +24,50 @@
|
| |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| import org.apache.lucene.analysis.Analyzer; |
| -import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.util.Version; |
| |
| /** |
| * Test the CzechAnalyzer |
| * |
| - * CzechAnalyzer is like a StandardAnalyzer with a custom stopword list. |
| + * Before Lucene 3.1, CzechAnalyzer was a StandardAnalyzer with a custom |
| + * stopword list. As of 3.1 it also includes a stemmer. |
| * |
| */ |
| public class TestCzechAnalyzer extends BaseTokenStreamTestCase { |
| File dataDir = new File(System.getProperty("dataDir", "./bin")); |
| File customStopFile = new File(dataDir, "org/apache/lucene/analysis/cz/customStopWordFile.txt"); |
| |
| + /** |
| + * @deprecated Remove this test when support for 3.0 indexes is no longer needed. |
| + */ |
| + public void testStopWordLegacy() throws Exception { |
| + assertAnalyzesTo(new CzechAnalyzer(Version.LUCENE_30), "Pokud mluvime o volnem", |
| + new String[] { "mluvime", "volnem" }); |
| + } |
| + |
| public void testStopWord() throws Exception { |
| - assertAnalyzesTo(new CzechAnalyzer(Version.LUCENE_CURRENT), "Pokud mluvime o volnem", new String[] { "mluvime", "volnem" }); |
| + assertAnalyzesTo(new CzechAnalyzer(Version.LUCENE_CURRENT), "Pokud mluvime o volnem", |
| + new String[] { "mluvim", "voln" }); |
| } |
| - |
| - public void testReusableTokenStream() throws Exception { |
| - Analyzer analyzer = new CzechAnalyzer(Version.LUCENE_CURRENT); |
| + |
| + /** |
| + * @deprecated Remove this test when support for 3.0 indexes is no longer needed. |
| + */ |
| + public void testReusableTokenStreamLegacy() throws Exception { |
| + Analyzer analyzer = new CzechAnalyzer(Version.LUCENE_30); |
| assertAnalyzesToReuse(analyzer, "Pokud mluvime o volnem", new String[] { "mluvime", "volnem" }); |
| assertAnalyzesToReuse(analyzer, "Česká Republika", new String[] { "česká", "republika" }); |
| } |
| + |
| + public void testReusableTokenStream() throws Exception { |
| + Analyzer analyzer = new CzechAnalyzer(Version.LUCENE_CURRENT); |
| + assertAnalyzesToReuse(analyzer, "Pokud mluvime o volnem", new String[] { "mluvim", "voln" }); |
| + assertAnalyzesToReuse(analyzer, "Česká Republika", new String[] { "česk", "republik" }); |
| + } |
| |
| - /* |
| + /** |
| * An input stream that always throws IOException for testing. |
| + * @deprecated Remove this class when the loadStopWords method is removed. |
| */ |
| private class UnreliableInputStream extends InputStream { |
| @Override |
| @@ -57,24 +76,26 @@
|
| } |
| } |
| |
| - /* |
| + /** |
| * The loadStopWords method does not throw IOException on error, |
| * instead previously it set the stoptable to null (versus empty) |
| * this would cause a NPE when it is time to create the StopFilter. |
| + * @deprecated Remove this test when the loadStopWords method is removed. |
| */ |
| public void testInvalidStopWordFile() throws Exception { |
| - CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT); |
| + CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_30); |
| cz.loadStopWords(new UnreliableInputStream(), "UTF-8"); |
| assertAnalyzesTo(cz, "Pokud mluvime o volnem", |
| new String[] { "pokud", "mluvime", "o", "volnem" }); |
| } |
| |
| - /* |
| + /** |
| * Test that changes to the stop table via loadStopWords are applied immediately |
| * when using reusable token streams. |
| + * @deprecated Remove this test when the loadStopWords method is removed. |
| */ |
| public void testStopWordFileReuse() throws Exception { |
| - CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT); |
| + CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_30); |
| assertAnalyzesToReuse(cz, "Česká Republika", |
| new String[] { "česká", "republika" }); |
| |
| Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java (revision 0)
|
| +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java (revision 0)
|
| @@ -0,0 +1,273 @@
|
| +package org.apache.lucene.analysis.cz; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| + |
| +import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| +import org.apache.lucene.util.Version; |
| + |
| +/** |
| + * Test the Czech Stemmer. |
| + * |
| + * Note: it's algorithmic, so some stems are nonsense |
| + * |
| + */ |
| +public class TestCzechStemmer extends BaseTokenStreamTestCase { |
| + |
| + /** |
| + * Test showing how masculine noun forms conflate |
| + */ |
| + public void testMasculineNouns() throws IOException { |
| + CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT); |
| + |
| + /* animate ending with a hard consonant */ |
| + assertAnalyzesTo(cz, "pán", new String[] { "pán" }); |
| + assertAnalyzesTo(cz, "páni", new String[] { "pán" }); |
| + assertAnalyzesTo(cz, "pánové", new String[] { "pán" }); |
| + assertAnalyzesTo(cz, "pána", new String[] { "pán" }); |
| + assertAnalyzesTo(cz, "pánů", new String[] { "pán" }); |
| + assertAnalyzesTo(cz, "pánovi", new String[] { "pán" }); |
| + assertAnalyzesTo(cz, "pánům", new String[] { "pán" }); |
| + assertAnalyzesTo(cz, "pány", new String[] { "pán" }); |
| + assertAnalyzesTo(cz, "páne", new String[] { "pán" }); |
| + assertAnalyzesTo(cz, "pánech", new String[] { "pán" }); |
| + assertAnalyzesTo(cz, "pánem", new String[] { "pán" }); |
| + |
| + /* inanimate ending with hard consonant */ |
| + assertAnalyzesTo(cz, "hrad", new String[] { "hrad" }); |
| + assertAnalyzesTo(cz, "hradu", new String[] { "hrad" }); |
| + assertAnalyzesTo(cz, "hrade", new String[] { "hrad" }); |
| + assertAnalyzesTo(cz, "hradem", new String[] { "hrad" }); |
| + assertAnalyzesTo(cz, "hrady", new String[] { "hrad" }); |
| + assertAnalyzesTo(cz, "hradech", new String[] { "hrad" }); |
| + assertAnalyzesTo(cz, "hradům", new String[] { "hrad" }); |
| + assertAnalyzesTo(cz, "hradů", new String[] { "hrad" }); |
| + |
| + /* animate ending with a soft consonant */ |
| + assertAnalyzesTo(cz, "muž", new String[] { "muh" }); |
| + assertAnalyzesTo(cz, "muži", new String[] { "muh" }); |
| + assertAnalyzesTo(cz, "muže", new String[] { "muh" }); |
| + assertAnalyzesTo(cz, "mužů", new String[] { "muh" }); |
| + assertAnalyzesTo(cz, "mužům", new String[] { "muh" }); |
| + assertAnalyzesTo(cz, "mužích", new String[] { "muh" }); |
| + assertAnalyzesTo(cz, "mužem", new String[] { "muh" }); |
| + |
| + /* inanimate ending with a soft consonant */ |
| + assertAnalyzesTo(cz, "stroj", new String[] { "stroj" }); |
| + assertAnalyzesTo(cz, "stroje", new String[] { "stroj" }); |
| + assertAnalyzesTo(cz, "strojů", new String[] { "stroj" }); |
| + assertAnalyzesTo(cz, "stroji", new String[] { "stroj" }); |
| + assertAnalyzesTo(cz, "strojům", new String[] { "stroj" }); |
| + assertAnalyzesTo(cz, "strojích", new String[] { "stroj" }); |
| + assertAnalyzesTo(cz, "strojem", new String[] { "stroj" }); |
| + |
| + /* ending with a */ |
| + assertAnalyzesTo(cz, "předseda", new String[] { "předsd" }); |
| + assertAnalyzesTo(cz, "předsedové", new String[] { "předsd" }); |
| + assertAnalyzesTo(cz, "předsedy", new String[] { "předsd" }); |
| + assertAnalyzesTo(cz, "předsedů", new String[] { "předsd" }); |
| + assertAnalyzesTo(cz, "předsedovi", new String[] { "předsd" }); |
| + assertAnalyzesTo(cz, "předsedům", new String[] { "předsd" }); |
| + assertAnalyzesTo(cz, "předsedu", new String[] { "předsd" }); |
| + assertAnalyzesTo(cz, "předsedo", new String[] { "předsd" }); |
| + assertAnalyzesTo(cz, "předsedech", new String[] { "předsd" }); |
| + assertAnalyzesTo(cz, "předsedou", new String[] { "předsd" }); |
| + |
| + /* ending with e */ |
| + assertAnalyzesTo(cz, "soudce", new String[] { "soudk" }); |
| + assertAnalyzesTo(cz, "soudci", new String[] { "soudk" }); |
| + assertAnalyzesTo(cz, "soudců", new String[] { "soudk" }); |
| + assertAnalyzesTo(cz, "soudcům", new String[] { "soudk" }); |
| + assertAnalyzesTo(cz, "soudcích", new String[] { "soudk" }); |
| + assertAnalyzesTo(cz, "soudcem", new String[] { "soudk" }); |
| + } |
| + |
| + /** |
| + * Test showing how feminine noun forms conflate |
| + */ |
| + public void testFeminineNouns() throws IOException { |
| + CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT); |
| + |
| + /* ending with hard consonant */ |
| + assertAnalyzesTo(cz, "kost", new String[] { "kost" }); |
| + assertAnalyzesTo(cz, "kosti", new String[] { "kost" }); |
| + assertAnalyzesTo(cz, "kostí", new String[] { "kost" }); |
| + assertAnalyzesTo(cz, "kostem", new String[] { "kost" }); |
| + assertAnalyzesTo(cz, "kostech", new String[] { "kost" }); |
| + assertAnalyzesTo(cz, "kostmi", new String[] { "kost" }); |
| + |
| + /* ending with a soft consonant */ |
| + // note: in this example sing nom. and sing acc. don't conflate w/ the rest |
| + assertAnalyzesTo(cz, "píseň", new String[] { "písň" }); |
| + assertAnalyzesTo(cz, "písně", new String[] { "písn" }); |
| + assertAnalyzesTo(cz, "písni", new String[] { "písn" }); |
| + assertAnalyzesTo(cz, "písněmi", new String[] { "písn" }); |
| + assertAnalyzesTo(cz, "písních", new String[] { "písn" }); |
| + assertAnalyzesTo(cz, "písním", new String[] { "písn" }); |
| + |
| + /* ending with e */ |
| + assertAnalyzesTo(cz, "růže", new String[] { "růh" }); |
| + assertAnalyzesTo(cz, "růží", new String[] { "růh" }); |
| + assertAnalyzesTo(cz, "růžím", new String[] { "růh" }); |
| + assertAnalyzesTo(cz, "růžích", new String[] { "růh" }); |
| + assertAnalyzesTo(cz, "růžemi", new String[] { "růh" }); |
| + assertAnalyzesTo(cz, "růži", new String[] { "růh" }); |
| + |
| + /* ending with a */ |
| + assertAnalyzesTo(cz, "žena", new String[] { "žn" }); |
| + assertAnalyzesTo(cz, "ženy", new String[] { "žn" }); |
| + assertAnalyzesTo(cz, "žen", new String[] { "žn" }); |
| + assertAnalyzesTo(cz, "ženě", new String[] { "žn" }); |
| + assertAnalyzesTo(cz, "ženám", new String[] { "žn" }); |
| + assertAnalyzesTo(cz, "ženu", new String[] { "žn" }); |
| + assertAnalyzesTo(cz, "ženo", new String[] { "žn" }); |
| + assertAnalyzesTo(cz, "ženách", new String[] { "žn" }); |
| + assertAnalyzesTo(cz, "ženou", new String[] { "žn" }); |
| + assertAnalyzesTo(cz, "ženami", new String[] { "žn" }); |
| + } |
| + |
| + /** |
| + * Test showing how neuter noun forms conflate |
| + */ |
| + public void testNeuterNouns() throws IOException { |
| + CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT); |
| + |
| + /* ending with o */ |
| + assertAnalyzesTo(cz, "město", new String[] { "měst" }); |
| + assertAnalyzesTo(cz, "města", new String[] { "měst" }); |
| + assertAnalyzesTo(cz, "měst", new String[] { "měst" }); |
| + assertAnalyzesTo(cz, "městu", new String[] { "měst" }); |
| + assertAnalyzesTo(cz, "městům", new String[] { "měst" }); |
| + assertAnalyzesTo(cz, "městě", new String[] { "měst" }); |
| + assertAnalyzesTo(cz, "městech", new String[] { "měst" }); |
| + assertAnalyzesTo(cz, "městem", new String[] { "měst" }); |
| + assertAnalyzesTo(cz, "městy", new String[] { "měst" }); |
| + |
| + /* ending with e */ |
| + assertAnalyzesTo(cz, "moře", new String[] { "moř" }); |
| + assertAnalyzesTo(cz, "moří", new String[] { "moř" }); |
| + assertAnalyzesTo(cz, "mořím", new String[] { "moř" }); |
| + assertAnalyzesTo(cz, "moři", new String[] { "moř" }); |
| + assertAnalyzesTo(cz, "mořích", new String[] { "moř" }); |
| + assertAnalyzesTo(cz, "mořem", new String[] { "moř" }); |
| + |
| + /* ending with ě */ |
| + assertAnalyzesTo(cz, "kuře", new String[] { "kuř" }); |
| + assertAnalyzesTo(cz, "kuřata", new String[] { "kuř" }); |
| + assertAnalyzesTo(cz, "kuřete", new String[] { "kuř" }); |
| + assertAnalyzesTo(cz, "kuřat", new String[] { "kuř" }); |
| + assertAnalyzesTo(cz, "kuřeti", new String[] { "kuř" }); |
| + assertAnalyzesTo(cz, "kuřatům", new String[] { "kuř" }); |
| + assertAnalyzesTo(cz, "kuřatech", new String[] { "kuř" }); |
| + assertAnalyzesTo(cz, "kuřetem", new String[] { "kuř" }); |
| + assertAnalyzesTo(cz, "kuřaty", new String[] { "kuř" }); |
| + |
| + /* ending with í */ |
| + assertAnalyzesTo(cz, "stavení", new String[] { "stavn" }); |
| + assertAnalyzesTo(cz, "stavením", new String[] { "stavn" }); |
| + assertAnalyzesTo(cz, "staveních", new String[] { "stavn" }); |
| + assertAnalyzesTo(cz, "staveními", new String[] { "stavn" }); |
| + } |
| + |
| + /** |
| + * Test showing how adjectival forms conflate |
| + */ |
| + public void testAdjectives() throws IOException { |
| + CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT); |
| + |
| + /* ending with ý/á/é */ |
| + assertAnalyzesTo(cz, "mladý", new String[] { "mlad" }); |
| + assertAnalyzesTo(cz, "mladí", new String[] { "mlad" }); |
| + assertAnalyzesTo(cz, "mladého", new String[] { "mlad" }); |
| + assertAnalyzesTo(cz, "mladých", new String[] { "mlad" }); |
| + assertAnalyzesTo(cz, "mladému", new String[] { "mlad" }); |
| + assertAnalyzesTo(cz, "mladým", new String[] { "mlad" }); |
| + assertAnalyzesTo(cz, "mladé", new String[] { "mlad" }); |
| + assertAnalyzesTo(cz, "mladém", new String[] { "mlad" }); |
| + assertAnalyzesTo(cz, "mladými", new String[] { "mlad" }); |
| + assertAnalyzesTo(cz, "mladá", new String[] { "mlad" }); |
| + assertAnalyzesTo(cz, "mladou", new String[] { "mlad" }); |
| + |
| + /* ending with í */ |
| + assertAnalyzesTo(cz, "jarní", new String[] { "jarn" }); |
| + assertAnalyzesTo(cz, "jarního", new String[] { "jarn" }); |
| + assertAnalyzesTo(cz, "jarních", new String[] { "jarn" }); |
| + assertAnalyzesTo(cz, "jarnímu", new String[] { "jarn" }); |
| + assertAnalyzesTo(cz, "jarním", new String[] { "jarn" }); |
| + assertAnalyzesTo(cz, "jarními", new String[] { "jarn" }); |
| + } |
| + |
| + /** |
| + * Test some possessive suffixes |
| + */ |
| + public void testPossessive() throws IOException { |
| + CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT); |
| + assertAnalyzesTo(cz, "Karlův", new String[] { "karl" }); |
| + assertAnalyzesTo(cz, "jazykový", new String[] { "jazyk" }); |
| + } |
| + |
| + /** |
| + * Test some exceptional rules, implemented as rewrites. |
| + */ |
| + public void testExceptions() throws IOException { |
| + CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT); |
| + |
| + /* rewrite of št -> sk */ |
| + assertAnalyzesTo(cz, "český", new String[] { "česk" }); |
| + assertAnalyzesTo(cz, "čeští", new String[] { "česk" }); |
| + |
| + /* rewrite of čt -> ck */ |
| + assertAnalyzesTo(cz, "anglický", new String[] { "anglick" }); |
| + assertAnalyzesTo(cz, "angličtí", new String[] { "anglick" }); |
| + |
| + /* rewrite of z -> h */ |
| + assertAnalyzesTo(cz, "kniha", new String[] { "knih" }); |
| + assertAnalyzesTo(cz, "knize", new String[] { "knih" }); |
| + |
| + /* rewrite of ž -> h */ |
| + assertAnalyzesTo(cz, "mazat", new String[] { "mah" }); |
| + assertAnalyzesTo(cz, "mažu", new String[] { "mah" }); |
| + |
| + /* rewrite of c -> k */ |
| + assertAnalyzesTo(cz, "kluk", new String[] { "kluk" }); |
| + assertAnalyzesTo(cz, "kluci", new String[] { "kluk" }); |
| + assertAnalyzesTo(cz, "klucích", new String[] { "kluk" }); |
| + |
| + /* rewrite of č -> k */ |
| + assertAnalyzesTo(cz, "hezký", new String[] { "hezk" }); |
| + assertAnalyzesTo(cz, "hezčí", new String[] { "hezk" }); |
| + |
| + /* rewrite of *ů* -> *o* */ |
| + assertAnalyzesTo(cz, "hůl", new String[] { "hol" }); |
| + assertAnalyzesTo(cz, "hole", new String[] { "hol" }); |
| + |
| + /* rewrite of e* -> * */ |
| + assertAnalyzesTo(cz, "deska", new String[] { "desk" }); |
| + assertAnalyzesTo(cz, "desek", new String[] { "desk" }); |
| + } |
| + |
| + /** |
| + * Test that very short words are not stemmed. |
| + */ |
| + public void testDontStem() throws IOException { |
| + CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT); |
| + assertAnalyzesTo(cz, "e", new String[] { "e" }); |
| + assertAnalyzesTo(cz, "zi", new String[] { "zi" }); |
| + } |
| +} |
|
|
| Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\cz\TestCzechStemmer.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|