docs/attachments/LUCENE-2067/LUCENE-2067.patch - lucene-jira-archive - Git at Google

 Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
 ===================================================================
 --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java	(revision 885098)
 +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java	(working copy)
 @@ -26,7 +26,6 @@
  import org.apache.lucene.analysis.WordlistLoader;
  import org.apache.lucene.analysis.standard.StandardFilter;
  import org.apache.lucene.analysis.standard.StandardTokenizer;
 -import org.apache.lucene.analysis.standard.StandardAnalyzer;  // for javadoc
  import org.apache.lucene.util.Version;

  import java.io.*;
 @@ -36,19 +35,27 @@
  import java.util.Collections;

  /**
 - * {@link Analyzer} for Czech language.
 + * {@link Analyzer} for Czech language.
   * <p>
 - * Supports an external list of stopwords (words that
 - * will not be indexed at all).
 - * A default set of stopwords is used unless an alternative list is specified.
 + * Supports an external list of stopwords (words that will not be indexed at
 + * all). A default set of stopwords is used unless an alternative list is
 + * specified.
   * </p>
 - *
 - * <p><b>NOTE</b>: This class uses the same {@link Version}
 - * dependent settings as {@link StandardAnalyzer}.</p>
 + *
 + * <a name="version"/>
 + * <p>
 + * You must specify the required {@link Version} compatibility when creating
 + * CzechAnalyzer:
 + * <ul>
 + * <li>As of 3.1, words are stemmed with {@link CzechStemFilter}
 + * <li>As of 2.9, StopFilter preserves position increments
 + * <li>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see
 + * <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
 + * </ul>
   */
  public final class CzechAnalyzer extends Analyzer {

 -	/**
 +  /**
  	 * List of typical stopwords.
  	 * @deprecated use {@link #getDefaultStopSet()} instead
  	 */
 @@ -74,10 +81,11 @@
          "jeho\u017e","j\u00ed\u017e","jeliko\u017e","je\u017e","jako\u017e","na\u010de\u017e",
      };

 -	/**
 -	 * Returns a set of default Czech-stopwords
 -	 * @return a set of default Czech-stopwords
 -	 */
 +  /**
 +   * Returns a set of default Czech-stopwords
 +   *
 +   * @return a set of default Czech-stopwords
 +   */
  	public static final Set<?> getDefaultStopSet(){
  	  return DefaultSetHolder.DEFAULT_SET;
  	}
 @@ -87,27 +95,29 @@
  	      Arrays.asList(CZECH_STOP_WORDS), false));
  	}

 -	/**
 -	 * Contains the stopwords used with the {@link StopFilter}.
 -	 */
 +  /**
 +   * Contains the stopwords used with the {@link StopFilter}.
 +   */
  	// TODO make this final in 3.1
  	private Set<?> stoptable;
    private final Version matchVersion;

 -	/**
 -	 * Builds an analyzer with the default stop words ({@link #CZECH_STOP_WORDS}).
 -	 */
 +  /**
 +   * Builds an analyzer with the default stop words ({@link #CZECH_STOP_WORDS}).
 +   *
 +   * @param matchVersion Lucene version to match; see
 +   *          <a href="#version">above</a>
 +   */
  	public CzechAnalyzer(Version matchVersion) {
      this(matchVersion, DefaultSetHolder.DEFAULT_SET);
  	}

 -	/**
 -   * Builds an analyzer with the given stop words and stemming exclusion words
 +  /**
 +   * Builds an analyzer with the given stop words.
     *
 -   * @param matchVersion
 -   *          lucene compatibility version
 -   * @param stopwords
 -   *          a stopword set
 +   * @param matchVersion Lucene version to match; see
 +   *          <a href="#version">above</a>
 +   * @param stopwords a stopword set
     */
    public CzechAnalyzer(Version matchVersion, Set<?> stopwords) {
      this.matchVersion = matchVersion;
 @@ -115,10 +125,14 @@
    }


 -	/**
 -	 * Builds an analyzer with the given stop words.
 -	 * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
 -	 */
 +  /**
 +   * Builds an analyzer with the given stop words.
 +   *
 +   * @param matchVersion Lucene version to match; see
 +   *          <a href="#version">above</a>
 +   * @param stopwords a stopword set
 +   * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
 +   */
    public CzechAnalyzer(Version matchVersion, String... stopwords) {
      this(matchVersion, StopFilter.makeStopSet( stopwords ));
  	}
 @@ -126,16 +140,23 @@
    /**
     * Builds an analyzer with the given stop words.
     *
 +   * @param matchVersion Lucene version to match; see
 +   *          <a href="#version">above</a>
 +   * @param stopwords a stopword set
     * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
     */
    public CzechAnalyzer(Version matchVersion, HashSet<?> stopwords) {
      this(matchVersion, (Set<?>)stopwords);
  	}

 -	/**
 -	 * Builds an analyzer with the given stop words.
 -	 * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
 -	 */
 +  /**
 +   * Builds an analyzer with the given stop words.
 +   *
 +   * @param matchVersion Lucene version to match; see
 +   *          <a href="#version">above</a>
 +   * @param stopwords a file containing stopwords
 +   * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
 +   */
    public CzechAnalyzer(Version matchVersion, File stopwords ) throws IOException {
      this(matchVersion, (Set<?>)WordlistLoader.getWordSet( stopwords ));
  	}
 @@ -171,19 +192,24 @@
          }
      }

 -	/**
 -	 * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
 -	 *
 -	 * @return  A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
 -	 * 			{@link StandardFilter}, {@link LowerCaseFilter}, and {@link StopFilter}
 -	 */
 -	@Override
 +  /**
 +   * Creates a {@link TokenStream} which tokenizes all the text in the provided
 +   * {@link Reader}.
 +   *
 +   * @return A {@link TokenStream} built from a {@link StandardTokenizer}
 +   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
 +   *         {@link StopFilter}, and {@link CzechStemFilter} (only if version is
 +   *         {@link Version#LUCENE_31} or later)
 +   */
 +  @Override
  	public final TokenStream tokenStream( String fieldName, Reader reader ) {
                  TokenStream result = new StandardTokenizer( matchVersion, reader );
  		result = new StandardFilter( result );
  		result = new LowerCaseFilter( matchVersion, result );
  		result = new StopFilter( StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
                                           result, stoptable );
 +		if (matchVersion.onOrAfter(Version.LUCENE_31))
 +		  result = new CzechStemFilter(result);
  		return result;
  	}

 @@ -192,13 +218,15 @@
  	    TokenStream result;
  	};

 -	/**
 -     * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text in
 -     * the provided {@link Reader}.
 -     *
 -     * @return  A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
 -     *          {@link StandardFilter}, {@link LowerCaseFilter}, and {@link StopFilter}
 -     */
 +  /**
 +   * Returns a (possibly reused) {@link TokenStream} which tokenizes all the
 +   * text in the provided {@link Reader}.
 +   *
 +   * @return A {@link TokenStream} built from a {@link StandardTokenizer}
 +   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
 +   *         {@link StopFilter}, and {@link CzechStemFilter} (only if version is
 +   *         {@link Version#LUCENE_31} or later)
 +   */
  	@Override
  	public TokenStream reusableTokenStream(String fieldName, Reader reader)
        throws IOException {
 @@ -210,6 +238,8 @@
          streams.result = new LowerCaseFilter(matchVersion, streams.result);
          streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
                                          streams.result, stoptable);
 +        if (matchVersion.onOrAfter(Version.LUCENE_31))
 +          streams.result = new CzechStemFilter(streams.result);
          setPreviousTokenStream(streams);
        } else {
          streams.source.reset(reader);
 Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilter.java
 ===================================================================
 --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilter.java	(revision 0)
 +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilter.java	(revision 0)
 @@ -0,0 +1,66 @@
 +package org.apache.lucene.analysis.cz;
 +
 +/**
 + * Licensed to the Apache Software Foundation (ASF) under one or more
 + * contributor license agreements.  See the NOTICE file distributed with
 + * this work for additional information regarding copyright ownership.
 + * The ASF licenses this file to You under the Apache License, Version 2.0
 + * (the "License"); you may not use this file except in compliance with
 + * the License.  You may obtain a copy of the License at
 + *
 + *     http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing, software
 + * distributed under the License is distributed on an "AS IS" BASIS,
 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 + * See the License for the specific language governing permissions and
 + * limitations under the License.
 + */
 +
 +import java.io.IOException;
 +
 +import org.apache.lucene.analysis.TokenFilter;
 +import org.apache.lucene.analysis.TokenStream;
 +import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 +
 +/**
 + * A {@link TokenFilter} that applies {@link CzechStemmer} to stem Czech words.
 + *
 + * <p><b>NOTE</b>: Input is expected to be in lowercase,
 + * but with diacritical marks</p>
 + */
 +public final class CzechStemFilter extends TokenFilter {
 +  /** Stemmer that rewrites each term buffer in place. */
 +  private final CzechStemmer stemmer;
 +  /** Term attribute of this stream; holds the buffer handed to the stemmer. */
 +  private final TermAttribute termAtt;
 +
 +  /**
 +   * Creates a filter that stems every token produced by <code>input</code>.
 +   *
 +   * @param input the token stream to stem
 +   */
 +  public CzechStemFilter(TokenStream input) {
 +    super(input);
 +    stemmer = new CzechStemmer();
 +    termAtt = addAttribute(TermAttribute.class);
 +  }
 +
 +  /**
 +   * Advances the wrapped stream by one token and stems that token's term.
 +   *
 +   * @return <code>true</code> if a token was produced, <code>false</code> once
 +   *         the wrapped stream reports that it has no more tokens
 +   * @throws IOException if the wrapped stream throws it
 +   */
 +  @Override
 +  public boolean incrementToken() throws IOException {
 +    if (!input.incrementToken()) {
 +      return false;
 +    }
 +    // The stemmer edits the buffer in place and reports the new length.
 +    final int stemmedLength = stemmer.stem(termAtt.termBuffer(), termAtt.termLength());
 +    termAtt.setTermLength(stemmedLength);
 +    return true;
 +  }
 +}

 Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\cz\CzechStemFilter.java
 ___________________________________________________________________
 Added: svn:eol-style
    + native

 Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java
 ===================================================================
 --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java	(revision 0)
 +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java	(revision 0)
 @@ -0,0 +1,209 @@
 +package org.apache.lucene.analysis.cz;
 +
 +/**
 + * Licensed to the Apache Software Foundation (ASF) under one or more
 + * contributor license agreements.  See the NOTICE file distributed with
 + * this work for additional information regarding copyright ownership.
 + * The ASF licenses this file to You under the Apache License, Version 2.0
 + * (the "License"); you may not use this file except in compliance with
 + * the License.  You may obtain a copy of the License at
 + *
 + *     http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing, software
 + * distributed under the License is distributed on an "AS IS" BASIS,
 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 + * See the License for the specific language governing permissions and
 + * limitations under the License.
 + */
 +
 +/**
 + * Light Stemmer for Czech.
 + * <p>
 + * Implements the algorithm described in:
 + * <i>
 + * Indexing and stemming approaches for the Czech language
 + * </i>
 + * http://portal.acm.org/citation.cfm?id=1598600
 + * </p>
 + */
 +public class CzechStemmer {
 +
 +  /**
 +   * Stem an input buffer of Czech text.
 +   * <p>
 +   * The buffer is rewritten in place: a case ending and a possessive suffix
 +   * are cut off, then the end of the remaining stem is normalized.
 +   * </p>
 +   * <p><b>NOTE</b>: Input is expected to be in lowercase,
 +   * but with diacritical marks</p>
 +   *
 +   * @param s input buffer
 +   * @param len length of input buffer
 +   * @return length of input buffer after normalization
 +   */
 +  public int stem(char[] s, int len) {
 +    len = removeCase(s, len);
 +    len = removePossessives(s, len);
 +    len = normalize(s, len);
 +    return len;
 +  }
 +
 +  /**
 +   * Cuts off the longest case ending that matches and whose minimum word
 +   * length is met.
 +   *
 +   * @return the length left after the ending has been removed
 +   */
 +  private int removeCase(char[] s, int len) {
 +    if (len > 7 && endsWith(s, len, "atech"))
 +      return len - 5;
 +
 +    if (len > 6 &&
 +        (endsWith(s, len,"ětem") ||
 +        endsWith(s, len,"etem") ||
 +        endsWith(s, len,"atům")))
 +      return len - 4;
 +
 +    if (len > 5 &&
 +        (endsWith(s, len, "ech") ||
 +        endsWith(s, len, "ich") ||
 +        endsWith(s, len, "ích") ||
 +        endsWith(s, len, "ého") ||
 +        endsWith(s, len, "ěmi") ||
 +        endsWith(s, len, "emi") ||
 +        endsWith(s, len, "ému") ||
 +        endsWith(s, len, "ěte") ||
 +        endsWith(s, len, "ete") ||
 +        endsWith(s, len, "ěti") ||
 +        endsWith(s, len, "eti") ||
 +        endsWith(s, len, "ího") ||
 +        endsWith(s, len, "iho") ||
 +        endsWith(s, len, "ími") ||
 +        endsWith(s, len, "ímu") ||
 +        endsWith(s, len, "imu") ||
 +        endsWith(s, len, "ách") ||
 +        endsWith(s, len, "ata") ||
 +        endsWith(s, len, "aty") ||
 +        endsWith(s, len, "ých") ||
 +        endsWith(s, len, "ama") ||
 +        endsWith(s, len, "ami") ||
 +        endsWith(s, len, "ové") ||
 +        endsWith(s, len, "ovi") ||
 +        endsWith(s, len, "ými")))
 +      return len - 3;
 +
 +    if (len > 4 &&
 +        (endsWith(s, len, "em") ||
 +        endsWith(s, len, "es") ||
 +        endsWith(s, len, "ém") ||
 +        endsWith(s, len, "ím") ||
 +        endsWith(s, len, "ům") ||
 +        endsWith(s, len, "at") ||
 +        endsWith(s, len, "ám") ||
 +        endsWith(s, len, "os") ||
 +        endsWith(s, len, "us") ||
 +        endsWith(s, len, "ým") ||
 +        endsWith(s, len, "mi") ||
 +        endsWith(s, len, "ou")))
 +      return len - 2;
 +
 +    if (len > 3) {
 +      switch (s[len - 1]) {
 +        case 'a':
 +        case 'e':
 +        case 'i':
 +        case 'o':
 +        case 'u':
 +        case 'ů':
 +        case 'y':
 +        case 'á':
 +        case 'é':
 +        case 'í':
 +        case 'ý':
 +        case 'ě':
 +          return len - 1;
 +      }
 +    }
 +
 +    return len;
 +  }
 +
 +  /**
 +   * Cuts off a two-letter possessive suffix from words longer than five chars.
 +   *
 +   * @return the length left after the suffix has been removed
 +   */
 +  private int removePossessives(char[] s, int len) {
 +    if (len > 5 &&
 +        (endsWith(s, len, "ov") ||
 +        endsWith(s, len, "in") ||
 +        endsWith(s, len, "ův")))
 +      return len - 2;
 +
 +    return len;
 +  }
 +
 +  /**
 +   * Rewrites the end of the stem; at most one rule is applied.
 +   *
 +   * @return the length after the rewrite
 +   */
 +  private int normalize(char[] s, int len) {
 +    // An empty buffer has no last char to inspect: without this guard the
 +    // switch below would read s[-1] and throw.
 +    if (len == 0)
 +      return len;
 +
 +    if (endsWith(s, len, "čt")) { // čt -> ck
 +      s[len - 2] = 'c';
 +      s[len - 1] = 'k';
 +      return len;
 +    }
 +
 +    if (endsWith(s, len, "št")) { // št -> sk
 +      s[len - 2] = 's';
 +      s[len - 1] = 'k';
 +      return len;
 +    }
 +
 +    switch(s[len - 1]) {
 +      case 'c': // [cč] -> k
 +      case 'č':
 +        s[len - 1] = 'k';
 +        return len;
 +      case 'z': // [zž] -> h
 +      case 'ž':
 +        s[len - 1] = 'h';
 +        return len;
 +    }
 +
 +    if (len > 1 && s[len - 2] == 'e') {
 +      s[len - 2] = s[len - 1]; // e* > *
 +      return len - 1;
 +    }
 +
 +    if (len > 2 && s[len - 2] == 'ů') {
 +      s[len - 2] = 'o'; // *ů* -> *o*
 +      return len;
 +    }
 +
 +    return len;
 +  }
 +
 +  /**
 +   * Tells whether the first <code>len</code> chars of <code>s</code> end with
 +   * <code>suffix</code>.
 +   */
 +  private boolean endsWith(char[] s, int len, String suffix) {
 +    int suffixLen = suffix.length();
 +    if (suffixLen > len)
 +      return false;
 +
 +    for (int i = suffixLen - 1; i >= 0; i--)
 +      if (s[len - (suffixLen - i)] != suffix.charAt(i))
 +        return false;
 +
 +    return true;
 +  }
 +}

 Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\cz\CzechStemmer.java
 ___________________________________________________________________
 Added: svn:eol-style
    + native

 Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
 ===================================================================
 --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java	(revision 885098)
 +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java	(working copy)
 @@ -24,31 +24,50 @@

  import org.apache.lucene.analysis.BaseTokenStreamTestCase;
  import org.apache.lucene.analysis.Analyzer;
 -import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.util.Version;

  /**
   * Test the CzechAnalyzer
   *
 - * CzechAnalyzer is like a StandardAnalyzer with a custom stopword list.
 + * Before Lucene 3.1, CzechAnalyzer was a StandardAnalyzer with a custom
 + * stopword list. As of 3.1 it also includes a stemmer.
   *
   */
  public class TestCzechAnalyzer extends BaseTokenStreamTestCase {
    File dataDir = new File(System.getProperty("dataDir", "./bin"));
    File customStopFile = new File(dataDir, "org/apache/lucene/analysis/cz/customStopWordFile.txt");

 +  /**
 +   * @deprecated Remove this test when support for 3.0 indexes is no longer needed.
 +   */
 +  public void testStopWordLegacy() throws Exception {
 +    assertAnalyzesTo(new CzechAnalyzer(Version.LUCENE_30), "Pokud mluvime o volnem",
 +        new String[] { "mluvime", "volnem" });
 +  }
 +
    public void testStopWord() throws Exception {
 -    assertAnalyzesTo(new CzechAnalyzer(Version.LUCENE_CURRENT), "Pokud mluvime o volnem", new String[] { "mluvime", "volnem" });
 +    assertAnalyzesTo(new CzechAnalyzer(Version.LUCENE_CURRENT), "Pokud mluvime o volnem",
 +        new String[] { "mluvim", "voln" });
    }
 -
 -  public void testReusableTokenStream() throws Exception {
 -    Analyzer analyzer = new CzechAnalyzer(Version.LUCENE_CURRENT);
 +
 +  /**
 +   * @deprecated Remove this test when support for 3.0 indexes is no longer needed.
 +   */
 +  public void testReusableTokenStreamLegacy() throws Exception {
 +    Analyzer analyzer = new CzechAnalyzer(Version.LUCENE_30);
      assertAnalyzesToReuse(analyzer, "Pokud mluvime o volnem", new String[] { "mluvime", "volnem" });
      assertAnalyzesToReuse(analyzer, "Česká Republika", new String[] { "česká", "republika" });
    }
 +
 +  public void testReusableTokenStream() throws Exception {
 +    Analyzer analyzer = new CzechAnalyzer(Version.LUCENE_CURRENT);
 +    assertAnalyzesToReuse(analyzer, "Pokud mluvime o volnem", new String[] { "mluvim", "voln" });
 +    assertAnalyzesToReuse(analyzer, "Česká Republika", new String[] { "česk", "republik" });
 +  }

 -  /*
 +  /**
     * An input stream that always throws IOException for testing.
 +   * @deprecated Remove this class when the loadStopWords method is removed.
     */
    private class UnreliableInputStream extends InputStream {
      @Override
 @@ -57,24 +76,26 @@
      }
    }

 -  /*
 +  /**
     * The loadStopWords method does not throw IOException on error,
     * instead previously it set the stoptable to null (versus empty)
     * this would cause a NPE when it is time to create the StopFilter.
 +   * @deprecated Remove this test when the loadStopWords method is removed.
     */
    public void testInvalidStopWordFile() throws Exception {
 -    CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
 +    CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_30);
      cz.loadStopWords(new UnreliableInputStream(), "UTF-8");
      assertAnalyzesTo(cz, "Pokud mluvime o volnem",
          new String[] { "pokud", "mluvime", "o", "volnem" });
    }

 -  /*
 +  /**
     * Test that changes to the stop table via loadStopWords are applied immediately
     * when using reusable token streams.
 +   * @deprecated Remove this test when the loadStopWords method is removed.
     */
    public void testStopWordFileReuse() throws Exception {
 -    CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
 +    CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_30);
      assertAnalyzesToReuse(cz, "Česká Republika",
        new String[] { "česká", "republika" });

 Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java
 ===================================================================
 --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java	(revision 0)
 +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java	(revision 0)
 @@ -0,0 +1,273 @@
 +package org.apache.lucene.analysis.cz;
 +
 +/**
 + * Licensed to the Apache Software Foundation (ASF) under one or more
 + * contributor license agreements.  See the NOTICE file distributed with
 + * this work for additional information regarding copyright ownership.
 + * The ASF licenses this file to You under the Apache License, Version 2.0
 + * (the "License"); you may not use this file except in compliance with
 + * the License.  You may obtain a copy of the License at
 + *
 + *     http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing, software
 + * distributed under the License is distributed on an "AS IS" BASIS,
 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 + * See the License for the specific language governing permissions and
 + * limitations under the License.
 + */
 +
 +import java.io.IOException;
 +
 +import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 +import org.apache.lucene.util.Version;
 +
 +/**
 + * Tests the Czech stemmer by running words through {@link CzechAnalyzer}.
 + *
 + * Note: it's algorithmic, so some of the expected stems are nonsense
 + *
 + */
 +public class TestCzechStemmer extends BaseTokenStreamTestCase {
 +  
 +  /**
 +   * Checks that the declined forms of masculine nouns conflate to one stem.
 +   */
 +  public void testMasculineNouns() throws IOException {
 +    CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
 +    
 +    /* animate ending with a hard consonant */
 +    assertAnalyzesTo(cz, "pán", new String[] { "pán" });
 +    assertAnalyzesTo(cz, "páni", new String[] { "pán" });
 +    assertAnalyzesTo(cz, "pánové", new String[] { "pán" });
 +    assertAnalyzesTo(cz, "pána", new String[] { "pán" });
 +    assertAnalyzesTo(cz, "pánů", new String[] { "pán" });
 +    assertAnalyzesTo(cz, "pánovi", new String[] { "pán" });
 +    assertAnalyzesTo(cz, "pánům", new String[] { "pán" });
 +    assertAnalyzesTo(cz, "pány", new String[] { "pán" });
 +    assertAnalyzesTo(cz, "páne", new String[] { "pán" });
 +    assertAnalyzesTo(cz, "pánech", new String[] { "pán" });
 +    assertAnalyzesTo(cz, "pánem", new String[] { "pán" });
 +    
 +    /* inanimate ending with hard consonant */
 +    assertAnalyzesTo(cz, "hrad", new String[] { "hrad" });
 +    assertAnalyzesTo(cz, "hradu", new String[] { "hrad" });
 +    assertAnalyzesTo(cz, "hrade", new String[] { "hrad" });
 +    assertAnalyzesTo(cz, "hradem", new String[] { "hrad" });
 +    assertAnalyzesTo(cz, "hrady", new String[] { "hrad" });
 +    assertAnalyzesTo(cz, "hradech", new String[] { "hrad" });
 +    assertAnalyzesTo(cz, "hradům", new String[] { "hrad" });
 +    assertAnalyzesTo(cz, "hradů", new String[] { "hrad" });
 +    
 +    /* animate ending with a soft consonant */
 +    assertAnalyzesTo(cz, "muž", new String[] { "muh" });
 +    assertAnalyzesTo(cz, "muži", new String[] { "muh" });
 +    assertAnalyzesTo(cz, "muže", new String[] { "muh" });
 +    assertAnalyzesTo(cz, "mužů", new String[] { "muh" });
 +    assertAnalyzesTo(cz, "mužům", new String[] { "muh" });
 +    assertAnalyzesTo(cz, "mužích", new String[] { "muh" });
 +    assertAnalyzesTo(cz, "mužem", new String[] { "muh" });
 +    
 +    /* inanimate ending with a soft consonant */
 +    assertAnalyzesTo(cz, "stroj", new String[] { "stroj" });
 +    assertAnalyzesTo(cz, "stroje", new String[] { "stroj" });
 +    assertAnalyzesTo(cz, "strojů", new String[] { "stroj" });
 +    assertAnalyzesTo(cz, "stroji", new String[] { "stroj" });
 +    assertAnalyzesTo(cz, "strojům", new String[] { "stroj" });
 +    assertAnalyzesTo(cz, "strojích", new String[] { "stroj" });
 +    assertAnalyzesTo(cz, "strojem", new String[] { "stroj" });
 +    
 +    /* ending with a */
 +    assertAnalyzesTo(cz, "předseda", new String[] { "předsd" });
 +    assertAnalyzesTo(cz, "předsedové", new String[] { "předsd" });
 +    assertAnalyzesTo(cz, "předsedy", new String[] { "předsd" });
 +    assertAnalyzesTo(cz, "předsedů", new String[] { "předsd" });
 +    assertAnalyzesTo(cz, "předsedovi", new String[] { "předsd" });
 +    assertAnalyzesTo(cz, "předsedům", new String[] { "předsd" });
 +    assertAnalyzesTo(cz, "předsedu", new String[] { "předsd" });
 +    assertAnalyzesTo(cz, "předsedo", new String[] { "předsd" });
 +    assertAnalyzesTo(cz, "předsedech", new String[] { "předsd" });
 +    assertAnalyzesTo(cz, "předsedou", new String[] { "předsd" });
 +    
 +    /* ending with e */
 +    assertAnalyzesTo(cz, "soudce", new String[] { "soudk" });
 +    assertAnalyzesTo(cz, "soudci", new String[] { "soudk" });
 +    assertAnalyzesTo(cz, "soudců", new String[] { "soudk" });
 +    assertAnalyzesTo(cz, "soudcům", new String[] { "soudk" });
 +    assertAnalyzesTo(cz, "soudcích", new String[] { "soudk" });
 +    assertAnalyzesTo(cz, "soudcem", new String[] { "soudk" });
 +  }
 +  
 +  /**
 +   * Checks how the declined forms of feminine nouns conflate (not all do).
 +   */
 +  public void testFeminineNouns() throws IOException {
 +    CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
 +    
 +    /* ending with hard consonant */
 +    assertAnalyzesTo(cz, "kost", new String[] { "kost" });
 +    assertAnalyzesTo(cz, "kosti", new String[] { "kost" });
 +    assertAnalyzesTo(cz, "kostí", new String[] { "kost" });
 +    assertAnalyzesTo(cz, "kostem", new String[] { "kost" });
 +    assertAnalyzesTo(cz, "kostech", new String[] { "kost" });
 +    assertAnalyzesTo(cz, "kostmi", new String[] { "kost" });
 +    
 +    /* ending with a soft consonant */
 +    // note: the nominative/accusative singular ("píseň") doesn't conflate w/ the rest
 +    assertAnalyzesTo(cz, "píseň", new String[] { "písň" });
 +    assertAnalyzesTo(cz, "písně", new String[] { "písn" });
 +    assertAnalyzesTo(cz, "písni", new String[] { "písn" });
 +    assertAnalyzesTo(cz, "písněmi", new String[] { "písn" });
 +    assertAnalyzesTo(cz, "písních", new String[] { "písn" });
 +    assertAnalyzesTo(cz, "písním", new String[] { "písn" });
 +    
 +    /* ending with e */
 +    assertAnalyzesTo(cz, "růže", new String[] { "růh" });
 +    assertAnalyzesTo(cz, "růží", new String[] { "růh" });
 +    assertAnalyzesTo(cz, "růžím", new String[] { "růh" });
 +    assertAnalyzesTo(cz, "růžích", new String[] { "růh" });
 +    assertAnalyzesTo(cz, "růžemi", new String[] { "růh" });
 +    assertAnalyzesTo(cz, "růži", new String[] { "růh" });
 +    
 +    /* ending with a */
 +    assertAnalyzesTo(cz, "žena", new String[] { "žn" });
 +    assertAnalyzesTo(cz, "ženy", new String[] { "žn" });
 +    assertAnalyzesTo(cz, "žen", new String[] { "žn" });
 +    assertAnalyzesTo(cz, "ženě", new String[] { "žn" });
 +    assertAnalyzesTo(cz, "ženám", new String[] { "žn" });
 +    assertAnalyzesTo(cz, "ženu", new String[] { "žn" });
 +    assertAnalyzesTo(cz, "ženo", new String[] { "žn" });
 +    assertAnalyzesTo(cz, "ženách", new String[] { "žn" });
 +    assertAnalyzesTo(cz, "ženou", new String[] { "žn" });
 +    assertAnalyzesTo(cz, "ženami", new String[] { "žn" });
 +  }
 +
 +  /**
 +   * Checks that the declined forms of neuter nouns conflate to one stem.
 +   */
 +  public void testNeuterNouns() throws IOException {
 +    CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
 +    
 +    /* ending with o */
 +    assertAnalyzesTo(cz, "město", new String[] { "měst" });
 +    assertAnalyzesTo(cz, "města", new String[] { "měst" });
 +    assertAnalyzesTo(cz, "měst", new String[] { "měst" });
 +    assertAnalyzesTo(cz, "městu", new String[] { "měst" });
 +    assertAnalyzesTo(cz, "městům", new String[] { "měst" });
 +    assertAnalyzesTo(cz, "městě", new String[] { "měst" });
 +    assertAnalyzesTo(cz, "městech", new String[] { "měst" });
 +    assertAnalyzesTo(cz, "městem", new String[] { "měst" });
 +    assertAnalyzesTo(cz, "městy", new String[] { "měst" });
 +    
 +    /* ending with e */
 +    assertAnalyzesTo(cz, "moře", new String[] { "moř" });
 +    assertAnalyzesTo(cz, "moří", new String[] { "moř" });
 +    assertAnalyzesTo(cz, "mořím", new String[] { "moř" });
 +    assertAnalyzesTo(cz, "moři", new String[] { "moř" });
 +    assertAnalyzesTo(cz, "mořích", new String[] { "moř" });
 +    assertAnalyzesTo(cz, "mořem", new String[] { "moř" });
 +
 +    /* ending with ě */
 +    assertAnalyzesTo(cz, "kuře", new String[] { "kuř" });
 +    assertAnalyzesTo(cz, "kuřata", new String[] { "kuř" });
 +    assertAnalyzesTo(cz, "kuřete", new String[] { "kuř" });
 +    assertAnalyzesTo(cz, "kuřat", new String[] { "kuř" });
 +    assertAnalyzesTo(cz, "kuřeti", new String[] { "kuř" });
 +    assertAnalyzesTo(cz, "kuřatům", new String[] { "kuř" });
 +    assertAnalyzesTo(cz, "kuřatech", new String[] { "kuř" });
 +    assertAnalyzesTo(cz, "kuřetem", new String[] { "kuř" });
 +    assertAnalyzesTo(cz, "kuřaty", new String[] { "kuř" });
 +    
 +    /* ending with í */
 +    assertAnalyzesTo(cz, "stavení", new String[] { "stavn" });
 +    assertAnalyzesTo(cz, "stavením", new String[] { "stavn" });
 +    assertAnalyzesTo(cz, "staveních", new String[] { "stavn" });
 +    assertAnalyzesTo(cz, "staveními", new String[] { "stavn" });    
 +  }
 +  
 +  /**
 +   * Checks that the declined forms of adjectives conflate to one stem.
 +   */
 +  public void testAdjectives() throws IOException {
 +    CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
 +    
 +    /* ending with ý/á/é */
 +    assertAnalyzesTo(cz, "mladý", new String[] { "mlad" });
 +    assertAnalyzesTo(cz, "mladí", new String[] { "mlad" });
 +    assertAnalyzesTo(cz, "mladého", new String[] { "mlad" });
 +    assertAnalyzesTo(cz, "mladých", new String[] { "mlad" });
 +    assertAnalyzesTo(cz, "mladému", new String[] { "mlad" });
 +    assertAnalyzesTo(cz, "mladým", new String[] { "mlad" });
 +    assertAnalyzesTo(cz, "mladé", new String[] { "mlad" });
 +    assertAnalyzesTo(cz, "mladém", new String[] { "mlad" });
 +    assertAnalyzesTo(cz, "mladými", new String[] { "mlad" }); 
 +    assertAnalyzesTo(cz, "mladá", new String[] { "mlad" });
 +    assertAnalyzesTo(cz, "mladou", new String[] { "mlad" });
 +
 +    /* ending with í */
 +    assertAnalyzesTo(cz, "jarní", new String[] { "jarn" });
 +    assertAnalyzesTo(cz, "jarního", new String[] { "jarn" });
 +    assertAnalyzesTo(cz, "jarních", new String[] { "jarn" });
 +    assertAnalyzesTo(cz, "jarnímu", new String[] { "jarn" });
 +    assertAnalyzesTo(cz, "jarním", new String[] { "jarn" });
 +    assertAnalyzesTo(cz, "jarními", new String[] { "jarn" });  
 +  }
 +  
 +  /**
 +   * Checks that possessive suffixes are stripped (the analyzer also lowercases).
 +   */
 +  public void testPossessive() throws IOException {
 +    CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
 +    assertAnalyzesTo(cz, "Karlův", new String[] { "karl" });
 +    assertAnalyzesTo(cz, "jazykový", new String[] { "jazyk" });
 +  }
 +  
 +  /**
 +   * Checks the exceptional rules, implemented as rewrites of the stem's end.
 +   */
 +  public void testExceptions() throws IOException {
 +    CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
 +    
 +    /* rewrite of št -> sk */
 +    assertAnalyzesTo(cz, "český", new String[] { "česk" });
 +    assertAnalyzesTo(cz, "čeští", new String[] { "česk" });
 +    
 +    /* rewrite of čt -> ck */
 +    assertAnalyzesTo(cz, "anglický", new String[] { "anglick" });
 +    assertAnalyzesTo(cz, "angličtí", new String[] { "anglick" });
 +    
 +    /* rewrite of z -> h */
 +    assertAnalyzesTo(cz, "kniha", new String[] { "knih" });
 +    assertAnalyzesTo(cz, "knize", new String[] { "knih" });
 +    
 +    /* rewrite of ž -> h */
 +    assertAnalyzesTo(cz, "mazat", new String[] { "mah" });
 +    assertAnalyzesTo(cz, "mažu", new String[] { "mah" });
 +    
 +    /* rewrite of c -> k */
 +    assertAnalyzesTo(cz, "kluk", new String[] { "kluk" });
 +    assertAnalyzesTo(cz, "kluci", new String[] { "kluk" });
 +    assertAnalyzesTo(cz, "klucích", new String[] { "kluk" });
 +    
 +    /* rewrite of č -> k */
 +    assertAnalyzesTo(cz, "hezký", new String[] { "hezk" });
 +    assertAnalyzesTo(cz, "hezčí", new String[] { "hezk" });
 +    
 +    /* rewrite of *ů* -> *o* */
 +    assertAnalyzesTo(cz, "hůl", new String[] { "hol" });
 +    assertAnalyzesTo(cz, "hole", new String[] { "hol" });
 +    
 +    /* rewrite of e* -> * */
 +    assertAnalyzesTo(cz, "deska", new String[] { "desk" });
 +    assertAnalyzesTo(cz, "desek", new String[] { "desk" });
 +  }
 +  
 +  /**
 +   * Checks that very short words come out of the analyzer unchanged.
 +   */
 +  public void testDontStem() throws IOException {
 +    CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
 +    assertAnalyzesTo(cz, "e", new String[] { "e" });
 +    assertAnalyzesTo(cz, "zi", new String[] { "zi" });
 +  }
 +}

 Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\cz\TestCzechStemmer.java
 ___________________________________________________________________
 Added: svn:eol-style
    + native