| Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java
|
| ===================================================================
|
| --- lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java (revision 1406601)
|
| +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java (working copy)
|
| @@ -57,13 +57,13 @@
|
| public void testKeywordAttribute() throws IOException { |
| MockTokenizer tokenizer = new MockTokenizer(new StringReader("lucene is awesome"), MockTokenizer.WHITESPACE, true); |
| tokenizer.setEnableChecks(true); |
| - HunspellStemFilter filter = new HunspellStemFilter(tokenizer, DICTIONARY); |
| + HunspellStemFilter filter = new HunspellStemFilter(tokenizer, DICTIONARY, 2);
|
| assertTokenStreamContents(filter, new String[]{"lucene", "lucen", "is", "awesome"}, new int[] {1, 0, 1, 1}); |
| |
| // assert with keywork marker |
| tokenizer = new MockTokenizer(new StringReader("lucene is awesome"), MockTokenizer.WHITESPACE, true); |
| CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList("Lucene"), true); |
| - filter = new HunspellStemFilter(new KeywordMarkerFilter(tokenizer, set), DICTIONARY); |
| + filter = new HunspellStemFilter(new KeywordMarkerFilter(tokenizer, set), DICTIONARY, 2);
|
| assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1}); |
| } |
| |
| @@ -74,7 +74,7 @@
|
| @Override |
| protected TokenStreamComponents createComponents(String fieldName, Reader reader) { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); |
| - return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY)); |
| + return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY, 2));
|
| } |
| }; |
| checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER); |
| @@ -85,7 +85,7 @@
|
| @Override |
| protected TokenStreamComponents createComponents(String fieldName, Reader reader) { |
| Tokenizer tokenizer = new KeywordTokenizer(reader); |
| - return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY)); |
| + return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY, 2));
|
| } |
| }; |
| checkOneTermReuse(a, "", ""); |
| Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java
|
| ===================================================================
|
| --- lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java (revision 1406601)
|
| +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java (working copy)
|
| @@ -49,9 +49,10 @@
|
| * |
| * @param input TokenStream whose tokens will be stemmed |
| * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens |
| + * @param recursionCap maximum level of recursion allowed when stripping affixes during stemming
|
| */ |
| - public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary) { |
| - this(input, dictionary, true); |
| + public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, int recursionCap) {
|
| + this(input, dictionary, true, recursionCap);
|
| } |
| |
| /** |
| @@ -61,11 +62,12 @@
|
| * @param input TokenStream whose tokens will be stemmed |
| * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens |
| * @param dedup true if only unique terms should be output. |
| + * @param recursionCap maximum level of recursion allowed when stripping affixes during stemming
|
| */ |
| - public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, boolean dedup) { |
| + public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, boolean dedup, int recursionCap) {
|
| super(input); |
| this.dedup = dedup; |
| - this.stemmer = new HunspellStemmer(dictionary); |
| + this.stemmer = new HunspellStemmer(dictionary, recursionCap);
|
| } |
| |
| /** |
| Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java
|
| ===================================================================
|
| --- lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java (revision 1406601)
|
| +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java (working copy)
|
| @@ -56,11 +56,13 @@
|
| private static final String PARAM_AFFIX = "affix"; |
| private static final String PARAM_IGNORE_CASE = "ignoreCase"; |
| private static final String PARAM_STRICT_AFFIX_PARSING = "strictAffixParsing"; |
| + private static final String PARAM_RECURSION_CAP = "recursionCap";
|
| private static final String TRUE = "true"; |
| private static final String FALSE = "false"; |
| |
| private HunspellDictionary dictionary; |
| private boolean ignoreCase = false; |
| + private int recursionCap = 2;
|
| |
| /** |
| * Loads the hunspell dictionary and affix files defined in the configuration |
| @@ -90,6 +92,11 @@
|
| else throw new IllegalArgumentException("Unknown value for " + PARAM_STRICT_AFFIX_PARSING + ": " + strictAffixParsingParam + ". Must be true or false"); |
| } |
| |
| + String recursionCap = args.get(PARAM_RECURSION_CAP);
|
| + if (recursionCap != null) {
|
| + this.recursionCap = Integer.parseInt(recursionCap);
|
| + }
|
| +
|
| InputStream affix = null; |
| List<InputStream> dictionaries = new ArrayList<InputStream>(); |
| |
| @@ -117,6 +124,6 @@
|
| * @return HunspellStemFilter that filters the TokenStream |
| */ |
| public TokenStream create(TokenStream tokenStream) { |
| - return new HunspellStemFilter(tokenStream, dictionary); |
| + return new HunspellStemFilter(tokenStream, dictionary, recursionCap);
|
| } |
| } |
| Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java
|
| ===================================================================
|
| --- lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java (revision 1406601)
|
| +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java (working copy)
|
| @@ -37,9 +37,7 @@
|
| * conforms to the algorithm in the original hunspell algorithm, including recursive suffix stripping. |
| */ |
| public class HunspellStemmer { |
| - |
| - private static final int RECURSION_CAP = 2; |
| - |
| + private int recursionCap = 2;
|
| private final HunspellDictionary dictionary; |
| private final StringBuilder segment = new StringBuilder(); |
| private CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_40); |
| @@ -54,6 +52,17 @@
|
| } |
| |
| /** |
| + * Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems
|
| + *
|
| + * @param dictionary HunspellDictionary that will be used to create the stems
|
| + * @param recursionCap maximum level of recursion allowed when stripping affixes during stemming
|
| + */
|
| + public HunspellStemmer(HunspellDictionary dictionary, int recursionCap) {
|
| + this(dictionary);
|
| + this.recursionCap = recursionCap;
|
| + }
|
| +
|
| + /**
|
| * Find the stem(s) of the provided word |
| * |
| * @param word Word to find the stems for |
| @@ -197,7 +206,7 @@
|
| } |
| } |
| |
| - if (affix.isCrossProduct() && recursionDepth < RECURSION_CAP) { |
| + if (affix.isCrossProduct() && recursionDepth < recursionCap) {
|
| stems.addAll(stem(strippedWord, length, affix.getAppendFlags(), ++recursionDepth)); |
| } |
| |