| Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java
|
| ===================================================================
|
| --- lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java (revision 1406601)
|
| +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java (working copy)
|
| @@ -57,13 +57,13 @@
|
| public void testKeywordAttribute() throws IOException { |
| MockTokenizer tokenizer = new MockTokenizer(new StringReader("lucene is awesome"), MockTokenizer.WHITESPACE, true); |
| tokenizer.setEnableChecks(true); |
| - HunspellStemFilter filter = new HunspellStemFilter(tokenizer, DICTIONARY); |
| + HunspellStemFilter filter = new HunspellStemFilter(tokenizer, DICTIONARY, 2);
|
| assertTokenStreamContents(filter, new String[]{"lucene", "lucen", "is", "awesome"}, new int[] {1, 0, 1, 1}); |
| |
| // assert with keywork marker |
| tokenizer = new MockTokenizer(new StringReader("lucene is awesome"), MockTokenizer.WHITESPACE, true); |
| CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList("Lucene"), true); |
| - filter = new HunspellStemFilter(new KeywordMarkerFilter(tokenizer, set), DICTIONARY); |
| + filter = new HunspellStemFilter(new KeywordMarkerFilter(tokenizer, set), DICTIONARY, 2);
|
| assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1}); |
| } |
| |
| @@ -74,7 +74,7 @@
|
| @Override |
| protected TokenStreamComponents createComponents(String fieldName, Reader reader) { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); |
| - return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY)); |
| + return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY, 2));
|
| } |
| }; |
| checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER); |
| @@ -85,7 +85,7 @@
|
| @Override |
| protected TokenStreamComponents createComponents(String fieldName, Reader reader) { |
| Tokenizer tokenizer = new KeywordTokenizer(reader); |
| - return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY)); |
| + return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY, 2));
|
| } |
| }; |
| checkOneTermReuse(a, "", ""); |
| Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java
|
| ===================================================================
|
| --- lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java (revision 1406601)
|
| +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java (working copy)
|
| @@ -49,9 +49,10 @@
|
| * |
| * @param input TokenStream whose tokens will be stemmed |
| * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens |
| + * @param recursionCap maximum level of recursion allowed when stripping affixes during stemming
|
| */ |
| - public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary) { |
| - this(input, dictionary, true); |
| + public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, int recursionCap) {
|
| + this(input, dictionary, true, recursionCap);
|
| } |
| |
| /** |
| @@ -61,11 +62,12 @@
|
| * @param input TokenStream whose tokens will be stemmed |
| * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens |
| * @param dedup true if only unique terms should be output. |
| + * @param recursionCap maximum level of recursion allowed when stripping affixes during stemming
|
| */ |
| - public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, boolean dedup) { |
| + public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, boolean dedup, int recursionCap) {
|
| super(input); |
| this.dedup = dedup; |
| - this.stemmer = new HunspellStemmer(dictionary); |
| + this.stemmer = new HunspellStemmer(dictionary, recursionCap);
|
| } |
| |
| /** |
| Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java
|
| ===================================================================
|
| --- lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java (revision 1406601)
|
| +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java (working copy)
|
| @@ -56,11 +56,13 @@
|
| private static final String PARAM_AFFIX = "affix"; |
| private static final String PARAM_IGNORE_CASE = "ignoreCase"; |
| private static final String PARAM_STRICT_AFFIX_PARSING = "strictAffixParsing"; |
| + private static final String PARAM_RECURSION_CAP = "recursionCap";
|
| private static final String TRUE = "true"; |
| private static final String FALSE = "false"; |
| |
| private HunspellDictionary dictionary; |
| private boolean ignoreCase = false; |
| + private int recursionCap = 2;
|
| |
| /** |
| * Loads the hunspell dictionary and affix files defined in the configuration |
| @@ -90,6 +92,11 @@
|
| else throw new IllegalArgumentException("Unknown value for " + PARAM_STRICT_AFFIX_PARSING + ": " + strictAffixParsingParam + ". Must be true or false"); |
| } |
| |
| + String recursionCap = args.get(PARAM_RECURSION_CAP);
|
| + if (recursionCap != null) {
|
| + this.recursionCap = Integer.parseInt(recursionCap);
|
| + }
|
| +
|
| InputStream affix = null; |
| List<InputStream> dictionaries = new ArrayList<InputStream>(); |
| |
| @@ -117,6 +124,6 @@
|
| * @return HunspellStemFilter that filters the TokenStream |
| */ |
| public TokenStream create(TokenStream tokenStream) { |
| - return new HunspellStemFilter(tokenStream, dictionary); |
| + return new HunspellStemFilter(tokenStream, dictionary, recursionCap);
|
| } |
| } |
| Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java
|
| ===================================================================
|
| --- lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java (revision 1406601)
|
| +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java (working copy)
|
| @@ -37,9 +37,7 @@
|
| * conforms to the algorithm in the original hunspell algorithm, including recursive suffix stripping. |
| */ |
| public class HunspellStemmer { |
| - |
| - private static final int RECURSION_CAP = 2; |
| - |
| + private int recursionCap = 2;
|
| private final HunspellDictionary dictionary; |
| private final StringBuilder segment = new StringBuilder(); |
| private CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_40); |
| @@ -54,6 +52,17 @@
|
| } |
| |
| /** |
| + * Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems
|
| + *
|
| + * @param dictionary HunspellDictionary that will be used to create the stems
|
| + * @param recursionCap maximum level of recursion allowed when stripping affixes during stemming
|
| + */
|
| + public HunspellStemmer(HunspellDictionary dictionary, int recursionCap) {
|
| + this(dictionary);
|
| + this.recursionCap = recursionCap;
|
| + }
|
| +
|
| + /**
|
| * Find the stem(s) of the provided word |
| * |
| * @param word Word to find the stems for |
| @@ -197,7 +206,7 @@
|
| } |
| } |
| |
| - if (affix.isCrossProduct() && recursionDepth < RECURSION_CAP) { |
| + if (affix.isCrossProduct() && recursionDepth < recursionCap) {
|
| stems.addAll(stem(strippedWord, length, affix.getAppendFlags(), ++recursionDepth)); |
| } |
| |