| diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java |
| index 3d7c03e..235970b 100644 |
| --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java |
| +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java |
| @@ -58,6 +58,16 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase { |
| } |
| ***/ |
| |
| + /** Debugging aid: prints "Analyzing: " followed by `input`, then passes a WordDelimiterFilter |
| + * (built over keywordMockTokenizer(input) with the default delimiter table and `flags`) to TokenFilterDebugging.debugTokenFilter. |
| + */ |
| + public static void debugTokenFilter(int flags, final String input) throws Exception { |
| + WordDelimiterFilter wdf = new WordDelimiterFilter(keywordMockTokenizer(input), |
| + WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null); |
| + System.out.println("Analyzing: " + input); |
| + TokenFilterDebugging.debugTokenFilter(wdf, input); |
| + } |
| + |
| @Test |
| public void testOffsets() throws IOException { |
| int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE; |
| @@ -122,12 +132,31 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase { |
| new int[] { 11, 15, 15 }); |
| } |
| |
| - public void doSplit(final String input, String... output) throws Exception { |
| - int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE; |
| + public void doSplit(final String input, int flags, String... output) throws Exception{ // asserts that a WordDelimiterFilter using `flags` over keywordMockTokenizer(input) emits exactly `output` |
| WordDelimiterFilter wdf = new WordDelimiterFilter(keywordMockTokenizer(input), |
| WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null); |
| |
| - assertTokenStreamContents(wdf, output); |
| + try { |
| + assertTokenStreamContents(wdf, output); |
| + } catch (AssertionError e) { // on a mismatch, print the input, the expected tokens and the actual tokenization before failing |
| + System.out.println("Analyzing: " + input); |
| + System.out.println("Expected tokens: "); |
| + boolean printed = false; |
| + for (String token : output) { |
| + System.out.print(token + " "); |
| + printed = true; |
| + } |
| + if (printed) { |
| + System.out.println(); |
| + } |
| + debugTokenFilter(flags, input); |
| + throw e; // rethrow the original failure unchanged instead of nesting it inside a second AssertionError |
| + } |
| + } |
| + |
| + public void doSplit(final String input, String... output) throws Exception { |
| + int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE; |
| + doSplit(input, flags, output); // delegate to the flag-taking overload with the default flag set above |
| } |
| |
| @Test |
| @@ -164,7 +193,206 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase { |
| // don't split supplementaries into unpaired surrogates |
| doSplit("𠀀𠀀", "𠀀𠀀"); |
| } |
| - |
| + |
| + /** |
| + * This offers a more precise view of what the WordDelimiterFilter does, |
| + * in particular with unusual combinations of flags. Each call is doSplit(input, flags, expected tokens...). |
| + * |
| + * Some results may seem incorrect, but they have been the behavior of the |
| + * TokenFilter for ages. Correcting these behaviors may be necessary, |
| + * but that is another issue. |
| + */ |
| + public void testNoRegretion() throws Exception { |
| + /* Use the following to print generated tokens |
| + * debugTokenFilter(GENERATE_NUMBER_PARTS | CATENATE_ALL, "abcDef"); |
| + */ |
| + doSplit("abc", 0, "abc"); |
| + doSplit("Wi-Fi", 0); |
| + doSplit("-42", 0, "42"); // Strange behavior! |
| + doSplit("42", 0, "42"); |
| + doSplit("-42-42", 0); |
| + doSplit("PowerShot", 0, "PowerShot"); |
| + doSplit("SD500", 0, "SD500"); |
| + doSplit("//hello---there, 'dude'", 0); |
| + doSplit("O'Neil's", 0); |
| + doSplit("A's+B's&C's", 0); |
| + doSplit("Super-Duper-XL500-42-AutoCoder!", 0); |
| + doSplit("---", 0); |
| + |
| + /* |
| + * We test many of the combinations of: |
| + * PRESERVE_ORIGINAL, CATENATE_ALL, GENERATE_WORD_PARTS, GENERATE_NUMBER_PARTS. |
| + * This means 4 single flags, 6 pairs of flags and only 2 triples of flags. |
| + */ |
| + doSplit("abc", PRESERVE_ORIGINAL, "abc"); |
| + doSplit("Wi-Fi", PRESERVE_ORIGINAL, "Wi-Fi"); |
| + doSplit("-42", PRESERVE_ORIGINAL, "-42", "42"); |
| + doSplit("42", PRESERVE_ORIGINAL, "42"); |
| + doSplit("-42-42", PRESERVE_ORIGINAL, "-42-42"); |
| + doSplit("PowerShot", PRESERVE_ORIGINAL, "PowerShot"); |
| + doSplit("SD500", PRESERVE_ORIGINAL, "SD500"); |
| + doSplit("//hello---there, 'dude'", PRESERVE_ORIGINAL, "//hello---there, 'dude'"); |
| + doSplit("O'Neil's", PRESERVE_ORIGINAL, "O'Neil's"); |
| + doSplit("A's+B's&C's", PRESERVE_ORIGINAL, "A's+B's&C's"); |
| + doSplit("Super-Duper-XL500-42-AutoCoder!", PRESERVE_ORIGINAL, "Super-Duper-XL500-42-AutoCoder!"); |
| + doSplit("---", PRESERVE_ORIGINAL, "---"); |
| + |
| + doSplit("abc", CATENATE_ALL, "abc"); |
| + doSplit("Wi-Fi", CATENATE_ALL, "WiFi"); |
| + doSplit("-42", CATENATE_ALL, "42"); |
| + doSplit("42", CATENATE_ALL, "42"); |
| + doSplit("-42-42", CATENATE_ALL, "4242"); |
| + doSplit("PowerShot", CATENATE_ALL, "PowerShot"); |
| + doSplit("SD500", CATENATE_ALL, "SD500"); |
| + doSplit("//hello---there, 'dude'", CATENATE_ALL, "hellotheredude"); |
| + doSplit("O'Neil's", CATENATE_ALL, "ONeils"); |
| + doSplit("A's+B's&C's", CATENATE_ALL, "AsBsCs"); |
| + doSplit("Super-Duper-XL500-42-AutoCoder!", CATENATE_ALL, "SuperDuperXL50042AutoCoder"); |
| + doSplit("---", CATENATE_ALL); |
| + |
| + doSplit("abc", GENERATE_WORD_PARTS, "abc"); |
| + doSplit("Wi-Fi", GENERATE_WORD_PARTS, "Wi", "Fi"); |
| + doSplit("-42", GENERATE_WORD_PARTS, "42"); |
| + doSplit("42", GENERATE_WORD_PARTS, "42"); |
| + doSplit("-42-42", GENERATE_WORD_PARTS); |
| + doSplit("PowerShot", GENERATE_WORD_PARTS, "PowerShot"); |
| + doSplit("SD500", GENERATE_WORD_PARTS, "SD500"); |
| + doSplit("//hello---there, 'dude'", GENERATE_WORD_PARTS, "hello", "there", "dude"); |
| + doSplit("O'Neil's", GENERATE_WORD_PARTS, "O", "Neil", "s"); |
| + doSplit("A's+B's&C's", GENERATE_WORD_PARTS, "A", "s", "B", "s", "C", "s"); |
| + doSplit("Super-Duper-XL500-42-AutoCoder!", GENERATE_WORD_PARTS, "Super", "Duper", "XL500", "AutoCoder"); |
| + doSplit("---", GENERATE_WORD_PARTS); |
| + |
| + doSplit("abc", GENERATE_NUMBER_PARTS, "abc"); |
| + doSplit("Wi-Fi", GENERATE_NUMBER_PARTS); |
| + doSplit("-42", GENERATE_NUMBER_PARTS, "42"); |
| + doSplit("42", GENERATE_NUMBER_PARTS, "42"); |
| + doSplit("-42-42", GENERATE_NUMBER_PARTS, "42", "42"); |
| + doSplit("PowerShot", GENERATE_NUMBER_PARTS, "PowerShot"); |
| + doSplit("SD500", GENERATE_NUMBER_PARTS, "SD500"); |
| + doSplit("//hello---there, 'dude'", GENERATE_NUMBER_PARTS); |
| + doSplit("O'Neil's", GENERATE_NUMBER_PARTS); |
| + doSplit("A's+B's&C's", GENERATE_NUMBER_PARTS); |
| + doSplit("Super-Duper-XL500-42-AutoCoder!", GENERATE_NUMBER_PARTS, "42"); |
| + doSplit("---", GENERATE_NUMBER_PARTS); |
| + |
| + doSplit("abc", PRESERVE_ORIGINAL | CATENATE_ALL, "abc"); |
| + doSplit("Wi-Fi", PRESERVE_ORIGINAL | CATENATE_ALL, "Wi-Fi", "WiFi"); |
| + doSplit("-42", PRESERVE_ORIGINAL | CATENATE_ALL, "-42", "42"); |
| + doSplit("42", PRESERVE_ORIGINAL | CATENATE_ALL, "42"); |
| + doSplit("-42-42", PRESERVE_ORIGINAL | CATENATE_ALL, "-42-42", "4242"); |
| + doSplit("PowerShot", PRESERVE_ORIGINAL | CATENATE_ALL, "PowerShot"); |
| + doSplit("SD500", PRESERVE_ORIGINAL | CATENATE_ALL, "SD500"); |
| + doSplit("//hello---there, 'dude'", PRESERVE_ORIGINAL | CATENATE_ALL, "//hello---there, 'dude'", "hellotheredude"); |
| + doSplit("O'Neil's", PRESERVE_ORIGINAL | CATENATE_ALL, "O'Neil's", "ONeils"); |
| + doSplit("A's+B's&C's", PRESERVE_ORIGINAL | CATENATE_ALL, "A's+B's&C's", "AsBsCs"); |
| + doSplit("Super-Duper-XL500-42-AutoCoder!", PRESERVE_ORIGINAL | CATENATE_ALL, "Super-Duper-XL500-42-AutoCoder!", "SuperDuperXL50042AutoCoder"); |
| + doSplit("---", PRESERVE_ORIGINAL | CATENATE_ALL, "---"); |
| + |
| + doSplit("abc", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "abc"); |
| + doSplit("Wi-Fi", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "Wi-Fi", "Wi", "Fi"); |
| + doSplit("-42", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "-42", "42"); // Strange behavior: 42 is not a word |
| + doSplit("42", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "42"); |
| + doSplit("-42-42", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "-42-42"); |
| + doSplit("PowerShot", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "PowerShot"); |
| + doSplit("SD500", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "SD500"); |
| + doSplit("//hello---there, 'dude'", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "//hello---there, 'dude'", "hello", "there", "dude"); |
| + doSplit("O'Neil's", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "O'Neil's", "O", "Neil", "s"); |
| + doSplit("A's+B's&C's", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "A's+B's&C's", "A", "s", "B", "s", "C", "s"); |
| + doSplit("Super-Duper-XL500-42-AutoCoder!", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "Super-Duper-XL500-42-AutoCoder!", "Super", "Duper", "XL500", "AutoCoder"); |
| + doSplit("---", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "---"); |
| + |
| + doSplit("abc", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "abc"); |
| + doSplit("Wi-Fi", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "Wi-Fi"); |
| + doSplit("-42", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "-42", "42"); |
| + doSplit("42", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "42"); |
| + doSplit("-42-42", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "-42-42", "42", "42"); |
| + doSplit("PowerShot", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "PowerShot"); |
| + doSplit("SD500", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "SD500"); |
| + doSplit("//hello---there, 'dude'", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "//hello---there, 'dude'"); |
| + doSplit("O'Neil's", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "O'Neil's"); |
| + doSplit("A's+B's&C's", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "A's+B's&C's"); |
| + doSplit("Super-Duper-XL500-42-AutoCoder!", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "Super-Duper-XL500-42-AutoCoder!", "42"); |
| + doSplit("---", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "---"); |
| + |
| + doSplit("abc", CATENATE_ALL | GENERATE_WORD_PARTS, "abc"); |
| + doSplit("Wi-Fi", CATENATE_ALL | GENERATE_WORD_PARTS, "Wi", "WiFi", "Fi"); |
| + doSplit("-42", CATENATE_ALL | GENERATE_WORD_PARTS, "42"); |
| + doSplit("42", CATENATE_ALL | GENERATE_WORD_PARTS, "42"); |
| + doSplit("-42-42", CATENATE_ALL | GENERATE_WORD_PARTS, "4242"); |
| + doSplit("PowerShot", CATENATE_ALL | GENERATE_WORD_PARTS, "PowerShot"); |
| + doSplit("SD500", CATENATE_ALL | GENERATE_WORD_PARTS, "SD500"); |
| + doSplit("//hello---there, 'dude'", CATENATE_ALL | GENERATE_WORD_PARTS, "hello", "hellotheredude", "there", "dude"); |
| + doSplit("O'Neil's", CATENATE_ALL | GENERATE_WORD_PARTS, "O", "ONeils", "Neil", "s"); |
| + doSplit("A's+B's&C's", CATENATE_ALL | GENERATE_WORD_PARTS, "A", "AsBsCs", "s", "B", "s", "C", "s"); |
| + doSplit("Super-Duper-XL500-42-AutoCoder!", CATENATE_ALL | GENERATE_WORD_PARTS, "Super", "SuperDuperXL50042AutoCoder", "Duper", "XL500", "AutoCoder"); |
| + doSplit("---", CATENATE_ALL | GENERATE_WORD_PARTS); |
| + |
| + doSplit("abc", CATENATE_ALL | GENERATE_NUMBER_PARTS, "abc"); |
| + doSplit("Wi-Fi", CATENATE_ALL | GENERATE_NUMBER_PARTS, "WiFi"); |
| + doSplit("-42", CATENATE_ALL | GENERATE_NUMBER_PARTS, "42"); |
| + doSplit("42", CATENATE_ALL | GENERATE_NUMBER_PARTS, "42"); |
| + doSplit("-42-42", CATENATE_ALL | GENERATE_NUMBER_PARTS, "42", "4242", "42"); |
| + doSplit("PowerShot", CATENATE_ALL | GENERATE_NUMBER_PARTS, "PowerShot"); |
| + doSplit("SD500", CATENATE_ALL | GENERATE_NUMBER_PARTS, "SD500"); |
| + doSplit("//hello---there, 'dude'", CATENATE_ALL | GENERATE_NUMBER_PARTS, "hellotheredude"); |
| + doSplit("O'Neil's", CATENATE_ALL | GENERATE_NUMBER_PARTS, "ONeils"); |
| + doSplit("A's+B's&C's", CATENATE_ALL | GENERATE_NUMBER_PARTS, "AsBsCs"); |
| + doSplit("A's+B's\u0026amp;C's", CATENATE_ALL | GENERATE_NUMBER_PARTS, "AsBsampCs"); // input holds an HTML-escaped ampersand; \u0026 keeps tooling from collapsing it |
| + doSplit("Super-Duper-XL500-42-AutoCoder!", CATENATE_ALL | GENERATE_NUMBER_PARTS, "SuperDuperXL50042AutoCoder", "42"); |
| + doSplit("---", CATENATE_ALL | GENERATE_NUMBER_PARTS); |
| + |
| + doSplit("abc", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "abc"); |
| + doSplit("Wi-Fi", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "Wi", "Fi"); |
| + doSplit("-42", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "42"); |
| + doSplit("42", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "42"); |
| + doSplit("-42-42", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "42", "42"); |
| + doSplit("PowerShot", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "PowerShot"); |
| + doSplit("SD500", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "SD500"); |
| + doSplit("//hello---there, 'dude'", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "hello", "there", "dude"); |
| + doSplit("O'Neil's", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "O", "Neil", "s"); |
| + doSplit("A's+B's&C's", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "A", "s", "B", "s", "C", "s"); |
| + doSplit("Super-Duper-XL500-42-AutoCoder!", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "Super", "Duper", "XL500", "42", "AutoCoder"); |
| + doSplit("---", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS); |
| + |
| + doSplit("abc", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "abc"); |
| + doSplit("Wi-Fi", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "Wi-Fi", "Wi", "WiFi", "Fi"); |
| + doSplit("-42", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "-42", "42"); |
| + doSplit("42", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "42"); |
| + doSplit("-42-42", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "-42-42","4242"); |
| + doSplit("PowerShot", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "PowerShot"); |
| + doSplit("SD500", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "SD500"); |
| + doSplit("//hello---there, 'dude'", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "//hello---there, 'dude'", "hello", "hellotheredude", "there", "dude"); |
| + doSplit("O'Neil's", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "O'Neil's", "O", "ONeils", "Neil", "s"); |
| + doSplit("A's+B's&C's", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "A's+B's&C's", "A", "AsBsCs", "s", "B", "s", "C", "s"); |
| + doSplit("Super-Duper-XL500-42-AutoCoder!", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "Super-Duper-XL500-42-AutoCoder!", "Super", "SuperDuperXL50042AutoCoder", "Duper", "XL500", "AutoCoder"); |
| + doSplit("---", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "---"); |
| + |
| + doSplit("abc", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "abc"); |
| + doSplit("Wi-Fi", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "Wi-Fi", "Wi", "Fi"); |
| + doSplit("-42", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "-42", "42"); |
| + doSplit("42", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "42"); |
| + doSplit("-42-42", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "-42-42", "42", "42"); |
| + doSplit("PowerShot", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "PowerShot"); |
| + doSplit("SD500", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "SD500"); |
| + doSplit("//hello---there, 'dude'", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "//hello---there, 'dude'", "hello", "there", "dude"); |
| + doSplit("O'Neil's", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "O'Neil's", "O", "Neil", "s"); |
| + doSplit("A's+B's&C's", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "A's+B's&C's", "A", "s", "B", "s", "C", "s"); |
| + doSplit("Super-Duper-XL500-42-AutoCoder!", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "Super-Duper-XL500-42-AutoCoder!", "Super", "Duper", "XL500", "42", "AutoCoder"); |
| + doSplit("---", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "---"); |
| + |
| + // SPLIT_ON_CASE_CHANGE on its own produces zero tokens (the default behavior when |
| + // subwords are generated) |
| + doSplit("abcDef", SPLIT_ON_CASE_CHANGE); |
| + doSplit("abcDef", SPLIT_ON_CASE_CHANGE | PRESERVE_ORIGINAL, "abcDef"); |
| + doSplit("abcDef", SPLIT_ON_CASE_CHANGE | GENERATE_WORD_PARTS, "abc", "Def"); |
| + doSplit("abcDef", SPLIT_ON_CASE_CHANGE | GENERATE_WORD_PARTS | CATENATE_ALL, "abc", "abcDef", "Def"); |
| + /* Is the following a bug? Arguably the output should be just "abcDef" */ |
| + doSplit("abcDef", SPLIT_ON_CASE_CHANGE | PRESERVE_ORIGINAL | CATENATE_ALL, "abcDef", "abcDef"); |
| + /* Is the following a bug? Should the output not be "abcDef", "abc", "Def"? */ |
| + doSplit("abcDef", SPLIT_ON_CASE_CHANGE | PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "abcDef", "abc", "abcDef", "Def"); |
| + } |
| + |
| public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception { |
| int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS; |
| flags |= (stemPossessive == 1) ? STEM_ENGLISH_POSSESSIVE : 0; |
| @@ -172,7 +400,7 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase { |
| |
| assertTokenStreamContents(wdf, output); |
| } |
| - |
| + |
| /* |
| * Test option that allows disabling the special "'s" stemming, instead treating the single quote like other delimiters. |
| */ |
| @@ -181,7 +409,44 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase { |
| doSplitPossessive(1, "ra's", "ra"); |
| doSplitPossessive(0, "ra's", "ra", "s"); |
| } |
| - |
| + |
| + /** We should not generate the same token twice with PRESERVE_ORIGINAL and CONCATENATE_ALL|CATENATE_WORDS */ |
| + public void checkNoDuplicates(String input_string) throws Exception { |
| + final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE; |
| + |
| + /* analyzer that uses whitespace + wdf */ |
| + Analyzer a = new Analyzer() { |
| + @Override |
| + public TokenStreamComponents createComponents(String field) { |
| + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| + return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, null)); |
| + } |
| + }; |
| + TokenStream ts = a.tokenStream("dummy", input_string); |
| + ts.reset(); |
| + |
| + final Set<String> already_seen_words = new HashSet<>(); |
| + while (ts.incrementToken()) { |
| + CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class); |
| + PositionIncrementAttribute posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class); |
| + String word = termAtt.toString(); |
| + if (posIncrAtt.getPositionIncrement() == 0) { |
| + assertFalse("The term " + word + " is duplicated when tokenizing " + input_string, |
| + already_seen_words.contains(word)); |
| + } else { |
| + already_seen_words.clear(); |
| + } |
| + already_seen_words.add(word); |
| + } |
| + a.close(); |
| + } |
| + /* |
| + * Currently, WordDelimiterFilter does not ensure that there are no |
| + * duplicate tokens, so the check below is left commented out and this test asserts nothing. |
| + */ |
| + public void testNoDuplicates() throws Exception { |
| + // checkNoDuplicates("abcDef abcDef"); |
| + } |
| /* |
| * Set a large position increment gap of 10 if the token is "largegap" or "/" |
| */ |