docs/attachments/LUCENE-7004/TEST-LUCENE-7004.PATCH - lucene-jira-archive - Git at Google

 diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
 index 3d7c03e..235970b 100644
 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
 +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
 @@ -58,6 +58,16 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
    }
    ***/

 +  /** Prints the tokens produced when the input is read as one keyword token
 +   * and then filtered by a WordDelimiterFilter configured with {@code flags}.
 +   */
 +  public static void debugTokenFilter(int flags, final String input) throws Exception {
 +    WordDelimiterFilter wdf = new WordDelimiterFilter(keywordMockTokenizer(input),
 +        WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);
 +    System.out.println("Analyzing: " + input);
 +    TokenFilterDebugging.debugTokenFilter(wdf, input);
 +  }
 +
    @Test
    public void testOffsets() throws IOException {
      int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
 @@ -122,12 +132,31 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
          new int[] { 11, 15, 15 });
    }

 -  public void doSplit(final String input, String... output) throws Exception {
 -    int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
 +  public void doSplit(final String input, int flags, String... output) throws Exception{
      WordDelimiterFilter wdf = new WordDelimiterFilter(keywordMockTokenizer(input),
          WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);

 -    assertTokenStreamContents(wdf, output);
 +    try {
 +      assertTokenStreamContents(wdf, output);
 +    } catch (AssertionError e) {
 +      System.out.println("Analyzing: " + input);
 +      System.out.println("Expected tokens: ");
 +      boolean printed = false;
 +      for (String token : output) {
 +        System.out.print(token + " ");
 +        printed = true;
 +      }
 +      if (printed) {
 +        System.out.println();
 +      }
 +      debugTokenFilter(flags, input);
 +      throw new AssertionError(e);
 +    }
 +  }
 +
 +  public void doSplit(final String input, String... output) throws Exception {
 +    int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
 +    doSplit(input, flags, output);
    }

    @Test
 @@ -164,7 +193,206 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
      // don't split supplementaries into unpaired surrogates
      doSplit("𠀀𠀀", "𠀀𠀀");
    }
 -
 +
 +  /**
 +   * This offers a more precise view of what the WordDelimiterFilter does,
 +   * in particular with unusual combinations of flags.
 +   *
 +   * Some results may seem incorrect, but this has been the behavior of the
 +   * TokenFilter for ages. Correcting this behavior may be necessary,
 +   * but that is another issue.
 +   */
 +  public void testNoRegretion() throws Exception {
 +    /* Use the following to print generated tokens
 +     * debugTokenFilter(GENERATE_NUMBER_PARTS | CATENATE_ALL, "abcDef");
 +     */
 +    doSplit("abc", 0, "abc");
 +    doSplit("Wi-Fi", 0);
 +    doSplit("-42",  0, "42"); // Strange behavior !
 +    doSplit("42", 0, "42");
 +    doSplit("-42-42", 0);
 +    doSplit("PowerShot", 0, "PowerShot");
 +    doSplit("SD500", 0, "SD500");
 +    doSplit("//hello---there, 'dude'", 0);
 +    doSplit("O'Neil's", 0);
 +    doSplit("A's+B's&C's", 0);
 +    doSplit("Super-Duper-XL500-42-AutoCoder!", 0);
 +    doSplit("---",  0);
 +
 +    /**
 +     * We test many of the combinations of:
 +     * PRESERVE_ORIGINAL, CATENATE_ALL, GENERATE_WORD_PARTS, GENERATE_NUMBER_PARTS.
 +     * That is, the 4 single flags, the 6 pairs of flags and only 2 of the triplets.
 +     */
 +    doSplit("abc", PRESERVE_ORIGINAL, "abc");
 +    doSplit("Wi-Fi", PRESERVE_ORIGINAL, "Wi-Fi");
 +    doSplit("-42",  PRESERVE_ORIGINAL, "-42", "42");
 +    doSplit("42", PRESERVE_ORIGINAL, "42");
 +    doSplit("-42-42", PRESERVE_ORIGINAL, "-42-42");
 +    doSplit("PowerShot", PRESERVE_ORIGINAL, "PowerShot");
 +    doSplit("SD500", PRESERVE_ORIGINAL, "SD500");
 +    doSplit("//hello---there, 'dude'", PRESERVE_ORIGINAL, "//hello---there, 'dude'");
 +    doSplit("O'Neil's", PRESERVE_ORIGINAL, "O'Neil's");
 +    doSplit("A's+B's&C's", PRESERVE_ORIGINAL, "A's+B's&C's");
 +    doSplit("Super-Duper-XL500-42-AutoCoder!", PRESERVE_ORIGINAL, "Super-Duper-XL500-42-AutoCoder!");
 +    doSplit("---",  PRESERVE_ORIGINAL, "---");
 +
 +    doSplit("abc", CATENATE_ALL, "abc");
 +    doSplit("Wi-Fi", CATENATE_ALL, "WiFi");
 +    doSplit("-42",  CATENATE_ALL, "42");
 +    doSplit("42", CATENATE_ALL, "42");
 +    doSplit("-42-42", CATENATE_ALL, "4242");
 +    doSplit("PowerShot", CATENATE_ALL, "PowerShot");
 +    doSplit("SD500", CATENATE_ALL, "SD500");
 +    doSplit("//hello---there, 'dude'", CATENATE_ALL, "hellotheredude");
 +    doSplit("O'Neil's", CATENATE_ALL, "ONeils");
 +    doSplit("A's+B's&C's", CATENATE_ALL, "AsBsCs");
 +    doSplit("Super-Duper-XL500-42-AutoCoder!", CATENATE_ALL, "SuperDuperXL50042AutoCoder");
 +    doSplit("---",  CATENATE_ALL);
 +
 +    doSplit("abc", GENERATE_WORD_PARTS, "abc");
 +    doSplit("Wi-Fi", GENERATE_WORD_PARTS, "Wi", "Fi");
 +    doSplit("-42",  GENERATE_WORD_PARTS, "42");
 +    doSplit("42", GENERATE_WORD_PARTS, "42");
 +    doSplit("-42-42", GENERATE_WORD_PARTS);
 +    doSplit("PowerShot", GENERATE_WORD_PARTS, "PowerShot");
 +    doSplit("SD500", GENERATE_WORD_PARTS, "SD500");
 +    doSplit("//hello---there, 'dude'", GENERATE_WORD_PARTS, "hello", "there", "dude");
 +    doSplit("O'Neil's", GENERATE_WORD_PARTS, "O", "Neil", "s");
 +    doSplit("A's+B's&C's", GENERATE_WORD_PARTS, "A", "s", "B", "s", "C", "s");
 +    doSplit("Super-Duper-XL500-42-AutoCoder!", GENERATE_WORD_PARTS, "Super", "Duper", "XL500", "AutoCoder");
 +    doSplit("---",  GENERATE_WORD_PARTS);
 +
 +    doSplit("abc", GENERATE_NUMBER_PARTS, "abc");
 +    doSplit("Wi-Fi", GENERATE_NUMBER_PARTS);
 +    doSplit("-42",  GENERATE_NUMBER_PARTS, "42");
 +    doSplit("42", GENERATE_NUMBER_PARTS, "42");
 +    doSplit("-42-42", GENERATE_NUMBER_PARTS, "42", "42");
 +    doSplit("PowerShot", GENERATE_NUMBER_PARTS, "PowerShot");
 +    doSplit("SD500", GENERATE_NUMBER_PARTS, "SD500");
 +    doSplit("//hello---there, 'dude'", GENERATE_NUMBER_PARTS);
 +    doSplit("O'Neil's", GENERATE_NUMBER_PARTS);
 +    doSplit("A's+B's&amp;C's", GENERATE_NUMBER_PARTS);
 +    doSplit("Super-Duper-XL500-42-AutoCoder!", GENERATE_NUMBER_PARTS, "42");
 +    doSplit("---",  GENERATE_NUMBER_PARTS);
 +
 +    doSplit("abc", PRESERVE_ORIGINAL | CATENATE_ALL, "abc");
 +    doSplit("Wi-Fi", PRESERVE_ORIGINAL | CATENATE_ALL, "Wi-Fi", "WiFi");
 +    doSplit("-42",  PRESERVE_ORIGINAL | CATENATE_ALL, "-42", "42");
 +    doSplit("42", PRESERVE_ORIGINAL | CATENATE_ALL, "42");
 +    doSplit("-42-42", PRESERVE_ORIGINAL | CATENATE_ALL, "-42-42", "4242");
 +    doSplit("PowerShot", PRESERVE_ORIGINAL | CATENATE_ALL, "PowerShot");
 +    doSplit("SD500", PRESERVE_ORIGINAL | CATENATE_ALL, "SD500");
 +    doSplit("//hello---there, 'dude'", PRESERVE_ORIGINAL | CATENATE_ALL, "//hello---there, 'dude'", "hellotheredude");
 +    doSplit("O'Neil's", PRESERVE_ORIGINAL | CATENATE_ALL, "O'Neil's", "ONeils");
 +    doSplit("A's+B's&C's", PRESERVE_ORIGINAL | CATENATE_ALL, "A's+B's&C's", "AsBsCs");
 +    doSplit("Super-Duper-XL500-42-AutoCoder!", PRESERVE_ORIGINAL | CATENATE_ALL, "Super-Duper-XL500-42-AutoCoder!", "SuperDuperXL50042AutoCoder");
 +    doSplit("---",  PRESERVE_ORIGINAL | CATENATE_ALL, "---");
 +
 +    doSplit("abc", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "abc");
 +    doSplit("Wi-Fi", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "Wi-Fi", "Wi", "Fi");
 +    doSplit("-42",  PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "-42", "42"); // Strange behavior, 42 is not word
 +    doSplit("42", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "42");
 +    doSplit("-42-42", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "-42-42");
 +    doSplit("PowerShot", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "PowerShot");
 +    doSplit("SD500", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "SD500");
 +    doSplit("//hello---there, 'dude'", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "//hello---there, 'dude'", "hello", "there", "dude");
 +    doSplit("O'Neil's", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "O'Neil's", "O", "Neil", "s");
 +    doSplit("A's+B's&C's", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "A's+B's&C's", "A", "s", "B", "s", "C", "s");
 +    doSplit("Super-Duper-XL500-42-AutoCoder!", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "Super-Duper-XL500-42-AutoCoder!", "Super", "Duper", "XL500", "AutoCoder");
 +    doSplit("---",  PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "---");
 +
 +    doSplit("abc", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "abc");
 +    doSplit("Wi-Fi", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "Wi-Fi");
 +    doSplit("-42",  PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "-42", "42");
 +    doSplit("42", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "42");
 +    doSplit("-42-42", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "-42-42", "42", "42");
 +    doSplit("PowerShot", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "PowerShot");
 +    doSplit("SD500", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "SD500");
 +    doSplit("//hello---there, 'dude'", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "//hello---there, 'dude'");
 +    doSplit("O'Neil's", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "O'Neil's");
 +    doSplit("A's+B's&C's", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "A's+B's&C's");
 +    doSplit("Super-Duper-XL500-42-AutoCoder!", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "Super-Duper-XL500-42-AutoCoder!", "42");
 +    doSplit("---",  PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "---");
 +
 +    doSplit("abc", CATENATE_ALL | GENERATE_WORD_PARTS, "abc");
 +    doSplit("Wi-Fi", CATENATE_ALL | GENERATE_WORD_PARTS, "Wi", "WiFi", "Fi");
 +    doSplit("-42",  CATENATE_ALL | GENERATE_WORD_PARTS, "42");
 +    doSplit("42", CATENATE_ALL | GENERATE_WORD_PARTS, "42");
 +    doSplit("-42-42", CATENATE_ALL | GENERATE_WORD_PARTS, "4242");
 +    doSplit("PowerShot", CATENATE_ALL | GENERATE_WORD_PARTS, "PowerShot");
 +    doSplit("SD500", CATENATE_ALL | GENERATE_WORD_PARTS, "SD500");
 +    doSplit("//hello---there, 'dude'", CATENATE_ALL | GENERATE_WORD_PARTS, "hello", "hellotheredude", "there", "dude");
 +    doSplit("O'Neil's", CATENATE_ALL | GENERATE_WORD_PARTS, "O", "ONeils", "Neil", "s");
 +    doSplit("A's+B's&C's", CATENATE_ALL | GENERATE_WORD_PARTS, "A", "AsBsCs", "s", "B", "s", "C", "s");
 +    doSplit("Super-Duper-XL500-42-AutoCoder!", CATENATE_ALL | GENERATE_WORD_PARTS, "Super", "SuperDuperXL50042AutoCoder", "Duper", "XL500", "AutoCoder");
 +    doSplit("---",  CATENATE_ALL | GENERATE_WORD_PARTS);
 +
 +    doSplit("abc", CATENATE_ALL | GENERATE_NUMBER_PARTS, "abc");
 +    doSplit("Wi-Fi", CATENATE_ALL | GENERATE_NUMBER_PARTS, "WiFi");
 +    doSplit("-42",  CATENATE_ALL | GENERATE_NUMBER_PARTS, "42");
 +    doSplit("42", CATENATE_ALL | GENERATE_NUMBER_PARTS, "42");
 +    doSplit("-42-42", CATENATE_ALL | GENERATE_NUMBER_PARTS, "42", "4242", "42");
 +    doSplit("PowerShot", CATENATE_ALL | GENERATE_NUMBER_PARTS, "PowerShot");
 +    doSplit("SD500", CATENATE_ALL | GENERATE_NUMBER_PARTS, "SD500");
 +    doSplit("//hello---there, 'dude'", CATENATE_ALL | GENERATE_NUMBER_PARTS, "hellotheredude");
 +    doSplit("O'Neil's", CATENATE_ALL | GENERATE_NUMBER_PARTS, "ONeils");
 +    doSplit("A's+B's&C's", CATENATE_ALL | GENERATE_NUMBER_PARTS, "AsBsCs");
 +    doSplit("A's+B's&amp;C's", CATENATE_ALL | GENERATE_NUMBER_PARTS, "AsBsampCs");
 +    doSplit("Super-Duper-XL500-42-AutoCoder!", CATENATE_ALL | GENERATE_NUMBER_PARTS, "SuperDuperXL50042AutoCoder", "42");
 +    doSplit("---",  CATENATE_ALL | GENERATE_WORD_PARTS);
 +
 +    doSplit("abc", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "abc");
 +    doSplit("Wi-Fi", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "Wi", "Fi");
 +    doSplit("-42",  GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "42");
 +    doSplit("42", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "42");
 +    doSplit("-42-42", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "42", "42");
 +    doSplit("PowerShot", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "PowerShot");
 +    doSplit("SD500", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "SD500");
 +    doSplit("//hello---there, 'dude'", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "hello", "there", "dude");
 +    doSplit("O'Neil's", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "O", "Neil", "s");
 +    doSplit("A's+B's&C's", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "A", "s", "B", "s", "C", "s");
 +    doSplit("Super-Duper-XL500-42-AutoCoder!", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "Super", "Duper", "XL500", "42", "AutoCoder");
 +    doSplit("---",  GENERATE_WORD_PARTS | GENERATE_WORD_PARTS);
 +
 +    doSplit("abc", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "abc");
 +    doSplit("Wi-Fi", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "Wi-Fi", "Wi", "WiFi", "Fi");
 +    doSplit("-42",  PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "-42", "42");
 +    doSplit("42", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "42");
 +    doSplit("-42-42", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "-42-42","4242");
 +    doSplit("PowerShot", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "PowerShot");
 +    doSplit("SD500", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "SD500");
 +    doSplit("//hello---there, 'dude'", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "//hello---there, 'dude'", "hello", "hellotheredude", "there", "dude");
 +    doSplit("O'Neil's", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "O'Neil's", "O", "ONeils", "Neil", "s");
 +    doSplit("A's+B's&C's", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "A's+B's&C's", "A", "AsBsCs", "s", "B", "s", "C", "s");
 +    doSplit("Super-Duper-XL500-42-AutoCoder!", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "Super-Duper-XL500-42-AutoCoder!", "Super", "SuperDuperXL50042AutoCoder", "Duper", "XL500", "AutoCoder");
 +    doSplit("---",  PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "---");
 +
 +    doSplit("abc", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "abc");
 +    doSplit("Wi-Fi", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "Wi-Fi", "Wi", "Fi");
 +    doSplit("-42",  PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "-42", "42");
 +    doSplit("42", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "42");
 +    doSplit("-42-42", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "-42-42", "42", "42");
 +    doSplit("PowerShot", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "PowerShot");
 +    doSplit("SD500", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "SD500");
 +    doSplit("//hello---there, 'dude'", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "//hello---there, 'dude'", "hello", "there", "dude");
 +    doSplit("O'Neil's", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "O'Neil's", "O", "Neil", "s");
 +    doSplit("A's+B's&C's", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "A's+B's&C's", "A", "s", "B", "s", "C", "s");
 +    doSplit("Super-Duper-XL500-42-AutoCoder!", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "Super-Duper-XL500-42-AutoCoder!", "Super", "Duper", "XL500", "42", "AutoCoder");
 +    doSplit("---",  PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "---");
 +
 +    // SPLIT_ON_CASE_CHANGE alone produces zero tokens (the default behavior when
 +    // the input is split into subwords but no flag asks for them to be emitted)
 +    doSplit("abcDef", SPLIT_ON_CASE_CHANGE);
 +    doSplit("abcDef", SPLIT_ON_CASE_CHANGE | PRESERVE_ORIGINAL, "abcDef");
 +    doSplit("abcDef", SPLIT_ON_CASE_CHANGE | GENERATE_WORD_PARTS, "abc", "Def");
 +    doSplit("abcDef", SPLIT_ON_CASE_CHANGE | GENERATE_WORD_PARTS | CATENATE_ALL, "abc", "abcDef", "Def");
 +    /* Is the following buggy? Should be "abcDef" */
 +    doSplit("abcDef", SPLIT_ON_CASE_CHANGE | PRESERVE_ORIGINAL | CATENATE_ALL, "abcDef", "abcDef");
 +    /* Is the following buggy? Should it not be "abcDef", "abc", "Def"? */
 +    doSplit("abcDef", SPLIT_ON_CASE_CHANGE | PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "abcDef", "abc", "abcDef", "Def");
 +  }
 +
    public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
      int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS;
      flags |= (stemPossessive == 1) ? STEM_ENGLISH_POSSESSIVE : 0;
 @@ -172,7 +400,7 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {

      assertTokenStreamContents(wdf, output);
    }
 -
 +
    /*
     * Test option that allows disabling the special "'s" stemming, instead treating the single quote like other delimiters.
     */
 @@ -181,7 +409,44 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
      doSplitPossessive(1, "ra's", "ra");
      doSplitPossessive(0, "ra's", "ra", "s");
    }
 -
 +
 +  /** We should not generate the same token twice with PRESERVE_ORIGINAL and CATENATE_ALL|CATENATE_WORDS */
 +  public void checkNoDuplicates(String input_string) throws Exception {
 +    final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
 +
 +    /* analyzer that uses whitespace + wdf */
 +    Analyzer a = new Analyzer() {
 +      @Override
 +      public TokenStreamComponents createComponents(String field) {
 +        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
 +        return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, null));
 +      }
 +    };
 +    TokenStream ts = a.tokenStream("dummy", input_string);
 +    ts.reset();
 +
 +    final Set<String> already_seen_words = new HashSet<>();
 +    while (ts.incrementToken()) {
 +     CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
 +     PositionIncrementAttribute posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
 +     String word = termAtt.toString();
 +      if (posIncrAtt.getPositionIncrement() == 0) {
 +        assertFalse("The term " + word + " is duplicated when tokenizing " + input_string,
 +            already_seen_words.contains(word));
 +      } else {
 +        already_seen_words.clear();
 +      }
 +      already_seen_words.add(word);
 +    }
 +    a.close();
 +  }
 +  /*
 +   * Currently, the filter does not ensure that there are no
 +   * duplicate tokens, so the check below is left disabled.
 +   */
 +  public void testNoDuplicates() throws Exception  {
 +    // checkNoDuplicates("abcDef abcDef");
 +  }
    /*
     * Set a large position increment gap of 10 if the token is "largegap" or "/"
     */
	diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
	index 3d7c03e..235970b 100644
	--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
	+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
	@@ -58,6 +58,16 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
	}
	***/

	+ /** Prints the tokens produced when the input is read as one keyword token
	+ * and then filtered by a WordDelimiterFilter configured with {@code flags}.
	+ */
	+ public static void debugTokenFilter(int flags, final String input) throws Exception {
	+ WordDelimiterFilter wdf = new WordDelimiterFilter(keywordMockTokenizer(input),
	+ WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);
	+ System.out.println("Analyzing: " + input);
	+ TokenFilterDebugging.debugTokenFilter(wdf, input);
	+ }
	+
	@Test
	public void testOffsets() throws IOException {
	int flags = GENERATE_WORD_PARTS \| GENERATE_NUMBER_PARTS \| CATENATE_ALL \| SPLIT_ON_CASE_CHANGE \| SPLIT_ON_NUMERICS \| STEM_ENGLISH_POSSESSIVE;
	@@ -122,12 +132,31 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
	new int[] { 11, 15, 15 });
	}

	- public void doSplit(final String input, String... output) throws Exception {
	- int flags = GENERATE_WORD_PARTS \| GENERATE_NUMBER_PARTS \| SPLIT_ON_CASE_CHANGE \| SPLIT_ON_NUMERICS \| STEM_ENGLISH_POSSESSIVE;
	+ public void doSplit(final String input, int flags, String... output) throws Exception{
	WordDelimiterFilter wdf = new WordDelimiterFilter(keywordMockTokenizer(input),
	WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);

	- assertTokenStreamContents(wdf, output);
	+ try {
	+ assertTokenStreamContents(wdf, output);
	+ } catch (AssertionError e) {
	+ System.out.println("Analyzing: " + input);
	+ System.out.println("Expected tokens: ");
	+ boolean printed = false;
	+ for (String token : output) {
	+ System.out.print(token + " ");
	+ printed = true;
	+ }
	+ if (printed) {
	+ System.out.println();
	+ }
	+ debugTokenFilter(flags, input);
	+ throw new AssertionError(e);
	+ }
	+ }
	+
	+ public void doSplit(final String input, String... output) throws Exception {
	+ int flags = GENERATE_WORD_PARTS \| GENERATE_NUMBER_PARTS \| SPLIT_ON_CASE_CHANGE \| SPLIT_ON_NUMERICS \| STEM_ENGLISH_POSSESSIVE;
	+ doSplit(input, flags, output);
	}

	@Test
	@@ -164,7 +193,206 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
	// don't split supplementaries into unpaired surrogates
	doSplit("𠀀𠀀", "𠀀𠀀");
	}
	-
	+
	+ /**
	+ * This offers a more precise view of what the WordDelimiterFilter does,
	+ * in particular with unusual combinations of flags.
	+ *
	+ * Some results may seem incorrect, but this has been the behavior of the
	+ * TokenFilter for ages. Correcting this behavior may be necessary,
	+ * but that is another issue.
	+ */
	+ public void testNoRegretion() throws Exception {
	+ /* Use the following to print generated tokens
	+ * debugTokenFilter(GENERATE_NUMBER_PARTS \| CATENATE_ALL, "abcDef");
	+ */
	+ doSplit("abc", 0, "abc");
	+ doSplit("Wi-Fi", 0);
	+ doSplit("-42", 0, "42"); // Strange behavior !
	+ doSplit("42", 0, "42");
	+ doSplit("-42-42", 0);
	+ doSplit("PowerShot", 0, "PowerShot");
	+ doSplit("SD500", 0, "SD500");
	+ doSplit("//hello---there, 'dude'", 0);
	+ doSplit("O'Neil's", 0);
	+ doSplit("A's+B's&C's", 0);
	+ doSplit("Super-Duper-XL500-42-AutoCoder!", 0);
	+ doSplit("---", 0);
	+
	+ /**
	+ * We test many of the combinations of:
	+ * PRESERVE_ORIGINAL, CATENATE_ALL, GENERATE_WORD_PARTS, GENERATE_NUMBER_PARTS.
	+ * That is, the 4 single flags, the 6 pairs of flags and only 2 of the triplets.
	+ */
	+ doSplit("abc", PRESERVE_ORIGINAL, "abc");
	+ doSplit("Wi-Fi", PRESERVE_ORIGINAL, "Wi-Fi");
	+ doSplit("-42", PRESERVE_ORIGINAL, "-42", "42");
	+ doSplit("42", PRESERVE_ORIGINAL, "42");
	+ doSplit("-42-42", PRESERVE_ORIGINAL, "-42-42");
	+ doSplit("PowerShot", PRESERVE_ORIGINAL, "PowerShot");
	+ doSplit("SD500", PRESERVE_ORIGINAL, "SD500");
	+ doSplit("//hello---there, 'dude'", PRESERVE_ORIGINAL, "//hello---there, 'dude'");
	+ doSplit("O'Neil's", PRESERVE_ORIGINAL, "O'Neil's");
	+ doSplit("A's+B's&C's", PRESERVE_ORIGINAL, "A's+B's&C's");
	+ doSplit("Super-Duper-XL500-42-AutoCoder!", PRESERVE_ORIGINAL, "Super-Duper-XL500-42-AutoCoder!");
	+ doSplit("---", PRESERVE_ORIGINAL, "---");
	+
	+ doSplit("abc", CATENATE_ALL, "abc");
	+ doSplit("Wi-Fi", CATENATE_ALL, "WiFi");
	+ doSplit("-42", CATENATE_ALL, "42");
	+ doSplit("42", CATENATE_ALL, "42");
	+ doSplit("-42-42", CATENATE_ALL, "4242");
	+ doSplit("PowerShot", CATENATE_ALL, "PowerShot");
	+ doSplit("SD500", CATENATE_ALL, "SD500");
	+ doSplit("//hello---there, 'dude'", CATENATE_ALL, "hellotheredude");
	+ doSplit("O'Neil's", CATENATE_ALL, "ONeils");
	+ doSplit("A's+B's&C's", CATENATE_ALL, "AsBsCs");
	+ doSplit("Super-Duper-XL500-42-AutoCoder!", CATENATE_ALL, "SuperDuperXL50042AutoCoder");
	+ doSplit("---", CATENATE_ALL);
	+
	+ doSplit("abc", GENERATE_WORD_PARTS, "abc");
	+ doSplit("Wi-Fi", GENERATE_WORD_PARTS, "Wi", "Fi");
	+ doSplit("-42", GENERATE_WORD_PARTS, "42");
	+ doSplit("42", GENERATE_WORD_PARTS, "42");
	+ doSplit("-42-42", GENERATE_WORD_PARTS);
	+ doSplit("PowerShot", GENERATE_WORD_PARTS, "PowerShot");
	+ doSplit("SD500", GENERATE_WORD_PARTS, "SD500");
	+ doSplit("//hello---there, 'dude'", GENERATE_WORD_PARTS, "hello", "there", "dude");
	+ doSplit("O'Neil's", GENERATE_WORD_PARTS, "O", "Neil", "s");
	+ doSplit("A's+B's&C's", GENERATE_WORD_PARTS, "A", "s", "B", "s", "C", "s");
	+ doSplit("Super-Duper-XL500-42-AutoCoder!", GENERATE_WORD_PARTS, "Super", "Duper", "XL500", "AutoCoder");
	+ doSplit("---", GENERATE_WORD_PARTS);
	+
	+ doSplit("abc", GENERATE_NUMBER_PARTS, "abc");
	+ doSplit("Wi-Fi", GENERATE_NUMBER_PARTS);
	+ doSplit("-42", GENERATE_NUMBER_PARTS, "42");
	+ doSplit("42", GENERATE_NUMBER_PARTS, "42");
	+ doSplit("-42-42", GENERATE_NUMBER_PARTS, "42", "42");
	+ doSplit("PowerShot", GENERATE_NUMBER_PARTS, "PowerShot");
	+ doSplit("SD500", GENERATE_NUMBER_PARTS, "SD500");
	+ doSplit("//hello---there, 'dude'", GENERATE_NUMBER_PARTS);
	+ doSplit("O'Neil's", GENERATE_NUMBER_PARTS);
	+ doSplit("A's+B's&C's", GENERATE_NUMBER_PARTS);
	+ doSplit("Super-Duper-XL500-42-AutoCoder!", GENERATE_NUMBER_PARTS, "42");
	+ doSplit("---", GENERATE_NUMBER_PARTS);
	+
	+ doSplit("abc", PRESERVE_ORIGINAL \| CATENATE_ALL, "abc");
	+ doSplit("Wi-Fi", PRESERVE_ORIGINAL \| CATENATE_ALL, "Wi-Fi", "WiFi");
	+ doSplit("-42", PRESERVE_ORIGINAL \| CATENATE_ALL, "-42", "42");
	+ doSplit("42", PRESERVE_ORIGINAL \| CATENATE_ALL, "42");
	+ doSplit("-42-42", PRESERVE_ORIGINAL \| CATENATE_ALL, "-42-42", "4242");
	+ doSplit("PowerShot", PRESERVE_ORIGINAL \| CATENATE_ALL, "PowerShot");
	+ doSplit("SD500", PRESERVE_ORIGINAL \| CATENATE_ALL, "SD500");
	+ doSplit("//hello---there, 'dude'", PRESERVE_ORIGINAL \| CATENATE_ALL, "//hello---there, 'dude'", "hellotheredude");
	+ doSplit("O'Neil's", PRESERVE_ORIGINAL \| CATENATE_ALL, "O'Neil's", "ONeils");
	+ doSplit("A's+B's&C's", PRESERVE_ORIGINAL \| CATENATE_ALL, "A's+B's&C's", "AsBsCs");
	+ doSplit("Super-Duper-XL500-42-AutoCoder!", PRESERVE_ORIGINAL \| CATENATE_ALL, "Super-Duper-XL500-42-AutoCoder!", "SuperDuperXL50042AutoCoder");
	+ doSplit("---", PRESERVE_ORIGINAL \| CATENATE_ALL, "---");
	+
	+ doSplit("abc", PRESERVE_ORIGINAL \| GENERATE_WORD_PARTS, "abc");
	+ doSplit("Wi-Fi", PRESERVE_ORIGINAL \| GENERATE_WORD_PARTS, "Wi-Fi", "Wi", "Fi");
	+ doSplit("-42", PRESERVE_ORIGINAL \| GENERATE_WORD_PARTS, "-42", "42"); // Strange behavior, 42 is not word
	+ doSplit("42", PRESERVE_ORIGINAL \| GENERATE_WORD_PARTS, "42");
	+ doSplit("-42-42", PRESERVE_ORIGINAL \| GENERATE_WORD_PARTS, "-42-42");
	+ doSplit("PowerShot", PRESERVE_ORIGINAL \| GENERATE_WORD_PARTS, "PowerShot");
	+ doSplit("SD500", PRESERVE_ORIGINAL \| GENERATE_WORD_PARTS, "SD500");
	+ doSplit("//hello---there, 'dude'", PRESERVE_ORIGINAL \| GENERATE_WORD_PARTS, "//hello---there, 'dude'", "hello", "there", "dude");
	+ doSplit("O'Neil's", PRESERVE_ORIGINAL \| GENERATE_WORD_PARTS, "O'Neil's", "O", "Neil", "s");
	+ doSplit("A's+B's&C's", PRESERVE_ORIGINAL \| GENERATE_WORD_PARTS, "A's+B's&C's", "A", "s", "B", "s", "C", "s");
	+ doSplit("Super-Duper-XL500-42-AutoCoder!", PRESERVE_ORIGINAL \| GENERATE_WORD_PARTS, "Super-Duper-XL500-42-AutoCoder!", "Super", "Duper", "XL500", "AutoCoder");
	+ doSplit("---", PRESERVE_ORIGINAL \| GENERATE_WORD_PARTS, "---");
	+
	+ doSplit("abc", PRESERVE_ORIGINAL \| GENERATE_NUMBER_PARTS, "abc");
	+ doSplit("Wi-Fi", PRESERVE_ORIGINAL \| GENERATE_NUMBER_PARTS, "Wi-Fi");
	+ doSplit("-42", PRESERVE_ORIGINAL \| GENERATE_NUMBER_PARTS, "-42", "42");
	+ doSplit("42", PRESERVE_ORIGINAL \| GENERATE_NUMBER_PARTS, "42");
	+ doSplit("-42-42", PRESERVE_ORIGINAL \| GENERATE_NUMBER_PARTS, "-42-42", "42", "42");
	+ doSplit("PowerShot", PRESERVE_ORIGINAL \| GENERATE_NUMBER_PARTS, "PowerShot");
	+ doSplit("SD500", PRESERVE_ORIGINAL \| GENERATE_NUMBER_PARTS, "SD500");
	+ doSplit("//hello---there, 'dude'", PRESERVE_ORIGINAL \| GENERATE_NUMBER_PARTS, "//hello---there, 'dude'");
	+ doSplit("O'Neil's", PRESERVE_ORIGINAL \| GENERATE_NUMBER_PARTS, "O'Neil's");
	+ doSplit("A's+B's&C's", PRESERVE_ORIGINAL \| GENERATE_NUMBER_PARTS, "A's+B's&C's");
	+ doSplit("Super-Duper-XL500-42-AutoCoder!", PRESERVE_ORIGINAL \| GENERATE_NUMBER_PARTS, "Super-Duper-XL500-42-AutoCoder!", "42");
	+ doSplit("---", PRESERVE_ORIGINAL \| GENERATE_NUMBER_PARTS, "---");
	+
	+ doSplit("abc", CATENATE_ALL \| GENERATE_WORD_PARTS, "abc");
	+ doSplit("Wi-Fi", CATENATE_ALL \| GENERATE_WORD_PARTS, "Wi", "WiFi", "Fi");
	+ doSplit("-42", CATENATE_ALL \| GENERATE_WORD_PARTS, "42");
	+ doSplit("42", CATENATE_ALL \| GENERATE_WORD_PARTS, "42");
	+ doSplit("-42-42", CATENATE_ALL \| GENERATE_WORD_PARTS, "4242");
	+ doSplit("PowerShot", CATENATE_ALL \| GENERATE_WORD_PARTS, "PowerShot");
	+ doSplit("SD500", CATENATE_ALL \| GENERATE_WORD_PARTS, "SD500");
	+ doSplit("//hello---there, 'dude'", CATENATE_ALL \| GENERATE_WORD_PARTS, "hello", "hellotheredude", "there", "dude");
	+ doSplit("O'Neil's", CATENATE_ALL \| GENERATE_WORD_PARTS, "O", "ONeils", "Neil", "s");
	+ doSplit("A's+B's&C's", CATENATE_ALL \| GENERATE_WORD_PARTS, "A", "AsBsCs", "s", "B", "s", "C", "s");
	+ doSplit("Super-Duper-XL500-42-AutoCoder!", CATENATE_ALL \| GENERATE_WORD_PARTS, "Super", "SuperDuperXL50042AutoCoder", "Duper", "XL500", "AutoCoder");
	+ doSplit("---", CATENATE_ALL \| GENERATE_WORD_PARTS);
	+
	+ doSplit("abc", CATENATE_ALL \| GENERATE_NUMBER_PARTS, "abc");
	+ doSplit("Wi-Fi", CATENATE_ALL \| GENERATE_NUMBER_PARTS, "WiFi");
	+ doSplit("-42", CATENATE_ALL \| GENERATE_NUMBER_PARTS, "42");
	+ doSplit("42", CATENATE_ALL \| GENERATE_NUMBER_PARTS, "42");
	+ doSplit("-42-42", CATENATE_ALL \| GENERATE_NUMBER_PARTS, "42", "4242", "42");
	+ doSplit("PowerShot", CATENATE_ALL \| GENERATE_NUMBER_PARTS, "PowerShot");
	+ doSplit("SD500", CATENATE_ALL \| GENERATE_NUMBER_PARTS, "SD500");
	+ doSplit("//hello---there, 'dude'", CATENATE_ALL \| GENERATE_NUMBER_PARTS, "hellotheredude");
	+ doSplit("O'Neil's", CATENATE_ALL \| GENERATE_NUMBER_PARTS, "ONeils");
	+ doSplit("A's+B's&C's", CATENATE_ALL \| GENERATE_NUMBER_PARTS, "AsBsCs");
	+ doSplit("A's+B's&C's", CATENATE_ALL \| GENERATE_NUMBER_PARTS, "AsBsampCs");
	+ doSplit("Super-Duper-XL500-42-AutoCoder!", CATENATE_ALL \| GENERATE_NUMBER_PARTS, "SuperDuperXL50042AutoCoder", "42");
	+ doSplit("---", CATENATE_ALL \| GENERATE_WORD_PARTS);
	+
	+ doSplit("abc", GENERATE_WORD_PARTS \| GENERATE_NUMBER_PARTS, "abc");
	+ doSplit("Wi-Fi", GENERATE_WORD_PARTS \| GENERATE_NUMBER_PARTS, "Wi", "Fi");
	+ doSplit("-42", GENERATE_WORD_PARTS \| GENERATE_NUMBER_PARTS, "42");
	+ doSplit("42", GENERATE_WORD_PARTS \| GENERATE_NUMBER_PARTS, "42");
	+ doSplit("-42-42", GENERATE_WORD_PARTS \| GENERATE_NUMBER_PARTS, "42", "42");
	+ doSplit("PowerShot", GENERATE_WORD_PARTS \| GENERATE_NUMBER_PARTS, "PowerShot");
	+ doSplit("SD500", GENERATE_WORD_PARTS \| GENERATE_NUMBER_PARTS, "SD500");
	+ doSplit("//hello---there, 'dude'", GENERATE_WORD_PARTS \| GENERATE_NUMBER_PARTS, "hello", "there", "dude");
	+ doSplit("O'Neil's", GENERATE_WORD_PARTS \| GENERATE_NUMBER_PARTS, "O", "Neil", "s");
	+ doSplit("A's+B's&C's", GENERATE_WORD_PARTS \| GENERATE_NUMBER_PARTS, "A", "s", "B", "s", "C", "s");
	+ doSplit("Super-Duper-XL500-42-AutoCoder!", GENERATE_WORD_PARTS \| GENERATE_NUMBER_PARTS, "Super", "Duper", "XL500", "42", "AutoCoder");
	+ doSplit("---", GENERATE_WORD_PARTS \| GENERATE_WORD_PARTS);
	+
	+ doSplit("abc", PRESERVE_ORIGINAL \| CATENATE_ALL \| GENERATE_WORD_PARTS, "abc");
	+ doSplit("Wi-Fi", PRESERVE_ORIGINAL \| CATENATE_ALL \| GENERATE_WORD_PARTS, "Wi-Fi", "Wi", "WiFi", "Fi");
	+ doSplit("-42", PRESERVE_ORIGINAL \| CATENATE_ALL \| GENERATE_WORD_PARTS, "-42", "42");
	+ doSplit("42", PRESERVE_ORIGINAL \| CATENATE_ALL \| GENERATE_WORD_PARTS, "42");
	+ doSplit("-42-42", PRESERVE_ORIGINAL \| CATENATE_ALL \| GENERATE_WORD_PARTS, "-42-42","4242");
	+ doSplit("PowerShot", PRESERVE_ORIGINAL \| CATENATE_ALL \| GENERATE_WORD_PARTS, "PowerShot");
	+ doSplit("SD500", PRESERVE_ORIGINAL \| CATENATE_ALL \| GENERATE_WORD_PARTS, "SD500");
	+ doSplit("//hello---there, 'dude'", PRESERVE_ORIGINAL \| CATENATE_ALL \| GENERATE_WORD_PARTS, "//hello---there, 'dude'", "hello", "hellotheredude", "there", "dude");
	+ doSplit("O'Neil's", PRESERVE_ORIGINAL \| CATENATE_ALL \| GENERATE_WORD_PARTS, "O'Neil's", "O", "ONeils", "Neil", "s");
	+ doSplit("A's+B's&C's", PRESERVE_ORIGINAL \| CATENATE_ALL \| GENERATE_WORD_PARTS, "A's+B's&C's", "A", "AsBsCs", "s", "B", "s", "C", "s");
	+ doSplit("Super-Duper-XL500-42-AutoCoder!", PRESERVE_ORIGINAL \| CATENATE_ALL \| GENERATE_WORD_PARTS, "Super-Duper-XL500-42-AutoCoder!", "Super", "SuperDuperXL50042AutoCoder", "Duper", "XL500", "AutoCoder");
	+ doSplit("---", PRESERVE_ORIGINAL \| CATENATE_ALL \| GENERATE_WORD_PARTS, "---");
	+
	+ doSplit("abc", PRESERVE_ORIGINAL \| GENERATE_WORD_PARTS \| GENERATE_NUMBER_PARTS, "abc");
	+ doSplit("Wi-Fi", PRESERVE_ORIGINAL \| GENERATE_WORD_PARTS \| GENERATE_NUMBER_PARTS, "Wi-Fi", "Wi", "Fi");
	+ doSplit("-42", PRESERVE_ORIGINAL \| GENERATE_WORD_PARTS \| GENERATE_NUMBER_PARTS, "-42", "42");
	+ doSplit("42", PRESERVE_ORIGINAL \| GENERATE_WORD_PARTS \| GENERATE_NUMBER_PARTS, "42");
	+ doSplit("-42-42", PRESERVE_ORIGINAL \| GENERATE_WORD_PARTS \| GENERATE_NUMBER_PARTS, "-42-42", "42", "42");
	+ doSplit("PowerShot", PRESERVE_ORIGINAL \| GENERATE_WORD_PARTS \| GENERATE_NUMBER_PARTS, "PowerShot");
	+ doSplit("SD500", PRESERVE_ORIGINAL \| GENERATE_WORD_PARTS \| GENERATE_NUMBER_PARTS, "SD500");
	+ doSplit("//hello---there, 'dude'", PRESERVE_ORIGINAL \| GENERATE_WORD_PARTS \| GENERATE_NUMBER_PARTS, "//hello---there, 'dude'", "hello", "there", "dude");
	+ doSplit("O'Neil's", PRESERVE_ORIGINAL \| GENERATE_WORD_PARTS \| GENERATE_NUMBER_PARTS, "O'Neil's", "O", "Neil", "s");
	+ doSplit("A's+B's&C's", PRESERVE_ORIGINAL \| GENERATE_WORD_PARTS \| GENERATE_NUMBER_PARTS, "A's+B's&C's", "A", "s", "B", "s", "C", "s");
	+ doSplit("Super-Duper-XL500-42-AutoCoder!", PRESERVE_ORIGINAL \| GENERATE_WORD_PARTS \| GENERATE_NUMBER_PARTS, "Super-Duper-XL500-42-AutoCoder!", "Super", "Duper", "XL500", "42", "AutoCoder");
	+ doSplit("---", PRESERVE_ORIGINAL \| CATENATE_ALL \| GENERATE_WORD_PARTS, "---");
	+
	+ // SPLIT_ON_CASE_CHANGE alone produces zero tokens (the default behavior when
	+ // there is the generation of subwords).
	+ doSplit("abcDef", SPLIT_ON_CASE_CHANGE);
	+ doSplit("abcDef", SPLIT_ON_CASE_CHANGE \| PRESERVE_ORIGINAL, "abcDef");
	+ doSplit("abcDef", SPLIT_ON_CASE_CHANGE \| GENERATE_WORD_PARTS, "abc", "Def");
	+ doSplit("abcDef", SPLIT_ON_CASE_CHANGE \| GENERATE_WORD_PARTS \| CATENATE_ALL, "abc", "abcDef", "Def");
	+ /* Is the following buggy? The output should be a single "abcDef". */
	+ doSplit("abcDef", SPLIT_ON_CASE_CHANGE \| PRESERVE_ORIGINAL \| CATENATE_ALL, "abcDef", "abcDef");
	+ /* Is the following buggy? Should it not be "abcDef", "abc", "Def"? */
	+ doSplit("abcDef", SPLIT_ON_CASE_CHANGE \| PRESERVE_ORIGINAL \| CATENATE_ALL \| GENERATE_WORD_PARTS, "abcDef", "abc", "abcDef", "Def");
	+ }
	+
	public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
	int flags = GENERATE_WORD_PARTS \| GENERATE_NUMBER_PARTS \| SPLIT_ON_CASE_CHANGE \| SPLIT_ON_NUMERICS;
	flags \|= (stemPossessive == 1) ? STEM_ENGLISH_POSSESSIVE : 0;
	@@ -172,7 +400,7 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {

	assertTokenStreamContents(wdf, output);
	}
	-
	+
	/*
	* Test option that allows disabling the special "'s" stemming, instead treating the single quote like other delimiters.
	*/
	@@ -181,7 +409,44 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
	doSplitPossessive(1, "ra's", "ra");
	doSplitPossessive(0, "ra's", "ra", "s");
	}
	-
	+
	+ /**
	+  * Asserts that no term is emitted twice at the same position when {@code input_string} is
	+  * split on whitespace and passed through a WordDelimiterFilter configured with
	+  * PRESERVE_ORIGINAL together with CATENATE_ALL, CATENATE_WORDS and the other flags below.
	+  *
	+  * @param input_string the text to analyze
	+  * @throws Exception if the analysis fails
	+  */
	+ public void checkNoDuplicates(String input_string) throws Exception {
	+   final int flags = PRESERVE_ORIGINAL \| GENERATE_WORD_PARTS \| GENERATE_NUMBER_PARTS \| CATENATE_WORDS \| CATENATE_NUMBERS \| CATENATE_ALL \| SPLIT_ON_CASE_CHANGE \| SPLIT_ON_NUMERICS \| STEM_ENGLISH_POSSESSIVE;
	+
	+   /* analyzer that uses whitespace + wdf */
	+   Analyzer a = new Analyzer() {
	+     @Override
	+     public TokenStreamComponents createComponents(String field) {
	+       Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
	+       return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, null));
	+     }
	+   };
	+
	+   // Close the stream and the analyzer even when an assertion fails mid-stream.
	+   try (TokenStream ts = a.tokenStream("dummy", input_string)) {
	+     // The attribute instances are reused for every token, so look them up once.
	+     CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
	+     PositionIncrementAttribute posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
	+     // Terms already seen at the current position.
	+     final Set<String> already_seen_words = new HashSet<>();
	+
	+     ts.reset();
	+     while (ts.incrementToken()) {
	+       String word = termAtt.toString();
	+       if (posIncrAtt.getPositionIncrement() == 0) {
	+         // Same position as the previous token: the term must be new here.
	+         assertFalse("The term " + word + " is duplicated when tokenizing " + input_string,
	+             already_seen_words.contains(word));
	+       } else {
	+         // The stream moved to a new position: start tracking from scratch.
	+         already_seen_words.clear();
	+       }
	+       already_seen_words.add(word);
	+     }
	+     // Complete the TokenStream consumer workflow before the stream is closed.
	+     ts.end();
	+   } finally {
	+     a.close();
	+   }
	+ }
	+ /*
	+ * Placeholder for the duplicate-token check: the call to checkNoDuplicates
	+ * below is commented out, so this test currently asserts nothing.
	+ *
	+ * Currently, the WordDelimiterFilter does not ensure that there are no
	+ * duplicate tokens.
	+ */
	+ public void testNoDuplicates() throws Exception {
	+ // TODO: re-enable once duplicate tokens are no longer emitted.
	+ // checkNoDuplicates("abcDef abcDef");
	+ }
	/*
	* Set a large position increment gap of 10 if the token is "largegap" or "/"
	*/