blob: 048826ef36c9d8502c57be6a12655f89165a8a42 [file] [log] [blame]
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
index 3d7c03e..235970b 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
@@ -58,6 +58,16 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
}
***/
+ /** Debugging aid: wraps {@code input} (via {@code keywordMockTokenizer}) in a WordDelimiterFilter
+ * built with {@code flags}, prints the input and passes the filter to TokenFilterDebugging.
+ */
+ public static void debugTokenFilter(int flags, final String input) throws Exception {
+ WordDelimiterFilter filter = new WordDelimiterFilter(
+ keywordMockTokenizer(input), WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);
+ System.out.println("Analyzing: " + input);
+ TokenFilterDebugging.debugTokenFilter(filter, input);
+ }
+
@Test
public void testOffsets() throws IOException {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
@@ -122,12 +132,31 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
new int[] { 11, 15, 15 });
}
- public void doSplit(final String input, String... output) throws Exception {
- int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
+ /** Asserts that a WordDelimiterFilter built with {@code flags} turns {@code input} into exactly the terms in {@code output}. */
+ public void doSplit(final String input, int flags, String... output) throws Exception {
WordDelimiterFilter wdf = new WordDelimiterFilter(keywordMockTokenizer(input),
WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);
- assertTokenStreamContents(wdf, output);
+ try {
+ assertTokenStreamContents(wdf, output);
+ } catch (AssertionError e) {
+ // Print diagnostics, then rethrow the original error unchanged (re-wrapping would hide its type and message).
+ System.out.println("Analyzing: " + input);
+ System.out.println("Expected tokens: ");
+ for (String token : output) {
+ System.out.print(token + " ");
+ }
+ if (output.length > 0) {
+ System.out.println();
+ }
+ debugTokenFilter(flags, input);
+ throw e;
+ }
+ }
+
+ /** Runs {@link #doSplit(String, int, String...)} with word/number part generation, case-change and numeric splitting, and English possessive stemming. */
+ public void doSplit(final String input, String... output) throws Exception {
+ doSplit(input, GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE, output);
}
@Test
@@ -164,7 +193,206 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
// don't split supplementaries into unpaired surrogates
doSplit("𠀀𠀀", "𠀀𠀀");
}
-
+
+ /**
+ * This offers a more precise view of what the WordDelimiterFilter does,
+ * in particular with strange combinations of flags.
+ *
+ * Some results may seem incorrect, but this has been the behavior of the
+ * TokenFilter for ages. Correcting this behavior may be necessary,
+ * but that is another issue.
+ */
+ public void testNoRegretion() throws Exception {
+ /* Use the following to print generated tokens
+ * debugTokenFilter(GENERATE_NUMBER_PARTS | CATENATE_ALL, "abcDef");
+ */
+ doSplit("abc", 0, "abc");
+ doSplit("Wi-Fi", 0);
+ doSplit("-42", 0, "42"); // Strange behavior !
+ doSplit("42", 0, "42");
+ doSplit("-42-42", 0);
+ doSplit("PowerShot", 0, "PowerShot");
+ doSplit("SD500", 0, "SD500");
+ doSplit("//hello---there, 'dude'", 0);
+ doSplit("O'Neil's", 0);
+ doSplit("A's+B's&C's", 0);
+ doSplit("Super-Duper-XL500-42-AutoCoder!", 0);
+ doSplit("---", 0);
+
+ /*
+ * We test many of the combinations between:
+ * PRESERVE_ORIGINAL, CATENATE_ALL, GENERATE_WORD_PARTS, GENERATE_NUMBER_PARTS.
+ * This means 4 single flags, 6 pairs of flags and only 2 triplets of flags.
+ */
+ doSplit("abc", PRESERVE_ORIGINAL, "abc");
+ doSplit("Wi-Fi", PRESERVE_ORIGINAL, "Wi-Fi");
+ doSplit("-42", PRESERVE_ORIGINAL, "-42", "42");
+ doSplit("42", PRESERVE_ORIGINAL, "42");
+ doSplit("-42-42", PRESERVE_ORIGINAL, "-42-42");
+ doSplit("PowerShot", PRESERVE_ORIGINAL, "PowerShot");
+ doSplit("SD500", PRESERVE_ORIGINAL, "SD500");
+ doSplit("//hello---there, 'dude'", PRESERVE_ORIGINAL, "//hello---there, 'dude'");
+ doSplit("O'Neil's", PRESERVE_ORIGINAL, "O'Neil's");
+ doSplit("A's+B's&C's", PRESERVE_ORIGINAL, "A's+B's&C's");
+ doSplit("Super-Duper-XL500-42-AutoCoder!", PRESERVE_ORIGINAL, "Super-Duper-XL500-42-AutoCoder!");
+ doSplit("---", PRESERVE_ORIGINAL, "---");
+
+ doSplit("abc", CATENATE_ALL, "abc");
+ doSplit("Wi-Fi", CATENATE_ALL, "WiFi");
+ doSplit("-42", CATENATE_ALL, "42");
+ doSplit("42", CATENATE_ALL, "42");
+ doSplit("-42-42", CATENATE_ALL, "4242");
+ doSplit("PowerShot", CATENATE_ALL, "PowerShot");
+ doSplit("SD500", CATENATE_ALL, "SD500");
+ doSplit("//hello---there, 'dude'", CATENATE_ALL, "hellotheredude");
+ doSplit("O'Neil's", CATENATE_ALL, "ONeils");
+ doSplit("A's+B's&C's", CATENATE_ALL, "AsBsCs");
+ doSplit("Super-Duper-XL500-42-AutoCoder!", CATENATE_ALL, "SuperDuperXL50042AutoCoder");
+ doSplit("---", CATENATE_ALL);
+
+ doSplit("abc", GENERATE_WORD_PARTS, "abc");
+ doSplit("Wi-Fi", GENERATE_WORD_PARTS, "Wi", "Fi");
+ doSplit("-42", GENERATE_WORD_PARTS, "42");
+ doSplit("42", GENERATE_WORD_PARTS, "42");
+ doSplit("-42-42", GENERATE_WORD_PARTS);
+ doSplit("PowerShot", GENERATE_WORD_PARTS, "PowerShot");
+ doSplit("SD500", GENERATE_WORD_PARTS, "SD500");
+ doSplit("//hello---there, 'dude'", GENERATE_WORD_PARTS, "hello", "there", "dude");
+ doSplit("O'Neil's", GENERATE_WORD_PARTS, "O", "Neil", "s");
+ doSplit("A's+B's&C's", GENERATE_WORD_PARTS, "A", "s", "B", "s", "C", "s");
+ doSplit("Super-Duper-XL500-42-AutoCoder!", GENERATE_WORD_PARTS, "Super", "Duper", "XL500", "AutoCoder");
+ doSplit("---", GENERATE_WORD_PARTS);
+
+ doSplit("abc", GENERATE_NUMBER_PARTS, "abc");
+ doSplit("Wi-Fi", GENERATE_NUMBER_PARTS);
+ doSplit("-42", GENERATE_NUMBER_PARTS, "42");
+ doSplit("42", GENERATE_NUMBER_PARTS, "42");
+ doSplit("-42-42", GENERATE_NUMBER_PARTS, "42", "42");
+ doSplit("PowerShot", GENERATE_NUMBER_PARTS, "PowerShot");
+ doSplit("SD500", GENERATE_NUMBER_PARTS, "SD500");
+ doSplit("//hello---there, 'dude'", GENERATE_NUMBER_PARTS);
+ doSplit("O'Neil's", GENERATE_NUMBER_PARTS);
+ doSplit("A's+B's&C's", GENERATE_NUMBER_PARTS);
+ doSplit("Super-Duper-XL500-42-AutoCoder!", GENERATE_NUMBER_PARTS, "42");
+ doSplit("---", GENERATE_NUMBER_PARTS);
+
+ doSplit("abc", PRESERVE_ORIGINAL | CATENATE_ALL, "abc");
+ doSplit("Wi-Fi", PRESERVE_ORIGINAL | CATENATE_ALL, "Wi-Fi", "WiFi");
+ doSplit("-42", PRESERVE_ORIGINAL | CATENATE_ALL, "-42", "42");
+ doSplit("42", PRESERVE_ORIGINAL | CATENATE_ALL, "42");
+ doSplit("-42-42", PRESERVE_ORIGINAL | CATENATE_ALL, "-42-42", "4242");
+ doSplit("PowerShot", PRESERVE_ORIGINAL | CATENATE_ALL, "PowerShot");
+ doSplit("SD500", PRESERVE_ORIGINAL | CATENATE_ALL, "SD500");
+ doSplit("//hello---there, 'dude'", PRESERVE_ORIGINAL | CATENATE_ALL, "//hello---there, 'dude'", "hellotheredude");
+ doSplit("O'Neil's", PRESERVE_ORIGINAL | CATENATE_ALL, "O'Neil's", "ONeils");
+ doSplit("A's+B's&C's", PRESERVE_ORIGINAL | CATENATE_ALL, "A's+B's&C's", "AsBsCs");
+ doSplit("Super-Duper-XL500-42-AutoCoder!", PRESERVE_ORIGINAL | CATENATE_ALL, "Super-Duper-XL500-42-AutoCoder!", "SuperDuperXL50042AutoCoder");
+ doSplit("---", PRESERVE_ORIGINAL | CATENATE_ALL, "---");
+
+ doSplit("abc", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "abc");
+ doSplit("Wi-Fi", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "Wi-Fi", "Wi", "Fi");
+ doSplit("-42", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "-42", "42"); // Strange behavior, 42 is not a word
+ doSplit("42", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "42");
+ doSplit("-42-42", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "-42-42");
+ doSplit("PowerShot", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "PowerShot");
+ doSplit("SD500", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "SD500");
+ doSplit("//hello---there, 'dude'", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "//hello---there, 'dude'", "hello", "there", "dude");
+ doSplit("O'Neil's", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "O'Neil's", "O", "Neil", "s");
+ doSplit("A's+B's&C's", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "A's+B's&C's", "A", "s", "B", "s", "C", "s");
+ doSplit("Super-Duper-XL500-42-AutoCoder!", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "Super-Duper-XL500-42-AutoCoder!", "Super", "Duper", "XL500", "AutoCoder");
+ doSplit("---", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS, "---");
+
+ doSplit("abc", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "abc");
+ doSplit("Wi-Fi", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "Wi-Fi");
+ doSplit("-42", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "-42", "42");
+ doSplit("42", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "42");
+ doSplit("-42-42", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "-42-42", "42", "42");
+ doSplit("PowerShot", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "PowerShot");
+ doSplit("SD500", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "SD500");
+ doSplit("//hello---there, 'dude'", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "//hello---there, 'dude'");
+ doSplit("O'Neil's", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "O'Neil's");
+ doSplit("A's+B's&C's", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "A's+B's&C's");
+ doSplit("Super-Duper-XL500-42-AutoCoder!", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "Super-Duper-XL500-42-AutoCoder!", "42");
+ doSplit("---", PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS, "---");
+
+ doSplit("abc", CATENATE_ALL | GENERATE_WORD_PARTS, "abc");
+ doSplit("Wi-Fi", CATENATE_ALL | GENERATE_WORD_PARTS, "Wi", "WiFi", "Fi");
+ doSplit("-42", CATENATE_ALL | GENERATE_WORD_PARTS, "42");
+ doSplit("42", CATENATE_ALL | GENERATE_WORD_PARTS, "42");
+ doSplit("-42-42", CATENATE_ALL | GENERATE_WORD_PARTS, "4242");
+ doSplit("PowerShot", CATENATE_ALL | GENERATE_WORD_PARTS, "PowerShot");
+ doSplit("SD500", CATENATE_ALL | GENERATE_WORD_PARTS, "SD500");
+ doSplit("//hello---there, 'dude'", CATENATE_ALL | GENERATE_WORD_PARTS, "hello", "hellotheredude", "there", "dude");
+ doSplit("O'Neil's", CATENATE_ALL | GENERATE_WORD_PARTS, "O", "ONeils", "Neil", "s");
+ doSplit("A's+B's&C's", CATENATE_ALL | GENERATE_WORD_PARTS, "A", "AsBsCs", "s", "B", "s", "C", "s");
+ doSplit("Super-Duper-XL500-42-AutoCoder!", CATENATE_ALL | GENERATE_WORD_PARTS, "Super", "SuperDuperXL50042AutoCoder", "Duper", "XL500", "AutoCoder");
+ doSplit("---", CATENATE_ALL | GENERATE_WORD_PARTS);
+
+ doSplit("abc", CATENATE_ALL | GENERATE_NUMBER_PARTS, "abc");
+ doSplit("Wi-Fi", CATENATE_ALL | GENERATE_NUMBER_PARTS, "WiFi");
+ doSplit("-42", CATENATE_ALL | GENERATE_NUMBER_PARTS, "42");
+ doSplit("42", CATENATE_ALL | GENERATE_NUMBER_PARTS, "42");
+ doSplit("-42-42", CATENATE_ALL | GENERATE_NUMBER_PARTS, "42", "4242", "42");
+ doSplit("PowerShot", CATENATE_ALL | GENERATE_NUMBER_PARTS, "PowerShot");
+ doSplit("SD500", CATENATE_ALL | GENERATE_NUMBER_PARTS, "SD500");
+ doSplit("//hello---there, 'dude'", CATENATE_ALL | GENERATE_NUMBER_PARTS, "hellotheredude");
+ doSplit("O'Neil's", CATENATE_ALL | GENERATE_NUMBER_PARTS, "ONeils");
+ doSplit("A's+B's&C's", CATENATE_ALL | GENERATE_NUMBER_PARTS, "AsBsCs");
+ doSplit("A's+B's&amp;C's", CATENATE_ALL | GENERATE_NUMBER_PARTS, "AsBsampCs");
+ doSplit("Super-Duper-XL500-42-AutoCoder!", CATENATE_ALL | GENERATE_NUMBER_PARTS, "SuperDuperXL50042AutoCoder", "42");
+ doSplit("---", CATENATE_ALL | GENERATE_NUMBER_PARTS);
+
+ doSplit("abc", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "abc");
+ doSplit("Wi-Fi", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "Wi", "Fi");
+ doSplit("-42", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "42");
+ doSplit("42", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "42");
+ doSplit("-42-42", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "42", "42");
+ doSplit("PowerShot", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "PowerShot");
+ doSplit("SD500", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "SD500");
+ doSplit("//hello---there, 'dude'", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "hello", "there", "dude");
+ doSplit("O'Neil's", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "O", "Neil", "s");
+ doSplit("A's+B's&C's", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "A", "s", "B", "s", "C", "s");
+ doSplit("Super-Duper-XL500-42-AutoCoder!", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "Super", "Duper", "XL500", "42", "AutoCoder");
+ doSplit("---", GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS);
+
+ doSplit("abc", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "abc");
+ doSplit("Wi-Fi", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "Wi-Fi", "Wi", "WiFi", "Fi");
+ doSplit("-42", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "-42", "42");
+ doSplit("42", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "42");
+ doSplit("-42-42", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "-42-42","4242");
+ doSplit("PowerShot", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "PowerShot");
+ doSplit("SD500", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "SD500");
+ doSplit("//hello---there, 'dude'", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "//hello---there, 'dude'", "hello", "hellotheredude", "there", "dude");
+ doSplit("O'Neil's", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "O'Neil's", "O", "ONeils", "Neil", "s");
+ doSplit("A's+B's&C's", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "A's+B's&C's", "A", "AsBsCs", "s", "B", "s", "C", "s");
+ doSplit("Super-Duper-XL500-42-AutoCoder!", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "Super-Duper-XL500-42-AutoCoder!", "Super", "SuperDuperXL50042AutoCoder", "Duper", "XL500", "AutoCoder");
+ doSplit("---", PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "---");
+
+ doSplit("abc", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "abc");
+ doSplit("Wi-Fi", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "Wi-Fi", "Wi", "Fi");
+ doSplit("-42", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "-42", "42");
+ doSplit("42", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "42");
+ doSplit("-42-42", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "-42-42", "42", "42");
+ doSplit("PowerShot", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "PowerShot");
+ doSplit("SD500", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "SD500");
+ doSplit("//hello---there, 'dude'", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "//hello---there, 'dude'", "hello", "there", "dude");
+ doSplit("O'Neil's", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "O'Neil's", "O", "Neil", "s");
+ doSplit("A's+B's&C's", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "A's+B's&C's", "A", "s", "B", "s", "C", "s");
+ doSplit("Super-Duper-XL500-42-AutoCoder!", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "Super-Duper-XL500-42-AutoCoder!", "Super", "Duper", "XL500", "42", "AutoCoder");
+ doSplit("---", PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, "---");
+
+ // SPLIT_ON_CASE_CHANGE alone produces zero tokens (the default behavior when
+ // the token is split into subwords but nothing is generated from them)
+ doSplit("abcDef", SPLIT_ON_CASE_CHANGE);
+ doSplit("abcDef", SPLIT_ON_CASE_CHANGE | PRESERVE_ORIGINAL, "abcDef");
+ doSplit("abcDef", SPLIT_ON_CASE_CHANGE | GENERATE_WORD_PARTS, "abc", "Def");
+ doSplit("abcDef", SPLIT_ON_CASE_CHANGE | GENERATE_WORD_PARTS | CATENATE_ALL, "abc", "abcDef", "Def");
+ /* The following looks buggy: should it be a single "abcDef"? */
+ doSplit("abcDef", SPLIT_ON_CASE_CHANGE | PRESERVE_ORIGINAL | CATENATE_ALL, "abcDef", "abcDef");
+ /* The following looks buggy: should it not be "abcDef", "abc", "Def"? */
+ doSplit("abcDef", SPLIT_ON_CASE_CHANGE | PRESERVE_ORIGINAL | CATENATE_ALL | GENERATE_WORD_PARTS, "abcDef", "abc", "abcDef", "Def");
+ }
+
public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS;
flags |= (stemPossessive == 1) ? STEM_ENGLISH_POSSESSIVE : 0;
@@ -172,7 +400,7 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
assertTokenStreamContents(wdf, output);
}
-
+
/*
* Test option that allows disabling the special "'s" stemming, instead treating the single quote like other delimiters.
*/
@@ -181,7 +409,44 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
doSplitPossessive(1, "ra's", "ra");
doSplitPossessive(0, "ra's", "ra", "s");
}
-
+
+ /** We should not generate the same token twice at one position with PRESERVE_ORIGINAL and
+ * CATENATE_ALL|CATENATE_WORDS|CATENATE_NUMBERS; the stream and the analyzer are always closed. */
+ public void checkNoDuplicates(String input_string) throws Exception {
+ final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
+ Analyzer a = new Analyzer() { /* analyzer that uses whitespace + wdf */
+ @Override
+ public TokenStreamComponents createComponents(String field) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, null));
+ }
+ };
+ try (TokenStream ts = a.tokenStream("dummy", input_string)) {
+ CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
+ PositionIncrementAttribute posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
+ final Set<String> already_seen_words = new HashSet<>();
+ ts.reset();
+ while (ts.incrementToken()) {
+ String word = termAtt.toString();
+ if (posIncrAtt.getPositionIncrement() != 0) {
+ already_seen_words.clear(); // new position: duplicates are only checked among stacked tokens
+ }
+ assertFalse("The term " + word + " is duplicated when tokenizing " + input_string,
+ already_seen_words.contains(word));
+ already_seen_words.add(word);
+ }
+ ts.end(); // complete the TokenStream workflow before the implicit close()
+ } finally {
+ a.close();
+ }
+ }
+ /*
+ * Currently, the filter does not ensure that there are no duplicate
+ * tokens, so the check below stays disabled and this test does nothing.
+ */
+ public void testNoDuplicates() throws Exception {
+ // checkNoDuplicates("abcDef abcDef");
+ }
/*
* Set a large position increment gap of 10 if the token is "largegap" or "/"
*/