| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.miscellaneous; |
| |
| import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_ALL; |
| import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_NUMBERS; |
| import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_WORDS; |
| import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS; |
| import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_WORD_PARTS; |
| import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.IGNORE_KEYWORDS; |
| import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.PRESERVE_ORIGINAL; |
| import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE; |
| import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_NUMERICS; |
| import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE; |
| import static org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Random; |
| import java.util.Set; |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| import org.apache.lucene.analysis.CannedTokenStream; |
| import org.apache.lucene.analysis.CharArraySet; |
| import org.apache.lucene.analysis.MockTokenizer; |
| import org.apache.lucene.analysis.StopFilter; |
| import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.Tokenizer; |
| import org.apache.lucene.analysis.core.KeywordTokenizer; |
| import org.apache.lucene.analysis.en.EnglishAnalyzer; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| import org.apache.lucene.util.IOUtils; |
| import org.apache.lucene.util.TestUtil; |
| |
/**
 * New WordDelimiterGraphFilter tests... most of the tests are in ConvertedLegacyTest.
 *
 * <p>TODO: should explicitly test things like protWords and not rely on the factory tests in Solr.
 */
| public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase { |
| |
| public void testOffsets() throws IOException { |
| int flags = |
| GENERATE_WORD_PARTS |
| | GENERATE_NUMBER_PARTS |
| | CATENATE_ALL |
| | SPLIT_ON_CASE_CHANGE |
| | SPLIT_ON_NUMERICS |
| | STEM_ENGLISH_POSSESSIVE; |
| // test that subwords and catenated subwords have |
| // the correct offsets. |
| WordDelimiterGraphFilter wdf = |
| new WordDelimiterGraphFilter( |
| new CannedTokenStream(new Token("foo-bar", 5, 12)), |
| true, |
| DEFAULT_WORD_DELIM_TABLE, |
| flags, |
| null); |
| |
| assertTokenStreamContents( |
| wdf, new String[] {"foobar", "foo", "bar"}, new int[] {5, 5, 9}, new int[] {12, 8, 12}); |
| |
| // with illegal offsets: |
| wdf = |
| new WordDelimiterGraphFilter( |
| new CannedTokenStream(new Token("foo-bar", 5, 6)), |
| true, |
| DEFAULT_WORD_DELIM_TABLE, |
| flags, |
| null); |
| assertTokenStreamContents( |
| wdf, new String[] {"foobar", "foo", "bar"}, new int[] {5, 5, 5}, new int[] {6, 6, 6}); |
| } |
| |
| public void testOffsetChange() throws Exception { |
| int flags = |
| GENERATE_WORD_PARTS |
| | GENERATE_NUMBER_PARTS |
| | CATENATE_ALL |
| | SPLIT_ON_CASE_CHANGE |
| | SPLIT_ON_NUMERICS |
| | STEM_ENGLISH_POSSESSIVE; |
| WordDelimiterGraphFilter wdf = |
| new WordDelimiterGraphFilter( |
| new CannedTokenStream(new Token("übelkeit)", 7, 16)), |
| true, |
| DEFAULT_WORD_DELIM_TABLE, |
| flags, |
| null); |
| |
| assertTokenStreamContents(wdf, new String[] {"übelkeit"}, new int[] {7}, new int[] {15}); |
| } |
| |
| public void testOffsetChange2() throws Exception { |
| int flags = |
| GENERATE_WORD_PARTS |
| | GENERATE_NUMBER_PARTS |
| | CATENATE_ALL |
| | SPLIT_ON_CASE_CHANGE |
| | SPLIT_ON_NUMERICS |
| | STEM_ENGLISH_POSSESSIVE; |
| WordDelimiterGraphFilter wdf = |
| new WordDelimiterGraphFilter( |
| new CannedTokenStream(new Token("(übelkeit", 7, 17)), |
| true, |
| DEFAULT_WORD_DELIM_TABLE, |
| flags, |
| null); |
| // illegal offsets: |
| assertTokenStreamContents(wdf, new String[] {"übelkeit"}, new int[] {7}, new int[] {17}); |
| } |
| |
| public void testOffsetChange3() throws Exception { |
| int flags = |
| GENERATE_WORD_PARTS |
| | GENERATE_NUMBER_PARTS |
| | CATENATE_ALL |
| | SPLIT_ON_CASE_CHANGE |
| | SPLIT_ON_NUMERICS |
| | STEM_ENGLISH_POSSESSIVE; |
| WordDelimiterGraphFilter wdf = |
| new WordDelimiterGraphFilter( |
| new CannedTokenStream(new Token("(übelkeit", 7, 16)), |
| true, |
| DEFAULT_WORD_DELIM_TABLE, |
| flags, |
| null); |
| assertTokenStreamContents(wdf, new String[] {"übelkeit"}, new int[] {8}, new int[] {16}); |
| } |
| |
| public void testOffsetChange4() throws Exception { |
| int flags = |
| GENERATE_WORD_PARTS |
| | GENERATE_NUMBER_PARTS |
| | CATENATE_ALL |
| | SPLIT_ON_CASE_CHANGE |
| | SPLIT_ON_NUMERICS |
| | STEM_ENGLISH_POSSESSIVE; |
| WordDelimiterGraphFilter wdf = |
| new WordDelimiterGraphFilter( |
| new CannedTokenStream(new Token("(foo,bar)", 7, 16)), |
| true, |
| DEFAULT_WORD_DELIM_TABLE, |
| flags, |
| null); |
| |
| assertTokenStreamContents( |
| wdf, new String[] {"foobar", "foo", "bar"}, new int[] {8, 8, 12}, new int[] {15, 11, 15}); |
| } |
| |
| public void doSplit(final String input, String... output) throws Exception { |
| int flags = |
| GENERATE_WORD_PARTS |
| | GENERATE_NUMBER_PARTS |
| | SPLIT_ON_CASE_CHANGE |
| | SPLIT_ON_NUMERICS |
| | STEM_ENGLISH_POSSESSIVE; |
| WordDelimiterGraphFilter wdf = |
| new WordDelimiterGraphFilter( |
| keywordMockTokenizer(input), |
| false, |
| WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, |
| flags, |
| null); |
| |
| assertTokenStreamContents(wdf, output); |
| } |
| |
| public void testSplits() throws Exception { |
| doSplit("basic-split", "basic", "split"); |
| doSplit("camelCase", "camel", "Case"); |
| |
| // non-space marking symbol shouldn't cause split |
| // this is an example in Thai |
| doSplit("\u0e1a\u0e49\u0e32\u0e19", "\u0e1a\u0e49\u0e32\u0e19"); |
| // possessive followed by delimiter |
| doSplit("test's'", "test"); |
| |
| // some russian upper and lowercase |
| doSplit("Роберт", "Роберт"); |
| // now cause a split (russian camelCase) |
| doSplit("РобЕрт", "Роб", "Ерт"); |
| |
| // a composed titlecase character, don't split |
| doSplit("aDžungla", "aDžungla"); |
| |
| // a modifier letter, don't split |
| doSplit("ســـــــــــــــــلام", "ســـــــــــــــــلام"); |
| |
| // enclosing mark, don't split |
| doSplit("test⃝", "test⃝"); |
| |
| // combining spacing mark (the virama), don't split |
| doSplit("हिन्दी", "हिन्दी"); |
| |
| // don't split non-ascii digits |
| doSplit("١٢٣٤", "١٢٣٤"); |
| |
| // don't split supplementaries into unpaired surrogates |
| doSplit("𠀀𠀀", "𠀀𠀀"); |
| } |
| |
| public void doSplitPossessive(int stemPossessive, final String input, final String... output) |
| throws Exception { |
| int flags = |
| GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS; |
| flags |= (stemPossessive == 1) ? STEM_ENGLISH_POSSESSIVE : 0; |
| WordDelimiterGraphFilter wdf = |
| new WordDelimiterGraphFilter(keywordMockTokenizer(input), flags, null); |
| |
| assertTokenStreamContents(wdf, output); |
| } |
| |
| /* |
| * Test option that allows disabling the special "'s" stemming, instead treating the single quote like other delimiters. |
| */ |
| public void testPossessives() throws Exception { |
| doSplitPossessive(1, "ra's", "ra"); |
| doSplitPossessive(0, "ra's", "ra", "s"); |
| } |
| |
| public void testTokenType() throws Exception { |
| int flags = |
| GENERATE_WORD_PARTS |
| | GENERATE_NUMBER_PARTS |
| | CATENATE_ALL |
| | SPLIT_ON_CASE_CHANGE |
| | SPLIT_ON_NUMERICS |
| | STEM_ENGLISH_POSSESSIVE; |
| // test that subwords and catenated subwords have |
| // the correct offsets. |
| Token token = new Token("foo-bar", 5, 12); |
| token.setType("mytype"); |
| WordDelimiterGraphFilter wdf = |
| new WordDelimiterGraphFilter(new CannedTokenStream(token), flags, null); |
| |
| assertTokenStreamContents( |
| wdf, new String[] {"foobar", "foo", "bar"}, new String[] {"mytype", "mytype", "mytype"}); |
| } |
| |
| /* |
| * Set a large position increment gap of 10 if the token is "largegap" or "/" |
| */ |
| private static final class LargePosIncTokenFilter extends TokenFilter { |
| private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
| private PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); |
| |
| protected LargePosIncTokenFilter(TokenStream input) { |
| super(input); |
| } |
| |
| @Override |
| public boolean incrementToken() throws IOException { |
| if (input.incrementToken()) { |
| if (termAtt.toString().equals("largegap") || termAtt.toString().equals("/")) |
| posIncAtt.setPositionIncrement(10); |
| return true; |
| } else { |
| return false; |
| } |
| } |
| } |
| |
| public void testPositionIncrements() throws Exception { |
| |
| Analyzer a4 = |
| new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| final int flags = |
| SPLIT_ON_NUMERICS |
| | GENERATE_WORD_PARTS |
| | PRESERVE_ORIGINAL |
| | GENERATE_NUMBER_PARTS |
| | SPLIT_ON_CASE_CHANGE; |
| return new TokenStreamComponents( |
| tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, CharArraySet.EMPTY_SET)); |
| } |
| }; |
| assertAnalyzesTo( |
| a4, |
| "SAL_S8371 - SAL", |
| new String[] {"SAL_S8371", "SAL", "S", "8371", "-", "SAL"}, |
| new int[] {1, 0, 1, 1, 1, 1}); |
| |
| final int flags = |
| GENERATE_WORD_PARTS |
| | GENERATE_NUMBER_PARTS |
| | CATENATE_ALL |
| | SPLIT_ON_CASE_CHANGE |
| | SPLIT_ON_NUMERICS |
| | STEM_ENGLISH_POSSESSIVE; |
| final CharArraySet protWords = new CharArraySet(new HashSet<>(Arrays.asList("NUTCH")), false); |
| |
| /* analyzer that uses whitespace + wdf */ |
| Analyzer a = |
| new Analyzer() { |
| @Override |
| public TokenStreamComponents createComponents(String field) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents( |
| tokenizer, |
| new WordDelimiterGraphFilter( |
| tokenizer, true, DEFAULT_WORD_DELIM_TABLE, flags, protWords)); |
| } |
| }; |
| |
| /* in this case, works as expected. */ |
| assertAnalyzesTo( |
| a, |
| "LUCENE / SOLR", |
| new String[] {"LUCENE", "SOLR"}, |
| new int[] {0, 9}, |
| new int[] {6, 13}, |
| null, |
| new int[] {1, 2}, |
| null, |
| false); |
| |
| /* only in this case, posInc of 2 ?! */ |
| assertAnalyzesTo( |
| a, |
| "LUCENE / solR", |
| new String[] {"LUCENE", "solR", "sol", "R"}, |
| new int[] {0, 9, 9, 12}, |
| new int[] {6, 13, 12, 13}, |
| null, |
| new int[] {1, 2, 0, 1}, |
| null, |
| false); |
| |
| assertAnalyzesTo( |
| a, |
| "LUCENE / NUTCH SOLR", |
| new String[] {"LUCENE", "NUTCH", "SOLR"}, |
| new int[] {0, 9, 15}, |
| new int[] {6, 14, 19}, |
| null, |
| new int[] {1, 2, 1}, |
| null, |
| false); |
| |
| /* analyzer that will consume tokens with large position increments */ |
| Analyzer a2 = |
| new Analyzer() { |
| @Override |
| public TokenStreamComponents createComponents(String field) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents( |
| tokenizer, |
| new WordDelimiterGraphFilter( |
| new LargePosIncTokenFilter(tokenizer), |
| true, |
| DEFAULT_WORD_DELIM_TABLE, |
| flags, |
| protWords)); |
| } |
| }; |
| |
| /* increment of "largegap" is preserved */ |
| assertAnalyzesTo( |
| a2, |
| "LUCENE largegap SOLR", |
| new String[] {"LUCENE", "largegap", "SOLR"}, |
| new int[] {0, 7, 16}, |
| new int[] {6, 15, 20}, |
| null, |
| new int[] {1, 10, 1}, |
| null, |
| false); |
| |
| /* the "/" had a position increment of 10, where did it go?!?!! */ |
| assertAnalyzesTo( |
| a2, |
| "LUCENE / SOLR", |
| new String[] {"LUCENE", "SOLR"}, |
| new int[] {0, 9}, |
| new int[] {6, 13}, |
| null, |
| new int[] {1, 11}, |
| null, |
| false); |
| |
| /* in this case, the increment of 10 from the "/" is carried over */ |
| assertAnalyzesTo( |
| a2, |
| "LUCENE / solR", |
| new String[] {"LUCENE", "solR", "sol", "R"}, |
| new int[] {0, 9, 9, 12}, |
| new int[] {6, 13, 12, 13}, |
| null, |
| new int[] {1, 11, 0, 1}, |
| null, |
| false); |
| |
| assertAnalyzesTo( |
| a2, |
| "LUCENE / NUTCH SOLR", |
| new String[] {"LUCENE", "NUTCH", "SOLR"}, |
| new int[] {0, 9, 15}, |
| new int[] {6, 14, 19}, |
| null, |
| new int[] {1, 11, 1}, |
| null, |
| false); |
| |
| Analyzer a3 = |
| new Analyzer() { |
| @Override |
| public TokenStreamComponents createComponents(String field) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| StopFilter filter = new StopFilter(tokenizer, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET); |
| return new TokenStreamComponents( |
| tokenizer, |
| new WordDelimiterGraphFilter( |
| filter, true, DEFAULT_WORD_DELIM_TABLE, flags, protWords)); |
| } |
| }; |
| |
| assertAnalyzesTo( |
| a3, |
| "lucene.solr", |
| new String[] {"lucenesolr", "lucene", "solr"}, |
| new int[] {0, 0, 7}, |
| new int[] {11, 6, 11}, |
| null, |
| new int[] {1, 0, 1}, |
| null, |
| false); |
| |
| /* the stopword should add a gap here */ |
| assertAnalyzesTo( |
| a3, |
| "the lucene.solr", |
| new String[] {"lucenesolr", "lucene", "solr"}, |
| new int[] {4, 4, 11}, |
| new int[] {15, 10, 15}, |
| null, |
| new int[] {2, 0, 1}, |
| null, |
| false); |
| |
| IOUtils.close(a, a2, a3, a4); |
| } |
| |
| public void testKeywordFilter() throws Exception { |
| assertAnalyzesTo( |
| keywordTestAnalyzer(GENERATE_WORD_PARTS), |
| "abc-def klm-nop kpop", |
| new String[] {"abc", "def", "klm", "nop", "kpop"}); |
| assertAnalyzesTo( |
| keywordTestAnalyzer(GENERATE_WORD_PARTS | IGNORE_KEYWORDS), |
| "abc-def klm-nop kpop", |
| new String[] {"abc", "def", "klm-nop", "kpop"}, |
| new int[] {0, 0, 8, 16}, |
| new int[] {7, 7, 15, 20}, |
| null, |
| new int[] {1, 1, 1, 1}, |
| null, |
| false); |
| } |
| |
| private Analyzer keywordTestAnalyzer(int flags) throws Exception { |
| return new Analyzer() { |
| @Override |
| public TokenStreamComponents createComponents(String field) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| KeywordMarkerFilter kFilter = |
| new KeywordMarkerFilter(tokenizer) { |
| private final CharTermAttribute term = addAttribute(CharTermAttribute.class); |
| |
| @Override |
| public boolean isKeyword() { |
| // Marks terms starting with the letter 'k' as keywords |
| return term.toString().charAt(0) == 'k'; |
| } |
| }; |
| return new TokenStreamComponents( |
| tokenizer, new WordDelimiterGraphFilter(kFilter, flags, null)); |
| } |
| }; |
| } |
| |
| public void testOriginalTokenEmittedFirst() throws Exception { |
| final int flags = |
| PRESERVE_ORIGINAL |
| | GENERATE_WORD_PARTS |
| | GENERATE_NUMBER_PARTS |
| | CATENATE_WORDS |
| | CATENATE_NUMBERS |
| | CATENATE_ALL |
| | SPLIT_ON_CASE_CHANGE |
| | SPLIT_ON_NUMERICS |
| | STEM_ENGLISH_POSSESSIVE; |
| |
| /* analyzer that uses whitespace + wdf */ |
| Analyzer a = |
| new Analyzer() { |
| @Override |
| public TokenStreamComponents createComponents(String field) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents( |
| tokenizer, |
| new WordDelimiterGraphFilter( |
| tokenizer, true, DEFAULT_WORD_DELIM_TABLE, flags, null)); |
| } |
| }; |
| |
| assertAnalyzesTo( |
| a, |
| "abc-def abcDEF abc123", |
| new String[] { |
| "abc-def", "abcdef", "abc", "def", "abcDEF", "abcDEF", "abc", "DEF", "abc123", "abc123", |
| "abc", "123" |
| }); |
| a.close(); |
| } |
| |
| // https://issues.apache.org/jira/browse/LUCENE-9006 |
| public void testCatenateAllEmittedBeforeParts() throws Exception { |
| // no number parts |
| final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | CATENATE_ALL; |
| final boolean useCharFilter = true; |
| final boolean graphOffsetsAreCorrect = |
| false; // note: could solve via always incrementing wordPos on first word ('8') |
| |
| // not using getAnalyzer because we want adjustInternalOffsets=true |
| Analyzer a = |
| new Analyzer() { |
| @Override |
| public TokenStreamComponents createComponents(String field) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents( |
| tokenizer, |
| new WordDelimiterGraphFilter( |
| tokenizer, true, DEFAULT_WORD_DELIM_TABLE, flags, null)); |
| } |
| }; |
| |
| // input starts with a number, but we don't generate numbers. |
| // Nonetheless preserve-original and concatenate-all show up first. |
| assertTokenStreamContents( |
| a.tokenStream("dummy", "8-other"), |
| new String[] {"8-other", "8other", "other"}, |
| new int[] {0, 0, 2}, |
| new int[] {7, 7, 7}, |
| new int[] {1, 0, 0}); |
| checkAnalysisConsistency(random(), a, useCharFilter, "8-other", graphOffsetsAreCorrect); |
| verify("8-other", flags); // uses getAnalyzer which uses adjustInternalOffsets=false which works |
| |
| // input ends with a number, but we don't generate numbers |
| assertTokenStreamContents( |
| a.tokenStream("dummy", "other-9"), |
| new String[] {"other-9", "other9", "other"}, |
| new int[] {0, 0, 0}, |
| new int[] {7, 7, 5}, |
| new int[] {1, 0, 0}); |
| checkAnalysisConsistency(random(), a, useCharFilter, "other-9", graphOffsetsAreCorrect); |
| verify("9-other", flags); // uses getAnalyzer which uses adjustInternalOffsets=false which works |
| |
| a.close(); |
| } |
| |
| /* |
| static char[] fuzzDict = {'-', 'H', 'w', '4'}; |
| public void testFuzz() throws IOException { |
| //System.out.println(getGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | CATENATE_WORDS), "H-H")); // orig:[H H, HH H] orig; fixed posInc:"[HH H H]" |
| //System.out.println(getGraphStrings(getAnalyzer(CATENATE_WORDS | CATENATE_ALL), "H-4")); // fixPos:[H H4] final:"[H4 H]" |
| |
| StringBuilder input = new StringBuilder("000000"); // fill with arbitrary chars; not too long or too short |
| |
| for (int flags = 0; flags < IGNORE_KEYWORDS; flags++) { // all interesting bit flags precede IGNORE_KEYWORDS |
| System.out.println("Flags: " + flags + " " + WordDelimiterGraphFilter.flagsToString(flags)); |
| final Analyzer analyzer = getAnalyzer(flags); |
| fuzzLoop(input, 0, analyzer); |
| } |
| } |
| |
| public void fuzzLoop(StringBuilder input, int inputPrefixLenFuzzed, Analyzer analyzer) throws IOException { |
| if (inputPrefixLenFuzzed < input.length()) { |
| for (char c : fuzzDict) { |
| input.setCharAt(inputPrefixLenFuzzed, c); |
| fuzzLoop(input, inputPrefixLenFuzzed + 1, analyzer); // recursive |
| } |
| return; |
| } |
| |
| fuzzDoCheck(input.toString(), analyzer); |
| } |
| |
| private void fuzzDoCheck(String input, Analyzer analyzer) throws IOException { |
| try (TokenStream ts1 = analyzer.tokenStream("fieldName", input)) { |
| ts1.reset(); |
| while (ts1.incrementToken()) { // modified WDF sorter compare() contains assertion check |
| //do-nothing |
| } |
| ts1.end(); |
| } catch (AssertionError e) { |
| System.out.println("failed input: " + input); |
| throw e; |
| } |
| } |
| */ |
| |
| /** concat numbers + words + all */ |
| public void testLotsOfConcatenating() throws Exception { |
| final int flags = |
| GENERATE_WORD_PARTS |
| | GENERATE_NUMBER_PARTS |
| | CATENATE_WORDS |
| | CATENATE_NUMBERS |
| | CATENATE_ALL |
| | SPLIT_ON_CASE_CHANGE |
| | SPLIT_ON_NUMERICS |
| | STEM_ENGLISH_POSSESSIVE; |
| |
| /* analyzer that uses whitespace + wdf */ |
| Analyzer a = |
| new Analyzer() { |
| @Override |
| public TokenStreamComponents createComponents(String field) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents( |
| tokenizer, |
| new WordDelimiterGraphFilter( |
| tokenizer, true, DEFAULT_WORD_DELIM_TABLE, flags, null)); |
| } |
| }; |
| |
| assertAnalyzesTo( |
| a, |
| "abc-def-123-456", |
| new String[] {"abcdef123456", "abcdef", "abc", "def", "123456", "123", "456"}, |
| new int[] {0, 0, 0, 4, 8, 8, 12}, |
| new int[] {15, 7, 3, 7, 15, 11, 15}, |
| null, |
| new int[] {1, 0, 0, 1, 1, 0, 1}, |
| null, |
| false); |
| a.close(); |
| } |
| |
| /** concat numbers + words + all + preserve original */ |
| public void testLotsOfConcatenating2() throws Exception { |
| final int flags = |
| PRESERVE_ORIGINAL |
| | GENERATE_WORD_PARTS |
| | GENERATE_NUMBER_PARTS |
| | CATENATE_WORDS |
| | CATENATE_NUMBERS |
| | CATENATE_ALL |
| | SPLIT_ON_CASE_CHANGE |
| | SPLIT_ON_NUMERICS |
| | STEM_ENGLISH_POSSESSIVE; |
| |
| /* analyzer that uses whitespace + wdf */ |
| Analyzer a = |
| new Analyzer() { |
| @Override |
| public TokenStreamComponents createComponents(String field) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents( |
| tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, null)); |
| } |
| }; |
| |
| assertAnalyzesTo( |
| a, |
| "abc-def-123-456", |
| new String[] { |
| "abc-def-123-456", "abcdef123456", "abcdef", "abc", "def", "123456", "123", "456" |
| }, |
| new int[] {0, 0, 0, 0, 0, 0, 0, 0}, |
| new int[] {15, 15, 15, 15, 15, 15, 15, 15}, |
| null, |
| new int[] {1, 0, 0, 0, 1, 1, 0, 1}, |
| null, |
| false); |
| a.close(); |
| } |
| |
| /** blast some random strings through the analyzer */ |
| public void testRandomStrings() throws Exception { |
| int numIterations = atLeast(3); |
| for (int i = 0; i < numIterations; i++) { |
| final int flags = random().nextInt(512); |
| final CharArraySet protectedWords; |
| if (random().nextBoolean()) { |
| protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false); |
| } else { |
| protectedWords = null; |
| } |
| |
| Analyzer a = |
| new Analyzer() { |
| |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents( |
| tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, protectedWords)); |
| } |
| }; |
| // TODO: properly support positionLengthAttribute |
| checkRandomData(random(), a, 100 * RANDOM_MULTIPLIER, 20, false, false); |
| a.close(); |
| } |
| } |
| |
| /** blast some enormous random strings through the analyzer */ |
| public void testRandomHugeStrings() throws Exception { |
| int numIterations = atLeast(1); |
| for (int i = 0; i < numIterations; i++) { |
| final int flags = random().nextInt(512); |
| final CharArraySet protectedWords; |
| if (random().nextBoolean()) { |
| protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false); |
| } else { |
| protectedWords = null; |
| } |
| |
| Analyzer a = |
| new Analyzer() { |
| |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| TokenStream wdgf = new WordDelimiterGraphFilter(tokenizer, flags, protectedWords); |
| return new TokenStreamComponents(tokenizer, wdgf); |
| } |
| }; |
| // TODO: properly support positionLengthAttribute |
| checkRandomData(random(), a, 10 * RANDOM_MULTIPLIER, 8192, false, false); |
| a.close(); |
| } |
| } |
| |
| public void testEmptyTerm() throws IOException { |
| Random random = random(); |
| for (int i = 0; i < 512; i++) { |
| final int flags = i; |
| final CharArraySet protectedWords; |
| if (random.nextBoolean()) { |
| protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false); |
| } else { |
| protectedWords = null; |
| } |
| |
| Analyzer a = |
| new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new KeywordTokenizer(); |
| return new TokenStreamComponents( |
| tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, protectedWords)); |
| } |
| }; |
| // depending upon options, this thing may or may not preserve the empty term |
| checkAnalysisConsistency(random, a, random.nextBoolean(), ""); |
| a.close(); |
| } |
| } |
| |
| private Analyzer getAnalyzer(int flags) { |
| return getAnalyzer(flags, null); |
| } |
| |
| private Analyzer getAnalyzer(int flags, CharArraySet protectedWords) { |
| return new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new KeywordTokenizer(); |
| return new TokenStreamComponents( |
| tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, protectedWords)); |
| } |
| }; |
| } |
| |
| private static boolean has(int flags, int flag) { |
| return (flags & flag) != 0; |
| } |
| |
| private static boolean isEnglishPossessive(String text, int pos) { |
| if (pos > 2) { |
| if ((text.charAt(pos - 1) == 's' || text.charAt(pos - 1) == 'S') |
| && (pos == text.length() || text.charAt(pos) != '-')) { |
| text = text.substring(0, text.length() - 2); |
| } |
| } |
| return true; |
| } |
| |
| private static class WordPart { |
| final String part; |
| final int startOffset; |
| final int endOffset; |
| final int type; |
| |
| public WordPart(String text, int startOffset, int endOffset) { |
| this.part = text.substring(startOffset, endOffset); |
| this.startOffset = startOffset; |
| this.endOffset = endOffset; |
| this.type = toType(part.charAt(0)); |
| } |
| |
| @Override |
| public String toString() { |
| return "WordPart(" + part + " " + startOffset + "-" + endOffset + ")"; |
| } |
| } |
| |
| private static final int NUMBER = 0; |
| private static final int LETTER = 1; |
| private static final int DELIM = 2; |
| |
| private static int toType(char ch) { |
| if (Character.isDigit(ch)) { |
| // numbers |
| return NUMBER; |
| } else if (Character.isLetter(ch)) { |
| // letters |
| return LETTER; |
| } else { |
| // delimiter |
| return DELIM; |
| } |
| } |
| |
| /** |
| * Does (hopefully) the same thing as WordDelimiterGraphFilter, according to the flags, but more |
| * slowly, returning all string paths combinations. |
| */ |
| private Set<String> slowWDF(String text, int flags) { |
| |
| // first make word parts: |
| List<WordPart> wordParts = new ArrayList<>(); |
| int lastCH = -1; |
| int wordPartStart = 0; |
| boolean inToken = false; |
| |
| for (int i = 0; i < text.length(); i++) { |
| char ch = text.charAt(i); |
| if (toType(ch) == DELIM) { |
| // delimiter |
| if (inToken) { |
| // end current token |
| wordParts.add(new WordPart(text, wordPartStart, i)); |
| inToken = false; |
| } |
| |
| // strip english possessive at the end of this token?: |
| if (has(flags, STEM_ENGLISH_POSSESSIVE) |
| && ch == '\'' |
| && i > 0 |
| && i < text.length() - 1 |
| && (text.charAt(i + 1) == 's' || text.charAt(i + 1) == 'S') |
| && toType(text.charAt(i - 1)) == LETTER |
| && (i + 2 == text.length() || toType(text.charAt(i + 2)) == DELIM)) { |
| i += 2; |
| } |
| |
| } else if (inToken == false) { |
| // start new token |
| inToken = true; |
| wordPartStart = i; |
| } else { |
| boolean newToken = false; |
| if (Character.isLetter(lastCH)) { |
| if (Character.isLetter(ch)) { |
| if (has(flags, SPLIT_ON_CASE_CHANGE) |
| && Character.isLowerCase(lastCH) |
| && Character.isLowerCase(ch) == false) { |
| // start new token on lower -> UPPER case change (but not vice versa!) |
| newToken = true; |
| } |
| } else if (has(flags, SPLIT_ON_NUMERICS) && Character.isDigit(ch)) { |
| // start new token on letter -> number change |
| newToken = true; |
| } |
| } else { |
| assert Character.isDigit(lastCH); |
| if (Character.isLetter(ch) && has(flags, SPLIT_ON_NUMERICS)) { |
| // start new token on number -> letter change |
| newToken = true; |
| } |
| } |
| if (newToken) { |
| wordParts.add(new WordPart(text, wordPartStart, i)); |
| wordPartStart = i; |
| } |
| } |
| lastCH = ch; |
| } |
| |
| if (inToken) { |
| // add last token |
| wordParts.add(new WordPart(text, wordPartStart, text.length())); |
| } |
| |
| Set<String> paths = new HashSet<>(); |
| if (wordParts.isEmpty() == false) { |
| enumerate(flags, 0, text, wordParts, paths, new StringBuilder()); |
| } |
| |
| if (has(flags, PRESERVE_ORIGINAL)) { |
| paths.add(text); |
| } |
| |
| if (has(flags, CATENATE_ALL) && wordParts.isEmpty() == false) { |
| StringBuilder b = new StringBuilder(); |
| for (WordPart wordPart : wordParts) { |
| b.append(wordPart.part); |
| } |
| paths.add(b.toString()); |
| } |
| |
| return paths; |
| } |
| |
| private void add(StringBuilder path, String part) { |
| if (path.length() != 0) { |
| path.append(' '); |
| } |
| path.append(part); |
| } |
| |
| private void add(StringBuilder path, List<WordPart> wordParts, int from, int to) { |
| if (path.length() != 0) { |
| path.append(' '); |
| } |
| // no spaces: |
| for (int i = from; i < to; i++) { |
| path.append(wordParts.get(i).part); |
| } |
| } |
| |
| private void addWithSpaces(StringBuilder path, List<WordPart> wordParts, int from, int to) { |
| for (int i = from; i < to; i++) { |
| add(path, wordParts.get(i).part); |
| } |
| } |
| |
| /** Finds the end (exclusive) of the series of part with the same type */ |
| private int endOfRun(List<WordPart> wordParts, int start) { |
| int upto = start + 1; |
| while (upto < wordParts.size() && wordParts.get(upto).type == wordParts.get(start).type) { |
| upto++; |
| } |
| return upto; |
| } |
| |
| /** Recursively enumerates all paths through the word parts */ |
| private void enumerate( |
| int flags, |
| int upto, |
| String text, |
| List<WordPart> wordParts, |
| Set<String> paths, |
| StringBuilder path) { |
| if (upto == wordParts.size()) { |
| if (path.length() > 0) { |
| paths.add(path.toString()); |
| } |
| } else { |
| int savLength = path.length(); |
| int end = endOfRun(wordParts, upto); |
| |
| if (wordParts.get(upto).type == NUMBER) { |
| // always output single word, optionally surrounded by delims: |
| if (has(flags, GENERATE_NUMBER_PARTS) || wordParts.size() == 1) { |
| addWithSpaces(path, wordParts, upto, end); |
| if (has(flags, CATENATE_NUMBERS)) { |
| // recurse first with the parts |
| enumerate(flags, end, text, wordParts, paths, path); |
| path.setLength(savLength); |
| // .. and second with the concat |
| add(path, wordParts, upto, end); |
| } |
| } else if (has(flags, CATENATE_NUMBERS)) { |
| add(path, wordParts, upto, end); |
| } |
| enumerate(flags, end, text, wordParts, paths, path); |
| path.setLength(savLength); |
| } else { |
| assert wordParts.get(upto).type == LETTER; |
| // always output single word, optionally surrounded by delims: |
| if (has(flags, GENERATE_WORD_PARTS) || wordParts.size() == 1) { |
| addWithSpaces(path, wordParts, upto, end); |
| if (has(flags, CATENATE_WORDS)) { |
| // recurse first with the parts |
| enumerate(flags, end, text, wordParts, paths, path); |
| path.setLength(savLength); |
| // .. and second with the concat |
| add(path, wordParts, upto, end); |
| } |
| } else if (has(flags, CATENATE_WORDS)) { |
| add(path, wordParts, upto, end); |
| } |
| enumerate(flags, end, text, wordParts, paths, path); |
| path.setLength(savLength); |
| } |
| } |
| } |
| |
| public void testBasicGraphSplits() throws Exception { |
| assertGraphStrings(getAnalyzer(0), "PowerShotPlus", "PowerShotPlus"); |
| assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS), "PowerShotPlus", "PowerShotPlus"); |
| assertGraphStrings( |
| getAnalyzer(GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE), |
| "PowerShotPlus", |
| "Power Shot Plus"); |
| assertGraphStrings( |
| getAnalyzer(GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE | PRESERVE_ORIGINAL), |
| "PowerShotPlus", |
| "PowerShotPlus", |
| "Power Shot Plus"); |
| |
| assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS), "Power-Shot-Plus", "Power Shot Plus"); |
| assertGraphStrings( |
| getAnalyzer(GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE), |
| "Power-Shot-Plus", |
| "Power Shot Plus"); |
| assertGraphStrings( |
| getAnalyzer(GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE | PRESERVE_ORIGINAL), |
| "Power-Shot-Plus", |
| "Power-Shot-Plus", |
| "Power Shot Plus"); |
| |
| assertGraphStrings( |
| getAnalyzer(GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE), |
| "PowerShotPlus", |
| "Power Shot Plus"); |
| assertGraphStrings( |
| getAnalyzer(GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE), |
| "PowerShot1000Plus", |
| "Power Shot1000Plus"); |
| assertGraphStrings( |
| getAnalyzer(GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE), |
| "Power-Shot-Plus", |
| "Power Shot Plus"); |
| |
| assertGraphStrings( |
| getAnalyzer(GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE | CATENATE_WORDS), |
| "PowerShotPlus", |
| "Power Shot Plus", |
| "PowerShotPlus"); |
| assertGraphStrings( |
| getAnalyzer(GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE | CATENATE_WORDS), |
| "PowerShot1000Plus", |
| "Power Shot1000Plus", |
| "PowerShot1000Plus"); |
| assertGraphStrings( |
| getAnalyzer( |
| GENERATE_WORD_PARTS |
| | GENERATE_NUMBER_PARTS |
| | SPLIT_ON_CASE_CHANGE |
| | CATENATE_WORDS |
| | CATENATE_NUMBERS), |
| "Power-Shot-1000-17-Plus", |
| "Power Shot 1000 17 Plus", |
| "Power Shot 100017 Plus", |
| "PowerShot 1000 17 Plus", |
| "PowerShot 100017 Plus"); |
| assertGraphStrings( |
| getAnalyzer( |
| GENERATE_WORD_PARTS |
| | GENERATE_NUMBER_PARTS |
| | SPLIT_ON_CASE_CHANGE |
| | CATENATE_WORDS |
| | CATENATE_NUMBERS |
| | PRESERVE_ORIGINAL), |
| "Power-Shot-1000-17-Plus", |
| "Power-Shot-1000-17-Plus", |
| "Power Shot 1000 17 Plus", |
| "Power Shot 100017 Plus", |
| "PowerShot 1000 17 Plus", |
| "PowerShot 100017 Plus"); |
| } |
| |
| /* |
| public void testToDot() throws Exception { |
| int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE | PRESERVE_ORIGINAL | CATENATE_WORDS | CATENATE_NUMBERS | STEM_ENGLISH_POSSESSIVE; |
| String text = "PowerSystem2000-5-Shot's"; |
| WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token(text, 0, text.length())), DEFAULT_WORD_DELIM_TABLE, flags, null); |
| //StringWriter sw = new StringWriter(); |
| // TokenStreamToDot toDot = new TokenStreamToDot(text, wdf, new PrintWriter(sw)); |
| PrintWriter pw = new PrintWriter("/tmp/foo2.dot"); |
| TokenStreamToDot toDot = new TokenStreamToDot(text, wdf, pw); |
| toDot.toDot(); |
| pw.close(); |
| //System.out.println("DOT:\n" + sw.toString()); |
| } |
| */ |
| |
| private String randomWDFText() { |
| StringBuilder b = new StringBuilder(); |
| int length = TestUtil.nextInt(random(), 1, 50); |
| for (int i = 0; i < length; i++) { |
| int surpriseMe = random().nextInt(37); |
| int lower = -1; |
| int upper = -1; |
| if (surpriseMe < 10) { |
| // lowercase letter |
| lower = 'a'; |
| upper = 'z'; |
| } else if (surpriseMe < 20) { |
| // uppercase letter |
| lower = 'A'; |
| upper = 'Z'; |
| } else if (surpriseMe < 30) { |
| // digit |
| lower = '0'; |
| upper = '9'; |
| } else if (surpriseMe < 35) { |
| // punct |
| lower = '-'; |
| upper = '-'; |
| } else { |
| b.append("'s"); |
| } |
| |
| if (lower != -1) { |
| b.append((char) TestUtil.nextInt(random(), lower, upper)); |
| } |
| } |
| |
| return b.toString(); |
| } |
| |
| public void testInvalidFlag() throws Exception { |
| expectThrows( |
| IllegalArgumentException.class, |
| () -> { |
| new WordDelimiterGraphFilter(new CannedTokenStream(), 1 << 31, null); |
| }); |
| } |
| |
| public void testRandomPaths() throws Exception { |
| int iters = atLeast(10); |
| for (int iter = 0; iter < iters; iter++) { |
| String text = randomWDFText(); |
| if (VERBOSE) { |
| System.out.println("\nTEST: text=" + text + " len=" + text.length()); |
| } |
| |
| int flags = 0; |
| if (random().nextBoolean()) { |
| flags |= GENERATE_WORD_PARTS; |
| } |
| if (random().nextBoolean()) { |
| flags |= GENERATE_NUMBER_PARTS; |
| } |
| if (random().nextBoolean()) { |
| flags |= CATENATE_WORDS; |
| } |
| if (random().nextBoolean()) { |
| flags |= CATENATE_NUMBERS; |
| } |
| if (random().nextBoolean()) { |
| flags |= CATENATE_ALL; |
| } |
| if (random().nextBoolean()) { |
| flags |= PRESERVE_ORIGINAL; |
| } |
| if (random().nextBoolean()) { |
| flags |= SPLIT_ON_CASE_CHANGE; |
| } |
| if (random().nextBoolean()) { |
| flags |= SPLIT_ON_NUMERICS; |
| } |
| if (random().nextBoolean()) { |
| flags |= STEM_ENGLISH_POSSESSIVE; |
| } |
| |
| verify(text, flags); |
| } |
| } |
| |
| /** Runs normal and slow WDGF and compares results */ |
| private void verify(String text, int flags) throws IOException { |
| |
| Set<String> expected = slowWDF(text, flags); |
| if (VERBOSE) { |
| for (String path : expected) { |
| System.out.println(" " + path); |
| } |
| } |
| |
| Set<String> actual = getGraphStrings(getAnalyzer(flags), text); |
| if (actual.equals(expected) == false) { |
| StringBuilder b = new StringBuilder(); |
| b.append("\n\nFAIL: text="); |
| b.append(text); |
| b.append(" flags="); |
| b.append(WordDelimiterGraphFilter.flagsToString(flags)); |
| b.append('\n'); |
| b.append(" expected paths:\n"); |
| for (String s : expected) { |
| b.append(" "); |
| b.append(s); |
| if (actual.contains(s) == false) { |
| b.append(" [missing!]"); |
| } |
| b.append('\n'); |
| } |
| |
| b.append(" actual paths:\n"); |
| for (String s : actual) { |
| b.append(" "); |
| b.append(s); |
| if (expected.contains(s) == false) { |
| b.append(" [unexpected!]"); |
| } |
| b.append('\n'); |
| } |
| |
| fail(b.toString()); |
| } |
| |
| boolean useCharFilter = true; |
| checkAnalysisConsistency(random(), getAnalyzer(flags), useCharFilter, text); |
| } |
| |
| public void testOnlyNumbers() throws Exception { |
| // no token should be produced |
| assertGraphStrings( |
| getAnalyzer(GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS), "7-586"); |
| } |
| |
| public void testNoCatenate() throws Exception { |
| // no token should be produced |
| assertGraphStrings( |
| getAnalyzer( |
| GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS), |
| "a-b-c-9-d", |
| "a b c 9 d"); |
| } |
| |
| public void testCuriousCase1() throws Exception { |
| verify( |
| "u-0L-4836-ip4Gw--13--q7--L07E1", |
| CATENATE_WORDS |
| | CATENATE_ALL |
| | SPLIT_ON_CASE_CHANGE |
| | SPLIT_ON_NUMERICS |
| | STEM_ENGLISH_POSSESSIVE); |
| } |
| |
| public void testCuriousCase2() throws Exception { |
| verify("u-l-p", CATENATE_ALL); |
| } |
| |
| public void testOriginalPosLength() throws Exception { |
| verify("Foo-Bar-Baz", CATENATE_WORDS | SPLIT_ON_CASE_CHANGE | PRESERVE_ORIGINAL); |
| } |
| |
| public void testCuriousCase3() throws Exception { |
| verify( |
| "cQzk4-GL0izl0mKM-J8--4m-'s", |
| GENERATE_NUMBER_PARTS | CATENATE_NUMBERS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS); |
| } |
| |
| public void testEmptyString() throws Exception { |
| WordDelimiterGraphFilter wdf = |
| new WordDelimiterGraphFilter( |
| new CannedTokenStream(new Token("", 0, 0)), |
| GENERATE_WORD_PARTS | CATENATE_ALL | PRESERVE_ORIGINAL, |
| null); |
| wdf.reset(); |
| assertTrue(wdf.incrementToken()); |
| assertFalse(wdf.incrementToken()); |
| wdf.end(); |
| wdf.close(); |
| } |
| |
| public void testProtectedWords() throws Exception { |
| TokenStream tokens = |
| new CannedTokenStream(new Token("foo17-bar", 0, 9), new Token("foo-bar", 0, 7)); |
| |
| CharArraySet protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("foo17-BAR")), true); |
| WordDelimiterGraphFilter wdf = |
| new WordDelimiterGraphFilter( |
| tokens, GENERATE_WORD_PARTS | PRESERVE_ORIGINAL | CATENATE_ALL, protectedWords); |
| assertGraphStrings(wdf, "foo17-bar foo bar", "foo17-bar foo-bar", "foo17-bar foobar"); |
| } |
| } |