| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.miscellaneous; |
| |
| import java.io.IOException; |
| import java.util.Arrays; |
| import java.util.HashSet; |
| import java.util.Random; |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| import org.apache.lucene.analysis.CannedTokenStream; |
| import org.apache.lucene.analysis.CharArraySet; |
| import org.apache.lucene.analysis.MockTokenizer; |
| import org.apache.lucene.analysis.StopFilter; |
| import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.Tokenizer; |
| import org.apache.lucene.analysis.core.KeywordTokenizer; |
| import org.apache.lucene.analysis.en.EnglishAnalyzer; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| import org.apache.lucene.util.IOUtils; |
| |
| /** |
| * New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest TODO: should |
| * explicitly test things like protWords and not rely on the factory tests in Solr. |
| */ |
| @Deprecated |
| public class TestWordDelimiterFilter extends BaseTokenStreamTestCase { |
| |
| private static final int CATENATE_ALL = WordDelimiterFilter.CATENATE_ALL; |
| private static final int CATENATE_NUMBERS = WordDelimiterFilter.CATENATE_NUMBERS; |
| private static final int CATENATE_WORDS = WordDelimiterFilter.CATENATE_WORDS; |
| private static final int GENERATE_NUMBER_PARTS = WordDelimiterFilter.GENERATE_NUMBER_PARTS; |
| private static final int GENERATE_WORD_PARTS = WordDelimiterFilter.GENERATE_WORD_PARTS; |
| private static final int IGNORE_KEYWORDS = WordDelimiterFilter.IGNORE_KEYWORDS; |
| private static final int PRESERVE_ORIGINAL = WordDelimiterFilter.PRESERVE_ORIGINAL; |
| private static final int SPLIT_ON_CASE_CHANGE = WordDelimiterFilter.SPLIT_ON_CASE_CHANGE; |
| private static final int SPLIT_ON_NUMERICS = WordDelimiterFilter.SPLIT_ON_NUMERICS; |
| private static final int STEM_ENGLISH_POSSESSIVE = WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE; |
| private static final byte[] DEFAULT_WORD_DELIM_TABLE = |
| WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE; |
| |
| /* |
| public void testPerformance() throws IOException { |
| String s = "now is the time-for all good men to come to-the aid of their country."; |
| Token tok = new Token(); |
| long start = System.currentTimeMillis(); |
| int ret=0; |
| for (int i=0; i<1000000; i++) { |
| StringReader r = new StringReader(s); |
| TokenStream ts = new WhitespaceTokenizer(r); |
| ts = new WordDelimiterFilter(ts, 1,1,1,1,0); |
| |
| while (ts.next(tok) != null) ret++; |
| } |
| |
| System.out.println("ret="+ret+" time="+(System.currentTimeMillis()-start)); |
| } |
| ***/ |
| |
| public void testOffsets() throws IOException { |
| int flags = |
| GENERATE_WORD_PARTS |
| | GENERATE_NUMBER_PARTS |
| | CATENATE_ALL |
| | SPLIT_ON_CASE_CHANGE |
| | SPLIT_ON_NUMERICS |
| | STEM_ENGLISH_POSSESSIVE; |
| // test that subwords and catenated subwords have |
| // the correct offsets. |
| WordDelimiterFilter wdf = |
| new WordDelimiterFilter( |
| new CannedTokenStream(new Token("foo-bar", 5, 12)), |
| DEFAULT_WORD_DELIM_TABLE, |
| flags, |
| null); |
| |
| assertTokenStreamContents( |
| wdf, new String[] {"foo", "foobar", "bar"}, new int[] {5, 5, 9}, new int[] {8, 12, 12}); |
| |
| wdf = |
| new WordDelimiterFilter( |
| new CannedTokenStream(new Token("foo-bar", 5, 6)), |
| DEFAULT_WORD_DELIM_TABLE, |
| flags, |
| null); |
| |
| assertTokenStreamContents( |
| wdf, new String[] {"foo", "bar", "foobar"}, new int[] {5, 5, 5}, new int[] {6, 6, 6}); |
| } |
| |
| public void testOffsetChange() throws Exception { |
| int flags = |
| GENERATE_WORD_PARTS |
| | GENERATE_NUMBER_PARTS |
| | CATENATE_ALL |
| | SPLIT_ON_CASE_CHANGE |
| | SPLIT_ON_NUMERICS |
| | STEM_ENGLISH_POSSESSIVE; |
| WordDelimiterFilter wdf = |
| new WordDelimiterFilter( |
| new CannedTokenStream(new Token("übelkeit)", 7, 16)), |
| DEFAULT_WORD_DELIM_TABLE, |
| flags, |
| null); |
| |
| assertTokenStreamContents(wdf, new String[] {"übelkeit"}, new int[] {7}, new int[] {15}); |
| } |
| |
| public void testOffsetChange2() throws Exception { |
| int flags = |
| GENERATE_WORD_PARTS |
| | GENERATE_NUMBER_PARTS |
| | CATENATE_ALL |
| | SPLIT_ON_CASE_CHANGE |
| | SPLIT_ON_NUMERICS |
| | STEM_ENGLISH_POSSESSIVE; |
| WordDelimiterFilter wdf = |
| new WordDelimiterFilter( |
| new CannedTokenStream(new Token("(übelkeit", 7, 17)), |
| DEFAULT_WORD_DELIM_TABLE, |
| flags, |
| null); |
| |
| assertTokenStreamContents(wdf, new String[] {"übelkeit"}, new int[] {8}, new int[] {17}); |
| } |
| |
| public void testOffsetChange3() throws Exception { |
| int flags = |
| GENERATE_WORD_PARTS |
| | GENERATE_NUMBER_PARTS |
| | CATENATE_ALL |
| | SPLIT_ON_CASE_CHANGE |
| | SPLIT_ON_NUMERICS |
| | STEM_ENGLISH_POSSESSIVE; |
| WordDelimiterFilter wdf = |
| new WordDelimiterFilter( |
| new CannedTokenStream(new Token("(übelkeit", 7, 16)), |
| DEFAULT_WORD_DELIM_TABLE, |
| flags, |
| null); |
| |
| assertTokenStreamContents(wdf, new String[] {"übelkeit"}, new int[] {8}, new int[] {16}); |
| } |
| |
| public void testOffsetChange4() throws Exception { |
| int flags = |
| GENERATE_WORD_PARTS |
| | GENERATE_NUMBER_PARTS |
| | CATENATE_ALL |
| | SPLIT_ON_CASE_CHANGE |
| | SPLIT_ON_NUMERICS |
| | STEM_ENGLISH_POSSESSIVE; |
| WordDelimiterFilter wdf = |
| new WordDelimiterFilter( |
| new CannedTokenStream(new Token("(foo,bar)", 7, 16)), |
| DEFAULT_WORD_DELIM_TABLE, |
| flags, |
| null); |
| |
| assertTokenStreamContents( |
| wdf, new String[] {"foo", "foobar", "bar"}, new int[] {8, 8, 12}, new int[] {11, 15, 15}); |
| } |
| |
| public void doSplit(final String input, String... output) throws Exception { |
| int flags = |
| GENERATE_WORD_PARTS |
| | GENERATE_NUMBER_PARTS |
| | SPLIT_ON_CASE_CHANGE |
| | SPLIT_ON_NUMERICS |
| | STEM_ENGLISH_POSSESSIVE; |
| WordDelimiterFilter wdf = |
| new WordDelimiterFilter( |
| keywordMockTokenizer(input), |
| WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, |
| flags, |
| null); |
| |
| assertTokenStreamContents(wdf, output); |
| } |
| |
  /**
   * Exercises the basic splitting rules across scripts: delimiters and case changes split, while
   * combining marks, modifier letters, non-ASCII digits, and surrogate pairs must never be split.
   */
  public void testSplits() throws Exception {
    doSplit("basic-split", "basic", "split");
    doSplit("camelCase", "camel", "Case");

    // non-space marking symbol shouldn't cause split
    // this is an example in Thai
    doSplit("\u0e1a\u0e49\u0e32\u0e19", "\u0e1a\u0e49\u0e32\u0e19");
    // possessive followed by delimiter
    doSplit("test's'", "test");

    // some russian upper and lowercase
    doSplit("Роберт", "Роберт");
    // now cause a split (russian camelCase)
    doSplit("РобЕрт", "Роб", "Ерт");

    // a composed titlecase character, don't split
    doSplit("aDžungla", "aDžungla");

    // a modifier letter, don't split
    doSplit("ســـــــــــــــــلام", "ســـــــــــــــــلام");

    // enclosing mark, don't split
    doSplit("test⃝", "test⃝");

    // combining spacing mark (the virama), don't split
    doSplit("हिन्दी", "हिन्दी");

    // don't split non-ascii digits
    doSplit("١٢٣٤", "١٢٣٤");

    // don't split supplementaries into unpaired surrogates
    doSplit("𠀀𠀀", "𠀀𠀀");
  }
| |
| public void doSplitPossessive(int stemPossessive, final String input, final String... output) |
| throws Exception { |
| int flags = |
| GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS; |
| flags |= (stemPossessive == 1) ? STEM_ENGLISH_POSSESSIVE : 0; |
| WordDelimiterFilter wdf = new WordDelimiterFilter(keywordMockTokenizer(input), flags, null); |
| |
| assertTokenStreamContents(wdf, output); |
| } |
| |
| /* |
| * Test option that allows disabling the special "'s" stemming, instead treating the single quote like other delimiters. |
| */ |
| public void testPossessives() throws Exception { |
| doSplitPossessive(1, "ra's", "ra"); |
| doSplitPossessive(0, "ra's", "ra", "s"); |
| } |
| |
| /* |
| * Set a large position increment gap of 10 if the token is "largegap" or "/" |
| */ |
| private static final class LargePosIncTokenFilter extends TokenFilter { |
| private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
| private PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); |
| |
| protected LargePosIncTokenFilter(TokenStream input) { |
| super(input); |
| } |
| |
| @Override |
| public boolean incrementToken() throws IOException { |
| if (input.incrementToken()) { |
| if (termAtt.toString().equals("largegap") || termAtt.toString().equals("/")) |
| posIncAtt.setPositionIncrement(10); |
| return true; |
| } else { |
| return false; |
| } |
| } |
| } |
| |
  /**
   * Verifies position-increment behavior in three setups: a plain whitespace analyzer, an
   * analyzer whose tokens carry large incoming increments (via {@link LargePosIncTokenFilter}),
   * and an analyzer where a StopFilter leaves gaps before the WordDelimiterFilter.
   */
  public void testPositionIncrements() throws Exception {
    final int flags =
        GENERATE_WORD_PARTS
            | GENERATE_NUMBER_PARTS
            | CATENATE_ALL
            | SPLIT_ON_CASE_CHANGE
            | SPLIT_ON_NUMERICS
            | STEM_ENGLISH_POSSESSIVE;
    // "NUTCH" is protected, so it must pass through unchanged below
    final CharArraySet protWords = new CharArraySet(new HashSet<>(Arrays.asList("NUTCH")), false);

    /* analyzer that uses whitespace + wdf */
    Analyzer a =
        new Analyzer() {
          @Override
          public TokenStreamComponents createComponents(String field) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(
                tokenizer, new WordDelimiterFilter(tokenizer, flags, protWords));
          }
        };

    /* in this case, works as expected. */
    assertAnalyzesTo(
        a,
        "LUCENE / SOLR",
        new String[] {"LUCENE", "SOLR"},
        new int[] {0, 9},
        new int[] {6, 13},
        null,
        new int[] {1, 1},
        null,
        false);

    /* only in this case, posInc of 2 ?! */
    assertAnalyzesTo(
        a,
        "LUCENE / solR",
        new String[] {"LUCENE", "sol", "solR", "R"},
        new int[] {0, 9, 9, 12},
        new int[] {6, 12, 13, 13},
        null,
        new int[] {1, 1, 0, 1},
        null,
        false);

    assertAnalyzesTo(
        a,
        "LUCENE / NUTCH SOLR",
        new String[] {"LUCENE", "NUTCH", "SOLR"},
        new int[] {0, 9, 15},
        new int[] {6, 14, 19},
        null,
        new int[] {1, 1, 1},
        null,
        false);

    /* analyzer that will consume tokens with large position increments */
    Analyzer a2 =
        new Analyzer() {
          @Override
          public TokenStreamComponents createComponents(String field) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(
                tokenizer,
                new WordDelimiterFilter(new LargePosIncTokenFilter(tokenizer), flags, protWords));
          }
        };

    /* increment of "largegap" is preserved */
    assertAnalyzesTo(
        a2,
        "LUCENE largegap SOLR",
        new String[] {"LUCENE", "largegap", "SOLR"},
        new int[] {0, 7, 16},
        new int[] {6, 15, 20},
        null,
        new int[] {1, 10, 1},
        null,
        false);

    /* the "/" had a position increment of 10, where did it go?!?!! */
    assertAnalyzesTo(
        a2,
        "LUCENE / SOLR",
        new String[] {"LUCENE", "SOLR"},
        new int[] {0, 9},
        new int[] {6, 13},
        null,
        new int[] {1, 11},
        null,
        false);

    /* in this case, the increment of 10 from the "/" is carried over */
    assertAnalyzesTo(
        a2,
        "LUCENE / solR",
        new String[] {"LUCENE", "sol", "solR", "R"},
        new int[] {0, 9, 9, 12},
        new int[] {6, 12, 13, 13},
        null,
        new int[] {1, 11, 0, 1},
        null,
        false);

    assertAnalyzesTo(
        a2,
        "LUCENE / NUTCH SOLR",
        new String[] {"LUCENE", "NUTCH", "SOLR"},
        new int[] {0, 9, 15},
        new int[] {6, 14, 19},
        null,
        new int[] {1, 11, 1},
        null,
        false);

    /* analyzer with an English stop filter ahead of the wdf */
    Analyzer a3 =
        new Analyzer() {
          @Override
          public TokenStreamComponents createComponents(String field) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            StopFilter filter = new StopFilter(tokenizer, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
            return new TokenStreamComponents(
                tokenizer, new WordDelimiterFilter(filter, flags, protWords));
          }
        };

    assertAnalyzesTo(
        a3,
        "lucene.solr",
        new String[] {"lucene", "lucenesolr", "solr"},
        new int[] {0, 0, 7},
        new int[] {6, 11, 11},
        null,
        new int[] {1, 0, 1},
        null,
        false);

    /* the stopword should add a gap here */
    assertAnalyzesTo(
        a3,
        "the lucene.solr",
        new String[] {"lucene", "lucenesolr", "solr"},
        new int[] {4, 4, 11},
        new int[] {10, 15, 15},
        null,
        new int[] {2, 0, 1},
        null,
        false);

    IOUtils.close(a, a2, a3);
  }
| |
  /** Tokens flagged as keywords must pass through un-split when IGNORE_KEYWORDS is set. */
  public void testKeywordFilter() throws Exception {
    // without IGNORE_KEYWORDS every hyphenated token is split ("kpop" has no delimiter)
    assertAnalyzesTo(
        keywordTestAnalyzer(GENERATE_WORD_PARTS),
        "abc-def klm-nop kpop",
        new String[] {"abc", "def", "klm", "nop", "kpop"});
    // with IGNORE_KEYWORDS, "klm-nop" survives intact: keywordTestAnalyzer marks terms
    // starting with 'k' as keywords
    assertAnalyzesTo(
        keywordTestAnalyzer(GENERATE_WORD_PARTS | IGNORE_KEYWORDS),
        "abc-def klm-nop kpop",
        new String[] {"abc", "def", "klm-nop", "kpop"},
        new int[] {0, 4, 8, 16},
        new int[] {3, 7, 15, 20},
        null,
        new int[] {1, 1, 1, 1},
        null,
        false);
  }
| |
| private Analyzer keywordTestAnalyzer(int flags) throws Exception { |
| return new Analyzer() { |
| @Override |
| public TokenStreamComponents createComponents(String field) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| KeywordMarkerFilter kFilter = |
| new KeywordMarkerFilter(tokenizer) { |
| private final CharTermAttribute term = addAttribute(CharTermAttribute.class); |
| |
| @Override |
| public boolean isKeyword() { |
| // Marks terms starting with the letter 'k' as keywords |
| return term.toString().charAt(0) == 'k'; |
| } |
| }; |
| return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(kFilter, flags, null)); |
| } |
| }; |
| } |
| |
| /** concat numbers + words + all */ |
| public void testLotsOfConcatenating() throws Exception { |
| final int flags = |
| GENERATE_WORD_PARTS |
| | GENERATE_NUMBER_PARTS |
| | CATENATE_WORDS |
| | CATENATE_NUMBERS |
| | CATENATE_ALL |
| | SPLIT_ON_CASE_CHANGE |
| | SPLIT_ON_NUMERICS |
| | STEM_ENGLISH_POSSESSIVE; |
| |
| /* analyzer that uses whitespace + wdf */ |
| Analyzer a = |
| new Analyzer() { |
| @Override |
| public TokenStreamComponents createComponents(String field) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents( |
| tokenizer, new WordDelimiterFilter(tokenizer, flags, null)); |
| } |
| }; |
| |
| assertAnalyzesTo( |
| a, |
| "abc-def-123-456", |
| new String[] {"abc", "abcdef", "abcdef123456", "def", "123", "123456", "456"}, |
| new int[] {0, 0, 0, 4, 8, 8, 12}, |
| new int[] {3, 7, 15, 7, 11, 15, 15}, |
| null, |
| new int[] {1, 0, 0, 1, 1, 0, 1}, |
| null, |
| false); |
| a.close(); |
| } |
| |
| /** concat numbers + words + all + preserve original */ |
| public void testLotsOfConcatenating2() throws Exception { |
| final int flags = |
| PRESERVE_ORIGINAL |
| | GENERATE_WORD_PARTS |
| | GENERATE_NUMBER_PARTS |
| | CATENATE_WORDS |
| | CATENATE_NUMBERS |
| | CATENATE_ALL |
| | SPLIT_ON_CASE_CHANGE |
| | SPLIT_ON_NUMERICS |
| | STEM_ENGLISH_POSSESSIVE; |
| |
| /* analyzer that uses whitespace + wdf */ |
| Analyzer a = |
| new Analyzer() { |
| @Override |
| public TokenStreamComponents createComponents(String field) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents( |
| tokenizer, new WordDelimiterFilter(tokenizer, flags, null)); |
| } |
| }; |
| |
| assertAnalyzesTo( |
| a, |
| "abc-def-123-456", |
| new String[] { |
| "abc-def-123-456", "abc", "abcdef", "abcdef123456", "def", "123", "123456", "456" |
| }, |
| new int[] {0, 0, 0, 0, 4, 8, 8, 12}, |
| new int[] {15, 3, 7, 15, 7, 11, 15, 15}, |
| null, |
| new int[] {1, 0, 0, 0, 1, 1, 0, 1}, |
| null, |
| false); |
| a.close(); |
| } |
| |
| /** blast some random strings through the analyzer */ |
| public void testRandomStrings() throws Exception { |
| int numIterations = atLeast(3); |
| for (int i = 0; i < numIterations; i++) { |
| final int flags = random().nextInt(512); |
| final CharArraySet protectedWords; |
| if (random().nextBoolean()) { |
| protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false); |
| } else { |
| protectedWords = null; |
| } |
| |
| Analyzer a = |
| new Analyzer() { |
| |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents( |
| tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords)); |
| } |
| }; |
| // TODO: properly support positionLengthAttribute |
| checkRandomData(random(), a, 100 * RANDOM_MULTIPLIER, 20, false, false); |
| a.close(); |
| } |
| } |
| |
| /** blast some enormous random strings through the analyzer */ |
| public void testRandomHugeStrings() throws Exception { |
| int numIterations = atLeast(1); |
| for (int i = 0; i < numIterations; i++) { |
| final int flags = random().nextInt(512); |
| final CharArraySet protectedWords; |
| if (random().nextBoolean()) { |
| protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false); |
| } else { |
| protectedWords = null; |
| } |
| |
| Analyzer a = |
| new Analyzer() { |
| |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents( |
| tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords)); |
| } |
| }; |
| // TODO: properly support positionLengthAttribute |
| checkRandomData(random(), a, 10 * RANDOM_MULTIPLIER, 8192, false, false); |
| a.close(); |
| } |
| } |
| |
| public void testEmptyTerm() throws IOException { |
| Random random = random(); |
| for (int i = 0; i < 512; i++) { |
| final int flags = i; |
| final CharArraySet protectedWords; |
| if (random.nextBoolean()) { |
| protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false); |
| } else { |
| protectedWords = null; |
| } |
| |
| Analyzer a = |
| new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new KeywordTokenizer(); |
| return new TokenStreamComponents( |
| tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords)); |
| } |
| }; |
| // depending upon options, this thing may or may not preserve the empty term |
| checkAnalysisConsistency(random, a, random.nextBoolean(), ""); |
| a.close(); |
| } |
| } |
| |
| /* |
| public void testToDot() throws Exception { |
| int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE | PRESERVE_ORIGINAL | CATENATE_WORDS | CATENATE_NUMBERS | STEM_ENGLISH_POSSESSIVE; |
| String text = "PowerSystem2000-5-Shot's"; |
| WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token(text, 0, text.length())), DEFAULT_WORD_DELIM_TABLE, flags, null); |
| //StringWriter sw = new StringWriter(); |
| // TokenStreamToDot toDot = new TokenStreamToDot(text, wdf, new PrintWriter(sw)); |
| PrintWriter pw = new PrintWriter("/x/tmp/before.dot"); |
| TokenStreamToDot toDot = new TokenStreamToDot(text, wdf, pw); |
| toDot.toDot(); |
| pw.close(); |
| System.out.println("TEST DONE"); |
| //System.out.println("DOT:\n" + sw.toString()); |
| } |
| */ |
| |
| public void testOnlyNumbers() throws Exception { |
| int flags = GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS; |
| Analyzer a = |
| new Analyzer() { |
| |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents( |
| tokenizer, new WordDelimiterFilter(tokenizer, flags, null)); |
| } |
| }; |
| |
| assertAnalyzesTo( |
| a, "7-586", new String[] {}, new int[] {}, new int[] {}, null, new int[] {}, null, false); |
| } |
| |
| public void testNumberPunct() throws Exception { |
| int flags = GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS; |
| Analyzer a = |
| new Analyzer() { |
| |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents( |
| tokenizer, new WordDelimiterFilter(tokenizer, flags, null)); |
| } |
| }; |
| |
| assertAnalyzesTo( |
| a, |
| "6-", |
| new String[] {"6"}, |
| new int[] {0}, |
| new int[] {1}, |
| null, |
| new int[] {1}, |
| null, |
| false); |
| } |
| |
| private Analyzer getAnalyzer(final int flags) { |
| return new Analyzer() { |
| |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents( |
| tokenizer, new WordDelimiterFilter(tokenizer, flags, null)); |
| } |
| }; |
| } |
| } |