| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.commongrams; |
| |
| import java.io.StringReader; |
| import java.util.Arrays; |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| import org.apache.lucene.analysis.CharArraySet; |
| import org.apache.lucene.analysis.MockTokenizer; |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.Tokenizer; |
| import org.apache.lucene.analysis.core.WhitespaceTokenizer; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| |
| /** Tests CommonGrams(Query)Filter */ |
| public class TestCommonGramsFilter extends BaseTokenStreamTestCase { |
| private static final CharArraySet commonWords = |
| new CharArraySet(Arrays.asList("s", "a", "b", "c", "d", "the", "of"), false); |
| |
| public void testReset() throws Exception { |
| final String input = "How the s a brown s cow d like A B thing?"; |
| WhitespaceTokenizer wt = new WhitespaceTokenizer(); |
| wt.setReader(new StringReader(input)); |
| CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords); |
| |
| CharTermAttribute term = cgf.addAttribute(CharTermAttribute.class); |
| cgf.reset(); |
| assertTrue(cgf.incrementToken()); |
| assertEquals("How", term.toString()); |
| assertTrue(cgf.incrementToken()); |
| assertEquals("How_the", term.toString()); |
| assertTrue(cgf.incrementToken()); |
| assertEquals("the", term.toString()); |
| assertTrue(cgf.incrementToken()); |
| assertEquals("the_s", term.toString()); |
| cgf.close(); |
| |
| wt.setReader(new StringReader(input)); |
| cgf.reset(); |
| assertTrue(cgf.incrementToken()); |
| assertEquals("How", term.toString()); |
| } |
| |
| public void testQueryReset() throws Exception { |
| final String input = "How the s a brown s cow d like A B thing?"; |
| WhitespaceTokenizer wt = new WhitespaceTokenizer(); |
| wt.setReader(new StringReader(input)); |
| CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords); |
| CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf); |
| |
| CharTermAttribute term = wt.addAttribute(CharTermAttribute.class); |
| nsf.reset(); |
| assertTrue(nsf.incrementToken()); |
| assertEquals("How_the", term.toString()); |
| assertTrue(nsf.incrementToken()); |
| assertEquals("the_s", term.toString()); |
| nsf.close(); |
| |
| wt.setReader(new StringReader(input)); |
| nsf.reset(); |
| assertTrue(nsf.incrementToken()); |
| assertEquals("How_the", term.toString()); |
| } |
| |
| /** |
| * This is for testing CommonGramsQueryFilter which outputs a set of tokens optimized for querying |
| * with only one token at each position, either a unigram or a bigram It also will not return a |
| * token for the final position if the final word is already in the preceding bigram |
| * Example:(three tokens/positions in) "foo bar the"=>"foo:1|bar:2,bar-the:2|the:3=> "foo" |
| * "bar-the" (2 tokens out) |
| */ |
| public void testCommonGramsQueryFilter() throws Exception { |
| Analyzer a = |
| new Analyzer() { |
| @Override |
| public TokenStreamComponents createComponents(String field) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents( |
| tokenizer, |
| new CommonGramsQueryFilter(new CommonGramsFilter(tokenizer, commonWords))); |
| } |
| }; |
| |
| // Stop words used below are "of" "the" and "s" |
| |
| // two word queries |
| assertAnalyzesTo(a, "brown fox", new String[] {"brown", "fox"}); |
| assertAnalyzesTo(a, "the fox", new String[] {"the_fox"}); |
| assertAnalyzesTo(a, "fox of", new String[] {"fox_of"}); |
| assertAnalyzesTo(a, "of the", new String[] {"of_the"}); |
| |
| // one word queries |
| assertAnalyzesTo(a, "the", new String[] {"the"}); |
| assertAnalyzesTo(a, "foo", new String[] {"foo"}); |
| |
| // 3 word combinations s=stopword/common word n=not a stop word |
| assertAnalyzesTo(a, "n n n", new String[] {"n", "n", "n"}); |
| assertAnalyzesTo(a, "quick brown fox", new String[] {"quick", "brown", "fox"}); |
| |
| assertAnalyzesTo(a, "n n s", new String[] {"n", "n_s"}); |
| assertAnalyzesTo(a, "quick brown the", new String[] {"quick", "brown_the"}); |
| |
| assertAnalyzesTo(a, "n s n", new String[] {"n_s", "s_n"}); |
| assertAnalyzesTo(a, "quick the brown", new String[] {"quick_the", "the_brown"}); |
| |
| assertAnalyzesTo(a, "n s s", new String[] {"n_s", "s_s"}); |
| assertAnalyzesTo(a, "fox of the", new String[] {"fox_of", "of_the"}); |
| |
| assertAnalyzesTo(a, "s n n", new String[] {"s_n", "n", "n"}); |
| assertAnalyzesTo(a, "the quick brown", new String[] {"the_quick", "quick", "brown"}); |
| |
| assertAnalyzesTo(a, "s n s", new String[] {"s_n", "n_s"}); |
| assertAnalyzesTo(a, "the fox of", new String[] {"the_fox", "fox_of"}); |
| |
| assertAnalyzesTo(a, "s s n", new String[] {"s_s", "s_n"}); |
| assertAnalyzesTo(a, "of the fox", new String[] {"of_the", "the_fox"}); |
| |
| assertAnalyzesTo(a, "s s s", new String[] {"s_s", "s_s"}); |
| assertAnalyzesTo(a, "of the of", new String[] {"of_the", "the_of"}); |
| a.close(); |
| } |
| |
| public void testCommonGramsFilter() throws Exception { |
| Analyzer a = |
| new Analyzer() { |
| @Override |
| public TokenStreamComponents createComponents(String field) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents( |
| tokenizer, new CommonGramsFilter(tokenizer, commonWords)); |
| } |
| }; |
| |
| // Stop words used below are "of" "the" and "s" |
| // one word queries |
| assertAnalyzesTo(a, "the", new String[] {"the"}); |
| assertAnalyzesTo(a, "foo", new String[] {"foo"}); |
| |
| // two word queries |
| assertAnalyzesTo(a, "brown fox", new String[] {"brown", "fox"}, new int[] {1, 1}); |
| assertAnalyzesTo(a, "the fox", new String[] {"the", "the_fox", "fox"}, new int[] {1, 0, 1}); |
| assertAnalyzesTo(a, "fox of", new String[] {"fox", "fox_of", "of"}, new int[] {1, 0, 1}); |
| assertAnalyzesTo(a, "of the", new String[] {"of", "of_the", "the"}, new int[] {1, 0, 1}); |
| |
| // 3 word combinations s=stopword/common word n=not a stop word |
| assertAnalyzesTo(a, "n n n", new String[] {"n", "n", "n"}, new int[] {1, 1, 1}); |
| assertAnalyzesTo( |
| a, "quick brown fox", new String[] {"quick", "brown", "fox"}, new int[] {1, 1, 1}); |
| |
| assertAnalyzesTo(a, "n n s", new String[] {"n", "n", "n_s", "s"}, new int[] {1, 1, 0, 1}); |
| assertAnalyzesTo( |
| a, |
| "quick brown the", |
| new String[] {"quick", "brown", "brown_the", "the"}, |
| new int[] {1, 1, 0, 1}); |
| |
| assertAnalyzesTo( |
| a, "n s n", new String[] {"n", "n_s", "s", "s_n", "n"}, new int[] {1, 0, 1, 0, 1}); |
| assertAnalyzesTo( |
| a, |
| "quick the fox", |
| new String[] {"quick", "quick_the", "the", "the_fox", "fox"}, |
| new int[] {1, 0, 1, 0, 1}); |
| |
| assertAnalyzesTo( |
| a, "n s s", new String[] {"n", "n_s", "s", "s_s", "s"}, new int[] {1, 0, 1, 0, 1}); |
| assertAnalyzesTo( |
| a, |
| "fox of the", |
| new String[] {"fox", "fox_of", "of", "of_the", "the"}, |
| new int[] {1, 0, 1, 0, 1}); |
| |
| assertAnalyzesTo(a, "s n n", new String[] {"s", "s_n", "n", "n"}, new int[] {1, 0, 1, 1}); |
| assertAnalyzesTo( |
| a, |
| "the quick brown", |
| new String[] {"the", "the_quick", "quick", "brown"}, |
| new int[] {1, 0, 1, 1}); |
| |
| assertAnalyzesTo( |
| a, "s n s", new String[] {"s", "s_n", "n", "n_s", "s"}, new int[] {1, 0, 1, 0, 1}); |
| assertAnalyzesTo( |
| a, |
| "the fox of", |
| new String[] {"the", "the_fox", "fox", "fox_of", "of"}, |
| new int[] {1, 0, 1, 0, 1}); |
| |
| assertAnalyzesTo( |
| a, "s s n", new String[] {"s", "s_s", "s", "s_n", "n"}, new int[] {1, 0, 1, 0, 1}); |
| assertAnalyzesTo( |
| a, |
| "of the fox", |
| new String[] {"of", "of_the", "the", "the_fox", "fox"}, |
| new int[] {1, 0, 1, 0, 1}); |
| |
| assertAnalyzesTo( |
| a, "s s s", new String[] {"s", "s_s", "s", "s_s", "s"}, new int[] {1, 0, 1, 0, 1}); |
| assertAnalyzesTo( |
| a, |
| "of the of", |
| new String[] {"of", "of_the", "the", "the_of", "of"}, |
| new int[] {1, 0, 1, 0, 1}); |
| a.close(); |
| } |
| |
| /** Test that CommonGramsFilter works correctly in case-insensitive mode */ |
| public void testCaseSensitive() throws Exception { |
| final String input = "How The s a brown s cow d like A B thing?"; |
| MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| wt.setReader(new StringReader(input)); |
| TokenFilter cgf = new CommonGramsFilter(wt, commonWords); |
| assertTokenStreamContents( |
| cgf, |
| new String[] { |
| "How", "The", "The_s", "s", "s_a", "a", "a_brown", "brown", "brown_s", "s", "s_cow", |
| "cow", "cow_d", "d", "d_like", "like", "A", "B", "thing?" |
| }); |
| } |
| |
| /** Test CommonGramsQueryFilter in the case that the last word is a stopword */ |
| public void testLastWordisStopWord() throws Exception { |
| final String input = "dog the"; |
| MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| wt.setReader(new StringReader(input)); |
| CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords); |
| TokenFilter nsf = new CommonGramsQueryFilter(cgf); |
| assertTokenStreamContents(nsf, new String[] {"dog_the"}); |
| } |
| |
| /** Test CommonGramsQueryFilter in the case that the first word is a stopword */ |
| public void testFirstWordisStopWord() throws Exception { |
| final String input = "the dog"; |
| MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| wt.setReader(new StringReader(input)); |
| CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords); |
| TokenFilter nsf = new CommonGramsQueryFilter(cgf); |
| assertTokenStreamContents(nsf, new String[] {"the_dog"}); |
| } |
| |
| /** Test CommonGramsQueryFilter in the case of a single (stop)word query */ |
| public void testOneWordQueryStopWord() throws Exception { |
| final String input = "the"; |
| MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| wt.setReader(new StringReader(input)); |
| CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords); |
| TokenFilter nsf = new CommonGramsQueryFilter(cgf); |
| assertTokenStreamContents(nsf, new String[] {"the"}); |
| } |
| |
| /** Test CommonGramsQueryFilter in the case of a single word query */ |
| public void testOneWordQuery() throws Exception { |
| final String input = "monster"; |
| MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| wt.setReader(new StringReader(input)); |
| CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords); |
| TokenFilter nsf = new CommonGramsQueryFilter(cgf); |
| assertTokenStreamContents(nsf, new String[] {"monster"}); |
| } |
| |
| /** Test CommonGramsQueryFilter when first and last words are stopwords. */ |
| public void TestFirstAndLastStopWord() throws Exception { |
| final String input = "the of"; |
| MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| wt.setReader(new StringReader(input)); |
| CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords); |
| TokenFilter nsf = new CommonGramsQueryFilter(cgf); |
| assertTokenStreamContents(nsf, new String[] {"the_of"}); |
| } |
| |
| /** blast some random strings through the analyzer */ |
| public void testRandomStrings() throws Exception { |
| Analyzer a = |
| new Analyzer() { |
| |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| CommonGramsFilter cgf = new CommonGramsFilter(t, commonWords); |
| return new TokenStreamComponents(t, cgf); |
| } |
| }; |
| |
| checkRandomData(random(), a, 200 * RANDOM_MULTIPLIER); |
| a.close(); |
| |
| Analyzer b = |
| new Analyzer() { |
| |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| CommonGramsFilter cgf = new CommonGramsFilter(t, commonWords); |
| return new TokenStreamComponents(t, new CommonGramsQueryFilter(cgf)); |
| } |
| }; |
| |
| checkRandomData(random(), b, 200 * RANDOM_MULTIPLIER); |
| b.close(); |
| } |
| } |