| Index: contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java |
| =================================================================== |
| --- contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (revision 669476) |
| +++ contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (working copy) |
| @@ -67,6 +67,11 @@ |
| private boolean outputUnigrams = true; |
| |
| /** |
| + * Sometimes you may want to make sure that a single unigram is output |
| + * when no shingles have been generated. */ |
| + private boolean outputUnigramIfNoNgrams = false; |
| + |
| + /** |
| * maximum shingle size (number of tokens) |
| */ |
| private int maxShingleSize; |
| @@ -124,6 +129,10 @@ |
| this.outputUnigrams = outputUnigrams; |
| } |
| |
| + public void setOutputUnigramIfNoNgrams(boolean outputUnigramIfNoNgrams) { |
| + this.outputUnigramIfNoNgrams = outputUnigramIfNoNgrams; |
| + } |
| + |
| /** |
| * Set the max shingle size (default: 2) |
| * |
| @@ -149,6 +158,9 @@ |
| } |
| } |
| |
| + private Token mostRecentInputToken = null; |
| + private boolean returnedAnyTokensYet = false; |
| + |
| /* (non-Javadoc) |
| * @see org.apache.lucene.analysis.TokenStream#next() |
| */ |
| @@ -159,9 +171,18 @@ |
| Token nextToken = null; |
| if ( ! outputBuf.isEmpty()) |
| { |
| - nextToken = (Token)outputBuf.remove(0); |
| + returnedAnyTokensYet = true; |
| + return (Token)outputBuf.remove(0); |
| } |
| - return nextToken; |
| + else if (outputUnigramIfNoNgrams && !returnedAnyTokensYet && mostRecentInputToken != null) |
| + { |
| + returnedAnyTokensYet = true; |
| + return mostRecentInputToken; |
| + } |
| + else |
| + { |
| + return null; |
| + } |
| } |
| |
| /** |
| @@ -182,6 +203,7 @@ |
| lastToken.startOffset())); |
| } |
| tokenBuf.add(lastToken); |
| + mostRecentInputToken = lastToken; |
| return getNextToken(); |
| } else { |
| return null; |
| Index: contrib/analyzers/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java |
| =================================================================== |
| --- contrib/analyzers/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (revision 669476) |
| +++ contrib/analyzers/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (working copy) |
| @@ -100,6 +100,23 @@ |
| 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 |
| }; |
| |
| + public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS = new Token[] { |
| + new Token("please divide", 0, 13), |
| + new Token("divide this", 7, 18), |
| + new Token("this sentence", 14, 27), |
| + new Token("sentence into", 19, 32), |
| + new Token("into shingles", 28, 39), |
| + }; |
| + |
| + public static final int[] BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS = new int[] { |
| + 1, 1, 1, 1, 1 |
| + }; |
| + |
| + public static final String[] BI_GRAM_TYPES_WITHOUT_UNIGRAMS = new String[] { |
| + "shingle", "shingle", "shingle", "shingle", "shingle" |
| + }; |
| + |
| + |
| public static final Token[] TRI_GRAM_TOKENS = new Token[] { |
| new Token("please", 0, 6), |
| new Token("please divide", 0, 13), |
| @@ -132,6 +149,24 @@ |
| }; |
| |
| |
| + public static final Token[] TEST_SINGLE_TOKEN = new Token[] { |
| + new Token("please", 0, 6), |
| + }; |
| + |
| + public static final Token[] SINGLE_TOKEN = new Token[] { |
| + new Token("please", 0, 6), |
| + }; |
| + |
| + public static final int[] SINGLE_TOKEN_INCREMENTS = new int[] { |
| + 1 |
| + }; |
| + |
| + public static final String[] SINGLE_TOKEN_TYPES = new String[] { |
| + "word" |
| + }; |
| + |
| + |
| + |
| protected void setUp() throws Exception { |
| super.setUp(); |
| testTokenWithHoles = new Token[] { |
| @@ -148,6 +183,7 @@ |
| /* |
| * Class under test for void ShingleFilter(TokenStream, int) |
| */ |
| + |
| public void testBiGramFilter() throws IOException { |
| this.shingleFilterTest(2, TEST_TOKEN, BI_GRAM_TOKENS, |
| BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES); |
| @@ -158,16 +194,50 @@ |
| BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES); |
| } |
| |
| + public void testBiGramFilterWithJustOneToken() throws IOException { |
| + this.shingleFilterTest(2, TEST_SINGLE_TOKEN, SINGLE_TOKEN, |
| + SINGLE_TOKEN_INCREMENTS, SINGLE_TOKEN_TYPES); |
| + } |
| + |
| + public void testBiGramFilterWithOutputUnigramIfNoNgramsOption() throws IOException { |
| + this.shingleFilterTestOutputUnigramIfNoNgrams(2, TEST_TOKEN, |
| + BI_GRAM_TOKENS_WITHOUT_UNIGRAMS, |
| + BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS, |
| + BI_GRAM_TYPES_WITHOUT_UNIGRAMS); |
| + } |
| + |
| + public void testBiGramFilterWithOutputUnigramIfNoNgramsOptionAndJustOneToken() throws IOException { |
| + this.shingleFilterTestOutputUnigramIfNoNgrams(2, TEST_SINGLE_TOKEN, |
| + SINGLE_TOKEN, SINGLE_TOKEN_INCREMENTS, |
| + SINGLE_TOKEN_TYPES); |
| + } |
| + |
| public void testTriGramFilter() throws IOException { |
| this.shingleFilterTest(3, TEST_TOKEN, TRI_GRAM_TOKENS, |
| TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES); |
| } |
| |
| + |
| protected void shingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare, |
| int[] positionIncrements, String[] types) |
| throws IOException { |
| + TokenStream filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize); |
| + this.shingleFilterTest(filter, tokensToShingle, tokensToCompare, positionIncrements, types); |
| + } |
| |
| - TokenStream filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize); |
| + protected void shingleFilterTestOutputUnigramIfNoNgrams(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare, |
| + int[] positionIncrements, String[] types) |
| + throws IOException { |
| + ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize); |
| + filter.setOutputUnigrams(false); |
| + filter.setOutputUnigramIfNoNgrams(true); |
| + this.shingleFilterTest(filter, tokensToShingle, tokensToCompare, positionIncrements, types); |
| + } |
| + |
| + protected void shingleFilterTest(TokenStream filter, Token[] tokensToShingle, Token[] tokensToCompare, |
| + int[] positionIncrements, String[] types) |
| + throws IOException { |
| + |
| Token token; |
| int i = 0; |
| |
| @@ -185,5 +255,7 @@ |
| assertEquals("Wrong type for token \"" + termText + "\"", types[i], token.type()); |
| i++; |
| } |
| + |
| + assertEquals("Wrong number of tokens returned", tokensToCompare.length, i); |
| } |
| } |