blob: 7ab80162230ca5365063b9110378d13f42884438 [file] [log] [blame]
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (revision 669476)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (working copy)
@@ -67,6 +67,11 @@
private boolean outputUnigrams = true;
/**
+ * When true and the filter would otherwise return no tokens at all, the most
+ * recent input token is returned once as a single unigram. */
+ private boolean outputUnigramIfNoNgrams = false;
+
+ /**
* maximum shingle size (number of tokens)
*/
private int maxShingleSize;
@@ -124,6 +129,10 @@
this.outputUnigrams = outputUnigrams;
}
+ public void setOutputUnigramIfNoNgrams(boolean outputUnigramIfNoNgrams) { // enables/disables the single-unigram fallback
+ this.outputUnigramIfNoNgrams = outputUnigramIfNoNgrams;
+ }
+
/**
* Set the max shingle size (default: 2)
*
@@ -149,6 +158,9 @@
}
}
+ private Token mostRecentInputToken = null; // last token added to tokenBuf; returned as the fallback unigram
+ private boolean returnedAnyTokensYet = false; // set to true just before any token is returned to the caller
+
/* (non-Javadoc)
* @see org.apache.lucene.analysis.TokenStream#next()
*/
@@ -159,9 +171,18 @@
Token nextToken = null;
if ( ! outputBuf.isEmpty())
{
- nextToken = (Token)outputBuf.remove(0);
+ returnedAnyTokensYet = true;
+ return (Token)outputBuf.remove(0);
}
- return nextToken;
+ else if (outputUnigramIfNoNgrams && !returnedAnyTokensYet && mostRecentInputToken != null)
+ {
+ returnedAnyTokensYet = true;
+ return mostRecentInputToken;
+ }
+ else
+ {
+ return null;
+ }
}
/**
@@ -182,6 +203,7 @@
lastToken.startOffset()));
}
tokenBuf.add(lastToken);
+ mostRecentInputToken = lastToken;
return getNextToken();
} else {
return null;
Index: contrib/analyzers/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
===================================================================
--- contrib/analyzers/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (revision 669476)
+++ contrib/analyzers/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (working copy)
@@ -100,6 +100,23 @@
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
};
+ public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS = new Token[] {
+ new Token("please divide", 0, 13),
+ new Token("divide this", 7, 18),
+ new Token("this sentence", 14, 27),
+ new Token("sentence into", 19, 32),
+ new Token("into shingles", 28, 39),
+ };
+
+ public static final int[] BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS = new int[] {
+ 1, 1, 1, 1, 1 // one increment per expected bigram (5 shingles, no unigrams)
+ };
+
+ public static final String[] BI_GRAM_TYPES_WITHOUT_UNIGRAMS = new String[] {
+ "shingle", "shingle", "shingle", "shingle", "shingle"
+ };
+
+
public static final Token[] TRI_GRAM_TOKENS = new Token[] {
new Token("please", 0, 6),
new Token("please divide", 0, 13),
@@ -132,6 +149,24 @@
};
+ public static final Token[] TEST_SINGLE_TOKEN = new Token[] {
+ new Token("please", 0, 6),
+ };
+
+ public static final Token[] SINGLE_TOKEN = new Token[] {
+ new Token("please", 0, 6),
+ };
+
+ public static final int[] SINGLE_TOKEN_INCREMENTS = new int[] {
+ 1
+ };
+
+ public static final String[] SINGLE_TOKEN_TYPES = new String[] {
+ "word"
+ };
+
+
+
protected void setUp() throws Exception {
super.setUp();
testTokenWithHoles = new Token[] {
@@ -148,6 +183,7 @@
/*
* Class under test for void ShingleFilter(TokenStream, int)
*/
+
public void testBiGramFilter() throws IOException {
this.shingleFilterTest(2, TEST_TOKEN, BI_GRAM_TOKENS,
BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES);
@@ -158,16 +194,50 @@
BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES);
}
+ public void testBigramFilterWithJustOneToken() throws IOException {
+ this.shingleFilterTest(2, TEST_SINGLE_TOKEN, SINGLE_TOKEN,
+ SINGLE_TOKEN_INCREMENTS, SINGLE_TOKEN_TYPES);
+ }
+
+ public void testBiGramFilterWithOutputUnigramIfNoNgramsOption() throws IOException {
+ this.shingleFilterTestOutputUnigramIfNoNgrams(2, TEST_TOKEN,
+ BI_GRAM_TOKENS_WITHOUT_UNIGRAMS,
+ BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS,
+ BI_GRAM_TYPES_WITHOUT_UNIGRAMS);
+ }
+
+ public void testBiGramFilterWithOutputUnigramIfNoNgramsOptionAndJustOneToken() throws IOException {
+ this.shingleFilterTestOutputUnigramIfNoNgrams(2, TEST_SINGLE_TOKEN,
+ SINGLE_TOKEN, SINGLE_TOKEN_INCREMENTS,
+ SINGLE_TOKEN_TYPES);
+ }
+
public void testTriGramFilter() throws IOException {
this.shingleFilterTest(3, TEST_TOKEN, TRI_GRAM_TOKENS,
TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES);
}
+
protected void shingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
int[] positionIncrements, String[] types)
throws IOException {
+ TokenStream filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
+ this.shingleFilterTest(filter, tokensToShingle, tokensToCompare, positionIncrements, types);
+ }
- TokenStream filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
+ protected void shingleFilterTestOutputUnigramIfNoNgrams(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
+ int[] positionIncrements, String[] types)
+ throws IOException {
+ ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
+ filter.setOutputUnigrams(false);
+ filter.setOutputUnigramIfNoNgrams(true);
+ this.shingleFilterTest(filter, tokensToShingle, tokensToCompare, positionIncrements, types);
+ }
+
+ protected void shingleFilterTest(TokenStream filter, Token[] tokensToShingle, Token[] tokensToCompare,
+ int[] positionIncrements, String[] types)
+ throws IOException {
+
Token token;
int i = 0;
@@ -185,5 +255,7 @@
assertEquals("Wrong type for token \"" + termText + "\"", types[i], token.type());
i++;
}
+
+ assertEquals("Wrong number of tokens returned", tokensToCompare.length, i);
}
}