Index: modules/analysis/CHANGES.txt
===================================================================
--- modules/analysis/CHANGES.txt (revision 1079618)
+++ modules/analysis/CHANGES.txt (working copy)
@@ -4,6 +4,8 @@
API Changes
+ * LUCENE-1227, LUCENE-2947: NGramTokenizer is no longer limited to the first
+   1024 characters of its input, collapses runs of (configurable) whitespace,
+   and marks leading and trailing n-grams with a '_' boundary character.
+   (David Byrne)
+
* LUCENE-2413: Deprecated PatternAnalyzer in common/miscellaneous, in favor
of the pattern package (CharFilter, Tokenizer, TokenFilter). (Robert Muir)
Index: modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
===================================================================
--- modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java (revision 1079618)
+++ modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java (working copy)
@@ -19,6 +19,9 @@
import java.io.StringReader;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@@ -26,18 +29,16 @@
* Tests {@link NGramTokenizer} for correctness.
*/
public class NGramTokenizerTest extends BaseTokenStreamTestCase {
- private StringReader input;
@Override
public void setUp() throws Exception {
super.setUp();
- input = new StringReader("abcde");
}
public void testInvalidInput() throws Exception {
boolean gotException = false;
try {
- new NGramTokenizer(input, 2, 1);
+ new NGramTokenizer(new StringReader("foo"), 2, 1);
} catch (IllegalArgumentException e) {
gotException = true;
}
@@ -47,7 +48,7 @@
public void testInvalidInput2() throws Exception {
boolean gotException = false;
try {
- new NGramTokenizer(input, 0, 1);
+ new NGramTokenizer(new StringReader("foo"), 0, 1);
} catch (IllegalArgumentException e) {
gotException = true;
}
@@ -55,34 +56,79 @@
}
public void testUnigrams() throws Exception {
- NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
- assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
+ NGramTokenizer tokenizer = new NGramTokenizer(new StringReader("abcde"), 1, 1);
+ assertTokenStreamContents(tokenizer,
+ new String[]{"a","b","c","d","e"},
+ new int[]{0,1,2,3,4},
+ new int[]{1,2,3,4,5}, 5 /* abcde */);
}
public void testBigrams() throws Exception {
- NGramTokenizer tokenizer = new NGramTokenizer(input, 2, 2);
- assertTokenStreamContents(tokenizer, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5}, 5 /* abcde */);
+ NGramTokenizer tokenizer = new NGramTokenizer(new StringReader("abcde"), 2, 2);
+ assertTokenStreamContents(tokenizer,
+ new String[]{"_a","ab","bc","cd","de","e_"},
+ new int[]{0,0,1,2,3,4},
+ new int[]{1,2,3,4,5,5}, 5 /* abcde */);
}
public void testNgrams() throws Exception {
- NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 3);
+ NGramTokenizer tokenizer = new NGramTokenizer(new StringReader("abcde"), 1, 3);
assertTokenStreamContents(tokenizer,
- new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"},
- new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
- new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5},
+ new String[]{"a","b","c","d","e", "_a","ab","bc","cd","de","e_", "_ab","abc","bcd","cde","de_"},
+ new int[]{0,1,2,3,4, 0,0,1,2,3,4, 0,0,1,2,3},
+ new int[]{1,2,3,4,5, 1,2,3,4,5,5, 2,3,4,5,5},
5 /* abcde */
);
}
public void testOversizedNgrams() throws Exception {
- NGramTokenizer tokenizer = new NGramTokenizer(input, 6, 7);
+ NGramTokenizer tokenizer = new NGramTokenizer(new StringReader("abcde"), 7, 7);
assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */);
}
public void testReset() throws Exception {
- NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
- assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
- tokenizer.reset(new StringReader("abcde"));
- assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
+ NGramTokenizer tokenizer = new NGramTokenizer(new StringReader("abcd"));
+ assertTokenStreamContents(tokenizer,
+ new String[]{"_a","ab","bc","cd","d_", "_ab","abc","bcd","cd_", "_abc","abcd","bcd_"},
+ new int[]{0,0,1,2,3, 0,0,1,2, 0,0,1},
+ new int[]{1,2,3,4,4, 2,3,4,4, 3,4,4}, 4);
+ tokenizer.reset(new StringReader("abcd"));
+ assertTokenStreamContents(tokenizer,
+ new String[]{"_a","ab","bc","cd","d_", "_ab","abc","bcd","cd_", "_abc","abcd","bcd_"},
+ new int[]{0,0,1,2,3, 0,0,1,2, 0,0,1},
+ new int[]{1,2,3,4,4, 2,3,4,4, 3,4,4}, 4);
}
+
+ public void testInteriorWhitespace() throws Exception {
+ NGramTokenizer tokenizer = new NGramTokenizer(new StringReader("a\tb c"),2,2);
+ assertTokenStreamContents(tokenizer,
+ new String[]{"_a","a_","_b","b_","_c","c_"},
+ new int[]{0,0,1,2,3,7},
+ new int[]{1,2,3,4,8,8},
+ 8
+ );
+ }
+
+ public void testExteriorWhitespace() throws Exception {
+ NGramTokenizer tokenizer = new NGramTokenizer(new StringReader(" abc\n\n"),2,2);
+ assertTokenStreamContents(tokenizer,
+ new String[]{"__","_a","ab","bc","c_","__"},
+ new int[]{0,0,2,3,4,5},
+ new int[]{1,3,4,5,6,7},
+ 7
+ );
+ }
+
+ public void testCustomWhitespace() throws Exception {
+ Character[] w = { '|', ';' };
+ Set<Character> whitespace = new HashSet<Character>(Arrays.asList(w));
+ NGramTokenizer tokenizer = new NGramTokenizer(new StringReader("a||b;c"),2,3,whitespace);
+ assertTokenStreamContents(tokenizer,
+ new String[]{"_a","a_","_b","b_","_c","c_", "_a_","a_b","_b_","b_c","_c_"},
+ new int[]{0,0,1,3,4,5, 0,0,1,3,4},
+ new int[]{1,2,4,5,6,6, 2,4,5,6,6},
+ 6
+ );
+ }
+
}
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (revision 1079618)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (working copy)
@@ -17,40 +17,74 @@
* limitations under the License.
*/
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.Set;
+
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.AttributeSource;
-import java.io.IOException;
-import java.io.Reader;
-
/**
* Tokenizes the input into n-grams of the given size(s).
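+ * <p>
+ * Runs of whitespace are collapsed and the start and end of the input are
+ * marked with '_', so with minGram=2 and maxGram=2 the input "abcde" is
+ * tokenized as: _a, ab, bc, cd, de, e_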
*/
public final class NGramTokenizer extends Tokenizer {
- public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
- public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
- private int minGram, maxGram;
+ public static final int DEFAULT_MIN_NGRAM_SIZE = 2;
+ public static final int DEFAULT_MAX_NGRAM_SIZE = 4;
+
+ public static final Set<Character> DEFAULT_WHITESPACE_CHARS;
+ static {
+ Character[] whitespace = { ' ', '\t', '\n' };
+ DEFAULT_WHITESPACE_CHARS =
+ Collections.unmodifiableSet(new HashSet<Character>(Arrays.asList(whitespace)));
+ }
+
private int gramSize;
- private int pos = 0;
- private int inLen;
- private String inStr;
- private boolean started = false;
+ private int minGram;
+ private int maxGram;
+ private int tmp; // most recent char from nextChar() ('_' for collapsed whitespace), or -1 at end of input
+
+ private LinkedList<Integer> charsQueue; // chars of the current n-gram window
+ private LinkedList<Integer> offsetQueue; // offsets bounding the window; first/last are the gram's start/end offsets
+ private Set<Character> whitespace;
+
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+ private boolean collapse; // last read hit whitespace; the next read skips the rest of the run
/**
+ * Creates NGramTokenizer with default min and max n-grams.
+ * @param input {@link Reader} holding the input to be tokenized
+ */
+ public NGramTokenizer(Reader input) {
+ this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE, DEFAULT_WHITESPACE_CHARS);
+ }
+
+ /**
* Creates NGramTokenizer with given min and max n-grams.
* @param input {@link Reader} holding the input to be tokenized
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public NGramTokenizer(Reader input, int minGram, int maxGram) {
+ this(input, minGram, maxGram, DEFAULT_WHITESPACE_CHARS);
+ }
+
+ /**
+ * Creates NGramTokenizer with given min and max n-grams.
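+ * <p>
+ * For example, to treat '|' and ';' as whitespace (a usage sketch; {@code reader}
+ * stands for any {@link Reader} over the input):
+ * <pre>
+ * Set&lt;Character&gt; ws = new HashSet&lt;Character&gt;(Arrays.asList('|', ';'));
+ * NGramTokenizer tok = new NGramTokenizer(reader, 2, 3, ws);
+ * </pre>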
+ * @param input {@link Reader} holding the input to be tokenized
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ * @param whitespace whitespace characters to be collapsed together
+ */
+ public NGramTokenizer(Reader input, int minGram, int maxGram, Set<Character> whitespace) {
super(input);
- init(minGram, maxGram);
+ init(minGram, maxGram, whitespace);
}
/**
@@ -61,8 +95,20 @@
* @param maxGram the largest n-gram to generate
*/
public NGramTokenizer(AttributeSource source, Reader input, int minGram, int maxGram) {
+ this(source, input, minGram, maxGram, DEFAULT_WHITESPACE_CHARS);
+ }
+
+ /**
+ * Creates NGramTokenizer with given min and max n-grams.
+ * @param source {@link AttributeSource} to use
+ * @param input {@link Reader} holding the input to be tokenized
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ * @param whitespace whitespace characters to be collapsed together
+ */
+ public NGramTokenizer(AttributeSource source, Reader input, int minGram, int maxGram, Set<Character> whitespace) {
super(source, input);
- init(minGram, maxGram);
+ init(minGram, maxGram, whitespace);
}
/**
@@ -73,19 +119,23 @@
* @param maxGram the largest n-gram to generate
*/
public NGramTokenizer(AttributeFactory factory, Reader input, int minGram, int maxGram) {
- super(factory, input);
- init(minGram, maxGram);
+ this(factory, input, minGram, maxGram, DEFAULT_WHITESPACE_CHARS);
}
/**
- * Creates NGramTokenizer with default min and max n-grams.
+ * Creates NGramTokenizer with given min and max n-grams.
+ * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
* @param input {@link Reader} holding the input to be tokenized
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ * @param whitespace whitespace characters to be collapsed together
*/
- public NGramTokenizer(Reader input) {
- this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
+ public NGramTokenizer(AttributeFactory factory, Reader input, int minGram, int maxGram, Set<Character> whitespace) {
+ super(factory, input);
+ init(minGram, maxGram, whitespace);
}
- private void init(int minGram, int maxGram) {
+ private void init(int minGram, int maxGram, Set<Character> whitespace) {
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}
@@ -94,42 +144,66 @@
}
this.minGram = minGram;
this.maxGram = maxGram;
+ this.whitespace = whitespace;
+ gramSize = minGram;
+ resetPosition();
}
- /** Returns the next token in the stream, or null at EOS. */
@Override
public final boolean incrementToken() throws IOException {
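+ // A sliding window of gramSize chars is kept in charsQueue, with matching
+ // offsets in offsetQueue; each call advances the window by one char and
+ // emits it, and at end of input grows gramSize and rescans the input.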
- clearAttributes();
- if (!started) {
- started = true;
- gramSize = minGram;
- char[] chars = new char[1024];
- input.read(chars);
- inStr = new String(chars).trim(); // remove any trailing empty strings
- inLen = inStr.length();
+ if (gramSize > maxGram)
+ return false;
+
+ if (charsQueue.isEmpty()) {
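+ // prime the window: a leading '_' boundary plus the first gramSize-1 input chars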
+ charsQueue.offer((int)'_');
+ offsetQueue.offer(0);
+ for (int x = 0; x < gramSize-1; x++) {
+ int count = nextChar();
+ if (tmp == -1) {
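+ // input exhausted before the window filled: record the final offset and stop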
+ count += offsetQueue.getLast()-1;
+ offsetAtt.setOffset(correctOffset(count), correctOffset(count));
+ return false;
+ } else {
+ charsQueue.offer(tmp);
+ offsetQueue.offer(offsetQueue.getLast()+count);
+ }
+ }
+ } else if (tmp == -1) {
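+ // the previous pass hit end of input: grow the gram size and rescan
+ // from the start (this relies on the Reader supporting reset())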
+ ++gramSize;
+ resetPosition();
+ input.reset();
+ return incrementToken();
+ } else {
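+ // steady state: read one more char and slide the window right by one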
+ int count = nextChar();
+ if (tmp == -1) {
+ charsQueue.offer((int)'_');
+ offsetQueue.offer(offsetQueue.getLast()+count-1);
+ } else {
+ charsQueue.offer(tmp);
+ offsetQueue.offer(offsetQueue.getLast()+count);
+ }
+ charsQueue.poll();
+ offsetQueue.poll();
+
+ int dist = offsetQueue.get(1)-offsetQueue.getFirst()-1;
+ if (dist > 0)
+ offsetQueue.set(0, offsetQueue.getFirst()+dist);
}
- if (pos+gramSize > inLen) { // if we hit the end of the string
- pos = 0; // reset to beginning of string
- gramSize++; // increase n-gram size
- if (gramSize > maxGram) // we are done
- return false;
- if (pos+gramSize > inLen)
- return false;
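+ // materialize the current window into the term buffer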
+ final StringBuilder sb = new StringBuilder();
+ for (int i : charsQueue)
+ sb.append((char) i);
+
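+ // emit only grams that span at least one input position; zero-width grams are skipped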
+ if (offsetQueue.getFirst() != offsetQueue.getLast()) {
+ clearAttributes();
+ termAtt.setEmpty().append(sb.toString());
+ offsetAtt.setOffset(correctOffset(offsetQueue.getFirst()), correctOffset(offsetQueue.getLast()));
+ return true;
+ } else {
+ return incrementToken();
}
-
- int oldPos = pos;
- pos++;
- termAtt.setEmpty().append(inStr, oldPos, oldPos+gramSize);
- offsetAtt.setOffset(correctOffset(oldPos), correctOffset(oldPos+gramSize));
- return true;
- }
-
- @Override
- public final void end() {
- // set final offset
- final int finalOffset = inLen;
- this.offsetAtt.setOffset(finalOffset, finalOffset);
}
@Override
@@ -141,7 +215,35 @@
@Override
public void reset() throws IOException {
super.reset();
- started = false;
- pos = 0;
+ gramSize = minGram;
+ resetPosition();
}
+
+ private void resetPosition() {
+ charsQueue = new LinkedList<Integer>();
+ offsetQueue = new LinkedList<Integer>();
+ offsetQueue.offer(0);
+ tmp = 0;
+ collapse = false;
+ }
+
+ /** Reads the next char into {@code tmp}, collapsing each whitespace run to a single '_'; returns the number of chars consumed. */
+ private int nextChar() throws IOException {
+ int count = 0;
+ if (collapse) {
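+ // the previous read returned a collapsed '_'; consume the rest of the whitespace run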
+ collapse = false;
+ do {
+ tmp = input.read();
+ ++count;
+ } while (whitespace.contains((char) tmp));
+ } else {
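+ // fresh read: map the first char of a whitespace run to '_' and flag the collapse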
+ tmp = input.read();
+ ++count;
+ if (whitespace.contains((char) tmp)) {
+ tmp = '_';
+ collapse = true;
+ }
+ }
+ return count;
+ }
}