| Index: modules/analysis/CHANGES.txt |
| =================================================================== |
| --- modules/analysis/CHANGES.txt (revision 1079618) |
| +++ modules/analysis/CHANGES.txt (working copy) |
| @@ -4,6 +4,8 @@ |
| |
| API Changes |
| |
| + * LUCENE-1227,LUCENE-2947: NGramTokenizer now handles inputs of any length, collapses runs of whitespace into a single '_' placeholder, and emits boundary n-grams marking the start and end of the input. (David Byrne) |
| + |
| * LUCENE-2413: Deprecated PatternAnalyzer in common/miscellaneous, in favor |
| of the pattern package (CharFilter, Tokenizer, TokenFilter). (Robert Muir) |
| |
| Index: modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java |
| =================================================================== |
| --- modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java (revision 1079618) |
| +++ modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java (working copy) |
| @@ -19,6 +19,9 @@ |
| |
| |
| import java.io.StringReader; |
| +import java.util.Arrays; |
| +import java.util.HashSet; |
| +import java.util.Set; |
| |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| |
| @@ -26,18 +29,16 @@ |
| * Tests {@link NGramTokenizer} for correctness. |
| */ |
| public class NGramTokenizerTest extends BaseTokenStreamTestCase { |
| - private StringReader input; |
| |
| @Override |
| public void setUp() throws Exception { |
| super.setUp(); |
| - input = new StringReader("abcde"); |
| } |
| |
| public void testInvalidInput() throws Exception { |
| boolean gotException = false; |
| try { |
| - new NGramTokenizer(input, 2, 1); |
| + new NGramTokenizer(new StringReader("foo"), 2, 1); |
| } catch (IllegalArgumentException e) { |
| gotException = true; |
| } |
| @@ -47,7 +48,7 @@ |
| public void testInvalidInput2() throws Exception { |
| boolean gotException = false; |
| try { |
| - new NGramTokenizer(input, 0, 1); |
| + new NGramTokenizer(new StringReader("foo"), 0, 1); |
| } catch (IllegalArgumentException e) { |
| gotException = true; |
| } |
| @@ -55,34 +56,79 @@ |
| } |
| |
| public void testUnigrams() throws Exception { |
| - NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1); |
| - assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */); |
| + NGramTokenizer tokenizer = new NGramTokenizer(new StringReader("abcde"), 1, 1); |
| + assertTokenStreamContents(tokenizer, |
| + new String[]{"a","b","c","d","e"}, |
| + new int[]{0,1,2,3,4}, |
| + new int[]{1,2,3,4,5}, 5 /* abcde */); |
| } |
| |
| public void testBigrams() throws Exception { |
| - NGramTokenizer tokenizer = new NGramTokenizer(input, 2, 2); |
| - assertTokenStreamContents(tokenizer, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5}, 5 /* abcde */); |
| + NGramTokenizer tokenizer = new NGramTokenizer(new StringReader("abcde"), 2, 2); |
| + assertTokenStreamContents(tokenizer, |
| + new String[]{"_a","ab","bc","cd","de","e_"}, |
| + new int[]{0,0,1,2,3,4}, |
| + new int[]{1,2,3,4,5,5}, 5 /* abcde */); |
| } |
| |
| public void testNgrams() throws Exception { |
| - NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 3); |
| + NGramTokenizer tokenizer = new NGramTokenizer(new StringReader("abcde"), 1, 3); |
| assertTokenStreamContents(tokenizer, |
| - new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"}, |
| - new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2}, |
| - new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5}, |
| + new String[]{"a","b","c","d","e", "_a","ab","bc","cd","de","e_", "_ab","abc","bcd","cde","de_"}, |
| + new int[]{0,1,2,3,4, 0,0,1,2,3,4, 0,0,1,2,3}, |
| + new int[]{1,2,3,4,5, 1,2,3,4,5,5, 2,3,4,5,5}, |
| 5 /* abcde */ |
| ); |
| } |
| |
| public void testOversizedNgrams() throws Exception { |
| - NGramTokenizer tokenizer = new NGramTokenizer(input, 6, 7); |
| + NGramTokenizer tokenizer = new NGramTokenizer(new StringReader("abcde"), 7, 7); |
| assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */); |
| } |
| |
| public void testReset() throws Exception { |
| - NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1); |
| - assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */); |
| - tokenizer.reset(new StringReader("abcde")); |
| - assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */); |
| + NGramTokenizer tokenizer = new NGramTokenizer(new StringReader("abcd")); |
| + assertTokenStreamContents(tokenizer, |
| + new String[]{"_a","ab","bc","cd","d_", "_ab","abc","bcd","cd_", "_abc","abcd","bcd_"}, |
| + new int[]{0,0,1,2,3, 0,0,1,2, 0,0,1}, |
| + new int[]{1,2,3,4,4, 2,3,4,4, 3,4,4}, 4); |
| + tokenizer.reset(new StringReader("abcd")); |
| + assertTokenStreamContents(tokenizer, |
| + new String[]{"_a","ab","bc","cd","d_", "_ab","abc","bcd","cd_", "_abc","abcd","bcd_"}, |
| + new int[]{0,0,1,2,3, 0,0,1,2, 0,0,1}, |
| + new int[]{1,2,3,4,4, 2,3,4,4, 3,4,4}, 4); |
| } |
| + |
| + public void testInteriorWhitespace() throws Exception { |
| + NGramTokenizer tokenizer = new NGramTokenizer(new StringReader("a\tb c"),2,2); |
| + assertTokenStreamContents(tokenizer, |
| + new String[]{"_a","a_","_b","b_","_c","c_"}, |
| + new int[]{0,0,1,2,3,7}, |
| + new int[]{1,2,3,4,8,8}, |
| + 8 |
| + ); |
| + } |
| + |
| + public void testExteriorWhitespace() throws Exception { |
| + NGramTokenizer tokenizer = new NGramTokenizer(new StringReader(" abc\n\n"),2,2); |
| + assertTokenStreamContents(tokenizer, |
| + new String[]{"__","_a","ab","bc","c_","__"}, |
| + new int[]{0,0,2,3,4,5}, |
| + new int[]{1,3,4,5,6,7}, |
| + 7 |
| + ); |
| + } |
| + |
| + public void testCustomWhitespace() throws Exception { |
| + Character w[] = {'|',';'}; |
| + Set<Character> whitespace = new HashSet<Character>(Arrays.asList(w)); |
| + NGramTokenizer tokenizer = new NGramTokenizer(new StringReader("a||b;c"),2,3,whitespace); |
| + assertTokenStreamContents(tokenizer, |
| + new String[]{"_a","a_","_b","b_","_c","c_", "_a_","a_b","_b_","b_c","_c_"}, |
| + new int[]{0,0,1,3,4,5, 0,0,1,3,4}, |
| + new int[]{1,2,4,5,6,6, 2,4,5,6,6}, |
| + 6 |
| + ); |
| + } |
| + |
| } |
| Index: modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java |
| =================================================================== |
| --- modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (revision 1079618) |
| +++ modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (working copy) |
| @@ -17,40 +17,74 @@ |
| * limitations under the License. |
| */ |
| |
| +import java.io.IOException; |
| +import java.io.Reader; |
| +import java.util.Arrays; |
| +import java.util.HashSet; |
| +import java.util.LinkedList; |
| +import java.util.Set; |
| + |
| import org.apache.lucene.analysis.Tokenizer; |
| +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.util.AttributeSource; |
| |
| -import java.io.IOException; |
| -import java.io.Reader; |
| - |
| /** |
| * Tokenizes the input into n-grams of the given size(s). |
| */ |
| public final class NGramTokenizer extends Tokenizer { |
| - public static final int DEFAULT_MIN_NGRAM_SIZE = 1; |
| - public static final int DEFAULT_MAX_NGRAM_SIZE = 2; |
| |
| - private int minGram, maxGram; |
| + public static final int DEFAULT_MIN_NGRAM_SIZE = 2; |
| + public static final int DEFAULT_MAX_NGRAM_SIZE = 4; |
| + |
| + public static final Set<Character> DEFAULT_WHITESPACE_CHARS; |
| + static { |
| + Character whitespace[] = { ' ', '\t', '\n' }; |
| + DEFAULT_WHITESPACE_CHARS = new HashSet<Character>(Arrays.asList(whitespace)); |
| + } |
| + |
| private int gramSize; |
| - private int pos = 0; |
| - private int inLen; |
| - private String inStr; |
| - private boolean started = false; |
| + private int minGram; |
| + private int maxGram; |
| + private int tmp; |
| + |
| + private LinkedList<Integer> charsQueue; |
| + private LinkedList<Integer> offsetQueue; |
| + private Set<Character> whitespace; |
| + |
| + private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
| + private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); |
| |
| - private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
| - private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); |
| + private boolean collapse; |
| |
| /** |
| + * Creates NGramTokenizer with the default min/max n-gram sizes and the default whitespace character set. |
| + * @param input {@link Reader} holding the input to be tokenized |
| + */ |
| + public NGramTokenizer(Reader input) { |
| + this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE, DEFAULT_WHITESPACE_CHARS); |
| + } |
| + |
| + /** |
| * Creates NGramTokenizer with given min and max n-grams. |
| * @param input {@link Reader} holding the input to be tokenized |
| * @param minGram the smallest n-gram to generate |
| * @param maxGram the largest n-gram to generate |
| */ |
| public NGramTokenizer(Reader input, int minGram, int maxGram) { |
| + this(input, minGram, maxGram, DEFAULT_WHITESPACE_CHARS); |
| + } |
| + |
| + /** |
| + * Creates NGramTokenizer with given min and max n-grams. |
| + * @param input {@link Reader} holding the input to be tokenized |
| + * @param minGram the smallest n-gram to generate |
| + * @param maxGram the largest n-gram to generate |
| + * @param whitespace whitespace characters to be collapsed together |
| + */ |
| + public NGramTokenizer(Reader input, int minGram, int maxGram, Set<Character> whitespace) { |
| super(input); |
| - init(minGram, maxGram); |
| + init(minGram, maxGram, whitespace); |
| } |
| |
| /** |
| @@ -61,8 +95,20 @@ |
| * @param maxGram the largest n-gram to generate |
| */ |
| public NGramTokenizer(AttributeSource source, Reader input, int minGram, int maxGram) { |
| + this(source, input, minGram, maxGram, DEFAULT_WHITESPACE_CHARS); |
| + } |
| + |
| + /** |
| + * Creates NGramTokenizer with given min and max n-grams. |
| + * @param source {@link AttributeSource} to use |
| + * @param input {@link Reader} holding the input to be tokenized |
| + * @param minGram the smallest n-gram to generate |
| + * @param maxGram the largest n-gram to generate |
| + * @param whitespace whitespace characters to be collapsed together |
| + */ |
| + public NGramTokenizer(AttributeSource source, Reader input, int minGram, int maxGram, Set<Character> whitespace) { |
| super(source, input); |
| - init(minGram, maxGram); |
| + init(minGram, maxGram, whitespace); |
| } |
| |
| /** |
| @@ -73,19 +119,23 @@ |
| * @param maxGram the largest n-gram to generate |
| */ |
| public NGramTokenizer(AttributeFactory factory, Reader input, int minGram, int maxGram) { |
| - super(factory, input); |
| - init(minGram, maxGram); |
| + this(factory, input, minGram, maxGram, DEFAULT_WHITESPACE_CHARS); |
| } |
| |
| /** |
| - * Creates NGramTokenizer with default min and max n-grams. |
| + * Creates NGramTokenizer with given min and max n-grams. |
| + * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use |
| * @param input {@link Reader} holding the input to be tokenized |
| + * @param minGram the smallest n-gram to generate |
| + * @param maxGram the largest n-gram to generate |
| + * @param whitespace whitespace characters to be collapsed together |
| */ |
| - public NGramTokenizer(Reader input) { |
| - this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE); |
| + public NGramTokenizer(AttributeFactory factory, Reader input, int minGram, int maxGram, Set<Character> whitespace) { |
| + super(factory, input); |
| + init(minGram, maxGram, whitespace); |
| } |
| |
| - private void init(int minGram, int maxGram) { |
| + private void init(int minGram, int maxGram, Set<Character> whitespace) { |
| if (minGram < 1) { |
| throw new IllegalArgumentException("minGram must be greater than zero"); |
| } |
| @@ -94,42 +144,66 @@ |
| } |
| this.minGram = minGram; |
| this.maxGram = maxGram; |
| + this.whitespace = whitespace; |
| + this.maxGram = maxGram; |
| + this.minGram = minGram; |
| + gramSize = minGram; |
| + resetPosition(); |
| } |
| |
| - /** Returns the next token in the stream, or null at EOS. */ |
| @Override |
| public final boolean incrementToken() throws IOException { |
| - clearAttributes(); |
| - if (!started) { |
| - started = true; |
| - gramSize = minGram; |
| - char[] chars = new char[1024]; |
| - input.read(chars); |
| - inStr = new String(chars).trim(); // remove any trailing empty strings |
| - inLen = inStr.length(); |
| + if (gramSize > maxGram) |
| + return false; |
| + |
| + if (charsQueue.isEmpty()) { |
| + charsQueue.offer((int)'_'); |
| + offsetQueue.offer(0); |
| + for (int x = 0; x < gramSize-1; x++) { |
| + int count = nextChar(); |
| + if (tmp == -1) { |
| + count += offsetQueue.getLast()-1; |
| + offsetAtt.setOffset(correctOffset(count), correctOffset(count)); |
| + return false; |
| + } else { |
| + charsQueue.offer(tmp); |
| + offsetQueue.offer(offsetQueue.getLast()+count); |
| + } |
| + } |
| + } else if (tmp == -1) { |
| + ++gramSize; |
| + resetPosition(); |
| + input.reset(); |
| + return incrementToken(); |
| + } else { |
| + int count = nextChar(); |
| + if (tmp == -1) { |
| + charsQueue.offer((int)'_'); |
| + offsetQueue.offer(offsetQueue.getLast()+count-1); |
| + } else { |
| + charsQueue.offer(tmp); |
| + offsetQueue.offer(offsetQueue.getLast()+count); |
| + } |
| + charsQueue.poll(); |
| + offsetQueue.poll(); |
| + |
| + int dist = offsetQueue.get(1)-offsetQueue.getFirst()-1; |
| + if (dist > 0) |
| + offsetQueue.set(0, offsetQueue.getFirst()+dist); |
| } |
| |
| - if (pos+gramSize > inLen) { // if we hit the end of the string |
| - pos = 0; // reset to beginning of string |
| - gramSize++; // increase n-gram size |
| - if (gramSize > maxGram) // we are done |
| - return false; |
| - if (pos+gramSize > inLen) |
| - return false; |
| + final StringBuilder sb = new StringBuilder(); |
| + for (int i : charsQueue) |
| + sb.append((char) i); |
| + |
| + if (offsetQueue.getFirst() != offsetQueue.getLast()) { |
| + clearAttributes(); |
| + termAtt.setEmpty().append(sb.toString()); |
| + offsetAtt.setOffset(correctOffset(offsetQueue.getFirst()), correctOffset(offsetQueue.getLast())); |
| + return true; |
| + } else { |
| + return incrementToken(); |
| } |
| - |
| - int oldPos = pos; |
| - pos++; |
| - termAtt.setEmpty().append(inStr, oldPos, oldPos+gramSize); |
| - offsetAtt.setOffset(correctOffset(oldPos), correctOffset(oldPos+gramSize)); |
| - return true; |
| - } |
| - |
| - @Override |
| - public final void end() { |
| - // set final offset |
| - final int finalOffset = inLen; |
| - this.offsetAtt.setOffset(finalOffset, finalOffset); |
| } |
| |
| @Override |
| @@ -141,7 +215,35 @@ |
| @Override |
| public void reset() throws IOException { |
| super.reset(); |
| - started = false; |
| - pos = 0; |
| + gramSize = minGram; |
| + resetPosition(); |
| } |
| + |
| + private void resetPosition() { |
| + charsQueue = new LinkedList<Integer>(); |
| + offsetQueue = new LinkedList<Integer>(); |
| + offsetQueue.offer(0); |
| + tmp = 0; |
| + collapse = false; |
| + } |
| + |
| + /** Reads the next character into {@code tmp}, collapsing a run of whitespace into a single '_'; returns the number of characters consumed from the reader. */ |
| + private int nextChar() throws IOException { |
| + int count = 0; |
| + if (collapse) { |
| + collapse = false; |
| + do { |
| + tmp = input.read(); |
| + ++count; |
| + } while (whitespace.contains(new Character((char)tmp))); |
| + } else { |
| + tmp = input.read(); |
| + ++count; |
| + if (whitespace.contains(new Character((char)tmp))) { |
| + tmp = '_'; |
| + collapse = true; |
| + } |
| + } |
| + return count; |
| + } |
| } |