| Index: modules/analysis/CHANGES.txt |
| =================================================================== |
| --- modules/analysis/CHANGES.txt (revision 1079618) |
| +++ modules/analysis/CHANGES.txt (working copy) |
| @@ -4,6 +4,8 @@ |
| |
| API Changes |
| |
| + * LUCENE-1227,LUCENE-2947: NGramTokenizer now handles inputs of any length, collapses runs of whitespace into a single '_' placeholder, and emits boundary n-grams marking the start and end of the input. (David Byrne) |
| + |
| * LUCENE-2413: Deprecated PatternAnalyzer in common/miscellaneous, in favor |
| of the pattern package (CharFilter, Tokenizer, TokenFilter). (Robert Muir) |
| |
| Index: modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java |
| =================================================================== |
| --- modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java (revision 1079618) |
| +++ modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java (working copy) |
| @@ -19,6 +19,9 @@ |
| |
| |
| import java.io.StringReader; |
| +import java.util.Arrays; |
| +import java.util.HashSet; |
| +import java.util.Set; |
| |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| |
| @@ -26,18 +29,16 @@ |
| * Tests {@link NGramTokenizer} for correctness. |
| */ |
| public class NGramTokenizerTest extends BaseTokenStreamTestCase { |
| - private StringReader input; |
| |
| @Override |
| public void setUp() throws Exception { |
| super.setUp(); |
| - input = new StringReader("abcde"); |
| } |
| |
| public void testInvalidInput() throws Exception { |
| boolean gotException = false; |
| try { |
| - new NGramTokenizer(input, 2, 1); |
| + new NGramTokenizer(new StringReader("foo"), 2, 1); |
| } catch (IllegalArgumentException e) { |
| gotException = true; |
| } |
| @@ -47,7 +48,7 @@ |
| public void testInvalidInput2() throws Exception { |
| boolean gotException = false; |
| try { |
| - new NGramTokenizer(input, 0, 1); |
| + new NGramTokenizer(new StringReader("foo"), 0, 1); |
| } catch (IllegalArgumentException e) { |
| gotException = true; |
| } |
| @@ -55,34 +56,79 @@ |
| } |
| |
| public void testUnigrams() throws Exception { |
| - NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1); |
| - assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */); |
| + NGramTokenizer tokenizer = new NGramTokenizer(new StringReader("abcde"), 1, 1); |
| + assertTokenStreamContents(tokenizer, |
| + new String[]{"a","b","c","d","e"}, |
| + new int[]{0,1,2,3,4}, |
| + new int[]{1,2,3,4,5}, 5 /* abcde */); |
| } |
| |
| public void testBigrams() throws Exception { |
| - NGramTokenizer tokenizer = new NGramTokenizer(input, 2, 2); |
| - assertTokenStreamContents(tokenizer, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5}, 5 /* abcde */); |
| + NGramTokenizer tokenizer = new NGramTokenizer(new StringReader("abcde"), 2, 2); |
| + assertTokenStreamContents(tokenizer, |
| + new String[]{"_a","ab","bc","cd","de","e_"}, |
| + new int[]{0,0,1,2,3,4}, |
| + new int[]{1,2,3,4,5,5}, 5 /* abcde */); |
| } |
| |
| public void testNgrams() throws Exception { |
| - NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 3); |
| + NGramTokenizer tokenizer = new NGramTokenizer(new StringReader("abcde"), 1, 3); |
| assertTokenStreamContents(tokenizer, |
| - new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"}, |
| - new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2}, |
| - new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5}, |
| + new String[]{"a","b","c","d","e", "_a","ab","bc","cd","de","e_", "_ab","abc","bcd","cde","de_"}, |
| + new int[]{0,1,2,3,4, 0,0,1,2,3,4, 0,0,1,2,3}, |
| + new int[]{1,2,3,4,5, 1,2,3,4,5,5, 2,3,4,5,5}, |
| 5 /* abcde */ |
| ); |
| } |
| |
| public void testOversizedNgrams() throws Exception { |
| - NGramTokenizer tokenizer = new NGramTokenizer(input, 6, 7); |
| + NGramTokenizer tokenizer = new NGramTokenizer(new StringReader("abcde"), 7, 7); |
| assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */); |
| } |
| |
| public void testReset() throws Exception { |
| - NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1); |
| - assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */); |
| - tokenizer.reset(new StringReader("abcde")); |
| - assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */); |
| + NGramTokenizer tokenizer = new NGramTokenizer(new StringReader("abcd")); |
| + assertTokenStreamContents(tokenizer, |
| + new String[]{"_a","ab","bc","cd","d_", "_ab","abc","bcd","cd_", "_abc","abcd","bcd_"}, |
| + new int[]{0,0,1,2,3, 0,0,1,2, 0,0,1}, |
| + new int[]{1,2,3,4,4, 2,3,4,4, 3,4,4}, 4); |
| + tokenizer.reset(new StringReader("abcd")); |
| + assertTokenStreamContents(tokenizer, |
| + new String[]{"_a","ab","bc","cd","d_", "_ab","abc","bcd","cd_", "_abc","abcd","bcd_"}, |
| + new int[]{0,0,1,2,3, 0,0,1,2, 0,0,1}, |
| + new int[]{1,2,3,4,4, 2,3,4,4, 3,4,4}, 4); |
| } |
| + |
| + public void testInteriorWhitespace() throws Exception { |
| + NGramTokenizer tokenizer = new NGramTokenizer(new StringReader("a\tb c"),2,2); |
| + assertTokenStreamContents(tokenizer, |
| + new String[]{"_a","a_","_b","b_","_c","c_"}, |
| + new int[]{0,0,1,2,3,7}, |
| + new int[]{1,2,3,4,8,8}, |
| + 8 |
| + ); |
| + } |
| + |
| + public void testExteriorWhitespace() throws Exception { |
| + NGramTokenizer tokenizer = new NGramTokenizer(new StringReader(" abc\n\n"),2,2); |
| + assertTokenStreamContents(tokenizer, |
| + new String[]{"__","_a","ab","bc","c_","__"}, |
| + new int[]{0,0,2,3,4,5}, |
| + new int[]{1,3,4,5,6,7}, |
| + 7 |
| + ); |
| + } |
| + |
| + public void testCustomWhitespace() throws Exception { |
| + Character w[] = {'|',';'}; |
| + Set<Character> whitespace = new HashSet<Character>(Arrays.asList(w)); |
| + NGramTokenizer tokenizer = new NGramTokenizer(new StringReader("a||b;c"),2,3,whitespace); |
| + assertTokenStreamContents(tokenizer, |
| + new String[]{"_a","a_","_b","b_","_c","c_", "_a_","a_b","_b_","b_c","_c_"}, |
| + new int[]{0,0,1,3,4,5, 0,0,1,3,4}, |
| + new int[]{1,2,4,5,6,6, 2,4,5,6,6}, |
| + 6 |
| + ); |
| + } |
| + |
| } |
| Index: modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java |
| =================================================================== |
| --- modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (revision 1079618) |
| +++ modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (working copy) |
| @@ -17,40 +17,74 @@ |
| * limitations under the License. |
| */ |
| |
| +import java.io.IOException; |
| +import java.io.Reader; |
| +import java.util.Arrays; |
| +import java.util.HashSet; |
| +import java.util.LinkedList; |
| +import java.util.Set; |
| + |
| import org.apache.lucene.analysis.Tokenizer; |
| +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.util.AttributeSource; |
| |
| -import java.io.IOException; |
| -import java.io.Reader; |
| - |
| /** |
| * Tokenizes the input into n-grams of the given size(s). |
| */ |
| public final class NGramTokenizer extends Tokenizer { |
| - public static final int DEFAULT_MIN_NGRAM_SIZE = 1; |
| - public static final int DEFAULT_MAX_NGRAM_SIZE = 2; |
| |
| - private int minGram, maxGram; |
| + public static final int DEFAULT_MIN_NGRAM_SIZE = 2; |
| + public static final int DEFAULT_MAX_NGRAM_SIZE = 4; |
| + |
| + public static final Set<Character> DEFAULT_WHITESPACE_CHARS; |
| + static { |
| + Character whitespace[] = { ' ', '\t', '\n' }; |
| + DEFAULT_WHITESPACE_CHARS = new HashSet<Character>(Arrays.asList(whitespace)); |
| + } |
| + |
| private int gramSize; |
| - private int pos = 0; |
| - private int inLen; |
| - private String inStr; |
| - private boolean started = false; |
| + private int minGram; |
| + private int maxGram; |
| + private int tmp; |
| + |
| + private LinkedList<Integer> charsQueue; |
| + private LinkedList<Integer> offsetQueue; |
| + private Set<Character> whitespace; |
| + |
| + private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
| + private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); |
| |
| - private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
| - private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); |
| + private boolean collapse; |
| |
| /** |
| + * Creates NGramTokenizer with the default min/max n-gram sizes and the default whitespace character set. |
| + * @param input {@link Reader} holding the input to be tokenized |
| + */ |
| + public NGramTokenizer(Reader input) { |
| + this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE, DEFAULT_WHITESPACE_CHARS); |
| + } |
| + |
| + /** |
| * Creates NGramTokenizer with given min and max n-grams. |
| * @param input {@link Reader} holding the input to be tokenized |
| * @param minGram the smallest n-gram to generate |
| * @param maxGram the largest n-gram to generate |
| */ |
| public NGramTokenizer(Reader input, int minGram, int maxGram) { |
| + this(input, minGram, maxGram, DEFAULT_WHITESPACE_CHARS); |
| + } |
| + |
| + /** |
| + * Creates NGramTokenizer with given min and max n-grams. |
| + * @param input {@link Reader} holding the input to be tokenized |
| + * @param minGram the smallest n-gram to generate |
| + * @param maxGram the largest n-gram to generate |
| + * @param whitespace whitespace characters to be collapsed together |
| + */ |
| + public NGramTokenizer(Reader input, int minGram, int maxGram, Set<Character> whitespace) { |
| super(input); |
| - init(minGram, maxGram); |
| + init(minGram, maxGram, whitespace); |
| } |
| |
| /** |
| @@ -61,8 +95,20 @@ |
| * @param maxGram the largest n-gram to generate |
| */ |
| public NGramTokenizer(AttributeSource source, Reader input, int minGram, int maxGram) { |
| + this(source, input, minGram, maxGram, DEFAULT_WHITESPACE_CHARS); |
| + } |
| + |
| + /** |
| + * Creates NGramTokenizer with given min and max n-grams. |
| + * @param source {@link AttributeSource} to use |
| + * @param input {@link Reader} holding the input to be tokenized |
| + * @param minGram the smallest n-gram to generate |
| + * @param maxGram the largest n-gram to generate |
| + * @param whitespace whitespace characters to be collapsed together |
| + */ |
| + public NGramTokenizer(AttributeSource source, Reader input, int minGram, int maxGram, Set<Character> whitespace) { |
| super(source, input); |
| - init(minGram, maxGram); |
| + init(minGram, maxGram, whitespace); |
| } |
| |
| /** |
| @@ -73,19 +119,23 @@ |
| * @param maxGram the largest n-gram to generate |
| */ |
| public NGramTokenizer(AttributeFactory factory, Reader input, int minGram, int maxGram) { |
| - super(factory, input); |
| - init(minGram, maxGram); |
| + this(factory, input, minGram, maxGram, DEFAULT_WHITESPACE_CHARS); |
| } |
| |
| /** |
| - * Creates NGramTokenizer with default min and max n-grams. |
| + * Creates NGramTokenizer with given min and max n-grams. |
| + * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use |
| * @param input {@link Reader} holding the input to be tokenized |
| + * @param minGram the smallest n-gram to generate |
| + * @param maxGram the largest n-gram to generate |
| + * @param whitespace whitespace characters to be collapsed together |
| */ |
| - public NGramTokenizer(Reader input) { |
| - this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE); |
| + public NGramTokenizer(AttributeFactory factory, Reader input, int minGram, int maxGram, Set<Character> whitespace) { |
| + super(factory, input); |
| + init(minGram, maxGram, whitespace); |
| } |
| |
| - private void init(int minGram, int maxGram) { |
| + private void init(int minGram, int maxGram, Set<Character> whitespace) { |
| if (minGram < 1) { |
| throw new IllegalArgumentException("minGram must be greater than zero"); |
| } |
| @@ -94,42 +144,66 @@ |
| } |
| this.minGram = minGram; |
| this.maxGram = maxGram; |
| + this.whitespace = whitespace; |
| + this.maxGram = maxGram; |
| + this.minGram = minGram; |
| + gramSize = minGram; |
| + resetPosition(); |
| } |
| |
| - /** Returns the next token in the stream, or null at EOS. */ |
| @Override |
| public final boolean incrementToken() throws IOException { |
| - clearAttributes(); |
| - if (!started) { |
| - started = true; |
| - gramSize = minGram; |
| - char[] chars = new char[1024]; |
| - input.read(chars); |
| - inStr = new String(chars).trim(); // remove any trailing empty strings |
| - inLen = inStr.length(); |
| + if (gramSize > maxGram) |
| + return false; |
| + |
| + if (charsQueue.isEmpty()) { |
| + charsQueue.offer((int)'_'); |
| + offsetQueue.offer(0); |
| + for (int x = 0; x < gramSize-1; x++) { |
| + int count = nextChar(); |
| + if (tmp == -1) { |
| + count += offsetQueue.getLast()-1; |
| + offsetAtt.setOffset(correctOffset(count), correctOffset(count)); |
| + return false; |
| + } else { |
| + charsQueue.offer(tmp); |
| + offsetQueue.offer(offsetQueue.getLast()+count); |
| + } |
| + } |
| + } else if (tmp == -1) { |
| + ++gramSize; |
| + resetPosition(); |
| + input.reset(); |
| + return incrementToken(); |
| + } else { |
| + int count = nextChar(); |
| + if (tmp == -1) { |
| + charsQueue.offer((int)'_'); |
| + offsetQueue.offer(offsetQueue.getLast()+count-1); |
| + } else { |
| + charsQueue.offer(tmp); |
| + offsetQueue.offer(offsetQueue.getLast()+count); |
| + } |
| + charsQueue.poll(); |
| + offsetQueue.poll(); |
| + |
| + int dist = offsetQueue.get(1)-offsetQueue.getFirst()-1; |
| + if (dist > 0) |
| + offsetQueue.set(0, offsetQueue.getFirst()+dist); |
| } |
| |
| - if (pos+gramSize > inLen) { // if we hit the end of the string |
| - pos = 0; // reset to beginning of string |
| - gramSize++; // increase n-gram size |
| - if (gramSize > maxGram) // we are done |
| - return false; |
| - if (pos+gramSize > inLen) |
| - return false; |
| + final StringBuilder sb = new StringBuilder(); |
| + for (int i : charsQueue) |
| + sb.append((char) i); |
| + |
| + if (offsetQueue.getFirst() != offsetQueue.getLast()) { |
| + clearAttributes(); |
| + termAtt.setEmpty().append(sb.toString()); |
| + offsetAtt.setOffset(correctOffset(offsetQueue.getFirst()), correctOffset(offsetQueue.getLast())); |
| + return true; |
| + } else { |
| + return incrementToken(); |
| } |
| - |
| - int oldPos = pos; |
| - pos++; |
| - termAtt.setEmpty().append(inStr, oldPos, oldPos+gramSize); |
| - offsetAtt.setOffset(correctOffset(oldPos), correctOffset(oldPos+gramSize)); |
| - return true; |
| - } |
| - |
| - @Override |
| - public final void end() { |
| - // set final offset |
| - final int finalOffset = inLen; |
| - this.offsetAtt.setOffset(finalOffset, finalOffset); |
| } |
| |
| @Override |
| @@ -141,7 +215,35 @@ |
| @Override |
| public void reset() throws IOException { |
| super.reset(); |
| - started = false; |
| - pos = 0; |
| + gramSize = minGram; |
| + resetPosition(); |
| } |
| + |
| + private void resetPosition() { |
| + charsQueue = new LinkedList<Integer>(); |
| + offsetQueue = new LinkedList<Integer>(); |
| + offsetQueue.offer(0); |
| + tmp = 0; |
| + collapse = false; |
| + } |
| + |
| + /** Reads the next character into {@code tmp}, collapsing a run of whitespace into a single '_'; returns the number of characters consumed from the reader. */ |
| + private int nextChar() throws IOException { |
| + int count = 0; |
| + if (collapse) { |
| + collapse = false; |
| + do { |
| + tmp = input.read(); |
| + ++count; |
| + } while (whitespace.contains(new Character((char)tmp))); |
| + } else { |
| + tmp = input.read(); |
| + ++count; |
| + if (whitespace.contains(new Character((char)tmp))) { |
| + tmp = '_'; |
| + collapse = true; |
| + } |
| + } |
| + return count; |
| + } |
| } |