// lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.analysis.shingle;


 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.store.Directory;

 /**
  * A test class for ShingleAnalyzerWrapper as regards queries and scoring.
  */
 public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
   private Analyzer analyzer;
   private IndexSearcher searcher;
   private IndexReader reader;
   private Directory directory;

   /**
    * Set up a new index in RAM with three test phrases and the supplied Analyzer.
    *
    * @throws Exception if an error occurs with index writer or searcher
    */
   @Override
   public void setUp() throws Exception {
     super.setUp();
     analyzer = new ShingleAnalyzerWrapper(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 2);
     directory = newDirectory();
     IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(analyzer));

     Document doc;
     doc = new Document();
     doc.add(new TextField("content", "please divide this sentence into shingles", Field.Store.YES));
     writer.addDocument(doc);

     doc = new Document();
     doc.add(new TextField("content", "just another test sentence", Field.Store.YES));
     writer.addDocument(doc);

     doc = new Document();
     doc.add(new TextField("content", "a sentence which contains no test", Field.Store.YES));
     writer.addDocument(doc);

     writer.close();

     reader = DirectoryReader.open(directory);
     searcher = newSearcher(reader);
   }

   @Override
   public void tearDown() throws Exception {
     reader.close();
     directory.close();
     analyzer.close();
     super.tearDown();
   }

   protected void compareRanks(ScoreDoc[] hits, int[] ranks) throws Exception {
     assertEquals(ranks.length, hits.length);
     for (int i = 0; i < ranks.length; i++) {
       assertEquals(ranks[i], hits[i].doc);
     }
   }

   /*
    * This shows how to construct a phrase query containing shingles.
    */
   public void testShingleAnalyzerWrapperPhraseQuery() throws Exception {
     PhraseQuery.Builder builder = new PhraseQuery.Builder();
     try (TokenStream ts = analyzer.tokenStream("content", "this sentence")) {
       int j = -1;

       PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
       CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

       ts.reset();
       while (ts.incrementToken()) {
         j += posIncrAtt.getPositionIncrement();
         String termText = termAtt.toString();
         builder.add(new Term("content", termText), j);
       }
       ts.end();
     }

     PhraseQuery q = builder.build();
     ScoreDoc[] hits = searcher.search(q, 1000).scoreDocs;
     int[] ranks = new int[] { 0 };
     compareRanks(hits, ranks);
   }

   /*
    * How to construct a boolean query with shingles. A query like this will
    * implicitly score those documents higher that contain the words in the query
    * in the right order and adjacent to each other.
    */
   public void testShingleAnalyzerWrapperBooleanQuery() throws Exception {
     BooleanQuery.Builder q = new BooleanQuery.Builder();

     try (TokenStream ts = analyzer.tokenStream("content", "test sentence")) {
       CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

       ts.reset();
       while (ts.incrementToken()) {
         String termText =  termAtt.toString();
         q.add(new TermQuery(new Term("content", termText)),
             BooleanClause.Occur.SHOULD);
       }
       ts.end();
     }

     ScoreDoc[] hits = searcher.search(q.build(), 1000).scoreDocs;
     int[] ranks = new int[] { 1, 2, 0 };
     compareRanks(hits, ranks);
   }

   public void testReusableTokenStream() throws Exception {
     Analyzer a = new ShingleAnalyzerWrapper(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 2);
     assertAnalyzesTo(a, "please divide into shingles",
         new String[] { "please", "please divide", "divide", "divide into", "into", "into shingles", "shingles" },
         new int[] { 0, 0, 7, 7, 14, 14, 19 },
         new int[] { 6, 13, 13, 18, 18, 27, 27 },
         new int[] { 1, 0, 1, 0, 1, 0, 1 });
     assertAnalyzesTo(a, "divide me up again",
         new String[] { "divide", "divide me", "me", "me up", "up", "up again", "again" },
         new int[] { 0, 0, 7, 7, 10, 10, 13 },
         new int[] { 6, 9, 9, 12, 12, 18, 18 },
         new int[] { 1, 0, 1, 0, 1, 0, 1 });
     a.close();
   }

   public void testNonDefaultMinShingleSize() throws Exception {
     ShingleAnalyzerWrapper analyzer
       = new ShingleAnalyzerWrapper(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 4);
     assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
                           new String[] { "please",   "please divide this",   "please divide this sentence",
                                          "divide",   "divide this sentence", "divide this sentence into",
                                          "this",     "this sentence into",   "this sentence into shingles",
                                          "sentence", "sentence into shingles",
                                          "into",
                                          "shingles" },
                           new int[] { 0,  0,  0,  7,  7,  7, 14, 14, 14, 19, 19, 28, 33 },
                           new int[] { 6, 18, 27, 13, 27, 32, 18, 32, 41, 27, 41, 32, 41 },
                           new int[] { 1,  0,  0,  1,  0,  0,  1,  0,  0,  1,  0,  1,  1 });
     analyzer.close();

     analyzer = new ShingleAnalyzerWrapper(
         new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 4,
         ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
     assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
                           new String[] { "please divide this",   "please divide this sentence",
                                          "divide this sentence", "divide this sentence into",
                                          "this sentence into",   "this sentence into shingles",
                                          "sentence into shingles" },
                           new int[] {  0,  0,  7,  7, 14, 14, 19 },
                           new int[] { 18, 27, 27, 32, 32, 41, 41 },
                           new int[] {  1,  0,  1,  0,  1,  0,  1 });
     analyzer.close();
   }

   public void testNonDefaultMinAndSameMaxShingleSize() throws Exception {
     ShingleAnalyzerWrapper analyzer
       = new ShingleAnalyzerWrapper(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 3);
     assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
                           new String[] { "please",   "please divide this",
                                          "divide",   "divide this sentence",
                                          "this",     "this sentence into",
                                          "sentence", "sentence into shingles",
                                          "into",
                                          "shingles" },
                           new int[] { 0,  0,  7,  7, 14, 14, 19, 19, 28, 33 },
                           new int[] { 6, 18, 13, 27, 18, 32, 27, 41, 32, 41 },
                           new int[] { 1,  0,  1,  0,  1,  0,  1,  0,  1,  1 });
     analyzer.close();

     analyzer = new ShingleAnalyzerWrapper(
         new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 3,
         ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
     assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
                           new String[] { "please divide this",
                                          "divide this sentence",
                                          "this sentence into",
                                          "sentence into shingles" },
                           new int[] {  0,  7, 14, 19 },
                           new int[] { 18, 27, 32, 41 },
                           new int[] {  1,  1,  1,  1 });
     analyzer.close();
   }

   public void testNoTokenSeparator() throws Exception {
     ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
         new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
         "", true, false,
         ShingleFilter.DEFAULT_FILLER_TOKEN);
     assertAnalyzesTo(analyzer, "please divide into shingles",
                           new String[] { "please", "pleasedivide",
                                          "divide", "divideinto",
                                          "into", "intoshingles",
                                          "shingles" },
                           new int[] { 0,  0,  7,  7, 14, 14, 19 },
                           new int[] { 6, 13, 13, 18, 18, 27, 27 },
                           new int[] { 1,  0,  1,  0,  1,  0,  1 });
     analyzer.close();

     analyzer = new ShingleAnalyzerWrapper(
         new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
         "", false, false,
         ShingleFilter.DEFAULT_FILLER_TOKEN);
     assertAnalyzesTo(analyzer, "please divide into shingles",
                           new String[] { "pleasedivide",
                                          "divideinto",
                                          "intoshingles" },
                           new int[] {  0,  7, 14 },
                           new int[] { 13, 18, 27 },
                           new int[] {  1,  1,  1 });
     analyzer.close();
   }

   public void testNullTokenSeparator() throws Exception {
     ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
         new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
         null, true, false,
         ShingleFilter.DEFAULT_FILLER_TOKEN);
     assertAnalyzesTo(analyzer, "please divide into shingles",
                           new String[] { "please", "pleasedivide",
                                          "divide", "divideinto",
                                          "into", "intoshingles",
                                          "shingles" },
                           new int[] { 0,  0,  7,  7, 14, 14, 19 },
                           new int[] { 6, 13, 13, 18, 18, 27, 27 },
                           new int[] { 1,  0,  1,  0,  1,  0,  1 });
     analyzer.close();

     analyzer = new ShingleAnalyzerWrapper(
         new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
         "", false, false,
         ShingleFilter.DEFAULT_FILLER_TOKEN);
     assertAnalyzesTo(analyzer, "please divide into shingles",
                           new String[] { "pleasedivide",
                                          "divideinto",
                                          "intoshingles" },
                           new int[] {  0,  7, 14 },
                           new int[] { 13, 18, 27 },
                           new int[] {  1,  1,  1 });
     analyzer.close();
   }

   public void testAltTokenSeparator() throws Exception {
     ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
         new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
         "<SEP>", true, false,
         ShingleFilter.DEFAULT_FILLER_TOKEN);
     assertAnalyzesTo(analyzer, "please divide into shingles",
                           new String[] { "please", "please<SEP>divide",
                                          "divide", "divide<SEP>into",
                                          "into", "into<SEP>shingles",
                                          "shingles" },
                           new int[] { 0,  0,  7,  7, 14, 14, 19 },
                           new int[] { 6, 13, 13, 18, 18, 27, 27 },
                           new int[] { 1,  0,  1,  0,  1,  0,  1 });
     analyzer.close();

     analyzer = new ShingleAnalyzerWrapper(
         new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
         "<SEP>", false, false,
         ShingleFilter.DEFAULT_FILLER_TOKEN);
     assertAnalyzesTo(analyzer, "please divide into shingles",
                           new String[] { "please<SEP>divide",
                                          "divide<SEP>into",
                                          "into<SEP>shingles" },
                           new int[] {  0,  7, 14 },
                           new int[] { 13, 18, 27 },
                           new int[] {  1,  1,  1 });
     analyzer.close();
   }

   public void testAltFillerToken() throws Exception {
     Analyzer delegate = new Analyzer() {
       @Override
       protected TokenStreamComponents createComponents(String fieldName) {
         CharArraySet stopSet = StopFilter.makeStopSet("into");
         Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
         TokenFilter filter = new StopFilter(tokenizer, stopSet);
         return new TokenStreamComponents(tokenizer, filter);
       }
     };

     ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
         delegate,
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
         true, false, "--");
     assertAnalyzesTo(analyzer, "please divide into shingles",
                      new String[] { "please", "please divide",
                                     "divide", "divide --",
                                     "-- shingles", "shingles" },
                      new int[] { 0,  0,  7,  7, 19, 19 },
                      new int[] { 6, 13, 13, 19, 27, 27 },
                      new int[] { 1,  0,  1,  0,  1,  1 });
     analyzer.close();

     delegate = new Analyzer() {
       @Override
       protected TokenStreamComponents createComponents(String fieldName) {
         CharArraySet stopSet = StopFilter.makeStopSet("into");
         Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
         TokenFilter filter = new StopFilter(tokenizer, stopSet);
         return new TokenStreamComponents(tokenizer, filter);
       }
     };
     analyzer = new ShingleAnalyzerWrapper(
         delegate,
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
         false, false, null);
     assertAnalyzesTo(analyzer, "please divide into shingles",
                      new String[] { "please divide", "divide ", " shingles" },
                      new int[] {  0,  7, 19 },
                      new int[] { 13, 19, 27 },
                      new int[] {  1,  1,  1 });
     analyzer.close();

     delegate = new Analyzer() {
       @Override
       protected TokenStreamComponents createComponents(String fieldName) {
         CharArraySet stopSet = StopFilter.makeStopSet("into");
         Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
         TokenFilter filter = new StopFilter(tokenizer, stopSet);
         return new TokenStreamComponents(tokenizer, filter);
       }
     };
     analyzer = new ShingleAnalyzerWrapper(
         delegate,
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
         false, false, "");
     assertAnalyzesTo(analyzer, "please divide into shingles",
                      new String[] { "please divide", "divide ", " shingles" },
                      new int[] {  0,  7, 19 },
                      new int[] { 13, 19, 27 },
                      new int[] {  1,  1,  1 });
     analyzer.close();
   }

   public void testOutputUnigramsIfNoShinglesSingleToken() throws Exception {
     ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
         new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
         "", false, true,
         ShingleFilter.DEFAULT_FILLER_TOKEN);
     assertAnalyzesTo(analyzer, "please",
                           new String[] { "please" },
                           new int[] { 0 },
                           new int[] { 6 },
                           new int[] { 1 });
     analyzer.close();
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.analysis.shingle;


	import org.apache.lucene.analysis.Analyzer;
	import org.apache.lucene.analysis.BaseTokenStreamTestCase;
	import org.apache.lucene.analysis.CharArraySet;
	import org.apache.lucene.analysis.MockAnalyzer;
	import org.apache.lucene.analysis.MockTokenizer;
	import org.apache.lucene.analysis.StopFilter;
	import org.apache.lucene.analysis.TokenFilter;
	import org.apache.lucene.analysis.TokenStream;
	import org.apache.lucene.analysis.Tokenizer;
	import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
	import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
	import org.apache.lucene.document.Document;
	import org.apache.lucene.document.Field;
	import org.apache.lucene.document.TextField;
	import org.apache.lucene.index.DirectoryReader;
	import org.apache.lucene.index.IndexReader;
	import org.apache.lucene.index.IndexWriter;
	import org.apache.lucene.index.IndexWriterConfig;
	import org.apache.lucene.index.Term;
	import org.apache.lucene.search.BooleanClause;
	import org.apache.lucene.search.BooleanQuery;
	import org.apache.lucene.search.IndexSearcher;
	import org.apache.lucene.search.PhraseQuery;
	import org.apache.lucene.search.ScoreDoc;
	import org.apache.lucene.search.TermQuery;
	import org.apache.lucene.store.Directory;

	/**
	* A test class for ShingleAnalyzerWrapper as regards queries and scoring.
	*/
	public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
	private Analyzer analyzer;
	private IndexSearcher searcher;
	private IndexReader reader;
	private Directory directory;

	/**
	* Set up a new index in RAM with three test phrases and the supplied Analyzer.
	*
	* @throws Exception if an error occurs with index writer or searcher
	*/
	@Override
	public void setUp() throws Exception {
	super.setUp();
	analyzer = new ShingleAnalyzerWrapper(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 2);
	directory = newDirectory();
	IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(analyzer));

	Document doc;
	doc = new Document();
	doc.add(new TextField("content", "please divide this sentence into shingles", Field.Store.YES));
	writer.addDocument(doc);

	doc = new Document();
	doc.add(new TextField("content", "just another test sentence", Field.Store.YES));
	writer.addDocument(doc);

	doc = new Document();
	doc.add(new TextField("content", "a sentence which contains no test", Field.Store.YES));
	writer.addDocument(doc);

	writer.close();

	reader = DirectoryReader.open(directory);
	searcher = newSearcher(reader);
	}

	@Override
	public void tearDown() throws Exception {
	reader.close();
	directory.close();
	analyzer.close();
	super.tearDown();
	}

	protected void compareRanks(ScoreDoc[] hits, int[] ranks) throws Exception {
	assertEquals(ranks.length, hits.length);
	for (int i = 0; i < ranks.length; i++) {
	assertEquals(ranks[i], hits[i].doc);
	}
	}

	/*
	* This shows how to construct a phrase query containing shingles.
	*/
	public void testShingleAnalyzerWrapperPhraseQuery() throws Exception {
	PhraseQuery.Builder builder = new PhraseQuery.Builder();
	try (TokenStream ts = analyzer.tokenStream("content", "this sentence")) {
	int j = -1;

	PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
	CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

	ts.reset();
	while (ts.incrementToken()) {
	j += posIncrAtt.getPositionIncrement();
	String termText = termAtt.toString();
	builder.add(new Term("content", termText), j);
	}
	ts.end();
	}

	PhraseQuery q = builder.build();
	ScoreDoc[] hits = searcher.search(q, 1000).scoreDocs;
	int[] ranks = new int[] { 0 };
	compareRanks(hits, ranks);
	}

	/*
	* How to construct a boolean query with shingles. A query like this will
	* implicitly score those documents higher that contain the words in the query
	* in the right order and adjacent to each other.
	*/
	public void testShingleAnalyzerWrapperBooleanQuery() throws Exception {
	BooleanQuery.Builder q = new BooleanQuery.Builder();

	try (TokenStream ts = analyzer.tokenStream("content", "test sentence")) {
	CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

	ts.reset();
	while (ts.incrementToken()) {
	String termText = termAtt.toString();
	q.add(new TermQuery(new Term("content", termText)),
	BooleanClause.Occur.SHOULD);
	}
	ts.end();
	}

	ScoreDoc[] hits = searcher.search(q.build(), 1000).scoreDocs;
	int[] ranks = new int[] { 1, 2, 0 };
	compareRanks(hits, ranks);
	}

	public void testReusableTokenStream() throws Exception {
	Analyzer a = new ShingleAnalyzerWrapper(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 2);
	assertAnalyzesTo(a, "please divide into shingles",
	new String[] { "please", "please divide", "divide", "divide into", "into", "into shingles", "shingles" },
	new int[] { 0, 0, 7, 7, 14, 14, 19 },
	new int[] { 6, 13, 13, 18, 18, 27, 27 },
	new int[] { 1, 0, 1, 0, 1, 0, 1 });
	assertAnalyzesTo(a, "divide me up again",
	new String[] { "divide", "divide me", "me", "me up", "up", "up again", "again" },
	new int[] { 0, 0, 7, 7, 10, 10, 13 },
	new int[] { 6, 9, 9, 12, 12, 18, 18 },
	new int[] { 1, 0, 1, 0, 1, 0, 1 });
	a.close();
	}

	public void testNonDefaultMinShingleSize() throws Exception {
	ShingleAnalyzerWrapper analyzer
	= new ShingleAnalyzerWrapper(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 4);
	assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
	new String[] { "please", "please divide this", "please divide this sentence",
	"divide", "divide this sentence", "divide this sentence into",
	"this", "this sentence into", "this sentence into shingles",
	"sentence", "sentence into shingles",
	"into",
	"shingles" },
	new int[] { 0, 0, 0, 7, 7, 7, 14, 14, 14, 19, 19, 28, 33 },
	new int[] { 6, 18, 27, 13, 27, 32, 18, 32, 41, 27, 41, 32, 41 },
	new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1 });
	analyzer.close();

	analyzer = new ShingleAnalyzerWrapper(
	new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 4,
	ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
	assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
	new String[] { "please divide this", "please divide this sentence",
	"divide this sentence", "divide this sentence into",
	"this sentence into", "this sentence into shingles",
	"sentence into shingles" },
	new int[] { 0, 0, 7, 7, 14, 14, 19 },
	new int[] { 18, 27, 27, 32, 32, 41, 41 },
	new int[] { 1, 0, 1, 0, 1, 0, 1 });
	analyzer.close();
	}

	public void testNonDefaultMinAndSameMaxShingleSize() throws Exception {
	ShingleAnalyzerWrapper analyzer
	= new ShingleAnalyzerWrapper(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 3);
	assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
	new String[] { "please", "please divide this",
	"divide", "divide this sentence",
	"this", "this sentence into",
	"sentence", "sentence into shingles",
	"into",
	"shingles" },
	new int[] { 0, 0, 7, 7, 14, 14, 19, 19, 28, 33 },
	new int[] { 6, 18, 13, 27, 18, 32, 27, 41, 32, 41 },
	new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 1 });
	analyzer.close();

	analyzer = new ShingleAnalyzerWrapper(
	new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 3,
	ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
	assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
	new String[] { "please divide this",
	"divide this sentence",
	"this sentence into",
	"sentence into shingles" },
	new int[] { 0, 7, 14, 19 },
	new int[] { 18, 27, 32, 41 },
	new int[] { 1, 1, 1, 1 });
	analyzer.close();
	}

	public void testNoTokenSeparator() throws Exception {
	ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
	new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
	ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
	ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
	"", true, false,
	ShingleFilter.DEFAULT_FILLER_TOKEN);
	assertAnalyzesTo(analyzer, "please divide into shingles",
	new String[] { "please", "pleasedivide",
	"divide", "divideinto",
	"into", "intoshingles",
	"shingles" },
	new int[] { 0, 0, 7, 7, 14, 14, 19 },
	new int[] { 6, 13, 13, 18, 18, 27, 27 },
	new int[] { 1, 0, 1, 0, 1, 0, 1 });
	analyzer.close();

	analyzer = new ShingleAnalyzerWrapper(
	new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
	ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
	ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
	"", false, false,
	ShingleFilter.DEFAULT_FILLER_TOKEN);
	assertAnalyzesTo(analyzer, "please divide into shingles",
	new String[] { "pleasedivide",
	"divideinto",
	"intoshingles" },
	new int[] { 0, 7, 14 },
	new int[] { 13, 18, 27 },
	new int[] { 1, 1, 1 });
	analyzer.close();
	}

	public void testNullTokenSeparator() throws Exception {
	ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
	new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
	ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
	ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
	null, true, false,
	ShingleFilter.DEFAULT_FILLER_TOKEN);
	assertAnalyzesTo(analyzer, "please divide into shingles",
	new String[] { "please", "pleasedivide",
	"divide", "divideinto",
	"into", "intoshingles",
	"shingles" },
	new int[] { 0, 0, 7, 7, 14, 14, 19 },
	new int[] { 6, 13, 13, 18, 18, 27, 27 },
	new int[] { 1, 0, 1, 0, 1, 0, 1 });
	analyzer.close();

	analyzer = new ShingleAnalyzerWrapper(
	new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
	ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
	ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
	"", false, false,
	ShingleFilter.DEFAULT_FILLER_TOKEN);
	assertAnalyzesTo(analyzer, "please divide into shingles",
	new String[] { "pleasedivide",
	"divideinto",
	"intoshingles" },
	new int[] { 0, 7, 14 },
	new int[] { 13, 18, 27 },
	new int[] { 1, 1, 1 });
	analyzer.close();
	}

	public void testAltTokenSeparator() throws Exception {
	ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
	new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
	ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
	ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
	"<SEP>", true, false,
	ShingleFilter.DEFAULT_FILLER_TOKEN);
	assertAnalyzesTo(analyzer, "please divide into shingles",
	new String[] { "please", "please<SEP>divide",
	"divide", "divide<SEP>into",
	"into", "into<SEP>shingles",
	"shingles" },
	new int[] { 0, 0, 7, 7, 14, 14, 19 },
	new int[] { 6, 13, 13, 18, 18, 27, 27 },
	new int[] { 1, 0, 1, 0, 1, 0, 1 });
	analyzer.close();

	analyzer = new ShingleAnalyzerWrapper(
	new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
	ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
	ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
	"<SEP>", false, false,
	ShingleFilter.DEFAULT_FILLER_TOKEN);
	assertAnalyzesTo(analyzer, "please divide into shingles",
	new String[] { "please<SEP>divide",
	"divide<SEP>into",
	"into<SEP>shingles" },
	new int[] { 0, 7, 14 },
	new int[] { 13, 18, 27 },
	new int[] { 1, 1, 1 });
	analyzer.close();
	}

	public void testAltFillerToken() throws Exception {
	Analyzer delegate = new Analyzer() {
	@Override
	protected TokenStreamComponents createComponents(String fieldName) {
	CharArraySet stopSet = StopFilter.makeStopSet("into");
	Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
	TokenFilter filter = new StopFilter(tokenizer, stopSet);
	return new TokenStreamComponents(tokenizer, filter);
	}
	};

	ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
	delegate,
	ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
	ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
	ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
	true, false, "--");
	assertAnalyzesTo(analyzer, "please divide into shingles",
	new String[] { "please", "please divide",
	"divide", "divide --",
	"-- shingles", "shingles" },
	new int[] { 0, 0, 7, 7, 19, 19 },
	new int[] { 6, 13, 13, 19, 27, 27 },
	new int[] { 1, 0, 1, 0, 1, 1 });
	analyzer.close();

	delegate = new Analyzer() {
	@Override
	protected TokenStreamComponents createComponents(String fieldName) {
	CharArraySet stopSet = StopFilter.makeStopSet("into");
	Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
	TokenFilter filter = new StopFilter(tokenizer, stopSet);
	return new TokenStreamComponents(tokenizer, filter);
	}
	};
	analyzer = new ShingleAnalyzerWrapper(
	delegate,
	ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
	ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
	ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
	false, false, null);
	assertAnalyzesTo(analyzer, "please divide into shingles",
	new String[] { "please divide", "divide ", " shingles" },
	new int[] { 0, 7, 19 },
	new int[] { 13, 19, 27 },
	new int[] { 1, 1, 1 });
	analyzer.close();

	delegate = new Analyzer() {
	@Override
	protected TokenStreamComponents createComponents(String fieldName) {
	CharArraySet stopSet = StopFilter.makeStopSet("into");
	Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
	TokenFilter filter = new StopFilter(tokenizer, stopSet);
	return new TokenStreamComponents(tokenizer, filter);
	}
	};
	analyzer = new ShingleAnalyzerWrapper(
	delegate,
	ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
	ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
	ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
	false, false, "");
	assertAnalyzesTo(analyzer, "please divide into shingles",
	new String[] { "please divide", "divide ", " shingles" },
	new int[] { 0, 7, 19 },
	new int[] { 13, 19, 27 },
	new int[] { 1, 1, 1 });
	analyzer.close();
	}

	public void testOutputUnigramsIfNoShinglesSingleToken() throws Exception {
	ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
	new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
	ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
	ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
	"", false, true,
	ShingleFilter.DEFAULT_FILLER_TOKEN);
	assertAnalyzesTo(analyzer, "please",
	new String[] { "please" },
	new int[] { 0 },
	new int[] { 6 },
	new int[] { 1 });
	analyzer.close();
	}
	}