lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/CommonGramsFilterTest.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.analysis.commongrams;

 import java.io.StringReader;
 import java.util.Arrays;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

 /**
  * Tests {@link CommonGramsFilter} and {@link CommonGramsQueryFilter}.
  */
 public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
   /** Common words shared by every test; built with {@code ignoreCase == false}. */
   private static final CharArraySet commonWords = new CharArraySet(Arrays.asList(
       "s", "a", "b", "c", "d", "the", "of"
   ), false);

   /**
    * Builds an analyzer that tokenizes on whitespace with a {@link MockTokenizer} and
    * applies {@link CommonGramsFilter} using {@link #commonWords}.
    *
    * @param queryMode when {@code true} the common-grams output is additionally wrapped
    *        in a {@link CommonGramsQueryFilter}
    * @return a new analyzer; the caller is responsible for closing it
    */
   private static Analyzer newCommonGramsAnalyzer(final boolean queryMode) {
     return new Analyzer() {
       @Override
       protected TokenStreamComponents createComponents(String fieldName) {
         Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
         CommonGramsFilter grams = new CommonGramsFilter(tokenizer, commonWords);
         TokenFilter chain = grams;
         if (queryMode) {
           chain = new CommonGramsQueryFilter(grams);
         }
         return new TokenStreamComponents(tokenizer, chain);
       }
     };
   }

   /**
    * Builds a {@link CommonGramsQueryFilter} chain over a whitespace {@link MockTokenizer}
    * that reads the given text.
    *
    * @param input text to tokenize
    * @return the outermost filter of the chain, not yet reset
    */
   private static TokenFilter newQueryFilter(String input) {
     MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
     tokenizer.setReader(new StringReader(input));
     return new CommonGramsQueryFilter(new CommonGramsFilter(tokenizer, commonWords));
   }

   /**
    * Checks that a {@link CommonGramsFilter} abandoned mid-stream can be closed, given a
    * new reader and reset, and then starts again from the first token.
    */
   public void testReset() throws Exception {
     final String input = "How the s a brown s cow d like A B thing?";
     WhitespaceTokenizer wt = new WhitespaceTokenizer();
     wt.setReader(new StringReader(input));
     CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);

     CharTermAttribute term = cgf.addAttribute(CharTermAttribute.class);
     cgf.reset();
     assertTrue(cgf.incrementToken());
     assertEquals("How", term.toString());
     assertTrue(cgf.incrementToken());
     assertEquals("How_the", term.toString());
     assertTrue(cgf.incrementToken());
     assertEquals("the", term.toString());
     assertTrue(cgf.incrementToken());
     assertEquals("the_s", term.toString());
     // Deliberately stop before the input is exhausted: leftover state must not leak
     // into the next pass.
     cgf.close();

     wt.setReader(new StringReader(input));
     cgf.reset();
     assertTrue(cgf.incrementToken());
     assertEquals("How", term.toString());
     // Close again so the second reader is released as well.
     cgf.close();
   }

   /**
    * Same as {@link #testReset()} but with a {@link CommonGramsQueryFilter} on top of the
    * {@link CommonGramsFilter}.
    */
   public void testQueryReset() throws Exception {
     final String input = "How the s a brown s cow d like A B thing?";
     WhitespaceTokenizer wt = new WhitespaceTokenizer();
     wt.setReader(new StringReader(input));
     CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
     CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf);

     // Read the term from the filter under test, as testReset() does.
     CharTermAttribute term = nsf.addAttribute(CharTermAttribute.class);
     nsf.reset();
     assertTrue(nsf.incrementToken());
     assertEquals("How_the", term.toString());
     assertTrue(nsf.incrementToken());
     assertEquals("the_s", term.toString());
     nsf.close();

     wt.setReader(new StringReader(input));
     nsf.reset();
     assertTrue(nsf.incrementToken());
     assertEquals("How_the", term.toString());
     // Close again so the second reader is released as well.
     nsf.close();
   }

   /**
    * Tests {@link CommonGramsQueryFilter}, which outputs a set of tokens optimized for
    * querying with only one token at each position, either a unigram or a bigram. It
    * also does not return a token for the final position if the final word is already
    * part of the preceding bigram. Example (three tokens/positions in):
    * {@code "foo bar the"} =&gt; {@code foo:1 | bar:2, bar_the:2 | the:3} =&gt;
    * {@code "foo" "bar_the"} (two tokens out).
    */
   public void testCommonGramsQueryFilter() throws Exception {
     // try-with-resources closes the analyzer even when an assertion fails.
     try (Analyzer a = newCommonGramsAnalyzer(true)) {
       // Stop words used below are "of" "the" and "s"

       // two word queries
       assertAnalyzesTo(a, "brown fox",
           new String[] { "brown", "fox" });
       assertAnalyzesTo(a, "the fox",
           new String[] { "the_fox" });
       assertAnalyzesTo(a, "fox of",
           new String[] { "fox_of" });
       assertAnalyzesTo(a, "of the",
           new String[] { "of_the" });

       // one word queries
       assertAnalyzesTo(a, "the",
           new String[] { "the" });
       assertAnalyzesTo(a, "foo",
           new String[] { "foo" });

       // 3 word combinations s=stopword/common word n=not a stop word
       assertAnalyzesTo(a, "n n n",
           new String[] { "n", "n", "n" });
       assertAnalyzesTo(a, "quick brown fox",
           new String[] { "quick", "brown", "fox" });

       assertAnalyzesTo(a, "n n s",
           new String[] { "n", "n_s" });
       assertAnalyzesTo(a, "quick brown the",
           new String[] { "quick", "brown_the" });

       assertAnalyzesTo(a, "n s n",
           new String[] { "n_s", "s_n" });
       assertAnalyzesTo(a, "quick the brown",
           new String[] { "quick_the", "the_brown" });

       assertAnalyzesTo(a, "n s s",
           new String[] { "n_s", "s_s" });
       assertAnalyzesTo(a, "fox of the",
           new String[] { "fox_of", "of_the" });

       assertAnalyzesTo(a, "s n n",
           new String[] { "s_n", "n", "n" });
       assertAnalyzesTo(a, "the quick brown",
           new String[] { "the_quick", "quick", "brown" });

       assertAnalyzesTo(a, "s n s",
           new String[] { "s_n", "n_s" });
       assertAnalyzesTo(a, "the fox of",
           new String[] { "the_fox", "fox_of" });

       assertAnalyzesTo(a, "s s n",
           new String[] { "s_s", "s_n" });
       assertAnalyzesTo(a, "of the fox",
           new String[] { "of_the", "the_fox" });

       assertAnalyzesTo(a, "s s s",
           new String[] { "s_s", "s_s" });
       assertAnalyzesTo(a, "of the of",
           new String[] { "of_the", "the_of" });
     }
   }

   /**
    * Tests {@link CommonGramsFilter} on its own: unigrams are kept and each bigram is
    * expected at position increment 0, i.e. stacked on the preceding unigram.
    */
   public void testCommonGramsFilter() throws Exception {
     // try-with-resources closes the analyzer even when an assertion fails.
     try (Analyzer a = newCommonGramsAnalyzer(false)) {
       // Stop words used below are "of" "the" and "s"
       // one word queries
       assertAnalyzesTo(a, "the", new String[] { "the" });
       assertAnalyzesTo(a, "foo", new String[] { "foo" });

       // two word queries
       assertAnalyzesTo(a, "brown fox",
           new String[] { "brown", "fox" },
           new int[] { 1, 1 });
       assertAnalyzesTo(a, "the fox",
           new String[] { "the", "the_fox", "fox" },
           new int[] { 1, 0, 1 });
       assertAnalyzesTo(a, "fox of",
           new String[] { "fox", "fox_of", "of" },
           new int[] { 1, 0, 1 });
       assertAnalyzesTo(a, "of the",
           new String[] { "of", "of_the", "the" },
           new int[] { 1, 0, 1 });

       // 3 word combinations s=stopword/common word n=not a stop word
       assertAnalyzesTo(a, "n n n",
           new String[] { "n", "n", "n" },
           new int[] { 1, 1, 1 });
       assertAnalyzesTo(a, "quick brown fox",
           new String[] { "quick", "brown", "fox" },
           new int[] { 1, 1, 1 });

       assertAnalyzesTo(a, "n n s",
           new String[] { "n", "n", "n_s", "s" },
           new int[] { 1, 1, 0, 1 });
       assertAnalyzesTo(a, "quick brown the",
           new String[] { "quick", "brown", "brown_the", "the" },
           new int[] { 1, 1, 0, 1 });

       assertAnalyzesTo(a, "n s n",
           new String[] { "n", "n_s", "s", "s_n", "n" },
           new int[] { 1, 0, 1, 0, 1 });
       assertAnalyzesTo(a, "quick the fox",
           new String[] { "quick", "quick_the", "the", "the_fox", "fox" },
           new int[] { 1, 0, 1, 0, 1 });

       assertAnalyzesTo(a, "n s s",
           new String[] { "n", "n_s", "s", "s_s", "s" },
           new int[] { 1, 0, 1, 0, 1 });
       assertAnalyzesTo(a, "fox of the",
           new String[] { "fox", "fox_of", "of", "of_the", "the" },
           new int[] { 1, 0, 1, 0, 1 });

       assertAnalyzesTo(a, "s n n",
           new String[] { "s", "s_n", "n", "n" },
           new int[] { 1, 0, 1, 1 });
       assertAnalyzesTo(a, "the quick brown",
           new String[] { "the", "the_quick", "quick", "brown" },
           new int[] { 1, 0, 1, 1 });

       assertAnalyzesTo(a, "s n s",
           new String[] { "s", "s_n", "n", "n_s", "s" },
           new int[] { 1, 0, 1, 0, 1 });
       assertAnalyzesTo(a, "the fox of",
           new String[] { "the", "the_fox", "fox", "fox_of", "of" },
           new int[] { 1, 0, 1, 0, 1 });

       assertAnalyzesTo(a, "s s n",
           new String[] { "s", "s_s", "s", "s_n", "n" },
           new int[] { 1, 0, 1, 0, 1 });
       assertAnalyzesTo(a, "of the fox",
           new String[] { "of", "of_the", "the", "the_fox", "fox" },
           new int[] { 1, 0, 1, 0, 1 });

       assertAnalyzesTo(a, "s s s",
           new String[] { "s", "s_s", "s", "s_s", "s" },
           new int[] { 1, 0, 1, 0, 1 });
       assertAnalyzesTo(a, "of the of",
           new String[] { "of", "of_the", "the", "the_of", "of" },
           new int[] { 1, 0, 1, 0, 1 });
     }
   }

   /**
    * Tests that {@link CommonGramsFilter} matches common words case-sensitively: the
    * capitalized "The", "A" and "B" are not treated as common words, so no
    * "How_The", "like_A", "A_B" or "B_thing?" bigram is expected, while "The_s" still
    * appears because the lower-case "s" is a common word.
    */
   public void testCaseSensitive() throws Exception {
     final String input = "How The s a brown s cow d like A B thing?";
     MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
     wt.setReader(new StringReader(input));
     TokenFilter cgf = new CommonGramsFilter(wt, commonWords);
     assertTokenStreamContents(cgf, new String[] {"How", "The", "The_s", "s",
         "s_a", "a", "a_brown", "brown", "brown_s", "s", "s_cow", "cow",
         "cow_d", "d", "d_like", "like", "A", "B", "thing?"});
   }

   /**
    * Test CommonGramsQueryFilter in the case that the last word is a stopword
    */
   public void testLastWordisStopWord() throws Exception {
     assertTokenStreamContents(newQueryFilter("dog the"), new String[] { "dog_the" });
   }

   /**
    * Test CommonGramsQueryFilter in the case that the first word is a stopword
    */
   public void testFirstWordisStopWord() throws Exception {
     assertTokenStreamContents(newQueryFilter("the dog"), new String[] { "the_dog" });
   }

   /**
    * Test CommonGramsQueryFilter in the case of a single (stop)word query
    */
   public void testOneWordQueryStopWord() throws Exception {
     assertTokenStreamContents(newQueryFilter("the"), new String[] { "the" });
   }

   /**
    * Test CommonGramsQueryFilter in the case of a single word query
    */
   public void testOneWordQuery() throws Exception {
     assertTokenStreamContents(newQueryFilter("monster"), new String[] { "monster" });
   }

   /**
    * Test CommonGramsQueryFilter when first and last words are stopwords.
    */
   // The name must start with a lower-case "test": the JUnit3-style discovery used by
   // the Lucene test runner skips a method called "TestFirstAndLastStopWord", so this
   // check was never executed.
   public void testFirstAndLastStopWord() throws Exception {
     assertTokenStreamContents(newQueryFilter("the of"), new String[] { "the_of" });
   }

   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     // Plain CommonGramsFilter first, then the query variant; each analyzer is closed
     // before the next one is created.
     try (Analyzer a = newCommonGramsAnalyzer(false)) {
       checkRandomData(random(), a, 200 * RANDOM_MULTIPLIER);
     }

     try (Analyzer b = newCommonGramsAnalyzer(true)) {
       checkRandomData(random(), b, 200 * RANDOM_MULTIPLIER);
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.analysis.commongrams;

	import java.io.StringReader;
	import java.util.Arrays;

	import org.apache.lucene.analysis.Analyzer;
	import org.apache.lucene.analysis.BaseTokenStreamTestCase;
	import org.apache.lucene.analysis.CharArraySet;
	import org.apache.lucene.analysis.MockTokenizer;
	import org.apache.lucene.analysis.TokenFilter;
	import org.apache.lucene.analysis.Tokenizer;
	import org.apache.lucene.analysis.core.WhitespaceTokenizer;
	import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

	/**
	 * Tests {@link CommonGramsFilter} and {@link CommonGramsQueryFilter}.
	 */
	public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
		/** Common words shared by every test; built with {@code ignoreCase == false}. */
		private static final CharArraySet commonWords = new CharArraySet(Arrays.asList(
				"s", "a", "b", "c", "d", "the", "of"
		), false);

		/**
		 * Builds an analyzer that tokenizes on whitespace with a {@link MockTokenizer} and
		 * applies {@link CommonGramsFilter} using {@link #commonWords}.
		 *
		 * @param queryMode when {@code true} the common-grams output is additionally wrapped
		 *        in a {@link CommonGramsQueryFilter}
		 * @return a new analyzer; the caller is responsible for closing it
		 */
		private static Analyzer newCommonGramsAnalyzer(final boolean queryMode) {
			return new Analyzer() {
				@Override
				protected TokenStreamComponents createComponents(String fieldName) {
					Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
					CommonGramsFilter grams = new CommonGramsFilter(tokenizer, commonWords);
					TokenFilter chain = grams;
					if (queryMode) {
						chain = new CommonGramsQueryFilter(grams);
					}
					return new TokenStreamComponents(tokenizer, chain);
				}
			};
		}

		/**
		 * Builds a {@link CommonGramsQueryFilter} chain over a whitespace {@link MockTokenizer}
		 * that reads the given text.
		 *
		 * @param input text to tokenize
		 * @return the outermost filter of the chain, not yet reset
		 */
		private static TokenFilter newQueryFilter(String input) {
			MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
			tokenizer.setReader(new StringReader(input));
			return new CommonGramsQueryFilter(new CommonGramsFilter(tokenizer, commonWords));
		}

		/**
		 * Checks that a {@link CommonGramsFilter} abandoned mid-stream can be closed, given a
		 * new reader and reset, and then starts again from the first token.
		 */
		public void testReset() throws Exception {
			final String input = "How the s a brown s cow d like A B thing?";
			WhitespaceTokenizer wt = new WhitespaceTokenizer();
			wt.setReader(new StringReader(input));
			CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);

			CharTermAttribute term = cgf.addAttribute(CharTermAttribute.class);
			cgf.reset();
			assertTrue(cgf.incrementToken());
			assertEquals("How", term.toString());
			assertTrue(cgf.incrementToken());
			assertEquals("How_the", term.toString());
			assertTrue(cgf.incrementToken());
			assertEquals("the", term.toString());
			assertTrue(cgf.incrementToken());
			assertEquals("the_s", term.toString());
			// Deliberately stop before the input is exhausted: leftover state must not leak
			// into the next pass.
			cgf.close();

			wt.setReader(new StringReader(input));
			cgf.reset();
			assertTrue(cgf.incrementToken());
			assertEquals("How", term.toString());
			// Close again so the second reader is released as well.
			cgf.close();
		}

		/**
		 * Same as {@link #testReset()} but with a {@link CommonGramsQueryFilter} on top of the
		 * {@link CommonGramsFilter}.
		 */
		public void testQueryReset() throws Exception {
			final String input = "How the s a brown s cow d like A B thing?";
			WhitespaceTokenizer wt = new WhitespaceTokenizer();
			wt.setReader(new StringReader(input));
			CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
			CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf);

			// Read the term from the filter under test, as testReset() does.
			CharTermAttribute term = nsf.addAttribute(CharTermAttribute.class);
			nsf.reset();
			assertTrue(nsf.incrementToken());
			assertEquals("How_the", term.toString());
			assertTrue(nsf.incrementToken());
			assertEquals("the_s", term.toString());
			nsf.close();

			wt.setReader(new StringReader(input));
			nsf.reset();
			assertTrue(nsf.incrementToken());
			assertEquals("How_the", term.toString());
			// Close again so the second reader is released as well.
			nsf.close();
		}

		/**
		 * Tests {@link CommonGramsQueryFilter}, which outputs a set of tokens optimized for
		 * querying with only one token at each position, either a unigram or a bigram. It
		 * also does not return a token for the final position if the final word is already
		 * part of the preceding bigram. Example (three tokens/positions in):
		 * {@code "foo bar the"} =&gt; {@code foo:1 | bar:2, bar_the:2 | the:3} =&gt;
		 * {@code "foo" "bar_the"} (two tokens out).
		 */
		public void testCommonGramsQueryFilter() throws Exception {
			// try-with-resources closes the analyzer even when an assertion fails.
			try (Analyzer a = newCommonGramsAnalyzer(true)) {
				// Stop words used below are "of" "the" and "s"

				// two word queries
				assertAnalyzesTo(a, "brown fox",
						new String[] { "brown", "fox" });
				assertAnalyzesTo(a, "the fox",
						new String[] { "the_fox" });
				assertAnalyzesTo(a, "fox of",
						new String[] { "fox_of" });
				assertAnalyzesTo(a, "of the",
						new String[] { "of_the" });

				// one word queries
				assertAnalyzesTo(a, "the",
						new String[] { "the" });
				assertAnalyzesTo(a, "foo",
						new String[] { "foo" });

				// 3 word combinations s=stopword/common word n=not a stop word
				assertAnalyzesTo(a, "n n n",
						new String[] { "n", "n", "n" });
				assertAnalyzesTo(a, "quick brown fox",
						new String[] { "quick", "brown", "fox" });

				assertAnalyzesTo(a, "n n s",
						new String[] { "n", "n_s" });
				assertAnalyzesTo(a, "quick brown the",
						new String[] { "quick", "brown_the" });

				assertAnalyzesTo(a, "n s n",
						new String[] { "n_s", "s_n" });
				assertAnalyzesTo(a, "quick the brown",
						new String[] { "quick_the", "the_brown" });

				assertAnalyzesTo(a, "n s s",
						new String[] { "n_s", "s_s" });
				assertAnalyzesTo(a, "fox of the",
						new String[] { "fox_of", "of_the" });

				assertAnalyzesTo(a, "s n n",
						new String[] { "s_n", "n", "n" });
				assertAnalyzesTo(a, "the quick brown",
						new String[] { "the_quick", "quick", "brown" });

				assertAnalyzesTo(a, "s n s",
						new String[] { "s_n", "n_s" });
				assertAnalyzesTo(a, "the fox of",
						new String[] { "the_fox", "fox_of" });

				assertAnalyzesTo(a, "s s n",
						new String[] { "s_s", "s_n" });
				assertAnalyzesTo(a, "of the fox",
						new String[] { "of_the", "the_fox" });

				assertAnalyzesTo(a, "s s s",
						new String[] { "s_s", "s_s" });
				assertAnalyzesTo(a, "of the of",
						new String[] { "of_the", "the_of" });
			}
		}

		/**
		 * Tests {@link CommonGramsFilter} on its own: unigrams are kept and each bigram is
		 * expected at position increment 0, i.e. stacked on the preceding unigram.
		 */
		public void testCommonGramsFilter() throws Exception {
			// try-with-resources closes the analyzer even when an assertion fails.
			try (Analyzer a = newCommonGramsAnalyzer(false)) {
				// Stop words used below are "of" "the" and "s"
				// one word queries
				assertAnalyzesTo(a, "the", new String[] { "the" });
				assertAnalyzesTo(a, "foo", new String[] { "foo" });

				// two word queries
				assertAnalyzesTo(a, "brown fox",
						new String[] { "brown", "fox" },
						new int[] { 1, 1 });
				assertAnalyzesTo(a, "the fox",
						new String[] { "the", "the_fox", "fox" },
						new int[] { 1, 0, 1 });
				assertAnalyzesTo(a, "fox of",
						new String[] { "fox", "fox_of", "of" },
						new int[] { 1, 0, 1 });
				assertAnalyzesTo(a, "of the",
						new String[] { "of", "of_the", "the" },
						new int[] { 1, 0, 1 });

				// 3 word combinations s=stopword/common word n=not a stop word
				assertAnalyzesTo(a, "n n n",
						new String[] { "n", "n", "n" },
						new int[] { 1, 1, 1 });
				assertAnalyzesTo(a, "quick brown fox",
						new String[] { "quick", "brown", "fox" },
						new int[] { 1, 1, 1 });

				assertAnalyzesTo(a, "n n s",
						new String[] { "n", "n", "n_s", "s" },
						new int[] { 1, 1, 0, 1 });
				assertAnalyzesTo(a, "quick brown the",
						new String[] { "quick", "brown", "brown_the", "the" },
						new int[] { 1, 1, 0, 1 });

				assertAnalyzesTo(a, "n s n",
						new String[] { "n", "n_s", "s", "s_n", "n" },
						new int[] { 1, 0, 1, 0, 1 });
				assertAnalyzesTo(a, "quick the fox",
						new String[] { "quick", "quick_the", "the", "the_fox", "fox" },
						new int[] { 1, 0, 1, 0, 1 });

				assertAnalyzesTo(a, "n s s",
						new String[] { "n", "n_s", "s", "s_s", "s" },
						new int[] { 1, 0, 1, 0, 1 });
				assertAnalyzesTo(a, "fox of the",
						new String[] { "fox", "fox_of", "of", "of_the", "the" },
						new int[] { 1, 0, 1, 0, 1 });

				assertAnalyzesTo(a, "s n n",
						new String[] { "s", "s_n", "n", "n" },
						new int[] { 1, 0, 1, 1 });
				assertAnalyzesTo(a, "the quick brown",
						new String[] { "the", "the_quick", "quick", "brown" },
						new int[] { 1, 0, 1, 1 });

				assertAnalyzesTo(a, "s n s",
						new String[] { "s", "s_n", "n", "n_s", "s" },
						new int[] { 1, 0, 1, 0, 1 });
				assertAnalyzesTo(a, "the fox of",
						new String[] { "the", "the_fox", "fox", "fox_of", "of" },
						new int[] { 1, 0, 1, 0, 1 });

				assertAnalyzesTo(a, "s s n",
						new String[] { "s", "s_s", "s", "s_n", "n" },
						new int[] { 1, 0, 1, 0, 1 });
				assertAnalyzesTo(a, "of the fox",
						new String[] { "of", "of_the", "the", "the_fox", "fox" },
						new int[] { 1, 0, 1, 0, 1 });

				assertAnalyzesTo(a, "s s s",
						new String[] { "s", "s_s", "s", "s_s", "s" },
						new int[] { 1, 0, 1, 0, 1 });
				assertAnalyzesTo(a, "of the of",
						new String[] { "of", "of_the", "the", "the_of", "of" },
						new int[] { 1, 0, 1, 0, 1 });
			}
		}

		/**
		 * Tests that {@link CommonGramsFilter} matches common words case-sensitively: the
		 * capitalized "The", "A" and "B" are not treated as common words, so no
		 * "How_The", "like_A", "A_B" or "B_thing?" bigram is expected, while "The_s" still
		 * appears because the lower-case "s" is a common word.
		 */
		public void testCaseSensitive() throws Exception {
			final String input = "How The s a brown s cow d like A B thing?";
			MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
			wt.setReader(new StringReader(input));
			TokenFilter cgf = new CommonGramsFilter(wt, commonWords);
			assertTokenStreamContents(cgf, new String[] {"How", "The", "The_s", "s",
					"s_a", "a", "a_brown", "brown", "brown_s", "s", "s_cow", "cow",
					"cow_d", "d", "d_like", "like", "A", "B", "thing?"});
		}

		/**
		 * Test CommonGramsQueryFilter in the case that the last word is a stopword
		 */
		public void testLastWordisStopWord() throws Exception {
			assertTokenStreamContents(newQueryFilter("dog the"), new String[] { "dog_the" });
		}

		/**
		 * Test CommonGramsQueryFilter in the case that the first word is a stopword
		 */
		public void testFirstWordisStopWord() throws Exception {
			assertTokenStreamContents(newQueryFilter("the dog"), new String[] { "the_dog" });
		}

		/**
		 * Test CommonGramsQueryFilter in the case of a single (stop)word query
		 */
		public void testOneWordQueryStopWord() throws Exception {
			assertTokenStreamContents(newQueryFilter("the"), new String[] { "the" });
		}

		/**
		 * Test CommonGramsQueryFilter in the case of a single word query
		 */
		public void testOneWordQuery() throws Exception {
			assertTokenStreamContents(newQueryFilter("monster"), new String[] { "monster" });
		}

		/**
		 * Test CommonGramsQueryFilter when first and last words are stopwords.
		 */
		// The name must start with a lower-case "test": the JUnit3-style discovery used by
		// the Lucene test runner skips a method called "TestFirstAndLastStopWord", so this
		// check was never executed.
		public void testFirstAndLastStopWord() throws Exception {
			assertTokenStreamContents(newQueryFilter("the of"), new String[] { "the_of" });
		}

		/** blast some random strings through the analyzer */
		public void testRandomStrings() throws Exception {
			// Plain CommonGramsFilter first, then the query variant; each analyzer is closed
			// before the next one is created.
			try (Analyzer a = newCommonGramsAnalyzer(false)) {
				checkRandomData(random(), a, 200 * RANDOM_MULTIPLIER);
			}

			try (Analyzer b = newCommonGramsAnalyzer(true)) {
				checkRandomData(random(), b, 200 * RANDOM_MULTIPLIER);
			}
		}
	}