| package org.apache.lucene.analysis.sinks; |
| |
| /** |
| * Copyright 2004 The Apache Software Foundation |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
import java.io.IOException;
import java.io.StringReader;
import java.util.Locale;

import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.English;
| |
| |
| /** |
| * tests for the TestTeeSinkTokenFilter |
| */ |
| public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase { |
| protected StringBuilder buffer1; |
| protected StringBuilder buffer2; |
| protected String[] tokens1; |
| protected String[] tokens2; |
| |
| @Override |
| public void setUp() throws Exception { |
| super.setUp(); |
| tokens1 = new String[]{"The", "quick", "Burgundy", "Fox", "jumped", "over", "the", "lazy", "Red", "Dogs"}; |
| tokens2 = new String[]{"The", "Lazy", "Dogs", "should", "stay", "on", "the", "porch"}; |
| buffer1 = new StringBuilder(); |
| |
| for (int i = 0; i < tokens1.length; i++) { |
| buffer1.append(tokens1[i]).append(' '); |
| } |
| buffer2 = new StringBuilder(); |
| for (int i = 0; i < tokens2.length; i++) { |
| buffer2.append(tokens2[i]).append(' '); |
| } |
| } |
| |
| static final TeeSinkTokenFilter.SinkFilter theFilter = new TeeSinkTokenFilter.SinkFilter() { |
| @Override |
| public boolean accept(AttributeSource a) { |
| CharTermAttribute termAtt = a.getAttribute(CharTermAttribute.class); |
| return termAtt.toString().equalsIgnoreCase("The"); |
| } |
| }; |
| |
| static final TeeSinkTokenFilter.SinkFilter dogFilter = new TeeSinkTokenFilter.SinkFilter() { |
| @Override |
| public boolean accept(AttributeSource a) { |
| CharTermAttribute termAtt = a.getAttribute(CharTermAttribute.class); |
| return termAtt.toString().equalsIgnoreCase("Dogs"); |
| } |
| }; |
| |
| // LUCENE-1448 |
| // TODO: instead of testing it this way, we can test |
| // with BaseTokenStreamTestCase now... |
| public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception { |
| Directory dir = newDirectory(); |
| Analyzer analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false); |
| IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer)); |
| Document doc = new Document(); |
| TokenStream tokenStream = analyzer.tokenStream("field", new StringReader("abcd ")); |
| TeeSinkTokenFilter tee = new TeeSinkTokenFilter(tokenStream); |
| TokenStream sink = tee.newSinkTokenStream(); |
| FieldType ft = new FieldType(TextField.TYPE_UNSTORED); |
| ft.setStoreTermVectors(true); |
| ft.setStoreTermVectorOffsets(true); |
| ft.setStoreTermVectorPositions(true); |
| Field f1 = new Field("field", tee, ft); |
| Field f2 = new Field("field", sink, ft); |
| doc.add(f1); |
| doc.add(f2); |
| w.addDocument(doc); |
| w.close(); |
| |
| IndexReader r = IndexReader.open(dir); |
| Terms vector = r.getTermVectors(0).terms("field"); |
| assertEquals(1, vector.getUniqueTermCount()); |
| TermsEnum termsEnum = vector.iterator(null); |
| termsEnum.next(); |
| assertEquals(2, termsEnum.totalTermFreq()); |
| DocsAndPositionsEnum positions = termsEnum.docsAndPositions(null, null, true); |
| assertTrue(positions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); |
| assertEquals(2, positions.freq()); |
| positions.nextPosition(); |
| assertEquals(0, positions.startOffset()); |
| assertEquals(4, positions.endOffset()); |
| positions.nextPosition(); |
| assertEquals(8, positions.startOffset()); |
| assertEquals(12, positions.endOffset()); |
| assertEquals(DocIdSetIterator.NO_MORE_DOCS, positions.nextDoc()); |
| r.close(); |
| dir.close(); |
| } |
| |
| public void testGeneral() throws IOException { |
| final TeeSinkTokenFilter source = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.toString()), MockTokenizer.WHITESPACE, false)); |
| final TokenStream sink1 = source.newSinkTokenStream(); |
| final TokenStream sink2 = source.newSinkTokenStream(theFilter); |
| |
| source.addAttribute(CheckClearAttributesAttribute.class); |
| sink1.addAttribute(CheckClearAttributesAttribute.class); |
| sink2.addAttribute(CheckClearAttributesAttribute.class); |
| |
| assertTokenStreamContents(source, tokens1); |
| assertTokenStreamContents(sink1, tokens1); |
| assertTokenStreamContents(sink2, new String[]{"The", "the"}); |
| } |
| |
| public void testMultipleSources() throws Exception { |
| final TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.toString()), MockTokenizer.WHITESPACE, false)); |
| final TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.newSinkTokenStream(dogFilter); |
| final TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.newSinkTokenStream(theFilter); |
| tee1.reset(); |
| final TokenStream source1 = new CachingTokenFilter(tee1); |
| |
| tee1.addAttribute(CheckClearAttributesAttribute.class); |
| dogDetector.addAttribute(CheckClearAttributesAttribute.class); |
| theDetector.addAttribute(CheckClearAttributesAttribute.class); |
| |
| final TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer2.toString()), MockTokenizer.WHITESPACE, false)); |
| tee2.addSinkTokenStream(dogDetector); |
| tee2.addSinkTokenStream(theDetector); |
| final TokenStream source2 = tee2; |
| |
| assertTokenStreamContents(source1, tokens1); |
| assertTokenStreamContents(source2, tokens2); |
| |
| assertTokenStreamContents(theDetector, new String[]{"The", "the", "The", "the"}); |
| assertTokenStreamContents(dogDetector, new String[]{"Dogs", "Dogs"}); |
| |
| source1.reset(); |
| TokenStream lowerCasing = new LowerCaseFilter(TEST_VERSION_CURRENT, source1); |
| String[] lowerCaseTokens = new String[tokens1.length]; |
| for (int i = 0; i < tokens1.length; i++) |
| lowerCaseTokens[i] = tokens1[i].toLowerCase(); |
| assertTokenStreamContents(lowerCasing, lowerCaseTokens); |
| } |
| |
| /** |
| * Not an explicit test, just useful to print out some info on performance |
| * |
| * @throws Exception |
| */ |
| public void performance() throws Exception { |
| int[] tokCount = {100, 500, 1000, 2000, 5000, 10000}; |
| int[] modCounts = {1, 2, 5, 10, 20, 50, 100, 200, 500}; |
| for (int k = 0; k < tokCount.length; k++) { |
| StringBuilder buffer = new StringBuilder(); |
| System.out.println("-----Tokens: " + tokCount[k] + "-----"); |
| for (int i = 0; i < tokCount[k]; i++) { |
| buffer.append(English.intToEnglish(i).toUpperCase()).append(' '); |
| } |
| //make sure we produce the same tokens |
| TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString())))); |
| TokenStream sink = teeStream.newSinkTokenStream(new ModuloSinkFilter(100)); |
| teeStream.consumeAllTokens(); |
| TokenStream stream = new ModuloTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString()))), 100); |
| CharTermAttribute tfTok = stream.addAttribute(CharTermAttribute.class); |
| CharTermAttribute sinkTok = sink.addAttribute(CharTermAttribute.class); |
| for (int i=0; stream.incrementToken(); i++) { |
| assertTrue(sink.incrementToken()); |
| assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + i, tfTok.equals(sinkTok) == true); |
| } |
| |
| //simulate two fields, each being analyzed once, for 20 documents |
| for (int j = 0; j < modCounts.length; j++) { |
| int tfPos = 0; |
| long start = System.currentTimeMillis(); |
| for (int i = 0; i < 20; i++) { |
| stream = new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString()))); |
| PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class); |
| while (stream.incrementToken()) { |
| tfPos += posIncrAtt.getPositionIncrement(); |
| } |
| stream = new ModuloTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString()))), modCounts[j]); |
| posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class); |
| while (stream.incrementToken()) { |
| tfPos += posIncrAtt.getPositionIncrement(); |
| } |
| } |
| long finish = System.currentTimeMillis(); |
| System.out.println("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms"); |
| int sinkPos = 0; |
| //simulate one field with one sink |
| start = System.currentTimeMillis(); |
| for (int i = 0; i < 20; i++) { |
| teeStream = new TeeSinkTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString())))); |
| sink = teeStream.newSinkTokenStream(new ModuloSinkFilter(modCounts[j])); |
| PositionIncrementAttribute posIncrAtt = teeStream.getAttribute(PositionIncrementAttribute.class); |
| while (teeStream.incrementToken()) { |
| sinkPos += posIncrAtt.getPositionIncrement(); |
| } |
| //System.out.println("Modulo--------"); |
| posIncrAtt = sink.getAttribute(PositionIncrementAttribute.class); |
| while (sink.incrementToken()) { |
| sinkPos += posIncrAtt.getPositionIncrement(); |
| } |
| } |
| finish = System.currentTimeMillis(); |
| System.out.println("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms"); |
| assertTrue(sinkPos + " does not equal: " + tfPos, sinkPos == tfPos); |
| |
| } |
| System.out.println("- End Tokens: " + tokCount[k] + "-----"); |
| } |
| |
| } |
| |
| |
| class ModuloTokenFilter extends TokenFilter { |
| |
| int modCount; |
| |
| ModuloTokenFilter(TokenStream input, int mc) { |
| super(input); |
| modCount = mc; |
| } |
| |
| int count = 0; |
| |
| //return every 100 tokens |
| @Override |
| public boolean incrementToken() throws IOException { |
| boolean hasNext; |
| for (hasNext = input.incrementToken(); |
| hasNext && count % modCount != 0; |
| hasNext = input.incrementToken()) { |
| count++; |
| } |
| count++; |
| return hasNext; |
| } |
| } |
| |
| class ModuloSinkFilter extends TeeSinkTokenFilter.SinkFilter { |
| int count = 0; |
| int modCount; |
| |
| ModuloSinkFilter(int mc) { |
| modCount = mc; |
| } |
| |
| @Override |
| public boolean accept(AttributeSource a) { |
| boolean b = (a != null && count % modCount == 0); |
| count++; |
| return b; |
| } |
| |
| } |
| } |
| |