/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.analysis.miscellaneous;

import java.io.IOException;
import java.io.StringReader;
import java.util.Collections;
import java.util.Random;
import java.util.function.Function;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.CharacterUtils;
import org.apache.lucene.analysis.FilteringTokenFilter;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ValidatingTokenFilter;
import org.apache.lucene.analysis.core.TypeTokenFilter;
import org.apache.lucene.analysis.de.GermanStemFilter;
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.shingle.FixedShingleFilter;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.standard.ClassicTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
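
/**
 * Tests for {@link ConditionalTokenFilter}, which applies a wrapped filter only to tokens for
 * which {@code shouldFilter()} returns true. A minimal anonymous subclass, assuming the core
 * {@code LowerCaseFilter} as the wrapped filter, looks like:
 *
 * <pre class="prettyprint">
 * TokenStream ts = new ConditionalTokenFilter(input, LowerCaseFilter::new) {
 *   &#64;Override
 *   protected boolean shouldFilter() throws IOException {
 *     return true; // apply the wrapped filter to every token
 *   }
 * };
 * </pre>
 */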
public class TestConditionalTokenFilter extends BaseTokenStreamTestCase {
boolean closed = false;
boolean ended = false;
boolean reset = false;
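
  // Lowercases each token, and records close()/end()/reset() calls in the enclosing test's flags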
private final class AssertingLowerCaseFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
public AssertingLowerCaseFilter(TokenStream in) {
super(in);
}
@Override
public final boolean incrementToken() throws IOException {
if (input.incrementToken()) {
CharacterUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length());
return true;
      } else {
        return false;
      }
}
@Override
public void end() throws IOException {
super.end();
ended = true;
}
@Override
public void close() throws IOException {
super.close();
closed = true;
}
@Override
public void reset() throws IOException {
super.reset();
reset = true;
}
}
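
  // Bypasses the wrapped filter for any token whose term matches the given regex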
private class SkipMatchingFilter extends ConditionalTokenFilter {
private final Pattern pattern;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
SkipMatchingFilter(TokenStream input, Function<TokenStream, TokenStream> inputFactory, String termRegex) {
super(input, inputFactory);
pattern = Pattern.compile(termRegex);
}
@Override
protected boolean shouldFilter() throws IOException {
return pattern.matcher(termAtt.toString()).matches() == false;
}
}
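
  // Lowercasing should be applied only to terms without an 'o', and lifecycle calls must still propagate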
public void testSimple() throws IOException {
TokenStream stream = whitespaceMockTokenizer("Alice Bob Clara David");
TokenStream t = new SkipMatchingFilter(stream, AssertingLowerCaseFilter::new, ".*o.*");
assertTokenStreamContents(t, new String[]{ "alice", "Bob", "clara", "david" });
assertTrue(closed);
assertTrue(reset);
assertTrue(ended);
}
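
  // Emits each input token as two tokens: its first four characters, then the remainder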
private final class TokenSplitter extends TokenFilter {
final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
State state = null;
String half;
protected TokenSplitter(TokenStream input) {
super(input);
}
@Override
public boolean incrementToken() throws IOException {
if (half == null) {
state = captureState();
if (input.incrementToken() == false) {
return false;
}
half = termAtt.toString().substring(4);
termAtt.setLength(4);
return true;
}
restoreState(state);
termAtt.setEmpty().append(half);
half = null;
return true;
}
}
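
  // The wrapped filter may emit several tokens per input token; terms matching ".*2.*" bypass the splitter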
public void testMultitokenWrapping() throws IOException {
TokenStream stream = whitespaceMockTokenizer("tokenpos1 tokenpos2 tokenpos3 tokenpos4");
TokenStream ts = new SkipMatchingFilter(stream, TokenSplitter::new, ".*2.*");
assertTokenStreamContents(ts, new String[]{
"toke", "npos1", "tokenpos2", "toke", "npos3", "toke", "npos4"
});
}
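
  // Accepts every token, but pulls the final end offset reported in end() back by two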
private final class EndTrimmingFilter extends FilteringTokenFilter {
final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
public EndTrimmingFilter(TokenStream in) {
super(in);
}
@Override
protected boolean accept() throws IOException {
return true;
}
@Override
public void end() throws IOException {
super.end();
offsetAtt.setOffset(0, offsetAtt.endOffset() - 2);
}
}
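
  // end() data from the wrapped filter should be visible when the filter is applied, and untouched when it is bypassed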
public void testEndPropagation() throws IOException {
CannedTokenStream cts2 = new CannedTokenStream(0, 20,
new Token("alice", 0, 5), new Token("bob", 6, 8)
);
TokenStream ts2 = new ConditionalTokenFilter(cts2, EndTrimmingFilter::new) {
@Override
protected boolean shouldFilter() throws IOException {
return true;
}
};
assertTokenStreamContents(ts2, new String[]{ "alice", "bob" },
null, null, null, null, null, 18);
CannedTokenStream cts1 = new CannedTokenStream(0, 20,
new Token("alice", 0, 5), new Token("bob", 6, 8)
);
TokenStream ts1 = new ConditionalTokenFilter(cts1, EndTrimmingFilter::new) {
@Override
protected boolean shouldFilter() throws IOException {
return false;
}
};
assertTokenStreamContents(ts1, new String[]{ "alice", "bob" },
null, null, null, null, null, 20);
}
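
  // Graph filters can be wrapped: "a b" => "f" fires, but "c d" => "g" cannot, because "c" matches
  // the skip pattern and never reaches the synonym filter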
public void testWrapGraphs() throws Exception {
TokenStream stream = whitespaceMockTokenizer("a b c d e");
SynonymMap sm;
try (Analyzer analyzer = new MockAnalyzer(random())) {
SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer);
parser.parse(new StringReader("a b, f\nc d, g"));
sm = parser.build();
}
TokenStream ts = new SkipMatchingFilter(stream, in -> new SynonymGraphFilter(in, sm, true), "c");
assertTokenStreamContents(ts, new String[]{
"f", "a", "b", "c", "d", "e"
},
null, null, null,
new int[]{
1, 0, 1, 1, 1, 1
},
new int[]{
2, 1, 1, 1, 1, 1
});
}
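
  // ShingleFilter reads ahead in the stream; an always-on condition must not change its output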
public void testReadaheadWithNoFiltering() throws IOException {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer source = new ClassicTokenizer();
TokenStream sink = new ConditionalTokenFilter(source, in -> new ShingleFilter(in, 2)) {
@Override
protected boolean shouldFilter() throws IOException {
return true;
}
};
return new TokenStreamComponents(source, sink);
}
};
String input = "one two three four";
try (TokenStream ts = analyzer.tokenStream("", input)) {
assertTokenStreamContents(ts, new String[]{
"one", "one two",
"two", "two three",
"three", "three four",
"four"
});
}
}
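
  // Protected terms bypass the shingle filter, so no shingle spans the protected token "three"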
public void testReadaheadWithFiltering() throws IOException {
CharArraySet protectedTerms = new CharArraySet(2, true);
protectedTerms.add("three");
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer source = new ClassicTokenizer();
TokenStream sink = new ProtectedTermFilter(protectedTerms, source, in -> new ShingleFilter(in, 2));
sink = new ValidatingTokenFilter(sink, "1");
return new TokenStreamComponents(source, sink);
}
};
String input = "one two three four";
try (TokenStream ts = analyzer.tokenStream("", input)) {
assertTokenStreamContents(ts, new String[]{
"one", "one two", "two", "three", "four"
}, new int[]{
0, 0, 4, 8, 14
}, new int[]{
3, 7, 7, 13, 18
}, new int[]{
1, 0, 1, 1, 1
}, new int[]{
1, 2, 1, 1, 1
}, 18);
}
}
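
  // Shingles are built first, then a conditional TypeTokenFilter drops every unprotected token
  // (no token has type "ALL"), leaving only the protected terms "two" and "two three"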
public void testFilteringWithReadahead() throws IOException {
CharArraySet protectedTerms = new CharArraySet(2, true);
protectedTerms.add("two");
protectedTerms.add("two three");
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer source = new StandardTokenizer();
TokenStream sink = new ShingleFilter(source, 3);
sink = new ProtectedTermFilter(protectedTerms, sink, in -> new TypeTokenFilter(in, Collections.singleton("ALL"), true));
return new TokenStreamComponents(source, sink);
}
};
String input = "one two three four";
try (TokenStream ts = analyzer.tokenStream("", input)) {
assertTokenStreamContents(ts, new String[]{
"two", "two three"
}, new int[]{
4, 4
}, new int[]{
7, 13
}, new int[]{
2, 0
}, new int[]{
1, 2
}, 18);
}
}
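
  // A single condition can gate a chain of wrapped filters: truncation followed by lowercasing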
public void testMultipleConditionalFilters() throws IOException {
TokenStream stream = whitespaceMockTokenizer("Alice Bob Clara David");
TokenStream t = new SkipMatchingFilter(stream, in -> {
TruncateTokenFilter truncateFilter = new TruncateTokenFilter(in, 2);
return new AssertingLowerCaseFilter(truncateFilter);
}, ".*o.*");
assertTokenStreamContents(t, new String[]{"al", "Bob", "cl", "da"});
assertTrue(closed);
assertTrue(reset);
assertTrue(ended);
}
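
  // Protected terms survive a wrapped LengthFilter that would otherwise remove them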
public void testFilteredTokenFilters() throws IOException {
CharArraySet protectedTerms = new CharArraySet(2, true);
protectedTerms.add("foobar");
TokenStream ts = whitespaceMockTokenizer("wuthering foobar abc");
ts = new ProtectedTermFilter(protectedTerms, ts, in -> new LengthFilter(in, 1, 4));
assertTokenStreamContents(ts, new String[]{ "foobar", "abc" });
ts = whitespaceMockTokenizer("foobar abc");
ts = new ProtectedTermFilter(protectedTerms, ts, in -> new LengthFilter(in, 1, 4));
assertTokenStreamContents(ts, new String[]{ "foobar", "abc" });
}
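
  // Randomly toggling the condition must never produce inconsistent token attributes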
public void testConsistentOffsets() throws IOException {
long seed = random().nextLong();
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer source = new NGramTokenizer();
TokenStream sink = new ValidatingTokenFilter(new KeywordRepeatFilter(source), "stage 0");
sink = new ValidatingTokenFilter(sink, "stage 1");
sink = new RandomSkippingFilter(sink, seed, in -> new TypeTokenFilter(in, Collections.singleton("word")));
sink = new ValidatingTokenFilter(sink, "last stage");
return new TokenStreamComponents(source, sink);
}
};
checkRandomData(random(), analyzer, 1);
}
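
  // Checks end-of-stream handling when a conditionally-applied shingle filter needs lookahead at the tail of the stream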
public void testEndWithShingles() throws IOException {
TokenStream ts = whitespaceMockTokenizer("cyk jvboq \u092e\u0962\u093f");
ts = new GermanStemFilter(ts);
ts = new NonRandomSkippingFilter(ts, in -> new FixedShingleFilter(in, 2), true, false, true);
ts = new NonRandomSkippingFilter(ts, IndicNormalizationFilter::new, true);
assertTokenStreamContents(ts, new String[]{"jvboq"});
}
public void testInternalPositionAdjustment() throws IOException {
    // Check that the partial TokenStream sent to the wrapped filter begins with a posInc of 1,
    // even if the input stream has a posInc of 0 at that position, and that the filtered stream
    // has the correct posInc afterwards
TokenStream ts = whitespaceMockTokenizer("one two three");
ts = new KeywordRepeatFilter(ts);
ts = new NonRandomSkippingFilter(ts, PositionAssertingTokenFilter::new, false, true, true, true, true, false);
assertTokenStreamContents(ts,
new String[]{ "one", "one", "two", "two", "three", "three" },
new int[]{ 1, 0, 1, 0, 1, 0});
}
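
  // Asserts that the first token seen after reset() arrives with a position increment of 1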
private static final class PositionAssertingTokenFilter extends TokenFilter {
boolean reset = false;
final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
protected PositionAssertingTokenFilter(TokenStream input) {
super(input);
}
@Override
public void reset() throws IOException {
super.reset();
this.reset = true;
}
@Override
public boolean incrementToken() throws IOException {
boolean more = input.incrementToken();
if (more && reset) {
assertEquals(1, posIncAtt.getPositionIncrement());
}
reset = false;
return more;
}
}
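
  // Applies the wrapped filter to a random subset of tokens; reset() reseeds the Random so a run is reproducible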
private static class RandomSkippingFilter extends ConditionalTokenFilter {
Random random;
final long seed;
protected RandomSkippingFilter(TokenStream input, long seed, Function<TokenStream, TokenStream> inputFactory) {
super(input, inputFactory);
this.seed = seed;
this.random = new Random(seed);
}
@Override
protected boolean shouldFilter() throws IOException {
return random.nextBoolean();
}
@Override
public void reset() throws IOException {
super.reset();
random = new Random(seed);
}
}
private static class NonRandomSkippingFilter extends ConditionalTokenFilter {
final boolean[] shouldFilters;
int pos;
    /**
     * Create a new NonRandomSkippingFilter
     *
     * @param input the input TokenStream
     * @param inputFactory a factory function to create a new instance of the TokenFilter to wrap
     * @param shouldFilters a repeating pattern of per-token decisions: apply the wrapped filter or bypass it
     */
protected NonRandomSkippingFilter(TokenStream input, Function<TokenStream, TokenStream> inputFactory, boolean... shouldFilters) {
super(input, inputFactory);
this.shouldFilters = shouldFilters;
}
@Override
protected boolean shouldFilter() throws IOException {
return shouldFilters[pos++ % shouldFilters.length];
}
@Override
public void reset() throws IOException {
super.reset();
pos = 0;
}
}
}