/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.analysis.miscellaneous;

import java.io.IOException;
import java.io.StringReader;
import java.util.Collections;
import java.util.Random;
import java.util.function.Function;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.CharacterUtils;
import org.apache.lucene.analysis.FilteringTokenFilter;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ValidatingTokenFilter;
import org.apache.lucene.analysis.classic.ClassicTokenizer;
import org.apache.lucene.analysis.core.TypeTokenFilter;
import org.apache.lucene.analysis.de.GermanStemFilter;
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.shingle.FixedShingleFilter;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
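
/**
 * Tests {@link ConditionalTokenFilter}, which applies its wrapped filter only to tokens for
 * which {@code shouldFilter()} returns {@code true} and passes all other tokens through
 * unchanged.
 *
 * <p>A minimal usage sketch (illustrative only; {@code LowerCaseFilter} and the length-based
 * condition are stand-ins, not part of this test):
 *
 * <pre class="prettyprint">
 * TokenStream ts =
 *     new ConditionalTokenFilter(input, LowerCaseFilter::new) {
 *       final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
 *
 *       &#64;Override
 *       protected boolean shouldFilter() {
 *         return termAtt.length() &gt; 3; // hypothetical condition
 *       }
 *     };
 * </pre>
 */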
public class TestConditionalTokenFilter extends BaseTokenStreamTestCase {
boolean closed = false;
boolean ended = false;
boolean reset = false;
private final class AssertingLowerCaseFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
public AssertingLowerCaseFilter(TokenStream in) {
super(in);
}
@Override
public final boolean incrementToken() throws IOException {
if (input.incrementToken()) {
CharacterUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length());
return true;
      }
      return false;
}
@Override
public void end() throws IOException {
super.end();
ended = true;
}
@Override
public void close() throws IOException {
super.close();
closed = true;
}
@Override
public void reset() throws IOException {
super.reset();
reset = true;
}
}
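
  /** Applies the wrapped filter only to tokens whose term does not match the given regex. */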
private class SkipMatchingFilter extends ConditionalTokenFilter {
private final Pattern pattern;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
SkipMatchingFilter(
TokenStream input, Function<TokenStream, TokenStream> inputFactory, String termRegex) {
super(input, inputFactory);
pattern = Pattern.compile(termRegex);
}
@Override
protected boolean shouldFilter() throws IOException {
return pattern.matcher(termAtt.toString()).matches() == false;
}
}
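
  /** Tokens containing "o" match the regex and therefore bypass the lowercasing filter. */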
public void testSimple() throws IOException {
TokenStream stream = whitespaceMockTokenizer("Alice Bob Clara David");
TokenStream t = new SkipMatchingFilter(stream, AssertingLowerCaseFilter::new, ".*o.*");
assertTokenStreamContents(t, new String[] {"alice", "Bob", "clara", "david"});
assertTrue(closed);
assertTrue(reset);
assertTrue(ended);
}
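
  /**
   * Splits each token into a four-character prefix and the remainder, emitting two tokens per
   * input token. Exercises wrapped filters that emit more tokens than they consume.
   */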
private final class TokenSplitter extends TokenFilter {
final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
State state = null;
String half;
protected TokenSplitter(TokenStream input) {
super(input);
}
@Override
public boolean incrementToken() throws IOException {
if (half == null) {
state = captureState();
if (input.incrementToken() == false) {
return false;
}
half = termAtt.toString().substring(4);
termAtt.setLength(4);
return true;
}
restoreState(state);
termAtt.setEmpty().append(half);
half = null;
return true;
}
}
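
  /** The token matching the regex ("tokenpos2") bypasses the splitter and is emitted whole. */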
public void testMultitokenWrapping() throws IOException {
TokenStream stream = whitespaceMockTokenizer("tokenpos1 tokenpos2 tokenpos3 tokenpos4");
TokenStream ts = new SkipMatchingFilter(stream, TokenSplitter::new, ".*2.*");
assertTokenStreamContents(
ts, new String[] {"toke", "npos1", "tokenpos2", "toke", "npos3", "toke", "npos4"});
}
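
  /**
   * Accepts every token but shortens the final end offset by two in end(), so tests can detect
   * whether end() data from the wrapped filter is propagated.
   */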
private final class EndTrimmingFilter extends FilteringTokenFilter {
final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
public EndTrimmingFilter(TokenStream in) {
super(in);
}
@Override
protected boolean accept() throws IOException {
return true;
}
@Override
public void end() throws IOException {
super.end();
offsetAtt.setOffset(0, offsetAtt.endOffset() - 2);
}
}
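
  /**
   * When the wrapped filter runs, its end() adjustment must be visible (end offset 20 - 2 = 18);
   * when it is bypassed, the original end offset of 20 must be preserved.
   */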
public void testEndPropagation() throws IOException {
CannedTokenStream cts2 =
new CannedTokenStream(0, 20, new Token("alice", 0, 5), new Token("bob", 6, 8));
TokenStream ts2 =
new ConditionalTokenFilter(cts2, EndTrimmingFilter::new) {
@Override
protected boolean shouldFilter() throws IOException {
return true;
}
};
assertTokenStreamContents(ts2, new String[] {"alice", "bob"}, null, null, null, null, null, 18);
CannedTokenStream cts1 =
new CannedTokenStream(0, 20, new Token("alice", 0, 5), new Token("bob", 6, 8));
TokenStream ts1 =
new ConditionalTokenFilter(cts1, EndTrimmingFilter::new) {
@Override
protected boolean shouldFilter() throws IOException {
return false;
}
};
assertTokenStreamContents(ts1, new String[] {"alice", "bob"}, null, null, null, null, null, 20);
}
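
  /**
   * Wraps a graph-producing SynonymGraphFilter: the phrase "a b" also yields its synonym "f" as
   * a graph token, while "c d" produces no synonym because the token "c" is excluded from the
   * wrapped filter by the regex.
   */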
public void testWrapGraphs() throws Exception {
TokenStream stream = whitespaceMockTokenizer("a b c d e");
SynonymMap sm;
try (Analyzer analyzer = new MockAnalyzer(random())) {
SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer);
parser.parse(new StringReader("a b, f\nc d, g"));
sm = parser.build();
}
TokenStream ts =
new SkipMatchingFilter(stream, in -> new SynonymGraphFilter(in, sm, true), "c");
assertTokenStreamContents(
ts,
new String[] {"f", "a", "b", "c", "d", "e"},
null,
null,
null,
new int[] {1, 0, 1, 1, 1, 1},
new int[] {2, 1, 1, 1, 1, 1});
}
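
  /**
   * ShingleFilter reads ahead in the token stream; with shouldFilter() always returning true,
   * the conditional wrapper must behave exactly like an unwrapped ShingleFilter.
   */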
public void testReadaheadWithNoFiltering() throws IOException {
Analyzer analyzer =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer source = new ClassicTokenizer();
TokenStream sink =
new ConditionalTokenFilter(source, in -> new ShingleFilter(in, 2)) {
@Override
protected boolean shouldFilter() throws IOException {
return true;
}
};
return new TokenStreamComponents(source, sink);
}
};
String input = "one two three four";
try (TokenStream ts = analyzer.tokenStream("", input)) {
assertTokenStreamContents(
ts,
new String[] {
"one", "one two",
"two", "two three",
"three", "three four",
"four"
});
}
}
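
  /**
   * The protected term "three" bypasses the wrapped ShingleFilter, so no shingles are formed
   * across it; offsets, increments and lengths must remain consistent.
   */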
public void testReadaheadWithFiltering() throws IOException {
CharArraySet protectedTerms = new CharArraySet(2, true);
protectedTerms.add("three");
Analyzer analyzer =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer source = new ClassicTokenizer();
TokenStream sink =
new ProtectedTermFilter(protectedTerms, source, in -> new ShingleFilter(in, 2));
sink = new ValidatingTokenFilter(sink, "1");
return new TokenStreamComponents(source, sink);
}
};
String input = "one two three four";
try (TokenStream ts = analyzer.tokenStream("", input)) {
assertTokenStreamContents(
ts,
new String[] {"one", "one two", "two", "three", "four"},
new int[] {0, 0, 4, 8, 14},
new int[] {3, 7, 7, 13, 18},
new int[] {1, 0, 1, 1, 1},
new int[] {1, 2, 1, 1, 1},
18);
}
}
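
  /**
   * Shingles are built first here, and the conditional filter wraps a TypeTokenFilter that keeps
   * only tokens of type "ALL" (i.e. none); only the protected terms "two" and "two three" bypass
   * it and survive.
   */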
public void testFilteringWithReadahead() throws IOException {
CharArraySet protectedTerms = new CharArraySet(2, true);
protectedTerms.add("two");
protectedTerms.add("two three");
Analyzer analyzer =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer source = new StandardTokenizer();
TokenStream sink = new ShingleFilter(source, 3);
sink =
new ProtectedTermFilter(
protectedTerms,
sink,
in -> new TypeTokenFilter(in, Collections.singleton("ALL"), true));
return new TokenStreamComponents(source, sink);
}
};
String input = "one two three four";
try (TokenStream ts = analyzer.tokenStream("", input)) {
assertTokenStreamContents(
ts,
new String[] {"two", "two three"},
new int[] {4, 4},
new int[] {7, 13},
new int[] {2, 0},
new int[] {1, 2},
18);
}
}
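
  /**
   * Chains two filters (truncation, then lowercasing) inside a single conditional branch; both
   * must apply only to tokens that do not match the regex, and lifecycle calls must still reach
   * the innermost filter.
   */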
public void testMultipleConditionalFilters() throws IOException {
TokenStream stream = whitespaceMockTokenizer("Alice Bob Clara David");
TokenStream t =
new SkipMatchingFilter(
stream,
in -> {
TruncateTokenFilter truncateFilter = new TruncateTokenFilter(in, 2);
return new AssertingLowerCaseFilter(truncateFilter);
},
".*o.*");
assertTokenStreamContents(t, new String[] {"al", "Bob", "cl", "da"});
assertTrue(closed);
assertTrue(reset);
assertTrue(ended);
}
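
  /**
   * The protected term "foobar" bypasses the wrapped LengthFilter and survives despite exceeding
   * the maximum length of 4, whether or not it is the first token in the stream.
   */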
public void testFilteredTokenFilters() throws IOException {
CharArraySet protectedTerms = new CharArraySet(2, true);
protectedTerms.add("foobar");
TokenStream ts = whitespaceMockTokenizer("wuthering foobar abc");
ts = new ProtectedTermFilter(protectedTerms, ts, in -> new LengthFilter(in, 1, 4));
assertTokenStreamContents(ts, new String[] {"foobar", "abc"});
ts = whitespaceMockTokenizer("foobar abc");
ts = new ProtectedTermFilter(protectedTerms, ts, in -> new LengthFilter(in, 1, 4));
assertTokenStreamContents(ts, new String[] {"foobar", "abc"});
}
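
  /**
   * Randomly bypasses a TypeTokenFilter over duplicated tokens and relies on the
   * ValidatingTokenFilter stages to catch inconsistent offsets or position increments.
   */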
public void testConsistentOffsets() throws IOException {
long seed = random().nextLong();
Analyzer analyzer =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer source = new NGramTokenizer();
TokenStream sink =
new ValidatingTokenFilter(new KeywordRepeatFilter(source), "stage 0");
sink = new ValidatingTokenFilter(sink, "stage 1");
sink =
new RandomSkippingFilter(
sink, seed, in -> new TypeTokenFilter(in, Collections.singleton("word")));
sink = new ValidatingTokenFilter(sink, "last stage");
return new TokenStreamComponents(source, sink);
}
};
checkRandomData(random(), analyzer, 1);
}
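
  /**
   * Conditionally applies a read-ahead FixedShingleFilter (and an IndicNormalizationFilter) and
   * checks that end-of-stream handling stays correct when only some tokens are filtered.
   */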
public void testEndWithShingles() throws IOException {
TokenStream ts = whitespaceMockTokenizer("cyk jvboq \u092e\u0962\u093f");
ts = new GermanStemFilter(ts);
ts = new NonRandomSkippingFilter(ts, in -> new FixedShingleFilter(in, 2), true, false, true);
ts = new NonRandomSkippingFilter(ts, IndicNormalizationFilter::new, true);
assertTokenStreamContents(ts, new String[] {"jvboq"});
  }

public void testInternalPositionAdjustment() throws IOException {
// check that the partial TokenStream sent to the condition filter begins with a posInc of 1,
// even if the input stream has a posInc of 0 at that position, and that the filtered stream
// has the correct posInc afterwards
TokenStream ts = whitespaceMockTokenizer("one two three");
ts = new KeywordRepeatFilter(ts);
ts =
new NonRandomSkippingFilter(
ts, PositionAssertingTokenFilter::new, false, true, true, true, true, false);
assertTokenStreamContents(
ts,
new String[] {"one", "one", "two", "two", "three", "three"},
new int[] {1, 0, 1, 0, 1, 0});
}
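
  /**
   * Asserts that the first token seen after reset() arrives with a position increment of 1, as
   * the conditional wrapper must present the wrapped filter with a fresh stream each time.
   */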
private static final class PositionAssertingTokenFilter extends TokenFilter {
boolean reset = false;
final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
protected PositionAssertingTokenFilter(TokenStream input) {
super(input);
}
@Override
public void reset() throws IOException {
super.reset();
this.reset = true;
}
@Override
public boolean incrementToken() throws IOException {
boolean more = input.incrementToken();
if (more && reset) {
assertEquals(1, posIncAtt.getPositionIncrement());
}
reset = false;
return more;
}
}
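
  /**
   * Randomly applies or bypasses the wrapped filter; the random source is re-seeded on reset()
   * so that a given seed behaves reproducibly across passes.
   */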
private static class RandomSkippingFilter extends ConditionalTokenFilter {
Random random;
final long seed;
protected RandomSkippingFilter(
TokenStream input, long seed, Function<TokenStream, TokenStream> inputFactory) {
super(input, inputFactory);
this.seed = seed;
this.random = new Random(seed);
}
@Override
protected boolean shouldFilter() throws IOException {
return random.nextBoolean();
}
@Override
public void reset() throws IOException {
super.reset();
random = new Random(seed);
}
  }

private static class NonRandomSkippingFilter extends ConditionalTokenFilter {
final boolean[] shouldFilters;
int pos;
  /**
   * Create a new NonRandomSkippingFilter
   *
   * @param input the input TokenStream
   * @param inputFactory a factory function to create a new instance of the TokenFilter to wrap
   * @param shouldFilters a repeating pattern of booleans deciding whether each token is filtered
   */
protected NonRandomSkippingFilter(
TokenStream input,
Function<TokenStream, TokenStream> inputFactory,
boolean... shouldFilters) {
super(input, inputFactory);
this.shouldFilters = shouldFilters;
}
@Override
protected boolean shouldFilter() throws IOException {
return shouldFilters[pos++ % shouldFilters.length];
}
@Override
public void reset() throws IOException {
super.reset();
pos = 0;
}
}
}