/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.analysis.miscellaneous;

import java.io.IOException;
import java.io.StringReader;
import java.util.Collections;
import java.util.Random;
import java.util.function.Function;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.CharacterUtils;
import org.apache.lucene.analysis.FilteringTokenFilter;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ValidatingTokenFilter;
import org.apache.lucene.analysis.core.TypeTokenFilter;
import org.apache.lucene.analysis.de.GermanStemFilter;
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.shingle.FixedShingleFilter;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.standard.ClassicTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
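
/**
 * Tests for {@link ConditionalTokenFilter}, which applies a wrapped filter only to tokens for
 * which {@code shouldFilter()} returns true. A minimal anonymous subclass, assuming the core
 * {@code LowerCaseFilter} as the wrapped filter, looks like:
 *
 * <pre class="prettyprint">
 * TokenStream ts = new ConditionalTokenFilter(input, LowerCaseFilter::new) {
 *   &#64;Override
 *   protected boolean shouldFilter() throws IOException {
 *     return true; // apply the wrapped filter to every token
 *   }
 * };
 * </pre>
 */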
public class TestConditionalTokenFilter extends BaseTokenStreamTestCase {
boolean closed = false;
boolean ended = false;
boolean reset = false;
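
  // Lowercases each token, and records close()/end()/reset() calls in the enclosing test's flags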
private final class AssertingLowerCaseFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
public AssertingLowerCaseFilter(TokenStream in) {
super(in);
}
@Override
public final boolean incrementToken() throws IOException {
if (input.incrementToken()) {
CharacterUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length());
return true;
      } else {
        return false;
      }
}
@Override
public void end() throws IOException {
super.end();
ended = true;
}
@Override
public void close() throws IOException {
super.close();
closed = true;
}
@Override
public void reset() throws IOException {
super.reset();
reset = true;
}
}
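
  // Bypasses the wrapped filter for any token whose term matches the given regex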
private class SkipMatchingFilter extends ConditionalTokenFilter {
private final Pattern pattern;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
SkipMatchingFilter(TokenStream input, Function<TokenStream, TokenStream> inputFactory, String termRegex) {
super(input, inputFactory);
pattern = Pattern.compile(termRegex);
}
@Override
protected boolean shouldFilter() throws IOException {
return pattern.matcher(termAtt.toString()).matches() == false;
}
}
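
  // Lowercasing should be applied only to terms without an 'o', and lifecycle calls must still propagate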
public void testSimple() throws IOException {
TokenStream stream = whitespaceMockTokenizer("Alice Bob Clara David");
TokenStream t = new SkipMatchingFilter(stream, AssertingLowerCaseFilter::new, ".*o.*");
assertTokenStreamContents(t, new String[]{ "alice", "Bob", "clara", "david" });
assertTrue(closed);
assertTrue(reset);
assertTrue(ended);
}
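
  // Emits each input token as two tokens: its first four characters, then the remainder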
private final class TokenSplitter extends TokenFilter {
final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
State state = null;
String half;
protected TokenSplitter(TokenStream input) {
super(input);
}
@Override
public boolean incrementToken() throws IOException {
if (half == null) {
state = captureState();
if (input.incrementToken() == false) {
return false;
}
half = termAtt.toString().substring(4);
termAtt.setLength(4);
return true;
}
restoreState(state);
termAtt.setEmpty().append(half);
half = null;
return true;
}
}
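
  // The wrapped filter may emit several tokens per input token; terms matching ".*2.*" bypass the splitter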
public void testMultitokenWrapping() throws IOException {
TokenStream stream = whitespaceMockTokenizer("tokenpos1 tokenpos2 tokenpos3 tokenpos4");
TokenStream ts = new SkipMatchingFilter(stream, TokenSplitter::new, ".*2.*");
assertTokenStreamContents(ts, new String[]{
"toke", "npos1", "tokenpos2", "toke", "npos3", "toke", "npos4"
});
}
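
  // Accepts every token, but pulls the final end offset reported in end() back by two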
private final class EndTrimmingFilter extends FilteringTokenFilter {
final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
public EndTrimmingFilter(TokenStream in) {
super(in);
}
@Override
protected boolean accept() throws IOException {
return true;
}
@Override
public void end() throws IOException {
super.end();
offsetAtt.setOffset(0, offsetAtt.endOffset() - 2);
}
}
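
  // end() data from the wrapped filter should be visible when the filter is applied, and untouched when it is bypassed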
public void testEndPropagation() throws IOException {
CannedTokenStream cts2 = new CannedTokenStream(0, 20,
new Token("alice", 0, 5), new Token("bob", 6, 8)
);
TokenStream ts2 = new ConditionalTokenFilter(cts2, EndTrimmingFilter::new) {
@Override
protected boolean shouldFilter() throws IOException {
return true;
}
};
assertTokenStreamContents(ts2, new String[]{ "alice", "bob" },
null, null, null, null, null, 18);
CannedTokenStream cts1 = new CannedTokenStream(0, 20,
new Token("alice", 0, 5), new Token("bob", 6, 8)
);
TokenStream ts1 = new ConditionalTokenFilter(cts1, EndTrimmingFilter::new) {
@Override
protected boolean shouldFilter() throws IOException {
return false;
}
};
assertTokenStreamContents(ts1, new String[]{ "alice", "bob" },
null, null, null, null, null, 20);
}
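
  // Graph filters can be wrapped: "a b" => "f" fires, but "c d" => "g" cannot, because "c" matches
  // the skip pattern and never reaches the synonym filter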
public void testWrapGraphs() throws Exception {
TokenStream stream = whitespaceMockTokenizer("a b c d e");
SynonymMap sm;
try (Analyzer analyzer = new MockAnalyzer(random())) {
SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer);
parser.parse(new StringReader("a b, f\nc d, g"));
sm = parser.build();
}
TokenStream ts = new SkipMatchingFilter(stream, in -> new SynonymGraphFilter(in, sm, true), "c");
assertTokenStreamContents(ts, new String[]{
"f", "a", "b", "c", "d", "e"
},
null, null, null,
new int[]{
1, 0, 1, 1, 1, 1
},
new int[]{
2, 1, 1, 1, 1, 1
});
}
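
  // ShingleFilter reads ahead in the stream; an always-on condition must not change its output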
public void testReadaheadWithNoFiltering() throws IOException {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer source = new ClassicTokenizer();
TokenStream sink = new ConditionalTokenFilter(source, in -> new ShingleFilter(in, 2)) {
@Override
protected boolean shouldFilter() throws IOException {
return true;
}
};
return new TokenStreamComponents(source, sink);
}
};
String input = "one two three four";
try (TokenStream ts = analyzer.tokenStream("", input)) {
assertTokenStreamContents(ts, new String[]{
"one", "one two",
"two", "two three",
"three", "three four",
"four"
});
}
}
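
  // Protected terms bypass the shingle filter, so no shingle spans the protected token "three"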
public void testReadaheadWithFiltering() throws IOException {
CharArraySet protectedTerms = new CharArraySet(2, true);
protectedTerms.add("three");
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer source = new ClassicTokenizer();
TokenStream sink = new ProtectedTermFilter(protectedTerms, source, in -> new ShingleFilter(in, 2));
sink = new ValidatingTokenFilter(sink, "1");
return new TokenStreamComponents(source, sink);
}
};
String input = "one two three four";
try (TokenStream ts = analyzer.tokenStream("", input)) {
assertTokenStreamContents(ts, new String[]{
"one", "one two", "two", "three", "four"
}, new int[]{
0, 0, 4, 8, 14
}, new int[]{
3, 7, 7, 13, 18
}, new int[]{
1, 0, 1, 1, 1
}, new int[]{
1, 2, 1, 1, 1
}, 18);
}
}
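
  // Shingles are built first, then a conditional TypeTokenFilter drops every unprotected token
  // (no token has type "ALL"), leaving only the protected terms "two" and "two three"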
public void testFilteringWithReadahead() throws IOException {
CharArraySet protectedTerms = new CharArraySet(2, true);
protectedTerms.add("two");
protectedTerms.add("two three");
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer source = new StandardTokenizer();
TokenStream sink = new ShingleFilter(source, 3);
sink = new ProtectedTermFilter(protectedTerms, sink, in -> new TypeTokenFilter(in, Collections.singleton("ALL"), true));
return new TokenStreamComponents(source, sink);
}
};
String input = "one two three four";
try (TokenStream ts = analyzer.tokenStream("", input)) {
assertTokenStreamContents(ts, new String[]{
"two", "two three"
}, new int[]{
4, 4
}, new int[]{
7, 13
}, new int[]{
2, 0
}, new int[]{
1, 2
}, 18);
}
}
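
  // A single condition can gate a chain of wrapped filters: truncation followed by lowercasing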
public void testMultipleConditionalFilters() throws IOException {
TokenStream stream = whitespaceMockTokenizer("Alice Bob Clara David");
TokenStream t = new SkipMatchingFilter(stream, in -> {
TruncateTokenFilter truncateFilter = new TruncateTokenFilter(in, 2);
return new AssertingLowerCaseFilter(truncateFilter);
}, ".*o.*");
assertTokenStreamContents(t, new String[]{"al", "Bob", "cl", "da"});
assertTrue(closed);
assertTrue(reset);
assertTrue(ended);
}
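
  // Protected terms survive a wrapped LengthFilter that would otherwise remove them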
public void testFilteredTokenFilters() throws IOException {
CharArraySet protectedTerms = new CharArraySet(2, true);
protectedTerms.add("foobar");
TokenStream ts = whitespaceMockTokenizer("wuthering foobar abc");
ts = new ProtectedTermFilter(protectedTerms, ts, in -> new LengthFilter(in, 1, 4));
assertTokenStreamContents(ts, new String[]{ "foobar", "abc" });
ts = whitespaceMockTokenizer("foobar abc");
ts = new ProtectedTermFilter(protectedTerms, ts, in -> new LengthFilter(in, 1, 4));
assertTokenStreamContents(ts, new String[]{ "foobar", "abc" });
}
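
  // Randomly toggling the condition must never produce inconsistent token attributes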
public void testConsistentOffsets() throws IOException {
long seed = random().nextLong();
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer source = new NGramTokenizer();
TokenStream sink = new ValidatingTokenFilter(new KeywordRepeatFilter(source), "stage 0");
sink = new ValidatingTokenFilter(sink, "stage 1");
sink = new RandomSkippingFilter(sink, seed, in -> new TypeTokenFilter(in, Collections.singleton("word")));
sink = new ValidatingTokenFilter(sink, "last stage");
return new TokenStreamComponents(source, sink);
}
};
checkRandomData(random(), analyzer, 1);
}
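
  // Checks end-of-stream handling when a conditionally-applied shingle filter needs lookahead at the tail of the stream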
public void testEndWithShingles() throws IOException {
TokenStream ts = whitespaceMockTokenizer("cyk jvboq \u092e\u0962\u093f");
ts = new GermanStemFilter(ts);
ts = new NonRandomSkippingFilter(ts, in -> new FixedShingleFilter(in, 2), true, false, true);
ts = new NonRandomSkippingFilter(ts, IndicNormalizationFilter::new, true);
assertTokenStreamContents(ts, new String[]{"jvboq"});
}
public void testInternalPositionAdjustment() throws IOException {
    // Check that the partial TokenStream sent to the wrapped filter begins with a posInc of 1,
    // even if the input stream has a posInc of 0 at that position, and that the filtered stream
    // has the correct posInc afterwards
TokenStream ts = whitespaceMockTokenizer("one two three");
ts = new KeywordRepeatFilter(ts);
ts = new NonRandomSkippingFilter(ts, PositionAssertingTokenFilter::new, false, true, true, true, true, false);
assertTokenStreamContents(ts,
new String[]{ "one", "one", "two", "two", "three", "three" },
new int[]{ 1, 0, 1, 0, 1, 0});
}
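
  // Asserts that the first token seen after reset() arrives with a position increment of 1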
private static final class PositionAssertingTokenFilter extends TokenFilter {
boolean reset = false;
final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
protected PositionAssertingTokenFilter(TokenStream input) {
super(input);
}
@Override
public void reset() throws IOException {
super.reset();
this.reset = true;
}
@Override
public boolean incrementToken() throws IOException {
boolean more = input.incrementToken();
if (more && reset) {
assertEquals(1, posIncAtt.getPositionIncrement());
}
reset = false;
return more;
}
}
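
  // Applies the wrapped filter to a random subset of tokens; reset() reseeds the Random so a run is reproducible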
private static class RandomSkippingFilter extends ConditionalTokenFilter {
Random random;
final long seed;
protected RandomSkippingFilter(TokenStream input, long seed, Function<TokenStream, TokenStream> inputFactory) {
super(input, inputFactory);
this.seed = seed;
this.random = new Random(seed);
}
@Override
protected boolean shouldFilter() throws IOException {
return random.nextBoolean();
}
@Override
public void reset() throws IOException {
super.reset();
random = new Random(seed);
}
}
private static class NonRandomSkippingFilter extends ConditionalTokenFilter {
final boolean[] shouldFilters;
int pos;
    /**
     * Create a new NonRandomSkippingFilter
     *
     * @param input the input TokenStream
     * @param inputFactory a factory function to create a new instance of the TokenFilter to wrap
     * @param shouldFilters a repeating pattern of per-token decisions: apply the wrapped filter or bypass it
     */
protected NonRandomSkippingFilter(TokenStream input, Function<TokenStream, TokenStream> inputFactory, boolean... shouldFilters) {
super(input, inputFactory);
this.shouldFilters = shouldFilters;
}
@Override
protected boolean shouldFilter() throws IOException {
return shouldFilters[pos++ % shouldFilters.length];
}
@Override
public void reset() throws IOException {
super.reset();
pos = 0;
}
}
}