blob: a2963a245f0942bee7c3d203ac44c6148582fdff [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.TestUtil;
/**
 * Tests for {@link RemoveDuplicatesTokenFilter}.
 *
 * <p>The hand-written cases feed a fixed sequence of tokens (term text, offsets and position
 * increment) through the filter and compare the surviving terms against an expected list. The
 * expectations encoded here are that a token is dropped when its term was already emitted at the
 * same position (i.e. it arrives with a position increment of 0), while equal terms at different
 * positions are kept.
 */
public class TestRemoveDuplicatesTokenFilter extends BaseTokenStreamTestCase {

  /**
   * Creates a token with the given term text, offsets and position increment.
   *
   * @param pos position increment to set on the token
   * @param t term text
   * @param start start offset
   * @param end end offset
   * @return the newly built token
   */
  public static Token tok(int pos, String t, int start, int end) {
    Token tok = new Token(t, start, end);
    tok.setPositionIncrement(pos);
    return tok;
  }

  /**
   * Creates a token with the given term text and position increment, using 0 for both offsets.
   *
   * @param pos position increment to set on the token
   * @param t term text
   * @return the newly built token
   */
  public static Token tok(int pos, String t) {
    return tok(pos, t, 0, 0);
  }

  /**
   * Replays {@code tokens} through a {@link RemoveDuplicatesTokenFilter} and asserts that the terms
   * coming out match {@code expected}.
   *
   * @param expected the expected output terms, separated by single whitespace characters
   * @param tokens the input tokens, in the order they are fed to the filter
   * @throws Exception if consuming the stream fails
   */
  public void testDups(final String expected, final Token... tokens) throws Exception {
    final Iterator<Token> toks = Arrays.asList(tokens).iterator();
    final TokenStream ts =
        new RemoveDuplicatesTokenFilter(
            (new TokenStream() {
              private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
              private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
              private final PositionIncrementAttribute posIncAtt =
                  addAttribute(PositionIncrementAttribute.class);

              @Override
              public boolean incrementToken() {
                if (!toks.hasNext()) {
                  return false;
                }
                // Start from a clean state, then copy the next prepared token into the attributes.
                clearAttributes();
                Token tok = toks.next();
                termAtt.setEmpty().append(tok);
                offsetAtt.setOffset(tok.startOffset(), tok.endOffset());
                posIncAtt.setPositionIncrement(tok.getPositionIncrement());
                return true;
              }
            }));
    assertTokenStreamContents(ts, expected.split("\\s"));
  }

  /**
   * Every repeated term here sits at a different position (the two "B" tokens each have a position
   * increment of 1), so all six tokens are expected to pass through.
   */
  public void testNoDups() throws Exception {
    testDups(
        "A B B C D E",
        tok(1, "A", 0, 4),
        tok(1, "B", 5, 10),
        tok(1, "B", 11, 15),
        tok(1, "C", 16, 20),
        tok(0, "D", 16, 20),
        tok(1, "E", 21, 25));
  }

  /**
   * The second "B" arrives with a position increment of 0, i.e. at the same position as the first,
   * and is expected to be removed.
   */
  public void testSimpleDups() throws Exception {
    testDups(
        "A B C D E",
        tok(1, "A", 0, 4),
        tok(1, "B", 5, 10),
        tok(0, "B", 11, 15),
        tok(1, "C", 16, 20),
        tok(0, "D", 16, 20),
        tok(1, "E", 21, 25));
  }

  /**
   * Mixes several same-position repeats, including a triple ("D"), a repeat that follows a
   * different term stacked at the same position ("H" after "G"), and a repeat that is not adjacent
   * to its first occurrence (the trailing "J" after "K"). Each term is expected exactly once.
   */
  public void testComplexDups() throws Exception {
    testDups(
        "A B C D E F G H I J K",
        tok(1, "A"),
        tok(1, "B"),
        tok(0, "B"),
        tok(1, "C"),
        tok(1, "D"),
        tok(0, "D"),
        tok(0, "D"),
        tok(1, "E"),
        tok(1, "F"),
        tok(0, "F"),
        tok(1, "G"),
        tok(0, "H"),
        tok(0, "H"),
        tok(1, "I"),
        tok(1, "J"),
        tok(0, "K"),
        tok(0, "J"));
  }

  // Helper methods for the randomized synonym test below.

  /**
   * Draws random Unicode strings until one is non-empty after trimming and contains no U+0000
   * character.
   *
   * @return the first random string that satisfies both conditions
   */
  private String randomNonEmptyString() {
    while (true) {
      final String s = TestUtil.randomUnicodeString(random()).trim();
      if (s.length() != 0 && s.indexOf('\u0000') == -1) {
        return s;
      }
    }
  }

  /**
   * Adds one synonym rule to the builder, replacing each run of spaces in {@code input} and {@code
   * output} with a single U+0000 character first.
   *
   * <p>NOTE(review): U+0000 is presumably the word separator the synonym map expects for
   * multi-word entries; confirm against {@link SynonymMap}.
   *
   * @param b builder receiving the rule
   * @param input text to match
   * @param output replacement text
   * @param keepOrig passed through unchanged to the builder
   */
  private void add(SynonymMap.Builder b, String input, String output, boolean keepOrig) {
    b.add(
        new CharsRef(input.replaceAll(" +", "\u0000")),
        new CharsRef(output.replaceAll(" +", "\u0000")),
        keepOrig);
  }

  /**
   * Blasts some random strings through an analyzer that chains a {@link MockTokenizer}, a {@link
   * SynonymGraphFilter} built from random rules, and the filter under test.
   */
  public void testRandomStrings() throws Exception {
    final int numIters = atLeast(3);
    for (int i = 0; i < numIters; i++) {
      SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean());
      final int numEntries = atLeast(10);
      for (int j = 0; j < numEntries; j++) {
        add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean());
      }
      final SynonymMap map = b.build();
      final boolean ignoreCase = random().nextBoolean();
      // try-with-resources so the analyzer is closed even when checkRandomData fails.
      try (Analyzer analyzer =
          new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
              Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
              TokenStream stream = new SynonymGraphFilter(tokenizer, map, ignoreCase);
              return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(stream));
            }
          }) {
        checkRandomData(random(), analyzer, 200);
      }
    }
  }

  /**
   * Runs empty input through a {@link KeywordTokenizer} followed by the filter and checks it via
   * {@code checkOneTerm} with an empty expected term.
   */
  public void testEmptyTerm() throws IOException {
    // try-with-resources so the analyzer is closed even when the assertion fails.
    try (Analyzer a =
        new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new KeywordTokenizer();
            return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(tokenizer));
          }
        }) {
      checkOneTerm(a, "", "");
    }
  }
}