blob: 978ade5d5355923b9091ca6e6ed1717159f6a083 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.shingle;
import java.io.IOException;
import java.util.Iterator;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings;
/**
 * Tests for {@link FixedShingleFilter}: fixed-size shingles over plain token streams, streams
 * with position gaps, streams with trailing gaps, and streams carrying stacked / graph tokens.
 *
 * <p>Every fixture is a {@link CannedTokenStream} of hand-built {@link Token}s, so the expected
 * terms, offsets, types, position increments and position lengths listed in each test are exact.
 */
public class FixedShingleFilterTest extends BaseTokenStreamTestCase {

  /**
   * Builds a fresh six-token stream for "please divide this sentence into shingles".
   *
   * <p>A new stream (with new tokens) is returned on every call so tests never share state.
   */
  private static TokenStream sentenceStream() {
    return new CannedTokenStream(
        new Token("please", 0, 6),
        new Token("divide", 7, 13),
        new Token("this", 14, 18),
        new Token("sentence", 19, 27),
        new Token("into", 28, 32),
        new Token("shingles", 33, 41)
    );
  }

  /**
   * Builds a fresh stream for the graph {@code b/a c b/a d}: each "a" is created with a second
   * constructor argument of 0 and the same offsets as the preceding "b".
   */
  private static TokenStream stackedAlternativesStream() {
    // b/a c b/a d
    return new CannedTokenStream(
        new Token("b", 0, 1),
        new Token("a", 0, 0, 1),
        new Token("c", 2, 3),
        new Token("b", 4, 5),
        new Token("a", 0, 4, 5),
        new Token("d", 6, 7)
    );
  }

  /** Size-2 shingles over a plain sentence, checking terms, offsets, types, posInc and posLen. */
  public void testBiGramFilter() throws IOException {
    assertTokenStreamContents(new FixedShingleFilter(sentenceStream(), 2),
        new String[]{"please divide", "divide this", "this sentence", "sentence into", "into shingles"},
        new int[]{0, 7, 14, 19, 28,},
        new int[]{13, 18, 27, 32, 41,},
        new String[]{"shingle", "shingle", "shingle", "shingle", "shingle",},
        new int[]{1, 1, 1, 1, 1,},
        new int[]{1, 1, 1, 1, 1});
  }

  /**
   * Size-2 shingles built with the four-argument constructor; the expected terms show that
   * {@code "<SEP>"} is used to join the words.
   */
  public void testBiGramFilterWithAltSeparator() throws IOException {
    // NOTE(review): the trailing "_" argument is presumably the filler token - confirm against
    // the FixedShingleFilter constructor; no filler is produced by this gap-free input.
    assertTokenStreamContents(new FixedShingleFilter(sentenceStream(), 2, "<SEP>", "_"),
        new String[]{"please<SEP>divide", "divide<SEP>this", "this<SEP>sentence", "sentence<SEP>into", "into<SEP>shingles"},
        new int[]{0, 7, 14, 19, 28},
        new int[]{13, 18, 27, 32, 41},
        new String[]{"shingle", "shingle", "shingle", "shingle", "shingle"},
        new int[]{1, 1, 1, 1, 1});
  }

  /** Size-3 shingles over a plain sentence; only the emitted terms are checked. */
  public void testTriGramFilter() throws IOException {
    assertTokenStreamContents(new FixedShingleFilter(sentenceStream(), 3),
        new String[]{"please divide this", "divide this sentence", "this sentence into", "sentence into shingles"});
  }

  /** A two-token stream cannot fill a size-3 shingle, so the filter must emit nothing. */
  public void testShingleSizeGreaterThanTokenstreamLength() throws IOException {
    // Follow the full consumer workflow (reset, incrementToken, end, close) so the stream is
    // always finished and released, even if the assertion fails.
    try (TokenStream ts = new FixedShingleFilter(new CannedTokenStream(
        new Token("please", 0, 6),
        new Token("divide", 7, 13)
    ), 3)) {
      ts.reset();
      assertFalse(ts.incrementToken());
      ts.end();
    }
  }

  /**
   * Tokens created with a second constructor argument of 2 leave a one-position gap before them;
   * the expected shingles show each gap rendered as {@code "_"}.
   */
  public void testWithStopwords() throws IOException {
    TokenStream ts = new CannedTokenStream(
        new Token("please", 0, 6),
        new Token("divide", 7, 13),
        new Token("sentence", 2, 19, 27),
        new Token("shingles", 2, 33, 41)
    );
    assertTokenStreamContents(new FixedShingleFilter(ts, 3),
        new String[]{"please divide _", "divide _ sentence", "sentence _ shingles"},
        new int[]{0, 7, 19,},
        new int[]{13, 27, 41,},
        new String[]{"shingle", "shingle", "shingle",},
        new int[]{1, 1, 2,});
  }

  /**
   * A leading gap plus a two-position gap in the middle of the stream; size-4 shingles are
   * expected to contain up to two consecutive {@code "_"} fillers.
   */
  public void testConsecutiveStopwords() throws IOException {
    TokenStream ts = new CannedTokenStream(
        new Token("b", 2, 2, 3),
        new Token("c", 4, 5),
        new Token("d", 6, 7),
        new Token("b", 3, 12, 13),
        new Token("c", 14, 15)
    );
    assertTokenStreamContents(new FixedShingleFilter(ts, 4),
        new String[]{"b c d _", "c d _ _", "d _ _ b"},
        new int[]{2, 4, 6,},
        new int[]{7, 7, 13,},
        new int[]{2, 1, 1,});
  }

  /** A stream that ends with one extra position yields a final shingle padded with one filler. */
  public void testTrailingStopwords() throws IOException {
    // NOTE(review): the leading (1, 7) arguments are presumably the final position increment and
    // final offset of the canned stream - confirm against CannedTokenStream.
    TokenStream ts = new CannedTokenStream(1, 7,
        new Token("b", 0, 1),
        new Token("c", 2, 3),
        new Token("d", 4, 5)
    );
    assertTokenStreamContents(new FixedShingleFilter(ts, 3),
        new String[] { "b c d", "c d _" },
        new int[] { 0, 2, },
        new int[] { 5, 5, },
        new int[] { 1, 1, });
  }

  /** As {@link #testTrailingStopwords()} but with two trailing positions, giving two fillers. */
  public void testMultipleTrailingStopwords() throws IOException {
    TokenStream ts = new CannedTokenStream(2, 9,
        new Token("b", 0, 1),
        new Token("c", 2, 3),
        new Token("d", 4, 5)
    );
    assertTokenStreamContents(new FixedShingleFilter(ts, 3),
        new String[] { "b c d", "c d _", "d _ _" },
        new int[] { 0, 2, 4 },
        new int[] { 5, 5, 5 },
        new int[] { 1, 1, 1 });
  }

  /** Size-2 shingles over stacked alternatives: one shingle is expected per alternative. */
  public void testIncomingGraphs() throws IOException {
    assertTokenStreamContents(new FixedShingleFilter(stackedAlternativesStream(), 2),
        new String[] { "b c", "a c", "c b", "c a", "b d", "a d" },
        new int[] { 0, 0, 2, 2, 4, 4 },
        new int[] { 3, 3, 5, 5, 7, 7 },
        new int[] { 1, 0, 1, 0, 1, 0 });
  }

  /** Size-3 shingles that cross two sets of stacked alternatives: all combinations expected. */
  public void testShinglesSpanningGraphs() throws IOException {
    assertTokenStreamContents(new FixedShingleFilter(stackedAlternativesStream(), 3),
        new String[] { "b c b", "b c a", "a c b", "a c a", "c b d", "c a d" },
        new int[] { 0, 0, 0, 0, 2, 2, },
        new int[] { 5, 5, 5, 5, 7, 7, },
        new int[] { 1, 0, 0, 0, 1, 0, });
  }

  /** Alternatives of different lengths ("b" is built with a fifth argument of 3) before "f". */
  public void testTrailingGraphsOfDifferingLengths() throws IOException {
    // a b:3/c d e f
    TokenStream ts = new CannedTokenStream(
        new Token("a", 0, 1),
        new Token("b", 1, 2, 3, 3),
        new Token("c", 0, 2, 3),
        new Token("d", 2, 3),
        new Token("e", 2, 3),
        new Token("f", 4, 5)
    );
    assertTokenStreamContents(new FixedShingleFilter(ts, 3),
        new String[]{ "a b f", "a c d", "c d e", "d e f"});
  }

  /** Shingle sizes 1 and 5 must be rejected with the exact out-of-range message. */
  public void testParameterLimits() {
    IllegalArgumentException tooSmall = expectThrows(IllegalArgumentException.class, () -> {
      new FixedShingleFilter(new CannedTokenStream(), 1);
    });
    assertEquals("Shingle size must be between 2 and 4, got 1", tooSmall.getMessage());

    IllegalArgumentException tooLarge = expectThrows(IllegalArgumentException.class, () -> {
      new FixedShingleFilter(new CannedTokenStream(), 5);
    });
    assertEquals("Shingle size must be between 2 and 4, got 5", tooLarge.getMessage());
  }

  /**
   * Shingles each stream produced by {@link GraphTokenStreamFiniteStrings#getFiniteStrings()}
   * separately, in the order the iterator returns them.
   */
  public void testWithGraphInput() throws IOException {
    TokenStream ts = new CannedTokenStream(
        new Token("fuz", 0, 3),
        new Token("foo", 1, 4, 6, 2),
        new Token("bar", 0, 4, 6),
        new Token("baz", 1, 4, 6)
    );
    GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(ts);
    Iterator<TokenStream> paths = graph.getFiniteStrings();

    // First stream from the iterator: expected to give the single shingle "fuz foo".
    assertTokenStreamContents(new FixedShingleFilter(paths.next(), 2), new String[]{ "fuz foo"});
    // Second stream from the iterator: expected to give "fuz bar" then "bar baz".
    assertTokenStreamContents(new FixedShingleFilter(paths.next(), 2), new String[]{ "fuz bar", "bar baz"});
  }
}