/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.AutomatonTestUtil;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.fst.Util;
import static org.apache.lucene.util.automaton.Operations.DEFAULT_DETERMINIZE_WORK_LIMIT;
public class TestGraphTokenizers extends BaseTokenStreamTestCase {
// Makes a graph TokenStream from the string: separate
// positions with a single space, stack multiple tokens at the
// same position with '/', and append an optional position
// length with ':'. E.g. "a b c" is a simple chain, "a/x b c"
// adds 'x' over 'a' at position 0 with posLen=1, and
// "a/x:3 b c" adds 'x' over 'a' with posLen=3. Offsets are
// computed from the first token at each position.
// NOTE: every token must be a single character; the offset
// computation relies on this, so MockCharFilter cannot be
// enabled during random testing.
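// For example, "a/x:2 b" yields three tokens: 'a' (posInc=1,
// posLen=1, offsets 0-1), 'x' (posInc=0, posLen=2, offsets 0-3)
// and 'b' (posInc=1, posLen=1, offsets 2-3); each position spans
// two offset units (startOffset = 2*pos, endOffset = startOffset + 2*posLen - 1).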
public static final class GraphTokenizer extends Tokenizer {
private List<Token> tokens;
private int upto;
private int inputLength;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
@Override
public void reset() throws IOException {
super.reset();
tokens = null;
upto = 0;
}
@Override
public boolean incrementToken() throws IOException {
if (tokens == null) {
fillTokens();
}
//System.out.println("graphTokenizer: incr upto=" + upto + " vs " + tokens.size());
if (upto == tokens.size()) {
//System.out.println(" END @ " + tokens.size());
return false;
}
final Token t = tokens.get(upto++);
//System.out.println(" return token=" + t);
clearAttributes();
termAtt.append(t.toString());
offsetAtt.setOffset(t.startOffset(), t.endOffset());
posIncrAtt.setPositionIncrement(t.getPositionIncrement());
posLengthAtt.setPositionLength(t.getPositionLength());
return true;
}
@Override
public void end() throws IOException {
super.end();
// NOTE: somewhat hackish, but we need this to satisfy
// BaseTokenStreamTestCase's end-of-stream offset checks:
final int lastOffset;
if (tokens != null && !tokens.isEmpty()) {
lastOffset = tokens.get(tokens.size()-1).endOffset();
} else {
lastOffset = 0;
}
offsetAtt.setOffset(correctOffset(lastOffset),
correctOffset(inputLength));
}
private void fillTokens() throws IOException {
final StringBuilder sb = new StringBuilder();
final char[] buffer = new char[256];
while (true) {
final int count = input.read(buffer);
if (count == -1) {
break;
}
sb.append(buffer, 0, count);
//System.out.println("got count=" + count);
}
//System.out.println("fillTokens: " + sb);
inputLength = sb.length();
final String[] parts = sb.toString().split(" ");
tokens = new ArrayList<>();
int pos = 0;
int maxPos = -1;
int offset = 0;
//System.out.println("again");
for(String part : parts) {
final String[] overlapped = part.split("/");
boolean firstAtPos = true;
int minPosLength = Integer.MAX_VALUE;
for(String part2 : overlapped) {
final int colonIndex = part2.indexOf(':');
final String token;
final int posLength;
if (colonIndex != -1) {
token = part2.substring(0, colonIndex);
posLength = Integer.parseInt(part2.substring(1+colonIndex));
} else {
token = part2;
posLength = 1;
}
maxPos = Math.max(maxPos, pos + posLength);
minPosLength = Math.min(minPosLength, posLength);
final Token t = new Token(token, offset, offset + 2*posLength - 1);
t.setPositionLength(posLength);
t.setPositionIncrement(firstAtPos ? 1:0);
firstAtPos = false;
//System.out.println(" add token=" + t + " startOff=" + t.startOffset() + " endOff=" + t.endOffset());
tokens.add(t);
}
pos += minPosLength;
offset = 2 * pos;
}
assert maxPos <= pos: "malformed input string: posLength>1 tokens hang over the end";
}
}
public void testMockGraphTokenFilterBasic() throws Exception {
for(int iter=0;iter<10*RANDOM_MULTIPLIER;iter++) {
if (VERBOSE) {
System.out.println("\nTEST: iter=" + iter);
}
// Make a new analyzer each time, because MockGraphTokenFilter
// has a fixed seed:
final Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
final TokenStream t2 = new MockGraphTokenFilter(random(), t);
return new TokenStreamComponents(t, t2);
}
};
checkAnalysisConsistency(random(), a, false, "a b c d e f g h i j k");
}
}
public void testMockGraphTokenFilterOnGraphInput() throws Exception {
for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {
if (VERBOSE) {
System.out.println("\nTEST: iter=" + iter);
}
// Make a new analyzer each time, because MockGraphTokenFilter
// has a fixed seed:
final Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer t = new GraphTokenizer();
final TokenStream t2 = new MockGraphTokenFilter(random(), t);
return new TokenStreamComponents(t, t2);
}
};
checkAnalysisConsistency(random(), a, false, "a/x:3 c/y:2 d e f/z:4 g h i j k");
}
}
// Deletes every 'a' token, leaving a hole (position gap) in its place:
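// E.g. "x a b" becomes "x b" with 'b' carrying posInc=2; a deleted
// trailing 'a' folds its increment into the final posInc in end().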
private final static class RemoveATokens extends TokenFilter {
private int pendingPosInc;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
public RemoveATokens(TokenStream in) {
super(in);
}
@Override
public void reset() throws IOException {
super.reset();
pendingPosInc = 0;
}
@Override
public void end() throws IOException {
super.end();
posIncAtt.setPositionIncrement(pendingPosInc + posIncAtt.getPositionIncrement());
}
@Override
public boolean incrementToken() throws IOException {
while (true) {
final boolean gotOne = input.incrementToken();
if (!gotOne) {
return false;
} else if (termAtt.toString().equals("a")) {
pendingPosInc += posIncAtt.getPositionIncrement();
} else {
posIncAtt.setPositionIncrement(pendingPosInc + posIncAtt.getPositionIncrement());
pendingPosInc = 0;
return true;
}
}
}
}
public void testMockGraphTokenFilterBeforeHoles() throws Exception {
for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {
if (VERBOSE) {
System.out.println("\nTEST: iter=" + iter);
}
// Make a new analyzer each time, because MockGraphTokenFilter
// has a fixed seed:
final Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
final TokenStream t2 = new MockGraphTokenFilter(random(), t);
final TokenStream t3 = new RemoveATokens(t2);
return new TokenStreamComponents(t, t3);
}
};
Random random = random();
checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k");
checkAnalysisConsistency(random, a, false, "x y a b c d e f g h i j k");
checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a");
checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a x y");
}
}
public void testMockGraphTokenFilterAfterHoles() throws Exception {
for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {
if (VERBOSE) {
System.out.println("\nTEST: iter=" + iter);
}
// Make a new analyzer each time, because MockGraphTokenFilter
// has a fixed seed:
final Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
final TokenStream t2 = new RemoveATokens(t);
final TokenStream t3 = new MockGraphTokenFilter(random(), t2);
return new TokenStreamComponents(t, t3);
}
};
Random random = random();
checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k");
checkAnalysisConsistency(random, a, false, "x y a b c d e f g h i j k");
checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a");
checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a x y");
}
}
public void testMockGraphTokenFilterRandom() throws Exception {
for(int iter=0;iter<3*RANDOM_MULTIPLIER;iter++) {
if (VERBOSE) {
System.out.println("\nTEST: iter=" + iter);
}
// Make a new analyzer each time, because MockGraphTokenFilter
// has a fixed seed:
final Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
final TokenStream t2 = new MockGraphTokenFilter(random(), t);
return new TokenStreamComponents(t, t2);
}
};
Random random = random();
checkRandomData(random, a, 5, atLeast(100));
}
}
// Two stacked MockGraphTokenFilters:
public void testDoubleMockGraphTokenFilterRandom() throws Exception {
for(int iter=0;iter<3*RANDOM_MULTIPLIER;iter++) {
if (VERBOSE) {
System.out.println("\nTEST: iter=" + iter);
}
// Make a new analyzer each time, because MockGraphTokenFilter
// has a fixed seed:
final Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
final TokenStream t1 = new MockGraphTokenFilter(random(), t);
final TokenStream t2 = new MockGraphTokenFilter(random(), t1);
return new TokenStreamComponents(t, t2);
}
};
Random random = random();
checkRandomData(random, a, 5, atLeast(100));
}
}
public void testMockGraphTokenFilterBeforeHolesRandom() throws Exception {
for(int iter=0;iter<3*RANDOM_MULTIPLIER;iter++) {
if (VERBOSE) {
System.out.println("\nTEST: iter=" + iter);
}
// Make a new analyzer each time, because MockGraphTokenFilter
// has a fixed seed:
final Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
final TokenStream t1 = new MockGraphTokenFilter(random(), t);
final TokenStream t2 = new MockHoleInjectingTokenFilter(random(), t1);
return new TokenStreamComponents(t, t2);
}
};
Random random = random();
checkRandomData(random, a, 5, atLeast(100));
}
}
public void testMockGraphTokenFilterAfterHolesRandom() throws Exception {
for(int iter=0;iter<3*RANDOM_MULTIPLIER;iter++) {
if (VERBOSE) {
System.out.println("\nTEST: iter=" + iter);
}
// Make a new analyzer each time, because MockGraphTokenFilter
// has a fixed seed:
final Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
final TokenStream t1 = new MockHoleInjectingTokenFilter(random(), t);
final TokenStream t2 = new MockGraphTokenFilter(random(), t1);
return new TokenStreamComponents(t, t2);
}
};
Random random = random();
checkRandomData(random, a, 5, atLeast(100));
}
}
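// Builds a canned token with the given position increment and
// position length; offsets default to 0 when a test does not check them: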
private static Token token(String term, int posInc, int posLength) {
final Token t = new Token(term, 0, 0);
t.setPositionIncrement(posInc);
t.setPositionLength(posLength);
return t;
}
private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) {
final Token t = new Token(term, startOffset, endOffset);
t.setPositionIncrement(posInc);
t.setPositionLength(posLength);
return t;
}
public void testSingleToken() throws Exception {
final TokenStream ts = new CannedTokenStream(
new Token[] {
token("abc", 1, 1),
});
assertSameLanguage(s2a("abc"), ts);
}
public void testMultipleHoles() throws Exception {
final TokenStream ts = new CannedTokenStream(
new Token[] {
token("a", 1, 1),
token("b", 3, 1),
});
assertSameLanguage(join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b")), ts);
}
public void testSynOverMultipleHoles() throws Exception {
final TokenStream ts = new CannedTokenStream(
new Token[] {
token("a", 1, 1),
token("x", 0, 3),
token("b", 3, 1),
});
final Automaton a1 = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
final Automaton a2 = join(s2a("x"), SEP_A, s2a("b"));
assertSameLanguage(Operations.union(a1, a2), ts);
}
// For debugging: dumps an automaton to /x/tmp/out.dot (uncommenting
// this requires java.io.Writer/OutputStreamWriter/FileOutputStream imports).
/*
private static void toDot(Automaton a) throws IOException {
final String s = a.toDot();
Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot"));
w.write(s);
w.close();
System.out.println("TEST: saved to /x/tmp/out.dot");
}
*/
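// TokenStreamToAutomaton encodes a position boundary as a POS_SEP
// transition and a missing token (a hole) as a HOLE transition; these
// single-label automata let the tests splice those markers into the
// expected languages: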
private static final Automaton SEP_A = Automata.makeChar(TokenStreamToAutomaton.POS_SEP);
private static final Automaton HOLE_A = Automata.makeChar(TokenStreamToAutomaton.HOLE);
private Automaton join(String ... strings) {
List<Automaton> as = new ArrayList<>();
for(String s : strings) {
as.add(s2a(s));
as.add(SEP_A);
}
as.remove(as.size()-1);
return Operations.concatenate(as);
}
private Automaton join(Automaton ... as) {
return Operations.concatenate(Arrays.asList(as));
}
private Automaton s2a(String s) {
return Automata.makeString(s);
}
public void testTwoTokens() throws Exception {
final TokenStream ts = new CannedTokenStream(
new Token[] {
token("abc", 1, 1),
token("def", 1, 1),
});
assertSameLanguage(join("abc", "def"), ts);
}
public void testHole() throws Exception {
final TokenStream ts = new CannedTokenStream(
new Token[] {
token("abc", 1, 1),
token("def", 2, 1),
});
assertSameLanguage(join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def")), ts);
}
public void testOverlappedTokensSausage() throws Exception {
// Two tokens on top of each other (sausage):
final TokenStream ts = new CannedTokenStream(
new Token[] {
token("abc", 1, 1),
token("xyz", 0, 1)
});
final Automaton a1 = s2a("abc");
final Automaton a2 = s2a("xyz");
assertSameLanguage(Operations.union(a1, a2), ts);
}
public void testOverlappedTokensLattice() throws Exception {
final TokenStream ts = new CannedTokenStream(
new Token[] {
token("abc", 1, 1),
token("xyz", 0, 2),
token("def", 1, 1),
});
final Automaton a1 = s2a("xyz");
final Automaton a2 = join("abc", "def");
assertSameLanguage(Operations.union(a1, a2), ts);
}
public void testSynOverHole() throws Exception {
final TokenStream ts = new CannedTokenStream(
new Token[] {
token("a", 1, 1),
token("X", 0, 2),
token("b", 2, 1),
});
final Automaton a1 = Operations.union(join(s2a("a"), SEP_A, HOLE_A), s2a("X"));
final Automaton expected = Operations.concatenate(a1, join(SEP_A, s2a("b")));
assertSameLanguage(expected, ts);
}
public void testSynOverHole2() throws Exception {
final TokenStream ts = new CannedTokenStream(
new Token[] {
token("xyz", 1, 1),
token("abc", 0, 3),
token("def", 2, 1),
});
final Automaton expected = Operations.union(
join(s2a("xyz"), SEP_A, HOLE_A, SEP_A, s2a("def")), s2a("abc"));
assertSameLanguage(expected, ts);
}
public void testOverlappedTokensLattice2() throws Exception {
final TokenStream ts = new CannedTokenStream(
new Token[] {
token("abc", 1, 1),
token("xyz", 0, 3),
token("def", 1, 1),
token("ghi", 1, 1),
});
final Automaton a1 = s2a("xyz");
final Automaton a2 = join("abc", "def", "ghi");
assertSameLanguage(Operations.union(a1, a2), ts);
}
public void testToDot() throws Exception {
final TokenStream ts = new CannedTokenStream(new Token[] {token("abc", 1, 1, 0, 4)});
StringWriter w = new StringWriter();
new TokenStreamToDot("abcd", ts, new PrintWriter(w)).toDot();
assertTrue(w.toString().indexOf("abc / abcd") != -1);
}
public void testStartsWithHole() throws Exception {
final TokenStream ts = new CannedTokenStream(
new Token[] {
token("abc", 2, 1),
});
assertSameLanguage(join(HOLE_A, SEP_A, s2a("abc")), ts);
}
public void testEndsWithHole() throws Exception {
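// The extra (finalPosInc=1, finalOffset=0) arguments make end() report
// one more, empty position after "abc", i.e. a trailing hole: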
final TokenStream ts = new CannedTokenStream(1, 0,
new Token[] {
token("abc", 2, 1),
});
assertSameLanguage(join(HOLE_A, SEP_A, s2a("abc"), SEP_A, HOLE_A), ts);
}
public void testSynHangingOverEnd() throws Exception {
final TokenStream ts = new CannedTokenStream(
new Token[] {
token("a", 1, 1),
token("X", 0, 10),
});
assertSameLanguage(Operations.union(s2a("a"), s2a("X")), ts);
}
/** Returns all strings accepted by the automaton, with POS_SEP rendered as a space. */
private Set<String> toPathStrings(Automaton a) {
BytesRefBuilder scratchBytesRefBuilder = new BytesRefBuilder();
Set<String> paths = new HashSet<>();
for (IntsRef ir: AutomatonTestUtil.getFiniteStringsRecursive(a, -1)) {
paths.add(Util.toBytesRef(ir, scratchBytesRefBuilder).utf8ToString().replace((char) TokenStreamToAutomaton.POS_SEP, ' '));
}
return paths;
}
private void assertSameLanguage(Automaton expected, TokenStream ts) throws IOException {
assertSameLanguage(expected, new TokenStreamToAutomaton().toAutomaton(ts));
}
private void assertSameLanguage(Automaton expected, Automaton actual) {
Automaton expectedDet = Operations.determinize(Operations.removeDeadStates(expected), DEFAULT_DETERMINIZE_WORK_LIMIT);
Automaton actualDet = Operations.determinize(Operations.removeDeadStates(actual), DEFAULT_DETERMINIZE_WORK_LIMIT);
if (Operations.sameLanguage(expectedDet, actualDet) == false) {
Set<String> expectedPaths = toPathStrings(expectedDet);
Set<String> actualPaths = toPathStrings(actualDet);
StringBuilder b = new StringBuilder();
b.append("expected:\n");
for(String path : expectedPaths) {
b.append(" ");
b.append(path);
if (actualPaths.contains(path) == false) {
b.append(" [missing!]");
}
b.append('\n');
}
b.append("actual:\n");
for(String path : actualPaths) {
b.append(" ");
b.append(path);
if (expectedPaths.contains(path) == false) {
b.append(" [unexpected!]");
}
b.append('\n');
}
fail("accepted language is different:\n\n" + b.toString());
}
}
public void testTokenStreamGraphWithHoles() throws Exception {
final TokenStream ts = new CannedTokenStream(
new Token[] {
token("abc", 1, 1),
token("xyz", 1, 8),
token("def", 1, 1),
token("ghi", 1, 1),
});
assertSameLanguage(Operations.union(join(s2a("abc"), SEP_A, s2a("xyz")),
join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def"), SEP_A, s2a("ghi"))), ts);
}
}