blob: a08379c244d7f967ee6c252a727de7039df72dfa [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.automaton;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.fst.Util;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import static org.apache.lucene.util.automaton.Operations.DEFAULT_DETERMINIZE_WORK_LIMIT;
/**
* Test for {@link FiniteStringsIterator}.
*/
public class FiniteStringsIteratorTest extends LuceneTestCase {
public void testRandomFiniteStrings1() {
int numStrings = atLeast(100);
if (VERBOSE) {
System.out.println("TEST: numStrings=" + numStrings);
}
Set<IntsRef> strings = new HashSet<>();
List<Automaton> automata = new ArrayList<>();
IntsRefBuilder scratch = new IntsRefBuilder();
for(int i=0;i<numStrings;i++) {
String s = TestUtil.randomSimpleString(random(), 1, 200);
Util.toUTF32(s.toCharArray(), 0, s.length(), scratch);
if (strings.add(scratch.toIntsRef())) {
automata.add(Automata.makeString(s));
if (VERBOSE) {
System.out.println(" add string=" + s);
}
}
}
// TODO: we could sometimes use
// DaciukMihovAutomatonBuilder here
// TODO: what other random things can we do here...
Automaton a = Operations.union(automata);
if (random().nextBoolean()) {
a = MinimizationOperations.minimize(a, 1000000);
if (VERBOSE) {
System.out.println("TEST: a.minimize numStates=" + a.getNumStates());
}
} else if (random().nextBoolean()) {
if (VERBOSE) {
System.out.println("TEST: a.determinize");
}
a = Operations.determinize(a, 1000000);
} else if (random().nextBoolean()) {
if (VERBOSE) {
System.out.println("TEST: a.removeDeadStates");
}
a = Operations.removeDeadStates(a);
}
FiniteStringsIterator iterator = new FiniteStringsIterator(a);
List<IntsRef> actual = getFiniteStrings(iterator);
assertFiniteStringsRecursive(a, actual);
if (!strings.equals(new HashSet<>(actual))) {
System.out.println("strings.size()=" + strings.size() + " actual.size=" + actual.size());
List<IntsRef> x = new ArrayList<>(strings);
Collections.sort(x);
List<IntsRef> y = new ArrayList<>(actual);
Collections.sort(y);
int end = Math.min(x.size(), y.size());
for(int i=0;i<end;i++) {
System.out.println(" i=" + i + " string=" + toString(x.get(i)) + " actual=" + toString(y.get(i)));
}
fail("wrong strings found");
}
}
/**
* Basic test for getFiniteStrings
*/
public void testFiniteStringsBasic() {
Automaton a = Operations.union(Automata.makeString("dog"), Automata.makeString("duck"));
a = MinimizationOperations.minimize(a, DEFAULT_DETERMINIZE_WORK_LIMIT);
FiniteStringsIterator iterator = new FiniteStringsIterator(a);
List<IntsRef> actual = getFiniteStrings(iterator);
assertFiniteStringsRecursive(a, actual);
assertEquals(2, actual.size());
IntsRefBuilder dog = new IntsRefBuilder();
Util.toIntsRef(new BytesRef("dog"), dog);
assertTrue(actual.contains(dog.get()));
IntsRefBuilder duck = new IntsRefBuilder();
Util.toIntsRef(new BytesRef("duck"), duck);
assertTrue(actual.contains(duck.get()));
}
public void testFiniteStringsEatsStack() {
char[] chars = new char[50000];
TestUtil.randomFixedLengthUnicodeString(random(), chars, 0, chars.length);
String bigString1 = new String(chars);
TestUtil.randomFixedLengthUnicodeString(random(), chars, 0, chars.length);
String bigString2 = new String(chars);
Automaton a = Operations.union(Automata.makeString(bigString1), Automata.makeString(bigString2));
FiniteStringsIterator iterator = new FiniteStringsIterator(a);
List<IntsRef> actual = getFiniteStrings(iterator);
assertEquals(2, actual.size());
IntsRefBuilder scratch = new IntsRefBuilder();
Util.toUTF32(bigString1.toCharArray(), 0, bigString1.length(), scratch);
assertTrue(actual.contains(scratch.get()));
Util.toUTF32(bigString2.toCharArray(), 0, bigString2.length(), scratch);
assertTrue(actual.contains(scratch.get()));
}
public void testWithCycle() throws Exception {
expectThrows(IllegalArgumentException.class, () -> {
Automaton a = new RegExp("abc.*", RegExp.NONE).toAutomaton();
FiniteStringsIterator iterator = new FiniteStringsIterator(a);
getFiniteStrings(iterator);
});
}
public void testSingletonNoLimit() {
Automaton a = Automata.makeString("foobar");
FiniteStringsIterator iterator = new FiniteStringsIterator(a);
List<IntsRef> actual = getFiniteStrings(iterator);
assertEquals(1, actual.size());
IntsRefBuilder scratch = new IntsRefBuilder();
Util.toUTF32("foobar".toCharArray(), 0, 6, scratch);
assertTrue(actual.contains(scratch.get()));
}
public void testShortAccept() {
Automaton a = Operations.union(Automata.makeString("x"), Automata.makeString("xy"));
a = MinimizationOperations.minimize(a, DEFAULT_DETERMINIZE_WORK_LIMIT);
FiniteStringsIterator iterator = new FiniteStringsIterator(a);
List<IntsRef> actual = getFiniteStrings(iterator);
assertEquals(2, actual.size());
IntsRefBuilder x = new IntsRefBuilder();
Util.toIntsRef(new BytesRef("x"), x);
assertTrue(actual.contains(x.get()));
IntsRefBuilder xy = new IntsRefBuilder();
Util.toIntsRef(new BytesRef("xy"), xy);
assertTrue(actual.contains(xy.get()));
}
public void testSingleString() {
Automaton a = new Automaton();
int start = a.createState();
int end = a.createState();
a.setAccept(end, true);
a.addTransition(start, end, 'a', 'a');
a.finishState();
Set<IntsRef> accepted = TestOperations.getFiniteStrings(a);
assertEquals(1, accepted.size());
IntsRefBuilder intsRef = new IntsRefBuilder();
intsRef.append('a');
assertTrue(accepted.contains(intsRef.toIntsRef()));
}
/**
* All strings generated by the iterator.
*/
static List<IntsRef> getFiniteStrings(FiniteStringsIterator iterator) {
List<IntsRef> result = new ArrayList<>();
for (IntsRef finiteString; (finiteString = iterator.next()) != null;) {
result.add(IntsRef.deepCopyOf(finiteString));
}
return result;
}
/**
* Check that strings the automaton returns are as expected.
*
* @param automaton Automaton.
* @param actual Strings generated by automaton.
*/
private void assertFiniteStringsRecursive(Automaton automaton, List<IntsRef> actual) {
Set<IntsRef> expected = AutomatonTestUtil.getFiniteStringsRecursive(automaton, -1);
// Check that no string is emitted twice.
assertEquals(expected.size(), actual.size());
assertEquals(expected, new HashSet<>(actual));
}
// ascii only!
private static String toString(IntsRef ints) {
BytesRef br = new BytesRef(ints.length);
for(int i=0;i<ints.length;i++) {
br.bytes[i] = (byte) ints.ints[i];
}
br.length = ints.length;
return br.utf8ToString();
}
}