| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.util.automaton; |
| |
| |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.IntsRef; |
| import org.apache.lucene.util.IntsRefBuilder; |
| import org.apache.lucene.util.LuceneTestCase; |
| import org.apache.lucene.util.TestUtil; |
| import org.apache.lucene.util.fst.Util; |
| |
| import java.util.ArrayList; |
| import java.util.Collections; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Set; |
| |
| import static org.apache.lucene.util.automaton.Operations.DEFAULT_DETERMINIZE_WORK_LIMIT; |
| |
| /** |
| * Test for {@link FiniteStringsIterator}. |
| */ |
| public class FiniteStringsIteratorTest extends LuceneTestCase { |
| public void testRandomFiniteStrings1() { |
| int numStrings = atLeast(100); |
| if (VERBOSE) { |
| System.out.println("TEST: numStrings=" + numStrings); |
| } |
| |
| Set<IntsRef> strings = new HashSet<>(); |
| List<Automaton> automata = new ArrayList<>(); |
| IntsRefBuilder scratch = new IntsRefBuilder(); |
| for(int i=0;i<numStrings;i++) { |
| String s = TestUtil.randomSimpleString(random(), 1, 200); |
| Util.toUTF32(s.toCharArray(), 0, s.length(), scratch); |
| if (strings.add(scratch.toIntsRef())) { |
| automata.add(Automata.makeString(s)); |
| if (VERBOSE) { |
| System.out.println(" add string=" + s); |
| } |
| } |
| } |
| |
| // TODO: we could sometimes use |
| // DaciukMihovAutomatonBuilder here |
| |
| // TODO: what other random things can we do here... |
| Automaton a = Operations.union(automata); |
| if (random().nextBoolean()) { |
| a = MinimizationOperations.minimize(a, 1000000); |
| if (VERBOSE) { |
| System.out.println("TEST: a.minimize numStates=" + a.getNumStates()); |
| } |
| } else if (random().nextBoolean()) { |
| if (VERBOSE) { |
| System.out.println("TEST: a.determinize"); |
| } |
| a = Operations.determinize(a, 1000000); |
| } else if (random().nextBoolean()) { |
| if (VERBOSE) { |
| System.out.println("TEST: a.removeDeadStates"); |
| } |
| a = Operations.removeDeadStates(a); |
| } |
| |
| FiniteStringsIterator iterator = new FiniteStringsIterator(a); |
| List<IntsRef> actual = getFiniteStrings(iterator); |
| assertFiniteStringsRecursive(a, actual); |
| |
| if (!strings.equals(new HashSet<>(actual))) { |
| System.out.println("strings.size()=" + strings.size() + " actual.size=" + actual.size()); |
| List<IntsRef> x = new ArrayList<>(strings); |
| Collections.sort(x); |
| List<IntsRef> y = new ArrayList<>(actual); |
| Collections.sort(y); |
| int end = Math.min(x.size(), y.size()); |
| for(int i=0;i<end;i++) { |
| System.out.println(" i=" + i + " string=" + toString(x.get(i)) + " actual=" + toString(y.get(i))); |
| } |
| fail("wrong strings found"); |
| } |
| } |
| |
| /** |
| * Basic test for getFiniteStrings |
| */ |
| public void testFiniteStringsBasic() { |
| Automaton a = Operations.union(Automata.makeString("dog"), Automata.makeString("duck")); |
| a = MinimizationOperations.minimize(a, DEFAULT_DETERMINIZE_WORK_LIMIT); |
| FiniteStringsIterator iterator = new FiniteStringsIterator(a); |
| List<IntsRef> actual = getFiniteStrings(iterator); |
| assertFiniteStringsRecursive(a, actual); |
| assertEquals(2, actual.size()); |
| IntsRefBuilder dog = new IntsRefBuilder(); |
| Util.toIntsRef(new BytesRef("dog"), dog); |
| assertTrue(actual.contains(dog.get())); |
| IntsRefBuilder duck = new IntsRefBuilder(); |
| Util.toIntsRef(new BytesRef("duck"), duck); |
| assertTrue(actual.contains(duck.get())); |
| } |
| |
| public void testFiniteStringsEatsStack() { |
| char[] chars = new char[50000]; |
| TestUtil.randomFixedLengthUnicodeString(random(), chars, 0, chars.length); |
| String bigString1 = new String(chars); |
| TestUtil.randomFixedLengthUnicodeString(random(), chars, 0, chars.length); |
| String bigString2 = new String(chars); |
| Automaton a = Operations.union(Automata.makeString(bigString1), Automata.makeString(bigString2)); |
| FiniteStringsIterator iterator = new FiniteStringsIterator(a); |
| List<IntsRef> actual = getFiniteStrings(iterator); |
| assertEquals(2, actual.size()); |
| IntsRefBuilder scratch = new IntsRefBuilder(); |
| Util.toUTF32(bigString1.toCharArray(), 0, bigString1.length(), scratch); |
| assertTrue(actual.contains(scratch.get())); |
| Util.toUTF32(bigString2.toCharArray(), 0, bigString2.length(), scratch); |
| assertTrue(actual.contains(scratch.get())); |
| } |
| |
| |
| public void testWithCycle() throws Exception { |
| expectThrows(IllegalArgumentException.class, () -> { |
| Automaton a = new RegExp("abc.*", RegExp.NONE).toAutomaton(); |
| FiniteStringsIterator iterator = new FiniteStringsIterator(a); |
| getFiniteStrings(iterator); |
| }); |
| } |
| |
| public void testSingletonNoLimit() { |
| Automaton a = Automata.makeString("foobar"); |
| FiniteStringsIterator iterator = new FiniteStringsIterator(a); |
| List<IntsRef> actual = getFiniteStrings(iterator); |
| assertEquals(1, actual.size()); |
| IntsRefBuilder scratch = new IntsRefBuilder(); |
| Util.toUTF32("foobar".toCharArray(), 0, 6, scratch); |
| assertTrue(actual.contains(scratch.get())); |
| } |
| |
| public void testShortAccept() { |
| Automaton a = Operations.union(Automata.makeString("x"), Automata.makeString("xy")); |
| a = MinimizationOperations.minimize(a, DEFAULT_DETERMINIZE_WORK_LIMIT); |
| FiniteStringsIterator iterator = new FiniteStringsIterator(a); |
| List<IntsRef> actual = getFiniteStrings(iterator); |
| assertEquals(2, actual.size()); |
| IntsRefBuilder x = new IntsRefBuilder(); |
| Util.toIntsRef(new BytesRef("x"), x); |
| assertTrue(actual.contains(x.get())); |
| IntsRefBuilder xy = new IntsRefBuilder(); |
| Util.toIntsRef(new BytesRef("xy"), xy); |
| assertTrue(actual.contains(xy.get())); |
| } |
| |
| public void testSingleString() { |
| Automaton a = new Automaton(); |
| int start = a.createState(); |
| int end = a.createState(); |
| a.setAccept(end, true); |
| a.addTransition(start, end, 'a', 'a'); |
| a.finishState(); |
| Set<IntsRef> accepted = TestOperations.getFiniteStrings(a); |
| assertEquals(1, accepted.size()); |
| IntsRefBuilder intsRef = new IntsRefBuilder(); |
| intsRef.append('a'); |
| assertTrue(accepted.contains(intsRef.toIntsRef())); |
| } |
| |
| /** |
| * All strings generated by the iterator. |
| */ |
| static List<IntsRef> getFiniteStrings(FiniteStringsIterator iterator) { |
| List<IntsRef> result = new ArrayList<>(); |
| for (IntsRef finiteString; (finiteString = iterator.next()) != null;) { |
| result.add(IntsRef.deepCopyOf(finiteString)); |
| } |
| |
| return result; |
| } |
| |
| /** |
| * Check that strings the automaton returns are as expected. |
| * |
| * @param automaton Automaton. |
| * @param actual Strings generated by automaton. |
| */ |
| private void assertFiniteStringsRecursive(Automaton automaton, List<IntsRef> actual) { |
| Set<IntsRef> expected = AutomatonTestUtil.getFiniteStringsRecursive(automaton, -1); |
| // Check that no string is emitted twice. |
| assertEquals(expected.size(), actual.size()); |
| assertEquals(expected, new HashSet<>(actual)); |
| } |
| |
| // ascii only! |
| private static String toString(IntsRef ints) { |
| BytesRef br = new BytesRef(ints.length); |
| for(int i=0;i<ints.length;i++) { |
| br.bytes[i] = (byte) ints.ints[i]; |
| } |
| br.length = ints.length; |
| return br.utf8ToString(); |
| } |
| } |