| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.util.automaton; |
| |
| |
| import java.util.*; |
| |
| import org.apache.lucene.util.ArrayUtil; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.CharsRef; |
| import org.apache.lucene.util.UnicodeUtil; |
| |
| /** |
| * Builds a minimal, deterministic {@link Automaton} that accepts a set of |
| * strings. The algorithm requires sorted input data, but is very fast |
| * (nearly linear with the input size). |
| * |
| * @see #build(Collection) |
| * @see Automata#makeStringUnion(Collection) |
| */ |
| public final class DaciukMihovAutomatonBuilder { |
| |
| /** |
| * This builder rejects terms that are more than 1k chars long since it then |
| * uses recursion based on the length of the string, which might cause stack |
| * overflows. |
| */ |
| static final int MAX_TERM_LENGTH = 1_000; |
| |
| /** |
| * The default constructor is private. Use static methods directly. |
| */ |
| private DaciukMihovAutomatonBuilder() { |
| super(); |
| } |
| |
| /** |
| * DFSA state with <code>char</code> labels on transitions. |
| */ |
| private final static class State { |
| |
| /** An empty set of labels. */ |
| private final static int[] NO_LABELS = new int[0]; |
| |
| /** An empty set of states. */ |
| private final static State[] NO_STATES = new State[0]; |
| |
| /** |
| * Labels of outgoing transitions. Indexed identically to {@link #states}. |
| * Labels must be sorted lexicographically. |
| */ |
| int[] labels = NO_LABELS; |
| |
| /** |
| * States reachable from outgoing transitions. Indexed identically to |
| * {@link #labels}. |
| */ |
| State[] states = NO_STATES; |
| |
| /** |
| * <code>true</code> if this state corresponds to the end of at least one |
| * input sequence. |
| */ |
| boolean is_final; |
| |
| /** |
| * Returns the target state of a transition leaving this state and labeled |
| * with <code>label</code>. If no such transition exists, returns |
| * <code>null</code>. |
| */ |
| State getState(int label) { |
| final int index = Arrays.binarySearch(labels, label); |
| return index >= 0 ? states[index] : null; |
| } |
| |
| /** |
| * Two states are equal if: |
| * <ul> |
| * <li>they have an identical number of outgoing transitions, labeled with |
| * the same labels</li> |
| * <li>corresponding outgoing transitions lead to the same states (to states |
| * with an identical right-language). |
| * </ul> |
| */ |
| @Override |
| public boolean equals(Object obj) { |
| final State other = (State) obj; |
| return is_final == other.is_final |
| && Arrays.equals(this.labels, other.labels) |
| && referenceEquals(this.states, other.states); |
| } |
| |
| /** |
| * Compute the hash code of the <i>current</i> status of this state. |
| */ |
| @Override |
| public int hashCode() { |
| int hash = is_final ? 1 : 0; |
| |
| hash ^= hash * 31 + this.labels.length; |
| for (int c : this.labels) |
| hash ^= hash * 31 + c; |
| |
| /* |
| * Compare the right-language of this state using reference-identity of |
| * outgoing states. This is possible because states are interned (stored |
| * in registry) and traversed in post-order, so any outgoing transitions |
| * are already interned. |
| */ |
| for (State s : this.states) { |
| hash ^= System.identityHashCode(s); |
| } |
| |
| return hash; |
| } |
| |
| /** |
| * Return <code>true</code> if this state has any children (outgoing |
| * transitions). |
| */ |
| boolean hasChildren() { |
| return labels.length > 0; |
| } |
| |
| /** |
| * Create a new outgoing transition labeled <code>label</code> and return |
| * the newly created target state for this transition. |
| */ |
| State newState(int label) { |
| assert Arrays.binarySearch(labels, label) < 0 : "State already has transition labeled: " |
| + label; |
| |
| labels = ArrayUtil.growExact(labels, labels.length + 1); |
| states = ArrayUtil.growExact(states, states.length + 1); |
| |
| labels[labels.length - 1] = label; |
| return states[states.length - 1] = new State(); |
| } |
| |
| /** |
| * Return the most recent transitions's target state. |
| */ |
| State lastChild() { |
| assert hasChildren() : "No outgoing transitions."; |
| return states[states.length - 1]; |
| } |
| |
| /** |
| * Return the associated state if the most recent transition is labeled with |
| * <code>label</code>. |
| */ |
| State lastChild(int label) { |
| final int index = labels.length - 1; |
| State s = null; |
| if (index >= 0 && labels[index] == label) { |
| s = states[index]; |
| } |
| assert s == getState(label); |
| return s; |
| } |
| |
| /** |
| * Replace the last added outgoing transition's target state with the given |
| * state. |
| */ |
| void replaceLastChild(State state) { |
| assert hasChildren() : "No outgoing transitions."; |
| states[states.length - 1] = state; |
| } |
| |
| /** |
| * Compare two lists of objects for reference-equality. |
| */ |
| private static boolean referenceEquals(Object[] a1, Object[] a2) { |
| if (a1.length != a2.length) { |
| return false; |
| } |
| |
| for (int i = 0; i < a1.length; i++) { |
| if (a1[i] != a2[i]) { |
| return false; |
| } |
| } |
| |
| return true; |
| } |
| } |
| |
| /** |
| * A "registry" for state interning. |
| */ |
| private HashMap<State,State> stateRegistry = new HashMap<>(); |
| |
| /** |
| * Root automaton state. |
| */ |
| private State root = new State(); |
| |
| /** |
| * Previous sequence added to the automaton in {@link #add(CharsRef)}. |
| */ |
| private CharsRef previous; |
| |
| /** |
| * A comparator used for enforcing sorted UTF8 order, used in assertions only. |
| */ |
| @SuppressWarnings("deprecation") |
| private static final Comparator<CharsRef> comparator = CharsRef.getUTF16SortedAsUTF8Comparator(); |
| |
| /** |
| * Add another character sequence to this automaton. The sequence must be |
| * lexicographically larger or equal compared to any previous sequences added |
| * to this automaton (the input must be sorted). |
| */ |
| public void add(CharsRef current) { |
| if (current.length > MAX_TERM_LENGTH) { |
| throw new IllegalArgumentException("This builder doesn't allow terms that are larger than 1,000 characters, got " + current); |
| } |
| assert stateRegistry != null : "Automaton already built."; |
| assert previous == null |
| || comparator.compare(previous, current) <= 0 : "Input must be in sorted UTF-8 order: " |
| + previous + " >= " + current; |
| assert setPrevious(current); |
| |
| // Descend in the automaton (find matching prefix). |
| int pos = 0, max = current.length(); |
| State next, state = root; |
| while (pos < max && (next = state.lastChild(Character.codePointAt(current, pos))) != null) { |
| state = next; |
| // todo, optimize me |
| pos += Character.charCount(Character.codePointAt(current, pos)); |
| } |
| |
| if (state.hasChildren()) replaceOrRegister(state); |
| |
| addSuffix(state, current, pos); |
| } |
| |
| /** |
| * Finalize the automaton and return the root state. No more strings can be |
| * added to the builder after this call. |
| * |
| * @return Root automaton state. |
| */ |
| public State complete() { |
| if (this.stateRegistry == null) throw new IllegalStateException(); |
| |
| if (root.hasChildren()) replaceOrRegister(root); |
| |
| stateRegistry = null; |
| return root; |
| } |
| |
| /** |
| * Internal recursive traversal for conversion. |
| */ |
| private static int convert(Automaton.Builder a, State s, |
| IdentityHashMap<State,Integer> visited) { |
| |
| Integer converted = visited.get(s); |
| if (converted != null) { |
| return converted; |
| } |
| |
| converted = a.createState(); |
| a.setAccept(converted, s.is_final); |
| |
| visited.put(s, converted); |
| int i = 0; |
| int[] labels = s.labels; |
| for (DaciukMihovAutomatonBuilder.State target : s.states) { |
| a.addTransition(converted, convert(a, target, visited), labels[i++]); |
| } |
| |
| return converted; |
| } |
| |
| /** |
| * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing |
| * strings in UTF-8. These strings must be binary-sorted. |
| */ |
| public static Automaton build(Collection<BytesRef> input) { |
| final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder(); |
| |
| char[] chars = new char[0]; |
| CharsRef ref = new CharsRef(); |
| for (BytesRef b : input) { |
| chars = ArrayUtil.grow(chars, b.length); |
| final int len = UnicodeUtil.UTF8toUTF16(b, chars); |
| ref.chars = chars; |
| ref.length = len; |
| builder.add(ref); |
| } |
| |
| Automaton.Builder a = new Automaton.Builder(); |
| convert(a, |
| builder.complete(), |
| new IdentityHashMap<State,Integer>()); |
| |
| return a.finish(); |
| } |
| |
| /** |
| * Copy <code>current</code> into an internal buffer. |
| */ |
| private boolean setPrevious(CharsRef current) { |
| // don't need to copy, once we fix https://issues.apache.org/jira/browse/LUCENE-3277 |
| // still, called only from assert |
| previous = CharsRef.deepCopyOf(current); |
| return true; |
| } |
| |
| /** |
| * Replace last child of <code>state</code> with an already registered state |
| * or stateRegistry the last child state. |
| */ |
| private void replaceOrRegister(State state) { |
| final State child = state.lastChild(); |
| |
| if (child.hasChildren()) replaceOrRegister(child); |
| |
| final State registered = stateRegistry.get(child); |
| if (registered != null) { |
| state.replaceLastChild(registered); |
| } else { |
| stateRegistry.put(child, child); |
| } |
| } |
| |
| /** |
| * Add a suffix of <code>current</code> starting at <code>fromIndex</code> |
| * (inclusive) to state <code>state</code>. |
| */ |
| private void addSuffix(State state, CharSequence current, int fromIndex) { |
| final int len = current.length(); |
| while (fromIndex < len) { |
| int cp = Character.codePointAt(current, fromIndex); |
| state = state.newState(cp); |
| fromIndex += Character.charCount(cp); |
| } |
| state.is_final = true; |
| } |
| } |