| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.util.automaton; |
| |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.List; |
| |
| import org.apache.lucene.index.SingleTermsEnum; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.index.Terms; |
| import org.apache.lucene.index.TermsEnum; |
| import org.apache.lucene.search.Query; |
| import org.apache.lucene.search.QueryVisitor; |
| import org.apache.lucene.util.Accountable; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.BytesRefBuilder; |
| import org.apache.lucene.util.IntsRef; |
| import org.apache.lucene.util.RamUsageEstimator; |
| import org.apache.lucene.util.StringHelper; |
| import org.apache.lucene.util.UnicodeUtil; |
| |
| /** |
| * Immutable class holding compiled details for a given |
| * Automaton. The Automaton is deterministic, must not have |
| * dead states but is not necessarily minimal. |
| * |
| * @lucene.experimental |
| */ |
| public class CompiledAutomaton implements Accountable { |
| private static final long BASE_RAM_BYTES = RamUsageEstimator.shallowSizeOfInstance(CompiledAutomaton.class); |
| |
| /** |
| * Automata are compiled into different internal forms for the |
| * most efficient execution depending upon the language they accept. |
| */ |
| public enum AUTOMATON_TYPE { |
| /** Automaton that accepts no strings. */ |
| NONE, |
| /** Automaton that accepts all possible strings. */ |
| ALL, |
| /** Automaton that accepts only a single fixed string. */ |
| SINGLE, |
| /** Catch-all for any other automata. */ |
| NORMAL |
| }; |
| |
| /** If simplify is true this will be the "simplified" type; else, this is NORMAL */ |
| public final AUTOMATON_TYPE type; |
| |
| /** |
| * For {@link AUTOMATON_TYPE#SINGLE} this is the singleton term. |
| */ |
| public final BytesRef term; |
| |
| /** |
| * Matcher for quickly determining if a byte[] is accepted. |
| * only valid for {@link AUTOMATON_TYPE#NORMAL}. |
| */ |
| public final ByteRunAutomaton runAutomaton; |
| |
| /** |
| * Two dimensional array of transitions, indexed by state |
| * number for traversal. The state numbering is consistent with |
| * {@link #runAutomaton}. |
| * Only valid for {@link AUTOMATON_TYPE#NORMAL}. |
| */ |
| public final Automaton automaton; |
| |
| /** |
| * Shared common suffix accepted by the automaton. Only valid |
| * for {@link AUTOMATON_TYPE#NORMAL}, and only when the |
| * automaton accepts an infinite language. This will be null |
| * if the common prefix is length 0. |
| */ |
| public final BytesRef commonSuffixRef; |
| |
| /** |
| * Indicates if the automaton accepts a finite set of strings. |
| * Null if this was not computed. |
| * Only valid for {@link AUTOMATON_TYPE#NORMAL}. |
| */ |
| public final Boolean finite; |
| |
| /** Which state, if any, accepts all suffixes, else -1. */ |
| public final int sinkState; |
| |
| /** Create this, passing simplify=true and finite=null, so that we try |
| * to simplify the automaton and determine if it is finite. */ |
| public CompiledAutomaton(Automaton automaton) { |
| this(automaton, null, true); |
| } |
| |
| /** Returns sink state, if present, else -1. */ |
| private static int findSinkState(Automaton automaton) { |
| int numStates = automaton.getNumStates(); |
| Transition t = new Transition(); |
| int foundState = -1; |
| for (int s=0;s<numStates;s++) { |
| if (automaton.isAccept(s)) { |
| int count = automaton.initTransition(s, t); |
| boolean isSinkState = false; |
| for(int i=0;i<count;i++) { |
| automaton.getNextTransition(t); |
| if (t.dest == s && t.min == 0 && t.max == 0xff) { |
| isSinkState = true; |
| break; |
| } |
| } |
| if (isSinkState) { |
| foundState = s; |
| break; |
| } |
| } |
| } |
| |
| return foundState; |
| } |
| |
| /** Create this. If finite is null, we use {@link Operations#isFinite} |
| * to determine whether it is finite. If simplify is true, we run |
| * possibly expensive operations to determine if the automaton is one |
| * the cases in {@link CompiledAutomaton.AUTOMATON_TYPE}. */ |
| public CompiledAutomaton(Automaton automaton, Boolean finite, boolean simplify) { |
| this(automaton, finite, simplify, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT, false); |
| } |
| |
| |
| /** Create this. If finite is null, we use {@link Operations#isFinite} |
| * to determine whether it is finite. If simplify is true, we run |
| * possibly expensive operations to determine if the automaton is one |
| * the cases in {@link CompiledAutomaton.AUTOMATON_TYPE}. If simplify |
| * requires determinizing the automaton then at most determinizeWorkLimit |
| * effort will be spent. Any more than that will cause a |
| * TooComplexToDeterminizeException. |
| */ |
| public CompiledAutomaton(Automaton automaton, Boolean finite, boolean simplify, |
| int determinizeWorkLimit, boolean isBinary) { |
| if (automaton.getNumStates() == 0) { |
| automaton = new Automaton(); |
| automaton.createState(); |
| } |
| |
| if (simplify) { |
| |
| // Test whether the automaton is a "simple" form and |
| // if so, don't create a runAutomaton. Note that on a |
| // large automaton these tests could be costly: |
| |
| if (Operations.isEmpty(automaton)) { |
| // matches nothing |
| type = AUTOMATON_TYPE.NONE; |
| term = null; |
| commonSuffixRef = null; |
| runAutomaton = null; |
| this.automaton = null; |
| this.finite = null; |
| sinkState = -1; |
| return; |
| } |
| |
| boolean isTotal; |
| |
| // NOTE: only approximate, because automaton may not be minimal: |
| if (isBinary) { |
| isTotal = Operations.isTotal(automaton, 0, 0xff); |
| } else { |
| isTotal = Operations.isTotal(automaton); |
| } |
| |
| if (isTotal) { |
| // matches all possible strings |
| type = AUTOMATON_TYPE.ALL; |
| term = null; |
| commonSuffixRef = null; |
| runAutomaton = null; |
| this.automaton = null; |
| this.finite = null; |
| sinkState = -1; |
| return; |
| } |
| |
| automaton = Operations.determinize(automaton, determinizeWorkLimit); |
| |
| IntsRef singleton = Operations.getSingleton(automaton); |
| |
| if (singleton != null) { |
| // matches a fixed string |
| type = AUTOMATON_TYPE.SINGLE; |
| commonSuffixRef = null; |
| runAutomaton = null; |
| this.automaton = null; |
| this.finite = null; |
| |
| if (isBinary) { |
| term = StringHelper.intsRefToBytesRef(singleton); |
| } else { |
| term = new BytesRef(UnicodeUtil.newString(singleton.ints, singleton.offset, singleton.length)); |
| } |
| sinkState = -1; |
| return; |
| } |
| } |
| |
| type = AUTOMATON_TYPE.NORMAL; |
| term = null; |
| |
| if (finite == null) { |
| this.finite = Operations.isFinite(automaton); |
| } else { |
| this.finite = finite; |
| } |
| |
| Automaton binary; |
| if (isBinary) { |
| // Caller already built binary automaton themselves, e.g. PrefixQuery |
| // does this since it can be provided with a binary (not necessarily |
| // UTF8!) term: |
| binary = automaton; |
| } else { |
| // Incoming automaton is unicode, and we must convert to UTF8 to match what's in the index: |
| binary = new UTF32ToUTF8().convert(automaton); |
| } |
| |
| // compute a common suffix for infinite DFAs, this is an optimization for "leading wildcard" |
| // so don't burn cycles on it if the DFA is finite, or largeish |
| if (this.finite || automaton.getNumStates() + automaton.getNumTransitions() > 1000) { |
| commonSuffixRef = null; |
| } else { |
| BytesRef suffix = Operations.getCommonSuffixBytesRef(binary); |
| if (suffix.length == 0) { |
| commonSuffixRef = null; |
| } else { |
| commonSuffixRef = suffix; |
| } |
| } |
| |
| // This will determinize the binary automaton for us: |
| runAutomaton = new ByteRunAutomaton(binary, true, determinizeWorkLimit); |
| |
| this.automaton = runAutomaton.automaton; |
| |
| // TODO: this is a bit fragile because if the automaton is not minimized there could be more than 1 sink state but auto-prefix will fail |
| // to run for those: |
| sinkState = findSinkState(this.automaton); |
| } |
| |
| private Transition transition = new Transition(); |
| |
| //private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG; |
| |
| private BytesRef addTail(int state, BytesRefBuilder term, int idx, int leadLabel) { |
| //System.out.println("addTail state=" + state + " term=" + term.utf8ToString() + " idx=" + idx + " leadLabel=" + (char) leadLabel); |
| //System.out.println(automaton.toDot()); |
| // Find biggest transition that's < label |
| // TODO: use binary search here |
| int maxIndex = -1; |
| int numTransitions = automaton.initTransition(state, transition); |
| for(int i=0;i<numTransitions;i++) { |
| automaton.getNextTransition(transition); |
| if (transition.min < leadLabel) { |
| maxIndex = i; |
| } else { |
| // Transitions are always sorted |
| break; |
| } |
| } |
| |
| //System.out.println(" maxIndex=" + maxIndex); |
| |
| assert maxIndex != -1; |
| automaton.getTransition(state, maxIndex, transition); |
| |
| // Append floorLabel |
| final int floorLabel; |
| if (transition.max > leadLabel-1) { |
| floorLabel = leadLabel-1; |
| } else { |
| floorLabel = transition.max; |
| } |
| //System.out.println(" floorLabel=" + (char) floorLabel); |
| term.grow(1+idx); |
| //if (DEBUG) System.out.println(" add floorLabel=" + (char) floorLabel + " idx=" + idx); |
| term.setByteAt(idx, (byte) floorLabel); |
| |
| state = transition.dest; |
| //System.out.println(" dest: " + state); |
| idx++; |
| |
| // Push down to last accept state |
| while (true) { |
| numTransitions = automaton.getNumTransitions(state); |
| if (numTransitions == 0) { |
| //System.out.println("state=" + state + " 0 trans"); |
| assert runAutomaton.isAccept(state); |
| term.setLength(idx); |
| //if (DEBUG) System.out.println(" return " + term.utf8ToString()); |
| return term.get(); |
| } else { |
| // We are pushing "top" -- so get last label of |
| // last transition: |
| //System.out.println("get state=" + state + " numTrans=" + numTransitions); |
| automaton.getTransition(state, numTransitions-1, transition); |
| term.grow(1+idx); |
| //if (DEBUG) System.out.println(" push maxLabel=" + (char) lastTransition.max + " idx=" + idx); |
| //System.out.println(" add trans dest=" + scratch.dest + " label=" + (char) scratch.max); |
| term.setByteAt(idx, (byte) transition.max); |
| state = transition.dest; |
| idx++; |
| } |
| } |
| } |
| |
| // TODO: should this take startTerm too? This way |
| // Terms.intersect could forward to this method if type != |
| // NORMAL: |
| /** Return a {@link TermsEnum} intersecting the provided {@link Terms} |
| * with the terms accepted by this automaton. */ |
| public TermsEnum getTermsEnum(Terms terms) throws IOException { |
| switch(type) { |
| case NONE: |
| return TermsEnum.EMPTY; |
| case ALL: |
| return terms.iterator(); |
| case SINGLE: |
| return new SingleTermsEnum(terms.iterator(), term); |
| case NORMAL: |
| return terms.intersect(this, null); |
| default: |
| // unreachable |
| throw new RuntimeException("unhandled case"); |
| } |
| } |
| |
| /** |
| * Report back to a QueryVisitor how this automaton matches terms |
| */ |
| public void visit(QueryVisitor visitor, Query parent, String field) { |
| if (visitor.acceptField(field)) { |
| switch (type) { |
| case NORMAL: |
| visitor.consumeTermsMatching(parent, field, () -> runAutomaton); |
| break; |
| case NONE: |
| break; |
| case ALL: |
| visitor.consumeTermsMatching(parent, field, () -> new ByteRunAutomaton(Automata.makeAnyString())); |
| break; |
| case SINGLE: |
| visitor.consumeTerms(parent, new Term(field, term)); |
| break; |
| } |
| } |
| } |
| |
| /** Finds largest term accepted by this Automaton, that's |
| * <= the provided input term. The result is placed in |
| * output; it's fine for output and input to point to |
| * the same bytes. The returned result is either the |
| * provided output, or null if there is no floor term |
| * (ie, the provided input term is before the first term |
| * accepted by this Automaton). */ |
| public BytesRef floor(BytesRef input, BytesRefBuilder output) { |
| |
| //if (DEBUG) System.out.println("CA.floor input=" + input.utf8ToString()); |
| |
| int state = 0; |
| |
| // Special case empty string: |
| if (input.length == 0) { |
| if (runAutomaton.isAccept(state)) { |
| output.clear(); |
| return output.get(); |
| } else { |
| return null; |
| } |
| } |
| |
| final List<Integer> stack = new ArrayList<>(); |
| |
| int idx = 0; |
| while (true) { |
| int label = input.bytes[input.offset + idx] & 0xff; |
| int nextState = runAutomaton.step(state, label); |
| //if (DEBUG) System.out.println(" cycle label=" + (char) label + " nextState=" + nextState); |
| |
| if (idx == input.length-1) { |
| if (nextState != -1 && runAutomaton.isAccept(nextState)) { |
| // Input string is accepted |
| output.grow(1+idx); |
| output.setByteAt(idx, (byte) label); |
| output.setLength(input.length); |
| //if (DEBUG) System.out.println(" input is accepted; return term=" + output.utf8ToString()); |
| return output.get(); |
| } else { |
| nextState = -1; |
| } |
| } |
| |
| if (nextState == -1) { |
| |
| // Pop back to a state that has a transition |
| // <= our label: |
| while (true) { |
| int numTransitions = automaton.getNumTransitions(state); |
| if (numTransitions == 0) { |
| assert runAutomaton.isAccept(state); |
| output.setLength(idx); |
| //if (DEBUG) System.out.println(" return " + output.utf8ToString()); |
| return output.get(); |
| } else { |
| automaton.getTransition(state, 0, transition); |
| |
| if (label-1 < transition.min) { |
| |
| if (runAutomaton.isAccept(state)) { |
| output.setLength(idx); |
| //if (DEBUG) System.out.println(" return " + output.utf8ToString()); |
| return output.get(); |
| } |
| // pop |
| if (stack.size() == 0) { |
| //if (DEBUG) System.out.println(" pop ord=" + idx + " return null"); |
| return null; |
| } else { |
| state = stack.remove(stack.size()-1); |
| idx--; |
| //if (DEBUG) System.out.println(" pop ord=" + (idx+1) + " label=" + (char) label + " first trans.min=" + (char) transitions[0].min); |
| label = input.bytes[input.offset + idx] & 0xff; |
| } |
| } else { |
| //if (DEBUG) System.out.println(" stop pop ord=" + idx + " first trans.min=" + (char) transitions[0].min); |
| break; |
| } |
| } |
| } |
| |
| //if (DEBUG) System.out.println(" label=" + (char) label + " idx=" + idx); |
| |
| return addTail(state, output, idx, label); |
| |
| } else { |
| output.grow(1+idx); |
| output.setByteAt(idx, (byte) label); |
| stack.add(state); |
| state = nextState; |
| idx++; |
| } |
| } |
| } |
| |
| @Override |
| public int hashCode() { |
| final int prime = 31; |
| int result = 1; |
| result = prime * result + ((runAutomaton == null) ? 0 : runAutomaton.hashCode()); |
| result = prime * result + ((term == null) ? 0 : term.hashCode()); |
| result = prime * result + ((type == null) ? 0 : type.hashCode()); |
| return result; |
| } |
| |
| @Override |
| public boolean equals(Object obj) { |
| if (this == obj) return true; |
| if (obj == null) return false; |
| if (getClass() != obj.getClass()) return false; |
| CompiledAutomaton other = (CompiledAutomaton) obj; |
| if (type != other.type) return false; |
| if (type == AUTOMATON_TYPE.SINGLE) { |
| if (!term.equals(other.term)) return false; |
| } else if (type == AUTOMATON_TYPE.NORMAL) { |
| if (!runAutomaton.equals(other.runAutomaton)) return false; |
| } |
| |
| return true; |
| } |
| |
| @Override |
| public long ramBytesUsed() { |
| return BASE_RAM_BYTES + |
| RamUsageEstimator.sizeOfObject(automaton) + |
| RamUsageEstimator.sizeOfObject(commonSuffixRef) + |
| RamUsageEstimator.sizeOfObject(runAutomaton) + |
| RamUsageEstimator.sizeOfObject(term) + |
| RamUsageEstimator.sizeOfObject(transition); |
| } |
| |
| } |