| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.index; |
| |
| |
| import java.io.IOException; |
| import java.util.Arrays; |
| |
| import org.apache.lucene.util.ArrayUtil; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.BytesRefBuilder; |
| import org.apache.lucene.util.IntsRefBuilder; |
| import org.apache.lucene.util.StringHelper; |
| import org.apache.lucene.util.automaton.Automaton; |
| import org.apache.lucene.util.automaton.ByteRunAutomaton; |
| import org.apache.lucene.util.automaton.CompiledAutomaton; |
| import org.apache.lucene.util.automaton.Transition; |
| |
| /** |
| * A FilteredTermsEnum that enumerates terms based upon what is accepted by a |
| * DFA. |
| * <p> |
| * The algorithm is such: |
| * <ol> |
| * <li>As long as matches are successful, keep reading sequentially. |
| * <li>When a match fails, skip to the next string in lexicographic order that |
| * does not enter a reject state. |
| * </ol> |
| * <p> |
| * The algorithm does not attempt to actually skip to the next string that is |
| * completely accepted. This is not possible when the language accepted by the |
| * FSM is not finite (i.e. * operator). |
| * </p> |
| * @lucene.internal |
| */ |
| public class AutomatonTermsEnum extends FilteredTermsEnum { |
| // a tableized array-based form of the DFA |
| private final ByteRunAutomaton runAutomaton; |
| // common suffix of the automaton |
| private final BytesRef commonSuffixRef; |
| // true if the automaton accepts a finite language |
| private final boolean finite; |
| // array of sorted transitions for each state, indexed by state number |
| private final Automaton automaton; |
| // Used for visited state tracking: each short records gen when we last |
| // visited the state; we use gens to avoid having to clear |
| private final short[] visited; |
| private short curGen; |
| // the reference used for seeking forwards through the term dictionary |
| private final BytesRefBuilder seekBytesRef = new BytesRefBuilder(); |
| // true if we are enumerating an infinite portion of the DFA. |
| // in this case it is faster to drive the query based on the terms dictionary. |
| // when this is true, linearUpperBound indicate the end of range |
| // of terms where we should simply do sequential reads instead. |
| private boolean linear; |
| private final BytesRef linearUpperBound = new BytesRef(); |
| private final Transition transition = new Transition(); |
| private final IntsRefBuilder savedStates = new IntsRefBuilder(); |
| |
| /** |
| * Construct an enumerator based upon an automaton, enumerating the specified |
| * field, working on a supplied TermsEnum |
| * |
| * @lucene.experimental |
| * @param compiled CompiledAutomaton |
| */ |
| public AutomatonTermsEnum(TermsEnum tenum, CompiledAutomaton compiled) { |
| super(tenum); |
| if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { |
| throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead"); |
| } |
| this.finite = compiled.finite; |
| this.runAutomaton = compiled.runAutomaton; |
| assert this.runAutomaton != null; |
| this.commonSuffixRef = compiled.commonSuffixRef; |
| this.automaton = compiled.automaton; |
| |
| // No need to track visited states for a finite language without loops. |
| visited = finite ? null : new short[runAutomaton.getSize()]; |
| } |
| |
| /** |
| * Records the given state has been visited. |
| */ |
| private void setVisited(int state) { |
| if (!finite) { |
| visited[state] = curGen; |
| } |
| } |
| |
| /** |
| * Indicates whether the given state has been visited. |
| */ |
| private boolean isVisited(int state) { |
| return !finite && visited[state] == curGen; |
| } |
| |
| /** |
| * Returns true if the term matches the automaton. Also stashes away the term |
| * to assist with smart enumeration. |
| */ |
| @Override |
| protected AcceptStatus accept(final BytesRef term) { |
| if (commonSuffixRef == null || StringHelper.endsWith(term, commonSuffixRef)) { |
| if (runAutomaton.run(term.bytes, term.offset, term.length)) |
| return linear ? AcceptStatus.YES : AcceptStatus.YES_AND_SEEK; |
| else |
| return (linear && term.compareTo(linearUpperBound) < 0) ? |
| AcceptStatus.NO : AcceptStatus.NO_AND_SEEK; |
| } else { |
| return (linear && term.compareTo(linearUpperBound) < 0) ? |
| AcceptStatus.NO : AcceptStatus.NO_AND_SEEK; |
| } |
| } |
| |
| @Override |
| protected BytesRef nextSeekTerm(final BytesRef term) throws IOException { |
| //System.out.println("ATE.nextSeekTerm term=" + term); |
| if (term == null) { |
| assert seekBytesRef.length() == 0; |
| // return the empty term, as it's valid |
| if (runAutomaton.isAccept(0)) { |
| return seekBytesRef.get(); |
| } |
| } else { |
| seekBytesRef.copyBytes(term); |
| } |
| |
| // seek to the next possible string; |
| if (nextString()) { |
| return seekBytesRef.get(); // reposition |
| } else { |
| return null; // no more possible strings can match |
| } |
| } |
| |
| /** |
| * Sets the enum to operate in linear fashion, as we have found |
| * a looping transition at position: we set an upper bound and |
| * act like a TermRangeQuery for this portion of the term space. |
| */ |
| private void setLinear(int position) { |
| assert linear == false; |
| |
| int state = 0; |
| int maxInterval = 0xff; |
| //System.out.println("setLinear pos=" + position + " seekbytesRef=" + seekBytesRef); |
| for (int i = 0; i < position; i++) { |
| state = runAutomaton.step(state, seekBytesRef.byteAt(i) & 0xff); |
| assert state >= 0: "state=" + state; |
| } |
| final int numTransitions = automaton.getNumTransitions(state); |
| automaton.initTransition(state, transition); |
| for (int i = 0; i < numTransitions; i++) { |
| automaton.getNextTransition(transition); |
| if (transition.min <= (seekBytesRef.byteAt(position) & 0xff) && |
| (seekBytesRef.byteAt(position) & 0xff) <= transition.max) { |
| maxInterval = transition.max; |
| break; |
| } |
| } |
| // 0xff terms don't get the optimization... not worth the trouble. |
| if (maxInterval != 0xff) |
| maxInterval++; |
| int length = position + 1; /* position + maxTransition */ |
| if (linearUpperBound.bytes.length < length) { |
| linearUpperBound.bytes = new byte[ArrayUtil.oversize(length, Byte.BYTES)]; |
| } |
| System.arraycopy(seekBytesRef.bytes(), 0, linearUpperBound.bytes, 0, position); |
| linearUpperBound.bytes[position] = (byte) maxInterval; |
| linearUpperBound.length = length; |
| |
| linear = true; |
| } |
| |
| /** |
| * Increments the byte buffer to the next String in binary order after s that will not put |
| * the machine into a reject state. If such a string does not exist, returns |
| * false. |
| * |
| * The correctness of this method depends upon the automaton being deterministic, |
| * and having no transitions to dead states. |
| * |
| * @return true if more possible solutions exist for the DFA |
| */ |
| private boolean nextString() { |
| int state; |
| int pos = 0; |
| savedStates.grow(seekBytesRef.length()+1); |
| savedStates.setIntAt(0, 0); |
| |
| while (true) { |
| if (!finite && ++curGen == 0) { |
| // Clear the visited states every time curGen wraps (so very infrequently to not impact average perf). |
| Arrays.fill(visited, (short) -1); |
| } |
| linear = false; |
| // walk the automaton until a character is rejected. |
| for (state = savedStates.intAt(pos); pos < seekBytesRef.length(); pos++) { |
| setVisited(state); |
| int nextState = runAutomaton.step(state, seekBytesRef.byteAt(pos) & 0xff); |
| if (nextState == -1) |
| break; |
| savedStates.setIntAt(pos+1, nextState); |
| // we found a loop, record it for faster enumeration |
| if (!linear && isVisited(nextState)) { |
| setLinear(pos); |
| } |
| state = nextState; |
| } |
| |
| // take the useful portion, and the last non-reject state, and attempt to |
| // append characters that will match. |
| if (nextString(state, pos)) { |
| return true; |
| } else { /* no more solutions exist from this useful portion, backtrack */ |
| if ((pos = backtrack(pos)) < 0) /* no more solutions at all */ |
| return false; |
| final int newState = runAutomaton.step(savedStates.intAt(pos), seekBytesRef.byteAt(pos) & 0xff); |
| if (newState >= 0 && runAutomaton.isAccept(newState)) |
| /* String is good to go as-is */ |
| return true; |
| /* else advance further */ |
| // TODO: paranoia? if we backtrack thru an infinite DFA, the loop detection is important! |
| // for now, restart from scratch for all infinite DFAs |
| if (!finite) pos = 0; |
| } |
| } |
| } |
| |
| /** |
| * Returns the next String in lexicographic order that will not put |
| * the machine into a reject state. |
| * |
| * This method traverses the DFA from the given position in the String, |
| * starting at the given state. |
| * |
| * If this cannot satisfy the machine, returns false. This method will |
| * walk the minimal path, in lexicographic order, as long as possible. |
| * |
| * If this method returns false, then there might still be more solutions, |
| * it is necessary to backtrack to find out. |
| * |
| * @param state current non-reject state |
| * @param position useful portion of the string |
| * @return true if more possible solutions exist for the DFA from this |
| * position |
| */ |
| private boolean nextString(int state, int position) { |
| /* |
| * the next lexicographic character must be greater than the existing |
| * character, if it exists. |
| */ |
| int c = 0; |
| if (position < seekBytesRef.length()) { |
| c = seekBytesRef.byteAt(position) & 0xff; |
| // if the next byte is 0xff and is not part of the useful portion, |
| // then by definition it puts us in a reject state, and therefore this |
| // path is dead. there cannot be any higher transitions. backtrack. |
| if (c++ == 0xff) |
| return false; |
| } |
| |
| seekBytesRef.setLength(position); |
| setVisited(state); |
| |
| final int numTransitions = automaton.getNumTransitions(state); |
| automaton.initTransition(state, transition); |
| // find the minimal path (lexicographic order) that is >= c |
| |
| for (int i = 0; i < numTransitions; i++) { |
| automaton.getNextTransition(transition); |
| if (transition.max >= c) { |
| int nextChar = Math.max(c, transition.min); |
| // append either the next sequential char, or the minimum transition |
| seekBytesRef.grow(seekBytesRef.length() + 1); |
| seekBytesRef.append((byte) nextChar); |
| state = transition.dest; |
| /* |
| * as long as is possible, continue down the minimal path in |
| * lexicographic order. if a loop or accept state is encountered, stop. |
| */ |
| while (!isVisited(state) && !runAutomaton.isAccept(state)) { |
| setVisited(state); |
| /* |
| * Note: we work with a DFA with no transitions to dead states. |
| * so the below is ok, if it is not an accept state, |
| * then there MUST be at least one transition. |
| */ |
| automaton.initTransition(state, transition); |
| automaton.getNextTransition(transition); |
| state = transition.dest; |
| |
| // append the minimum transition |
| seekBytesRef.grow(seekBytesRef.length() + 1); |
| seekBytesRef.append((byte) transition.min); |
| |
| // we found a loop, record it for faster enumeration |
| if (!linear && isVisited(state)) { |
| setLinear(seekBytesRef.length()-1); |
| } |
| } |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| /** |
| * Attempts to backtrack thru the string after encountering a dead end |
| * at some given position. Returns false if no more possible strings |
| * can match. |
| * |
| * @param position current position in the input String |
| * @return {@code position >= 0} if more possible solutions exist for the DFA |
| */ |
| private int backtrack(int position) { |
| while (position-- > 0) { |
| int nextChar = seekBytesRef.byteAt(position) & 0xff; |
| // if a character is 0xff it's a dead-end too, |
| // because there is no higher character in binary sort order. |
| if (nextChar++ != 0xff) { |
| seekBytesRef.setByteAt(position, (byte) nextChar); |
| seekBytesRef.setLength(position+1); |
| return position; |
| } |
| } |
| return -1; /* all solutions exhausted */ |
| } |
| } |