| package org.apache.lucene.index; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import java.io.IOException; |
| |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.IntsRef; |
| import org.apache.lucene.util.StringHelper; |
| import org.apache.lucene.util.automaton.ByteRunAutomaton; |
| import org.apache.lucene.util.automaton.CompiledAutomaton; |
| import org.apache.lucene.util.automaton.Transition; |
| |
| /** |
| * A FilteredTermsEnum that enumerates terms based upon what is accepted by a |
| * DFA. |
| * <p> |
| * The algorithm is such: |
| * <ol> |
| * <li>As long as matches are successful, keep reading sequentially. |
| * <li>When a match fails, skip to the next string in lexicographic order that |
| * does not enter a reject state. |
| * </ol> |
| * <p> |
| * The algorithm does not attempt to actually skip to the next string that is |
| * completely accepted. This is not possible when the language accepted by the |
| * FSM is not finite (i.e. * operator). |
| * </p> |
| * @lucene.experimental |
| */ |
| class AutomatonTermsEnum extends FilteredTermsEnum { |
| // a tableized array-based form of the DFA |
| private final ByteRunAutomaton runAutomaton; |
| // common suffix of the automaton |
| private final BytesRef commonSuffixRef; |
| // true if the automaton accepts a finite language |
| private final boolean finite; |
| // array of sorted transitions for each state, indexed by state number |
| private final Transition[][] allTransitions; |
| // for path tracking: each long records gen when we last |
| // visited the state; we use gens to avoid having to clear |
| private final long[] visited; |
| private long curGen; |
| // the reference used for seeking forwards through the term dictionary |
| private final BytesRef seekBytesRef = new BytesRef(10); |
| // true if we are enumerating an infinite portion of the DFA. |
| // in this case it is faster to drive the query based on the terms dictionary. |
| // when this is true, linearUpperBound indicate the end of range |
| // of terms where we should simply do sequential reads instead. |
| private boolean linear = false; |
| private final BytesRef linearUpperBound = new BytesRef(10); |
| |
| /** |
| * Construct an enumerator based upon an automaton, enumerating the specified |
| * field, working on a supplied TermsEnum |
| * <p> |
| * @lucene.experimental |
| * <p> |
| * @param compiled CompiledAutomaton |
| */ |
| public AutomatonTermsEnum(TermsEnum tenum, CompiledAutomaton compiled) { |
| super(tenum); |
| this.finite = compiled.finite; |
| this.runAutomaton = compiled.runAutomaton; |
| assert this.runAutomaton != null; |
| this.commonSuffixRef = compiled.commonSuffixRef; |
| this.allTransitions = compiled.sortedTransitions; |
| |
| // used for path tracking, where each bit is a numbered state. |
| visited = new long[runAutomaton.getSize()]; |
| } |
| |
| /** |
| * Returns true if the term matches the automaton. Also stashes away the term |
| * to assist with smart enumeration. |
| */ |
| @Override |
| protected AcceptStatus accept(final BytesRef term) { |
| if (commonSuffixRef == null || StringHelper.endsWith(term, commonSuffixRef)) { |
| if (runAutomaton.run(term.bytes, term.offset, term.length)) |
| return linear ? AcceptStatus.YES : AcceptStatus.YES_AND_SEEK; |
| else |
| return (linear && term.compareTo(linearUpperBound) < 0) ? |
| AcceptStatus.NO : AcceptStatus.NO_AND_SEEK; |
| } else { |
| return (linear && term.compareTo(linearUpperBound) < 0) ? |
| AcceptStatus.NO : AcceptStatus.NO_AND_SEEK; |
| } |
| } |
| |
| @Override |
| protected BytesRef nextSeekTerm(final BytesRef term) throws IOException { |
| //System.out.println("ATE.nextSeekTerm term=" + term); |
| if (term == null) { |
| assert seekBytesRef.length == 0; |
| // return the empty term, as its valid |
| if (runAutomaton.isAccept(runAutomaton.getInitialState())) { |
| return seekBytesRef; |
| } |
| } else { |
| seekBytesRef.copyBytes(term); |
| } |
| |
| // seek to the next possible string; |
| if (nextString()) { |
| return seekBytesRef; // reposition |
| } else { |
| return null; // no more possible strings can match |
| } |
| } |
| |
| /** |
| * Sets the enum to operate in linear fashion, as we have found |
| * a looping transition at position: we set an upper bound and |
| * act like a TermRangeQuery for this portion of the term space. |
| */ |
| private void setLinear(int position) { |
| assert linear == false; |
| |
| int state = runAutomaton.getInitialState(); |
| int maxInterval = 0xff; |
| for (int i = 0; i < position; i++) { |
| state = runAutomaton.step(state, seekBytesRef.bytes[i] & 0xff); |
| assert state >= 0: "state=" + state; |
| } |
| for (int i = 0; i < allTransitions[state].length; i++) { |
| Transition t = allTransitions[state][i]; |
| if (t.getMin() <= (seekBytesRef.bytes[position] & 0xff) && |
| (seekBytesRef.bytes[position] & 0xff) <= t.getMax()) { |
| maxInterval = t.getMax(); |
| break; |
| } |
| } |
| // 0xff terms don't get the optimization... not worth the trouble. |
| if (maxInterval != 0xff) |
| maxInterval++; |
| int length = position + 1; /* position + maxTransition */ |
| if (linearUpperBound.bytes.length < length) |
| linearUpperBound.bytes = new byte[length]; |
| System.arraycopy(seekBytesRef.bytes, 0, linearUpperBound.bytes, 0, position); |
| linearUpperBound.bytes[position] = (byte) maxInterval; |
| linearUpperBound.length = length; |
| |
| linear = true; |
| } |
| |
| private final IntsRef savedStates = new IntsRef(10); |
| |
| /** |
| * Increments the byte buffer to the next String in binary order after s that will not put |
| * the machine into a reject state. If such a string does not exist, returns |
| * false. |
| * |
| * The correctness of this method depends upon the automaton being deterministic, |
| * and having no transitions to dead states. |
| * |
| * @return true if more possible solutions exist for the DFA |
| */ |
| private boolean nextString() { |
| int state; |
| int pos = 0; |
| savedStates.grow(seekBytesRef.length+1); |
| final int[] states = savedStates.ints; |
| states[0] = runAutomaton.getInitialState(); |
| |
| while (true) { |
| curGen++; |
| linear = false; |
| // walk the automaton until a character is rejected. |
| for (state = states[pos]; pos < seekBytesRef.length; pos++) { |
| visited[state] = curGen; |
| int nextState = runAutomaton.step(state, seekBytesRef.bytes[pos] & 0xff); |
| if (nextState == -1) |
| break; |
| states[pos+1] = nextState; |
| // we found a loop, record it for faster enumeration |
| if (!finite && !linear && visited[nextState] == curGen) { |
| setLinear(pos); |
| } |
| state = nextState; |
| } |
| |
| // take the useful portion, and the last non-reject state, and attempt to |
| // append characters that will match. |
| if (nextString(state, pos)) { |
| return true; |
| } else { /* no more solutions exist from this useful portion, backtrack */ |
| if ((pos = backtrack(pos)) < 0) /* no more solutions at all */ |
| return false; |
| final int newState = runAutomaton.step(states[pos], seekBytesRef.bytes[pos] & 0xff); |
| if (newState >= 0 && runAutomaton.isAccept(newState)) |
| /* String is good to go as-is */ |
| return true; |
| /* else advance further */ |
| // TODO: paranoia? if we backtrack thru an infinite DFA, the loop detection is important! |
| // for now, restart from scratch for all infinite DFAs |
| if (!finite) pos = 0; |
| } |
| } |
| } |
| |
| /** |
| * Returns the next String in lexicographic order that will not put |
| * the machine into a reject state. |
| * |
| * This method traverses the DFA from the given position in the String, |
| * starting at the given state. |
| * |
| * If this cannot satisfy the machine, returns false. This method will |
| * walk the minimal path, in lexicographic order, as long as possible. |
| * |
| * If this method returns false, then there might still be more solutions, |
| * it is necessary to backtrack to find out. |
| * |
| * @param state current non-reject state |
| * @param position useful portion of the string |
| * @return true if more possible solutions exist for the DFA from this |
| * position |
| */ |
| private boolean nextString(int state, int position) { |
| /* |
| * the next lexicographic character must be greater than the existing |
| * character, if it exists. |
| */ |
| int c = 0; |
| if (position < seekBytesRef.length) { |
| c = seekBytesRef.bytes[position] & 0xff; |
| // if the next byte is 0xff and is not part of the useful portion, |
| // then by definition it puts us in a reject state, and therefore this |
| // path is dead. there cannot be any higher transitions. backtrack. |
| if (c++ == 0xff) |
| return false; |
| } |
| |
| seekBytesRef.length = position; |
| visited[state] = curGen; |
| |
| Transition transitions[] = allTransitions[state]; |
| |
| // find the minimal path (lexicographic order) that is >= c |
| |
| for (int i = 0; i < transitions.length; i++) { |
| Transition transition = transitions[i]; |
| if (transition.getMax() >= c) { |
| int nextChar = Math.max(c, transition.getMin()); |
| // append either the next sequential char, or the minimum transition |
| seekBytesRef.grow(seekBytesRef.length + 1); |
| seekBytesRef.length++; |
| seekBytesRef.bytes[seekBytesRef.length - 1] = (byte) nextChar; |
| state = transition.getDest().getNumber(); |
| /* |
| * as long as is possible, continue down the minimal path in |
| * lexicographic order. if a loop or accept state is encountered, stop. |
| */ |
| while (visited[state] != curGen && !runAutomaton.isAccept(state)) { |
| visited[state] = curGen; |
| /* |
| * Note: we work with a DFA with no transitions to dead states. |
| * so the below is ok, if it is not an accept state, |
| * then there MUST be at least one transition. |
| */ |
| transition = allTransitions[state][0]; |
| state = transition.getDest().getNumber(); |
| |
| // append the minimum transition |
| seekBytesRef.grow(seekBytesRef.length + 1); |
| seekBytesRef.length++; |
| seekBytesRef.bytes[seekBytesRef.length - 1] = (byte) transition.getMin(); |
| |
| // we found a loop, record it for faster enumeration |
| if (!finite && !linear && visited[state] == curGen) { |
| setLinear(seekBytesRef.length-1); |
| } |
| } |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| /** |
| * Attempts to backtrack thru the string after encountering a dead end |
| * at some given position. Returns false if no more possible strings |
| * can match. |
| * |
| * @param position current position in the input String |
| * @return position >=0 if more possible solutions exist for the DFA |
| */ |
| private int backtrack(int position) { |
| while (position-- > 0) { |
| int nextChar = seekBytesRef.bytes[position] & 0xff; |
| // if a character is 0xff its a dead-end too, |
| // because there is no higher character in binary sort order. |
| if (nextChar++ != 0xff) { |
| seekBytesRef.bytes[position] = (byte) nextChar; |
| seekBytesRef.length = position+1; |
| return position; |
| } |
| } |
| return -1; /* all solutions exhausted */ |
| } |
| } |