| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.lucene.codecs.uniformsplit; |
| |
| import java.io.IOException; |
| import java.util.Arrays; |
| import org.apache.lucene.codecs.PostingsReaderBase; |
| import org.apache.lucene.index.TermState; |
| import org.apache.lucene.index.TermsEnum; |
| import org.apache.lucene.store.IndexInput; |
| import org.apache.lucene.util.ArrayUtil; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.BytesRefBuilder; |
| import org.apache.lucene.util.IntsRefBuilder; |
| import org.apache.lucene.util.automaton.Automaton; |
| import org.apache.lucene.util.automaton.ByteRunAutomaton; |
| import org.apache.lucene.util.automaton.CompiledAutomaton; |
| import org.apache.lucene.util.automaton.Transition; |
| |
| /** |
| * The "intersect" {@link TermsEnum} response to {@link |
| * UniformSplitTerms#intersect(CompiledAutomaton, BytesRef)}, intersecting the terms with an |
| * automaton. |
| * |
| * <p>By design of the UniformSplit block keys, it is less efficient than {@code |
| * org.apache.lucene.backward_codecs.lucene40.blocktree.IntersectTermsEnum} for {@link |
| * org.apache.lucene.search.FuzzyQuery} (-37%). It is slightly slower for {@link |
| * org.apache.lucene.search.WildcardQuery} (-5%) and slightly faster for {@link |
| * org.apache.lucene.search.PrefixQuery} (+5%). |
| * |
| * @lucene.experimental |
| */ |
| public class IntersectBlockReader extends BlockReader { |
| |
| /** |
| * Block iteration order. Whether to move next block, jump to a block away, or end the iteration. |
| */ |
| protected enum BlockIteration { |
| NEXT, |
| SEEK, |
| END |
| } |
| |
| /** |
| * Threshold that controls when to attempt to jump to a block away. |
| * |
| * <p>This counter is 0 when entering a block. It is incremented each time a term is rejected by |
| * the automaton. When the counter is greater than or equal to this threshold, then we compute the |
| * next term accepted by the automaton, with {@link AutomatonNextTermCalculator}, and we jump to a |
| * block away if the next term accepted is greater than the immediate next term in the block. |
| * |
| * <p>A low value, for example 1, improves the performance of automatons requiring many jumps, for |
| * example {@link org.apache.lucene.search.FuzzyQuery} and most {@link |
| * org.apache.lucene.search.WildcardQuery}. A higher value improves the performance of automatons |
| * with less or no jump, for example {@link org.apache.lucene.search.PrefixQuery}. A threshold of |
| * 4 seems to be a good balance. |
| */ |
| protected final int NUM_CONSECUTIVELY_REJECTED_TERMS_THRESHOLD = 4; |
| |
| protected final Automaton automaton; |
| protected final ByteRunAutomaton runAutomaton; |
| protected final boolean finite; |
| protected final BytesRef commonSuffix; // maybe null |
| protected final int minTermLength; |
| protected final AutomatonNextTermCalculator nextStringCalculator; |
| |
| /** Set this when our current mode is seeking to this term. Set to null after. */ |
| protected BytesRef seekTerm; |
| /** Number of bytes accepted by the automaton when validating the current term. */ |
| protected int numMatchedBytes; |
| /** |
| * Automaton states reached when validating the current term, from 0 to {@link #numMatchedBytes} - |
| * 1. |
| */ |
| protected int[] states; |
| /** Block iteration order determined when scanning the terms in the current block. */ |
| protected BlockIteration blockIteration; |
| /** |
| * Counter of the number of consecutively rejected terms. Depending on {@link |
| * #NUM_CONSECUTIVELY_REJECTED_TERMS_THRESHOLD}, this may trigger a jump to a block away. |
| */ |
| protected int numConsecutivelyRejectedTerms; |
| |
| protected IntersectBlockReader( |
| CompiledAutomaton compiled, |
| BytesRef startTerm, |
| IndexDictionary.BrowserSupplier dictionaryBrowserSupplier, |
| IndexInput blockInput, |
| PostingsReaderBase postingsReader, |
| FieldMetadata fieldMetadata, |
| BlockDecoder blockDecoder) |
| throws IOException { |
| super(dictionaryBrowserSupplier, blockInput, postingsReader, fieldMetadata, blockDecoder); |
| automaton = compiled.automaton; |
| runAutomaton = compiled.runAutomaton; |
| finite = compiled.finite; |
| commonSuffix = compiled.commonSuffixRef; |
| minTermLength = getMinTermLength(); |
| nextStringCalculator = new AutomatonNextTermCalculator(compiled); |
| seekTerm = startTerm; |
| } |
| |
| /** |
| * Computes the minimal length of the terms accepted by the automaton. This speeds up the term |
| * scanning for automatons accepting a finite language. |
| */ |
| protected int getMinTermLength() { |
| // Automatons accepting infinite language (e.g. PrefixQuery and WildcardQuery) do not benefit |
| // much from |
| // min term length while it takes time to compute it. More precisely, by skipping this |
| // computation PrefixQuery |
| // is significantly boosted while WildcardQuery might be slightly degraded on average. This min |
| // term length |
| // mainly boosts FuzzyQuery. |
| int commonSuffixLength = commonSuffix == null ? 0 : commonSuffix.length; |
| if (!finite) { |
| return commonSuffixLength; |
| } |
| // Since we are only dealing with finite language, there is no loop to detect. |
| int commonPrefixLength = 0; |
| int state = 0; |
| Transition t = null; |
| while (true) { |
| if (runAutomaton.isAccept(state)) { |
| // The common prefix reaches a final state. So common prefix and common suffix overlap. |
| // Min term length is the max between common prefix and common suffix lengths. |
| return Math.max(commonPrefixLength, commonSuffixLength); |
| } |
| if (automaton.getNumTransitions(state) == 1) { |
| if (t == null) { |
| t = new Transition(); |
| } |
| automaton.getTransition(state, 0, t); |
| if (t.min == t.max) { |
| state = t.dest; |
| commonPrefixLength++; |
| continue; |
| } |
| } |
| break; |
| } |
| // Min term length is the sum of common prefix and common suffix lengths. |
| return commonPrefixLength + commonSuffixLength; |
| } |
| |
| @Override |
| public BytesRef next() throws IOException { |
| if (blockHeader == null) { |
| if (!seekFirstBlock()) { |
| return null; |
| } |
| states = new int[32]; |
| blockIteration = BlockIteration.NEXT; |
| } |
| termState = null; |
| do { |
| BytesRef term = nextTermInBlockMatching(); |
| if (term != null) { |
| return term; |
| } |
| } while (nextBlock()); |
| return null; |
| } |
| |
| protected boolean seekFirstBlock() throws IOException { |
| seekTerm = nextStringCalculator.nextSeekTerm(seekTerm); |
| if (seekTerm == null) { |
| return false; |
| } |
| long blockStartFP = getOrCreateDictionaryBrowser().seekBlock(seekTerm); |
| if (blockStartFP == -1) { |
| blockStartFP = fieldMetadata.getFirstBlockStartFP(); |
| } else if (isBeyondLastTerm(seekTerm, blockStartFP)) { |
| return false; |
| } |
| initializeHeader(seekTerm, blockStartFP); |
| return blockHeader != null; |
| } |
| |
| /** |
| * Finds the next block line that matches (accepted by the automaton), or null when at end of |
| * block. |
| * |
| * @return The next term in the current block that is accepted by the automaton; or null if none. |
| */ |
| protected BytesRef nextTermInBlockMatching() throws IOException { |
| if (seekTerm == null) { |
| if (readLineInBlock() == null) { |
| return null; |
| } |
| } else { |
| SeekStatus seekStatus = seekInBlock(seekTerm); |
| seekTerm = null; |
| if (seekStatus == SeekStatus.END) { |
| return null; |
| } |
| assert numMatchedBytes == 0; |
| assert numConsecutivelyRejectedTerms == 0; |
| } |
| while (true) { |
| TermBytes lineTermBytes = blockLine.getTermBytes(); |
| BytesRef lineTerm = lineTermBytes.getTerm(); |
| assert lineTerm.offset == 0; |
| if (states.length <= lineTerm.length) { |
| states = ArrayUtil.growExact(states, ArrayUtil.oversize(lineTerm.length + 1, Byte.BYTES)); |
| } |
| // Since terms are delta encoded, we may start the automaton steps from the last state reached |
| // by the previous term. |
| int index = Math.min(lineTermBytes.getSuffixOffset(), numMatchedBytes); |
| // Skip this term early if it is shorter than the min term length, or if it does not end with |
| // the common suffix |
| // accepted by the automaton. |
| if (lineTerm.length >= minTermLength |
| && (commonSuffix == null || endsWithCommonSuffix(lineTerm.bytes, lineTerm.length))) { |
| int state = states[index]; |
| while (true) { |
| if (index == lineTerm.length) { |
| if (runAutomaton.isAccept(state)) { |
| // The automaton accepts the current term. Record the number of matched bytes and |
| // return the term. |
| assert runAutomaton.run(lineTerm.bytes, 0, lineTerm.length); |
| numMatchedBytes = index; |
| if (numConsecutivelyRejectedTerms > 0) { |
| numConsecutivelyRejectedTerms = 0; |
| } |
| assert blockIteration == BlockIteration.NEXT; |
| return lineTerm; |
| } |
| break; |
| } |
| state = runAutomaton.step(state, lineTerm.bytes[index] & 0xff); |
| if (state == -1) { |
| // The automaton rejects the current term. |
| break; |
| } |
| // Record the reached automaton state. |
| states[++index] = state; |
| } |
| } |
| // The current term is not accepted by the automaton. |
| // Still record the reached automaton state to start the next term steps from there. |
| assert !runAutomaton.run(lineTerm.bytes, 0, lineTerm.length); |
| numMatchedBytes = index; |
| // If the number of consecutively rejected terms reaches the threshold, |
| // then determine whether it is worthwhile to jump to a block away. |
| if (++numConsecutivelyRejectedTerms >= NUM_CONSECUTIVELY_REJECTED_TERMS_THRESHOLD |
| && lineIndexInBlock < blockHeader.getLinesCount() - 1 |
| && !nextStringCalculator.isLinearState(lineTerm)) { |
| // Compute the next term accepted by the automaton after the current term. |
| if ((seekTerm = nextStringCalculator.nextSeekTerm(lineTerm)) == null) { |
| blockIteration = BlockIteration.END; |
| return null; |
| } |
| // It is worthwhile to jump to a block away if the next term accepted is after the next term |
| // in the block. |
| // Actually the block away may be the current block, but this is a good heuristic. |
| readLineInBlock(); |
| if (seekTerm.compareTo(blockLine.getTermBytes().getTerm()) > 0) { |
| // Stop scanning this block terms and set the iteration order to jump to a block away by |
| // seeking seekTerm. |
| blockIteration = BlockIteration.SEEK; |
| return null; |
| } |
| seekTerm = null; |
| // If it is not worthwhile to jump to a block away, do not attempt anymore for the current |
| // block. |
| numConsecutivelyRejectedTerms = Integer.MIN_VALUE; |
| } else if (readLineInBlock() == null) { |
| // No more terms in the block. The iteration order is to open the very next block. |
| assert blockIteration == BlockIteration.NEXT; |
| return null; |
| } |
| } |
| } |
| |
| /** |
| * Indicates whether the given term ends with the automaton common suffix. This allows to quickly |
| * skip terms that the automaton would reject eventually. |
| */ |
| protected boolean endsWithCommonSuffix(byte[] termBytes, int termLength) { |
| byte[] suffixBytes = commonSuffix.bytes; |
| int suffixLength = commonSuffix.length; |
| int offset = termLength - suffixLength; |
| assert offset >= 0; // We already checked minTermLength. |
| for (int i = 0; i < suffixLength; i++) { |
| if (termBytes[offset + i] != suffixBytes[i]) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| /** |
| * Opens the next block. Depending on the {@link #blockIteration} order, it may be the very next |
| * block, or a block away that may contain {@link #seekTerm}. |
| * |
| * @return true if the next block is opened; false if there is no blocks anymore and the iteration |
| * is over. |
| */ |
| protected boolean nextBlock() throws IOException { |
| long blockStartFP; |
| switch (blockIteration) { |
| case NEXT: |
| assert seekTerm == null; |
| blockStartFP = blockInput.getFilePointer(); |
| break; |
| case SEEK: |
| assert seekTerm != null; |
| blockStartFP = getOrCreateDictionaryBrowser().seekBlock(seekTerm); |
| if (isBeyondLastTerm(seekTerm, blockStartFP)) { |
| return false; |
| } |
| blockIteration = BlockIteration.NEXT; |
| break; |
| case END: |
| return false; |
| default: |
| throw new UnsupportedOperationException( |
| "Unsupported " + BlockIteration.class.getSimpleName()); |
| } |
| numMatchedBytes = 0; |
| numConsecutivelyRejectedTerms = 0; |
| initializeHeader(seekTerm, blockStartFP); |
| return blockHeader != null; |
| } |
| |
| @Override |
| public boolean seekExact(BytesRef text) { |
| throw new UnsupportedOperationException(); |
| } |
| |
| @Override |
| public void seekExact(long ord) { |
| throw new UnsupportedOperationException(); |
| } |
| |
| @Override |
| public void seekExact(BytesRef term, TermState state) { |
| throw new UnsupportedOperationException(); |
| } |
| |
| @Override |
| public SeekStatus seekCeil(BytesRef text) { |
| throw new UnsupportedOperationException(); |
| } |
| |
| /** |
| * This is mostly a copy of AutomatonTermsEnum. Since it's an inner class, the outer class can |
| * call methods that ATE does not expose. It'd be nice if ATE's logic could be more extensible. |
| */ |
| protected class AutomatonNextTermCalculator { |
| // for path tracking: each short records gen when we last |
| // visited the state; we use gens to avoid having to clear |
| protected final short[] visited; |
| protected short curGen; |
| // the reference used for seeking forwards through the term dictionary |
| protected final BytesRefBuilder seekBytesRef = new BytesRefBuilder(); |
| // true if we are enumerating an infinite portion of the DFA. |
| // in this case it is faster to drive the query based on the terms dictionary. |
| // when this is true, linearUpperBound indicate the end of range |
| // of terms where we should simply do sequential reads instead. |
| protected boolean linear; |
| protected final BytesRef linearUpperBound = new BytesRef(); |
| protected final Transition transition = new Transition(); |
| protected final IntsRefBuilder savedStates = new IntsRefBuilder(); |
| |
| protected AutomatonNextTermCalculator(CompiledAutomaton compiled) { |
| visited = compiled.finite ? null : new short[runAutomaton.getSize()]; |
| } |
| |
| /** Records the given state has been visited. */ |
| protected void setVisited(int state) { |
| if (!finite) { |
| visited[state] = curGen; |
| } |
| } |
| |
| /** Indicates whether the given state has been visited. */ |
| protected boolean isVisited(int state) { |
| return !finite && visited[state] == curGen; |
| } |
| |
| /** True if the current state of the automata is best iterated linearly (without seeking). */ |
| protected boolean isLinearState(BytesRef term) { |
| return linear && term.compareTo(linearUpperBound) < 0; |
| } |
| |
| /** @see org.apache.lucene.index.FilteredTermsEnum#nextSeekTerm(BytesRef) */ |
| protected BytesRef nextSeekTerm(final BytesRef term) { |
| // System.out.println("ATE.nextSeekTerm term=" + term); |
| if (term == null) { |
| assert seekBytesRef.length() == 0; |
| // return the empty term, as it's valid |
| if (runAutomaton.isAccept(0)) { |
| return seekBytesRef.get(); |
| } |
| } else { |
| seekBytesRef.copyBytes(term); |
| } |
| |
| // seek to the next possible string; |
| if (nextString()) { |
| return seekBytesRef.get(); // reposition |
| } else { |
| return null; // no more possible strings can match |
| } |
| } |
| |
| /** |
| * Sets the enum to operate in linear fashion, as we have found a looping transition at |
| * position: we set an upper bound and act like a TermRangeQuery for this portion of the term |
| * space. |
| */ |
| protected void setLinear(int position) { |
| assert linear == false; |
| |
| int state = 0; |
| int maxInterval = 0xff; |
| // System.out.println("setLinear pos=" + position + " seekbytesRef=" + seekBytesRef); |
| for (int i = 0; i < position; i++) { |
| state = runAutomaton.step(state, seekBytesRef.byteAt(i) & 0xff); |
| assert state >= 0 : "state=" + state; |
| } |
| final int numTransitions = automaton.getNumTransitions(state); |
| automaton.initTransition(state, transition); |
| for (int i = 0; i < numTransitions; i++) { |
| automaton.getNextTransition(transition); |
| if (transition.min <= (seekBytesRef.byteAt(position) & 0xff) |
| && (seekBytesRef.byteAt(position) & 0xff) <= transition.max) { |
| maxInterval = transition.max; |
| break; |
| } |
| } |
| // 0xff terms don't get the optimization... not worth the trouble. |
| if (maxInterval != 0xff) maxInterval++; |
| int length = position + 1; /* position + maxTransition */ |
| if (linearUpperBound.bytes.length < length) { |
| linearUpperBound.bytes = new byte[ArrayUtil.oversize(length, Byte.BYTES)]; |
| } |
| System.arraycopy(seekBytesRef.bytes(), 0, linearUpperBound.bytes, 0, position); |
| linearUpperBound.bytes[position] = (byte) maxInterval; |
| linearUpperBound.length = length; |
| |
| linear = true; |
| } |
| |
| /** |
| * Increments the byte buffer to the next String in binary order after s that will not put the |
| * machine into a reject state. If such a string does not exist, returns false. |
| * |
| * <p>The correctness of this method depends upon the automaton being deterministic, and having |
| * no transitions to dead states. |
| * |
| * @return true if more possible solutions exist for the DFA |
| */ |
| protected boolean nextString() { |
| int state; |
| int pos = 0; |
| savedStates.grow(seekBytesRef.length() + 1); |
| savedStates.setIntAt(0, 0); |
| |
| while (true) { |
| if (!finite && ++curGen == 0) { |
| // Clear the visited states every time curGen wraps (so very infrequently to not impact |
| // average perf). |
| Arrays.fill(visited, (short) -1); |
| } |
| linear = false; |
| // walk the automaton until a character is rejected. |
| for (state = savedStates.intAt(pos); pos < seekBytesRef.length(); pos++) { |
| setVisited(state); |
| int nextState = runAutomaton.step(state, seekBytesRef.byteAt(pos) & 0xff); |
| if (nextState == -1) break; |
| savedStates.setIntAt(pos + 1, nextState); |
| // we found a loop, record it for faster enumeration |
| if (!linear && isVisited(nextState)) { |
| setLinear(pos); |
| } |
| state = nextState; |
| } |
| |
| // take the useful portion, and the last non-reject state, and attempt to |
| // append characters that will match. |
| if (nextString(state, pos)) { |
| return true; |
| } else { |
| /* no more solutions exist from this useful portion, backtrack */ |
| if ((pos = backtrack(pos)) < 0) /* no more solutions at all */ return false; |
| final int newState = |
| runAutomaton.step(savedStates.intAt(pos), seekBytesRef.byteAt(pos) & 0xff); |
| if (newState >= 0 && runAutomaton.isAccept(newState)) |
| /* String is good to go as-is */ |
| return true; |
| /* else advance further */ |
| // paranoia? if we backtrack thru an infinite DFA, the loop detection is important! |
| // for now, restart from scratch for all infinite DFAs |
| if (!finite) pos = 0; |
| } |
| } |
| } |
| |
| /** |
| * Returns the next String in lexicographic order that will not put the machine into a reject |
| * state. |
| * |
| * <p>This method traverses the DFA from the given position in the String, starting at the given |
| * state. |
| * |
| * <p>If this cannot satisfy the machine, returns false. This method will walk the minimal path, |
| * in lexicographic order, as long as possible. |
| * |
| * <p>If this method returns false, then there might still be more solutions, it is necessary to |
| * backtrack to find out. |
| * |
| * @param state current non-reject state |
| * @param position useful portion of the string |
| * @return true if more possible solutions exist for the DFA from this position |
| */ |
| protected boolean nextString(int state, int position) { |
| /* |
| * the next lexicographic character must be greater than the existing |
| * character, if it exists. |
| */ |
| int c = 0; |
| if (position < seekBytesRef.length()) { |
| c = seekBytesRef.byteAt(position) & 0xff; |
| // if the next byte is 0xff and is not part of the useful portion, |
| // then by definition it puts us in a reject state, and therefore this |
| // path is dead. there cannot be any higher transitions. backtrack. |
| if (c++ == 0xff) return false; |
| } |
| |
| seekBytesRef.setLength(position); |
| setVisited(state); |
| |
| final int numTransitions = automaton.getNumTransitions(state); |
| automaton.initTransition(state, transition); |
| // find the minimal path (lexicographic order) that is >= c |
| |
| for (int i = 0; i < numTransitions; i++) { |
| automaton.getNextTransition(transition); |
| if (transition.max >= c) { |
| int nextChar = Math.max(c, transition.min); |
| // append either the next sequential char, or the minimum transition |
| seekBytesRef.grow(seekBytesRef.length() + 1); |
| seekBytesRef.append((byte) nextChar); |
| state = transition.dest; |
| /* |
| * as long as is possible, continue down the minimal path in |
| * lexicographic order. if a loop or accept state is encountered, stop. |
| */ |
| while (!isVisited(state) && !runAutomaton.isAccept(state)) { |
| setVisited(state); |
| /* |
| * Note: we work with a DFA with no transitions to dead states. |
| * so the below is ok, if it is not an accept state, |
| * then there MUST be at least one transition. |
| */ |
| automaton.initTransition(state, transition); |
| automaton.getNextTransition(transition); |
| state = transition.dest; |
| |
| // append the minimum transition |
| seekBytesRef.grow(seekBytesRef.length() + 1); |
| seekBytesRef.append((byte) transition.min); |
| |
| // we found a loop, record it for faster enumeration |
| if (!linear && isVisited(state)) { |
| setLinear(seekBytesRef.length() - 1); |
| } |
| } |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| /** |
| * Attempts to backtrack thru the string after encountering a dead end at some given position. |
| * Returns false if no more possible strings can match. |
| * |
| * @param position current position in the input String |
| * @return {@code position >= 0} if more possible solutions exist for the DFA |
| */ |
| protected int backtrack(int position) { |
| while (position-- > 0) { |
| int nextChar = seekBytesRef.byteAt(position) & 0xff; |
| // if a character is 0xff it's a dead-end too, |
| // because there is no higher character in binary sort order. |
| if (nextChar++ != 0xff) { |
| seekBytesRef.setByteAt(position, (byte) nextChar); |
| seekBytesRef.setLength(position + 1); |
| return position; |
| } |
| } |
| return -1; /* all solutions exhausted */ |
| } |
| } |
| } |