| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.codecs.blocktreeords; |
| |
| import java.io.IOException; |
| import org.apache.lucene.codecs.blocktreeords.FSTOrdsOutputs.Output; |
| import org.apache.lucene.index.BaseTermsEnum; |
| import org.apache.lucene.index.ImpactsEnum; |
| import org.apache.lucene.index.PostingsEnum; |
| import org.apache.lucene.index.TermState; |
| import org.apache.lucene.store.IndexInput; |
| import org.apache.lucene.util.ArrayUtil; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.RamUsageEstimator; |
| import org.apache.lucene.util.StringHelper; |
| import org.apache.lucene.util.automaton.CompiledAutomaton; |
| import org.apache.lucene.util.automaton.RunAutomaton; |
| import org.apache.lucene.util.fst.FST; |
| |
| // NOTE: cannot seek! |
| final class OrdsIntersectTermsEnum extends BaseTermsEnum { |
| final IndexInput in; |
| |
| private OrdsIntersectTermsEnumFrame[] stack; |
| |
| @SuppressWarnings({"rawtypes", "unchecked"}) |
| private FST.Arc<Output>[] arcs = new FST.Arc[5]; |
| |
| final RunAutomaton runAutomaton; |
| final CompiledAutomaton compiledAutomaton; |
| |
| private OrdsIntersectTermsEnumFrame currentFrame; |
| |
| private final BytesRef term = new BytesRef(); |
| |
| private final FST.BytesReader fstReader; |
| |
| final OrdsFieldReader fr; |
| |
| private BytesRef savedStartTerm; |
| |
| // TODO: in some cases we can filter by length? eg |
| // regexp foo*bar must be at least length 6 bytes |
| public OrdsIntersectTermsEnum(OrdsFieldReader fr, CompiledAutomaton compiled, BytesRef startTerm) |
| throws IOException { |
| // if (DEBUG) { |
| // System.out.println("\nintEnum.init seg=" + segment + " commonSuffix=" + |
| // brToString(compiled.commonSuffixRef)); |
| // } |
| this.fr = fr; |
| runAutomaton = compiled.runAutomaton; |
| compiledAutomaton = compiled; |
| in = fr.parent.in.clone(); |
| stack = new OrdsIntersectTermsEnumFrame[5]; |
| for (int idx = 0; idx < stack.length; idx++) { |
| stack[idx] = new OrdsIntersectTermsEnumFrame(this, idx); |
| } |
| for (int arcIdx = 0; arcIdx < arcs.length; arcIdx++) { |
| arcs[arcIdx] = new FST.Arc<>(); |
| } |
| |
| if (fr.index == null) { |
| fstReader = null; |
| } else { |
| fstReader = fr.index.getBytesReader(); |
| } |
| |
| // TODO: if the automaton is "smallish" we really |
| // should use the terms index to seek at least to |
| // the initial term and likely to subsequent terms |
| // (or, maybe just fallback to ATE for such cases). |
| // Else the seek cost of loading the frames will be |
| // too costly. |
| |
| final FST.Arc<Output> arc = fr.index.getFirstArc(arcs[0]); |
| // Empty string prefix must have an output in the index! |
| assert arc.isFinal(); |
| |
| // Special pushFrame since it's the first one: |
| final OrdsIntersectTermsEnumFrame f = stack[0]; |
| f.fp = f.fpOrig = fr.rootBlockFP; |
| f.prefix = 0; |
| f.setState(0); |
| f.arc = arc; |
| f.outputPrefix = arc.output(); |
| f.load(fr.rootCode); |
| |
| // for assert: |
| assert setSavedStartTerm(startTerm); |
| |
| currentFrame = f; |
| if (startTerm != null) { |
| seekToStartTerm(startTerm); |
| } |
| } |
| |
| // only for assert: |
| private boolean setSavedStartTerm(BytesRef startTerm) { |
| savedStartTerm = startTerm == null ? null : BytesRef.deepCopyOf(startTerm); |
| return true; |
| } |
| |
| @Override |
| public TermState termState() throws IOException { |
| currentFrame.decodeMetaData(); |
| return currentFrame.termState.clone(); |
| } |
| |
| private OrdsIntersectTermsEnumFrame getFrame(int ord) throws IOException { |
| if (ord >= stack.length) { |
| final OrdsIntersectTermsEnumFrame[] next = |
| new OrdsIntersectTermsEnumFrame |
| [ArrayUtil.oversize(1 + ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; |
| System.arraycopy(stack, 0, next, 0, stack.length); |
| for (int stackOrd = stack.length; stackOrd < next.length; stackOrd++) { |
| next[stackOrd] = new OrdsIntersectTermsEnumFrame(this, stackOrd); |
| } |
| stack = next; |
| } |
| assert stack[ord].ord == ord; |
| return stack[ord]; |
| } |
| |
| private FST.Arc<Output> getArc(int ord) { |
| if (ord >= arcs.length) { |
| @SuppressWarnings({"rawtypes", "unchecked"}) |
| final FST.Arc<Output>[] next = |
| new FST.Arc[ArrayUtil.oversize(1 + ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; |
| System.arraycopy(arcs, 0, next, 0, arcs.length); |
| for (int arcOrd = arcs.length; arcOrd < next.length; arcOrd++) { |
| next[arcOrd] = new FST.Arc<>(); |
| } |
| arcs = next; |
| } |
| return arcs[ord]; |
| } |
| |
| private OrdsIntersectTermsEnumFrame pushFrame(int state) throws IOException { |
| final OrdsIntersectTermsEnumFrame f = getFrame(currentFrame == null ? 0 : 1 + currentFrame.ord); |
| |
| f.fp = f.fpOrig = currentFrame.lastSubFP; |
| f.prefix = currentFrame.prefix + currentFrame.suffix; |
| // if (DEBUG) System.out.println(" pushFrame state=" + state + " prefix=" + f.prefix); |
| f.setState(state); |
| |
| // Walk the arc through the index -- we only |
| // "bother" with this so we can get the floor data |
| // from the index and skip floor blocks when |
| // possible: |
| FST.Arc<Output> arc = currentFrame.arc; |
| int idx = currentFrame.prefix; |
| assert currentFrame.suffix > 0; |
| Output output = currentFrame.outputPrefix; |
| while (idx < f.prefix) { |
| final int target = term.bytes[idx] & 0xff; |
| // TODO: we could be more efficient for the next() |
| // case by using current arc as starting point, |
| // passed to findTargetArc |
| arc = fr.index.findTargetArc(target, arc, getArc(1 + idx), fstReader); |
| assert arc != null; |
| output = OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output()); |
| idx++; |
| } |
| |
| f.arc = arc; |
| f.outputPrefix = output; |
| assert arc.isFinal(); |
| f.load(OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput())); |
| return f; |
| } |
| |
| @Override |
| public BytesRef term() { |
| return term; |
| } |
| |
| // TODO: do we need ord() here? OrdsIntersectTermsEnumFrame tracks termOrd but it may be buggy! |
| |
| @Override |
| public int docFreq() throws IOException { |
| // if (DEBUG) System.out.println("BTIR.docFreq"); |
| currentFrame.decodeMetaData(); |
| // if (DEBUG) System.out.println(" return " + currentFrame.termState.docFreq); |
| return currentFrame.termState.docFreq; |
| } |
| |
| @Override |
| public long totalTermFreq() throws IOException { |
| currentFrame.decodeMetaData(); |
| return currentFrame.termState.totalTermFreq; |
| } |
| |
| @Override |
| public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { |
| currentFrame.decodeMetaData(); |
| return fr.parent.postingsReader.postings(fr.fieldInfo, currentFrame.termState, reuse, flags); |
| } |
| |
| @Override |
| public ImpactsEnum impacts(int flags) throws IOException { |
| currentFrame.decodeMetaData(); |
| return fr.parent.postingsReader.impacts(fr.fieldInfo, currentFrame.termState, flags); |
| } |
| |
| private int getState() { |
| int state = currentFrame.state; |
| for (int idx = 0; idx < currentFrame.suffix; idx++) { |
| state = |
| runAutomaton.step( |
| state, currentFrame.suffixBytes[currentFrame.startBytePos + idx] & 0xff); |
| assert state != -1; |
| } |
| return state; |
| } |
| |
| // NOTE: specialized to only doing the first-time |
| // seek, but we could generalize it to allow |
| // arbitrary seekExact/Ceil. Note that this is a |
| // seekFloor! |
| private void seekToStartTerm(BytesRef target) throws IOException { |
| // if (DEBUG) System.out.println("seek to startTerm=" + target.utf8ToString()); |
| assert currentFrame.ord == 0; |
| if (term.length < target.length) { |
| term.bytes = ArrayUtil.grow(term.bytes, target.length); |
| } |
| FST.Arc<Output> arc = arcs[0]; |
| assert arc == currentFrame.arc; |
| |
| for (int idx = 0; idx <= target.length; idx++) { |
| |
| while (true) { |
| final int savePos = currentFrame.suffixesReader.getPosition(); |
| final int saveStartBytePos = currentFrame.startBytePos; |
| final int saveSuffix = currentFrame.suffix; |
| final long saveLastSubFP = currentFrame.lastSubFP; |
| final int saveTermBlockOrd = currentFrame.termState.termBlockOrd; |
| |
| final boolean isSubBlock = currentFrame.next(); |
| |
| // if (DEBUG) System.out.println(" cycle ent=" + currentFrame.nextEnt + " (of " + |
| // currentFrame.entCount + ") prefix=" + currentFrame.prefix + " suffix=" + |
| // currentFrame.suffix + " isBlock=" + isSubBlock + " firstLabel=" + (currentFrame.suffix == |
| // 0 ? "" : (currentFrame.suffixBytes[currentFrame.startBytePos])&0xff)); |
| term.length = currentFrame.prefix + currentFrame.suffix; |
| if (term.bytes.length < term.length) { |
| term.bytes = ArrayUtil.grow(term.bytes, term.length); |
| } |
| System.arraycopy( |
| currentFrame.suffixBytes, |
| currentFrame.startBytePos, |
| term.bytes, |
| currentFrame.prefix, |
| currentFrame.suffix); |
| |
| if (isSubBlock && StringHelper.startsWith(target, term)) { |
| // Recurse |
| // if (DEBUG) System.out.println(" recurse!"); |
| currentFrame = pushFrame(getState()); |
| break; |
| } else { |
| final int cmp = term.compareTo(target); |
| if (cmp < 0) { |
| if (currentFrame.nextEnt == currentFrame.entCount) { |
| if (!currentFrame.isLastInFloor) { |
| // if (DEBUG) System.out.println(" load floorBlock"); |
| currentFrame.loadNextFloorBlock(); |
| continue; |
| } else { |
| // if (DEBUG) System.out.println(" return term=" + brToString(term)); |
| return; |
| } |
| } |
| continue; |
| } else if (cmp == 0) { |
| // if (DEBUG) System.out.println(" return term=" + brToString(term)); |
| return; |
| } else { |
| // Fallback to prior entry: the semantics of |
| // this method is that the first call to |
| // next() will return the term after the |
| // requested term |
| currentFrame.nextEnt--; |
| currentFrame.lastSubFP = saveLastSubFP; |
| currentFrame.startBytePos = saveStartBytePos; |
| currentFrame.suffix = saveSuffix; |
| currentFrame.suffixesReader.setPosition(savePos); |
| currentFrame.termState.termBlockOrd = saveTermBlockOrd; |
| System.arraycopy( |
| currentFrame.suffixBytes, |
| currentFrame.startBytePos, |
| term.bytes, |
| currentFrame.prefix, |
| currentFrame.suffix); |
| term.length = currentFrame.prefix + currentFrame.suffix; |
| // If the last entry was a block we don't |
| // need to bother recursing and pushing to |
| // the last term under it because the first |
| // next() will simply skip the frame anyway |
| return; |
| } |
| } |
| } |
| } |
| |
| assert false; |
| } |
| |
| @Override |
| public BytesRef next() throws IOException { |
| |
| // if (DEBUG) { |
| // System.out.println("\nintEnum.next seg=" + segment); |
| // System.out.println(" frame ord=" + currentFrame.ord + " prefix=" + brToString(new |
| // BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " |
| // lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + |
| // (currentFrame.transitions.length == 0 ? "n/a" : |
| // currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + |
| // currentFrame.outputPrefix); |
| // } |
| |
| nextTerm: |
| while (true) { |
| // Pop finished frames |
| while (currentFrame.nextEnt == currentFrame.entCount) { |
| if (!currentFrame.isLastInFloor) { |
| // if (DEBUG) System.out.println(" next-floor-block"); |
| currentFrame.loadNextFloorBlock(); |
| // if (DEBUG) System.out.println("\n frame ord=" + currentFrame.ord + " prefix=" + |
| // brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + |
| // currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + |
| // currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : |
| // currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + |
| // currentFrame.outputPrefix); |
| } else { |
| // if (DEBUG) System.out.println(" pop frame"); |
| if (currentFrame.ord == 0) { |
| return null; |
| } |
| final long lastFP = currentFrame.fpOrig; |
| currentFrame = stack[currentFrame.ord - 1]; |
| assert currentFrame.lastSubFP == lastFP; |
| // if (DEBUG) System.out.println("\n frame ord=" + currentFrame.ord + " prefix=" + |
| // brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + |
| // currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + |
| // currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : |
| // currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + |
| // currentFrame.outputPrefix); |
| } |
| } |
| |
| final boolean isSubBlock = currentFrame.next(); |
| // if (DEBUG) { |
| // final BytesRef suffixRef = new BytesRef(); |
| // suffixRef.bytes = currentFrame.suffixBytes; |
| // suffixRef.offset = currentFrame.startBytePos; |
| // suffixRef.length = currentFrame.suffix; |
| // System.out.println(" " + (isSubBlock ? "sub-block" : "term") + " " + |
| // currentFrame.nextEnt + " (of " + currentFrame.entCount + ") suffix=" + |
| // brToString(suffixRef)); |
| // } |
| |
| if (currentFrame.suffix != 0) { |
| final int label = currentFrame.suffixBytes[currentFrame.startBytePos] & 0xff; |
| while (label > currentFrame.curTransitionMax) { |
| if (currentFrame.transitionIndex >= currentFrame.transitionCount - 1) { |
| // Stop processing this frame -- no further |
| // matches are possible because we've moved |
| // beyond what the max transition will allow |
| // if (DEBUG) System.out.println(" break: trans=" + |
| // (currentFrame.transitions.length == 0 ? "n/a" : |
| // currentFrame.transitions[currentFrame.transitionIndex])); |
| |
| // sneaky! forces a pop above |
| currentFrame.isLastInFloor = true; |
| currentFrame.nextEnt = currentFrame.entCount; |
| continue nextTerm; |
| } |
| currentFrame.transitionIndex++; |
| compiledAutomaton.automaton.getNextTransition(currentFrame.transition); |
| currentFrame.curTransitionMax = currentFrame.transition.max; |
| // if (DEBUG) System.out.println(" next trans=" + |
| // currentFrame.transitions[currentFrame.transitionIndex]); |
| } |
| } |
| |
| // First test the common suffix, if set: |
| if (compiledAutomaton.commonSuffixRef != null && !isSubBlock) { |
| final int termLen = currentFrame.prefix + currentFrame.suffix; |
| if (termLen < compiledAutomaton.commonSuffixRef.length) { |
| // No match |
| // if (DEBUG) { |
| // System.out.println(" skip: common suffix length"); |
| // } |
| continue nextTerm; |
| } |
| |
| final byte[] suffixBytes = currentFrame.suffixBytes; |
| final byte[] commonSuffixBytes = compiledAutomaton.commonSuffixRef.bytes; |
| |
| final int lenInPrefix = compiledAutomaton.commonSuffixRef.length - currentFrame.suffix; |
| assert compiledAutomaton.commonSuffixRef.offset == 0; |
| int suffixBytesPos; |
| int commonSuffixBytesPos = 0; |
| |
| if (lenInPrefix > 0) { |
| // A prefix of the common suffix overlaps with |
| // the suffix of the block prefix so we first |
| // test whether the prefix part matches: |
| final byte[] termBytes = term.bytes; |
| int termBytesPos = currentFrame.prefix - lenInPrefix; |
| assert termBytesPos >= 0; |
| final int termBytesPosEnd = currentFrame.prefix; |
| while (termBytesPos < termBytesPosEnd) { |
| if (termBytes[termBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) { |
| // if (DEBUG) { |
| // System.out.println(" skip: common suffix mismatch (in prefix)"); |
| // } |
| continue nextTerm; |
| } |
| } |
| suffixBytesPos = currentFrame.startBytePos; |
| } else { |
| suffixBytesPos = |
| currentFrame.startBytePos |
| + currentFrame.suffix |
| - compiledAutomaton.commonSuffixRef.length; |
| } |
| |
| // Test overlapping suffix part: |
| final int commonSuffixBytesPosEnd = compiledAutomaton.commonSuffixRef.length; |
| while (commonSuffixBytesPos < commonSuffixBytesPosEnd) { |
| if (suffixBytes[suffixBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) { |
| // if (DEBUG) { |
| // System.out.println(" skip: common suffix mismatch"); |
| // } |
| continue nextTerm; |
| } |
| } |
| } |
| |
| // TODO: maybe we should do the same linear test |
| // that AutomatonTermsEnum does, so that if we |
| // reach a part of the automaton where .* is |
| // "temporarily" accepted, we just blindly .next() |
| // until the limit |
| |
| // See if the term prefix matches the automaton: |
| int state = currentFrame.state; |
| for (int idx = 0; idx < currentFrame.suffix; idx++) { |
| state = |
| runAutomaton.step( |
| state, currentFrame.suffixBytes[currentFrame.startBytePos + idx] & 0xff); |
| if (state == -1) { |
| // No match |
| // System.out.println(" no s=" + state); |
| continue nextTerm; |
| } else { |
| // System.out.println(" c s=" + state); |
| } |
| } |
| |
| if (isSubBlock) { |
| // Match! Recurse: |
| // if (DEBUG) System.out.println(" sub-block match to state=" + state + "; recurse fp=" |
| // + currentFrame.lastSubFP); |
| copyTerm(); |
| currentFrame = pushFrame(state); |
| // if (DEBUG) System.out.println("\n frame ord=" + currentFrame.ord + " prefix=" + |
| // brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + |
| // currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + |
| // currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : |
| // currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + |
| // currentFrame.outputPrefix); |
| } else if (runAutomaton.isAccept(state)) { |
| copyTerm(); |
| // if (DEBUG) System.out.println(" term match to state=" + state + "; return term=" + |
| // brToString(term)); |
| assert savedStartTerm == null || term.compareTo(savedStartTerm) > 0 |
| : "saveStartTerm=" + savedStartTerm.utf8ToString() + " term=" + term.utf8ToString(); |
| return term; |
| } else { |
| // System.out.println(" no s=" + state); |
| } |
| } |
| } |
| |
| private void copyTerm() { |
| // System.out.println(" copyTerm cur.prefix=" + currentFrame.prefix + " cur.suffix=" + |
| // currentFrame.suffix + " first=" + (char) |
| // currentFrame.suffixBytes[currentFrame.startBytePos]); |
| final int len = currentFrame.prefix + currentFrame.suffix; |
| if (term.bytes.length < len) { |
| term.bytes = ArrayUtil.grow(term.bytes, len); |
| } |
| System.arraycopy( |
| currentFrame.suffixBytes, |
| currentFrame.startBytePos, |
| term.bytes, |
| currentFrame.prefix, |
| currentFrame.suffix); |
| term.length = len; |
| } |
| |
| @Override |
| public boolean seekExact(BytesRef text) { |
| throw new UnsupportedOperationException(); |
| } |
| |
| @Override |
| public void seekExact(long ord) { |
| throw new UnsupportedOperationException(); |
| } |
| |
| @Override |
| public long ord() { |
| throw new UnsupportedOperationException(); |
| } |
| |
| @Override |
| public SeekStatus seekCeil(BytesRef text) { |
| throw new UnsupportedOperationException(); |
| } |
| } |