| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.codecs.idversion; |
| |
| import java.io.IOException; |
| import java.io.PrintStream; |
| |
| import org.apache.lucene.codecs.BlockTermState; |
| import org.apache.lucene.index.BaseTermsEnum; |
| import org.apache.lucene.index.ImpactsEnum; |
| import org.apache.lucene.index.PostingsEnum; |
| import org.apache.lucene.index.SlowImpactsEnum; |
| import org.apache.lucene.index.TermState; |
| import org.apache.lucene.store.ByteArrayDataInput; |
| import org.apache.lucene.store.IndexInput; |
| import org.apache.lucene.util.ArrayUtil; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.BytesRefBuilder; |
| import org.apache.lucene.util.RamUsageEstimator; |
| import org.apache.lucene.util.fst.FST; |
| import org.apache.lucene.util.fst.PairOutputs.Pair; |
| import org.apache.lucene.util.fst.Util; |
| |
| /** Iterates through terms in this field; this class is public so users |
| * can cast it to call {@link #seekExact(BytesRef, long)} for |
| * optimistic-concurrency, and also {@link #getVersion} to get the |
| * version of the currently seek'd term. */ |
| public final class IDVersionSegmentTermsEnum extends BaseTermsEnum { |
| |
| // Lazy init: |
| IndexInput in; |
| |
| // static boolean DEBUG = false; |
| |
| private IDVersionSegmentTermsEnumFrame[] stack; |
| private final IDVersionSegmentTermsEnumFrame staticFrame; |
| IDVersionSegmentTermsEnumFrame currentFrame; |
| boolean termExists; |
| final VersionFieldReader fr; |
| |
| private int targetBeforeCurrentLength; |
| |
| private final ByteArrayDataInput scratchReader = new ByteArrayDataInput(); |
| |
| // What prefix of the current term was present in the index: |
| private int validIndexPrefix; |
| |
| // assert only: |
| private boolean eof; |
| |
| final BytesRefBuilder term = new BytesRefBuilder(); |
| private final FST.BytesReader fstReader; |
| |
| @SuppressWarnings({"rawtypes","unchecked"}) private FST.Arc<Pair<BytesRef,Long>>[] arcs = |
| new FST.Arc[1]; |
| |
| IDVersionSegmentTermsEnum(VersionFieldReader fr) throws IOException { |
| this.fr = fr; |
| |
| //if (DEBUG) System.out.println("BTTR.init seg=" + segment); |
| stack = new IDVersionSegmentTermsEnumFrame[0]; |
| |
| // Used to hold seek by TermState, or cached seek |
| staticFrame = new IDVersionSegmentTermsEnumFrame(this, -1); |
| |
| if (fr.index == null) { |
| fstReader = null; |
| } else { |
| fstReader = fr.index.getBytesReader(); |
| } |
| |
| // Init w/ root block; don't use index since it may |
| // not (and need not) have been loaded |
| for(int arcIdx=0;arcIdx<arcs.length;arcIdx++) { |
| arcs[arcIdx] = new FST.Arc<>(); |
| } |
| |
| currentFrame = staticFrame; |
| final FST.Arc<Pair<BytesRef,Long>> arc; |
| if (fr.index != null) { |
| arc = fr.index.getFirstArc(arcs[0]); |
| // Empty string prefix must have an output in the index! |
| assert arc.isFinal(); |
| } else { |
| arc = null; |
| } |
| currentFrame = staticFrame; |
| //currentFrame = pushFrame(arc, rootCode, 0); |
| //currentFrame.loadBlock(); |
| validIndexPrefix = 0; |
| // if (DEBUG) { |
| // System.out.println("init frame state " + currentFrame.ord); |
| // printSeekState(); |
| // } |
| |
| //System.out.println(); |
| // computeBlockStats().print(System.out); |
| } |
| |
| // Not private to avoid synthetic access$NNN methods |
| void initIndexInput() { |
| if (this.in == null) { |
| this.in = fr.parent.in.clone(); |
| } |
| } |
| |
| private IDVersionSegmentTermsEnumFrame getFrame(int ord) throws IOException { |
| if (ord >= stack.length) { |
| final IDVersionSegmentTermsEnumFrame[] next = new IDVersionSegmentTermsEnumFrame[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; |
| System.arraycopy(stack, 0, next, 0, stack.length); |
| for(int stackOrd=stack.length;stackOrd<next.length;stackOrd++) { |
| next[stackOrd] = new IDVersionSegmentTermsEnumFrame(this, stackOrd); |
| } |
| stack = next; |
| } |
| assert stack[ord].ord == ord; |
| return stack[ord]; |
| } |
| |
| private FST.Arc<Pair<BytesRef,Long>> getArc(int ord) { |
| if (ord >= arcs.length) { |
| @SuppressWarnings({"rawtypes","unchecked"}) final FST.Arc<Pair<BytesRef,Long>>[] next = |
| new FST.Arc[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; |
| System.arraycopy(arcs, 0, next, 0, arcs.length); |
| for(int arcOrd=arcs.length;arcOrd<next.length;arcOrd++) { |
| next[arcOrd] = new FST.Arc<>(); |
| } |
| arcs = next; |
| } |
| return arcs[ord]; |
| } |
| |
| // Pushes a frame we seek'd to |
| IDVersionSegmentTermsEnumFrame pushFrame(FST.Arc<Pair<BytesRef,Long>> arc, Pair<BytesRef,Long> frameData, int length) throws IOException { |
| scratchReader.reset(frameData.output1.bytes, frameData.output1.offset, frameData.output1.length); |
| final long code = scratchReader.readVLong(); |
| final long fpSeek = code >>> VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS; |
| final IDVersionSegmentTermsEnumFrame f = getFrame(1+currentFrame.ord); |
| f.maxIDVersion = Long.MAX_VALUE - frameData.output2; |
| f.hasTerms = (code & VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS) != 0; |
| f.hasTermsOrig = f.hasTerms; |
| f.isFloor = (code & VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0; |
| if (f.isFloor) { |
| f.setFloorData(scratchReader, frameData.output1); |
| } |
| pushFrame(arc, fpSeek, length); |
| |
| return f; |
| } |
| |
| // Pushes next'd frame or seek'd frame; we later |
| // lazy-load the frame only when needed |
| IDVersionSegmentTermsEnumFrame pushFrame(FST.Arc<Pair<BytesRef,Long>> arc, long fp, int length) throws IOException { |
| final IDVersionSegmentTermsEnumFrame f = getFrame(1+currentFrame.ord); |
| f.arc = arc; |
| if (f.fpOrig == fp && f.nextEnt != -1) { |
| //if (DEBUG) System.out.println(" push reused frame ord=" + f.ord + " fp=" + f.fp + " isFloor?=" + f.isFloor + " hasTerms=" + f.hasTerms + " pref=" + term + " nextEnt=" + f.nextEnt + " targetBeforeCurrentLength=" + targetBeforeCurrentLength + " term.length=" + term.length + " vs prefix=" + f.prefix); |
| if (f.prefix > targetBeforeCurrentLength) { |
| f.rewind(); |
| } else { |
| // if (DEBUG) { |
| // System.out.println(" skip rewind!"); |
| // } |
| } |
| assert length == f.prefix; |
| } else { |
| f.nextEnt = -1; |
| f.prefix = length; |
| f.state.termBlockOrd = 0; |
| f.fpOrig = f.fp = fp; |
| f.lastSubFP = -1; |
| // if (DEBUG) { |
| // final int sav = term.length; |
| // term.length = length; |
| // System.out.println(" push new frame ord=" + f.ord + " fp=" + f.fp + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " pref=" + brToString(term)); |
| // term.length = sav; |
| // } |
| } |
| |
| return f; |
| } |
| |
| // asserts only |
| private boolean clearEOF() { |
| eof = false; |
| return true; |
| } |
| |
| // asserts only |
| private boolean setEOF() { |
| eof = true; |
| return true; |
| } |
| |
| @Override |
| public boolean seekExact(final BytesRef target) throws IOException { |
| return seekExact(target, 0); |
| } |
| |
| // for debugging |
| @SuppressWarnings("unused") |
| static String brToString(BytesRef b) { |
| try { |
| return b.utf8ToString() + " " + b; |
| } catch (Throwable t) { |
| // If BytesRef isn't actually UTF8, or it's eg a |
| // prefix of UTF8 that ends mid-unicode-char, we |
| // fallback to hex: |
| return b.toString(); |
| } |
| } |
| |
| /** Get the version of the currently seek'd term; only valid if we are |
| * positioned. */ |
| public long getVersion() { |
| return ((IDVersionTermState) currentFrame.state).idVersion; |
| } |
| |
| /** Optimized version of {@link #seekExact(BytesRef)} that can |
| * sometimes fail-fast if the version indexed with the requested ID |
| * is less than the specified minIDVersion. Applications that index |
| * a monotonically increasing global version with each document can |
| * use this for fast optimistic concurrency. */ |
| public boolean seekExact(final BytesRef target, long minIDVersion) throws IOException { |
| |
| if (fr.index == null) { |
| throw new IllegalStateException("terms index was not loaded"); |
| } |
| |
| term.grow(1 + target.length); |
| |
| assert clearEOF(); |
| |
| // if (DEBUG) { |
| // System.out.println("\nBTTR.seekExact seg=" + fr.parent.segment + " target=" + fr.fieldInfo.name + ":" + brToString(target) + " minIDVersion=" + minIDVersion + " current=" + brToString(term) + " (exists?=" + termExists + ") validIndexPrefix=" + validIndexPrefix); |
| // printSeekState(System.out); |
| // } |
| |
| FST.Arc<Pair<BytesRef,Long>> arc; |
| int targetUpto; |
| Pair<BytesRef,Long> output; |
| |
| long startFrameFP = currentFrame.fp; |
| |
| targetBeforeCurrentLength = currentFrame.ord; |
| |
| boolean changed = false; |
| |
| // TODO: we could stop earlier w/ the version check, every time we traverse an index arc we can check? |
| |
| if (currentFrame != staticFrame) { |
| |
| // We are already seek'd; find the common |
| // prefix of new seek term vs current term and |
| // re-use the corresponding seek state. For |
| // example, if app first seeks to foobar, then |
| // seeks to foobaz, we can re-use the seek state |
| // for the first 5 bytes. |
| |
| // if (DEBUG) { |
| // System.out.println(" re-use current seek state validIndexPrefix=" + validIndexPrefix); |
| // } |
| |
| arc = arcs[0]; |
| assert arc.isFinal(); |
| output = arc.output(); |
| targetUpto = 0; |
| |
| IDVersionSegmentTermsEnumFrame lastFrame = stack[0]; |
| assert validIndexPrefix <= term.length(): "validIndexPrefix=" + validIndexPrefix + " term.length=" + term.length() + " seg=" + fr.parent; |
| |
| final int targetLimit = Math.min(target.length, validIndexPrefix); |
| |
| int cmp = 0; |
| |
| // TODO: reverse vLong byte order for better FST |
| // prefix output sharing |
| |
| // First compare up to valid seek frames: |
| while (targetUpto < targetLimit) { |
| cmp = (term.byteAt(targetUpto)&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF); |
| // if (DEBUG) { |
| // System.out.println(" cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")" + " arc.output=" + arc.output + " output=" + output); |
| // } |
| if (cmp != 0) { |
| break; |
| } |
| arc = arcs[1+targetUpto]; |
| //if (arc.label != (target.bytes[target.offset + targetUpto] & 0xFF)) { |
| //System.out.println("FAIL: arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF)); |
| //} |
| assert arc.label() == (target.bytes[target.offset + targetUpto] & 0xFF): "arc.label=" + (char) arc.label() + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF); |
| if (arc.output() != VersionBlockTreeTermsWriter.NO_OUTPUT) { |
| output = VersionBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output()); |
| } |
| if (arc.isFinal()) { |
| lastFrame = stack[1+lastFrame.ord]; |
| } |
| targetUpto++; |
| } |
| |
| if (cmp == 0) { |
| final int targetUptoMid = targetUpto; |
| |
| // Second compare the rest of the term, but |
| // don't save arc/output/frame; we only do this |
| // to find out if the target term is before, |
| // equal or after the current term |
| final int targetLimit2 = Math.min(target.length, term.length()); |
| while (targetUpto < targetLimit2) { |
| cmp = (term.byteAt(targetUpto)&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF); |
| // if (DEBUG) { |
| // System.out.println(" cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")"); |
| // } |
| if (cmp != 0) { |
| break; |
| } |
| targetUpto++; |
| } |
| |
| if (cmp == 0) { |
| cmp = term.length() - target.length; |
| } |
| targetUpto = targetUptoMid; |
| } |
| |
| if (cmp < 0) { |
| // Common case: target term is after current |
| // term, ie, app is seeking multiple terms |
| // in sorted order |
| // if (DEBUG) { |
| // System.out.println(" target is after current (shares prefixLen=" + targetUpto + "); frame.ord=" + lastFrame.ord + "; targetUpto=" + targetUpto); |
| // } |
| currentFrame = lastFrame; |
| |
| } else if (cmp > 0) { |
| // Uncommon case: target term |
| // is before current term; this means we can |
| // keep the currentFrame but we must rewind it |
| // (so we scan from the start) |
| targetBeforeCurrentLength = 0; |
| changed = true; |
| // if (DEBUG) { |
| // System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); rewind frame ord=" + lastFrame.ord); |
| // } |
| currentFrame = lastFrame; |
| currentFrame.rewind(); |
| } else { |
| // Target is exactly the same as current term |
| assert term.length() == target.length; |
| if (termExists) { |
| |
| if (currentFrame.maxIDVersion < minIDVersion) { |
| // The max version for all terms in this block is lower than the minVersion |
| // if (DEBUG) { |
| // System.out.println(" target is same as current maxIDVersion=" + currentFrame.maxIDVersion + " is < minIDVersion=" + minIDVersion + "; return false"); |
| // } |
| return false; |
| } |
| |
| currentFrame.decodeMetaData(); |
| if (((IDVersionTermState) currentFrame.state).idVersion < minIDVersion) { |
| // This term's version is lower than the minVersion |
| // if (DEBUG) { |
| // System.out.println(" target is same as current but version=" + ((IDVersionTermState) currentFrame.state).idVersion + " is < minIDVersion=" + minIDVersion + "; return false"); |
| // } |
| return false; |
| } |
| // System.out.println(" term version=" + ((IDVersionTermState) currentFrame.state).idVersion + " frame version=" + currentFrame.maxIDVersion + " frame ord=" + currentFrame.ord); |
| |
| // if (DEBUG) { |
| // System.out.println(" target is same as current; return true"); |
| // } |
| return true; |
| } else { |
| // if (DEBUG) { |
| // System.out.println(" target is same as current but term doesn't exist"); |
| // } |
| } |
| //validIndexPrefix = currentFrame.depth; |
| //term.length = target.length; |
| //return termExists; |
| } |
| |
| } else { |
| |
| targetBeforeCurrentLength = -1; |
| arc = fr.index.getFirstArc(arcs[0]); |
| //System.out.println("first arc=" + arc); |
| |
| // Empty string prefix must have an output (block) in the index! |
| assert arc.isFinal(); |
| assert arc.output() != null; |
| |
| // if (DEBUG) { |
| // System.out.println(" no seek state; push root frame"); |
| // } |
| |
| output = arc.output(); |
| |
| currentFrame = staticFrame; |
| |
| //term.length = 0; |
| targetUpto = 0; |
| currentFrame = pushFrame(arc, VersionBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput()), 0); |
| } |
| |
| // if (DEBUG) { |
| // System.out.println(" start index loop targetUpto=" + targetUpto + " output=" + output + " currentFrame.ord=" + currentFrame.ord + " targetBeforeCurrentLength=" + targetBeforeCurrentLength + " termExists=" + termExists); |
| // } |
| |
| // We are done sharing the common prefix with the incoming target and where we are currently seek'd; now continue walking the index: |
| while (targetUpto < target.length) { |
| |
| final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF; |
| |
| final FST.Arc<Pair<BytesRef,Long>> nextArc = fr.index.findTargetArc(targetLabel, arc, getArc(1+targetUpto), fstReader); |
| |
| if (nextArc == null) { |
| |
| // Index is exhausted |
| // if (DEBUG) { |
| // System.out.println(" index: index exhausted label=" + ((char) targetLabel) + " " + Integer.toHexString(targetLabel) + " termExists=" + termExists); |
| // } |
| |
| validIndexPrefix = currentFrame.prefix; |
| //validIndexPrefix = targetUpto; |
| |
| currentFrame.scanToFloorFrame(target); |
| |
| if (!currentFrame.hasTerms) { |
| termExists = false; |
| term.setByteAt(targetUpto, (byte) targetLabel); |
| term.setLength(1+targetUpto); |
| // if (DEBUG) { |
| // System.out.println(" FAST NOT_FOUND term=" + brToString(term)); |
| // } |
| return false; |
| } |
| //System.out.println(" check maxVersion=" + currentFrame.maxIDVersion + " vs " + minIDVersion); |
| |
| // if (DEBUG) { |
| // System.out.println(" frame.maxIDVersion=" + currentFrame.maxIDVersion + " vs minIDVersion=" + minIDVersion); |
| // } |
| |
| if (currentFrame.maxIDVersion < minIDVersion) { |
| // The max version for all terms in this block is lower than the minVersion |
| if (currentFrame.fp != startFrameFP || changed) { |
| //if (targetUpto+1 > term.length) { |
| termExists = false; |
| term.setByteAt(targetUpto, (byte) targetLabel); |
| term.setLength(1+targetUpto); |
| // if (DEBUG) { |
| // System.out.println(" reset current term"); |
| // } |
| validIndexPrefix = Math.min(validIndexPrefix, term.length()); |
| } |
| //if (currentFrame.ord != startFrameOrd) { |
| //termExists = false; |
| //} |
| // if (DEBUG) { |
| // System.out.println(" FAST version NOT_FOUND term=" + brToString(term) + " targetUpto=" + targetUpto + " currentFrame.maxIDVersion=" + currentFrame.maxIDVersion + " validIndexPrefix=" + validIndexPrefix + " startFrameFP=" + startFrameFP + " vs " + currentFrame.fp + " termExists=" + termExists); |
| // } |
| return false; |
| } |
| |
| currentFrame.loadBlock(); |
| |
| // if (DEBUG) { |
| // System.out.println(" scan currentFrame ord=" + currentFrame.ord); |
| // } |
| final SeekStatus result = currentFrame.scanToTerm(target, true); |
| if (result == SeekStatus.FOUND) { |
| currentFrame.decodeMetaData(); |
| if (((IDVersionTermState) currentFrame.state).idVersion < minIDVersion) { |
| // This term's version is lower than the minVersion |
| // if (DEBUG) { |
| // System.out.println(" return NOT_FOUND: idVersion=" + ((IDVersionTermState) currentFrame.state).idVersion + " vs minIDVersion=" + minIDVersion); |
| // } |
| return false; |
| } |
| |
| // if (DEBUG) { |
| // System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term); |
| // } |
| |
| return true; |
| } else { |
| // if (DEBUG) { |
| // System.out.println(" got " + result + "; return NOT_FOUND term=" + brToString(term)); |
| // } |
| return false; |
| } |
| } else { |
| // Follow this arc |
| arc = nextArc; |
| if (term.byteAt(targetUpto) != (byte) targetLabel) { |
| // if (DEBUG) { |
| // System.out.println(" now set termExists=false targetUpto=" + targetUpto + " term=" + term.bytes[targetUpto] + " targetLabel=" + targetLabel); |
| // } |
| changed = true; |
| term.setByteAt(targetUpto, (byte) targetLabel); |
| termExists = false; |
| } |
| // Aggregate output as we go: |
| assert arc.output() != null; |
| if (arc.output() != VersionBlockTreeTermsWriter.NO_OUTPUT) { |
| output = VersionBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output()); |
| } |
| |
| // if (DEBUG) { |
| // System.out.println(" index: follow label=" + (char) ((target.bytes[target.offset + targetUpto]&0xff)) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput); |
| // } |
| targetUpto++; |
| |
| if (arc.isFinal()) { |
| // if (DEBUG) System.out.println(" arc is final!"); |
| currentFrame = pushFrame(arc, VersionBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput()), targetUpto); |
| // if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + currentFrame.hasTerms); |
| } |
| } |
| } |
| |
| //validIndexPrefix = targetUpto; |
| validIndexPrefix = currentFrame.prefix; |
| |
| currentFrame.scanToFloorFrame(target); |
| |
| // Target term is entirely contained in the index: |
| if (!currentFrame.hasTerms) { |
| termExists = false; |
| term.setLength(targetUpto); |
| // if (DEBUG) { |
| // System.out.println(" FAST NOT_FOUND term=" + brToString(term)); |
| // } |
| return false; |
| } |
| |
| // if (DEBUG) { |
| // System.out.println(" frame.maxIDVersion=" + currentFrame.maxIDVersion + " vs minIDVersion=" + minIDVersion); |
| // } |
| |
| if (currentFrame.maxIDVersion < minIDVersion) { |
| // The max version for all terms in this block is lower than the minVersion |
| termExists = false; |
| term.setLength(targetUpto); |
| return false; |
| } |
| |
| currentFrame.loadBlock(); |
| |
| final SeekStatus result = currentFrame.scanToTerm(target, true); |
| if (result == SeekStatus.FOUND) { |
| // if (DEBUG) { |
| // System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term); |
| // } |
| currentFrame.decodeMetaData(); |
| if (((IDVersionTermState) currentFrame.state).idVersion < minIDVersion) { |
| // This term's version is lower than the minVersion |
| return false; |
| } |
| return true; |
| } else { |
| // if (DEBUG) { |
| // System.out.println(" got result " + result + "; return NOT_FOUND term=" + term.utf8ToString()); |
| // } |
| |
| return false; |
| } |
| } |
| |
| @Override |
| public SeekStatus seekCeil(final BytesRef target) throws IOException { |
| if (fr.index == null) { |
| throw new IllegalStateException("terms index was not loaded"); |
| } |
| |
| term.grow(1 + target.length); |
| |
| assert clearEOF(); |
| |
| //if (DEBUG) { |
| //System.out.println("\nBTTR.seekCeil seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + brToString(term) + " (exists?=" + termExists + ") validIndexPrefix= " + validIndexPrefix); |
| //printSeekState(); |
| //} |
| |
| FST.Arc<Pair<BytesRef,Long>> arc; |
| int targetUpto; |
| Pair<BytesRef,Long> output; |
| |
| targetBeforeCurrentLength = currentFrame.ord; |
| |
| if (currentFrame != staticFrame) { |
| |
| // We are already seek'd; find the common |
| // prefix of new seek term vs current term and |
| // re-use the corresponding seek state. For |
| // example, if app first seeks to foobar, then |
| // seeks to foobaz, we can re-use the seek state |
| // for the first 5 bytes. |
| |
| //if (DEBUG) { |
| //System.out.println(" re-use current seek state validIndexPrefix=" + validIndexPrefix); |
| //} |
| |
| arc = arcs[0]; |
| assert arc.isFinal(); |
| output = arc.output(); |
| targetUpto = 0; |
| |
| IDVersionSegmentTermsEnumFrame lastFrame = stack[0]; |
| assert validIndexPrefix <= term.length(); |
| |
| final int targetLimit = Math.min(target.length, validIndexPrefix); |
| |
| int cmp = 0; |
| |
| // TODO: we should write our vLong backwards (MSB |
| // first) to get better sharing from the FST |
| |
| // First compare up to valid seek frames: |
| while (targetUpto < targetLimit) { |
| cmp = (term.byteAt(targetUpto)&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF); |
| //if (DEBUG) { |
| //System.out.println(" cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")" + " arc.output=" + arc.output + " output=" + output); |
| //} |
| if (cmp != 0) { |
| break; |
| } |
| arc = arcs[1+targetUpto]; |
| assert arc.label() == (target.bytes[target.offset + targetUpto] & 0xFF): "arc.label=" + (char) arc.label() + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF); |
| // TODO: we could save the outputs in local |
| // byte[][] instead of making new objs ever |
| // seek; but, often the FST doesn't have any |
| // shared bytes (but this could change if we |
| // reverse vLong byte order) |
| if (arc.output() != VersionBlockTreeTermsWriter.NO_OUTPUT) { |
| output = VersionBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output()); |
| } |
| if (arc.isFinal()) { |
| lastFrame = stack[1+lastFrame.ord]; |
| } |
| targetUpto++; |
| } |
| |
| |
| if (cmp == 0) { |
| final int targetUptoMid = targetUpto; |
| // Second compare the rest of the term, but |
| // don't save arc/output/frame: |
| final int targetLimit2 = Math.min(target.length, term.length()); |
| while (targetUpto < targetLimit2) { |
| cmp = (term.byteAt(targetUpto)&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF); |
| //if (DEBUG) { |
| //System.out.println(" cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")"); |
| //} |
| if (cmp != 0) { |
| break; |
| } |
| targetUpto++; |
| } |
| |
| if (cmp == 0) { |
| cmp = term.length() - target.length; |
| } |
| targetUpto = targetUptoMid; |
| } |
| |
| if (cmp < 0) { |
| // Common case: target term is after current |
| // term, ie, app is seeking multiple terms |
| // in sorted order |
| //if (DEBUG) { |
| //System.out.println(" target is after current (shares prefixLen=" + targetUpto + "); clear frame.scanned ord=" + lastFrame.ord); |
| //} |
| currentFrame = lastFrame; |
| |
| } else if (cmp > 0) { |
| // Uncommon case: target term |
| // is before current term; this means we can |
| // keep the currentFrame but we must rewind it |
| // (so we scan from the start) |
| targetBeforeCurrentLength = 0; |
| //if (DEBUG) { |
| //System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); rewind frame ord=" + lastFrame.ord); |
| //} |
| currentFrame = lastFrame; |
| currentFrame.rewind(); |
| } else { |
| // Target is exactly the same as current term |
| assert term.length() == target.length; |
| if (termExists) { |
| //if (DEBUG) { |
| //System.out.println(" target is same as current; return FOUND"); |
| //} |
| return SeekStatus.FOUND; |
| } else { |
| //if (DEBUG) { |
| //System.out.println(" target is same as current but term doesn't exist"); |
| //} |
| } |
| } |
| |
| } else { |
| |
| targetBeforeCurrentLength = -1; |
| arc = fr.index.getFirstArc(arcs[0]); |
| |
| // Empty string prefix must have an output (block) in the index! |
| assert arc.isFinal(); |
| assert arc.output() != null; |
| |
| //if (DEBUG) { |
| //System.out.println(" no seek state; push root frame"); |
| //} |
| |
| output = arc.output(); |
| |
| currentFrame = staticFrame; |
| |
| //term.length = 0; |
| targetUpto = 0; |
| currentFrame = pushFrame(arc, VersionBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput()), 0); |
| } |
| |
| //if (DEBUG) { |
| //System.out.println(" start index loop targetUpto=" + targetUpto + " output=" + output + " currentFrame.ord+1=" + currentFrame.ord + " targetBeforeCurrentLength=" + targetBeforeCurrentLength); |
| //} |
| |
| // We are done sharing the common prefix with the incoming target and where we are currently seek'd; now continue walking the index: |
| while (targetUpto < target.length) { |
| |
| final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF; |
| |
| final FST.Arc<Pair<BytesRef,Long>> nextArc = fr.index.findTargetArc(targetLabel, arc, getArc(1+targetUpto), fstReader); |
| |
| if (nextArc == null) { |
| |
| // Index is exhausted |
| // if (DEBUG) { |
| // System.out.println(" index: index exhausted label=" + ((char) targetLabel) + " " + toHex(targetLabel)); |
| // } |
| |
| validIndexPrefix = currentFrame.prefix; |
| //validIndexPrefix = targetUpto; |
| |
| currentFrame.scanToFloorFrame(target); |
| |
| currentFrame.loadBlock(); |
| |
| final SeekStatus result = currentFrame.scanToTerm(target, false); |
| if (result == SeekStatus.END) { |
| term.copyBytes(target); |
| termExists = false; |
| |
| if (next() != null) { |
| //if (DEBUG) { |
| //System.out.println(" return NOT_FOUND term=" + brToString(term) + " " + term); |
| //} |
| return SeekStatus.NOT_FOUND; |
| } else { |
| //if (DEBUG) { |
| //System.out.println(" return END"); |
| //} |
| return SeekStatus.END; |
| } |
| } else { |
| //if (DEBUG) { |
| //System.out.println(" return " + result + " term=" + brToString(term) + " " + term); |
| //} |
| return result; |
| } |
| } else { |
| // Follow this arc |
| term.setByteAt(targetUpto, (byte) targetLabel); |
| arc = nextArc; |
| // Aggregate output as we go: |
| assert arc.output() != null; |
| if (arc.output() != VersionBlockTreeTermsWriter.NO_OUTPUT) { |
| output = VersionBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output()); |
| } |
| |
| //if (DEBUG) { |
| //System.out.println(" index: follow label=" + toHex(target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput); |
| //} |
| targetUpto++; |
| |
| if (arc.isFinal()) { |
| //if (DEBUG) System.out.println(" arc is final!"); |
| currentFrame = pushFrame(arc, VersionBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput()), targetUpto); |
| //if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + currentFrame.hasTerms); |
| } |
| } |
| } |
| |
| //validIndexPrefix = targetUpto; |
| validIndexPrefix = currentFrame.prefix; |
| |
| currentFrame.scanToFloorFrame(target); |
| |
| currentFrame.loadBlock(); |
| |
| final SeekStatus result = currentFrame.scanToTerm(target, false); |
| |
| if (result == SeekStatus.END) { |
| term.copyBytes(target); |
| termExists = false; |
| if (next() != null) { |
| //if (DEBUG) { |
| //System.out.println(" return NOT_FOUND term=" + term.utf8ToString() + " " + term); |
| //} |
| return SeekStatus.NOT_FOUND; |
| } else { |
| //if (DEBUG) { |
| //System.out.println(" return END"); |
| //} |
| return SeekStatus.END; |
| } |
| } else { |
| return result; |
| } |
| } |
| |
| @SuppressWarnings("unused") |
| private void printSeekState(PrintStream out) throws IOException { |
| if (currentFrame == staticFrame) { |
| out.println(" no prior seek"); |
| } else { |
| out.println(" prior seek state:"); |
| int ord = 0; |
| boolean isSeekFrame = true; |
| while(true) { |
| IDVersionSegmentTermsEnumFrame f = getFrame(ord); |
| assert f != null; |
| final BytesRef prefix = new BytesRef(term.bytes(), 0, f.prefix); |
| if (f.nextEnt == -1) { |
| out.println(" frame " + (isSeekFrame ? "(seek)" : "(next)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + brToString(prefix) + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp<<VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS:0) + (f.isFloor ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR:0)) + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd()); |
| } else { |
| out.println(" frame " + (isSeekFrame ? "(seek, loaded)" : "(next, loaded)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + brToString(prefix) + " nextEnt=" + f.nextEnt + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp<<VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS:0) + (f.isFloor ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR:0)) + " lastSubFP=" + f.lastSubFP + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd()); |
| } |
| if (fr.index != null) { |
| assert !isSeekFrame || f.arc != null: "isSeekFrame=" + isSeekFrame + " f.arc=" + f.arc; |
| if (f.prefix > 0 && isSeekFrame && f.arc.label() != (term.byteAt(f.prefix-1)&0xFF)) { |
| out.println(" broken seek state: arc.label=" + (char) f.arc.label() + " vs term byte=" + (char) (term.byteAt(f.prefix-1)&0xFF)); |
| throw new RuntimeException("seek state is broken"); |
| } |
| Pair<BytesRef,Long> output = Util.get(fr.index, prefix); |
| if (output == null) { |
| out.println(" broken seek state: prefix is not final in index"); |
| throw new RuntimeException("seek state is broken"); |
| } else if (isSeekFrame && !f.isFloor) { |
| final ByteArrayDataInput reader = new ByteArrayDataInput(output.output1.bytes, output.output1.offset, output.output1.length); |
| final long codeOrig = reader.readVLong(); |
| final long code = (f.fp << VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) | (f.hasTerms ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS:0) | (f.isFloor ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR:0); |
| if (codeOrig != code) { |
| out.println(" broken seek state: output code=" + codeOrig + " doesn't match frame code=" + code); |
| throw new RuntimeException("seek state is broken"); |
| } |
| } |
| } |
| if (f == currentFrame) { |
| break; |
| } |
| if (f.prefix == validIndexPrefix) { |
| isSeekFrame = false; |
| } |
| ord++; |
| } |
| } |
| } |
| |
| /* Decodes only the term bytes of the next term. If caller then asks for |
| metadata, ie docFreq, totalTermFreq or pulls a D/&PEnum, we then (lazily) |
| decode all metadata up to the current term. */ |
| @Override |
| public BytesRef next() throws IOException { |
| |
| if (in == null) { |
| // Fresh TermsEnum; seek to first term: |
| final FST.Arc<Pair<BytesRef,Long>> arc; |
| if (fr.index != null) { |
| arc = fr.index.getFirstArc(arcs[0]); |
| // Empty string prefix must have an output in the index! |
| assert arc.isFinal(); |
| } else { |
| arc = null; |
| } |
| currentFrame = pushFrame(arc, fr.rootCode, 0); |
| currentFrame.loadBlock(); |
| } |
| |
| targetBeforeCurrentLength = currentFrame.ord; |
| |
| assert !eof; |
| //if (DEBUG) { |
| //System.out.println("\nBTTR.next seg=" + segment + " term=" + brToString(term) + " termExists?=" + termExists + " field=" + fieldInfo.name + " termBlockOrd=" + currentFrame.state.termBlockOrd + " validIndexPrefix=" + validIndexPrefix); |
| //printSeekState(); |
| //} |
| |
| if (currentFrame == staticFrame) { |
| // If seek was previously called and the term was |
| // cached, or seek(TermState) was called, usually |
| // caller is just going to pull a D/&PEnum or get |
| // docFreq, etc. But, if they then call next(), |
| // this method catches up all internal state so next() |
| // works properly: |
| //if (DEBUG) System.out.println(" re-seek to pending term=" + term.utf8ToString() + " " + term); |
| final boolean result = seekExact(term.get()); |
| assert result; |
| } |
| |
| // Pop finished blocks |
| while (currentFrame.nextEnt == currentFrame.entCount) { |
| if (!currentFrame.isLastInFloor) { |
| currentFrame.loadNextFloorBlock(); |
| } else { |
| //if (DEBUG) System.out.println(" pop frame"); |
| if (currentFrame.ord == 0) { |
| //if (DEBUG) System.out.println(" return null"); |
| assert setEOF(); |
| term.clear(); |
| validIndexPrefix = 0; |
| currentFrame.rewind(); |
| termExists = false; |
| return null; |
| } |
| final long lastFP = currentFrame.fpOrig; |
| currentFrame = stack[currentFrame.ord-1]; |
| |
| if (currentFrame.nextEnt == -1 || currentFrame.lastSubFP != lastFP) { |
| // We popped into a frame that's not loaded |
| // yet or not scan'd to the right entry |
| currentFrame.scanToFloorFrame(term.get()); |
| currentFrame.loadBlock(); |
| currentFrame.scanToSubBlock(lastFP); |
| } |
| |
| // Note that the seek state (last seek) has been |
| // invalidated beyond this depth |
| validIndexPrefix = Math.min(validIndexPrefix, currentFrame.prefix); |
| //if (DEBUG) { |
| //System.out.println(" reset validIndexPrefix=" + validIndexPrefix); |
| //} |
| } |
| } |
| |
| while(true) { |
| if (currentFrame.next()) { |
| // Push to new block: |
| //if (DEBUG) System.out.println(" push frame"); |
| currentFrame = pushFrame(null, currentFrame.lastSubFP, term.length()); |
| // This is a "next" frame -- even if it's |
| // floor'd we must pretend it isn't so we don't |
| // try to scan to the right floor frame: |
| currentFrame.isFloor = false; |
| //currentFrame.hasTerms = true; |
| currentFrame.loadBlock(); |
| } else { |
| //if (DEBUG) System.out.println(" return term=" + term.utf8ToString() + " " + term + " currentFrame.ord=" + currentFrame.ord); |
| return term.get(); |
| } |
| } |
| } |
| |
| @Override |
| public BytesRef term() { |
| assert !eof; |
| return term.get(); |
| } |
| |
| @Override |
| public int docFreq() throws IOException { |
| assert !eof; |
| return 1; |
| } |
| |
| @Override |
| public long totalTermFreq() throws IOException { |
| assert !eof; |
| return 1; |
| } |
| |
| @Override |
| public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { |
| assert !eof; |
| //if (DEBUG) { |
| //System.out.println("BTTR.docs seg=" + segment); |
| //} |
| currentFrame.decodeMetaData(); |
| //if (DEBUG) { |
| //System.out.println(" state=" + currentFrame.state); |
| //} |
| return fr.parent.postingsReader.postings(fr.fieldInfo, currentFrame.state, reuse, flags); |
| } |
| |
| @Override |
| public ImpactsEnum impacts(int flags) throws IOException { |
| // Only one posting, the slow impl is fine |
| // We could make this throw UOE but then CheckIndex is angry |
| return new SlowImpactsEnum(postings(null, flags)); |
| } |
| |
| @Override |
| public void seekExact(BytesRef target, TermState otherState) { |
| // if (DEBUG) { |
| // System.out.println("BTTR.seekExact termState seg=" + segment + " target=" + target.utf8ToString() + " " + target + " state=" + otherState); |
| // } |
| assert clearEOF(); |
| if (target.compareTo(term.get()) != 0 || !termExists) { |
| assert otherState != null && otherState instanceof BlockTermState; |
| currentFrame = staticFrame; |
| currentFrame.state.copyFrom(otherState); |
| term.copyBytes(target); |
| currentFrame.metaDataUpto = currentFrame.getTermBlockOrd(); |
| assert currentFrame.metaDataUpto > 0; |
| validIndexPrefix = 0; |
| } else { |
| // if (DEBUG) { |
| // System.out.println(" skip seek: already on target state=" + currentFrame.state); |
| // } |
| } |
| } |
| |
| @Override |
| public TermState termState() throws IOException { |
| assert !eof; |
| currentFrame.decodeMetaData(); |
| TermState ts = currentFrame.state.clone(); |
| //if (DEBUG) System.out.println("BTTR.termState seg=" + segment + " state=" + ts); |
| return ts; |
| } |
| |
| @Override |
| public void seekExact(long ord) { |
| throw new UnsupportedOperationException(); |
| } |
| |
| @Override |
| public long ord() { |
| throw new UnsupportedOperationException(); |
| } |
| |
| @Override |
| public String toString() { |
| return "IDVersionSegmentTermsEnum(seg=" + fr.parent + ")"; |
| } |
| } |