| package org.apache.lucene.codecs; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import java.io.ByteArrayOutputStream; |
| import java.io.IOException; |
| import java.io.PrintStream; |
| import java.io.UnsupportedEncodingException; |
| import java.util.Collections; |
| import java.util.Comparator; |
| import java.util.Iterator; |
| import java.util.Locale; |
| import java.util.TreeMap; |
| |
| import org.apache.lucene.index.CorruptIndexException; |
| import org.apache.lucene.index.DocsAndPositionsEnum; |
| import org.apache.lucene.index.DocsEnum; |
| import org.apache.lucene.index.FieldInfo.IndexOptions; |
| import org.apache.lucene.index.FieldInfo; |
| import org.apache.lucene.index.FieldInfos; |
| import org.apache.lucene.index.IndexFileNames; |
| import org.apache.lucene.index.SegmentInfo; |
| import org.apache.lucene.index.TermState; |
| import org.apache.lucene.index.Terms; |
| import org.apache.lucene.index.TermsEnum; |
| import org.apache.lucene.store.ByteArrayDataInput; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.store.IOContext; |
| import org.apache.lucene.store.IndexInput; |
| import org.apache.lucene.util.ArrayUtil; |
| import org.apache.lucene.util.Bits; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.IOUtils; |
| import org.apache.lucene.util.RamUsageEstimator; |
| import org.apache.lucene.util.StringHelper; |
| import org.apache.lucene.util.automaton.CompiledAutomaton; |
| import org.apache.lucene.util.automaton.RunAutomaton; |
| import org.apache.lucene.util.automaton.Transition; |
| import org.apache.lucene.util.fst.ByteSequenceOutputs; |
| import org.apache.lucene.util.fst.FST; |
| import org.apache.lucene.util.fst.Outputs; |
| import org.apache.lucene.util.fst.Util; |
| |
| /** A block-based terms index and dictionary that assigns |
| * terms to variable length blocks according to how they |
| * share prefixes. The terms index is a prefix trie |
| * whose leaves are term blocks. The advantage of this |
| * approach is that seekExact is often able to |
| * determine a term cannot exist without doing any IO, and |
| * intersection with Automata is very fast. Note that this |
* terms dictionary has its own fixed terms index (i.e., it
| * does not support a pluggable terms index |
| * implementation). |
| * |
| * <p><b>NOTE</b>: this terms dictionary does not support |
| * index divisor when opening an IndexReader. Instead, you |
| * can change the min/maxItemsPerBlock during indexing.</p> |
| * |
| * <p>The data structure used by this implementation is very |
| * similar to a burst trie |
| * (http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.18.3499), |
| * but with added logic to break up too-large blocks of all |
| * terms sharing a given prefix into smaller ones.</p> |
| * |
| * <p>Use {@link org.apache.lucene.index.CheckIndex} with the <code>-verbose</code> |
| * option to see summary statistics on the blocks in the |
| * dictionary. |
| * |
| * See {@link BlockTreeTermsWriter}. |
| * |
| * @lucene.experimental |
| */ |
| |
| public class BlockTreeTermsReader extends FieldsProducer { |
| |
// Open input to the main terms dict file (_X.tib)
private final IndexInput in;

//private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG;

// Reads the terms dict entries, to gather state to
// produce DocsEnum on demand
private final PostingsReaderBase postingsReader;

// Per-field readers keyed by field name; TreeMap keeps iteration in sorted field-name order
private final TreeMap<String,FieldReader> fields = new TreeMap<String,FieldReader>();

/** File offset where the directory starts in the terms file. */
private long dirOffset;

/** File offset where the directory starts in the index file. */
private long indexDirOffset;

// Segment name (from SegmentInfo); used to build the per-segment file names
private String segment;

// Format version read from the terms file header
private final int version;

// Open input to the terms index file; null when the index was not loaded (indexDivisor == -1)
private final IndexInput indexIn;
| |
| /** Sole constructor. */ |
| public BlockTreeTermsReader(Directory dir, FieldInfos fieldInfos, SegmentInfo info, |
| PostingsReaderBase postingsReader, IOContext ioContext, |
| String segmentSuffix, int indexDivisor) |
| throws IOException { |
| |
| this.postingsReader = postingsReader; |
| |
| this.segment = info.name; |
| in = dir.openInput(IndexFileNames.segmentFileName(segment, segmentSuffix, BlockTreeTermsWriter.TERMS_EXTENSION), |
| ioContext); |
| |
| boolean success = false; |
| |
| try { |
| version = readHeader(in); |
| if (indexDivisor != -1) { |
| indexIn = dir.openInput(IndexFileNames.segmentFileName(segment, segmentSuffix, BlockTreeTermsWriter.TERMS_INDEX_EXTENSION), |
| ioContext); |
| int indexVersion = readIndexHeader(indexIn); |
| if (indexVersion != version) { |
| throw new CorruptIndexException("mixmatched version files: " + in + "=" + version + "," + indexIn + "=" + indexVersion); |
| } |
| } else { |
| indexIn = null; |
| } |
| |
| // Have PostingsReader init itself |
| postingsReader.init(in); |
| |
| // Read per-field details |
| seekDir(in, dirOffset); |
| if (indexDivisor != -1) { |
| seekDir(indexIn, indexDirOffset); |
| } |
| |
| final int numFields = in.readVInt(); |
| if (numFields < 0) { |
| throw new CorruptIndexException("invalid numFields: " + numFields + " (resource=" + in + ")"); |
| } |
| |
| for(int i=0;i<numFields;i++) { |
| final int field = in.readVInt(); |
| final long numTerms = in.readVLong(); |
| assert numTerms >= 0; |
| final int numBytes = in.readVInt(); |
| final BytesRef rootCode = new BytesRef(new byte[numBytes]); |
| in.readBytes(rootCode.bytes, 0, numBytes); |
| rootCode.length = numBytes; |
| final FieldInfo fieldInfo = fieldInfos.fieldInfo(field); |
| assert fieldInfo != null: "field=" + field; |
| final long sumTotalTermFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY ? -1 : in.readVLong(); |
| final long sumDocFreq = in.readVLong(); |
| final int docCount = in.readVInt(); |
| if (docCount < 0 || docCount > info.getDocCount()) { // #docs with field must be <= #docs |
| throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + info.getDocCount() + " (resource=" + in + ")"); |
| } |
| if (sumDocFreq < docCount) { // #postings must be >= #docs with field |
| throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount + " (resource=" + in + ")"); |
| } |
| if (sumTotalTermFreq != -1 && sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings |
| throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq + " (resource=" + in + ")"); |
| } |
| final long indexStartFP = indexDivisor != -1 ? indexIn.readVLong() : 0; |
| FieldReader previous = fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount, indexStartFP, indexIn)); |
| if (previous != null) { |
| throw new CorruptIndexException("duplicate field: " + fieldInfo.name + " (resource=" + in + ")"); |
| } |
| } |
| |
| success = true; |
| } finally { |
| if (!success) { |
| // this.close() will close in: |
| IOUtils.closeWhileHandlingException(this); |
| } |
| } |
| } |
| |
| /** Reads terms file header. */ |
| protected int readHeader(IndexInput input) throws IOException { |
| int version = CodecUtil.checkHeader(input, BlockTreeTermsWriter.TERMS_CODEC_NAME, |
| BlockTreeTermsWriter.TERMS_VERSION_START, |
| BlockTreeTermsWriter.TERMS_VERSION_CURRENT); |
| if (version < BlockTreeTermsWriter.TERMS_VERSION_APPEND_ONLY) { |
| dirOffset = input.readLong(); |
| } |
| return version; |
| } |
| |
| /** Reads index file header. */ |
| protected int readIndexHeader(IndexInput input) throws IOException { |
| int version = CodecUtil.checkHeader(input, BlockTreeTermsWriter.TERMS_INDEX_CODEC_NAME, |
| BlockTreeTermsWriter.TERMS_INDEX_VERSION_START, |
| BlockTreeTermsWriter.TERMS_INDEX_VERSION_CURRENT); |
| if (version < BlockTreeTermsWriter.TERMS_INDEX_VERSION_APPEND_ONLY) { |
| indexDirOffset = input.readLong(); |
| } |
| return version; |
| } |
| |
| /** Seek {@code input} to the directory offset. */ |
| protected void seekDir(IndexInput input, long dirOffset) |
| throws IOException { |
| if (version >= BlockTreeTermsWriter.TERMS_INDEX_VERSION_APPEND_ONLY) { |
| input.seek(input.length() - 8); |
| dirOffset = input.readLong(); |
| } |
| input.seek(dirOffset); |
| } |
| |
| // for debugging |
| // private static String toHex(int v) { |
| // return "0x" + Integer.toHexString(v); |
| // } |
| |
@Override
public void close() throws IOException {
  try {
    // Release the index input, terms input and postings reader
    IOUtils.close(indexIn, in, postingsReader);
  } finally {
    // Clear so refs to terms index is GCable even if
    // app hangs onto us:
    fields.clear();
  }
}
| |
| @Override |
| public Iterator<String> iterator() { |
| return Collections.unmodifiableSet(fields.keySet()).iterator(); |
| } |
| |
| @Override |
| public Terms terms(String field) throws IOException { |
| assert field != null; |
| return fields.get(field); |
| } |
| |
| @Override |
| public int size() { |
| return fields.size(); |
| } |
| |
| // for debugging |
| String brToString(BytesRef b) { |
| if (b == null) { |
| return "null"; |
| } else { |
| try { |
| return b.utf8ToString() + " " + b; |
| } catch (Throwable t) { |
| // If BytesRef isn't actually UTF8, or it's eg a |
| // prefix of UTF8 that ends mid-unicode-char, we |
| // fallback to hex: |
| return b.toString(); |
| } |
| } |
| } |
| |
| /** |
| * BlockTree statistics for a single field |
| * returned by {@link FieldReader#computeStats()}. |
| */ |
| public static class Stats { |
| /** How many nodes in the index FST. */ |
| public long indexNodeCount; |
| |
| /** How many arcs in the index FST. */ |
| public long indexArcCount; |
| |
| /** Byte size of the index. */ |
| public long indexNumBytes; |
| |
| /** Total number of terms in the field. */ |
| public long totalTermCount; |
| |
| /** Total number of bytes (sum of term lengths) across all terms in the field. */ |
| public long totalTermBytes; |
| |
| /** The number of normal (non-floor) blocks in the terms file. */ |
| public int nonFloorBlockCount; |
| |
| /** The number of floor blocks (meta-blocks larger than the |
| * allowed {@code maxItemsPerBlock}) in the terms file. */ |
| public int floorBlockCount; |
| |
| /** The number of sub-blocks within the floor blocks. */ |
| public int floorSubBlockCount; |
| |
| /** The number of "internal" blocks (that have both |
| * terms and sub-blocks). */ |
| public int mixedBlockCount; |
| |
| /** The number of "leaf" blocks (blocks that have only |
| * terms). */ |
| public int termsOnlyBlockCount; |
| |
| /** The number of "internal" blocks that do not contain |
| * terms (have only sub-blocks). */ |
| public int subBlocksOnlyBlockCount; |
| |
| /** Total number of blocks. */ |
| public int totalBlockCount; |
| |
| /** Number of blocks at each prefix depth. */ |
| public int[] blockCountByPrefixLen = new int[10]; |
| private int startBlockCount; |
| private int endBlockCount; |
| |
| /** Total number of bytes used to store term suffixes. */ |
| public long totalBlockSuffixBytes; |
| |
| /** Total number of bytes used to store term stats (not |
| * including what the {@link PostingsBaseFormat} |
| * stores. */ |
| public long totalBlockStatsBytes; |
| |
| /** Total bytes stored by the {@link PostingsBaseFormat}, |
| * plus the other few vInts stored in the frame. */ |
| public long totalBlockOtherBytes; |
| |
| /** Segment name. */ |
| public final String segment; |
| |
| /** Field name. */ |
| public final String field; |
| |
| Stats(String segment, String field) { |
| this.segment = segment; |
| this.field = field; |
| } |
| |
| void startBlock(FieldReader.SegmentTermsEnum.Frame frame, boolean isFloor) { |
| totalBlockCount++; |
| if (isFloor) { |
| if (frame.fp == frame.fpOrig) { |
| floorBlockCount++; |
| } |
| floorSubBlockCount++; |
| } else { |
| nonFloorBlockCount++; |
| } |
| |
| if (blockCountByPrefixLen.length <= frame.prefix) { |
| blockCountByPrefixLen = ArrayUtil.grow(blockCountByPrefixLen, 1+frame.prefix); |
| } |
| blockCountByPrefixLen[frame.prefix]++; |
| startBlockCount++; |
| totalBlockSuffixBytes += frame.suffixesReader.length(); |
| totalBlockStatsBytes += frame.statsReader.length(); |
| } |
| |
| void endBlock(FieldReader.SegmentTermsEnum.Frame frame) { |
| final int termCount = frame.isLeafBlock ? frame.entCount : frame.state.termBlockOrd; |
| final int subBlockCount = frame.entCount - termCount; |
| totalTermCount += termCount; |
| if (termCount != 0 && subBlockCount != 0) { |
| mixedBlockCount++; |
| } else if (termCount != 0) { |
| termsOnlyBlockCount++; |
| } else if (subBlockCount != 0) { |
| subBlocksOnlyBlockCount++; |
| } else { |
| throw new IllegalStateException(); |
| } |
| endBlockCount++; |
| final long otherBytes = frame.fpEnd - frame.fp - frame.suffixesReader.length() - frame.statsReader.length(); |
| assert otherBytes > 0 : "otherBytes=" + otherBytes + " frame.fp=" + frame.fp + " frame.fpEnd=" + frame.fpEnd; |
| totalBlockOtherBytes += otherBytes; |
| } |
| |
| void term(BytesRef term) { |
| totalTermBytes += term.length; |
| } |
| |
| void finish() { |
| assert startBlockCount == endBlockCount: "startBlockCount=" + startBlockCount + " endBlockCount=" + endBlockCount; |
| assert totalBlockCount == floorSubBlockCount + nonFloorBlockCount: "floorSubBlockCount=" + floorSubBlockCount + " nonFloorBlockCount=" + nonFloorBlockCount + " totalBlockCount=" + totalBlockCount; |
| assert totalBlockCount == mixedBlockCount + termsOnlyBlockCount + subBlocksOnlyBlockCount: "totalBlockCount=" + totalBlockCount + " mixedBlockCount=" + mixedBlockCount + " subBlocksOnlyBlockCount=" + subBlocksOnlyBlockCount + " termsOnlyBlockCount=" + termsOnlyBlockCount; |
| } |
| |
| @Override |
| public String toString() { |
| final ByteArrayOutputStream bos = new ByteArrayOutputStream(1024); |
| PrintStream out; |
| try { |
| out = new PrintStream(bos, false, "UTF-8"); |
| } catch (UnsupportedEncodingException bogus) { |
| throw new RuntimeException(bogus); |
| } |
| |
| out.println(" index FST:"); |
| out.println(" " + indexNodeCount + " nodes"); |
| out.println(" " + indexArcCount + " arcs"); |
| out.println(" " + indexNumBytes + " bytes"); |
| out.println(" terms:"); |
| out.println(" " + totalTermCount + " terms"); |
| out.println(" " + totalTermBytes + " bytes" + (totalTermCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalTermBytes)/totalTermCount) + " bytes/term)" : "")); |
| out.println(" blocks:"); |
| out.println(" " + totalBlockCount + " blocks"); |
| out.println(" " + termsOnlyBlockCount + " terms-only blocks"); |
| out.println(" " + subBlocksOnlyBlockCount + " sub-block-only blocks"); |
| out.println(" " + mixedBlockCount + " mixed blocks"); |
| out.println(" " + floorBlockCount + " floor blocks"); |
| out.println(" " + (totalBlockCount-floorSubBlockCount) + " non-floor blocks"); |
| out.println(" " + floorSubBlockCount + " floor sub-blocks"); |
| out.println(" " + totalBlockSuffixBytes + " term suffix bytes" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockSuffixBytes)/totalBlockCount) + " suffix-bytes/block)" : "")); |
| out.println(" " + totalBlockStatsBytes + " term stats bytes" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockStatsBytes)/totalBlockCount) + " stats-bytes/block)" : "")); |
| out.println(" " + totalBlockOtherBytes + " other bytes" + (totalBlockCount != 0 ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockOtherBytes)/totalBlockCount) + " other-bytes/block)" : "")); |
| if (totalBlockCount != 0) { |
| out.println(" by prefix length:"); |
| int total = 0; |
| for(int prefix=0;prefix<blockCountByPrefixLen.length;prefix++) { |
| final int blockCount = blockCountByPrefixLen[prefix]; |
| total += blockCount; |
| if (blockCount != 0) { |
| out.println(" " + String.format(Locale.ROOT, "%2d", prefix) + ": " + blockCount); |
| } |
| } |
| assert totalBlockCount == total; |
| } |
| |
| try { |
| return bos.toString("UTF-8"); |
| } catch (UnsupportedEncodingException bogus) { |
| throw new RuntimeException(bogus); |
| } |
| } |
| } |
| |
// Shared FST outputs implementation (byte-sequence outputs) for the terms index
final Outputs<BytesRef> fstOutputs = ByteSequenceOutputs.getSingleton();
// The FST's "no output" singleton
final BytesRef NO_OUTPUT = fstOutputs.getNoOutput();
| |
| /** BlockTree's implementation of {@link Terms}. */ |
| public final class FieldReader extends Terms { |
final long numTerms;          // total number of terms in this field
final FieldInfo fieldInfo;
final long sumTotalTermFreq;  // -1 when the field is DOCS_ONLY (no freqs written)
final long sumDocFreq;
final int docCount;           // number of docs that have this field
final long indexStartFP;      // file pointer of this field's FST in the index file
final long rootBlockFP;       // file pointer of the root block in the terms file
final BytesRef rootCode;      // encoded fp + flags of the root block
private final FST<BytesRef> index;  // terms index FST; null when the index was not loaded

//private boolean DEBUG;
| |
// Sole constructor; loads this field's terms-index FST when indexIn is non-null.
FieldReader(FieldInfo fieldInfo, long numTerms, BytesRef rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount, long indexStartFP, IndexInput indexIn) throws IOException {
  assert numTerms > 0;
  this.fieldInfo = fieldInfo;
  //DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.equals("id");
  this.numTerms = numTerms;
  this.sumTotalTermFreq = sumTotalTermFreq;
  this.sumDocFreq = sumDocFreq;
  this.docCount = docCount;
  this.indexStartFP = indexStartFP;
  this.rootCode = rootCode;
  // if (DEBUG) {
  //   System.out.println("BTTR: seg=" + segment + " field=" + fieldInfo.name + " rootBlockCode=" + rootCode + " divisor=" + indexDivisor);
  // }

  // rootCode's leading vLong packs the root block's file pointer plus
  // flag bits; shift the flags off to recover the file pointer:
  rootBlockFP = (new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong() >>> BlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS;

  if (indexIn != null) {
    // Read this field's FST from its start file pointer, on a private clone:
    final IndexInput clone = indexIn.clone();
    //System.out.println("start=" + indexStartFP + " field=" + fieldInfo.name);
    clone.seek(indexStartFP);
    index = new FST<BytesRef>(clone, ByteSequenceOutputs.getSingleton());

    /*
    if (false) {
      final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
      Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName));
      Util.toDot(index, w, false, false);
      System.out.println("FST INDEX: SAVED to " + dotFileName);
      w.close();
    }
    */
  } else {
    index = null;
  }
}
| |
| /** For debugging -- used by CheckIndex too*/ |
| // TODO: maybe push this into Terms? |
| public Stats computeStats() throws IOException { |
| return new SegmentTermsEnum().computeBlockStats(); |
| } |
| |
| @Override |
| public Comparator<BytesRef> getComparator() { |
| return BytesRef.getUTF8SortedAsUnicodeComparator(); |
| } |
| |
| @Override |
| public boolean hasOffsets() { |
| return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; |
| } |
| |
| @Override |
| public boolean hasPositions() { |
| return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; |
| } |
| |
| @Override |
| public boolean hasPayloads() { |
| return fieldInfo.hasPayloads(); |
| } |
| |
| @Override |
| public TermsEnum iterator(TermsEnum reuse) throws IOException { |
| return new SegmentTermsEnum(); |
| } |
| |
| @Override |
| public long size() { |
| return numTerms; |
| } |
| |
| @Override |
| public long getSumTotalTermFreq() { |
| return sumTotalTermFreq; |
| } |
| |
| @Override |
| public long getSumDocFreq() { |
| return sumDocFreq; |
| } |
| |
| @Override |
| public int getDocCount() { |
| return docCount; |
| } |
| |
| @Override |
| public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { |
| if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { |
| throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead"); |
| } |
| return new IntersectEnum(compiled, startTerm); |
| } |
| |
| // NOTE: cannot seek! |
| private final class IntersectEnum extends TermsEnum { |
// Private clone of the terms-dict input, so this enum can seek independently
private final IndexInput in;

// One frame per prefix depth of the current traversal
private Frame[] stack;

@SuppressWarnings({"rawtypes","unchecked"}) private FST.Arc<BytesRef>[] arcs = new FST.Arc[5];

private final RunAutomaton runAutomaton;
private final CompiledAutomaton compiledAutomaton;

private Frame currentFrame;

// Current term; mutated in place as the enum advances
private final BytesRef term = new BytesRef();

private final FST.BytesReader fstReader;
| |
// TODO: can we share this with the frame in STE?
// Per-block decode state for one level of the block-tree traversal:
// file pointers, suffix/stats byte readers, floor-block data, and the
// automaton state associated with this prefix depth.
private final class Frame {
  final int ord;
  long fp;
  long fpOrig;
  long fpEnd;
  long lastSubFP;

  // State in automaton
  int state;

  // How many terms' metadata have been decoded so far (lazy decode)
  int metaDataUpto;

  byte[] suffixBytes = new byte[128];
  final ByteArrayDataInput suffixesReader = new ByteArrayDataInput();

  byte[] statBytes = new byte[64];
  final ByteArrayDataInput statsReader = new ByteArrayDataInput();

  byte[] floorData = new byte[32];
  final ByteArrayDataInput floorDataReader = new ByteArrayDataInput();

  // Length of prefix shared by all terms in this block
  int prefix;

  // Number of entries (term or sub-block) in this block
  int entCount;

  // Which term we will next read
  int nextEnt;

  // True if this block is either not a floor block,
  // or, it's the last sub-block of a floor block
  boolean isLastInFloor;

  // True if all entries are terms
  boolean isLeafBlock;

  int numFollowFloorBlocks;
  int nextFloorLabel;

  Transition[] transitions;
  int curTransitionMax;
  int transitionIndex;

  FST.Arc<BytesRef> arc;

  final BlockTermState termState;

  // Cumulative output so far
  BytesRef outputPrefix;

  // Byte position/length of the current entry's suffix within suffixBytes
  private int startBytePos;
  private int suffix;

  public Frame(int ord) throws IOException {
    this.ord = ord;
    termState = postingsReader.newTermState();
    termState.totalTermFreq = -1;
  }

  // Advances to the next floor sub-block whose label range can still
  // match the current transition, then loads it.
  void loadNextFloorBlock() throws IOException {
    assert numFollowFloorBlocks > 0;
    //if (DEBUG) System.out.println(" loadNextFoorBlock trans=" + transitions[transitionIndex]);

    do {
      fp = fpOrig + (floorDataReader.readVLong() >>> 1);
      numFollowFloorBlocks--;
      // if (DEBUG) System.out.println(" skip floor block2! nextFloorLabel=" + (char) nextFloorLabel + " vs target=" + (char) transitions[transitionIndex].getMin() + " newFP=" + fp + " numFollowFloorBlocks=" + numFollowFloorBlocks);
      if (numFollowFloorBlocks != 0) {
        nextFloorLabel = floorDataReader.readByte() & 0xff;
      } else {
        // 256 is past any byte label, so no further floor block applies
        nextFloorLabel = 256;
      }
      // if (DEBUG) System.out.println(" nextFloorLabel=" + (char) nextFloorLabel);
    } while (numFollowFloorBlocks != 0 && nextFloorLabel <= transitions[transitionIndex].getMin());

    load(null);
  }

  // Binds this frame to an automaton state and caches its sorted transitions.
  public void setState(int state) {
    this.state = state;
    transitionIndex = 0;
    transitions = compiledAutomaton.sortedTransitions[state];
    if (transitions.length != 0) {
      curTransitionMax = transitions[0].getMax();
    } else {
      curTransitionMax = -1;
    }
  }

  // Reads the block at fp into this frame: entry count, suffix bytes,
  // stats bytes, and (for floor frames) the floor metadata from
  // frameIndexData, possibly skipping floor sub-blocks the automaton
  // cannot match.
  void load(BytesRef frameIndexData) throws IOException {

    // if (DEBUG) System.out.println(" load fp=" + fp + " fpOrig=" + fpOrig + " frameIndexData=" + frameIndexData + " trans=" + (transitions.length != 0 ? transitions[0] : "n/a" + " state=" + state));

    if (frameIndexData != null && transitions.length != 0) {
      // Floor frame
      if (floorData.length < frameIndexData.length) {
        this.floorData = new byte[ArrayUtil.oversize(frameIndexData.length, 1)];
      }
      System.arraycopy(frameIndexData.bytes, frameIndexData.offset, floorData, 0, frameIndexData.length);
      floorDataReader.reset(floorData, 0, frameIndexData.length);
      // Skip first long -- has redundant fp, hasTerms
      // flag, isFloor flag
      final long code = floorDataReader.readVLong();
      if ((code & BlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0) {
        numFollowFloorBlocks = floorDataReader.readVInt();
        nextFloorLabel = floorDataReader.readByte() & 0xff;
        // if (DEBUG) System.out.println(" numFollowFloorBlocks=" + numFollowFloorBlocks + " nextFloorLabel=" + nextFloorLabel);

        // If current state is accept, we must process
        // first block in case it has empty suffix:
        if (!runAutomaton.isAccept(state)) {
          // Maybe skip floor blocks:
          while (numFollowFloorBlocks != 0 && nextFloorLabel <= transitions[0].getMin()) {
            fp = fpOrig + (floorDataReader.readVLong() >>> 1);
            numFollowFloorBlocks--;
            // if (DEBUG) System.out.println(" skip floor block! nextFloorLabel=" + (char) nextFloorLabel + " vs target=" + (char) transitions[0].getMin() + " newFP=" + fp + " numFollowFloorBlocks=" + numFollowFloorBlocks);
            if (numFollowFloorBlocks != 0) {
              nextFloorLabel = floorDataReader.readByte() & 0xff;
            } else {
              nextFloorLabel = 256;
            }
          }
        }
      }
    }

    in.seek(fp);
    // Low bit of the header vInt flags last-in-floor; remaining bits are entry count
    int code = in.readVInt();
    entCount = code >>> 1;
    assert entCount > 0;
    isLastInFloor = (code & 1) != 0;

    // term suffixes:
    code = in.readVInt();
    isLeafBlock = (code & 1) != 0;
    int numBytes = code >>> 1;
    // if (DEBUG) System.out.println(" entCount=" + entCount + " lastInFloor?=" + isLastInFloor + " leafBlock?=" + isLeafBlock + " numSuffixBytes=" + numBytes);
    if (suffixBytes.length < numBytes) {
      suffixBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
    }
    in.readBytes(suffixBytes, 0, numBytes);
    suffixesReader.reset(suffixBytes, 0, numBytes);

    // stats
    numBytes = in.readVInt();
    if (statBytes.length < numBytes) {
      statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
    }
    in.readBytes(statBytes, 0, numBytes);
    statsReader.reset(statBytes, 0, numBytes);
    metaDataUpto = 0;

    termState.termBlockOrd = 0;
    nextEnt = 0;

    postingsReader.readTermsBlock(in, fieldInfo, termState);

    if (!isLastInFloor) {
      // Sub-blocks of a single floor block are always
      // written one after another -- tail recurse:
      fpEnd = in.getFilePointer();
    }
  }

  // TODO: maybe add scanToLabel; should give perf boost

  // Decodes next entry; returns true if it's a sub-block
  public boolean next() {
    return isLeafBlock ? nextLeaf() : nextNonLeaf();
  }

  // Leaf blocks contain only terms, so the result is always false.
  public boolean nextLeaf() {
    //if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount);
    assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
    nextEnt++;
    suffix = suffixesReader.readVInt();
    startBytePos = suffixesReader.getPosition();
    suffixesReader.skipBytes(suffix);
    return false;
  }

  // Mixed blocks: low bit of the entry code distinguishes term vs sub-block.
  public boolean nextNonLeaf() {
    //if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount);
    assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
    nextEnt++;
    final int code = suffixesReader.readVInt();
    suffix = code >>> 1;
    startBytePos = suffixesReader.getPosition();
    suffixesReader.skipBytes(suffix);
    if ((code & 1) == 0) {
      // A normal term
      termState.termBlockOrd++;
      return false;
    } else {
      // A sub-block; make sub-FP absolute:
      lastSubFP = fp - suffixesReader.readVLong();
      return true;
    }
  }

  public int getTermBlockOrd() {
    return isLeafBlock ? nextEnt : termState.termBlockOrd;
  }

  // Decodes term stats/postings metadata up to (and including) the
  // current term; decode is lazy so only the terms actually visited pay.
  public void decodeMetaData() throws IOException {

    // lazily catch up on metadata decode:
    final int limit = getTermBlockOrd();
    assert limit > 0;

    // We must set/incr state.termCount because
    // postings impl can look at this
    termState.termBlockOrd = metaDataUpto;

    // TODO: better API would be "jump straight to term=N"???
    while (metaDataUpto < limit) {

      // TODO: we could make "tiers" of metadata, ie,
      // decode docFreq/totalTF but don't decode postings
      // metadata; this way caller could get
      // docFreq/totalTF w/o paying decode cost for
      // postings

      // TODO: if docFreq were bulk decoded we could
      // just skipN here:
      termState.docFreq = statsReader.readVInt();
      //if (DEBUG) System.out.println(" dF=" + state.docFreq);
      if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
        // totalTermFreq is delta-coded against docFreq
        termState.totalTermFreq = termState.docFreq + statsReader.readVLong();
        //if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq);
      }

      postingsReader.nextTerm(fieldInfo, termState);
      metaDataUpto++;
      termState.termBlockOrd++;
    }
  }
}
| |
// Deep copy of the start term; used only by asserts to validate enum order
private BytesRef savedStartTerm;
| |
// TODO: in some cases we can filter by length? eg
// regexp foo*bar must be at least length 6 bytes
public IntersectEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  // if (DEBUG) {
  // System.out.println("\nintEnum.init seg=" + segment + " commonSuffix=" + brToString(compiled.commonSuffixRef));
  // }
  runAutomaton = compiled.runAutomaton;
  compiledAutomaton = compiled;
  // Private clone so this enum can seek without disturbing other readers:
  in = BlockTreeTermsReader.this.in.clone();
  stack = new Frame[5];
  for(int idx=0;idx<stack.length;idx++) {
    stack[idx] = new Frame(idx);
  }
  for(int arcIdx=0;arcIdx<arcs.length;arcIdx++) {
    arcs[arcIdx] = new FST.Arc<BytesRef>();
  }

  if (index == null) {
    fstReader = null;
  } else {
    fstReader = index.getBytesReader();
  }

  // TODO: if the automaton is "smallish" we really
  // should use the terms index to seek at least to
  // the initial term and likely to subsequent terms
  // (or, maybe just fallback to ATE for such cases).
  // Else the seek cost of loading the frames will be
  // too costly.

  // NOTE(review): index == null is tolerated just above (fstReader = null)
  // but the call below dereferences index unconditionally -- presumably
  // intersect() is only used when the terms index is loaded; confirm.
  final FST.Arc<BytesRef> arc = index.getFirstArc(arcs[0]);
  // Empty string prefix must have an output in the index!
  assert arc.isFinal();

  // Special pushFrame since it's the first one:
  final Frame f = stack[0];
  f.fp = f.fpOrig = rootBlockFP;
  f.prefix = 0;
  f.setState(runAutomaton.getInitialState());
  f.arc = arc;
  f.outputPrefix = arc.output;
  f.load(rootCode);

  // for assert:
  assert setSavedStartTerm(startTerm);

  currentFrame = f;
  if (startTerm != null) {
    seekToStartTerm(startTerm);
  }
}
| |
| // only for assert: |
| private boolean setSavedStartTerm(BytesRef startTerm) { |
| savedStartTerm = startTerm == null ? null : BytesRef.deepCopyOf(startTerm); |
| return true; |
| } |
| |
| @Override |
| public TermState termState() throws IOException { |
| currentFrame.decodeMetaData(); |
| return currentFrame.termState.clone(); |
| } |
| |
| private Frame getFrame(int ord) throws IOException { |
| if (ord >= stack.length) { |
| final Frame[] next = new Frame[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; |
| System.arraycopy(stack, 0, next, 0, stack.length); |
| for(int stackOrd=stack.length;stackOrd<next.length;stackOrd++) { |
| next[stackOrd] = new Frame(stackOrd); |
| } |
| stack = next; |
| } |
| assert stack[ord].ord == ord; |
| return stack[ord]; |
| } |
| |
| private FST.Arc<BytesRef> getArc(int ord) { |
| if (ord >= arcs.length) { |
| @SuppressWarnings({"rawtypes","unchecked"}) final FST.Arc<BytesRef>[] next = |
| new FST.Arc[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; |
| System.arraycopy(arcs, 0, next, 0, arcs.length); |
| for(int arcOrd=arcs.length;arcOrd<next.length;arcOrd++) { |
| next[arcOrd] = new FST.Arc<BytesRef>(); |
| } |
| arcs = next; |
| } |
| return arcs[ord]; |
| } |
| |
      /** Pushes and loads a frame for the sub-block the current frame
       *  just read (its fp is {@code currentFrame.lastSubFP}), walking
       *  the index FST over the new prefix bytes so the frame has the
       *  arc/output needed for floor-block skipping. */
      private Frame pushFrame(int state) throws IOException {
        final Frame f = getFrame(currentFrame == null ? 0 : 1+currentFrame.ord);
        
        f.fp = f.fpOrig = currentFrame.lastSubFP;
        f.prefix = currentFrame.prefix + currentFrame.suffix;
        // if (DEBUG) System.out.println("    pushFrame state=" + state + " prefix=" + f.prefix);
        f.setState(state);

        // Walk the arc through the index -- we only
        // "bother" with this so we can get the floor data
        // from the index and skip floor blocks when
        // possible:
        FST.Arc<BytesRef> arc = currentFrame.arc;
        int idx = currentFrame.prefix;
        assert currentFrame.suffix > 0;
        BytesRef output = currentFrame.outputPrefix;
        while (idx < f.prefix) {
          final int target = term.bytes[idx] & 0xff;
          // TODO: we could be more efficient for the next()
          // case by using current arc as starting point,
          // passed to findTargetArc
          arc = index.findTargetArc(target, arc, getArc(1+idx), fstReader);
          assert arc != null;
          output = fstOutputs.add(output, arc.output);
          idx++;
        }

        f.arc = arc;
        f.outputPrefix = output;
        assert arc.isFinal();
        // Combine accumulated output with the final output to obtain
        // the block's encoded fp + flags, then load the block:
        f.load(fstOutputs.add(output, arc.nextFinalOutput));
        return f;
      }
| |
      /** Returns the shared current term; valid after a successful
       *  {@code next()}.  Callers must not modify the returned bytes. */
      @Override
      public BytesRef term() {
        return term;
      }
| |
      /** Returns the document frequency of the current term, decoding
       *  the term's metadata lazily. */
      @Override
      public int docFreq() throws IOException {
        //if (DEBUG) System.out.println("BTIR.docFreq");
        currentFrame.decodeMetaData();
        //if (DEBUG) System.out.println("  return " + currentFrame.termState.docFreq);
        return currentFrame.termState.docFreq;
      }
| |
      /** Returns the total term frequency of the current term, decoding
       *  the term's metadata lazily. */
      @Override
      public long totalTermFreq() throws IOException {
        currentFrame.decodeMetaData();
        return currentFrame.termState.totalTermFreq;
      }
| |
      /** Returns a {@link DocsEnum} for the current term, delegating to
       *  the postings reader after lazily decoding metadata. */
      @Override
      public DocsEnum docs(Bits skipDocs, DocsEnum reuse, int flags) throws IOException {
        currentFrame.decodeMetaData();
        return postingsReader.docs(fieldInfo, currentFrame.termState, skipDocs, reuse, flags);
      }
| |
      /** Returns a {@link DocsAndPositionsEnum} for the current term,
       *  or null if positions were not indexed for this field. */
      @Override
      public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse, int flags) throws IOException {
        if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
          // Positions were not indexed:
          return null;
        }

        currentFrame.decodeMetaData();
        return postingsReader.docsAndPositions(fieldInfo, currentFrame.termState, skipDocs, reuse, flags);
      }
| |
| private int getState() { |
| int state = currentFrame.state; |
| for(int idx=0;idx<currentFrame.suffix;idx++) { |
| state = runAutomaton.step(state, currentFrame.suffixBytes[currentFrame.startBytePos+idx] & 0xff); |
| assert state != -1; |
| } |
| return state; |
| } |
| |
      // NOTE: specialized to only doing the first-time
      // seek, but we could generalize it to allow
      // arbitrary seekExact/Ceil.  Note that this is a
      // seekFloor!
      //
      // Positions the enum on the greatest term <= target (seek-floor
      // semantics); the first next() will then return the term after
      // the requested one.
      private void seekToStartTerm(BytesRef target) throws IOException {
        //if (DEBUG) System.out.println("seek to startTerm=" + target.utf8ToString());
        assert currentFrame.ord == 0;
        if (term.length < target.length) {
          term.bytes = ArrayUtil.grow(term.bytes, target.length);
        }
        FST.Arc<BytesRef> arc = arcs[0];
        assert arc == currentFrame.arc;

        for(int idx=0;idx<=target.length;idx++) {

          while (true) {
            // Save frame position so we can back up one entry if we
            // overshoot the target below:
            final int savePos = currentFrame.suffixesReader.getPosition();
            final int saveStartBytePos = currentFrame.startBytePos;
            final int saveSuffix = currentFrame.suffix;
            final long saveLastSubFP = currentFrame.lastSubFP;
            final int saveTermBlockOrd = currentFrame.termState.termBlockOrd;

            final boolean isSubBlock = currentFrame.next();

            //if (DEBUG) System.out.println("    cycle ent=" + currentFrame.nextEnt + " (of " + currentFrame.entCount + ") prefix=" + currentFrame.prefix + " suffix=" + currentFrame.suffix + " isBlock=" + isSubBlock);
            term.length = currentFrame.prefix + currentFrame.suffix;
            if (term.bytes.length < term.length) {
              term.bytes = ArrayUtil.grow(term.bytes, term.length);
            }
            System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix);

            if (isSubBlock && StringHelper.startsWith(target, term)) {
              // Recurse into the sub-block whose prefix is a prefix of
              // the target:
              currentFrame = pushFrame(getState());
              break;
            } else {
              final int cmp = term.compareTo(target);
              if (cmp < 0) {
                if (currentFrame.nextEnt == currentFrame.entCount) {
                  if (!currentFrame.isLastInFloor) {
                    // Block exhausted but more floor blocks follow:
                    currentFrame.loadNextFloorBlock();
                    continue;
                  } else {
                    // Last entry of the last floor block is still
                    // before the target; we are positioned:
                    return;
                  }
                }
                continue;
              } else if (cmp == 0) {
                // Exact hit:
                return;
              } else {
                // Fallback to prior entry: the semantics of
                // this method is that the first call to
                // next() will return the term after the
                // requested term
                currentFrame.nextEnt--;
                currentFrame.lastSubFP = saveLastSubFP;
                currentFrame.startBytePos = saveStartBytePos;
                currentFrame.suffix = saveSuffix;
                currentFrame.suffixesReader.setPosition(savePos);
                currentFrame.termState.termBlockOrd = saveTermBlockOrd;
                System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix);
                term.length = currentFrame.prefix + currentFrame.suffix;
                // If the last entry was a block we don't
                // need to bother recursing and pushing to
                // the last term under it because the first
                // next() will simply skip the frame anyway
                return;
              }
            }
          }
        }

        // Every iteration either returns or recurses; falling out of
        // the loop is a bug:
        assert false;
      }
| |
      /** Advances to the next term accepted by the automaton, or
       *  returns null when the intersection is exhausted. */
      @Override
      public BytesRef next() throws IOException {

        // if (DEBUG) {
        //   System.out.println("\nintEnum.next seg=" + segment);
        // }

        nextTerm:
        while(true) {
          // Pop finished frames
          while (currentFrame.nextEnt == currentFrame.entCount) {
            if (!currentFrame.isLastInFloor) {
              // More floor blocks follow for this prefix:
              currentFrame.loadNextFloorBlock();
            } else {
              // Block done; pop back to the parent frame:
              if (currentFrame.ord == 0) {
                // Root frame exhausted: enumeration done.
                return null;
              }
              final long lastFP = currentFrame.fpOrig;
              currentFrame = stack[currentFrame.ord-1];
              assert currentFrame.lastSubFP == lastFP;
            }
          }

          final boolean isSubBlock = currentFrame.next();

          if (currentFrame.suffix != 0) {
            // Advance to the automaton transition covering the first
            // suffix byte, since entries are in sorted order:
            final int label = currentFrame.suffixBytes[currentFrame.startBytePos] & 0xff;
            while (label > currentFrame.curTransitionMax) {
              if (currentFrame.transitionIndex >= currentFrame.transitions.length-1) {
                // Stop processing this frame -- no further
                // matches are possible because we've moved
                // beyond what the max transition will allow

                // sneaky! forces a pop above
                currentFrame.isLastInFloor = true;
                currentFrame.nextEnt = currentFrame.entCount;
                continue nextTerm;
              }
              currentFrame.transitionIndex++;
              currentFrame.curTransitionMax = currentFrame.transitions[currentFrame.transitionIndex].getMax();
            }
          }

          // First test the common suffix, if set:
          if (compiledAutomaton.commonSuffixRef != null && !isSubBlock) {
            final int termLen = currentFrame.prefix + currentFrame.suffix;
            if (termLen < compiledAutomaton.commonSuffixRef.length) {
              // No match: term is shorter than the required suffix.
              continue nextTerm;
            }

            final byte[] suffixBytes = currentFrame.suffixBytes;
            final byte[] commonSuffixBytes = compiledAutomaton.commonSuffixRef.bytes;

            final int lenInPrefix = compiledAutomaton.commonSuffixRef.length - currentFrame.suffix;
            assert compiledAutomaton.commonSuffixRef.offset == 0;
            int suffixBytesPos;
            int commonSuffixBytesPos = 0;

            if (lenInPrefix > 0) {
              // A prefix of the common suffix overlaps with
              // the suffix of the block prefix so we first
              // test whether the prefix part matches:
              final byte[] termBytes = term.bytes;
              int termBytesPos = currentFrame.prefix - lenInPrefix;
              assert termBytesPos >= 0;
              final int termBytesPosEnd = currentFrame.prefix;
              while (termBytesPos < termBytesPosEnd) {
                if (termBytes[termBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) {
                  // Common suffix mismatch (in prefix part):
                  continue nextTerm;
                }
              }
              suffixBytesPos = currentFrame.startBytePos;
            } else {
              suffixBytesPos = currentFrame.startBytePos + currentFrame.suffix - compiledAutomaton.commonSuffixRef.length;
            }

            // Test overlapping suffix part:
            final int commonSuffixBytesPosEnd = compiledAutomaton.commonSuffixRef.length;
            while (commonSuffixBytesPos < commonSuffixBytesPosEnd) {
              if (suffixBytes[suffixBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) {
                // Common suffix mismatch:
                continue nextTerm;
              }
            }
          }

          // TODO: maybe we should do the same linear test
          // that AutomatonTermsEnum does, so that if we
          // reach a part of the automaton where .* is
          // "temporarily" accepted, we just blindly .next()
          // until the limit

          // See if the term prefix matches the automaton:
          int state = currentFrame.state;
          for (int idx=0;idx<currentFrame.suffix;idx++) {
            state = runAutomaton.step(state,  currentFrame.suffixBytes[currentFrame.startBytePos+idx] & 0xff);
            if (state == -1) {
              // No match: automaton rejects this byte.
              continue nextTerm;
            } else {
              // Still matching; keep stepping.
            }
          }

          if (isSubBlock) {
            // Match! Recurse:
            copyTerm();
            currentFrame = pushFrame(state);
          } else if (runAutomaton.isAccept(state)) {
            // Accepted term: copy it into the shared term and return.
            copyTerm();
            assert savedStartTerm == null || term.compareTo(savedStartTerm) > 0: "saveStartTerm=" + savedStartTerm.utf8ToString() + " term=" + term.utf8ToString();
            return term;
          } else {
            // Prefix matched but term not accepted; keep scanning.
          }
        }
      }
| |
| private void copyTerm() { |
| //System.out.println(" copyTerm cur.prefix=" + currentFrame.prefix + " cur.suffix=" + currentFrame.suffix + " first=" + (char) currentFrame.suffixBytes[currentFrame.startBytePos]); |
| final int len = currentFrame.prefix + currentFrame.suffix; |
| if (term.bytes.length < len) { |
| term.bytes = ArrayUtil.grow(term.bytes, len); |
| } |
| System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix); |
| term.length = len; |
| } |
| |
      /** Terms are in UTF8-as-unicode sorted order. */
      @Override
      public Comparator<BytesRef> getComparator() {
        return BytesRef.getUTF8SortedAsUnicodeComparator();
      }
| |
      /** Not supported by the intersect enum; use {@code next()}. */
      @Override
      public boolean seekExact(BytesRef text, boolean useCache) {
        throw new UnsupportedOperationException();
      }
| |
      /** Not supported by the intersect enum; use {@code next()}. */
      @Override
      public void seekExact(long ord) {
        throw new UnsupportedOperationException();
      }
| |
      /** Term ordinals are not tracked by this enum. */
      @Override
      public long ord() {
        throw new UnsupportedOperationException();
      }
| |
      /** Not supported by the intersect enum; use {@code next()}. */
      @Override
      public SeekStatus seekCeil(BytesRef text, boolean useCache) {
        throw new UnsupportedOperationException();
      }
| } |
| |
| // Iterates through terms in this field |
| private final class SegmentTermsEnum extends TermsEnum { |
    private IndexInput in;  // lazily cloned from the shared terms-dict input (see initIndexInput)

    private Frame[] stack;            // one frame per block depth during seek/next
    private final Frame staticFrame;  // holds seek-by-TermState / cached seek state (ord -1)
    private Frame currentFrame;
    private boolean termExists;

    // Compared against a re-used frame's prefix in pushFrame to decide
    // whether the frame must be rewound before scanning:
    private int targetBeforeCurrentLength;

    // Scratch reader used to decode a frame's fp + flags from FST output:
    private final ByteArrayDataInput scratchReader = new ByteArrayDataInput();

    // What prefix of the current term was present in the index:
    private int validIndexPrefix;

    // assert only:
    private boolean eof;

    // Shared current term; grown as needed:
    final BytesRef term = new BytesRef();
    private final FST.BytesReader fstReader;

    @SuppressWarnings({"rawtypes","unchecked"}) private FST.Arc<BytesRef>[] arcs =
      new FST.Arc[1];
| |
| public SegmentTermsEnum() throws IOException { |
| //if (DEBUG) System.out.println("BTTR.init seg=" + segment); |
| stack = new Frame[0]; |
| |
| // Used to hold seek by TermState, or cached seek |
| staticFrame = new Frame(-1); |
| |
| if (index == null) { |
| fstReader = null; |
| } else { |
| fstReader = index.getBytesReader(); |
| } |
| |
| // Init w/ root block; don't use index since it may |
| // not (and need not) have been loaded |
| for(int arcIdx=0;arcIdx<arcs.length;arcIdx++) { |
| arcs[arcIdx] = new FST.Arc<BytesRef>(); |
| } |
| |
| currentFrame = staticFrame; |
| final FST.Arc<BytesRef> arc; |
| if (index != null) { |
| arc = index.getFirstArc(arcs[0]); |
| // Empty string prefix must have an output in the index! |
| assert arc.isFinal(); |
| } else { |
| arc = null; |
| } |
| currentFrame = staticFrame; |
| //currentFrame = pushFrame(arc, rootCode, 0); |
| //currentFrame.loadBlock(); |
| validIndexPrefix = 0; |
| // if (DEBUG) { |
| // System.out.println("init frame state " + currentFrame.ord); |
| // printSeekState(); |
| // } |
| |
| //System.out.println(); |
| // computeBlockStats().print(System.out); |
| } |
| |
    // Not private to avoid synthetic access$NNN methods
    /** Lazily clones the shared terms-dict IndexInput for this enum. */
    void initIndexInput() {
      if (this.in == null) {
        this.in = BlockTreeTermsReader.this.in.clone();
      }
    }
| |
    /** Runs next() through the entire terms dict,
     *  computing aggregate statistics.  Resets the enum back to the
     *  root block before returning. */
    public Stats computeBlockStats() throws IOException {

      Stats stats = new Stats(segment, fieldInfo.name);
      if (index != null) {
        stats.indexNodeCount = index.getNodeCount();
        stats.indexArcCount = index.getArcCount();
        stats.indexNumBytes = index.sizeInBytes();
      }
      
      currentFrame = staticFrame;
      FST.Arc<BytesRef> arc;
      if (index != null) {
        arc = index.getFirstArc(arcs[0]);
        // Empty string prefix must have an output in the index!
        assert arc.isFinal();
      } else {
        arc = null;
      }

      // Empty string prefix must have an output in the
      // index!
      currentFrame = pushFrame(arc, rootCode, 0);
      currentFrame.fpOrig = currentFrame.fp;
      currentFrame.loadBlock();
      validIndexPrefix = 0;

      stats.startBlock(currentFrame, !currentFrame.isLastInFloor);

      allTerms:
      while (true) {

        // Pop finished blocks
        while (currentFrame.nextEnt == currentFrame.entCount) {
          stats.endBlock(currentFrame);
          if (!currentFrame.isLastInFloor) {
            currentFrame.loadNextFloorBlock();
            stats.startBlock(currentFrame, true);
          } else {
            if (currentFrame.ord == 0) {
              // Root block exhausted: traversal done.
              break allTerms;
            }
            final long lastFP = currentFrame.fpOrig;
            currentFrame = stack[currentFrame.ord-1];
            assert lastFP == currentFrame.lastSubFP;
          }
        }

        while(true) {
          if (currentFrame.next()) {
            // Push to new block:
            currentFrame = pushFrame(null, currentFrame.lastSubFP, term.length);
            currentFrame.fpOrig = currentFrame.fp;
            // This is a "next" frame -- even if it's
            // floor'd we must pretend it isn't so we don't
            // try to scan to the right floor frame:
            currentFrame.isFloor = false;
            //currentFrame.hasTerms = true;
            currentFrame.loadBlock();
            stats.startBlock(currentFrame, !currentFrame.isLastInFloor);
          } else {
            // It's a term entry: record it and go pop/advance.
            stats.term(term);
            break;
          }
        }
      }

      stats.finish();

      // Put root frame back:
      currentFrame = staticFrame;
      if (index != null) {
        arc = index.getFirstArc(arcs[0]);
        // Empty string prefix must have an output in the index!
        assert arc.isFinal();
      } else {
        arc = null;
      }
      currentFrame = pushFrame(arc, rootCode, 0);
      currentFrame.rewind();
      currentFrame.loadBlock();
      validIndexPrefix = 0;
      term.length = 0;

      return stats;
    }
| |
| private Frame getFrame(int ord) throws IOException { |
| if (ord >= stack.length) { |
| final Frame[] next = new Frame[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; |
| System.arraycopy(stack, 0, next, 0, stack.length); |
| for(int stackOrd=stack.length;stackOrd<next.length;stackOrd++) { |
| next[stackOrd] = new Frame(stackOrd); |
| } |
| stack = next; |
| } |
| assert stack[ord].ord == ord; |
| return stack[ord]; |
| } |
| |
| private FST.Arc<BytesRef> getArc(int ord) { |
| if (ord >= arcs.length) { |
| @SuppressWarnings({"rawtypes","unchecked"}) final FST.Arc<BytesRef>[] next = |
| new FST.Arc[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; |
| System.arraycopy(arcs, 0, next, 0, arcs.length); |
| for(int arcOrd=arcs.length;arcOrd<next.length;arcOrd++) { |
| next[arcOrd] = new FST.Arc<BytesRef>(); |
| } |
| arcs = next; |
| } |
| return arcs[ord]; |
| } |
| |
    /** Terms are in UTF8-as-unicode sorted order. */
    @Override
    public Comparator<BytesRef> getComparator() {
      return BytesRef.getUTF8SortedAsUnicodeComparator();
    }
| |
    // Pushes a frame we seek'd to
    /** Decodes the block's fp and hasTerms/isFloor flags from the FST
     *  output bytes, then delegates to {@link #pushFrame(FST.Arc,long,int)}. */
    Frame pushFrame(FST.Arc<BytesRef> arc, BytesRef frameData, int length) throws IOException {
      scratchReader.reset(frameData.bytes, frameData.offset, frameData.length);
      final long code = scratchReader.readVLong();
      // Low bits of the vLong are the flags; the rest is the fp:
      final long fpSeek = code >>> BlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS;
      final Frame f = getFrame(1+currentFrame.ord);
      f.hasTerms = (code & BlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS) != 0;
      f.hasTermsOrig = f.hasTerms;
      f.isFloor = (code & BlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0;
      if (f.isFloor) {
        f.setFloorData(scratchReader, frameData);
      }
      pushFrame(arc, fpSeek, length);

      return f;
    }
| |
    // Pushes next'd frame or seek'd frame; we later
    // lazy-load the frame only when needed
    Frame pushFrame(FST.Arc<BytesRef> arc, long fp, int length) throws IOException {
      final Frame f = getFrame(1+currentFrame.ord);
      f.arc = arc;
      if (f.fpOrig == fp && f.nextEnt != -1) {
        // The frame already holds this block: re-use it, rewinding
        // only if the seek target diverged before this frame's prefix.
        if (f.prefix > targetBeforeCurrentLength) {
          f.rewind();
        } else {
          // Seek state up to this frame is still valid: skip rewind.
        }
        assert length == f.prefix;
      } else {
        // Fresh block: record fp/prefix; the block body is loaded
        // lazily by loadBlock().
        f.nextEnt = -1;
        f.prefix = length;
        f.state.termBlockOrd = 0;
        f.fpOrig = f.fp = fp;
        f.lastSubFP = -1;
      }

      return f;
    }
| |
    // asserts only
    /** Clears the assert-only EOF flag; always returns true so it can
     *  be called inside an assert. */
    private boolean clearEOF() {
      eof = false;
      return true;
    }
| |
    // asserts only
    /** Sets the assert-only EOF flag; always returns true so it can
     *  be called inside an assert. */
    private boolean setEOF() {
      eof = true;
      return true;
    }
| |
    /** Seeks exactly to the target term, re-using as much of the
     *  previous seek state (frames and FST arcs) as possible, then
     *  scanning the final block.  Returns true if the term exists. */
    @Override
    public boolean seekExact(final BytesRef target, final boolean useCache) throws IOException {

      if (index == null) {
        throw new IllegalStateException("terms index was not loaded");
      }

      if (term.bytes.length <= target.length) {
        term.bytes = ArrayUtil.grow(term.bytes, 1+target.length);
      }

      assert clearEOF();

      FST.Arc<BytesRef> arc;
      int targetUpto;
      BytesRef output;

      targetBeforeCurrentLength = currentFrame.ord;

      if (currentFrame != staticFrame) {

        // We are already seek'd; find the common
        // prefix of new seek term vs current term and
        // re-use the corresponding seek state.  For
        // example, if app first seeks to foobar, then
        // seeks to foobaz, we can re-use the seek state
        // for the first 5 bytes.

        arc = arcs[0];
        assert arc.isFinal();
        output = arc.output;
        targetUpto = 0;
        
        Frame lastFrame = stack[0];
        assert validIndexPrefix <= term.length;

        final int targetLimit = Math.min(target.length, validIndexPrefix);

        int cmp = 0;

        // TODO: reverse vLong byte order for better FST
        // prefix output sharing

        // First compare up to valid seek frames:
        while (targetUpto < targetLimit) {
          cmp = (term.bytes[targetUpto]&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF);
          if (cmp != 0) {
            break;
          }
          arc = arcs[1+targetUpto];
          assert arc.label == (target.bytes[target.offset + targetUpto] & 0xFF): "arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF);
          if (arc.output != NO_OUTPUT) {
            output = fstOutputs.add(output, arc.output);
          }
          if (arc.isFinal()) {
            lastFrame = stack[1+lastFrame.ord];
          }
          targetUpto++;
        }

        if (cmp == 0) {
          final int targetUptoMid = targetUpto;

          // Second compare the rest of the term, but
          // don't save arc/output/frame; we only do this
          // to find out if the target term is before,
          // equal or after the current term
          final int targetLimit2 = Math.min(target.length, term.length);
          while (targetUpto < targetLimit2) {
            cmp = (term.bytes[targetUpto]&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF);
            if (cmp != 0) {
              break;
            }
            targetUpto++;
          }

          if (cmp == 0) {
            // Shared prefix is identical; shorter term sorts first:
            cmp = term.length - target.length;
          }
          targetUpto = targetUptoMid;
        }

        if (cmp < 0) {
          // Common case: target term is after current
          // term, ie, app is seeking multiple terms
          // in sorted order
          currentFrame = lastFrame;

        } else if (cmp > 0) {
          // Uncommon case: target term
          // is before current term; this means we can
          // keep the currentFrame but we must rewind it
          // (so we scan from the start)
          targetBeforeCurrentLength = 0;
          currentFrame = lastFrame;
          currentFrame.rewind();
        } else {
          // Target is exactly the same as current term
          assert term.length == target.length;
          if (termExists) {
            return true;
          } else {
            // Same bytes but the term does not exist; fall through to
            // the index walk below.
          }
          //validIndexPrefix = currentFrame.depth;
          //term.length = target.length;
          //return termExists;
        }

      } else {

        // No prior seek state: start from the FST root.
        targetBeforeCurrentLength = -1;
        arc = index.getFirstArc(arcs[0]);

        // Empty string prefix must have an output (block) in the index!
        assert arc.isFinal();
        assert arc.output != null;

        output = arc.output;

        currentFrame = staticFrame;

        //term.length = 0;
        targetUpto = 0;
        currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), 0);
      }

      // Walk the remaining target bytes through the index FST, pushing
      // a frame at each final arc:
      while (targetUpto < target.length) {

        final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF;

        final FST.Arc<BytesRef> nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto), fstReader);

        if (nextArc == null) {

          // Index is exhausted

          validIndexPrefix = currentFrame.prefix;
          //validIndexPrefix = targetUpto;

          currentFrame.scanToFloorFrame(target);

          if (!currentFrame.hasTerms) {
            termExists = false;
            term.bytes[targetUpto] = (byte) targetLabel;
            term.length = 1+targetUpto;
            return false;
          }

          currentFrame.loadBlock();

          final SeekStatus result = currentFrame.scanToTerm(target, true);
          if (result == SeekStatus.FOUND) {
            return true;
          } else {
            return false;
          }
        } else {
          // Follow this arc
          arc = nextArc;
          term.bytes[targetUpto] = (byte) targetLabel;
          // Aggregate output as we go:
          assert arc.output != null;
          if (arc.output != NO_OUTPUT) {
            output = fstOutputs.add(output, arc.output);
          }

          targetUpto++;

          if (arc.isFinal()) {
            currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), targetUpto);
          }
        }
      }

      //validIndexPrefix = targetUpto;
      validIndexPrefix = currentFrame.prefix;

      currentFrame.scanToFloorFrame(target);

      // Target term is entirely contained in the index:
      if (!currentFrame.hasTerms) {
        termExists = false;
        term.length = targetUpto;
        return false;
      }

      currentFrame.loadBlock();

      final SeekStatus result = currentFrame.scanToTerm(target, true);
      if (result == SeekStatus.FOUND) {
        return true;
      } else {
        return false;
      }
    }
| |
/**
 * Positions this enum at the smallest term that is >= target.
 * Returns FOUND if the exact target term exists, NOT_FOUND if we landed on
 * the term immediately after it, or END if target is past the last term of
 * this field.  Re-uses as much of the previous seek's FST/frame state as
 * possible (bounded by validIndexPrefix) before walking the index FST.
 */
@Override
public SeekStatus seekCeil(final BytesRef target, final boolean useCache) throws IOException {
  if (index == null) {
    throw new IllegalStateException("terms index was not loaded");
  }

  // Make sure term can hold the full target plus one extra byte:
  if (term.bytes.length <= target.length) {
    term.bytes = ArrayUtil.grow(term.bytes, 1+target.length);
  }

  assert clearEOF();

  //if (DEBUG) {
  //System.out.println("\nBTTR.seekCeil seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + brToString(term) + " (exists?=" + termExists + ") validIndexPrefix= " + validIndexPrefix);
  //printSeekState();
  //}

  FST.Arc<BytesRef> arc;
  int targetUpto;
  BytesRef output;

  targetBeforeCurrentLength = currentFrame.ord;

  if (currentFrame != staticFrame) {

    // We are already seek'd; find the common
    // prefix of new seek term vs current term and
    // re-use the corresponding seek state.  For
    // example, if app first seeks to foobar, then
    // seeks to foobaz, we can re-use the seek state
    // for the first 5 bytes.

    //if (DEBUG) {
    //System.out.println("  re-use current seek state validIndexPrefix=" + validIndexPrefix);
    //}

    arc = arcs[0];
    assert arc.isFinal();
    output = arc.output;
    targetUpto = 0;

    Frame lastFrame = stack[0];
    assert validIndexPrefix <= term.length;

    // Only the first validIndexPrefix bytes have trustworthy cached
    // arcs/frames from the previous seek:
    final int targetLimit = Math.min(target.length, validIndexPrefix);

    int cmp = 0;

    // TODO: we should write our vLong backwards (MSB
    // first) to get better sharing from the FST

    // First compare up to valid seek frames:
    while (targetUpto < targetLimit) {
      cmp = (term.bytes[targetUpto]&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF);
      //if (DEBUG) {
      //System.out.println("    cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")" + " arc.output=" + arc.output + " output=" + output);
      //}
      if (cmp != 0) {
        break;
      }
      arc = arcs[1+targetUpto];
      assert arc.label == (target.bytes[target.offset + targetUpto] & 0xFF): "arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF);
      // TODO: we could save the outputs in local
      // byte[][] instead of making new objs ever
      // seek; but, often the FST doesn't have any
      // shared bytes (but this could change if we
      // reverse vLong byte order)
      if (arc.output != NO_OUTPUT) {
        output = fstOutputs.add(output, arc.output);
      }
      if (arc.isFinal()) {
        lastFrame = stack[1+lastFrame.ord];
      }
      targetUpto++;
    }


    if (cmp == 0) {
      // Remember where the trusted (arc/frame-backed) prefix ends:
      final int targetUptoMid = targetUpto;
      // Second compare the rest of the term, but
      // don't save arc/output/frame:
      final int targetLimit2 = Math.min(target.length, term.length);
      while (targetUpto < targetLimit2) {
        cmp = (term.bytes[targetUpto]&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF);
        //if (DEBUG) {
        //System.out.println("    cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")");
        //}
        if (cmp != 0) {
          break;
        }
        targetUpto++;
      }

      if (cmp == 0) {
        // All shared bytes equal; shorter string sorts first:
        cmp = term.length - target.length;
      }
      targetUpto = targetUptoMid;
    }

    if (cmp < 0) {
      // Common case: target term is after current
      // term, ie, app is seeking multiple terms
      // in sorted order
      //if (DEBUG) {
      //System.out.println("  target is after current (shares prefixLen=" + targetUpto + "); clear frame.scanned ord=" + lastFrame.ord);
      //}
      currentFrame = lastFrame;

    } else if (cmp > 0) {
      // Uncommon case: target term
      // is before current term; this means we can
      // keep the currentFrame but we must rewind it
      // (so we scan from the start)
      targetBeforeCurrentLength = 0;
      //if (DEBUG) {
      //System.out.println("  target is before current (shares prefixLen=" + targetUpto + "); rewind frame ord=" + lastFrame.ord);
      //}
      currentFrame = lastFrame;
      currentFrame.rewind();
    } else {
      // Target is exactly the same as current term
      assert term.length == target.length;
      if (termExists) {
        //if (DEBUG) {
        //System.out.println("  target is same as current; return FOUND");
        //}
        return SeekStatus.FOUND;
      } else {
        //if (DEBUG) {
        //System.out.println("  target is same as current but term doesn't exist");
        //}
        // Fall through to the index walk below to locate the ceiling term.
      }
    }

  } else {

    // No prior seek state to reuse; start from the FST root:
    targetBeforeCurrentLength = -1;
    arc = index.getFirstArc(arcs[0]);

    // Empty string prefix must have an output (block) in the index!
    assert arc.isFinal();
    assert arc.output != null;

    //if (DEBUG) {
    //System.out.println("    no seek state; push root frame");
    //}

    output = arc.output;

    currentFrame = staticFrame;

    //term.length = 0;
    targetUpto = 0;
    currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), 0);
  }

  //if (DEBUG) {
  //System.out.println("  start index loop targetUpto=" + targetUpto + " output=" + output + " currentFrame.ord+1=" + currentFrame.ord + " targetBeforeCurrentLength=" + targetBeforeCurrentLength);
  //}

  // Walk the index FST byte-by-byte along the target, pushing a new frame
  // at every final arc (= block boundary):
  while (targetUpto < target.length) {

    final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF;

    final FST.Arc<BytesRef> nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto), fstReader);

    if (nextArc == null) {

      // Index is exhausted
      // if (DEBUG) {
      //   System.out.println("    index: index exhausted label=" + ((char) targetLabel) + " " + toHex(targetLabel));
      // }

      validIndexPrefix = currentFrame.prefix;
      //validIndexPrefix = targetUpto;

      currentFrame.scanToFloorFrame(target);

      currentFrame.loadBlock();

      final SeekStatus result = currentFrame.scanToTerm(target, false);
      if (result == SeekStatus.END) {
        // Block exhausted without reaching target; position on the term
        // that follows (if any) via next():
        term.copyBytes(target);
        termExists = false;

        if (next() != null) {
          //if (DEBUG) {
          //System.out.println("  return NOT_FOUND term=" + brToString(term) + " " + term);
          //}
          return SeekStatus.NOT_FOUND;
        } else {
          //if (DEBUG) {
          //System.out.println("  return END");
          //}
          return SeekStatus.END;
        }
      } else {
        //if (DEBUG) {
        //System.out.println("  return " + result + " term=" + brToString(term) + " " + term);
        //}
        return result;
      }
    } else {
      // Follow this arc
      term.bytes[targetUpto] = (byte) targetLabel;
      arc = nextArc;
      // Aggregate output as we go:
      assert arc.output != null;
      if (arc.output != NO_OUTPUT) {
        output = fstOutputs.add(output, arc.output);
      }

      //if (DEBUG) {
      //System.out.println("    index: follow label=" + toHex(target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput);
      //}
      targetUpto++;

      if (arc.isFinal()) {
        //if (DEBUG) System.out.println("    arc is final!");
        currentFrame = pushFrame(arc, fstOutputs.add(output, arc.nextFinalOutput), targetUpto);
        //if (DEBUG) System.out.println("    curFrame.ord=" + currentFrame.ord + " hasTerms=" + currentFrame.hasTerms);
      }
    }
  }

  // Consumed the whole target in the index; scan the final block:
  //validIndexPrefix = targetUpto;
  validIndexPrefix = currentFrame.prefix;

  currentFrame.scanToFloorFrame(target);

  currentFrame.loadBlock();

  final SeekStatus result = currentFrame.scanToTerm(target, false);

  if (result == SeekStatus.END) {
    term.copyBytes(target);
    termExists = false;
    if (next() != null) {
      //if (DEBUG) {
      //System.out.println("  return NOT_FOUND term=" + term.utf8ToString() + " " + term);
      //}
      return SeekStatus.NOT_FOUND;
    } else {
      //if (DEBUG) {
      //System.out.println("  return END");
      //}
      return SeekStatus.END;
    }
  } else {
    return result;
  }
}
| |
/**
 * Debug-only: dumps the current frame stack (and cross-checks each seek
 * frame's arc/output against the index FST) to {@code out}.  Throws
 * RuntimeException if the cached seek state is internally inconsistent.
 */
@SuppressWarnings("unused")
private void printSeekState(PrintStream out) throws IOException {
  if (currentFrame == staticFrame) {
    out.println("  no prior seek");
  } else {
    out.println("  prior seek state:");
    int ord = 0;
    // Frames up to validIndexPrefix are "seek" frames (trusted cache);
    // the rest were produced by next() scanning:
    boolean isSeekFrame = true;
    while(true) {
      Frame f = getFrame(ord);
      assert f != null;
      final BytesRef prefix = new BytesRef(term.bytes, 0, f.prefix);
      if (f.nextEnt == -1) {
        out.println("    frame " + (isSeekFrame ? "(seek)" : "(next)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + prefix + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp<<BlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? BlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS:0) + (f.isFloor ? BlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR:0)) + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd());
      } else {
        out.println("    frame " + (isSeekFrame ? "(seek, loaded)" : "(next, loaded)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + prefix + " nextEnt=" + f.nextEnt + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp<<BlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? BlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS:0) + (f.isFloor ? BlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR:0)) + " lastSubFP=" + f.lastSubFP + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd());
      }
      if (index != null) {
        assert !isSeekFrame || f.arc != null: "isSeekFrame=" + isSeekFrame + " f.arc=" + f.arc;
        // A seek frame's arc label must agree with the last prefix byte:
        if (f.prefix > 0 && isSeekFrame && f.arc.label != (term.bytes[f.prefix-1]&0xFF)) {
          out.println("      broken seek state: arc.label=" + (char) f.arc.label + " vs term byte=" + (char) (term.bytes[f.prefix-1]&0xFF));
          throw new RuntimeException("seek state is broken");
        }
        BytesRef output = Util.get(index, prefix);
        if (output == null) {
          out.println("      broken seek state: prefix is not final in index");
          throw new RuntimeException("seek state is broken");
        } else if (isSeekFrame && !f.isFloor) {
          // Re-decode the frame's code from the FST output and compare:
          final ByteArrayDataInput reader = new ByteArrayDataInput(output.bytes, output.offset, output.length);
          final long codeOrig = reader.readVLong();
          final long code = (f.fp << BlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) | (f.hasTerms ? BlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS:0) | (f.isFloor ? BlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR:0);
          if (codeOrig != code) {
            out.println("      broken seek state: output code=" + codeOrig + " doesn't match frame code=" + code);
            throw new RuntimeException("seek state is broken");
          }
        }
      }
      if (f == currentFrame) {
        break;
      }
      if (f.prefix == validIndexPrefix) {
        isSeekFrame = false;
      }
      ord++;
    }
  }
}
| |
/* Decodes only the term bytes of the next term.  If caller then asks for
   metadata, ie docFreq, totalTermFreq or pulls a D/&PEnum, we then (lazily)
   decode all metadata up to the current term. */
@Override
public BytesRef next() throws IOException {

  if (in == null) {
    // Fresh TermsEnum; seek to first term:
    final FST.Arc<BytesRef> arc;
    if (index != null) {
      arc = index.getFirstArc(arcs[0]);
      // Empty string prefix must have an output in the index!
      assert arc.isFinal();
    } else {
      arc = null;
    }
    currentFrame = pushFrame(arc, rootCode, 0);
    currentFrame.loadBlock();
  }

  targetBeforeCurrentLength = currentFrame.ord;

  assert !eof;
  //if (DEBUG) {
  //System.out.println("\nBTTR.next seg=" + segment + " term=" + brToString(term) + " termExists?=" + termExists + " field=" + fieldInfo.name + " termBlockOrd=" + currentFrame.state.termBlockOrd + " validIndexPrefix=" + validIndexPrefix);
  //printSeekState();
  //}

  if (currentFrame == staticFrame) {
    // If seek was previously called and the term was
    // cached, or seek(TermState) was called, usually
    // caller is just going to pull a D/&PEnum or get
    // docFreq, etc.  But, if they then call next(),
    // this method catches up all internal state so next()
    // works properly:
    //if (DEBUG) System.out.println("  re-seek to pending term=" + term.utf8ToString() + " " + term);
    final boolean result = seekExact(term, false);
    assert result;
  }

  // Pop finished blocks
  while (currentFrame.nextEnt == currentFrame.entCount) {
    if (!currentFrame.isLastInFloor) {
      // More floor chunks remain for this block's prefix:
      currentFrame.loadNextFloorBlock();
    } else {
      //if (DEBUG) System.out.println("  pop frame");
      if (currentFrame.ord == 0) {
        // Popped past the root frame: enum is exhausted.
        //if (DEBUG) System.out.println("  return null");
        assert setEOF();
        term.length = 0;
        validIndexPrefix = 0;
        currentFrame.rewind();
        termExists = false;
        return null;
      }
      final long lastFP = currentFrame.fpOrig;
      currentFrame = stack[currentFrame.ord-1];

      if (currentFrame.nextEnt == -1 || currentFrame.lastSubFP != lastFP) {
        // We popped into a frame that's not loaded
        // yet or not scan'd to the right entry
        currentFrame.scanToFloorFrame(term);
        currentFrame.loadBlock();
        currentFrame.scanToSubBlock(lastFP);
      }

      // Note that the seek state (last seek) has been
      // invalidated beyond this depth
      validIndexPrefix = Math.min(validIndexPrefix, currentFrame.prefix);
      //if (DEBUG) {
      //System.out.println("  reset validIndexPrefix=" + validIndexPrefix);
      //}
    }
  }

  // Advance: descend through sub-blocks until we land on a real term.
  while(true) {
    if (currentFrame.next()) {
      // Push to new block:
      //if (DEBUG) System.out.println("  push frame");
      currentFrame = pushFrame(null, currentFrame.lastSubFP, term.length);
      // This is a "next" frame -- even if it's
      // floor'd we must pretend it isn't so we don't
      // try to scan to the right floor frame:
      currentFrame.isFloor = false;
      //currentFrame.hasTerms = true;
      currentFrame.loadBlock();
    } else {
      //if (DEBUG) System.out.println("  return term=" + term.utf8ToString() + " " + term + " currentFrame.ord=" + currentFrame.ord);
      return term;
    }
  }
}
| |
| @Override |
| public BytesRef term() { |
| assert !eof; |
| return term; |
| } |
| |
| @Override |
| public int docFreq() throws IOException { |
| assert !eof; |
| //if (DEBUG) System.out.println("BTR.docFreq"); |
| currentFrame.decodeMetaData(); |
| //if (DEBUG) System.out.println(" return " + currentFrame.state.docFreq); |
| return currentFrame.state.docFreq; |
| } |
| |
| @Override |
| public long totalTermFreq() throws IOException { |
| assert !eof; |
| currentFrame.decodeMetaData(); |
| return currentFrame.state.totalTermFreq; |
| } |
| |
| @Override |
| public DocsEnum docs(Bits skipDocs, DocsEnum reuse, int flags) throws IOException { |
| assert !eof; |
| //if (DEBUG) { |
| //System.out.println("BTTR.docs seg=" + segment); |
| //} |
| currentFrame.decodeMetaData(); |
| //if (DEBUG) { |
| //System.out.println(" state=" + currentFrame.state); |
| //} |
| return postingsReader.docs(fieldInfo, currentFrame.state, skipDocs, reuse, flags); |
| } |
| |
| @Override |
| public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse, int flags) throws IOException { |
| if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) { |
| // Positions were not indexed: |
| return null; |
| } |
| |
| assert !eof; |
| currentFrame.decodeMetaData(); |
| return postingsReader.docsAndPositions(fieldInfo, currentFrame.state, skipDocs, reuse, flags); |
| } |
| |
| @Override |
| public void seekExact(BytesRef target, TermState otherState) { |
| // if (DEBUG) { |
| // System.out.println("BTTR.seekExact termState seg=" + segment + " target=" + target.utf8ToString() + " " + target + " state=" + otherState); |
| // } |
| assert clearEOF(); |
| if (target.compareTo(term) != 0 || !termExists) { |
| assert otherState != null && otherState instanceof BlockTermState; |
| currentFrame = staticFrame; |
| currentFrame.state.copyFrom(otherState); |
| term.copyBytes(target); |
| currentFrame.metaDataUpto = currentFrame.getTermBlockOrd(); |
| assert currentFrame.metaDataUpto > 0; |
| validIndexPrefix = 0; |
| } else { |
| // if (DEBUG) { |
| // System.out.println(" skip seek: already on target state=" + currentFrame.state); |
| // } |
| } |
| } |
| |
| @Override |
| public TermState termState() throws IOException { |
| assert !eof; |
| currentFrame.decodeMetaData(); |
| TermState ts = currentFrame.state.clone(); |
| //if (DEBUG) System.out.println("BTTR.termState seg=" + segment + " state=" + ts); |
| return ts; |
| } |
| |
| @Override |
| public void seekExact(long ord) { |
| throw new UnsupportedOperationException(); |
| } |
| |
| @Override |
| public long ord() { |
| throw new UnsupportedOperationException(); |
| } |
| |
| // Not static -- references term, postingsReader, |
| // fieldInfo, in |
| private final class Frame { |
// Our index in stack[]:
final int ord;

// Whether this block (in its currently selected floor chunk) holds terms:
boolean hasTerms;
// Original hasTerms flag, restored by rewind():
boolean hasTermsOrig;
// True if this block was split into multiple floor chunks by the writer:
boolean isFloor;

// Index FST arc that led to this block (null for frames pushed by next()):
FST.Arc<BytesRef> arc;

// File pointer where this block was loaded from
long fp;
// fp of the first floor chunk (rewind target):
long fpOrig;
// File pointer just past this block's data (start of the next floor chunk):
long fpEnd;

// Raw suffix bytes of this block's entries, plus a reader over them:
byte[] suffixBytes = new byte[128];
final ByteArrayDataInput suffixesReader = new ByteArrayDataInput();

// Raw per-term stats (docFreq/totalTermFreq), decoded lazily:
byte[] statBytes = new byte[64];
final ByteArrayDataInput statsReader = new ByteArrayDataInput();

// Floor data (follow-on chunk fps/labels) copied out of the FST output:
byte[] floorData = new byte[32];
final ByteArrayDataInput floorDataReader = new ByteArrayDataInput();

// Length of prefix shared by all terms in this block
int prefix;

// Number of entries (term or sub-block) in this block
int entCount;

// Which term we will next read, or -1 if the block
// isn't loaded yet
int nextEnt;

// True if this block is either not a floor block,
// or, it's the last sub-block of a floor block
boolean isLastInFloor;

// True if all entries are terms
boolean isLeafBlock;

// fp of the sub-block most recently decoded by next()/scanToSubBlock:
long lastSubFP;

// First label of the next floor chunk (256 once on the last chunk):
int nextFloorLabel;
// How many floor chunks remain after the current one:
int numFollowFloorBlocks;

// Next term to decode metaData; we decode metaData
// lazily so that scanning to find the matching term is
// fast and only if you find a match and app wants the
// stats or docs/positions enums, will we decode the
// metaData
int metaDataUpto;

// Per-term metadata holder, owned by the postings reader:
final BlockTermState state;
| |
// Creates an unloaded frame at the given stack depth.  The term-state
// holder is allocated by the postings reader so it can carry
// codec-specific fields.
public Frame(int ord) throws IOException {
  this.ord = ord;
  state = postingsReader.newTermState();
  state.totalTermFreq = -1;  // -1 = "freqs not available / not yet decoded"
}
| |
| public void setFloorData(ByteArrayDataInput in, BytesRef source) { |
| final int numBytes = source.length - (in.getPosition() - source.offset); |
| if (numBytes > floorData.length) { |
| floorData = new byte[ArrayUtil.oversize(numBytes, 1)]; |
| } |
| System.arraycopy(source.bytes, source.offset+in.getPosition(), floorData, 0, numBytes); |
| floorDataReader.reset(floorData, 0, numBytes); |
| numFollowFloorBlocks = floorDataReader.readVInt(); |
| nextFloorLabel = floorDataReader.readByte() & 0xff; |
| //if (DEBUG) { |
| //System.out.println(" setFloorData fpOrig=" + fpOrig + " bytes=" + new BytesRef(source.bytes, source.offset + in.getPosition(), numBytes) + " numFollowFloorBlocks=" + numFollowFloorBlocks + " nextFloorLabel=" + toHex(nextFloorLabel)); |
| //} |
| } |
| |
| public int getTermBlockOrd() { |
| return isLeafBlock ? nextEnt : state.termBlockOrd; |
| } |
| |
| void loadNextFloorBlock() throws IOException { |
| //if (DEBUG) { |
| //System.out.println(" loadNextFloorBlock fp=" + fp + " fpEnd=" + fpEnd); |
| //} |
| assert arc == null || isFloor: "arc=" + arc + " isFloor=" + isFloor; |
| fp = fpEnd; |
| nextEnt = -1; |
| loadBlock(); |
| } |
| |
/* Does initial decode of next block of terms; this
   doesn't actually decode the docFreq, totalTermFreq,
   postings details (frq/prx offset, etc.) metadata;
   it just loads them as byte[] blobs which are then
   decoded on-demand if the metadata is ever requested
   for any term in this block.  This enables terms-only
   intensive consumes (eg certain MTQs, respelling) to
   not pay the price of decoding metadata they won't
   use. */
void loadBlock() throws IOException {

  // Clone the IndexInput lazily, so that consumers
  // that just pull a TermsEnum to
  // seekExact(TermState) don't pay this cost:
  initIndexInput();

  if (nextEnt != -1) {
    // Already loaded
    return;
  }
  //System.out.println("blc=" + blockLoadCount);

  in.seek(fp);
  // Block header: entry count in the high bits, isLastInFloor in bit 0:
  int code = in.readVInt();
  entCount = code >>> 1;
  assert entCount > 0;
  isLastInFloor = (code & 1) != 0;
  assert arc == null || (isLastInFloor || isFloor);

  // TODO: if suffixes were stored in random-access
  // array structure, then we could do binary search
  // instead of linear scan to find target term; eg
  // we could have simple array of offsets

  // term suffixes:
  // vInt packs the byte count in the high bits and isLeafBlock in bit 0:
  code = in.readVInt();
  isLeafBlock = (code & 1) != 0;
  int numBytes = code >>> 1;
  if (suffixBytes.length < numBytes) {
    suffixBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
  }
  in.readBytes(suffixBytes, 0, numBytes);
  suffixesReader.reset(suffixBytes, 0, numBytes);

  /*if (DEBUG) {
    if (arc == null) {
      System.out.println("    loadBlock (next) fp=" + fp + " entCount=" + entCount + " prefixLen=" + prefix + " isLastInFloor=" + isLastInFloor + " leaf?=" + isLeafBlock);
    } else {
      System.out.println("    loadBlock (seek) fp=" + fp + " entCount=" + entCount + " prefixLen=" + prefix + " hasTerms?=" + hasTerms + " isFloor?=" + isFloor + " isLastInFloor=" + isLastInFloor + " leaf?=" + isLeafBlock);
    }
  }*/

  // stats: load raw stat bytes; decodeMetaData() parses them on demand
  numBytes = in.readVInt();
  if (statBytes.length < numBytes) {
    statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
  }
  in.readBytes(statBytes, 0, numBytes);
  statsReader.reset(statBytes, 0, numBytes);
  metaDataUpto = 0;

  state.termBlockOrd = 0;
  nextEnt = 0;
  lastSubFP = -1;

  // TODO: we could skip this if !hasTerms; but
  // that's rare so won't help much
  postingsReader.readTermsBlock(in, fieldInfo, state);

  // Sub-blocks of a single floor block are always
  // written one after another -- tail recurse:
  fpEnd = in.getFilePointer();
  // if (DEBUG) {
  //   System.out.println("      fpEnd=" + fpEnd);
  // }
}
| |
// Resets this frame to its initial (un-loaded, first-floor-chunk) state so
// the block can be re-scanned from the start.
void rewind() {

  // Force reload:
  fp = fpOrig;
  nextEnt = -1;
  hasTerms = hasTermsOrig;
  if (isFloor) {
    // Re-read the floor header so chunk selection starts over:
    floorDataReader.rewind();
    numFollowFloorBlocks = floorDataReader.readVInt();
    nextFloorLabel = floorDataReader.readByte() & 0xff;
  }

  /*
  //System.out.println("rewind");
  // Keeps the block loaded, but rewinds its state:
  if (nextEnt > 0 || fp != fpOrig) {
    if (DEBUG) {
      System.out.println("  rewind frame ord=" + ord + " fpOrig=" + fpOrig + " fp=" + fp + " hasTerms?=" + hasTerms + " isFloor?=" + isFloor + " nextEnt=" + nextEnt + " prefixLen=" + prefix);
    }
    if (fp != fpOrig) {
      fp = fpOrig;
      nextEnt = -1;
    } else {
      nextEnt = 0;
    }
    hasTerms = hasTermsOrig;
    if (isFloor) {
      floorDataReader.rewind();
      numFollowFloorBlocks = floorDataReader.readVInt();
      nextFloorLabel = floorDataReader.readByte() & 0xff;
    }
    assert suffixBytes != null;
    suffixesReader.rewind();
    assert statBytes != null;
    statsReader.rewind();
    metaDataUpto = 0;
    state.termBlockOrd = 0;
    // TODO: skip this if !hasTerms?  Then postings
    // impl wouldn't have to write useless 0 byte
    postingsReader.resetTermsBlock(fieldInfo, state);
    lastSubFP = -1;
  } else if (DEBUG) {
    System.out.println("  skip rewind fp=" + fp + " fpOrig=" + fpOrig + " nextEnt=" + nextEnt + " ord=" + ord);
  }
  */
}
| |
| public boolean next() { |
| return isLeafBlock ? nextLeaf() : nextNonLeaf(); |
| } |
| |
// Decodes the next entry of a leaf block.  Leaf blocks contain only terms
// (never sub-blocks), so this always returns false.  Side effect: updates
// the shared term/termExists and startBytePos/suffix scan state.
public boolean nextLeaf() {
  //if (DEBUG) System.out.println("  frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount);
  assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
  nextEnt++;
  // Leaf entry = vInt suffix length followed by the suffix bytes:
  suffix = suffixesReader.readVInt();
  startBytePos = suffixesReader.getPosition();
  term.length = prefix + suffix;
  if (term.bytes.length < term.length) {
    term.grow(term.length);
  }
  suffixesReader.readBytes(term.bytes, prefix, suffix);
  // A normal term
  termExists = true;
  return false;
}
| |
// Decodes the next entry of a mixed (non-leaf) block; returns true if the
// entry is a sub-block.  Side effect: updates the shared term/termExists
// and startBytePos/suffix/subCode scan state.
public boolean nextNonLeaf() {
  //if (DEBUG) System.out.println("  frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount);
  assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
  nextEnt++;
  // Entry header: suffix length in high bits, bit 0 set = sub-block:
  final int code = suffixesReader.readVInt();
  suffix = code >>> 1;
  startBytePos = suffixesReader.getPosition();
  term.length = prefix + suffix;
  if (term.bytes.length < term.length) {
    term.grow(term.length);
  }
  suffixesReader.readBytes(term.bytes, prefix, suffix);
  if ((code & 1) == 0) {
    // A normal term
    termExists = true;
    subCode = 0;
    state.termBlockOrd++;
    return false;
  } else {
    // A sub-block; make sub-FP absolute:
    termExists = false;
    // subCode is stored as a backwards delta from this block's fp:
    subCode = suffixesReader.readVLong();
    lastSubFP = fp - subCode;
    //if (DEBUG) {
    //System.out.println("    lastSubFP=" + lastSubFP);
    //}
    return true;
  }
}
| |
// TODO: make this array'd so we can do bin search?
// likely not worth it?  need to measure how many
// floor blocks we "typically" get

// For a floor block, advances fp to the floor chunk whose label range
// covers target's next byte (the byte right after this block's prefix),
// forcing a re-load if the chunk changed.  No-op for non-floor blocks or
// when target doesn't extend past the prefix.
public void scanToFloorFrame(BytesRef target) {

  if (!isFloor || target.length <= prefix) {
    // if (DEBUG) {
    //   System.out.println("    scanToFloorFrame skip: isFloor=" + isFloor + " target.length=" + target.length + " vs prefix=" + prefix);
    // }
    return;
  }

  // The byte that selects which floor chunk holds the target:
  final int targetLabel = target.bytes[target.offset + prefix] & 0xFF;

  // if (DEBUG) {
  //   System.out.println("    scanToFloorFrame fpOrig=" + fpOrig + " targetLabel=" + toHex(targetLabel) + " vs nextFloorLabel=" + toHex(nextFloorLabel) + " numFollowFloorBlocks=" + numFollowFloorBlocks);
  // }

  if (targetLabel < nextFloorLabel) {
    // if (DEBUG) {
    //   System.out.println("      already on correct block");
    // }
    return;
  }

  assert numFollowFloorBlocks != 0;

  // Walk forward through the floor chunks until targetLabel falls below
  // the next chunk's starting label (or we hit the last chunk):
  long newFP = fpOrig;
  while (true) {
    // Each chunk record: fp delta in high bits, hasTerms flag in bit 0:
    final long code = floorDataReader.readVLong();
    newFP = fpOrig + (code >>> 1);
    hasTerms = (code & 1) != 0;
    // if (DEBUG) {
    //   System.out.println("      label=" + toHex(nextFloorLabel) + " fp=" + newFP + " hasTerms?=" + hasTerms + " numFollowFloor=" + numFollowFloorBlocks);
    // }

    isLastInFloor = numFollowFloorBlocks == 1;
    numFollowFloorBlocks--;

    if (isLastInFloor) {
      // 256 is past any possible byte label, so no further advance occurs:
      nextFloorLabel = 256;
      // if (DEBUG) {
      //   System.out.println("        stop!  last block nextFloorLabel=" + toHex(nextFloorLabel));
      // }
      break;
    } else {
      nextFloorLabel = floorDataReader.readByte() & 0xff;
      if (targetLabel < nextFloorLabel) {
        // if (DEBUG) {
        //   System.out.println("        stop!  nextFloorLabel=" + toHex(nextFloorLabel));
        // }
        break;
      }
    }
  }

  if (newFP != fp) {
    // Force re-load of the block:
    // if (DEBUG) {
    //   System.out.println("      force switch to fp=" + newFP + " oldFP=" + fp);
    // }
    nextEnt = -1;
    fp = newFP;
  } else {
    // if (DEBUG) {
    //   System.out.println("      stay on same fp=" + newFP);
    // }
  }
}
| |
// Lazily decodes per-term metadata (docFreq, totalTermFreq, postings
// pointers) for every term from metaDataUpto up to the current term.
// Scanning for a term never pays this cost; only callers that actually
// need stats or postings do.
public void decodeMetaData() throws IOException {

  //if (DEBUG) System.out.println("\nBTTR.decodeMetadata seg=" + segment + " mdUpto=" + metaDataUpto + " vs termBlockOrd=" + state.termBlockOrd);

  // lazily catch up on metadata decode:
  final int limit = getTermBlockOrd();
  assert limit > 0;

  // We must set/incr state.termCount because
  // postings impl can look at this
  state.termBlockOrd = metaDataUpto;

  // TODO: better API would be "jump straight to term=N"???
  while (metaDataUpto < limit) {

    // TODO: we could make "tiers" of metadata, ie,
    // decode docFreq/totalTF but don't decode postings
    // metadata; this way caller could get
    // docFreq/totalTF w/o paying decode cost for
    // postings

    // TODO: if docFreq were bulk decoded we could
    // just skipN here:
    state.docFreq = statsReader.readVInt();
    //if (DEBUG) System.out.println("    dF=" + state.docFreq);
    if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
      // totalTermFreq is stored as a delta on top of docFreq:
      state.totalTermFreq = state.docFreq + statsReader.readVLong();
      //if (DEBUG) System.out.println("    totTF=" + state.totalTermFreq);
    }

    postingsReader.nextTerm(fieldInfo, state);
    metaDataUpto++;
    state.termBlockOrd++;
  }
}
| |
| // Used only by assert |
| private boolean prefixMatches(BytesRef target) { |
| for(int bytePos=0;bytePos<prefix;bytePos++) { |
| if (target.bytes[target.offset + bytePos] != term.bytes[bytePos]) { |
| return false; |
| } |
| } |
| |
| return true; |
| } |
| |
// Scans to sub-block that has this target fp; only
// called by next(); NOTE: does not set
// startBytePos/suffix as a side effect
public void scanToSubBlock(long subFP) {
  assert !isLeafBlock;
  //if (DEBUG) System.out.println("  scanToSubBlock fp=" + fp + " subFP=" + subFP + " entCount=" + entCount + " lastSubFP=" + lastSubFP);
  //assert nextEnt == 0;
  if (lastSubFP == subFP) {
    //if (DEBUG) System.out.println("    already positioned");
    return;
  }
  // Sub-block fps are encoded as backwards deltas, so the target must
  // precede this block in the file:
  assert subFP < fp : "fp=" + fp + " subFP=" + subFP;
  final long targetSubCode = fp - subFP;
  //if (DEBUG) System.out.println("    targetSubCode=" + targetSubCode);
  while(true) {
    assert nextEnt < entCount;
    nextEnt++;
    final int code = suffixesReader.readVInt();
    // Skip the suffix bytes; we only care about sub-block deltas here:
    suffixesReader.skipBytes(isLeafBlock ? code : code >>> 1);
    //if (DEBUG) System.out.println("      " + nextEnt + " (of " + entCount + ") ent isSubBlock=" + ((code&1)==1));
    if ((code & 1) != 0) {
      final long subCode = suffixesReader.readVLong();
      //if (DEBUG) System.out.println("      subCode=" + subCode);
      if (targetSubCode == subCode) {
        //if (DEBUG) System.out.println("        match!");
        lastSubFP = subFP;
        return;
      }
    } else {
      // Skipped over a real term; keep the lazy-metadata count in sync:
      state.termBlockOrd++;
    }
  }
}
| |
| // NOTE: sets startBytePos/suffix as a side effect |
| public SeekStatus scanToTerm(BytesRef target, boolean exactOnly) throws IOException { |
| return isLeafBlock ? scanToTermLeaf(target, exactOnly) : scanToTermNonLeaf(target, exactOnly); |
| } |
| |
// Position in suffixBytes where the current entry's suffix begins
// (set as a side effect by the scan/next methods):
private int startBytePos;
// Length of the current entry's suffix:
private int suffix;
// For sub-block entries: the backwards fp delta (fp - subBlockFP):
private long subCode;
| |
// Target's prefix matches this block's prefix; we
// scan the entries to check if the suffix matches.  Leaf variant: every
// entry is a term.  Returns FOUND on exact match, NOT_FOUND when we pass
// the target (enum left on the following term), or END when the whole
// block sorts before the target.
public SeekStatus scanToTermLeaf(BytesRef target, boolean exactOnly) throws IOException {

  // if (DEBUG) System.out.println("    scanToTermLeaf: block fp=" + fp + " prefix=" + prefix + " nextEnt=" + nextEnt + " (of " + entCount + ") target=" + brToString(target) + " term=" + brToString(term));

  assert nextEnt != -1;

  // Leaf blocks only hold real terms:
  termExists = true;
  subCode = 0;

  if (nextEnt == entCount) {
    if (exactOnly) {
      fillTerm();
    }
    return SeekStatus.END;
  }

  assert prefixMatches(target);

  // Loop over each entry (term or sub-block) in this block:
  //nextTerm: while(nextEnt < entCount) {
  nextTerm: while (true) {
    nextEnt++;

    suffix = suffixesReader.readVInt();

    // if (DEBUG) {
    //   BytesRef suffixBytesRef = new BytesRef();
    //   suffixBytesRef.bytes = suffixBytes;
    //   suffixBytesRef.offset = suffixesReader.getPosition();
    //   suffixBytesRef.length = suffix;
    //   System.out.println("      cycle: term " + (nextEnt-1) + " (of " + entCount + ") suffix=" + brToString(suffixBytesRef));
    // }

    final int termLen = prefix + suffix;
    startBytePos = suffixesReader.getPosition();
    suffixesReader.skipBytes(suffix);

    final int targetLimit = target.offset + (target.length < termLen ? target.length : termLen);
    int targetPos = target.offset + prefix;

    // Loop over bytes in the suffix, comparing to
    // the target
    int bytePos = startBytePos;
    while(true) {
      final int cmp;
      final boolean stop;
      if (targetPos < targetLimit) {
        cmp = (suffixBytes[bytePos++]&0xFF) - (target.bytes[targetPos++]&0xFF);
        stop = false;
      } else {
        // Shared bytes exhausted; the shorter string sorts first:
        assert targetPos == targetLimit;
        cmp = termLen - target.length;
        stop = true;
      }

      if (cmp < 0) {
        // Current entry is still before the target;
        // keep scanning

        if (nextEnt == entCount) {
          if (exactOnly) {
            fillTerm();
          }
          // We are done scanning this block
          break nextTerm;
        } else {
          continue nextTerm;
        }
      } else if (cmp > 0) {

        // Done!  Current entry is after target --
        // return NOT_FOUND:
        fillTerm();

        if (!exactOnly && !termExists) {
          // We are on a sub-block, and caller wants
          // us to position to the next term after
          // the target, so we must recurse into the
          // sub-frame(s):
          currentFrame = pushFrame(null, currentFrame.lastSubFP, termLen);
          currentFrame.loadBlock();
          while (currentFrame.next()) {
            currentFrame = pushFrame(null, currentFrame.lastSubFP, term.length);
            currentFrame.loadBlock();
          }
        }

        //if (DEBUG) System.out.println("        not found");
        return SeekStatus.NOT_FOUND;
      } else if (stop) {
        // Exact match!

        // This cannot be a sub-block because we
        // would have followed the index to this
        // sub-block from the start:

        assert termExists;
        fillTerm();
        //if (DEBUG) System.out.println("        found!");
        return SeekStatus.FOUND;
      }
    }
  }

  // It is possible (and OK) that terms index pointed us
  // at this block, but, we scanned the entire block and
  // did not find the term to position to.  This happens
  // when the target is after the last term in the block
  // (but, before the next term in the index).  EG
  // target could be foozzz, and terms index pointed us
  // to the foo* block, but the last term in this block
  // was fooz (and, eg, first term in the next block will
  // bee fop).
  //if (DEBUG) System.out.println("      block end");
  if (exactOnly) {
    fillTerm();
  }

  // TODO: not consistent that in the
  // not-exact case we don't next() into the next
  // frame here
  return SeekStatus.END;
}
| |
    // Target's prefix matches this block's prefix; we
    // scan the entries to check if the suffix matches.
    // Unlike the leaf variant, entries here may be terms or
    // pointers to sub-blocks, distinguished by the low bit of
    // the per-entry vInt code.
    public SeekStatus scanToTermNonLeaf(BytesRef target, boolean exactOnly) throws IOException {

      //if (DEBUG) System.out.println("    scanToTermNonLeaf: block fp=" + fp + " prefix=" + prefix + " nextEnt=" + nextEnt + " (of " + entCount + ") target=" + brToString(target) + " term=" + brToString(term));

      // Frame must already be loaded (nextEnt is only -1 before the block is read).
      assert nextEnt != -1;

      if (nextEnt == entCount) {
        // We are already past the last entry of this block.
        if (exactOnly) {
          fillTerm();
          // The last-decoded entry is a real term only when subCode == 0
          // (a sub-block entry would have left its fp delta in subCode).
          termExists = subCode == 0;
        }
        return SeekStatus.END;
      }

      assert prefixMatches(target);

      // Loop over each entry (term or sub-block) in this block:
      //nextTerm: while(nextEnt < entCount) {
      nextTerm: while (true) {
        nextEnt++;

        // Low bit of code flags the entry kind (0 = term, 1 = sub-block);
        // remaining bits are the suffix length in bytes.
        final int code = suffixesReader.readVInt();
        suffix = code >>> 1;
        // if (DEBUG) {
        //   BytesRef suffixBytesRef = new BytesRef();
        //   suffixBytesRef.bytes = suffixBytes;
        //   suffixBytesRef.offset = suffixesReader.getPosition();
        //   suffixBytesRef.length = suffix;
        //   System.out.println("      cycle: " + ((code&1)==1 ? "sub-block" : "term") + " " + (nextEnt-1) + " (of " + entCount + ") suffix=" + brToString(suffixBytesRef));
        // }

        termExists = (code & 1) == 0;
        final int termLen = prefix + suffix;
        // Remember where this entry's suffix bytes begin so fillTerm()
        // can later copy them into term.
        startBytePos = suffixesReader.getPosition();
        suffixesReader.skipBytes(suffix);
        if (termExists) {
          // Term entry: advance the count of terms seen in this block.
          state.termBlockOrd++;
          subCode = 0;
        } else {
          // Sub-block entry: the vLong is the backwards fp delta from
          // this block's fp to the sub-block's fp.
          subCode = suffixesReader.readVLong();
          lastSubFP = fp - subCode;
        }

        // Compare only up to the shorter of target and this entry's full length.
        final int targetLimit = target.offset + (target.length < termLen ? target.length : termLen);
        int targetPos = target.offset + prefix;

        // Loop over bytes in the suffix, comparing to
        // the target
        int bytePos = startBytePos;
        while(true) {
          final int cmp;
          final boolean stop;
          if (targetPos < targetLimit) {
            // Unsigned byte comparison (&0xFF) of suffix vs target.
            cmp = (suffixBytes[bytePos++]&0xFF) - (target.bytes[targetPos++]&0xFF);
            stop = false;
          } else {
            // Shared bytes are exhausted; break the tie on length.
            assert targetPos == targetLimit;
            cmp = termLen - target.length;
            stop = true;
          }

          if (cmp < 0) {
            // Current entry is still before the target;
            // keep scanning

            if (nextEnt == entCount) {
              if (exactOnly) {
                fillTerm();
                //termExists = true;
              }
              // We are done scanning this block
              break nextTerm;
            } else {
              continue nextTerm;
            }
          } else if (cmp > 0) {

            // Done!  Current entry is after target --
            // return NOT_FOUND:
            fillTerm();

            if (!exactOnly && !termExists) {
              // We are on a sub-block, and caller wants
              // us to position to the next term after
              // the target, so we must recurse into the
              // sub-frame(s):
              currentFrame = pushFrame(null, currentFrame.lastSubFP, termLen);
              currentFrame.loadBlock();
              // Keep descending while next() lands us on further sub-blocks.
              while (currentFrame.next()) {
                currentFrame = pushFrame(null, currentFrame.lastSubFP, term.length);
                currentFrame.loadBlock();
              }
            }

            //if (DEBUG) System.out.println("        not found");
            return SeekStatus.NOT_FOUND;
          } else if (stop) {
            // Exact match!

            // This cannot be a sub-block because we
            // would have followed the index to this
            // sub-block from the start:

            assert termExists;
            fillTerm();
            //if (DEBUG) System.out.println("        found!");
            return SeekStatus.FOUND;
          }
        }
      }

      // It is possible (and OK) that terms index pointed us
      // at this block, but, we scanned the entire block and
      // did not find the term to position to.  This happens
      // when the target is after the last term in the block
      // (but, before the next term in the index).  EG
      // target could be foozzz, and terms index pointed us
      // to the foo* block, but the last term in this block
      // was fooz (and, eg, first term in the next block will
      // bee fop).
      //if (DEBUG) System.out.println("      block end");
      if (exactOnly) {
        fillTerm();
      }

      // TODO: not consistent that in the
      // not-exact case we don't next() into the next
      // frame here
      return SeekStatus.END;
    }
| |
| private void fillTerm() { |
| final int termLength = prefix + suffix; |
| term.length = prefix + suffix; |
| if (term.bytes.length < termLength) { |
| term.grow(termLength); |
| } |
| System.arraycopy(suffixBytes, startBytePos, term.bytes, prefix, suffix); |
| } |
| } |
| } |
| } |
| } |