| package org.apache.lucene.codecs.block; |
| |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import java.io.IOException; |
| |
| import org.apache.lucene.codecs.BlockTreeTermsReader; |
| import org.apache.lucene.codecs.BlockTreeTermsWriter; |
| import org.apache.lucene.codecs.CodecUtil; |
| import org.apache.lucene.codecs.FieldsConsumer; |
| import org.apache.lucene.codecs.FieldsProducer; |
| import org.apache.lucene.codecs.MultiLevelSkipListWriter; |
| import org.apache.lucene.codecs.PostingsFormat; |
| import org.apache.lucene.codecs.PostingsReaderBase; |
| import org.apache.lucene.codecs.PostingsWriterBase; |
| import org.apache.lucene.index.DocsEnum; |
| import org.apache.lucene.index.FieldInfo.IndexOptions; |
| import org.apache.lucene.index.SegmentReadState; |
| import org.apache.lucene.index.SegmentWriteState; |
| import org.apache.lucene.store.DataOutput; |
| import org.apache.lucene.util.IOUtils; |
| import org.apache.lucene.util.packed.PackedInts; |
| |
| /** |
| * Block postings format, which encodes postings in packed int blocks |
| * for faster decode. |
| * |
| * <p><b>NOTE</b>: this format is still experimental and |
| * subject to change without backwards compatibility. |
| * |
| * <p> |
| * Basic idea: |
| * <ul> |
| * <li> |
| * <b>Packed Block and VInt Block</b>: |
| * <p>In packed block, integers are encoded with the same bit width ({@link PackedInts packed format}), |
| * the block size (i.e. number of integers inside block) is fixed. </p> |
| * <p>In VInt block, integers are encoded as {@link DataOutput#writeVInt VInt}, |
| * the block size is variable.</p> |
| * </li> |
| * |
| * <li> |
| * <b>Block structure</b>: |
| * <p>When the postings are long enough, BlockPostingsFormat will try to encode most integer data |
| * as packed blocks.</p> |
| * <p>Take a term with 259 documents as example, the first 256 document ids are encoded as two packed |
| * blocks, while the remaining 3 as one VInt block. </p> |
| * <p>Different kinds of data are always encoded separately into different packed blocks, but may |
| * possibly be encoded into the same VInt block. </p> |
| * <p>This strategy is applied to pairs: |
| * <document number, frequency>, |
| * <position, payload length>, |
| * <position, offset start, offset length>, and |
| * <position, payload length, offsetstart, offset length>.</p> |
| * </li> |
| * |
| * <li> |
| * <b>Skipper setting</b>: |
| * <p>The structure of skip table is quite similar to Lucene40PostingsFormat. Skip interval is the |
| * same as block size, and each skip entry points to the beginning of each block. However, for |
| * the first block, skip data is omitted.</p> |
| * </li> |
| * |
| * <li> |
| * <b>Positions, Payloads, and Offsets</b>: |
| * <p>A position is an integer indicating where the term occurs at within one document. |
| * A payload is a blob of metadata associated with current position. |
| * An offset is a pair of integers indicating the tokenized start/end offsets for given term |
| * in current position. </p> |
| * <p>When payloads and offsets are not omitted, numPositions==numPayloads==numOffsets (assuming a |
| * null payload contributes one count). As mentioned in block structure, it is possible to encode |
| * these three either combined or separately. |
| * <p>For all the cases, payloads and offsets are stored together. When encoded as packed block, |
| * position data is separated out as .pos, while payloads and offsets are encoded in .pay (payload |
| * metadata will also be stored directly in .pay). When encoded as VInt block, all these three are |
| * stored in .pos (as is the payload metadata).</p> |
| * <p>With this strategy, the majority of payload and offset data will be outside .pos file. |
| * So for queries that require only position data, running on a full index with payloads and offsets, |
| * this reduces disk pre-fetches.</p> |
| * </li> |
| * </ul> |
| * </p> |
| * |
| * <p> |
| * Files and detailed format: |
| * <ul> |
| * <li><tt>.tim</tt>: <a href="#Termdictionary">Term Dictionary</a></li> |
| * <li><tt>.tip</tt>: <a href="#Termindex">Term Index</a></li> |
| * <li><tt>.doc</tt>: <a href="#Frequencies">Frequencies and Skip Data</a></li> |
| * <li><tt>.pos</tt>: <a href="#Positions">Positions</a></li> |
| * <li><tt>.pay</tt>: <a href="#Payloads">Payloads and Offsets</a></li> |
| * </ul> |
| * </p> |
| * |
| * <a name="Termdictionary" id="Termdictionary"></a> |
| * <dl> |
| * <dd> |
| * <b>Term Dictionary</b> |
| * |
| * <p>The .tim file format is quite similar to Lucene40PostingsFormat, |
| * with minor difference in MetadataBlock</p> |
| * |
| * <ul> |
| * <!-- TODO: expand on this, its not really correct and doesnt explain sub-blocks etc --> |
| * <li>TermDictionary(.tim) --> Header, DirOffset, PostingsHeader, PackedBlockSize, |
| * <Block><sup>NumBlocks</sup>, FieldSummary</li> |
| * <li>Block --> SuffixBlock, StatsBlock, MetadataBlock</li> |
| * <li>SuffixBlock --> EntryCount, SuffixLength, {@link DataOutput#writeByte byte}<sup>SuffixLength</sup></li> |
| * <li>StatsBlock --> StatsLength, <DocFreq, TotalTermFreq><sup>EntryCount</sup></li> |
| * <li>MetadataBlock --> MetaLength, <DocFPDelta, |
| * <PosFPDelta, PosVIntBlockFPDelta?, PayFPDelta?>?, |
| * SkipFPDelta?><sup>EntryCount</sup></li> |
| * <li>FieldSummary --> NumFields, <FieldNumber, NumTerms, RootCodeLength, |
| * {@link DataOutput#writeByte byte}<sup>RootCodeLength</sup>, SumDocFreq, DocCount> |
| * <sup>NumFields</sup></li> |
| * <li>Header, PostingsHeader --> {@link CodecUtil#writeHeader CodecHeader}</li> |
| * <li>DirOffset --> {@link DataOutput#writeLong Uint64}</li> |
| * <li>PackedBlockSize, EntryCount, SuffixLength, StatsLength, DocFreq, MetaLength, |
| * PosVIntBlockFPDelta, SkipFPDelta, NumFields, FieldNumber, RootCodeLength, DocCount --> |
| * {@link DataOutput#writeVInt VInt}</li> |
| * <li>TotalTermFreq, DocFPDelta, PosFPDelta, PayFPDelta, NumTerms, SumTotalTermFreq, SumDocFreq --> |
| * {@link DataOutput#writeVLong VLong}</li> |
| * </ul> |
| * <p>Notes:</p> |
| * <ul> |
| * <li>Only MetadataBlock is explained here; the other fields are described in |
| * <a href="../lucene40/Lucene40PostingsFormat.html#Termdictionary">Lucene40PostingsFormat:TermDictionary</a> |
| * </li> |
| * <li>PackedBlockSize is the fixed block size for packed blocks. In a packed block, the bit width is |
| * determined by the largest integer. A smaller block size results in smaller variance among the widths |
| * of integers, hence smaller indexes. A larger block size results in more efficient bulk i/o, hence |
| * better acceleration. This value should always be a multiple of 64, currently fixed as 128 as |
| * a tradeoff. It is also the skip interval used to accelerate {@link DocsEnum#advance(int)}.</li> |
| * <li>DocFPDelta determines the position of this term's TermFreqs within the .doc file. |
| * In particular, it is the difference of file offset between this term's |
| * data and previous term's data (or zero, for the first term in the block). On disk it is |
| * stored as the difference from previous value in sequence. </li> |
| * <li>PosFPDelta determines the position of this term's TermPositions within the .pos file. |
| * While PayFPDelta determines the position of this term's <TermPayloads, TermOffsets?> within |
| * the .pay file. Similar to DocFPDelta, it is the difference between two file positions (or |
| * neglected, for fields that omit payloads and offsets).</li> |
| * <li>PosVIntBlockFPDelta determines the position of this term's last TermPosition in last pos packed |
| * block within the .pos file. It is a synonym for PayVIntBlockFPDelta or OffsetVIntBlockFPDelta. |
| * This is actually used to indicate whether it is necessary to load following |
| * payloads and offsets from .pos instead of .pay. Every time a new block of positions is to be |
| * loaded, the PostingsReader will use this value to check whether the current block is in packed format |
| * or VInt. When in packed format, payloads and offsets are fetched from .pay, otherwise from .pos. |
| * (This value is neglected when the total number of positions, i.e. totalTermFreq, is less than or equal |
| * to PackedBlockSize.)</li> |
| * <li>SkipFPDelta determines the position of this term's SkipData within the .doc |
| * file. In particular, it is the length of the TermFreq data. |
| * SkipFPDelta is only stored if DocFreq is not smaller than SkipMinimum |
| * (i.e. 8 in BlockPostingsFormat).</li> |
| * </ul> |
| * </dd> |
| * </dl> |
| * |
| * <a name="Termindex" id="Termindex"></a> |
| * <dl> |
| * <dd> |
| * <b>Term Index</b> |
| * <p>The .tip file format is described in |
| * <a href="../lucene40/Lucene40PostingsFormat.html#Termindex">Lucene40PostingsFormat:TermIndex</a></p> |
| * </dd> |
| * </dl> |
| * |
| * |
| * <a name="Frequencies" id="Frequencies"></a> |
| * <dl> |
| * <dd> |
| * <b>Frequencies and Skip Data</b> |
| * |
| * <p>The .doc file contains the lists of documents which contain each term, along |
| * with the frequency of the term in that document (except when frequencies are |
| * omitted: {@link IndexOptions#DOCS_ONLY}). It also saves skip data to the beginning of |
| * each packed or VInt block, when the length of document list is larger than packed block size.</p> |
| * |
| * <ul> |
| * <li>docFile(.doc) --> Header, <TermFreqs, SkipData?><sup>TermCount</sup></li> |
| * <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li> |
| * <li>TermFreqs --> <PackedBlock> <sup>PackedDocBlockNum</sup>, |
| * VIntBlock? </li> |
| * <li>PackedBlock --> PackedDocDeltaBlock, PackedFreqBlock?</li> |
| * <li>VIntBlock --> <DocDelta[, Freq?]><sup>DocFreq-PackedBlockSize*PackedDocBlockNum</sup></li> |
| * <li>SkipData --> <<SkipLevelLength, SkipLevel> |
| * <sup>NumSkipLevels-1</sup>, SkipLevel>, SkipDatum?</li> |
| * <li>SkipLevel --> <SkipDatum> <sup>TrimmedDocFreq/(PackedBlockSize^(Level + 1))</sup></li> |
| * <li>SkipDatum --> DocSkip, DocFPSkip, <PosFPSkip, PosBlockOffset, PayLength?, |
| * OffsetStart?, PayFPSkip?>?, SkipChildLevelPointer?</li> |
| * <li>PackedDocDeltaBlock, PackedFreqBlock --> {@link PackedInts PackedInts}</li> |
| * <li>DocDelta, Freq, DocSkip, DocFPSkip, PosFPSkip, PosBlockOffset, PayLength, OffsetStart, PayFPSkip |
| * --> |
| * {@link DataOutput#writeVInt VInt}</li> |
| * <li>SkipChildLevelPointer --> {@link DataOutput#writeVLong VLong}</li> |
| * </ul> |
| * <p>Notes:</p> |
| * <ul> |
| * <li>PackedDocDeltaBlock is theoretically generated from two steps: |
| * <ol> |
| * <li>Calculate the difference between each document number and previous one, |
| * and get a d-gaps list (for the first document, use absolute value); </li> |
| * <li>For those d-gaps from first one to PackedDocBlockNum*PackedBlockSize<sup>th</sup>, |
| * separately encode as packed blocks.</li> |
| * </ol> |
| * If frequencies are not omitted, PackedFreqBlock will be generated without d-gap step. |
| * </li> |
| * <li>VIntBlock stores remaining d-gaps (along with frequencies when possible) with a format |
| * mentioned in |
| * <a href="../lucene40/Lucene40PostingsFormat.html#Frequencies">Lucene40PostingsFormat:Frequencies</a> |
| * </li> |
| * <li>PackedDocBlockNum is the number of packed blocks for current term's docids or frequencies. |
| * In particular, PackedDocBlockNum = floor(DocFreq/PackedBlockSize) </li> |
| * <li>TrimmedDocFreq = DocFreq % PackedBlockSize == 0 ? DocFreq - 1 : DocFreq. |
| * We use this trick since the definition of skip entry is a little different from base interface. |
| * In {@link MultiLevelSkipListWriter}, skip data is assumed to be saved for |
| * skipInterval<sup>th</sup>, 2*skipInterval<sup>th</sup> ... posting in the list. However, |
| * in BlockPostingsFormat, the skip data is saved for skipInterval+1<sup>th</sup>, |
| * 2*skipInterval+1<sup>th</sup> ... posting (skipInterval==PackedBlockSize in this case). |
| * When DocFreq is multiple of PackedBlockSize, MultiLevelSkipListWriter will expect one |
| * more skip data than BlockSkipWriter. </li> |
| * <li>SkipDatum is the metadata of one skip entry. |
| * For the first block (no matter packed or VInt), it is omitted.</li> |
| * <li>DocSkip records the document number of every PackedBlockSize<sup>th</sup> document number in |
| * the postings (i.e. last document number in each packed block). On disk it is stored as the |
| * difference from previous value in the sequence. </li> |
| * <li>DocFPSkip records the file offset of each block except the first, i.e. of the posting at |
| * PackedBlockSize+1<sup>th</sup>, 2*PackedBlockSize+1<sup>th</sup> ... , in DocFile. |
| * The file offsets are relative to the start of current term's TermFreqs. |
| * On disk it is also stored as the difference from previous SkipDatum in the sequence.</li> |
| * <li>Since positions and payloads are also block encoded, the skip should skip to related block first, |
| * then fetch the values according to in-block offset. PosFPSkip and PayFPSkip record the file |
| * offsets of related block in .pos and .pay, respectively. While PosBlockOffset indicates |
| * which value to fetch inside the related block (PayBlockOffset is unnecessary since it is always |
| * equal to PosBlockOffset). Same as DocFPSkip, the file offsets are relative to the start of |
| * current term's TermFreqs, and stored as a difference sequence.</li> |
| * <li>PayLength indicates the length of last payload.</li> |
| * <li>OffsetStart indicates the first value of last offset pair.</li> |
| * </ul> |
| * </dd> |
| * </dl> |
| * |
| * <a name="Positions" id="Positions"></a> |
| * <dl> |
| * <dd> |
| * <b>Positions</b> |
| * <p>The .pos file contains the lists of positions that each term occurs at within documents. It also |
| * sometimes stores part of payloads and offsets for speedup.</p> |
| * <ul> |
| * <li>PosFile(.pos) --> Header, <TermPositions> <sup>TermCount</sup></li> |
| * <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li> |
| * <li>TermPositions --> <PackedPosDeltaBlock> <sup>PackedPosBlockNum</sup>, |
| * VIntBlock? </li> |
| * <li>VIntBlock --> PosVIntCount, <PosDelta[, PayLength?], PayData?, |
| * OffsetStartDelta?, OffsetLength?><sup>PosVIntCount</sup> |
| * <li>PackedPosDeltaBlock --> {@link PackedInts PackedInts}</li> |
| * <li>PosVIntCount, PosDelta, OffsetStartDelta, OffsetLength --> |
| * {@link DataOutput#writeVInt VInt}</li> |
| * <li>PayData --> {@link DataOutput#writeByte byte}<sup>PayLength</sup></li> |
| * </ul> |
| * <p>Notes:</p> |
| * <ul> |
| * <li>TermPositions are ordered by term (terms are implicit, from the term dictionary), and position |
| * values for each term document pair are incremental, and ordered by document number.</li> |
| * <li>PackedPosBlockNum is the number of packed blocks for current term's positions, payloads or offsets. |
| * In particular, PackedPosBlockNum = floor(totalTermFreq/PackedBlockSize) </li> |
| * <li>PosVIntCount is the number of positions encoded as VInt format. In particular, |
| * PosVIntCount = totalTermFreq - PackedPosBlockNum*PackedBlockSize</li> |
| * <li>The procedure how PackedPosDeltaBlock is generated is the same as PackedDocDeltaBlock |
| * in chapter <a href="#Frequencies">Frequencies and Skip Data</a>.</li> |
| * <li>PosDelta is the same as the format mentioned in |
| * <a href="../lucene40/Lucene40PostingsFormat.html#Positions">Lucene40PostingsFormat:Positions</a> |
| * </li> |
| * <li>OffsetStartDelta is the difference between this position's startOffset from the previous |
| * occurrence (or zero, if this is the first occurrence in this document).</li> |
| * <li>OffsetLength indicates the length of the current offset (endOffset-startOffset).</li> |
| * <li>PayData is the blob of metadata associated with current position.</li> |
| * </ul> |
| * </dd> |
| * </dl> |
| * |
| * <a name="Payloads" id="Payloads"></a> |
| * <dl> |
| * <dd> |
| * <b>Payloads and Offsets</b> |
| * <p>The .pay file will store payloads and offsets associated with certain term-document positions. |
| * Some payloads and offsets will be separated out into .pos file, for speedup reason.</p> |
| * <ul> |
| * <li>PayFile(.pay) --> Header, <TermPayloads, TermOffsets?> <sup>TermCount</sup></li> |
| * <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li> |
| * <li>TermPayloads --> <PackedPayLengthBlock, SumPayLength, PayData> <sup>PackedPayBlockNum</sup> |
| * <li>TermOffsets --> <PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock> <sup>PackedPayBlockNum</sup> |
| * <li>PackedPayLengthBlock, PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock --> {@link PackedInts PackedInts}</li> |
| * <li>SumPayLength --> {@link DataOutput#writeVInt VInt}</li> |
| * <li>PayData --> {@link DataOutput#writeByte byte}<sup>SumPayLength</sup></li> |
| * </ul> |
| * <p>Notes:</p> |
| * <ul> |
| * <li>The order of TermPayloads/TermOffsets will be the same as TermPositions, note that part of |
| * payload/offsets are stored in .pos.</li> |
| * <li>The procedure how PackedPayLengthBlock and PackedOffsetLengthBlock are generated is the |
| * same as PackedFreqBlock in chapter <a href="#Frequencies">Frequencies and Skip Data</a>. |
| * While PackedOffsetStartDeltaBlock follows the same procedure as PackedDocDeltaBlock.</li> |
| * <li>PackedPayBlockNum is always equal to PackedPosBlockNum, for the same term. It is also synonym |
| * for PackedOffsetBlockNum.</li> |
| * <li>SumPayLength is the total length of payloads written within one block, should be the sum |
| * of PayLengths in one packed block.</li> |
| * <li>PayLength in PackedPayLengthBlock is the length of each payload, associated with current |
| * position.</li> |
| * </ul> |
| * </dd> |
| * </dl> |
| * </p> |
| * |
| * @lucene.experimental |
| */ |
| |
| public final class BlockPostingsFormat extends PostingsFormat { |
| /** |
| * Filename extension for document number, frequencies, and skip data. |
| * See chapter: <a href="#Frequencies">Frequencies and Skip Data</a> |
| */ |
| public static final String DOC_EXTENSION = "doc"; |
| |
| /** |
| * Filename extension for positions. |
| * See chapter: <a href="#Positions">Positions</a> |
| */ |
| public static final String POS_EXTENSION = "pos"; |
| |
| /** |
| * Filename extension for payloads and offsets. |
| * See chapter: <a href="#Payloads">Payloads and Offsets</a> |
| */ |
| public static final String PAY_EXTENSION = "pay"; |
| |
| private final int minTermBlockSize; |
| private final int maxTermBlockSize; |
| |
| /** |
| * Fixed packed block size, number of integers encoded in |
| * a single packed block. |
| */ |
| // NOTE: must be multiple of 64 because of PackedInts long-aligned encoding/decoding |
| public final static int BLOCK_SIZE = 128; |
| |
| public BlockPostingsFormat() { |
| this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE); |
| } |
| |
| public BlockPostingsFormat(int minTermBlockSize, int maxTermBlockSize) { |
| super("Block"); |
| this.minTermBlockSize = minTermBlockSize; |
| assert minTermBlockSize > 1; |
| this.maxTermBlockSize = maxTermBlockSize; |
| assert minTermBlockSize <= maxTermBlockSize; |
| } |
| |
| @Override |
| public String toString() { |
| return getName() + "(blocksize=" + BLOCK_SIZE + ")"; |
| } |
| |
| @Override |
| public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { |
| PostingsWriterBase postingsWriter = new BlockPostingsWriter(state); |
| |
| boolean success = false; |
| try { |
| FieldsConsumer ret = new BlockTreeTermsWriter(state, |
| postingsWriter, |
| minTermBlockSize, |
| maxTermBlockSize); |
| success = true; |
| return ret; |
| } finally { |
| if (!success) { |
| IOUtils.closeWhileHandlingException(postingsWriter); |
| } |
| } |
| } |
| |
| @Override |
| public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { |
| PostingsReaderBase postingsReader = new BlockPostingsReader(state.dir, |
| state.fieldInfos, |
| state.segmentInfo, |
| state.context, |
| state.segmentSuffix); |
| boolean success = false; |
| try { |
| FieldsProducer ret = new BlockTreeTermsReader(state.dir, |
| state.fieldInfos, |
| state.segmentInfo, |
| postingsReader, |
| state.context, |
| state.segmentSuffix, |
| state.termsIndexDivisor); |
| success = true; |
| return ret; |
| } finally { |
| if (!success) { |
| IOUtils.closeWhileHandlingException(postingsReader); |
| } |
| } |
| } |
| } |