| using Lucene.Net.Diagnostics; |
| using System; |
| |
| namespace Lucene.Net.Codecs.Lucene40 |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| using SegmentReadState = Lucene.Net.Index.SegmentReadState; |
| using SegmentWriteState = Lucene.Net.Index.SegmentWriteState; |
| |
| /// <summary> |
| /// Lucene 4.0 Postings format. |
| /// <para> |
| /// Files: |
| /// <list type="bullet"> |
| /// <item><description><tt>.tim</tt>: <a href="#Termdictionary">Term Dictionary</a></description></item> |
| /// <item><description><tt>.tip</tt>: <a href="#Termindex">Term Index</a></description></item> |
| /// <item><description><tt>.frq</tt>: <a href="#Frequencies">Frequencies</a></description></item> |
| /// <item><description><tt>.prx</tt>: <a href="#Positions">Positions</a></description></item> |
| /// </list> |
| /// </para> |
| /// <para/> |
| /// <a name="Termdictionary" id="Termdictionary"></a> |
| /// <h3>Term Dictionary</h3> |
| /// |
| /// <para>The .tim file contains the list of terms in each |
| /// field along with per-term statistics (such as docfreq) |
| /// and pointers to the frequencies, positions and |
| /// skip data in the .frq and .prx files. |
| /// See <see cref="BlockTreeTermsWriter"/> for more details on the format. |
| /// </para> |
| /// |
| /// <para>NOTE: The term dictionary can plug into different postings implementations: |
| /// the postings writer/reader are actually responsible for encoding |
| /// and decoding the Postings Metadata and Term Metadata sections described here:</para> |
| /// <list type="bullet"> |
| /// <item><description>Postings Metadata --> Header, SkipInterval, MaxSkipLevels, SkipMinimum</description></item> |
| /// <item><description>Term Metadata --> FreqDelta, SkipDelta?, ProxDelta?</description></item> |
| /// <item><description>Header --> CodecHeader (<see cref="CodecUtil.WriteHeader(Store.DataOutput, string, int)"/>) </description></item> |
| /// <item><description>SkipInterval,MaxSkipLevels,SkipMinimum --> Uint32 (<see cref="Store.DataOutput.WriteInt32(int)"/>) </description></item> |
| /// <item><description>SkipDelta,FreqDelta,ProxDelta --> VLong (<see cref="Store.DataOutput.WriteVInt64(long)"/>) </description></item> |
| /// </list> |
| /// <para>Notes:</para> |
| /// <list type="bullet"> |
| /// <item><description>Header is a CodecHeader (<see cref="CodecUtil.WriteHeader(Store.DataOutput, string, int)"/>) storing the version information |
| /// for the postings.</description></item> |
| /// <item><description>SkipInterval is the fraction of TermDocs stored in skip tables. It is used to accelerate |
| /// <see cref="Search.DocIdSetIterator.Advance(int)"/>. Larger values result in smaller indexes, greater |
| /// acceleration, but fewer accelerable cases, while smaller values result in bigger indexes, |
| /// less acceleration (in case of a small value for MaxSkipLevels) and more accelerable cases. |
| /// </description></item> |
| /// <item><description>MaxSkipLevels is the max. number of skip levels stored for each term in the .frq file. A |
| /// low value results in smaller indexes but less acceleration, a larger value results in |
| /// slightly larger indexes but greater acceleration. See format of .frq file for more |
| /// information about skip levels.</description></item> |
| /// <item><description>SkipMinimum is the minimum document frequency a term must have in order to write any |
| /// skip data at all.</description></item> |
| /// <item><description>FreqDelta determines the position of this term's TermFreqs within the .frq |
| /// file. In particular, it is the difference between the position of this term's |
| /// data in that file and the position of the previous term's data (or zero, for |
| /// the first term in the block).</description></item> |
| /// <item><description>ProxDelta determines the position of this term's TermPositions within the |
| /// .prx file. In particular, it is the difference between the position of this |
| /// term's data in that file and the position of the previous term's data (or zero, |
| /// for the first term in the block). For fields that omit position data, this will |
| /// be 0 since prox information is not stored.</description></item> |
| /// <item><description>SkipDelta determines the position of this term's SkipData within the .frq |
| /// file. In particular, it is the number of bytes after TermFreqs that the |
| /// SkipData starts. In other words, it is the length of the TermFreq data. |
| /// SkipDelta is only stored if DocFreq is not smaller than SkipMinimum.</description></item> |
| /// </list> |
| /// <a name="Termindex" id="Termindex"></a> |
| /// <h3>Term Index</h3> |
| /// <para>The .tip file contains an index into the term dictionary, so that it can be |
| /// accessed randomly. See <see cref="BlockTreeTermsWriter"/> for more details on the format.</para> |
| /// <a name="Frequencies" id="Frequencies"></a> |
| /// <h3>Frequencies</h3> |
| /// <para>The .frq file contains the lists of documents which contain each term, along |
| /// with the frequency of the term in that document (except when frequencies are |
| /// omitted: <see cref="Index.IndexOptions.DOCS_ONLY"/>).</para> |
| /// <list type="bullet"> |
| /// <item><description>FreqFile (.frq) --> Header, &lt;TermFreqs, SkipData?&gt; <sup>TermCount</sup></description></item> |
| /// <item><description>Header --> CodecHeader (<see cref="CodecUtil.WriteHeader(Store.DataOutput, string, int)"/>) </description></item> |
| /// <item><description>TermFreqs --> &lt;TermFreq&gt; <sup>DocFreq</sup></description></item> |
| /// <item><description>TermFreq --> DocDelta[, Freq?]</description></item> |
| /// <item><description>SkipData --> &lt;&lt;SkipLevelLength, SkipLevel&gt; |
| /// <sup>NumSkipLevels-1</sup>, SkipLevel&gt; &lt;SkipDatum&gt;</description></item> |
| /// <item><description>SkipLevel --> &lt;SkipDatum&gt; <sup>DocFreq/(SkipInterval^(Level + |
| /// 1))</sup></description></item> |
| /// <item><description>SkipDatum --> |
| /// DocSkip,PayloadLength?,OffsetLength?,FreqSkip,ProxSkip,SkipChildLevelPointer?</description></item> |
| /// <item><description>DocDelta,Freq,DocSkip,PayloadLength,OffsetLength,FreqSkip,ProxSkip --> VInt (<see cref="Store.DataOutput.WriteVInt32(int)"/>) </description></item> |
| /// <item><description>SkipChildLevelPointer --> VLong (<see cref="Store.DataOutput.WriteVInt64(long)"/>) </description></item> |
| /// </list> |
| /// <para>TermFreqs are ordered by term (the term is implicit, from the term dictionary).</para> |
| /// <para>TermFreq entries are ordered by increasing document number.</para> |
| /// <para>DocDelta: if frequencies are indexed, this determines both the document |
| /// number and the frequency. In particular, DocDelta/2 is the difference between |
| /// this document number and the previous document number (or zero when this is the |
| /// first document in a TermFreqs). When DocDelta is odd, the frequency is one. |
| /// When DocDelta is even, the frequency is read as another VInt. If frequencies |
| /// are omitted, DocDelta contains the gap (not multiplied by 2) between document |
| /// numbers and no frequency information is stored.</para> |
| /// <para>For example, the TermFreqs for a term which occurs once in document seven |
| /// and three times in document eleven, with frequencies indexed, would be the |
| /// following sequence of VInts:</para> |
| /// <para>15, 8, 3</para> |
| /// <para>If frequencies were omitted (<see cref="Index.IndexOptions.DOCS_ONLY"/>) it would be this |
| /// sequence of VInts instead:</para> |
| /// <para>7,4</para> |
| /// <para>DocSkip records the document number before every SkipInterval <sup>th</sup> |
| /// document in TermFreqs. If payloads and offsets are disabled for the term's field, then |
| /// DocSkip represents the difference from the previous value in the sequence. If |
| /// payloads and/or offsets are enabled for the term's field, then DocSkip/2 represents the |
| /// difference from the previous value in the sequence. In this case when |
| /// DocSkip is odd, then PayloadLength and/or OffsetLength are stored indicating the length of |
| /// the last payload/offset before the SkipInterval<sup>th</sup> document in TermPositions.</para> |
| /// <para>PayloadLength indicates the length of the last payload.</para> |
| /// <para>OffsetLength indicates the length of the last offset (endOffset-startOffset).</para> |
| /// <para> |
| /// FreqSkip and ProxSkip record the position of every SkipInterval <sup>th</sup> |
| /// entry in FreqFile and ProxFile, respectively. File positions are relative to |
| /// the start of TermFreqs and Positions for the first SkipDatum, and to the |
| /// previous SkipDatum in the sequence for every later one.</para> |
| /// <para>For example, if DocFreq=35 and SkipInterval=16, then there are two SkipData |
| /// entries, containing the 15 <sup>th</sup> and 31 <sup>st</sup> document numbers |
| /// in TermFreqs. The first FreqSkip names the number of bytes after the beginning |
| /// of TermFreqs that the 16 <sup>th</sup> SkipDatum starts, and the second the |
| /// number of bytes after that that the 32 <sup>nd</sup> starts. The first ProxSkip |
| /// names the number of bytes after the beginning of Positions that the 16 |
| /// <sup>th</sup> SkipDatum starts, and the second the number of bytes after that |
| /// that the 32 <sup>nd</sup> starts.</para> |
| /// <para>Each term can have multiple skip levels. The number of skip levels for a |
| /// term is NumSkipLevels = Min(MaxSkipLevels, |
| /// floor(log(DocFreq)/log(SkipInterval))). The number of SkipData entries for a |
| /// skip level is DocFreq/(SkipInterval^(Level + 1)), whereas the lowest skip level |
| /// is Level=0. |
| /// <para/> |
| /// Example: SkipInterval = 4, MaxSkipLevels = 2, DocFreq = 35. Then skip level 0 |
| /// has 8 SkipData entries, containing the 3<sup>rd</sup>, 7<sup>th</sup>, |
| /// 11<sup>th</sup>, 15<sup>th</sup>, 19<sup>th</sup>, 23<sup>rd</sup>, |
| /// 27<sup>th</sup>, and 31<sup>st</sup> document numbers in TermFreqs. Skip level |
| /// 1 has 2 SkipData entries, containing the 15<sup>th</sup> and 31<sup>st</sup> |
| /// document numbers in TermFreqs. |
| /// <para/> |
| /// The SkipData entries on all upper levels > 0 contain a SkipChildLevelPointer |
| /// referencing the corresponding SkipData entry in level-1. In the example, |
| /// entry 15 on level 1 has a pointer to entry 15 on level 0 and entry 31 on level 1 has a |
| /// pointer to entry 31 on level 0. |
| /// </para> |
| /// <a name="Positions" id="Positions"></a> |
| /// <h3>Positions</h3> |
| /// <para>The .prx file contains the lists of positions that each term occurs at |
| /// within documents. Note that fields omitting positional data do not store |
| /// anything into this file, and if all fields in the index omit positional data |
| /// then the .prx file will not exist.</para> |
| /// <list type="bullet"> |
| /// <item><description>ProxFile (.prx) --> Header, &lt;TermPositions&gt; <sup>TermCount</sup></description></item> |
| /// <item><description>Header --> CodecHeader (<see cref="CodecUtil.WriteHeader(Store.DataOutput, string, int)"/>) </description></item> |
| /// <item><description>TermPositions --> &lt;Positions&gt; <sup>DocFreq</sup></description></item> |
| /// <item><description>Positions --> &lt;PositionDelta,PayloadLength?,OffsetDelta?,OffsetLength?,PayloadData?&gt; <sup>Freq</sup></description></item> |
| /// <item><description>PositionDelta,OffsetDelta,OffsetLength,PayloadLength --> VInt (<see cref="Store.DataOutput.WriteVInt32(int)"/>) </description></item> |
| /// <item><description>PayloadData --> byte (<see cref="Store.DataOutput.WriteByte(byte)"/>) <sup>PayloadLength</sup></description></item> |
| /// </list> |
| /// <para>TermPositions are ordered by term (the term is implicit, from the term dictionary).</para> |
| /// <para>Positions entries are ordered by increasing document number (the document |
| /// number is implicit from the .frq file).</para> |
| /// <para>PositionDelta is, if payloads are disabled for the term's field, the |
| /// difference between the position of the current occurrence in the document and |
| /// the previous occurrence (or zero, if this is the first occurrence in this |
| /// document). If payloads are enabled for the term's field, then PositionDelta/2 |
| /// is the difference between the current and the previous position. If payloads |
| /// are enabled and PositionDelta is odd, then PayloadLength is stored, indicating |
| /// the length of the payload at the current term position.</para> |
| /// <para>For example, the TermPositions for a term which occurs as the fourth term in |
| /// one document, and as the fifth and ninth term in a subsequent document, would |
| /// be the following sequence of VInts (payloads disabled):</para> |
| /// <para>4, 5, 4</para> |
| /// <para>PayloadData is metadata associated with the current term position. If |
| /// PayloadLength is stored at the current position, then it indicates the length |
| /// of this payload. If PayloadLength is not stored, then this payload has the same |
| /// length as the payload at the previous position.</para> |
| /// <para>OffsetDelta/2 is the difference between this position's startOffset from the |
| /// previous occurrence (or zero, if this is the first occurrence in this document). |
| /// If OffsetDelta is odd, then the length (endOffset-startOffset) differs from the |
| /// previous occurrence and an OffsetLength follows. Offset data is only written for |
| /// <see cref="Index.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS"/>.</para> |
| /// </summary> |
| // TODO: this class could be created by wrapping |
| // BlockTreeTermsDict around Lucene40PostingsBaseFormat; ie |
| // we should not duplicate the code from that class here: |
[Obsolete("Only for reading old 4.0 segments")]
[PostingsFormatName("Lucene40")] // LUCENENET specific - using PostingsFormatName attribute to ensure the default name passed from subclasses is the same as this class name
public class Lucene40PostingsFormat : PostingsFormat
{
    /// <summary>
    /// File extension of the frequency postings file.
    /// </summary>
    internal const string FREQ_EXTENSION = "frq";

    /// <summary>
    /// File extension of the prox (positions) postings file.
    /// </summary>
    internal const string PROX_EXTENSION = "prx";

    /// <summary>
    /// Smallest number of items (terms or sub-blocks) allowed in a BlockTree block.
    /// </summary>
    protected readonly int m_minBlockSize;

    /// <summary>
    /// Largest number of items (terms or sub-blocks) allowed in a BlockTree block.
    /// </summary>
    protected readonly int m_maxBlockSize;

    /// <summary>
    /// Initializes a <see cref="Lucene40PostingsFormat"/> using the default
    /// minimum and maximum block sizes of <see cref="BlockTreeTermsWriter"/>.
    /// </summary>
    public Lucene40PostingsFormat()
        : this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE)
    {
    }

    /// <summary>
    /// Initializes a <see cref="Lucene40PostingsFormat"/> with explicit block sizes
    /// for the block terms dictionary.
    /// </summary>
    /// <param name="minBlockSize">Minimum items per block; asserted to be greater than 1
    /// when <c>Debugging.AssertsEnabled</c> is set.</param>
    /// <param name="maxBlockSize">Maximum items per block.</param>
    /// <seealso cref="BlockTreeTermsWriter.BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)"/>
    private Lucene40PostingsFormat(int minBlockSize, int maxBlockSize)
    {
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(minBlockSize > 1);
        }

        m_minBlockSize = minBlockSize;
        m_maxBlockSize = maxBlockSize;
    }

    /// <summary>
    /// Not supported: this format exists only to read existing segments.
    /// </summary>
    /// <param name="state">Unused.</param>
    /// <returns>Never returns.</returns>
    /// <exception cref="NotSupportedException">Always thrown.</exception>
    public override FieldsConsumer FieldsConsumer(SegmentWriteState state)
    {
        throw new NotSupportedException("this codec can only be used for reading");
    }

    /// <summary>
    /// Opens a <see cref="Lucene40PostingsReader"/> for the segment described by
    /// <paramref name="state"/> and wraps it in a <see cref="BlockTreeTermsReader"/>.
    /// </summary>
    /// <param name="state">Supplies the directory, field infos, segment info, IO context,
    /// segment suffix and terms index divisor passed to the readers.</param>
    /// <returns>The newly created <see cref="BlockTreeTermsReader"/>.</returns>
    public override FieldsProducer FieldsProducer(SegmentReadState state)
    {
        PostingsReaderBase postingsReader = new Lucene40PostingsReader(
            state.Directory,
            state.FieldInfos,
            state.SegmentInfo,
            state.Context,
            state.SegmentSuffix);

        // Stays true unless the BlockTreeTermsReader constructor completes; in that
        // case the postings reader opened above is disposed before the exception escapes.
        bool disposePostingsReader = true;
        try
        {
            FieldsProducer termsReader = new BlockTreeTermsReader(
                state.Directory,
                state.FieldInfos,
                state.SegmentInfo,
                postingsReader,
                state.Context,
                state.SegmentSuffix,
                state.TermsIndexDivisor);
            disposePostingsReader = false;
            return termsReader;
        }
        finally
        {
            if (disposePostingsReader)
            {
                postingsReader.Dispose();
            }
        }
    }

    /// <summary>
    /// Returns the format name followed by the configured minimum and maximum block sizes.
    /// </summary>
    public override string ToString()
    {
        return string.Concat(
            Name,
            "(minBlockSize=",
            m_minBlockSize.ToString(),
            " maxBlockSize=",
            m_maxBlockSize.ToString(),
            ")");
    }
}
| } |