| using System; |
| using System.Runtime.CompilerServices; |
| |
| namespace Lucene.Net.Codecs.Lucene42 |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| using PackedInt32s = Lucene.Net.Util.Packed.PackedInt32s; |
| using SegmentReadState = Lucene.Net.Index.SegmentReadState; |
| using SegmentWriteState = Lucene.Net.Index.SegmentWriteState; |
| |
| /// <summary> |
| /// Lucene 4.2 DocValues format. |
| /// <para/> |
| /// Encodes the four per-document value types (Numeric,Binary,Sorted,SortedSet) with eight basic strategies. |
| /// <para/> |
| /// <list type="bullet"> |
| /// <item><description>Delta-compressed Numerics: per-document integers written in blocks of 4096. For each block |
| /// the minimum value is encoded, and each entry is a delta from that minimum value.</description></item> |
| /// <item><description>Table-compressed Numerics: when the number of unique values is very small, a lookup table |
| /// is written instead. Each per-document entry is instead the ordinal to this table.</description></item> |
| /// <item><description>Uncompressed Numerics: when all values would fit into a single byte, and the |
| /// <c>acceptableOverheadRatio</c> would pack values into 8 bits per value anyway, they |
| /// are written as absolute values (with no indirection or packing) for performance.</description></item> |
| /// <item><description>GCD-compressed Numerics: when all numbers share a common divisor, such as dates, the greatest |
| /// common divisor (GCD) is computed, and quotients are stored using Delta-compressed Numerics.</description></item> |
| /// <item><description>Fixed-width Binary: one large concatenated byte[] is written, along with the fixed length. |
| /// Each document's value can be addressed by <c>docID*length</c>.</description></item> |
| /// <item><description>Variable-width Binary: one large concatenated byte[] is written, along with end addresses |
| /// for each document. The addresses are written in blocks of 4096, with the current absolute |
| /// start for the block, and the average (expected) delta per entry. For each document the |
| /// deviation from the delta (actual - expected) is written.</description></item> |
| /// <item><description>Sorted: an FST mapping deduplicated terms to ordinals is written, along with the per-document |
| /// ordinals written using one of the numeric strategies above.</description></item> |
| /// <item><description>SortedSet: an FST mapping deduplicated terms to ordinals is written, along with the per-document |
| /// ordinal list written using one of the binary strategies above.</description></item> |
| /// </list> |
| /// <para/> |
| /// Files: |
| /// <list type="number"> |
| /// <item><description><c>.dvd</c>: DocValues data</description></item> |
| /// <item><description><c>.dvm</c>: DocValues metadata</description></item> |
| /// </list> |
| /// <list type="number"> |
| /// <item><description><a name="dvm" id="dvm"></a> |
| /// <para>The DocValues metadata or .dvm file.</para> |
| /// <para>For each DocValues field, this stores metadata, such as the offset into the |
| /// DocValues data (.dvd).</para> |
| /// <para>DocValues metadata (.dvm) --> Header,&lt;FieldNumber,EntryType,Entry&gt;<sup>NumFields</sup>,Footer</para> |
| /// <list type="bullet"> |
| /// <item><description>Entry --> NumericEntry | BinaryEntry | SortedEntry</description></item> |
| /// <item><description>NumericEntry --> DataOffset,CompressionType,PackedVersion</description></item> |
| /// <item><description>BinaryEntry --> DataOffset,DataLength,MinLength,MaxLength,PackedVersion?,BlockSize?</description></item> |
| /// <item><description>SortedEntry --> DataOffset,ValueCount</description></item> |
| /// <item><description>FieldNumber,PackedVersion,MinLength,MaxLength,BlockSize,ValueCount --> VInt (<see cref="Store.DataOutput.WriteVInt32(int)"/>) </description></item> |
| /// <item><description>DataOffset,DataLength --> Int64 (<see cref="Store.DataOutput.WriteInt64(long)"/>) </description></item> |
| /// <item><description>EntryType,CompressionType --> Byte (<see cref="Store.DataOutput.WriteByte(byte)"/>) </description></item> |
| /// <item><description>Header --> CodecHeader (<see cref="CodecUtil.WriteHeader(Store.DataOutput, string, int)"/>) </description></item> |
| /// <item><description>Footer --> CodecFooter (<see cref="CodecUtil.WriteFooter(Store.IndexOutput)"/>) </description></item> |
| /// </list> |
| /// <para>Sorted fields have two entries: a SortedEntry with the FST metadata, |
| /// and an ordinary NumericEntry for the document-to-ord metadata.</para> |
| /// <para>SortedSet fields have two entries: a SortedEntry with the FST metadata, |
| /// and an ordinary BinaryEntry for the document-to-ord-list metadata.</para> |
| /// <para>FieldNumber of -1 indicates the end of metadata.</para> |
| /// <para>EntryType is 0 (NumericEntry), 1 (BinaryEntry), or 2 (SortedEntry).</para> |
| /// <para>DataOffset is the pointer to the start of the data in the DocValues data (.dvd)</para> |
| /// <para/>CompressionType indicates how Numeric values will be compressed: |
| /// <list type="bullet"> |
| /// <item><description>0 --> delta-compressed. For each block of 4096 integers, every integer is delta-encoded |
| /// from the minimum value within the block.</description></item> |
| /// <item><description>1 --> table-compressed. When the number of unique numeric values is small and it would save space, |
| /// a lookup table of unique values is written, followed by the ordinal for each document.</description></item> |
| /// <item><description>2 --> uncompressed. When the <c>acceptableOverheadRatio</c> parameter would upgrade the number |
| /// of bits required to 8, and all values fit in a byte, these are written as absolute binary values |
| /// for performance.</description></item> |
| /// <item><description>3 --> gcd-compressed. When all integers share a common divisor, only quotients are stored |
| /// using blocks of delta-encoded ints.</description></item> |
| /// </list> |
| /// <para/>MinLength and MaxLength represent the min and max byte[] value lengths for Binary values. |
| /// If they are equal, then all values are of a fixed size, and can be addressed as <c>DataOffset + (docID * length)</c>. |
| /// Otherwise, the binary values are of variable size, and packed integer metadata (PackedVersion,BlockSize) |
| /// is written for the addresses.</description></item> |
| /// <item><description><a name="dvd" id="dvd"></a> |
| /// <para>The DocValues data or .dvd file.</para> |
| /// <para>For each DocValues field, this stores the actual per-document data (the heavy-lifting).</para> |
| /// <para>DocValues data (.dvd) --> Header,&lt;NumericData | BinaryData | SortedData&gt;<sup>NumFields</sup>,Footer</para> |
| /// <list type="bullet"> |
| /// <item><description>NumericData --> DeltaCompressedNumerics | TableCompressedNumerics | UncompressedNumerics | GCDCompressedNumerics</description></item> |
| /// <item><description>BinaryData --> Byte (<see cref="Store.DataOutput.WriteByte(byte)"/>) <sup>DataLength</sup>,Addresses</description></item> |
| /// <item><description>SortedData --> FST&lt;Int64&gt; (<see cref="Util.Fst.FST{T}"/>) </description></item> |
| /// <item><description>DeltaCompressedNumerics --> BlockPackedInts(blockSize=4096) (<see cref="Util.Packed.BlockPackedWriter"/>) </description></item> |
| /// <item><description>TableCompressedNumerics --> TableSize, Int64 (<see cref="Store.DataOutput.WriteInt64(long)"/>) <sup>TableSize</sup>, PackedInts (<see cref="PackedInt32s"/>) </description></item> |
| /// <item><description>UncompressedNumerics --> Byte (<see cref="Store.DataOutput.WriteByte(byte)"/>) <sup>maxdoc</sup></description></item> |
| /// <item><description>Addresses --> MonotonicBlockPackedInts(blockSize=4096) (<see cref="Util.Packed.MonotonicBlockPackedWriter"/>) </description></item> |
| /// <item><description>Footer --> CodecFooter (<see cref="CodecUtil.WriteFooter(Store.IndexOutput)"/>) </description></item> |
| /// </list> |
| /// <para>SortedSet entries store the list of ordinals in their BinaryData as a |
| /// sequence of increasing vLongs (<see cref="Store.DataOutput.WriteVInt64(long)"/>), delta-encoded.</para></description></item> |
| /// </list> |
| /// <para/> |
| /// Limitations: |
| /// <list type="bullet"> |
| /// <item><description> Binary doc values can be at most <see cref="MAX_BINARY_FIELD_LENGTH"/> in length.</description></item> |
| /// </list> |
| /// </summary> |
| [Obsolete("Only for reading old 4.2 segments")] |
| [DocValuesFormatName("Lucene42")] // LUCENENET specific - using DocValuesFormatName attribute to ensure the default name passed from subclasses is the same as this class name |
| public class Lucene42DocValuesFormat : DocValuesFormat |
| { |
|     // Codec names and file extensions passed to the producer for the data and metadata files. |
|     internal const string DATA_CODEC = "Lucene42DocValuesData"; |
|     internal const string DATA_EXTENSION = "dvd"; |
|     internal const string METADATA_CODEC = "Lucene42DocValuesMetadata"; |
|     internal const string METADATA_EXTENSION = "dvm"; |
| |
|     /// <summary> |
|     /// Upper bound on the length of each binary doc values field (32766). |
|     /// </summary> |
|     public static readonly int MAX_BINARY_FIELD_LENGTH = (1 << 15) - 2; |
| |
|     /// <summary> |
|     /// Acceptable overhead ratio supplied at construction time. |
|     /// </summary> |
|     protected readonly float m_acceptableOverheadRatio; |
| |
|     /// <summary> |
|     /// Creates the format with <see cref="PackedInt32s.DEFAULT"/> as the acceptable overhead ratio, |
|     /// by delegating to <see cref="Lucene42DocValuesFormat(float)"/>. |
|     /// </summary> |
|     public Lucene42DocValuesFormat() |
|         : this(PackedInt32s.DEFAULT) |
|     { |
|     } |
| |
|     /// <summary> |
|     /// Creates a new <see cref="Lucene42DocValuesFormat"/> that uses the given |
|     /// <paramref name="acceptableOverheadRatio"/> for <see cref="Index.NumericDocValues"/>. |
|     /// <para/> |
|     /// @lucene.experimental |
|     /// </summary> |
|     /// <param name="acceptableOverheadRatio"> Compression parameter for numerics. |
|     /// Currently this is only used when the number of unique values is small.</param> |
|     public Lucene42DocValuesFormat(float acceptableOverheadRatio) |
|     { |
|         m_acceptableOverheadRatio = acceptableOverheadRatio; |
|     } |
| |
|     /// <summary> |
|     /// Not supported: this format exists only to read old segments. |
|     /// </summary> |
|     /// <param name="state">Ignored; the method throws before using it.</param> |
|     /// <exception cref="NotSupportedException">Always thrown.</exception> |
|     public override DocValuesConsumer FieldsConsumer(SegmentWriteState state) |
|         => throw new NotSupportedException("this codec can only be used for reading"); |
| |
|     /// <summary> |
|     /// Creates a <see cref="Lucene42DocValuesProducer"/> for <paramref name="state"/>, passing the |
|     /// data and metadata codec names and file extensions declared on this class. |
|     /// </summary> |
|     /// <param name="state">Segment read state handed to the producer.</param> |
|     /// <returns>A new <see cref="Lucene42DocValuesProducer"/>.</returns> |
|     [MethodImpl(MethodImplOptions.AggressiveInlining)] |
|     public override DocValuesProducer FieldsProducer(SegmentReadState state) |
|         => new Lucene42DocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION); |
| } |
| } |