| using System; |
| using System.Runtime.CompilerServices; |
| |
| namespace Lucene.Net.Codecs.Lucene40 |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| using IndexFileNames = Lucene.Net.Index.IndexFileNames; |
| using SegmentReadState = Lucene.Net.Index.SegmentReadState; |
| using SegmentWriteState = Lucene.Net.Index.SegmentWriteState; |
| |
| /// <summary> |
| /// Lucene 4.0 DocValues format. |
| /// <para/> |
| /// Files: |
| /// <list type="bullet"> |
| /// <item><description><c>.dv.cfs</c>: compound container (<see cref="Store.CompoundFileDirectory"/>)</description></item> |
| /// <item><description><c>.dv.cfe</c>: compound entries (<see cref="Store.CompoundFileDirectory"/>)</description></item> |
| /// </list> |
| /// Entries within the compound file: |
| /// <list type="bullet"> |
| /// <item><description><c>&lt;segment&gt;_&lt;fieldNumber&gt;.dat</c>: data values</description></item> |
| /// <item><description><c>&lt;segment&gt;_&lt;fieldNumber&gt;.idx</c>: index into the .dat for DEREF types</description></item> |
| /// </list> |
| /// <para> |
| /// There are many types of <see cref="Index.DocValues"/> with different encodings. |
| /// From the perspective of filenames, all types store their values in <c>.dat</c> |
| /// entries within the compound file. In the case of dereferenced/sorted types, the <c>.dat</c> |
| /// actually contains only the unique values, and an additional <c>.idx</c> file contains |
| /// pointers to these unique values. |
| /// </para> |
| /// Formats: |
| /// <list type="bullet"> |
| /// <item><description><see cref="LegacyDocValuesType.VAR_INTS"/> .dat --> Header, PackedType, MinValue, |
| /// DefaultValue, PackedStream</description></item> |
| /// <item><description><see cref="LegacyDocValuesType.FIXED_INTS_8"/> .dat --> Header, ValueSize, |
| /// Byte (<see cref="Store.DataOutput.WriteByte(byte)"/>) <sup>maxdoc</sup></description></item> |
| /// <item><description><see cref="LegacyDocValuesType.FIXED_INTS_16"/> .dat --> Header, ValueSize, |
| /// Short (<see cref="Store.DataOutput.WriteInt16(short)"/>) <sup>maxdoc</sup></description></item> |
| /// <item><description><see cref="LegacyDocValuesType.FIXED_INTS_32"/> .dat --> Header, ValueSize, |
| /// Int32 (<see cref="Store.DataOutput.WriteInt32(int)"/>) <sup>maxdoc</sup></description></item> |
| /// <item><description><see cref="LegacyDocValuesType.FIXED_INTS_64"/> .dat --> Header, ValueSize, |
| /// Int64 (<see cref="Store.DataOutput.WriteInt64(long)"/>) <sup>maxdoc</sup></description></item> |
| /// <item><description><see cref="LegacyDocValuesType.FLOAT_32"/> .dat --> Header, ValueSize, Float32<sup>maxdoc</sup></description></item> |
| /// <item><description><see cref="LegacyDocValuesType.FLOAT_64"/> .dat --> Header, ValueSize, Float64<sup>maxdoc</sup></description></item> |
| /// <item><description><see cref="LegacyDocValuesType.BYTES_FIXED_STRAIGHT"/> .dat --> Header, ValueSize, |
| /// (Byte (<see cref="Store.DataOutput.WriteByte(byte)"/>) * ValueSize)<sup>maxdoc</sup></description></item> |
| /// <item><description><see cref="LegacyDocValuesType.BYTES_VAR_STRAIGHT"/> .idx --> Header, TotalBytes, Addresses</description></item> |
| /// <item><description><see cref="LegacyDocValuesType.BYTES_VAR_STRAIGHT"/> .dat --> Header, |
| /// (Byte (<see cref="Store.DataOutput.WriteByte(byte)"/>) * <i>variable ValueSize</i>)<sup>maxdoc</sup></description></item> |
| /// <item><description><see cref="LegacyDocValuesType.BYTES_FIXED_DEREF"/> .idx --> Header, NumValues, Addresses</description></item> |
| /// <item><description><see cref="LegacyDocValuesType.BYTES_FIXED_DEREF"/> .dat --> Header, ValueSize, |
| /// (Byte (<see cref="Store.DataOutput.WriteByte(byte)"/>) * ValueSize)<sup>NumValues</sup></description></item> |
| /// <item><description><see cref="LegacyDocValuesType.BYTES_VAR_DEREF"/> .idx --> Header, TotalVarBytes, Addresses</description></item> |
| /// <item><description><see cref="LegacyDocValuesType.BYTES_VAR_DEREF"/> .dat --> Header, |
| /// (LengthPrefix + Byte (<see cref="Store.DataOutput.WriteByte(byte)"/>) * <i>variable ValueSize</i>)<sup>NumValues</sup></description></item> |
| /// <item><description><see cref="LegacyDocValuesType.BYTES_FIXED_SORTED"/> .idx --> Header, NumValues, Ordinals</description></item> |
| /// <item><description><see cref="LegacyDocValuesType.BYTES_FIXED_SORTED"/> .dat --> Header, ValueSize, |
| /// (Byte (<see cref="Store.DataOutput.WriteByte(byte)"/>) * ValueSize)<sup>NumValues</sup></description></item> |
| /// <item><description><see cref="LegacyDocValuesType.BYTES_VAR_SORTED"/> .idx --> Header, TotalVarBytes, Addresses, Ordinals</description></item> |
| /// <item><description><see cref="LegacyDocValuesType.BYTES_VAR_SORTED"/> .dat --> Header, |
| /// (Byte (<see cref="Store.DataOutput.WriteByte(byte)"/>) * <i>variable ValueSize</i>)<sup>NumValues</sup></description></item> |
| /// </list> |
| /// Data Types: |
| /// <list type="bullet"> |
| /// <item><description>Header --> CodecHeader (<see cref="CodecUtil.WriteHeader(Store.DataOutput, string, int)"/>) </description></item> |
| /// <item><description>PackedType --> Byte (<see cref="Store.DataOutput.WriteByte(byte)"/>)</description></item> |
| /// <item><description>MaxAddress, MinValue, DefaultValue --> Int64 (<see cref="Store.DataOutput.WriteInt64(long)"/>) </description></item> |
| /// <item><description>PackedStream, Addresses, Ordinals --> <see cref="Util.Packed.PackedInt32s"/></description></item> |
| /// <item><description>ValueSize, NumValues --> Int32 (<see cref="Store.DataOutput.WriteInt32(int)"/>) </description></item> |
| /// <item><description>Float32 --> 32-bit float encoded with <see cref="J2N.BitConversion.SingleToRawInt32Bits(float)"/> |
| /// then written as Int32 (<see cref="Store.DataOutput.WriteInt32(int)"/>) </description></item> |
| /// <item><description>Float64 --> 64-bit float encoded with <see cref="J2N.BitConversion.DoubleToRawInt64Bits(double)"/> |
| /// then written as Int64 (<see cref="Store.DataOutput.WriteInt64(long)"/>) </description></item> |
| /// <item><description>TotalBytes --> VLong (<see cref="Store.DataOutput.WriteVInt64(long)"/>) </description></item> |
| /// <item><description>TotalVarBytes --> Int64 (<see cref="Store.DataOutput.WriteInt64(long)"/>) </description></item> |
| /// <item><description>LengthPrefix --> Length of the data value as VInt (<see cref="Store.DataOutput.WriteVInt32(int)"/>) (maximum |
| /// of 2 bytes)</description></item> |
| /// </list> |
| /// Notes: |
| /// <list type="bullet"> |
| /// <item><description>PackedType is a 0 when compressed, 1 when the stream is written as 64-bit integers.</description></item> |
| /// <item><description>Addresses stores pointers to the actual byte location (indexed by docid). In the VAR_STRAIGHT |
| /// case, each entry can have a different length, so to determine the length, docid+1 is |
| /// retrieved. A sentinel address is written at the end for the VAR_STRAIGHT case, so the Addresses |
| /// stream contains maxdoc+1 indices. For the deduplicated VAR_DEREF case, each length |
| /// is encoded as a prefix to the data itself as a VInt (<see cref="Store.DataOutput.WriteVInt32(int)"/>) |
| /// (maximum of 2 bytes).</description></item> |
| /// <item><description>Ordinals stores the term ID in sorted order (indexed by docid). In the FIXED_SORTED case, |
| /// the address into the .dat can be computed from the ordinal as |
| /// <c>Header+ValueSize+(ordinal*ValueSize)</c> because the byte length is fixed. |
| /// In the VAR_SORTED case, there is double indirection (docid -> ordinal -> address), but |
| /// an additional sentinel ordinal+address is always written (so there are NumValues+1 ordinals). To |
| /// determine the length, ord+1's address is looked up as well.</description></item> |
| /// <item><description><see cref="LegacyDocValuesType.BYTES_VAR_STRAIGHT"/>, in contrast to other straight |
| /// variants, uses a <c>.idx</c> file to improve lookup performance. In contrast to |
| /// <see cref="LegacyDocValuesType.BYTES_VAR_DEREF"/> it doesn't apply deduplication of the document values. |
| /// </description></item> |
| /// </list> |
| /// <para/> |
| /// Limitations: |
| /// <list type="bullet"> |
| /// <item><description> Binary doc values can be at most <see cref="MAX_BINARY_FIELD_LENGTH"/> in length.</description></item> |
| /// </list> |
| /// </summary> |
| [Obsolete("Only for reading old 4.0 and 4.1 segments")] |
| [DocValuesFormatName("Lucene40")] // LUCENENET specific - using DocValuesFormatName attribute to ensure the default name passed from subclasses is the same as this class name |
| public class Lucene40DocValuesFormat : DocValuesFormat |
| // NOTE: kept for back compat only! This format is not registered in SPI, |
| // does not respect the segment suffix, etc. |
| { |
| /// <summary> |
| /// Upper bound on the length of a single binary doc values field |
| /// (32766, i.e. <c>(1 &lt;&lt; 15) - 2</c>). |
| /// </summary> |
| public static readonly int MAX_BINARY_FIELD_LENGTH = short.MaxValue - 1; |
| |
| /// <summary> |
| /// Creates the format. This is the only constructor. |
| /// </summary> |
| public Lucene40DocValuesFormat() |
| { |
| } |
| |
| /// <summary> |
| /// Not supported: this format exists only so that old segments can be read. |
| /// </summary> |
| /// <param name="state">Ignored.</param> |
| /// <returns>Never returns normally.</returns> |
| /// <exception cref="NotSupportedException">Always thrown.</exception> |
| public override DocValuesConsumer FieldsConsumer(SegmentWriteState state) |
| { |
| throw new NotSupportedException("this codec can only be used for reading"); |
| } |
| |
| /// <summary> |
| /// Opens a <see cref="Lucene40DocValuesReader"/> for the segment described by |
| /// <paramref name="state"/>. |
| /// </summary> |
| /// <param name="state">Read state; its segment name is used to derive the name of the |
| /// "dv" compound file handed to the reader.</param> |
| /// <returns>A newly created <see cref="Lucene40DocValuesReader"/>.</returns> |
| [MethodImpl(MethodImplOptions.AggressiveInlining)] |
| public override DocValuesProducer FieldsProducer(SegmentReadState state) |
| { |
| var segmentName = state.SegmentInfo.Name; |
| // File name is built from the segment name, the "dv" suffix and the compound file extension. |
| var compoundFileName = IndexFileNames.SegmentFileName(segmentName, "dv", IndexFileNames.COMPOUND_FILE_EXTENSION); |
| return new Lucene40DocValuesReader(state, compoundFileName, Lucene40FieldInfosReader.LEGACY_DV_TYPE_KEY); |
| } |
| |
| // |
| // Codec header names and versions, grouped by legacy doc values type. |
| // For every group, *_VERSION_CURRENT is an alias of *_VERSION_START. |
| // |
| |
| // VAR_INTS |
| internal const string VAR_INTS_CODEC_NAME = "PackedInts"; |
| internal const int VAR_INTS_VERSION_START = 0; |
| internal const int VAR_INTS_VERSION_CURRENT = VAR_INTS_VERSION_START; |
| // VAR_INTS stream markers |
| internal const sbyte VAR_INTS_PACKED = 0; |
| internal const sbyte VAR_INTS_FIXED_64 = 1; |
| |
| // FIXED_INTS_8, FIXED_INTS_16, FIXED_INTS_32, FIXED_INTS_64 |
| internal const string INTS_CODEC_NAME = "Ints"; |
| internal const int INTS_VERSION_START = 0; |
| internal const int INTS_VERSION_CURRENT = INTS_VERSION_START; |
| |
| // FLOAT_32, FLOAT_64 |
| internal const string FLOATS_CODEC_NAME = "Floats"; |
| internal const int FLOATS_VERSION_START = 0; |
| internal const int FLOATS_VERSION_CURRENT = FLOATS_VERSION_START; |
| |
| // BYTES_FIXED_STRAIGHT |
| internal const string BYTES_FIXED_STRAIGHT_CODEC_NAME = "FixedStraightBytes"; |
| internal const int BYTES_FIXED_STRAIGHT_VERSION_START = 0; |
| internal const int BYTES_FIXED_STRAIGHT_VERSION_CURRENT = BYTES_FIXED_STRAIGHT_VERSION_START; |
| |
| // BYTES_VAR_STRAIGHT |
| internal const string BYTES_VAR_STRAIGHT_CODEC_NAME_IDX = "VarStraightBytesIdx"; |
| internal const string BYTES_VAR_STRAIGHT_CODEC_NAME_DAT = "VarStraightBytesDat"; |
| internal const int BYTES_VAR_STRAIGHT_VERSION_START = 0; |
| internal const int BYTES_VAR_STRAIGHT_VERSION_CURRENT = BYTES_VAR_STRAIGHT_VERSION_START; |
| |
| // BYTES_FIXED_DEREF |
| internal const string BYTES_FIXED_DEREF_CODEC_NAME_IDX = "FixedDerefBytesIdx"; |
| internal const string BYTES_FIXED_DEREF_CODEC_NAME_DAT = "FixedDerefBytesDat"; |
| internal const int BYTES_FIXED_DEREF_VERSION_START = 0; |
| internal const int BYTES_FIXED_DEREF_VERSION_CURRENT = BYTES_FIXED_DEREF_VERSION_START; |
| |
| // BYTES_VAR_DEREF |
| internal const string BYTES_VAR_DEREF_CODEC_NAME_IDX = "VarDerefBytesIdx"; |
| internal const string BYTES_VAR_DEREF_CODEC_NAME_DAT = "VarDerefBytesDat"; |
| internal const int BYTES_VAR_DEREF_VERSION_START = 0; |
| internal const int BYTES_VAR_DEREF_VERSION_CURRENT = BYTES_VAR_DEREF_VERSION_START; |
| |
| // BYTES_FIXED_SORTED |
| internal const string BYTES_FIXED_SORTED_CODEC_NAME_IDX = "FixedSortedBytesIdx"; |
| internal const string BYTES_FIXED_SORTED_CODEC_NAME_DAT = "FixedSortedBytesDat"; |
| internal const int BYTES_FIXED_SORTED_VERSION_START = 0; |
| internal const int BYTES_FIXED_SORTED_VERSION_CURRENT = BYTES_FIXED_SORTED_VERSION_START; |
| |
| // BYTES_VAR_SORTED |
| // NOTE: the names below intentionally repeat the VAR_DEREF ones - this IS NOT A BUG! |
| // 4.0 actually screwed this up (VAR_SORTED and VAR_DEREF have the same codec header). |
| internal const string BYTES_VAR_SORTED_CODEC_NAME_IDX = "VarDerefBytesIdx"; |
| internal const string BYTES_VAR_SORTED_CODEC_NAME_DAT = "VarDerefBytesDat"; |
| internal const int BYTES_VAR_SORTED_VERSION_START = 0; |
| internal const int BYTES_VAR_SORTED_VERSION_CURRENT = BYTES_VAR_SORTED_VERSION_START; |
| } |
| } |