| package org.apache.lucene.codecs.lucene50; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import java.io.IOException; |
| |
| import org.apache.lucene.codecs.CodecUtil; |
| import org.apache.lucene.codecs.DocValuesConsumer; |
| import org.apache.lucene.codecs.DocValuesProducer; |
| import org.apache.lucene.codecs.DocValuesFormat; |
| import org.apache.lucene.index.SegmentReadState; |
| import org.apache.lucene.index.SegmentWriteState; |
| import org.apache.lucene.index.FieldInfo.DocValuesType; |
| import org.apache.lucene.store.DataOutput; |
| import org.apache.lucene.util.SmallFloat; |
| import org.apache.lucene.util.fst.FST; |
| import org.apache.lucene.util.packed.DirectWriter; |
| import org.apache.lucene.util.packed.MonotonicBlockPackedWriter; |
| |
| /** |
| * Lucene 5.0 DocValues format. |
| * <p> |
| * Encodes the five per-document value types (Numeric,Binary,Sorted,SortedSet,SortedNumeric) with these strategies: |
| * <p> |
| * {@link DocValuesType#NUMERIC NUMERIC}: |
| * <ul> |
| * <li>Delta-compressed: per-document integers written as deltas from the minimum value, |
| * compressed with bitpacking. For more information, see {@link DirectWriter}. |
| * <li>Table-compressed: when the number of unique values is very small (&lt; 256), and |
| * when there are unused "gaps" in the range of values used (such as {@link SmallFloat}), |
| * a lookup table is written instead. Each per-document entry is instead the ordinal |
| * to this table, and those ordinals are compressed with bitpacking ({@link DirectWriter}). |
| * <li>GCD-compressed: when all numbers share a common divisor, such as dates, the greatest |
| * common divisor (GCD) is computed, and quotients are stored using Delta-compressed Numerics. |
| * <li>Monotonic-compressed: when all numbers are monotonically increasing offsets, they are written |
| * as blocks of bitpacked integers, encoding the deviation from the expected delta. |
| * <li>Const-compressed: when there is only one possible non-missing value, only the missing |
| * bitset is encoded. |
| * </ul> |
| * <p> |
| * {@link DocValuesType#BINARY BINARY}: |
| * <ul> |
| * <li>Fixed-width Binary: one large concatenated byte[] is written, along with the fixed length. |
| * Each document's value can be addressed directly with multiplication ({@code docID * length}). |
| * <li>Variable-width Binary: one large concatenated byte[] is written, along with end addresses |
| * for each document. The addresses are written as Monotonic-compressed numerics. |
| * <li>Prefix-compressed Binary: values are written in chunks of 16, with the first value written |
| * completely and other values sharing prefixes. Chunk addresses are written as Monotonic-compressed |
| * numerics. A reverse lookup index is written from a portion of every 1024th term. |
| * </ul> |
| * <p> |
| * {@link DocValuesType#SORTED SORTED}: |
| * <ul> |
| * <li>Sorted: a mapping of ordinals to deduplicated terms is written as Binary, |
| * along with the per-document ordinals written using one of the numeric strategies above. |
| * </ul> |
| * <p> |
| * {@link DocValuesType#SORTED_SET SORTED_SET}: |
| * <ul> |
| * <li>SortedSet: a mapping of ordinals to deduplicated terms is written as Binary, |
| * an ordinal list and per-document index into this list are written using the numeric strategies |
| * above. |
| * </ul> |
| * <p> |
| * {@link DocValuesType#SORTED_NUMERIC SORTED_NUMERIC}: |
| * <ul> |
| * <li>SortedNumeric: a value list and per-document index into this list are written using the numeric |
| * strategies above. |
| * </ul> |
| * <p> |
| * Files: |
| * <ol> |
| * <li><tt>.dvd</tt>: DocValues data</li> |
| * <li><tt>.dvm</tt>: DocValues metadata</li> |
| * </ol> |
| * <ol> |
| * <li><a name="dvm" id="dvm"></a> |
| * <p>The DocValues metadata or .dvm file.</p> |
| * <p>For each DocValues field, this stores metadata, such as the offset into the |
| * DocValues data (.dvd).</p> |
| * <p>DocValues metadata (.dvm) --&gt; Header,&lt;Entry&gt;<sup>NumFields</sup>,Footer</p> |
| * <ul> |
| * <li>Entry --&gt; NumericEntry | BinaryEntry | SortedEntry | SortedSetEntry | SortedNumericEntry</li> |
| * <li>NumericEntry --&gt; GCDNumericEntry | TableNumericEntry | DeltaNumericEntry</li> |
| * <li>GCDNumericEntry --&gt; NumericHeader,MinValue,GCD,BitsPerValue</li> |
| * <li>TableNumericEntry --&gt; NumericHeader,TableSize,{@link DataOutput#writeLong Int64}<sup>TableSize</sup>,BitsPerValue</li> |
| * <li>DeltaNumericEntry --&gt; NumericHeader,MinValue,BitsPerValue</li> |
| * <li>MonotonicNumericEntry --&gt; NumericHeader,PackedVersion,BlockSize</li> |
| * <li>NumericHeader --&gt; FieldNumber,EntryType,NumericType,MissingOffset,DataOffset,Count,EndOffset</li> |
| * <li>BinaryEntry --&gt; FixedBinaryEntry | VariableBinaryEntry | PrefixBinaryEntry</li> |
| * <li>FixedBinaryEntry --&gt; BinaryHeader</li> |
| * <li>VariableBinaryEntry --&gt; BinaryHeader,AddressOffset,PackedVersion,BlockSize</li> |
| * <li>PrefixBinaryEntry --&gt; BinaryHeader,AddressInterval,AddressOffset,PackedVersion,BlockSize</li> |
| * <li>BinaryHeader --&gt; FieldNumber,EntryType,BinaryType,MissingOffset,MinLength,MaxLength,DataOffset</li> |
| * <li>SortedEntry --&gt; FieldNumber,EntryType,BinaryEntry,NumericEntry</li> |
| * <li>SortedSetEntry --&gt; EntryType,BinaryEntry,NumericEntry,NumericEntry</li> |
| * <li>SortedNumericEntry --&gt; EntryType,NumericEntry,NumericEntry</li> |
| * <li>FieldNumber,PackedVersion,MinLength,MaxLength,BlockSize,ValueCount --&gt; {@link DataOutput#writeVInt VInt}</li> |
| * <li>EntryType,CompressionType --&gt; {@link DataOutput#writeByte Byte}</li> |
| * <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li> |
| * <li>MinValue,GCD,MissingOffset,AddressOffset,DataOffset,EndOffset --&gt; {@link DataOutput#writeLong Int64}</li> |
| * <li>TableSize,BitsPerValue --&gt; {@link DataOutput#writeVInt vInt}</li> |
| * <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li> |
| * </ul> |
| * <p>Sorted fields have two entries: a BinaryEntry with the value metadata, |
| * and an ordinary NumericEntry for the document-to-ord metadata.</p> |
| * <p>SortedSet fields have three entries: a BinaryEntry with the value metadata, |
| * and two NumericEntries for the document-to-ord-index and ordinal list metadata.</p> |
| * <p>SortedNumeric fields have two entries: A NumericEntry with the value metadata, |
| * and a numeric entry with the document-to-value index.</p> |
| * <p>FieldNumber of -1 indicates the end of metadata.</p> |
| * <p>EntryType is 0 (NumericEntry), 1 (BinaryEntry), 2 (SortedEntry), 3 (SortedSetEntry) or 4 (SortedNumericEntry).</p> |
| * <p>DataOffset is the pointer to the start of the data in the DocValues data (.dvd)</p> |
| * <p>EndOffset is the pointer to the end of the data in the DocValues data (.dvd)</p> |
| * <p>NumericType indicates how Numeric values will be compressed: |
| * <ul> |
| * <li>0 --&gt; delta-compressed. For each block of 16k integers, every integer is delta-encoded |
| * from the minimum value within the block. |
| * <li>1 --&gt; gcd-compressed. When all integers share a common divisor, only quotients are stored |
| * using blocks of delta-encoded ints. |
| * <li>2 --&gt; table-compressed. When the number of unique numeric values is small and it would save space, |
| * a lookup table of unique values is written, followed by the ordinal for each document. |
| * <li>3 --&gt; monotonic-compressed. Used to implement addressing for BINARY, SORTED_SET, SORTED_NUMERIC. |
| * <li>4 --&gt; const-compressed. Used when all non-missing values are the same. |
| * </ul> |
| * <p>BinaryType indicates how Binary values will be stored: |
| * <ul> |
| * <li>0 --&gt; fixed-width. All values have the same length, addressing by multiplication. |
| * <li>1 --&gt; variable-width. An address for each value is stored. |
| * <li>2 --&gt; prefix-compressed. An address to the start of every interval'th value is stored. |
| * </ul> |
| * <p>MinLength and MaxLength represent the min and max byte[] value lengths for Binary values. |
| * If they are equal, then all values are of a fixed size, and can be addressed as DataOffset + (docID * length). |
| * Otherwise, the binary values are of variable size, and packed integer metadata (PackedVersion,BlockSize) |
| * is written for the addresses. |
| * <p>MissingOffset points to a byte[] containing a bitset of all documents that had a value for the field. |
| * If it is -1, then there are no missing values. If it is -2, all values are missing. |
| * <li><a name="dvd" id="dvd"></a> |
| * <p>The DocValues data or .dvd file.</p> |
| * <p>For each DocValues field, this stores the actual per-document data (the heavy-lifting)</p> |
| * <p>DocValues data (.dvd) --&gt; Header,&lt;NumericData | BinaryData | SortedData&gt;<sup>NumFields</sup>,Footer</p> |
| * <ul> |
| * <li>NumericData --&gt; DeltaCompressedNumerics | TableCompressedNumerics | GCDCompressedNumerics</li> |
| * <li>BinaryData --&gt; {@link DataOutput#writeByte Byte}<sup>DataLength</sup>,Addresses</li> |
| * <li>SortedData --&gt; {@link FST FST&lt;Int64&gt;}</li> |
| * <li>DeltaCompressedNumerics,TableCompressedNumerics,GCDCompressedNumerics --&gt; {@link DirectWriter PackedInts}</li> |
| * <li>Addresses --&gt; {@link MonotonicBlockPackedWriter MonotonicBlockPackedInts(blockSize=16k)}</li> |
| * <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li> |
| * </ul> |
| * </ol> |
| * @lucene.experimental |
| */ |
| public final class Lucene50DocValuesFormat extends DocValuesFormat { |
| |
| /** Sole Constructor */ |
| public Lucene50DocValuesFormat() { |
| super("Lucene50"); |
| } |
| |
| @Override |
| public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException { |
| return new Lucene50DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION); |
| } |
| |
| @Override |
| public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException { |
| return new Lucene50DocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION); |
| } |
| |
| static final String DATA_CODEC = "Lucene50DocValuesData"; |
| static final String DATA_EXTENSION = "dvd"; |
| static final String META_CODEC = "Lucene50DocValuesMetadata"; |
| static final String META_EXTENSION = "dvm"; |
| static final int VERSION_START = 0; |
| static final int VERSION_CURRENT = VERSION_START; |
| static final byte NUMERIC = 0; |
| static final byte BINARY = 1; |
| static final byte SORTED = 2; |
| static final byte SORTED_SET = 3; |
| static final byte SORTED_NUMERIC = 4; |
| } |