| package org.apache.lucene.codecs.lucene42; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import java.io.IOException; |
| |
| import org.apache.lucene.codecs.CodecUtil; |
| import org.apache.lucene.codecs.DocValuesConsumer; |
| import org.apache.lucene.codecs.DocValuesProducer; |
| import org.apache.lucene.codecs.DocValuesFormat; |
| import org.apache.lucene.index.SegmentReadState; |
| import org.apache.lucene.index.SegmentWriteState; |
| import org.apache.lucene.store.DataOutput; |
| import org.apache.lucene.util.fst.FST; |
| import org.apache.lucene.util.packed.MonotonicBlockPackedWriter; |
| import org.apache.lucene.util.packed.PackedInts; |
| import org.apache.lucene.util.packed.BlockPackedWriter; |
| |
| /** |
| * Lucene 4.2 DocValues format. |
| * <p> |
| * Encodes the four per-document value types (Numeric,Binary,Sorted,SortedSet) with eight basic strategies. |
| * <p> |
| * <ul> |
| * <li>Delta-compressed Numerics: per-document integers written in blocks of 4096. For each block |
| * the minimum value is encoded, and each entry is a delta from that minimum value. |
| * <li>Table-compressed Numerics: when the number of unique values is very small, a lookup table |
| * is written instead. Each per-document entry is instead the ordinal to this table. |
| * <li>Uncompressed Numerics: when all values would fit into a single byte, and the |
| * <code>acceptableOverheadRatio</code> would pack values into 8 bits per value anyway, they |
| * are written as absolute values (with no indirection or packing) for performance. |
| * <li>GCD-compressed Numerics: when all numbers share a common divisor, such as dates, the greatest |
| * common divisor (GCD) is computed, and quotients are stored using Delta-compressed Numerics. |
| * <li>Fixed-width Binary: one large concatenated byte[] is written, along with the fixed length. |
| * Each document's value can be addressed by docID*length. |
| * <li>Variable-width Binary: one large concatenated byte[] is written, along with end addresses |
| * for each document. The addresses are written in blocks of 4096, with the current absolute |
| * start for the block, and the average (expected) delta per entry. For each document the |
| * deviation from the delta (actual - expected) is written. |
| * <li>Sorted: an FST mapping deduplicated terms to ordinals is written, along with the per-document |
| * ordinals written using one of the numeric strategies above. |
| * <li>SortedSet: an FST mapping deduplicated terms to ordinals is written, along with the per-document |
| * ordinal list written using one of the binary strategies above. |
| * </ul> |
| * <p> |
| * Files: |
| * <ol> |
| * <li><tt>.dvd</tt>: DocValues data</li> |
| * <li><tt>.dvm</tt>: DocValues metadata</li> |
| * </ol> |
| * <ol> |
| * <li><a name="dvm" id="dvm"></a> |
| * <p>The DocValues metadata or .dvm file.</p> |
| * <p>For each DocValues field, this stores metadata, such as the offset into the |
| * DocValues data (.dvd).</p> |
| * <p>DocValues metadata (.dvm) --> Header,&lt;FieldNumber,EntryType,Entry&gt;<sup>NumFields</sup></p> |
| * <ul> |
| * <li>Entry --> NumericEntry | BinaryEntry | SortedEntry</li> |
| * <li>NumericEntry --> DataOffset,CompressionType,PackedVersion</li> |
| * <li>BinaryEntry --> DataOffset,DataLength,MinLength,MaxLength,PackedVersion?,BlockSize?</li> |
| * <li>SortedEntry --> DataOffset,ValueCount</li> |
| * <li>FieldNumber,PackedVersion,MinLength,MaxLength,BlockSize,ValueCount --> {@link DataOutput#writeVInt VInt}</li> |
| * <li>DataOffset,DataLength --> {@link DataOutput#writeLong Int64}</li> |
| * <li>EntryType,CompressionType --> {@link DataOutput#writeByte Byte}</li> |
| * <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li> |
| * </ul> |
| * <p>Sorted fields have two entries: a SortedEntry with the FST metadata, |
| * and an ordinary NumericEntry for the document-to-ord metadata.</p> |
| * <p>SortedSet fields have two entries: a SortedEntry with the FST metadata, |
| * and an ordinary BinaryEntry for the document-to-ord-list metadata.</p> |
| * <p>FieldNumber of -1 indicates the end of metadata.</p> |
| * <p>EntryType is 0 (NumericEntry), 1 (BinaryEntry), or 2 (SortedEntry)</p> |
| * <p>DataOffset is the pointer to the start of the data in the DocValues data (.dvd)</p> |
| * <p>CompressionType indicates how Numeric values will be compressed: |
| * <ul> |
| * <li>0 --> delta-compressed. For each block of 4096 integers, every integer is delta-encoded |
| * from the minimum value within the block. |
| * <li>1 --> table-compressed. When the number of unique numeric values is small and it would save space, |
| * a lookup table of unique values is written, followed by the ordinal for each document. |
| * <li>2 --> uncompressed. When the <code>acceptableOverheadRatio</code> parameter would upgrade the number |
| * of bits required to 8, and all values fit in a byte, these are written as absolute binary values |
| * for performance. |
| * <li>3 --> gcd-compressed. When all integers share a common divisor, only quotients are stored |
| * using blocks of delta-encoded ints. |
| * </ul> |
| * <p>MinLength and MaxLength represent the min and max byte[] value lengths for Binary values. |
| * If they are equal, then all values are of a fixed size, and can be addressed as DataOffset + (docID * length). |
| * Otherwise, the binary values are of variable size, and packed integer metadata (PackedVersion,BlockSize) |
| * is written for the addresses. |
| * <li><a name="dvd" id="dvd"></a> |
| * <p>The DocValues data or .dvd file.</p> |
| * <p>For each DocValues field, this stores the actual per-document data (the heavy-lifting).</p> |
| * <p>DocValues data (.dvd) --> Header,&lt;NumericData | BinaryData | SortedData&gt;<sup>NumFields</sup></p> |
| * <ul> |
| * <li>NumericData --> DeltaCompressedNumerics | TableCompressedNumerics | UncompressedNumerics | GCDCompressedNumerics</li> |
| * <li>BinaryData --> {@link DataOutput#writeByte Byte}<sup>DataLength</sup>,Addresses</li> |
| * <li>SortedData --> {@link FST FST&lt;Int64&gt;}</li> |
| * <li>DeltaCompressedNumerics --> {@link BlockPackedWriter BlockPackedInts(blockSize=4096)}</li> |
| * <li>TableCompressedNumerics --> TableSize,{@link DataOutput#writeLong Int64}<sup>TableSize</sup>,{@link PackedInts PackedInts}</li> |
| * <li>UncompressedNumerics --> {@link DataOutput#writeByte Byte}<sup>maxdoc</sup></li> |
| * <li>Addresses --> {@link MonotonicBlockPackedWriter MonotonicBlockPackedInts(blockSize=4096)}</li> |
| * </ul> |
| * <p>SortedSet entries store the list of ordinals in their BinaryData as a |
| * sequence of increasing {@link DataOutput#writeVLong vLong}s, delta-encoded.</p> |
| * </ol> |
| */ |
| public final class Lucene42DocValuesFormat extends DocValuesFormat { |
|   // Codec names and file extensions handed to the consumer and producer for |
|   // the data (.dvd) and metadata (.dvm) files. |
|   private static final String DATA_CODEC = "Lucene42DocValuesData"; |
|   private static final String DATA_EXTENSION = "dvd"; |
|   private static final String METADATA_CODEC = "Lucene42DocValuesMetadata"; |
|   private static final String METADATA_EXTENSION = "dvm"; |
| |
|   /** Overhead ratio supplied at construction time and forwarded to the consumer. */ |
|   final float acceptableOverheadRatio; |
| |
|   /** |
|    * Creates a new Lucene42DocValuesFormat that uses the given |
|    * <code>acceptableOverheadRatio</code> for NumericDocValues. |
|    * @param acceptableOverheadRatio compression parameter for numerics. |
|    *        Currently this is only used when the number of unique values is small. |
|    * |
|    * @lucene.experimental |
|    */ |
|   public Lucene42DocValuesFormat(float acceptableOverheadRatio) { |
|     super("Lucene42"); |
|     this.acceptableOverheadRatio = acceptableOverheadRatio; |
|   } |
| |
|   /** |
|    * Convenience constructor; equivalent to |
|    * {@link #Lucene42DocValuesFormat(float) |
|    * Lucene42DocValuesFormat(PackedInts.DEFAULT)}. |
|    */ |
|   public Lucene42DocValuesFormat() { |
|     this(PackedInts.DEFAULT); |
|   } |
| |
|   /** |
|    * Creates the consumer for this format, configured with the data and |
|    * metadata codec names and extensions and this format's overhead ratio. |
|    */ |
|   @Override |
|   public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException { |
|     // The overhead ratio chosen at construction time is passed through unchanged. |
|     final DocValuesConsumer consumer = new Lucene42DocValuesConsumer( |
|         state, |
|         DATA_CODEC, DATA_EXTENSION, |
|         METADATA_CODEC, METADATA_EXTENSION, |
|         acceptableOverheadRatio); |
|     return consumer; |
|   } |
| |
|   /** |
|    * Creates the producer for this format, configured with the same data and |
|    * metadata codec names and extensions that the consumer is given. |
|    */ |
|   @Override |
|   public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException { |
|     final DocValuesProducer producer = new Lucene42DocValuesProducer( |
|         state, |
|         DATA_CODEC, DATA_EXTENSION, |
|         METADATA_CODEC, METADATA_EXTENSION); |
|     return producer; |
|   } |
| } |