/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.uniformsplit;
import java.io.IOException;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.NAME;
import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.TERMS_BLOCKS_EXTENSION;
import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.TERMS_DICTIONARY_EXTENSION;
import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.VERSION_CURRENT;
/**
* A block-based terms index and dictionary that assigns terms to nearly
* uniform length blocks. This technique is called Uniform Split.
* <p>
 * The block construction is driven by two parameters, {@code targetNumBlockLines}
 * and {@code deltaNumLines}: each block contains {@code targetNumBlockLines} +/- {@code deltaNumLines}
 * terms. The algorithm computes the minimal distinguishing prefix (MDP) between
 * each term and the term that precedes it in alphabetical order. Then, in the
 * neighborhood of {@code targetNumBlockLines}, and within {@code deltaNumLines},
 * it selects the term with the smallest MDP. This term becomes the first term of
 * the next block, and its MDP becomes the block key, which is added to the terms
 * dictionary trie.
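 * <p>
 * For example, here is a minimal sketch of the MDP computation (for illustration
 * only, not the actual code used by this class):
 * <pre>{@code
 * // mdp("abaco", "abacus") returns "abacu": the shortest prefix of "abacus"
 * // that distinguishes it from "abaco" (the common prefix plus one character).
 * static String mdp(String previousTerm, String currentTerm) {
 *   int i = 0;
 *   while (i < previousTerm.length() && i < currentTerm.length()
 *       && previousTerm.charAt(i) == currentTerm.charAt(i)) {
 *     i++;
 *   }
 *   return currentTerm.substring(0, Math.min(i + 1, currentTerm.length()));
 * }
 * }</pre>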
* <p>
 * We call the in-memory trie structure the dictionary, and the file on disk
 * containing the block lines the block file; each line holds one term and its
 * corresponding term state details.
* <p>
* When seeking a term, the dictionary seeks the floor leaf of the trie for the
* searched term and jumps to the corresponding file pointer in the block file.
* There, the block terms are scanned for the exact searched term.
* <p>
 * The terms inside a block do not need to share a prefix. Only the block key
 * is used to find the block in the dictionary trie, and since the block key is
 * the locally smallest MDP, the dictionary trie remains very compact.
* <p>
 * An interesting property of the Uniform Split technique is the nearly linear
 * trade-off between memory usage and lookup performance. Decreasing the target
 * block size makes the block scan faster, but since there are more blocks, the
 * dictionary trie uses more memory. Additionally, small blocks are faster to
 * read from disk. A good sweet spot for the target block size is 32 with a
 * delta of 3 (roughly 10%); these are the default values, and they can be
 * tuned in the constructor.
* <p>
* There are additional optimizations:
* <ul>
 * <li>Each block has a header that allows the lookup to jump directly to
 * the middle term with a fast comparison. This halves the linear scan at the
 * cost of a small disk size increase.</li>
 * <li>Each block term is incrementally encoded relative to its previous
 * term. This both reduces the disk size and speeds up the block scan.</li>
 * <li>All term line details (the term states) are written after all the terms.
 * This allows a faster term scan, without the need to decode the term states.</li>
 * <li>All file pointers are base-encoded: their value is relative to the block
 * base file pointer (not to the previous file pointer), which allows the term
 * state of any term to be read independently (see the example after this list).</li>
* </ul>
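 * <p>
 * As an example of the last optimization (illustrative numbers only): if a
 * block starts at file pointer 1000 and the term state of some term in this
 * block is written at file pointer 1050, the stored value is the delta 50
 * relative to the block base, so this term state can be located without
 * decoding the file pointers of the preceding terms.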
* <p>
* Blocks can be compressed or encrypted with an optional {@link BlockEncoder}
* provided in the {@link #UniformSplitTermsWriter(PostingsWriterBase, SegmentWriteState, int, int, BlockEncoder) constructor}.
* <p>
* The {@link UniformSplitPostingsFormat#TERMS_BLOCKS_EXTENSION block file}
* contains all the term blocks for each field sequentially. It also contains
* the fields metadata at the end of the file.
* <p>
* The {@link UniformSplitPostingsFormat#TERMS_DICTIONARY_EXTENSION dictionary file}
* contains the trie ({@link org.apache.lucene.util.fst.FST} bytes) for each
* field sequentially.
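 * <p>
 * A minimal usage sketch (assuming the {@code postingsWriter}, {@code writeState},
 * {@code fields} and {@code normsProducer} variables are provided by the enclosing
 * {@link org.apache.lucene.codecs.PostingsFormat} and the indexing chain):
 * <pre>{@code
 * try (UniformSplitTermsWriter termsWriter = new UniformSplitTermsWriter(
 *     postingsWriter, writeState, 32, 3, null)) { // null: no block encoder
 *   termsWriter.write(fields, normsProducer);
 * }
 * }</pre>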
*
* @lucene.experimental
*/
public class UniformSplitTermsWriter extends FieldsConsumer {
/**
* Default value for the target block size (number of terms per block).
*/
public static final int DEFAULT_TARGET_NUM_BLOCK_LINES = 32;
/**
* Default value for the maximum allowed delta variation of the block size (delta of the number of terms per block).
   * The block size will be [target block size] +/- [allowed delta], that is 32 +/- 3 by default.
*/
public static final int DEFAULT_DELTA_NUM_LINES = (int) (DEFAULT_TARGET_NUM_BLOCK_LINES * 0.1);
/**
* Upper limit of the block size (maximum number of terms per block).
*/
protected static final int MAX_NUM_BLOCK_LINES = 1_000;
protected final FieldInfos fieldInfos;
protected final PostingsWriterBase postingsWriter;
protected final int maxDoc;
protected final int targetNumBlockLines;
protected final int deltaNumLines;
protected final BlockEncoder blockEncoder;
protected final FieldMetadata.Serializer fieldMetadataWriter;
protected final IndexOutput blockOutput;
protected final IndexOutput dictionaryOutput;
/**
* @param blockEncoder Optional block encoder, may be null if none.
* It can be used for compression or encryption.
*/
public UniformSplitTermsWriter(PostingsWriterBase postingsWriter, SegmentWriteState state,
BlockEncoder blockEncoder) throws IOException {
this(postingsWriter, state, DEFAULT_TARGET_NUM_BLOCK_LINES, DEFAULT_DELTA_NUM_LINES, blockEncoder);
}
/**
* @param blockEncoder Optional block encoder, may be null if none.
* It can be used for compression or encryption.
*/
public UniformSplitTermsWriter(PostingsWriterBase postingsWriter, SegmentWriteState state,
int targetNumBlockLines, int deltaNumLines, BlockEncoder blockEncoder) throws IOException {
this(postingsWriter, state, targetNumBlockLines, deltaNumLines, blockEncoder, FieldMetadata.Serializer.INSTANCE,
NAME, VERSION_CURRENT, TERMS_BLOCKS_EXTENSION, TERMS_DICTIONARY_EXTENSION);
}
/**
* @param targetNumBlockLines Target number of lines per block.
* Must be strictly greater than 0.
* The parameters can be pre-validated with {@link #validateSettings(int, int)}.
* There is one term per block line, with its corresponding details ({@link org.apache.lucene.index.TermState}).
* @param deltaNumLines Maximum allowed delta variation of the number of lines per block.
* Must be greater than or equal to 0 and strictly less than {@code targetNumBlockLines}.
   * The block size will be {@code targetNumBlockLines} +/- {@code deltaNumLines}.
* The block size must always be less than or equal to {@link #MAX_NUM_BLOCK_LINES}.
* @param blockEncoder Optional block encoder, may be null if none.
* It can be used for compression or encryption.
*/
protected UniformSplitTermsWriter(PostingsWriterBase postingsWriter, SegmentWriteState state,
int targetNumBlockLines, int deltaNumLines, BlockEncoder blockEncoder, FieldMetadata.Serializer fieldMetadataWriter,
String codecName, int versionCurrent, String termsBlocksExtension, String dictionaryExtension) throws IOException {
validateSettings(targetNumBlockLines, deltaNumLines);
IndexOutput blockOutput = null;
IndexOutput dictionaryOutput = null;
boolean success = false;
try {
this.fieldInfos = state.fieldInfos;
this.postingsWriter = postingsWriter;
this.maxDoc = state.segmentInfo.maxDoc();
this.targetNumBlockLines = targetNumBlockLines;
this.deltaNumLines = deltaNumLines;
this.blockEncoder = blockEncoder;
this.fieldMetadataWriter = fieldMetadataWriter;
String termsName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, termsBlocksExtension);
blockOutput = state.directory.createOutput(termsName, state.context);
CodecUtil.writeIndexHeader(blockOutput, codecName, versionCurrent, state.segmentInfo.getId(), state.segmentSuffix);
String indexName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dictionaryExtension);
dictionaryOutput = state.directory.createOutput(indexName, state.context);
CodecUtil.writeIndexHeader(dictionaryOutput, codecName, versionCurrent, state.segmentInfo.getId(), state.segmentSuffix);
postingsWriter.init(blockOutput, state);
this.blockOutput = blockOutput;
this.dictionaryOutput = dictionaryOutput;
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(blockOutput, dictionaryOutput);
}
}
}
/**
* Validates the {@link #UniformSplitTermsWriter(PostingsWriterBase, SegmentWriteState, int, int, BlockEncoder) constructor}
* settings.
*
* @param targetNumBlockLines Target number of lines per block.
* Must be strictly greater than 0.
* @param deltaNumLines Maximum allowed delta variation of the number of lines per block.
* Must be greater than or equal to 0 and strictly less than {@code targetNumBlockLines}.
* Additionally, {@code targetNumBlockLines} + {@code deltaNumLines} must be less than
* or equal to {@link #MAX_NUM_BLOCK_LINES}.
*/
protected static void validateSettings(int targetNumBlockLines, int deltaNumLines) {
if (targetNumBlockLines <= 0) {
throw new IllegalArgumentException("Invalid negative or nul targetNumBlockLines=" + targetNumBlockLines);
}
if (deltaNumLines < 0) {
throw new IllegalArgumentException("Invalid negative deltaNumLines=" + deltaNumLines);
}
if (deltaNumLines >= targetNumBlockLines) {
throw new IllegalArgumentException("Invalid too large deltaNumLines=" + deltaNumLines
+ ", it must be < targetNumBlockLines=" + targetNumBlockLines);
}
if (targetNumBlockLines + deltaNumLines > UniformSplitTermsWriter.MAX_NUM_BLOCK_LINES) {
throw new IllegalArgumentException("Invalid (targetNumBlockLines + deltaNumLines)="
+ (targetNumBlockLines + deltaNumLines) + ", it must be <= MAX_NUM_BLOCK_LINES="
+ UniformSplitTermsWriter.MAX_NUM_BLOCK_LINES);
}
}
@Override
public void write(Fields fields, NormsProducer normsProducer) throws IOException {
BlockWriter blockWriter = new BlockWriter(blockOutput, targetNumBlockLines, deltaNumLines, blockEncoder);
ByteBuffersDataOutput fieldsOutput = new ByteBuffersDataOutput();
int fieldsNumber = 0;
for (String field : fields) {
Terms terms = fields.terms(field);
if (terms != null) {
TermsEnum termsEnum = terms.iterator();
FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
fieldsNumber += writeFieldTerms(blockWriter, fieldsOutput, termsEnum, fieldInfo, normsProducer);
}
}
writeFieldsMetadata(fieldsNumber, fieldsOutput);
CodecUtil.writeFooter(dictionaryOutput);
}
protected void writeFieldsMetadata(int fieldsNumber, ByteBuffersDataOutput fieldsOutput) throws IOException {
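    // On-disk layout written here: [fields number (VInt)][fields metadata]
    // [fields start file pointer (fixed-length long)][codec footer]. The start
    // pointer is written last, at a fixed offset from the end of the file, so
    // that UniformSplitTermsReader can seek back to the fields metadata.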
long fieldsStartPosition = blockOutput.getFilePointer();
blockOutput.writeVInt(fieldsNumber);
if (blockEncoder == null) {
writeUnencodedFieldsMetadata(fieldsOutput);
} else {
writeEncodedFieldsMetadata(fieldsOutput);
}
// Must be a fixed length. Read by UniformSplitTermsReader when seeking fields metadata.
blockOutput.writeLong(fieldsStartPosition);
CodecUtil.writeFooter(blockOutput);
}
protected void writeUnencodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) throws IOException {
fieldsOutput.copyTo(blockOutput);
}
protected void writeEncodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) throws IOException {
BlockEncoder.WritableBytes encodedBytes = blockEncoder.encode(fieldsOutput.toDataInput(), fieldsOutput.size());
blockOutput.writeVLong(encodedBytes.size());
encodedBytes.writeTo(blockOutput);
}
/**
* @return 1 if the field was written; 0 otherwise.
*/
protected int writeFieldTerms(BlockWriter blockWriter, DataOutput fieldsOutput, TermsEnum termsEnum,
FieldInfo fieldInfo, NormsProducer normsProducer) throws IOException {
FieldMetadata fieldMetadata = new FieldMetadata(fieldInfo, maxDoc);
fieldMetadata.setDictionaryStartFP(dictionaryOutput.getFilePointer());
postingsWriter.setField(fieldInfo);
blockWriter.setField(fieldMetadata);
IndexDictionary.Builder dictionaryBuilder = new FSTDictionary.Builder();
BytesRef lastTerm = null;
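    // Iterate the field's terms in sorted order; for each term that has at
    // least one document, write its postings and add a block line for it.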
while (termsEnum.next() != null) {
BlockTermState blockTermState = writePostingLine(termsEnum, fieldMetadata, normsProducer);
if (blockTermState != null) {
lastTerm = BytesRef.deepCopyOf(termsEnum.term());
blockWriter.addLine(lastTerm, blockTermState, dictionaryBuilder);
}
}
// Flush remaining terms.
blockWriter.finishLastBlock(dictionaryBuilder);
if (fieldMetadata.getNumTerms() > 0) {
fieldMetadata.setLastTerm(lastTerm);
fieldMetadataWriter.write(fieldsOutput, fieldMetadata);
writeDictionary(dictionaryBuilder);
return 1;
}
return 0;
}
/**
* Writes the posting values for the current term in the given {@link TermsEnum}
* and updates the {@link FieldMetadata} stats.
*
* @return the written {@link BlockTermState}; or null if none.
*/
protected BlockTermState writePostingLine(TermsEnum termsEnum, FieldMetadata fieldMetadata, NormsProducer normsProducer) throws IOException {
BlockTermState state = postingsWriter.writeTerm(termsEnum.term(), termsEnum, fieldMetadata.getDocsSeen(), normsProducer);
if (state == null) {
// No doc for this term.
return null;
}
fieldMetadata.updateStats(state);
return state;
}
/**
* Writes the dictionary index (FST) to disk.
*/
protected void writeDictionary(IndexDictionary.Builder dictionaryBuilder) throws IOException {
dictionaryBuilder.build().write(dictionaryOutput, blockEncoder);
}
@Override
public void close() throws IOException {
IOUtils.close(blockOutput, dictionaryOutput, postingsWriter);
}
}