/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.codecs.uniformsplit;

import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.NAME;
import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.TERMS_BLOCKS_EXTENSION;
import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.TERMS_DICTIONARY_EXTENSION;
import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.VERSION_CURRENT;

import java.io.IOException;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;

/**
* A block-based terms index and dictionary that assigns terms to nearly uniform length blocks. This
* technique is called Uniform Split.
*
* <p>The block construction is driven by two parameters, {@code targetNumBlockLines} and {@code
* deltaNumLines}. Each block size (number of terms) is {@code targetNumBlockLines}+-{@code
* deltaNumLines}. The algorithm computes the minimal distinguishing prefix (MDP) between each term
* and its previous term (in alphabetical order). Then, within {@code deltaNumLines} of {@code
* targetNumBlockLines}, it selects the term with the minimal MDP. This term becomes the first term
* of the next block, and its MDP becomes the block key, which is added to the terms dictionary
* trie.
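*
* <p>As a small illustration (a hypothetical term list, not from any real index), consider three
* consecutive terms and their MDP relative to the previous term:
*
* <pre>
* term      previous term   MDP
* client    -               c
* color     client          co
* company   color           com
* </pre>
*
* <p>If the target block boundary falls near {@code company} while {@code color} is within the
* allowed delta, then {@code color} starts the next block because its MDP {@code co} is locally
* the smallest, and {@code co} becomes the block key added to the trie.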
*
* <p>We call the in-memory trie structure the dictionary, and the on-disk file containing the
* block lines the block file, with one term and its corresponding term state details per line.
*
* <p>When seeking a term, the dictionary seeks the floor leaf of the trie for the searched term and
* jumps to the corresponding file pointer in the block file. There, the block terms are scanned for
* the exact searched term.
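*
* <p>For example (a hypothetical lookup), seeking {@code colors} could find the floor block key
* {@code co} in the trie, jump to the corresponding block's file pointer, and scan that block's
* lines until {@code colors} is matched or passed.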
*
* <p>The terms inside a block do not need to share a prefix. Only the block key is used to find
* the block in the dictionary trie, and since each block key is the locally smallest MDP, the
* dictionary trie stays very compact.
*
* <p>An interesting property of the Uniform Split technique is the nearly linear trade-off between
* memory usage and lookup performance. Decreasing the target block size makes the block scan
* faster, but since there are more blocks, the dictionary trie memory usage increases.
* Additionally, small blocks are faster to read from disk. A good sweet spot for the target block
* size is 32 with a delta of 3 (10%), which are the default values. This can be tuned in the
* constructor.
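*
* <p>For example (illustrative values, not benchmarked recommendations), a writer constructed
* with {@code targetNumBlockLines=16} and {@code deltaNumLines=1} trades more dictionary memory
* for faster block scans, while {@code 64} and {@code 6} trades the other way.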
*
* <p>There are additional optimizations:
*
* <ul>
* <li>Each block has a header that allows the lookup to jump directly to the middle term with a
* fast comparison. This halves the linear scan for a small increase in disk size.
* <li>Each block term is incrementally encoded relative to its previous term. This both reduces
* the disk size and speeds up the block scan.
* <li>All term line details (the term states) are written after all the terms. This allows a
* faster term scan without needing to decode the term states.
* <li>All file pointers are base-encoded: their value is relative to the block base file pointer
* (not to the previous file pointer), which allows reading the term state of any term
* independently (see the example below).
* </ul>
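*
* <p>As an example of the base encoding (illustrative numbers, not from the source): with a block
* base file pointer of {@code 1000}, term states located at absolute file pointers {@code 1010}
* and {@code 1052} are stored as the deltas {@code 10} and {@code 52}, so either one can be read
* without decoding the other.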
*
* <p>Blocks can be compressed or encrypted with an optional {@link BlockEncoder} provided in the
* {@link #UniformSplitTermsWriter(PostingsWriterBase, SegmentWriteState, int, int, BlockEncoder)
* constructor}.
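*
* <p>A minimal wiring sketch, roughly mirroring what {@code UniformSplitPostingsFormat} does in
* its {@code fieldsConsumer} method ({@code Lucene84PostingsWriter} is an assumption here; use
* the {@link PostingsWriterBase} matching the codec version):
*
* <pre>{@code
* PostingsWriterBase postingsWriter = new Lucene84PostingsWriter(state);
* boolean success = false;
* try {
*   FieldsConsumer termsWriter =
*       new UniformSplitTermsWriter(postingsWriter, state, null); // null: no BlockEncoder
*   success = true;
*   return termsWriter;
* } finally {
*   if (!success) {
*     IOUtils.closeWhileHandlingException(postingsWriter);
*   }
* }
* }</pre>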
*
* <p>The {@link UniformSplitPostingsFormat#TERMS_BLOCKS_EXTENSION block file} contains all the term
* blocks for each field sequentially. It also contains the fields metadata at the end of the file.
*
* <p>The {@link UniformSplitPostingsFormat#TERMS_DICTIONARY_EXTENSION dictionary file} contains the
* trie ({@link org.apache.lucene.util.fst.FST} bytes) for each field sequentially.
*
* @lucene.experimental
*/
public class UniformSplitTermsWriter extends FieldsConsumer {
/** Default value for the target block size (number of terms per block). */
public static final int DEFAULT_TARGET_NUM_BLOCK_LINES = 32;

/**
* Default value for the maximum allowed delta variation of the block size (delta of the number of
* terms per block). The block size will be [target block size]+-[allowed delta].
*/
public static final int DEFAULT_DELTA_NUM_LINES = (int) (DEFAULT_TARGET_NUM_BLOCK_LINES * 0.1);

/** Upper limit of the block size (maximum number of terms per block). */
protected static final int MAX_NUM_BLOCK_LINES = 1_000;

protected final FieldInfos fieldInfos;
protected final PostingsWriterBase postingsWriter;
protected final int maxDoc;
protected final int targetNumBlockLines;
protected final int deltaNumLines;
protected final BlockEncoder blockEncoder;
protected final FieldMetadata.Serializer fieldMetadataWriter;
protected final IndexOutput blockOutput;
protected final IndexOutput dictionaryOutput;

/**
* @param blockEncoder Optional block encoder, may be null if none. It can be used for compression
* or encryption.
*/
public UniformSplitTermsWriter(
PostingsWriterBase postingsWriter, SegmentWriteState state, BlockEncoder blockEncoder)
throws IOException {
this(
postingsWriter,
state,
DEFAULT_TARGET_NUM_BLOCK_LINES,
DEFAULT_DELTA_NUM_LINES,
blockEncoder);
}

/**
* @param blockEncoder Optional block encoder, may be null if none. It can be used for compression
* or encryption.
*/
public UniformSplitTermsWriter(
PostingsWriterBase postingsWriter,
SegmentWriteState state,
int targetNumBlockLines,
int deltaNumLines,
BlockEncoder blockEncoder)
throws IOException {
this(
postingsWriter,
state,
targetNumBlockLines,
deltaNumLines,
blockEncoder,
FieldMetadata.Serializer.INSTANCE,
NAME,
VERSION_CURRENT,
TERMS_BLOCKS_EXTENSION,
TERMS_DICTIONARY_EXTENSION);
}

/**
* @param targetNumBlockLines Target number of lines per block. Must be strictly greater than 0.
* The parameters can be pre-validated with {@link #validateSettings(int, int)}. There is one
* term per block line, with its corresponding details ({@link
* org.apache.lucene.index.TermState}).
* @param deltaNumLines Maximum allowed delta variation of the number of lines per block. Must be
* greater than or equal to 0 and strictly less than {@code targetNumBlockLines}. The block
* size will be {@code targetNumBlockLines}+-{@code deltaNumLines}. The block size must always
* be less than or equal to {@link #MAX_NUM_BLOCK_LINES}.
* @param blockEncoder Optional block encoder, may be null if none. It can be used for compression
* or encryption.
*/
protected UniformSplitTermsWriter(
PostingsWriterBase postingsWriter,
SegmentWriteState state,
int targetNumBlockLines,
int deltaNumLines,
BlockEncoder blockEncoder,
FieldMetadata.Serializer fieldMetadataWriter,
String codecName,
int versionCurrent,
String termsBlocksExtension,
String dictionaryExtension)
throws IOException {
validateSettings(targetNumBlockLines, deltaNumLines);
IndexOutput blockOutput = null;
IndexOutput dictionaryOutput = null;
boolean success = false;
try {
this.fieldInfos = state.fieldInfos;
this.postingsWriter = postingsWriter;
this.maxDoc = state.segmentInfo.maxDoc();
this.targetNumBlockLines = targetNumBlockLines;
this.deltaNumLines = deltaNumLines;
this.blockEncoder = blockEncoder;
this.fieldMetadataWriter = fieldMetadataWriter;
String termsName =
IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix, termsBlocksExtension);
blockOutput = state.directory.createOutput(termsName, state.context);
CodecUtil.writeIndexHeader(
blockOutput, codecName, versionCurrent, state.segmentInfo.getId(), state.segmentSuffix);
String indexName =
IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix, dictionaryExtension);
dictionaryOutput = state.directory.createOutput(indexName, state.context);
CodecUtil.writeIndexHeader(
dictionaryOutput,
codecName,
versionCurrent,
state.segmentInfo.getId(),
state.segmentSuffix);
postingsWriter.init(blockOutput, state);
this.blockOutput = blockOutput;
this.dictionaryOutput = dictionaryOutput;
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(blockOutput, dictionaryOutput);
}
}
}

/**
* Validates the {@link #UniformSplitTermsWriter(PostingsWriterBase, SegmentWriteState, int, int,
* BlockEncoder) constructor} settings.
*
* @param targetNumBlockLines Target number of lines per block. Must be strictly greater than 0.
* @param deltaNumLines Maximum allowed delta variation of the number of lines per block. Must be
* greater than or equal to 0 and strictly less than {@code targetNumBlockLines}.
* Additionally, {@code targetNumBlockLines} + {@code deltaNumLines} must be less than or
* equal to {@link #MAX_NUM_BLOCK_LINES}.
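*
* <p>For illustration (values chosen for this example, not from the source): {@code
* validateSettings(32, 3)} passes (the defaults); {@code validateSettings(32, 32)} fails because
* the delta must be strictly less than the target; {@code validateSettings(900, 200)} fails
* because the sum exceeds {@link #MAX_NUM_BLOCK_LINES}.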
*/
protected static void validateSettings(int targetNumBlockLines, int deltaNumLines) {
if (targetNumBlockLines <= 0) {
throw new IllegalArgumentException(
"Invalid negative or nul targetNumBlockLines=" + targetNumBlockLines);
}
if (deltaNumLines < 0) {
throw new IllegalArgumentException("Invalid negative deltaNumLines=" + deltaNumLines);
}
if (deltaNumLines >= targetNumBlockLines) {
throw new IllegalArgumentException(
"Invalid too large deltaNumLines="
+ deltaNumLines
+ ", it must be < targetNumBlockLines="
+ targetNumBlockLines);
}
if (targetNumBlockLines + deltaNumLines > UniformSplitTermsWriter.MAX_NUM_BLOCK_LINES) {
throw new IllegalArgumentException(
"Invalid (targetNumBlockLines + deltaNumLines)="
+ (targetNumBlockLines + deltaNumLines)
+ ", it must be <= MAX_NUM_BLOCK_LINES="
+ UniformSplitTermsWriter.MAX_NUM_BLOCK_LINES);
}
}

@Override
public void write(Fields fields, NormsProducer normsProducer) throws IOException {
BlockWriter blockWriter =
new BlockWriter(blockOutput, targetNumBlockLines, deltaNumLines, blockEncoder);
ByteBuffersDataOutput fieldsOutput = new ByteBuffersDataOutput();
int fieldsNumber = 0;
for (String field : fields) {
Terms terms = fields.terms(field);
if (terms != null) {
TermsEnum termsEnum = terms.iterator();
FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
fieldsNumber +=
writeFieldTerms(blockWriter, fieldsOutput, termsEnum, fieldInfo, normsProducer);
}
}
writeFieldsMetadata(fieldsNumber, fieldsOutput);
CodecUtil.writeFooter(dictionaryOutput);
}

protected void writeFieldsMetadata(int fieldsNumber, ByteBuffersDataOutput fieldsOutput)
throws IOException {
long fieldsStartPosition = blockOutput.getFilePointer();
blockOutput.writeVInt(fieldsNumber);
if (blockEncoder == null) {
writeUnencodedFieldsMetadata(fieldsOutput);
} else {
writeEncodedFieldsMetadata(fieldsOutput);
}
// Must be a fixed length. Read by UniformSplitTermsReader when seeking fields metadata.
blockOutput.writeLong(fieldsStartPosition);
CodecUtil.writeFooter(blockOutput);
}

protected void writeUnencodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput)
throws IOException {
fieldsOutput.copyTo(blockOutput);
}

protected void writeEncodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) throws IOException {
BlockEncoder.WritableBytes encodedBytes =
blockEncoder.encode(fieldsOutput.toDataInput(), fieldsOutput.size());
blockOutput.writeVLong(encodedBytes.size());
encodedBytes.writeTo(blockOutput);
}

/** @return 1 if the field was written; 0 otherwise. */
protected int writeFieldTerms(
BlockWriter blockWriter,
DataOutput fieldsOutput,
TermsEnum termsEnum,
FieldInfo fieldInfo,
NormsProducer normsProducer)
throws IOException {
FieldMetadata fieldMetadata = new FieldMetadata(fieldInfo, maxDoc);
fieldMetadata.setDictionaryStartFP(dictionaryOutput.getFilePointer());
postingsWriter.setField(fieldInfo);
blockWriter.setField(fieldMetadata);
IndexDictionary.Builder dictionaryBuilder = new FSTDictionary.Builder();
BytesRef lastTerm = null;
while (termsEnum.next() != null) {
BlockTermState blockTermState = writePostingLine(termsEnum, fieldMetadata, normsProducer);
if (blockTermState != null) {
// The TermsEnum may reuse its term BytesRef, so keep a deep copy for setLastTerm() below.
lastTerm = BytesRef.deepCopyOf(termsEnum.term());
blockWriter.addLine(lastTerm, blockTermState, dictionaryBuilder);
}
}
// Flush remaining terms.
blockWriter.finishLastBlock(dictionaryBuilder);
if (fieldMetadata.getNumTerms() > 0) {
fieldMetadata.setLastTerm(lastTerm);
fieldMetadataWriter.write(fieldsOutput, fieldMetadata);
writeDictionary(dictionaryBuilder);
return 1;
}
return 0;
}

/**
* Writes the posting values for the current term in the given {@link TermsEnum} and updates the
* {@link FieldMetadata} stats.
*
* @return the written {@link BlockTermState}; or null if none.
*/
protected BlockTermState writePostingLine(
TermsEnum termsEnum, FieldMetadata fieldMetadata, NormsProducer normsProducer)
throws IOException {
BlockTermState state =
postingsWriter.writeTerm(
termsEnum.term(), termsEnum, fieldMetadata.getDocsSeen(), normsProducer);
if (state == null) {
// No doc for this term.
return null;
}
fieldMetadata.updateStats(state);
return state;
}

/** Writes the dictionary index (FST) to disk. */
protected void writeDictionary(IndexDictionary.Builder dictionaryBuilder) throws IOException {
dictionaryBuilder.build().write(dictionaryOutput, blockEncoder);
}

@Override
public void close() throws IOException {
IOUtils.close(blockOutput, dictionaryOutput, postingsWriter);
}
}