| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.lucene.codecs.uniformsplit; |
| |
| import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.NAME; |
| import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.TERMS_BLOCKS_EXTENSION; |
| import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.TERMS_DICTIONARY_EXTENSION; |
| import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.VERSION_CURRENT; |
| |
| import java.io.IOException; |
| import org.apache.lucene.codecs.BlockTermState; |
| import org.apache.lucene.codecs.CodecUtil; |
| import org.apache.lucene.codecs.FieldsConsumer; |
| import org.apache.lucene.codecs.NormsProducer; |
| import org.apache.lucene.codecs.PostingsWriterBase; |
| import org.apache.lucene.index.FieldInfo; |
| import org.apache.lucene.index.FieldInfos; |
| import org.apache.lucene.index.Fields; |
| import org.apache.lucene.index.IndexFileNames; |
| import org.apache.lucene.index.SegmentWriteState; |
| import org.apache.lucene.index.Terms; |
| import org.apache.lucene.index.TermsEnum; |
| import org.apache.lucene.store.ByteBuffersDataOutput; |
| import org.apache.lucene.store.DataOutput; |
| import org.apache.lucene.store.IndexOutput; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.IOUtils; |
| |
| /** |
| * A block-based terms index and dictionary that assigns terms to nearly uniform length blocks. This |
| * technique is called Uniform Split. |
| * |
| * <p>The block construction is driven by two parameters, {@code targetNumBlockLines} and {@code |
| * deltaNumLines}. Each block size (number of terms) is {@code targetNumBlockLines}+-{@code |
| * deltaNumLines}. The algorithm computes the minimal distinguishing prefix (MDP) between each term |
| * and its previous term (alphabetically ordered). Then it selects in the neighborhood of the {@code |
| * targetNumBlockLines}, and within the {@code deltaNumLines}, the term with the minimal MDP. This |
| * term becomes the first term of the next block and its MDP is the block key. This block key is |
| * added to the terms dictionary trie. |
| * |
| * <p>We call dictionary the trie structure in memory, and block file the disk file containing the |
| * block lines, with one term and its corresponding term state details per line. |
| * |
| * <p>When seeking a term, the dictionary seeks the floor leaf of the trie for the searched term and |
| * jumps to the corresponding file pointer in the block file. There, the block terms are scanned for |
| * the exact searched term. |
| * |
| * <p>The terms inside a block do not need to share a prefix. Only the block key is used to find the |
| * block from the dictionary trie. And the block key is selected because it is the locally smallest |
| * MDP. This makes the dictionary trie very compact. |
| * |
| * <p>An interesting property of the Uniform Split technique is the very linear balance between |
| * memory usage and lookup performance. By decreasing the target block size, the block scan becomes |
| * faster, and since there are more blocks, the dictionary trie memory usage increases. |
| * Additionally, small blocks are faster to read from disk. A good sweet spot for the target block |
| * size is 32 with delta of 3 (10%) (default values). This can be tuned in the constructor. |
| * |
| * <p>There are additional optimizations: |
| * |
| * <ul> |
| * <li>Each block has a header that allows the lookup to jump directly to the middle term with a |
| * fast comparison. This reduces the linear scan by 2 for a small disk size increase. |
| * <li>Each block term is incrementally encoded according to its previous term. This both reduces |
| * the disk size and speeds up the block scan. |
| * <li>All term line details (the terms states) are written after all terms. This allows faster |
| * term scan without needing to decode the term states. |
| * <li>All file pointers are base-encoded. Their value is relative to the block base file pointer |
| * (not to the previous file pointer), this allows to read the term state of any term |
| * independently. |
| * </ul> |
| * |
| * <p>Blocks can be compressed or encrypted with an optional {@link BlockEncoder} provided in the |
| * {@link #UniformSplitTermsWriter(PostingsWriterBase, SegmentWriteState, int, int, BlockEncoder) |
| * constructor}. |
| * |
| * <p>The {@link UniformSplitPostingsFormat#TERMS_BLOCKS_EXTENSION block file} contains all the term |
| * blocks for each field sequentially. It also contains the fields metadata at the end of the file. |
| * |
| * <p>The {@link UniformSplitPostingsFormat#TERMS_DICTIONARY_EXTENSION dictionary file} contains the |
| * trie ({@link org.apache.lucene.util.fst.FST} bytes) for each field sequentially. |
| * |
| * @lucene.experimental |
| */ |
| public class UniformSplitTermsWriter extends FieldsConsumer { |
| |
| /** Default value for the target block size (number of terms per block). */ |
| public static final int DEFAULT_TARGET_NUM_BLOCK_LINES = 32; |
| /** |
| * Default value for the maximum allowed delta variation of the block size (delta of the number of |
| * terms per block). The block size will be [target block size]+-[allowed delta]. |
| */ |
| public static final int DEFAULT_DELTA_NUM_LINES = (int) (DEFAULT_TARGET_NUM_BLOCK_LINES * 0.1); |
| /** Upper limit of the block size (maximum number of terms per block). */ |
| protected static final int MAX_NUM_BLOCK_LINES = 1_000; |
| |
| protected final FieldInfos fieldInfos; |
| protected final PostingsWriterBase postingsWriter; |
| protected final int maxDoc; |
| |
| protected final int targetNumBlockLines; |
| protected final int deltaNumLines; |
| |
| protected final BlockEncoder blockEncoder; |
| protected final FieldMetadata.Serializer fieldMetadataWriter; |
| protected final IndexOutput blockOutput; |
| protected final IndexOutput dictionaryOutput; |
| |
| /** |
| * @param blockEncoder Optional block encoder, may be null if none. It can be used for compression |
| * or encryption. |
| */ |
| public UniformSplitTermsWriter( |
| PostingsWriterBase postingsWriter, SegmentWriteState state, BlockEncoder blockEncoder) |
| throws IOException { |
| this( |
| postingsWriter, |
| state, |
| DEFAULT_TARGET_NUM_BLOCK_LINES, |
| DEFAULT_DELTA_NUM_LINES, |
| blockEncoder); |
| } |
| |
| /** |
| * @param blockEncoder Optional block encoder, may be null if none. It can be used for compression |
| * or encryption. |
| */ |
| public UniformSplitTermsWriter( |
| PostingsWriterBase postingsWriter, |
| SegmentWriteState state, |
| int targetNumBlockLines, |
| int deltaNumLines, |
| BlockEncoder blockEncoder) |
| throws IOException { |
| this( |
| postingsWriter, |
| state, |
| targetNumBlockLines, |
| deltaNumLines, |
| blockEncoder, |
| FieldMetadata.Serializer.INSTANCE, |
| NAME, |
| VERSION_CURRENT, |
| TERMS_BLOCKS_EXTENSION, |
| TERMS_DICTIONARY_EXTENSION); |
| } |
| |
| /** |
| * @param targetNumBlockLines Target number of lines per block. Must be strictly greater than 0. |
| * The parameters can be pre-validated with {@link #validateSettings(int, int)}. There is one |
| * term per block line, with its corresponding details ({@link |
| * org.apache.lucene.index.TermState}). |
| * @param deltaNumLines Maximum allowed delta variation of the number of lines per block. Must be |
| * greater than or equal to 0 and strictly less than {@code targetNumBlockLines}. The block |
| * size will be {@code targetNumBlockLines}+-{@code deltaNumLines}. The block size must always |
| * be less than or equal to {@link #MAX_NUM_BLOCK_LINES}. |
| * @param blockEncoder Optional block encoder, may be null if none. It can be used for compression |
| * or encryption. |
| */ |
| protected UniformSplitTermsWriter( |
| PostingsWriterBase postingsWriter, |
| SegmentWriteState state, |
| int targetNumBlockLines, |
| int deltaNumLines, |
| BlockEncoder blockEncoder, |
| FieldMetadata.Serializer fieldMetadataWriter, |
| String codecName, |
| int versionCurrent, |
| String termsBlocksExtension, |
| String dictionaryExtension) |
| throws IOException { |
| validateSettings(targetNumBlockLines, deltaNumLines); |
| IndexOutput blockOutput = null; |
| IndexOutput dictionaryOutput = null; |
| boolean success = false; |
| try { |
| this.fieldInfos = state.fieldInfos; |
| this.postingsWriter = postingsWriter; |
| this.maxDoc = state.segmentInfo.maxDoc(); |
| this.targetNumBlockLines = targetNumBlockLines; |
| this.deltaNumLines = deltaNumLines; |
| this.blockEncoder = blockEncoder; |
| this.fieldMetadataWriter = fieldMetadataWriter; |
| |
| String termsName = |
| IndexFileNames.segmentFileName( |
| state.segmentInfo.name, state.segmentSuffix, termsBlocksExtension); |
| blockOutput = state.directory.createOutput(termsName, state.context); |
| CodecUtil.writeIndexHeader( |
| blockOutput, codecName, versionCurrent, state.segmentInfo.getId(), state.segmentSuffix); |
| |
| String indexName = |
| IndexFileNames.segmentFileName( |
| state.segmentInfo.name, state.segmentSuffix, dictionaryExtension); |
| dictionaryOutput = state.directory.createOutput(indexName, state.context); |
| CodecUtil.writeIndexHeader( |
| dictionaryOutput, |
| codecName, |
| versionCurrent, |
| state.segmentInfo.getId(), |
| state.segmentSuffix); |
| |
| postingsWriter.init(blockOutput, state); |
| |
| this.blockOutput = blockOutput; |
| this.dictionaryOutput = dictionaryOutput; |
| success = true; |
| } finally { |
| if (!success) { |
| IOUtils.closeWhileHandlingException(blockOutput, dictionaryOutput); |
| } |
| } |
| } |
| |
| /** |
| * Validates the {@link #UniformSplitTermsWriter(PostingsWriterBase, SegmentWriteState, int, int, |
| * BlockEncoder) constructor} settings. |
| * |
| * @param targetNumBlockLines Target number of lines per block. Must be strictly greater than 0. |
| * @param deltaNumLines Maximum allowed delta variation of the number of lines per block. Must be |
| * greater than or equal to 0 and strictly less than {@code targetNumBlockLines}. |
| * Additionally, {@code targetNumBlockLines} + {@code deltaNumLines} must be less than or |
| * equal to {@link #MAX_NUM_BLOCK_LINES}. |
| */ |
| protected static void validateSettings(int targetNumBlockLines, int deltaNumLines) { |
| if (targetNumBlockLines <= 0) { |
| throw new IllegalArgumentException( |
| "Invalid negative or nul targetNumBlockLines=" + targetNumBlockLines); |
| } |
| if (deltaNumLines < 0) { |
| throw new IllegalArgumentException("Invalid negative deltaNumLines=" + deltaNumLines); |
| } |
| if (deltaNumLines >= targetNumBlockLines) { |
| throw new IllegalArgumentException( |
| "Invalid too large deltaNumLines=" |
| + deltaNumLines |
| + ", it must be < targetNumBlockLines=" |
| + targetNumBlockLines); |
| } |
| if (targetNumBlockLines + deltaNumLines > UniformSplitTermsWriter.MAX_NUM_BLOCK_LINES) { |
| throw new IllegalArgumentException( |
| "Invalid (targetNumBlockLines + deltaNumLines)=" |
| + (targetNumBlockLines + deltaNumLines) |
| + ", it must be <= MAX_NUM_BLOCK_LINES=" |
| + UniformSplitTermsWriter.MAX_NUM_BLOCK_LINES); |
| } |
| } |
| |
| @Override |
| public void write(Fields fields, NormsProducer normsProducer) throws IOException { |
| BlockWriter blockWriter = |
| new BlockWriter(blockOutput, targetNumBlockLines, deltaNumLines, blockEncoder); |
| ByteBuffersDataOutput fieldsOutput = new ByteBuffersDataOutput(); |
| int fieldsNumber = 0; |
| for (String field : fields) { |
| Terms terms = fields.terms(field); |
| if (terms != null) { |
| TermsEnum termsEnum = terms.iterator(); |
| FieldInfo fieldInfo = fieldInfos.fieldInfo(field); |
| fieldsNumber += |
| writeFieldTerms(blockWriter, fieldsOutput, termsEnum, fieldInfo, normsProducer); |
| } |
| } |
| writeFieldsMetadata(fieldsNumber, fieldsOutput); |
| CodecUtil.writeFooter(dictionaryOutput); |
| } |
| |
| protected void writeFieldsMetadata(int fieldsNumber, ByteBuffersDataOutput fieldsOutput) |
| throws IOException { |
| long fieldsStartPosition = blockOutput.getFilePointer(); |
| blockOutput.writeVInt(fieldsNumber); |
| if (blockEncoder == null) { |
| writeUnencodedFieldsMetadata(fieldsOutput); |
| } else { |
| writeEncodedFieldsMetadata(fieldsOutput); |
| } |
| // Must be a fixed length. Read by UniformSplitTermsReader when seeking fields metadata. |
| blockOutput.writeLong(fieldsStartPosition); |
| CodecUtil.writeFooter(blockOutput); |
| } |
| |
| protected void writeUnencodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) |
| throws IOException { |
| fieldsOutput.copyTo(blockOutput); |
| } |
| |
| protected void writeEncodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) throws IOException { |
| BlockEncoder.WritableBytes encodedBytes = |
| blockEncoder.encode(fieldsOutput.toDataInput(), fieldsOutput.size()); |
| blockOutput.writeVLong(encodedBytes.size()); |
| encodedBytes.writeTo(blockOutput); |
| } |
| |
| /** @return 1 if the field was written; 0 otherwise. */ |
| protected int writeFieldTerms( |
| BlockWriter blockWriter, |
| DataOutput fieldsOutput, |
| TermsEnum termsEnum, |
| FieldInfo fieldInfo, |
| NormsProducer normsProducer) |
| throws IOException { |
| |
| FieldMetadata fieldMetadata = new FieldMetadata(fieldInfo, maxDoc); |
| fieldMetadata.setDictionaryStartFP(dictionaryOutput.getFilePointer()); |
| |
| postingsWriter.setField(fieldInfo); |
| blockWriter.setField(fieldMetadata); |
| IndexDictionary.Builder dictionaryBuilder = new FSTDictionary.Builder(); |
| BytesRef lastTerm = null; |
| while (termsEnum.next() != null) { |
| BlockTermState blockTermState = writePostingLine(termsEnum, fieldMetadata, normsProducer); |
| if (blockTermState != null) { |
| lastTerm = BytesRef.deepCopyOf(termsEnum.term()); |
| blockWriter.addLine(lastTerm, blockTermState, dictionaryBuilder); |
| } |
| } |
| |
| // Flush remaining terms. |
| blockWriter.finishLastBlock(dictionaryBuilder); |
| |
| if (fieldMetadata.getNumTerms() > 0) { |
| fieldMetadata.setLastTerm(lastTerm); |
| fieldMetadataWriter.write(fieldsOutput, fieldMetadata); |
| writeDictionary(dictionaryBuilder); |
| return 1; |
| } |
| return 0; |
| } |
| |
| /** |
| * Writes the posting values for the current term in the given {@link TermsEnum} and updates the |
| * {@link FieldMetadata} stats. |
| * |
| * @return the written {@link BlockTermState}; or null if none. |
| */ |
| protected BlockTermState writePostingLine( |
| TermsEnum termsEnum, FieldMetadata fieldMetadata, NormsProducer normsProducer) |
| throws IOException { |
| BlockTermState state = |
| postingsWriter.writeTerm( |
| termsEnum.term(), termsEnum, fieldMetadata.getDocsSeen(), normsProducer); |
| if (state == null) { |
| // No doc for this term. |
| return null; |
| } |
| fieldMetadata.updateStats(state); |
| return state; |
| } |
| |
| /** Writes the dictionary index (FST) to disk. */ |
| protected void writeDictionary(IndexDictionary.Builder dictionaryBuilder) throws IOException { |
| dictionaryBuilder.build().write(dictionaryOutput, blockEncoder); |
| } |
| |
| @Override |
| public void close() throws IOException { |
| IOUtils.close(blockOutput, dictionaryOutput, postingsWriter); |
| } |
| } |