| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.lucene.codecs.uniformsplit; |
| |
| import java.io.IOException; |
| |
| import org.apache.lucene.codecs.BlockTermState; |
| import org.apache.lucene.codecs.CodecUtil; |
| import org.apache.lucene.codecs.FieldsConsumer; |
| import org.apache.lucene.codecs.NormsProducer; |
| import org.apache.lucene.codecs.PostingsWriterBase; |
| import org.apache.lucene.index.FieldInfo; |
| import org.apache.lucene.index.FieldInfos; |
| import org.apache.lucene.index.Fields; |
| import org.apache.lucene.index.IndexFileNames; |
| import org.apache.lucene.index.SegmentWriteState; |
| import org.apache.lucene.index.Terms; |
| import org.apache.lucene.index.TermsEnum; |
| import org.apache.lucene.store.ByteBuffersDataOutput; |
| import org.apache.lucene.store.DataOutput; |
| import org.apache.lucene.store.IndexOutput; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.IOUtils; |
| |
| import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.NAME; |
| import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.TERMS_BLOCKS_EXTENSION; |
| import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.TERMS_DICTIONARY_EXTENSION; |
| import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.VERSION_CURRENT; |
| |
| /** |
| * A block-based terms index and dictionary that assigns terms to nearly |
| * uniform length blocks. This technique is called Uniform Split. |
| * <p> |
| * The block construction is driven by two parameters, {@code targetNumBlockLines} |
| * and {@code deltaNumLines}. |
| * Each block size (number of terms) is {@code targetNumBlockLines}+-{@code deltaNumLines}. |
| * The algorithm computes the minimal distinguishing prefix (MDP) between |
| * each term and its previous term (alphabetically ordered). Then it selects |
| * in the neighborhood of the {@code targetNumBlockLines}, and within the |
| * {@code deltaNumLines}, the term with the minimal MDP. This term becomes |
| * the first term of the next block and its MDP is the block key. This block |
| * key is added to the terms dictionary trie. |
| * <p> |
| * We call dictionary the trie structure in memory, and block file the disk file |
| * containing the block lines, with one term and its corresponding term state |
| * details per line. |
| * <p> |
| * When seeking a term, the dictionary seeks the floor leaf of the trie for the |
| * searched term and jumps to the corresponding file pointer in the block file. |
| * There, the block terms are scanned for the exact searched term. |
| * <p> |
| * The terms inside a block do not need to share a prefix. Only the block |
| * key is used to find the block from the dictionary trie. And the block key |
| * is selected because it is the locally smallest MDP. This makes the dictionary |
| * trie very compact. |
| * <p> |
| * An interesting property of the Uniform Split technique is the very linear |
| * balance between memory usage and lookup performance. By decreasing |
| * the target block size, the block scan becomes faster, and since there are |
| * more blocks, the dictionary trie memory usage increases. Additionally, |
| * small blocks are faster to read from disk. A good sweet spot for the target |
| * block size is 32 with delta of 3 (10%) (default values). This can be tuned |
| * in the constructor. |
| * <p> |
| * There are additional optimizations: |
| * <ul> |
| * <li>Each block has a header that allows the lookup to jump directly to |
| * the middle term with a fast comparison. This reduces the linear scan |
| * by 2 for a small disk size increase.</li> |
| * <li>Each block term is incrementally encoded according to its previous |
| * term. This both reduces the disk size and speeds up the block scan.</li> |
| * <li>All term line details (the terms states) are written after all terms. This |
| * allows faster term scan without needing to decode the term states.</li> |
| * <li>All file pointers are base-encoded. Their value is relative to the block |
| * base file pointer (not to the previous file pointer), this allows to read the |
| * term state of any term independently.</li> |
| * </ul> |
| * <p> |
| * Blocks can be compressed or encrypted with an optional {@link BlockEncoder} |
| * provided in the {@link #UniformSplitTermsWriter(PostingsWriterBase, SegmentWriteState, int, int, BlockEncoder) constructor}. |
| * <p> |
| * The {@link UniformSplitPostingsFormat#TERMS_BLOCKS_EXTENSION block file} |
| * contains all the term blocks for each field sequentially. It also contains |
| * the fields metadata at the end of the file. |
| * <p> |
| * The {@link UniformSplitPostingsFormat#TERMS_DICTIONARY_EXTENSION dictionary file} |
| * contains the trie ({@link org.apache.lucene.util.fst.FST} bytes) for each |
| * field sequentially. |
| * |
| * @lucene.experimental |
| */ |
| public class UniformSplitTermsWriter extends FieldsConsumer { |
| |
| /** |
| * Default value for the target block size (number of terms per block). |
| */ |
| public static final int DEFAULT_TARGET_NUM_BLOCK_LINES = 32; |
| /** |
| * Default value for the maximum allowed delta variation of the block size (delta of the number of terms per block). |
| * The block size will be [target block size]+-[allowed delta]. |
| */ |
| public static final int DEFAULT_DELTA_NUM_LINES = (int) (DEFAULT_TARGET_NUM_BLOCK_LINES * 0.1); |
| /** |
| * Upper limit of the block size (maximum number of terms per block). |
| */ |
| protected static final int MAX_NUM_BLOCK_LINES = 1_000; |
| |
| protected final FieldInfos fieldInfos; |
| protected final PostingsWriterBase postingsWriter; |
| protected final int maxDoc; |
| |
| protected final int targetNumBlockLines; |
| protected final int deltaNumLines; |
| |
| protected final BlockEncoder blockEncoder; |
| protected final FieldMetadata.Serializer fieldMetadataWriter; |
| protected final IndexOutput blockOutput; |
| protected final IndexOutput dictionaryOutput; |
| |
| /** |
| * @param blockEncoder Optional block encoder, may be null if none. |
| * It can be used for compression or encryption. |
| */ |
| public UniformSplitTermsWriter(PostingsWriterBase postingsWriter, SegmentWriteState state, |
| BlockEncoder blockEncoder) throws IOException { |
| this(postingsWriter, state, DEFAULT_TARGET_NUM_BLOCK_LINES, DEFAULT_DELTA_NUM_LINES, blockEncoder); |
| } |
| |
| /** |
| * @param blockEncoder Optional block encoder, may be null if none. |
| * It can be used for compression or encryption. |
| */ |
| public UniformSplitTermsWriter(PostingsWriterBase postingsWriter, SegmentWriteState state, |
| int targetNumBlockLines, int deltaNumLines, BlockEncoder blockEncoder) throws IOException { |
| this(postingsWriter, state, targetNumBlockLines, deltaNumLines, blockEncoder, FieldMetadata.Serializer.INSTANCE, |
| NAME, VERSION_CURRENT, TERMS_BLOCKS_EXTENSION, TERMS_DICTIONARY_EXTENSION); |
| } |
| |
| |
| /** |
| * @param targetNumBlockLines Target number of lines per block. |
| * Must be strictly greater than 0. |
| * The parameters can be pre-validated with {@link #validateSettings(int, int)}. |
| * There is one term per block line, with its corresponding details ({@link org.apache.lucene.index.TermState}). |
| * @param deltaNumLines Maximum allowed delta variation of the number of lines per block. |
| * Must be greater than or equal to 0 and strictly less than {@code targetNumBlockLines}. |
| * The block size will be {@code targetNumBlockLines}+-{@code deltaNumLines}. |
| * The block size must always be less than or equal to {@link #MAX_NUM_BLOCK_LINES}. |
| * @param blockEncoder Optional block encoder, may be null if none. |
| * It can be used for compression or encryption. |
| */ |
| protected UniformSplitTermsWriter(PostingsWriterBase postingsWriter, SegmentWriteState state, |
| int targetNumBlockLines, int deltaNumLines, BlockEncoder blockEncoder, FieldMetadata.Serializer fieldMetadataWriter, |
| String codecName, int versionCurrent, String termsBlocksExtension, String dictionaryExtension) throws IOException { |
| validateSettings(targetNumBlockLines, deltaNumLines); |
| IndexOutput blockOutput = null; |
| IndexOutput dictionaryOutput = null; |
| boolean success = false; |
| try { |
| this.fieldInfos = state.fieldInfos; |
| this.postingsWriter = postingsWriter; |
| this.maxDoc = state.segmentInfo.maxDoc(); |
| this.targetNumBlockLines = targetNumBlockLines; |
| this.deltaNumLines = deltaNumLines; |
| this.blockEncoder = blockEncoder; |
| this.fieldMetadataWriter = fieldMetadataWriter; |
| |
| String termsName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, termsBlocksExtension); |
| blockOutput = state.directory.createOutput(termsName, state.context); |
| CodecUtil.writeIndexHeader(blockOutput, codecName, versionCurrent, state.segmentInfo.getId(), state.segmentSuffix); |
| |
| String indexName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dictionaryExtension); |
| dictionaryOutput = state.directory.createOutput(indexName, state.context); |
| CodecUtil.writeIndexHeader(dictionaryOutput, codecName, versionCurrent, state.segmentInfo.getId(), state.segmentSuffix); |
| |
| postingsWriter.init(blockOutput, state); |
| |
| this.blockOutput = blockOutput; |
| this.dictionaryOutput = dictionaryOutput; |
| success = true; |
| } finally { |
| if (!success) { |
| IOUtils.closeWhileHandlingException(blockOutput, dictionaryOutput); |
| } |
| } |
| } |
| |
| /** |
| * Validates the {@link #UniformSplitTermsWriter(PostingsWriterBase, SegmentWriteState, int, int, BlockEncoder) constructor} |
| * settings. |
| * |
| * @param targetNumBlockLines Target number of lines per block. |
| * Must be strictly greater than 0. |
| * @param deltaNumLines Maximum allowed delta variation of the number of lines per block. |
| * Must be greater than or equal to 0 and strictly less than {@code targetNumBlockLines}. |
| * Additionally, {@code targetNumBlockLines} + {@code deltaNumLines} must be less than |
| * or equal to {@link #MAX_NUM_BLOCK_LINES}. |
| */ |
| protected static void validateSettings(int targetNumBlockLines, int deltaNumLines) { |
| if (targetNumBlockLines <= 0) { |
| throw new IllegalArgumentException("Invalid negative or nul targetNumBlockLines=" + targetNumBlockLines); |
| } |
| if (deltaNumLines < 0) { |
| throw new IllegalArgumentException("Invalid negative deltaNumLines=" + deltaNumLines); |
| } |
| if (deltaNumLines >= targetNumBlockLines) { |
| throw new IllegalArgumentException("Invalid too large deltaNumLines=" + deltaNumLines |
| + ", it must be < targetNumBlockLines=" + targetNumBlockLines); |
| } |
| if (targetNumBlockLines + deltaNumLines > UniformSplitTermsWriter.MAX_NUM_BLOCK_LINES) { |
| throw new IllegalArgumentException("Invalid (targetNumBlockLines + deltaNumLines)=" |
| + (targetNumBlockLines + deltaNumLines) + ", it must be <= MAX_NUM_BLOCK_LINES=" |
| + UniformSplitTermsWriter.MAX_NUM_BLOCK_LINES); |
| } |
| } |
| |
| @Override |
| public void write(Fields fields, NormsProducer normsProducer) throws IOException { |
| BlockWriter blockWriter = new BlockWriter(blockOutput, targetNumBlockLines, deltaNumLines, blockEncoder); |
| ByteBuffersDataOutput fieldsOutput = new ByteBuffersDataOutput(); |
| int fieldsNumber = 0; |
| for (String field : fields) { |
| Terms terms = fields.terms(field); |
| if (terms != null) { |
| TermsEnum termsEnum = terms.iterator(); |
| FieldInfo fieldInfo = fieldInfos.fieldInfo(field); |
| fieldsNumber += writeFieldTerms(blockWriter, fieldsOutput, termsEnum, fieldInfo, normsProducer); |
| } |
| } |
| writeFieldsMetadata(fieldsNumber, fieldsOutput); |
| CodecUtil.writeFooter(dictionaryOutput); |
| } |
| |
| protected void writeFieldsMetadata(int fieldsNumber, ByteBuffersDataOutput fieldsOutput) throws IOException { |
| long fieldsStartPosition = blockOutput.getFilePointer(); |
| blockOutput.writeVInt(fieldsNumber); |
| if (blockEncoder == null) { |
| writeUnencodedFieldsMetadata(fieldsOutput); |
| } else { |
| writeEncodedFieldsMetadata(fieldsOutput); |
| } |
| // Must be a fixed length. Read by UniformSplitTermsReader when seeking fields metadata. |
| blockOutput.writeLong(fieldsStartPosition); |
| CodecUtil.writeFooter(blockOutput); |
| } |
| |
| protected void writeUnencodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) throws IOException { |
| fieldsOutput.copyTo(blockOutput); |
| } |
| |
| protected void writeEncodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) throws IOException { |
| BlockEncoder.WritableBytes encodedBytes = blockEncoder.encode(fieldsOutput.toDataInput(), fieldsOutput.size()); |
| blockOutput.writeVLong(encodedBytes.size()); |
| encodedBytes.writeTo(blockOutput); |
| } |
| |
| /** |
| * @return 1 if the field was written; 0 otherwise. |
| */ |
| protected int writeFieldTerms(BlockWriter blockWriter, DataOutput fieldsOutput, TermsEnum termsEnum, |
| FieldInfo fieldInfo, NormsProducer normsProducer) throws IOException { |
| |
| FieldMetadata fieldMetadata = new FieldMetadata(fieldInfo, maxDoc); |
| fieldMetadata.setDictionaryStartFP(dictionaryOutput.getFilePointer()); |
| |
| postingsWriter.setField(fieldInfo); |
| blockWriter.setField(fieldMetadata); |
| IndexDictionary.Builder dictionaryBuilder = new FSTDictionary.Builder(); |
| BytesRef lastTerm = null; |
| while (termsEnum.next() != null) { |
| BlockTermState blockTermState = writePostingLine(termsEnum, fieldMetadata, normsProducer); |
| if (blockTermState != null) { |
| lastTerm = BytesRef.deepCopyOf(termsEnum.term()); |
| blockWriter.addLine(lastTerm, blockTermState, dictionaryBuilder); |
| } |
| } |
| |
| // Flush remaining terms. |
| blockWriter.finishLastBlock(dictionaryBuilder); |
| |
| if (fieldMetadata.getNumTerms() > 0) { |
| fieldMetadata.setLastTerm(lastTerm); |
| fieldMetadataWriter.write(fieldsOutput, fieldMetadata); |
| writeDictionary(dictionaryBuilder); |
| return 1; |
| } |
| return 0; |
| } |
| |
| /** |
| * Writes the posting values for the current term in the given {@link TermsEnum} |
| * and updates the {@link FieldMetadata} stats. |
| * |
| * @return the written {@link BlockTermState}; or null if none. |
| */ |
| protected BlockTermState writePostingLine(TermsEnum termsEnum, FieldMetadata fieldMetadata, NormsProducer normsProducer) throws IOException { |
| BlockTermState state = postingsWriter.writeTerm(termsEnum.term(), termsEnum, fieldMetadata.getDocsSeen(), normsProducer); |
| if (state == null) { |
| // No doc for this term. |
| return null; |
| } |
| fieldMetadata.updateStats(state); |
| return state; |
| } |
| |
| /** |
| * Writes the dictionary index (FST) to disk. |
| */ |
| protected void writeDictionary(IndexDictionary.Builder dictionaryBuilder) throws IOException { |
| dictionaryBuilder.build().write(dictionaryOutput, blockEncoder); |
| } |
| |
| @Override |
| public void close() throws IOException { |
| IOUtils.close(blockOutput, dictionaryOutput, postingsWriter); |
| } |
| } |