| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.lucene.codecs.uniformsplit; |
| |
| import java.io.IOException; |
| import org.apache.lucene.codecs.BlockTermState; |
| import org.apache.lucene.index.CorruptIndexException; |
| import org.apache.lucene.index.FieldInfo; |
| import org.apache.lucene.store.DataInput; |
| import org.apache.lucene.store.DataOutput; |
| import org.apache.lucene.util.Accountable; |
| import org.apache.lucene.util.ArrayUtil; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.RamUsageEstimator; |
| |
| /** |
| * One term block line. |
| * |
| * <p>Contains a term and its details as a {@link BlockTermState}. |
| * |
| * <p>The line is written to the {@link UniformSplitPostingsFormat#TERMS_BLOCKS_EXTENSION block |
| * file} in two parts. The first part is the term followed by an offset to the details region. The |
| * second part is the term {@link BlockTermState}, written in the details region, after all the |
| * terms of the block. |
| * |
| * <p>The separate details region allows fast scan of the terms without having to decode the details |
| * for each term. At read time, the {@link BlockLine.Serializer#readLine} only reads the term and |
| * its offset to the details. The corresponding {@link BlockTermState} is decoded on demand in the |
| * {@link BlockReader} (see {@link BlockReader#readTermStateIfNotRead}). |
| * |
| * @lucene.experimental |
| */ |
| public class BlockLine implements Accountable { |
| |
| private static final long BASE_RAM_USAGE = |
| RamUsageEstimator.shallowSizeOfInstance(BlockLine.class); |
| |
| protected TermBytes termBytes; |
| protected int termStateRelativeOffset; |
| |
| /** Only used for writing. */ |
| protected final BlockTermState termState; |
| |
| /** Constructor used for writing a {@link BlockLine}. */ |
| protected BlockLine(TermBytes termBytes, BlockTermState termState) { |
| this(termBytes, -1, termState); |
| } |
| |
| /** Constructor used for reading a {@link BlockLine}. */ |
| protected BlockLine(TermBytes termBytes, int termStateRelativeOffset) { |
| this(termBytes, termStateRelativeOffset, null); |
| } |
| |
| private BlockLine(TermBytes termBytes, int termStateRelativeOffset, BlockTermState termState) { |
| reset(termBytes, termStateRelativeOffset); |
| this.termState = termState; |
| } |
| |
| /** Resets this {@link BlockLine} to reuse it when reading. */ |
| protected BlockLine reset(TermBytes termBytes, int termStateRelativeOffset) { |
| assert termState == null; |
| this.termBytes = termBytes; |
| this.termStateRelativeOffset = termStateRelativeOffset; |
| return this; |
| } |
| |
| public TermBytes getTermBytes() { |
| return termBytes; |
| } |
| |
| /** |
| * @return The offset of the {@link org.apache.lucene.index.TermState} bytes in the block, |
| * relatively to the term states base offset. |
| */ |
| public int getTermStateRelativeOffset() { |
| return termStateRelativeOffset; |
| } |
| |
| @Override |
| public long ramBytesUsed() { |
| return BASE_RAM_USAGE + termBytes.ramBytesUsed() + RamUsageUtil.ramBytesUsed(termState); |
| } |
| |
| /** |
| * Reads/writes block lines with terms encoded incrementally inside a block. This class keeps a |
| * state of the previous term read to decode the next term. |
| */ |
| public static class Serializer implements Accountable { |
| |
| private static final long BASE_RAM_USAGE = |
| RamUsageEstimator.shallowSizeOfInstance(Serializer.class); |
| |
| protected final BytesRef currentTerm; |
| |
| public Serializer() { |
| currentTerm = new BytesRef(64); |
| } |
| |
| /** |
| * Reads the current line. |
| * |
| * @param isIncrementalEncodingSeed Whether the term is a seed of the incremental encoding. |
| * {@code true} for the first and middle term, {@code false} for other terms. |
| * @param reuse A {@link BlockLine} instance to reuse; or null if none. |
| */ |
| public BlockLine readLine( |
| DataInput blockInput, boolean isIncrementalEncodingSeed, BlockLine reuse) |
| throws IOException { |
| int termStateRelativeOffset = blockInput.readVInt(); |
| if (termStateRelativeOffset < 0) { |
| throw new CorruptIndexException( |
| "Illegal termStateRelativeOffset= " + termStateRelativeOffset, blockInput); |
| } |
| return reuse == null |
| ? new BlockLine( |
| readIncrementallyEncodedTerm(blockInput, isIncrementalEncodingSeed, null), |
| termStateRelativeOffset) |
| : reuse.reset( |
| readIncrementallyEncodedTerm(blockInput, isIncrementalEncodingSeed, reuse.termBytes), |
| termStateRelativeOffset); |
| } |
| |
| /** |
| * Writes a line and its offset to the corresponding term state details in the details region. |
| * |
| * @param blockOutput The output pointing to the block terms region. |
| * @param termStateRelativeOffset The offset to the corresponding term state details in the |
| * details region. |
| * @param isIncrementalEncodingSeed Whether the term is a seed of the incremental encoding. |
| * {@code true} for the first and middle term, {@code false} for other terms. |
| */ |
| public void writeLine( |
| DataOutput blockOutput, |
| BlockLine line, |
| BlockLine previousLine, |
| int termStateRelativeOffset, |
| boolean isIncrementalEncodingSeed) |
| throws IOException { |
| blockOutput.writeVInt(termStateRelativeOffset); |
| writeIncrementallyEncodedTerm( |
| line.getTermBytes(), |
| previousLine == null ? null : previousLine.getTermBytes(), |
| isIncrementalEncodingSeed, |
| blockOutput); |
| } |
| |
| /** |
| * Writes the term state details of a line in the details region. |
| * |
| * @param termStatesOutput The output pointing to the details region. |
| */ |
| protected void writeLineTermState( |
| DataOutput termStatesOutput, |
| BlockLine line, |
| FieldInfo fieldInfo, |
| DeltaBaseTermStateSerializer encoder) |
| throws IOException { |
| assert line.termState != null; |
| encoder.writeTermState(termStatesOutput, fieldInfo, line.termState); |
| } |
| |
| protected void writeIncrementallyEncodedTerm( |
| TermBytes termBytes, |
| TermBytes previousTermBytes, |
| boolean isIncrementalEncodingSeed, |
| DataOutput blockOutput) |
| throws IOException { |
| BytesRef term = termBytes.getTerm(); |
| assert term.offset == 0; |
| if (isIncrementalEncodingSeed) { |
| // Mdp length is always 1 for an incremental encoding seed. |
| blockOutput.writeVLong(term.length); |
| blockOutput.writeBytes(term.bytes, 0, term.length); |
| return; |
| } |
| if (term.length == 0) { |
| // Empty term. |
| blockOutput.writeVLong(0); |
| return; |
| } |
| |
| // For other lines we store: |
| // - Mdp length. |
| // - Suffix length. |
| // - Suffix bytes. |
| // Instead of writing mdp length and suffix length with 2 VInt, we can compress the storage |
| // by merging them in a single VLong. The idea is to leverage the information we have about |
| // the previous line. We know the previous line term length. And we know that |
| // new line mdp length <= (previous line term length + 1) |
| // So if numMdpBits = numBitsToEncode(previous line term length), |
| // then we know we can encode (new line mdp length - 1) in numMdpBits. |
| // Hence we encode (new line mdp length - 1) in the rightmost numMdpBits of the VLong. |
| // And we encode new line suffix length in the remaining left bits of the VLong. |
| // Most of the time both values will be encoded in a single byte. |
| |
| assert previousTermBytes != null; |
| assert termBytes.getMdpLength() >= 1; |
| |
| int numMdpBits = numBitsToEncode(previousTermBytes.getTerm().length); |
| assert numBitsToEncode(termBytes.getMdpLength() - 1) <= numMdpBits; |
| |
| long mdpAndSuffixLengths = |
| (((long) termBytes.getSuffixLength()) << numMdpBits) | (termBytes.getMdpLength() - 1); |
| assert mdpAndSuffixLengths != 0; |
| blockOutput.writeVLong(mdpAndSuffixLengths); |
| blockOutput.writeBytes(term.bytes, termBytes.getSuffixOffset(), termBytes.getSuffixLength()); |
| } |
| |
| protected TermBytes readIncrementallyEncodedTerm( |
| DataInput blockInput, boolean isIncrementalEncodingSeed, TermBytes reuse) |
| throws IOException { |
| assert currentTerm.offset == 0; |
| int mdpLength; |
| if (isIncrementalEncodingSeed) { |
| int length = (int) blockInput.readVLong(); |
| mdpLength = length == 0 ? 0 : 1; |
| readBytes(blockInput, currentTerm, 0, length); |
| } else { |
| long mdpAndSuffixLengths = blockInput.readVLong(); |
| if (mdpAndSuffixLengths == 0) { |
| // Empty term. |
| mdpLength = 0; |
| currentTerm.length = 0; |
| } else { |
| int numMdpBits = numBitsToEncode(currentTerm.length); |
| mdpLength = |
| (int) (mdpAndSuffixLengths & ((1 << numMdpBits) - 1)) |
| + 1; // Get rightmost numMdpBits. |
| int suffixLength = (int) (mdpAndSuffixLengths >>> numMdpBits); // Get remaining left bits. |
| assert mdpLength >= 1; |
| assert suffixLength >= 1; |
| readBytes(blockInput, currentTerm, mdpLength - 1, suffixLength); |
| } |
| } |
| return reuse == null |
| ? new TermBytes(mdpLength, currentTerm) |
| : reuse.reset(mdpLength, currentTerm); |
| } |
| |
| /** |
| * Reads {@code length} bytes from the given {@link DataInput} and stores them at {@code offset} |
| * in {@code bytes.bytes}. |
| */ |
| protected void readBytes(DataInput input, BytesRef bytes, int offset, int length) |
| throws IOException { |
| assert bytes.offset == 0; |
| bytes.length = offset + length; |
| bytes.bytes = ArrayUtil.grow(bytes.bytes, bytes.length); |
| input.readBytes(bytes.bytes, offset, length); |
| } |
| |
| @Override |
| public long ramBytesUsed() { |
| return BASE_RAM_USAGE + RamUsageUtil.ramBytesUsed(currentTerm); |
| } |
| |
| /** |
| * Gets the number of bits required to encode the value of the provided int. Returns 0 for int |
| * value 0. Equivalent to (log2(i) + 1). |
| */ |
| protected static int numBitsToEncode(int i) { |
| return 32 - Integer.numberOfLeadingZeros(i); |
| } |
| } |
| } |