// blob: ec73ddcb838ad25f961de0a32492595a1f76a8ed [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.uniformsplit;
import static org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat.BLOCK_SIZE;
import java.io.IOException;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat.IntBlockTermState;
import org.apache.lucene.codecs.lucene84.Lucene84PostingsReader;
import org.apache.lucene.codecs.lucene84.Lucene84PostingsWriter;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.TermState;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.RamUsageEstimator;
/**
* {@link TermState} serializer which encodes each file pointer as a delta relative
* to a base file pointer. It differs from {@link Lucene84PostingsWriter#encodeTerm}
* which encodes each file pointer as a delta relative to the previous file pointer.
* <p>
* It automatically sets the base file pointer to the first valid file pointer for
* doc start FP, pos start FP, pay start FP. These base file pointers have to be
* {@link #resetBaseStartFP() reset} by the caller before starting to write a new block.
*
* @lucene.experimental
*/
public class DeltaBaseTermStateSerializer implements Accountable {

  /** Shallow size of one serializer instance, computed once. */
  private static final long RAM_USAGE =
      RamUsageEstimator.shallowSizeOfInstance(DeltaBaseTermStateSerializer.class);

  /** Shallow size of one {@link IntBlockTermState} instance, computed once. */
  private static final long INT_BLOCK_TERM_STATE_RAM_USAGE =
      RamUsageEstimator.shallowSizeOfInstance(IntBlockTermState.class);

  /** Base for doc start file pointers; 0 means "not captured yet". */
  protected long baseDocStartFP;
  /** Base for position start file pointers; 0 means "not captured yet". */
  protected long basePosStartFP;
  /** Base for payload/offset start file pointers; 0 means "not captured yet". */
  protected long basePayStartFP;

  /**
   * Creates a serializer whose three base file pointers start out reset.
   */
  public DeltaBaseTermStateSerializer() {
    resetBaseStartFP();
  }

  /**
   * Sets the three base file pointers back to 0, which is the value
   * {@link #writeTermState} interprets as "base not captured yet".
   * Callers must invoke this before they start writing a new block.
   */
  public void resetBaseStartFP() {
    baseDocStartFP = 0;
    basePosStartFP = 0;
    basePayStartFP = 0;
  }

  /**
   * @return The current base doc start file pointer: the doc start file pointer
   * taken from the first non-singleton {@link TermState} written while the base
   * was still 0, or 0 if none has been captured since {@link #resetBaseStartFP()}.
   */
  public long getBaseDocStartFP() {
    return baseDocStartFP;
  }

  /**
   * @return The current base position start file pointer: the position start file
   * pointer taken from the first {@link TermState} written with positions while the
   * base was still 0, or 0 if none has been captured since {@link #resetBaseStartFP()}.
   */
  public long getBasePosStartFP() {
    return basePosStartFP;
  }

  /**
   * @return The current base payload start file pointer: the payload start file
   * pointer taken from the first {@link TermState} written with payloads or offsets
   * while the base was still 0, or 0 if none has been captured since
   * {@link #resetBaseStartFP()}.
   */
  public long getBasePayStartFP() {
    return basePayStartFP;
  }

  /**
   * Serializes a {@link BlockTermState} into the provided {@link DataOutput}.
   * <p>
   * Simpler variant of {@link Lucene84PostingsWriter#encodeTerm(DataOutput, FieldInfo, BlockTermState, boolean)}.
   * <p>
   * Written in order: the doc freq; the total term freq minus the doc freq (only when
   * the field indexes freqs); either the singleton doc id or the doc start file pointer
   * delta; then, for fields with positions, the position start delta, the payload start
   * delta (only with payloads or offsets) and the last position block offset (only when
   * set); finally the skip offset (only when set).
   * <p>
   * Each base file pointer still equal to 0 is replaced by the corresponding file
   * pointer of the state being written, just before the delta is computed.
   *
   * @param termStatesOutput Output receiving the encoded state.
   * @param fieldInfo Field the term belongs to; its index options and payload flag
   * select which parts are written.
   * @param termState State to write; it must be an {@link IntBlockTermState}.
   */
  public void writeTermState(DataOutput termStatesOutput, FieldInfo fieldInfo, BlockTermState termState) throws IOException {
    final IndexOptions options = fieldInfo.getIndexOptions();
    final boolean withFreqs = options != IndexOptions.DOCS;
    final boolean withPositions = options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
    final boolean withOffsets = options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
    final boolean withPayloads = fieldInfo.hasPayloads();
    final IntBlockTermState state = (IntBlockTermState) termState;

    // Frequencies.
    termStatesOutput.writeVInt(state.docFreq);
    if (withFreqs) {
      assert state.totalTermFreq >= state.docFreq;
      termStatesOutput.writeVLong(state.totalTermFreq - state.docFreq);
    }

    // Doc pointer, unless the state carries a singleton doc id which is written instead.
    if (state.singletonDocID == -1) {
      if (baseDocStartFP == 0) {
        baseDocStartFP = state.docStartFP;
      }
      termStatesOutput.writeVLong(state.docStartFP - baseDocStartFP);
    } else {
      termStatesOutput.writeVInt(state.singletonDocID);
    }

    // Position related pointers.
    if (withPositions) {
      if (basePosStartFP == 0) {
        basePosStartFP = state.posStartFP;
      }
      termStatesOutput.writeVLong(state.posStartFP - basePosStartFP);

      if (withOffsets || withPayloads) {
        if (basePayStartFP == 0) {
          basePayStartFP = state.payStartFP;
        }
        termStatesOutput.writeVLong(state.payStartFP - basePayStartFP);
      }

      final long lastPosBlockOffset = state.lastPosBlockOffset;
      if (lastPosBlockOffset != -1) {
        termStatesOutput.writeVLong(lastPosBlockOffset);
      }
    }

    // Skip data offset.
    final long skipOffset = state.skipOffset;
    if (skipOffset != -1) {
      termStatesOutput.writeVLong(skipOffset);
    }
  }

  /**
   * Deserializes a {@link BlockTermState} from the provided {@link DataInput}.
   * <p>
   * Simpler variant of {@link Lucene84PostingsReader#decodeTerm(DataInput, FieldInfo, BlockTermState, boolean)}.
   * <p>
   * The values are consumed in the order {@link #writeTermState} emits them. Which
   * optional values are present is decided from the decoded frequencies: a singleton
   * doc id is read when the doc freq is 1, the last position block offset when the
   * total term freq exceeds {@code BLOCK_SIZE}, and the skip offset when the doc freq
   * exceeds {@code BLOCK_SIZE}.
   *
   * @param baseDocStartFP Base added to the decoded doc start delta.
   * @param basePosStartFP Base added to the decoded position start delta.
   * @param basePayStartFP Base added to the decoded payload start delta.
   * @param termStatesInput Input positioned at the start of an encoded state.
   * @param fieldInfo Field the term belongs to; its index options and payload flag
   * select which parts are read.
   * @param reuse {@link BlockTermState} to reuse; or null to create a new one.
   * @return The populated state; it is {@code reuse} (after {@link #reset}) when
   * {@code reuse} is not null.
   */
  public BlockTermState readTermState(long baseDocStartFP, long basePosStartFP, long basePayStartFP,
                                      DataInput termStatesInput, FieldInfo fieldInfo, BlockTermState reuse) throws IOException {
    final IndexOptions options = fieldInfo.getIndexOptions();
    final boolean withFreqs = options != IndexOptions.DOCS;
    final boolean withPositions = options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;

    final IntBlockTermState state;
    if (reuse == null) {
      state = new IntBlockTermState();
    } else {
      state = reset((IntBlockTermState) reuse);
    }

    // Frequencies.
    state.docFreq = termStatesInput.readVInt();
    if (withFreqs) {
      state.totalTermFreq = state.docFreq + termStatesInput.readVLong();
    } else {
      state.totalTermFreq = state.docFreq;
    }
    assert state.totalTermFreq >= state.docFreq;

    // Singleton doc id, or doc pointer.
    if (state.docFreq != 1) {
      state.docStartFP = baseDocStartFP + termStatesInput.readVLong();
    } else {
      state.singletonDocID = termStatesInput.readVInt();
    }

    // Position related pointers.
    if (withPositions) {
      state.posStartFP = basePosStartFP + termStatesInput.readVLong();
      // The payload flag is only consulted when the field does not index offsets.
      if (options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0
          || fieldInfo.hasPayloads()) {
        state.payStartFP = basePayStartFP + termStatesInput.readVLong();
      }
      if (state.totalTermFreq > BLOCK_SIZE) {
        state.lastPosBlockOffset = termStatesInput.readVLong();
      }
    }

    // Skip data offset.
    if (state.docFreq > BLOCK_SIZE) {
      state.skipOffset = termStatesInput.readVLong();
    }
    return state;
  }

  /**
   * Puts every field of the given state back to its cleared value so the instance
   * can be reused: counters, ordinals and file pointers become 0, while the skip
   * offset, the last position block offset and the singleton doc id become -1.
   *
   * @param termState State to clear.
   * @return The same {@code termState} instance.
   */
  protected IntBlockTermState reset(IntBlockTermState termState) {
    // Fields cleared to -1.
    termState.singletonDocID = -1;
    termState.lastPosBlockOffset = -1;
    termState.skipOffset = -1;
    // File pointers.
    termState.payStartFP = 0;
    termState.posStartFP = 0;
    termState.docStartFP = 0;
    termState.blockFilePointer = 0;
    // Counters and ordinals.
    termState.termBlockOrd = 0;
    termState.totalTermFreq = 0;
    termState.docFreq = 0;
    termState.ord = 0;
    return termState;
  }

  /**
   * @return The precomputed shallow size of a serializer instance.
   */
  @Override
  public long ramBytesUsed() {
    return RAM_USAGE;
  }

  /**
   * @return The estimated RAM usage of the given {@link TermState}: a precomputed
   * shallow size for an {@link IntBlockTermState}, otherwise the shallow size
   * measured on the given instance.
   */
  public static long ramBytesUsed(TermState termState) {
    if (termState instanceof IntBlockTermState) {
      return INT_BLOCK_TERM_STATE_RAM_USAGE;
    }
    return RamUsageEstimator.shallowSizeOf(termState);
  }
}