blob: 49352fd1c47da91c6ccc90572d2cdcad9ec18649 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.uniformsplit.sharedterms;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.uniformsplit.BlockHeader;
import org.apache.lucene.codecs.uniformsplit.BlockLine;
import org.apache.lucene.codecs.uniformsplit.DeltaBaseTermStateSerializer;
import org.apache.lucene.codecs.uniformsplit.FieldMetadata;
import org.apache.lucene.codecs.uniformsplit.TermBytes;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
/**
* Represents a term and its details stored in the {@link BlockTermState}. It is an extension of
* {@link BlockLine} for the Shared Terms format. This means the line contains a term and the
* {@link org.apache.lucene.index.TermState} of each of its fields.
*
* @lucene.experimental
*/
public class STBlockLine extends BlockLine {

  /** Fields and their TermStates for the term of this block line. Only used for writing. */
  protected final List<FieldMetadataTermState> termStates;

  /**
   * Creates a block line for a term shared by one or more fields.
   *
   * @param termBytes The term of this line.
   * @param termStates The per-field term states for the term; must not be empty. The list is
   *     copied, so later changes to the provided list are not reflected in this line.
   */
  public STBlockLine(TermBytes termBytes, List<FieldMetadataTermState> termStates) {
    // The parent's single term state slot is left null: the states live in the per-field list.
    super(termBytes, null);
    assert !termStates.isEmpty();
    this.termStates = new ArrayList<>(termStates);
  }

  /**
   * Adds to the provided collection the {@link FieldMetadata} of every field listed in this line,
   * in the order of the line.
   *
   * @param collector Receives the collected {@link FieldMetadata}.
   */
  public void collectFields(Collection<FieldMetadata> collector) {
    for (int i = 0, count = termStates.size(); i < count; i++) {
      collector.add(termStates.get(i).fieldMetadata);
    }
  }

  /**
   * Writes and reads the per-field term states of block lines in the Shared Terms format.
   *
   * <p>Encoding of the term states of a line:
   *
   * <ul>
   *   <li>one field: the negated field number as a ZInt, followed by the field term state;
   *   <li>several fields: the number of fields as a ZInt, followed by all the field numbers as
   *       VInts, followed by all the field term states in the same order.
   * </ul>
   *
   * <p>This class extends {@link org.apache.lucene.codecs.uniformsplit.BlockLine.Serializer}, so it
   * keeps a state of the previous term read to decode the next term.
   */
  public static class Serializer extends BlockLine.Serializer {

    /**
     * Writes all the {@link BlockTermState} of the provided {@link STBlockLine} to the given
     * output.
     *
     * @param termStatesOutput Output receiving the encoded field ids and term states.
     * @param line Block line whose field term states are written.
     * @param encoder Serializer used to write each individual term state.
     */
    public void writeLineTermStates(
        DataOutput termStatesOutput, STBlockLine line, DeltaBaseTermStateSerializer encoder)
        throws IOException {
      List<FieldMetadataTermState> lineStates = line.termStates;
      int size = lineStates.size();
      assert size > 0 : "not valid block line with :" + size + " lines.";

      if (size == 1) {
        // Single field: its id is written negated, which lets the reader tell this case apart
        // from a (positive) field count. The field TermState follows immediately.
        FieldMetadataTermState single = lineStates.get(0);
        termStatesOutput.writeZInt(-single.fieldMetadata.getFieldInfo().number);
        encoder.writeTermState(
            termStatesOutput, single.fieldMetadata.getFieldInfo(), single.state);
        return;
      }

      termStatesOutput.writeZInt(size);
      // First pass: all the field ids.
      for (FieldMetadataTermState entry : lineStates) {
        termStatesOutput.writeVInt(entry.fieldMetadata.getFieldInfo().number);
      }
      // Second pass: the TermState of each field, in the same order as the ids.
      for (FieldMetadataTermState entry : lineStates) {
        encoder.writeTermState(
            termStatesOutput, entry.fieldMetadata.getFieldInfo(), entry.state);
      }
    }

    /**
     * Reads a single {@link BlockTermState} for the provided field in the current block line of the
     * provided input.
     *
     * @param fieldId Id (field number) of the field to look for; must not be negative.
     * @param termStatesInput Data input to read the {@link BlockTermState} from.
     * @param termStateSerializer Serializer used to decode each individual term state.
     * @param blockHeader Current block header.
     * @param fieldInfos Used to resolve the field ids read from the input.
     * @param reuse A previous {@link BlockTermState} to reuse; or null to create a new one.
     * @return The {@link BlockTermState} corresponding to the provided field id; or null if the
     *     field does not occur in the line.
     */
    public BlockTermState readTermStateForField(
        int fieldId,
        DataInput termStatesInput,
        DeltaBaseTermStateSerializer termStateSerializer,
        BlockHeader blockHeader,
        FieldInfos fieldInfos,
        BlockTermState reuse)
        throws IOException {
      assert fieldId >= 0;
      int numFields = termStatesInput.readZInt();

      if (numFields <= 0) {
        // Single field in the line: the value read is its negated id.
        if (fieldId != -numFields) {
          return null;
        }
        return readState(
            termStateSerializer, blockHeader, termStatesInput, fieldInfos, fieldId, reuse);
      }

      // Several fields share the term. The field ids (aka field numbers) come first and have to
      // be read sequentially; the TermStates follow, also sequentially. There is no jump offset
      // to reach a given TermState directly, which is not needed currently.
      int[] lineFieldIds = new int[numFields];
      boolean found = false;
      for (int i = 0; i < numFields; i++) {
        int lineFieldId = termStatesInput.readVInt();
        if (lineFieldId == fieldId) {
          found = true;
        } else if (!found && lineFieldId > fieldId) {
          // The ids are sorted, so meeting a greater id before the seeked one means the seeked
          // field is absent. Once the seeked id is found, the rest of the list must still be
          // consumed to get to the term states part.
          return null;
        }
        lineFieldIds[i] = lineFieldId;
      }
      if (!found) {
        return null;
      }

      // Decode the term states in order until the one of the seeked field.
      for (int lineFieldId : lineFieldIds) {
        BlockTermState termState =
            readState(
                termStateSerializer, blockHeader, termStatesInput, fieldInfos, lineFieldId, reuse);
        if (lineFieldId == fieldId) {
          return termState;
        }
      }
      return null;
    }

    /**
     * Reads all the {@link BlockTermState} of all the fields in the current block line of the
     * provided input.
     *
     * @param termStatesInput Data input to read the term states from.
     * @param termStateSerializer Serializer used to decode each individual term state.
     * @param blockHeader Current block header.
     * @param fieldInfos Used to resolve the field ids read from the input.
     * @param fieldTermStatesMap Map filled with the term states for each field, keyed by field
     *     name. It is cleared first.
     * @see #readTermStateForField
     */
    public void readFieldTermStatesMap(
        DataInput termStatesInput,
        DeltaBaseTermStateSerializer termStateSerializer,
        BlockHeader blockHeader,
        FieldInfos fieldInfos,
        Map<String, BlockTermState> fieldTermStatesMap)
        throws IOException {
      fieldTermStatesMap.clear();
      int numFields = termStatesInput.readZInt();

      if (numFields <= 0) {
        // Single field in the line: the value read is its negated id.
        putFieldTermState(
            fieldTermStatesMap,
            termStateSerializer,
            blockHeader,
            termStatesInput,
            fieldInfos,
            -numFields);
        return;
      }

      // All the ids are read before any term state, matching the written layout.
      int[] lineFieldIds = readFieldIds(termStatesInput, numFields);
      for (int lineFieldId : lineFieldIds) {
        putFieldTermState(
            fieldTermStatesMap,
            termStateSerializer,
            blockHeader,
            termStatesInput,
            fieldInfos,
            lineFieldId);
      }
    }

    /**
     * Reads all the field ids in the current block line of the provided input.
     *
     * @param termStatesInput Data input positioned at the first field id.
     * @param numFields Number of field ids to read.
     * @return The field ids, in the order they were read.
     */
    public int[] readFieldIds(DataInput termStatesInput, int numFields) throws IOException {
      int[] fieldIds = new int[numFields];
      int next = 0;
      while (next < numFields) {
        fieldIds[next++] = termStatesInput.readVInt();
      }
      return fieldIds;
    }

    /** Resolves the field name, reads a new term state for the field and stores it in the map. */
    private static void putFieldTermState(
        Map<String, BlockTermState> fieldTermStatesMap,
        DeltaBaseTermStateSerializer termStateSerializer,
        BlockHeader blockHeader,
        DataInput termStatesInput,
        FieldInfos fieldInfos,
        int fieldId)
        throws IOException {
      // The name is resolved before anything is read from the input.
      String fieldName = fieldInfos.fieldInfo(fieldId).name;
      BlockTermState termState =
          readState(termStateSerializer, blockHeader, termStatesInput, fieldInfos, fieldId, null);
      fieldTermStatesMap.put(fieldName, termState);
    }

    /** Reads the next term state of the input, using the base file pointers of the block header. */
    private static BlockTermState readState(
        DeltaBaseTermStateSerializer termStateSerializer,
        BlockHeader blockHeader,
        DataInput termStatesInput,
        FieldInfos fieldInfos,
        int fieldId,
        BlockTermState reuse)
        throws IOException {
      return termStateSerializer.readTermState(
          blockHeader.getBaseDocsFP(),
          blockHeader.getBasePositionsFP(),
          blockHeader.getBasePayloadsFP(),
          termStatesInput,
          fieldInfos.fieldInfo(fieldId),
          reuse);
    }
  }
}