/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.backward_codecs.lucene50.compressing;
import java.io.Closeable;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.NoSuchElementException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.codecs.compressing.CompressionMode;
import org.apache.lucene.codecs.compressing.Decompressor;
import org.apache.lucene.index.BaseTermsEnum;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SlowImpactsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LongsRef;
import org.apache.lucene.util.packed.BlockPackedReaderIterator;
import org.apache.lucene.util.packed.PackedInts;
/**
* {@link TermVectorsReader} for {@link Lucene50CompressingTermVectorsFormat}.
*
* @lucene.experimental
*/
public final class Lucene50CompressingTermVectorsReader extends TermVectorsReader
implements Closeable {
// hard limit on the maximum number of documents per chunk
static final int MAX_DOCUMENTS_PER_CHUNK = 128;
static final String VECTORS_EXTENSION = "tvd";
static final String VECTORS_INDEX_EXTENSION = "tvx";
static final String VECTORS_META_EXTENSION = "tvm";
static final String VECTORS_INDEX_CODEC_NAME = "Lucene85TermVectorsIndex";
static final int VERSION_START = 1;
static final int VERSION_OFFHEAP_INDEX = 2;
/** Version where all metadata was moved to the meta file. */
static final int VERSION_META = 3;
static final int VERSION_CURRENT = VERSION_META;
static final int META_VERSION_START = 0;
static final int PACKED_BLOCK_SIZE = 64;
static final int POSITIONS = 0x01;
static final int OFFSETS = 0x02;
static final int PAYLOADS = 0x04;
static final int FLAGS_BITS = PackedInts.bitsRequired(POSITIONS | OFFSETS | PAYLOADS);
private final FieldInfos fieldInfos;
final FieldsIndex indexReader;
final IndexInput vectorsStream;
private final int version;
private final int packedIntsVersion;
private final CompressionMode compressionMode;
private final Decompressor decompressor;
private final int chunkSize;
private final int numDocs;
private boolean closed;
private final BlockPackedReaderIterator reader;
private final long numDirtyChunks; // number of incomplete compressed blocks written
private final long numDirtyDocs; // cumulative number of missing docs in incomplete chunks
private final long maxPointer; // end of the data section
// used by clone
private Lucene50CompressingTermVectorsReader(Lucene50CompressingTermVectorsReader reader) {
this.fieldInfos = reader.fieldInfos;
this.vectorsStream = reader.vectorsStream.clone();
this.indexReader = reader.indexReader.clone();
this.packedIntsVersion = reader.packedIntsVersion;
this.compressionMode = reader.compressionMode;
this.decompressor = reader.decompressor.clone();
this.chunkSize = reader.chunkSize;
this.numDocs = reader.numDocs;
this.reader =
new BlockPackedReaderIterator(vectorsStream, packedIntsVersion, PACKED_BLOCK_SIZE, 0);
this.version = reader.version;
this.numDirtyChunks = reader.numDirtyChunks;
this.numDirtyDocs = reader.numDirtyDocs;
this.maxPointer = reader.maxPointer;
this.closed = false;
}
/** Sole constructor. */
public Lucene50CompressingTermVectorsReader(
Directory d,
SegmentInfo si,
String segmentSuffix,
FieldInfos fn,
IOContext context,
String formatName,
CompressionMode compressionMode)
throws IOException {
this.compressionMode = compressionMode;
final String segment = si.name;
boolean success = false;
fieldInfos = fn;
numDocs = si.maxDoc();
ChecksumIndexInput metaIn = null;
try {
// Open the data file
final String vectorsStreamFN =
IndexFileNames.segmentFileName(segment, segmentSuffix, VECTORS_EXTENSION);
vectorsStream = d.openInput(vectorsStreamFN, context);
version =
CodecUtil.checkIndexHeader(
vectorsStream, formatName, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
assert CodecUtil.indexHeaderLength(formatName, segmentSuffix)
== vectorsStream.getFilePointer();
if (version >= VERSION_OFFHEAP_INDEX) {
final String metaStreamFN =
IndexFileNames.segmentFileName(segment, segmentSuffix, VECTORS_META_EXTENSION);
metaIn = d.openChecksumInput(metaStreamFN, IOContext.READONCE);
CodecUtil.checkIndexHeader(
metaIn,
VECTORS_INDEX_CODEC_NAME + "Meta",
META_VERSION_START,
version,
si.getId(),
segmentSuffix);
}
if (version >= VERSION_META) {
packedIntsVersion = metaIn.readVInt();
chunkSize = metaIn.readVInt();
} else {
packedIntsVersion = vectorsStream.readVInt();
chunkSize = vectorsStream.readVInt();
}
// NOTE: data file is too costly to verify checksum against all the bytes on open,
// but for now we at least verify proper structure of the checksum footer, which looks
// for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
// such as file truncation.
CodecUtil.retrieveChecksum(vectorsStream);
FieldsIndex indexReader = null;
long maxPointer = -1;
if (version < VERSION_OFFHEAP_INDEX) {
// Load the index into memory
final String indexName =
    IndexFileNames.segmentFileName(segment, segmentSuffix, VECTORS_INDEX_EXTENSION);
try (ChecksumIndexInput indexStream = d.openChecksumInput(indexName, context)) {
Throwable priorE = null;
try {
assert formatName.endsWith("Data");
final String codecNameIdx =
formatName.substring(0, formatName.length() - "Data".length()) + "Index";
final int version2 =
CodecUtil.checkIndexHeader(
indexStream,
codecNameIdx,
VERSION_START,
VERSION_CURRENT,
si.getId(),
segmentSuffix);
if (version != version2) {
throw new CorruptIndexException(
"Version mismatch between stored fields index and data: "
+ version
+ " != "
+ version2,
indexStream);
}
assert CodecUtil.indexHeaderLength(codecNameIdx, segmentSuffix)
== indexStream.getFilePointer();
indexReader = new LegacyFieldsIndexReader(indexStream, si);
maxPointer = indexStream.readVLong(); // the end of the data section
} catch (Throwable exception) {
priorE = exception;
} finally {
CodecUtil.checkFooter(indexStream, priorE);
}
}
} else {
FieldsIndexReader fieldsIndexReader =
new FieldsIndexReader(
d,
si.name,
segmentSuffix,
VECTORS_INDEX_EXTENSION,
VECTORS_INDEX_CODEC_NAME,
si.getId(),
metaIn);
indexReader = fieldsIndexReader;
maxPointer = fieldsIndexReader.getMaxPointer();
}
this.indexReader = indexReader;
this.maxPointer = maxPointer;
if (version >= VERSION_META) {
numDirtyChunks = metaIn.readVLong();
numDirtyDocs = metaIn.readVLong();
} else {
// Old versions of this format did not record numDirtyDocs. Since bulk
// merges are disabled on version increments anyway, we make no effort
// to get valid values of numDirtyChunks and numDirtyDocs.
numDirtyChunks = numDirtyDocs = -1;
}
decompressor = compressionMode.newDecompressor();
this.reader =
new BlockPackedReaderIterator(vectorsStream, packedIntsVersion, PACKED_BLOCK_SIZE, 0);
if (metaIn != null) {
CodecUtil.checkFooter(metaIn, null);
metaIn.close();
}
success = true;
} catch (Throwable t) {
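// checkFooter(metaIn, t) verifies the meta footer and rethrows t, with any
// corruption details attached as suppressed exceptions, so the AssertionError
// below should be unreachable.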
if (metaIn != null) {
CodecUtil.checkFooter(metaIn, t);
throw new AssertionError("unreachable");
} else {
throw t;
}
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(this, metaIn);
}
}
}
CompressionMode getCompressionMode() {
return compressionMode;
}
int getChunkSize() {
return chunkSize;
}
int getPackedIntsVersion() {
return packedIntsVersion;
}
int getVersion() {
return version;
}
FieldsIndex getIndexReader() {
return indexReader;
}
IndexInput getVectorsStream() {
return vectorsStream;
}
long getMaxPointer() {
return maxPointer;
}
long getNumDirtyDocs() {
if (version != VERSION_CURRENT) {
throw new IllegalStateException(
"getNumDirtyDocs should only ever get called when the reader is on the current version");
}
assert numDirtyDocs >= 0;
return numDirtyDocs;
}
long getNumDirtyChunks() {
if (version != VERSION_CURRENT) {
throw new IllegalStateException(
"getNumDirtyChunks should only ever get called when the reader is on the current version");
}
assert numDirtyChunks >= 0;
return numDirtyChunks;
}
int getNumDocs() {
return numDocs;
}
/** @throws AlreadyClosedException if this TermVectorsReader is closed */
private void ensureOpen() throws AlreadyClosedException {
if (closed) {
throw new AlreadyClosedException("this TermVectorsReader is closed");
}
}
@Override
public void close() throws IOException {
if (!closed) {
IOUtils.close(indexReader, vectorsStream);
closed = true;
}
}
@Override
public TermVectorsReader clone() {
return new Lucene50CompressingTermVectorsReader(this);
}
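// Decoding a document's term vectors walks the chunk in the same order it was
// written: chunk header (docBase, chunkDocs), per-doc field counts, distinct
// field numbers, per-field flags, term counts, prefix/suffix term lengths,
// term freqs, then positions/offsets/payload lengths, and finally the
// compressed bytes that hold the term suffixes and payloads.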
@Override
public Fields get(int doc) throws IOException {
ensureOpen();
// seek to the right place
{
final long startPointer = indexReader.getStartPointer(doc);
vectorsStream.seek(startPointer);
}
// decode
// - docBase: first doc ID of the chunk
// - chunkDocs: number of docs of the chunk
final int docBase = vectorsStream.readVInt();
final int chunkDocs = vectorsStream.readVInt();
if (doc < docBase || doc >= docBase + chunkDocs || docBase + chunkDocs > numDocs) {
throw new CorruptIndexException(
"docBase=" + docBase + ",chunkDocs=" + chunkDocs + ",doc=" + doc, vectorsStream);
}
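// Field counts: a single-doc chunk stores one vInt; otherwise the per-doc
// counts are block-packed, and we sum them to find how many fields precede
// our doc (skip) and how many fields the chunk holds in total.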
final int skip; // number of fields to skip
final int numFields; // number of fields of the document we're looking for
final int totalFields; // total number of fields of the chunk (sum for all docs)
if (chunkDocs == 1) {
skip = 0;
numFields = totalFields = vectorsStream.readVInt();
} else {
reader.reset(vectorsStream, chunkDocs);
int sum = 0;
for (int i = docBase; i < doc; ++i) {
sum += reader.next();
}
skip = sum;
numFields = (int) reader.next();
sum += numFields;
for (int i = doc + 1; i < docBase + chunkDocs; ++i) {
sum += reader.next();
}
totalFields = sum;
}
if (numFields == 0) {
// no vectors
return null;
}
// read field numbers that have term vectors
final int[] fieldNums;
{
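// The token packs two values: the low 5 bits give the number of bits per
// packed field number, and the high 3 bits give the number of distinct
// fields minus one, with 0x07 acting as an escape that adds a following
// vInt to the count.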
final int token = vectorsStream.readByte() & 0xFF;
assert token != 0; // 0 would mean no term vectors; impossible since we returned early when numFields == 0
final int bitsPerFieldNum = token & 0x1F;
int totalDistinctFields = token >>> 5;
if (totalDistinctFields == 0x07) {
totalDistinctFields += vectorsStream.readVInt();
}
++totalDistinctFields;
final PackedInts.ReaderIterator it =
PackedInts.getReaderIteratorNoHeader(
vectorsStream,
PackedInts.Format.PACKED,
packedIntsVersion,
totalDistinctFields,
bitsPerFieldNum,
1);
fieldNums = new int[totalDistinctFields];
for (int i = 0; i < totalDistinctFields; ++i) {
fieldNums[i] = (int) it.next();
}
}
// read field numbers and flags
final int[] fieldNumOffs = new int[numFields];
final PackedInts.Reader flags;
{
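// Flags come in two layouts, selected by a vInt: 0 means one flags entry per
// distinct field number (every occurrence of a field in the chunk shares the
// same flags), 1 means one entry per field occurrence.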
final int bitsPerOff = PackedInts.bitsRequired(fieldNums.length - 1);
final PackedInts.Reader allFieldNumOffs =
PackedInts.getReaderNoHeader(
vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, totalFields, bitsPerOff);
switch (vectorsStream.readVInt()) {
case 0:
final PackedInts.Reader fieldFlags =
PackedInts.getReaderNoHeader(
vectorsStream,
PackedInts.Format.PACKED,
packedIntsVersion,
fieldNums.length,
FLAGS_BITS);
PackedInts.Mutable f = PackedInts.getMutable(totalFields, FLAGS_BITS, PackedInts.COMPACT);
for (int i = 0; i < totalFields; ++i) {
final int fieldNumOff = (int) allFieldNumOffs.get(i);
assert fieldNumOff >= 0 && fieldNumOff < fieldNums.length;
final int fgs = (int) fieldFlags.get(fieldNumOff);
f.set(i, fgs);
}
flags = f;
break;
case 1:
flags =
PackedInts.getReaderNoHeader(
vectorsStream,
PackedInts.Format.PACKED,
packedIntsVersion,
totalFields,
FLAGS_BITS);
break;
default:
throw new AssertionError();
}
for (int i = 0; i < numFields; ++i) {
fieldNumOffs[i] = (int) allFieldNumOffs.get(skip + i);
}
}
// number of terms per field for all fields
final PackedInts.Reader numTerms;
final int totalTerms;
{
final int bitsRequired = vectorsStream.readVInt();
numTerms =
PackedInts.getReaderNoHeader(
vectorsStream,
PackedInts.Format.PACKED,
packedIntsVersion,
totalFields,
bitsRequired);
int sum = 0;
for (int i = 0; i < totalFields; ++i) {
sum += numTerms.get(i);
}
totalTerms = sum;
}
// term lengths
int docOff = 0, docLen = 0, totalLen;
final int[] fieldLengths = new int[numFields];
final int[][] prefixLengths = new int[numFields][];
final int[][] suffixLengths = new int[numFields][];
{
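// Terms are stored with shared-prefix compression: for each term we read the
// length of the prefix it shares with the previous term and the length of its
// suffix; the suffix bytes themselves live in the compressed blob decoded
// further down.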
reader.reset(vectorsStream, totalTerms);
// skip
int toSkip = 0;
for (int i = 0; i < skip; ++i) {
toSkip += numTerms.get(i);
}
reader.skip(toSkip);
// read prefix lengths
for (int i = 0; i < numFields; ++i) {
final int termCount = (int) numTerms.get(skip + i);
final int[] fieldPrefixLengths = new int[termCount];
prefixLengths[i] = fieldPrefixLengths;
for (int j = 0; j < termCount; ) {
final LongsRef next = reader.next(termCount - j);
for (int k = 0; k < next.length; ++k) {
fieldPrefixLengths[j++] = (int) next.longs[next.offset + k];
}
}
}
reader.skip(totalTerms - reader.ord());
reader.reset(vectorsStream, totalTerms);
// skip
toSkip = 0;
for (int i = 0; i < skip; ++i) {
for (int j = 0; j < numTerms.get(i); ++j) {
docOff += reader.next();
}
}
for (int i = 0; i < numFields; ++i) {
final int termCount = (int) numTerms.get(skip + i);
final int[] fieldSuffixLengths = new int[termCount];
suffixLengths[i] = fieldSuffixLengths;
for (int j = 0; j < termCount; ) {
final LongsRef next = reader.next(termCount - j);
for (int k = 0; k < next.length; ++k) {
fieldSuffixLengths[j++] = (int) next.longs[next.offset + k];
}
}
fieldLengths[i] = sum(suffixLengths[i]);
docLen += fieldLengths[i];
}
totalLen = docOff + docLen;
for (int i = skip + numFields; i < totalFields; ++i) {
for (int j = 0; j < numTerms.get(i); ++j) {
totalLen += reader.next();
}
}
}
// term freqs
final int[] termFreqs = new int[totalTerms];
{
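// Frequencies are stored minus one, since every indexed term occurs at least
// once; hence the 1 + ... below.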
reader.reset(vectorsStream, totalTerms);
for (int i = 0; i < totalTerms; ) {
final LongsRef next = reader.next(totalTerms - i);
for (int k = 0; k < next.length; ++k) {
termFreqs[i++] = 1 + (int) next.longs[next.offset + k];
}
}
}
// total number of positions, offsets and payloads
int totalPositions = 0, totalOffsets = 0, totalPayloads = 0;
for (int i = 0, termIndex = 0; i < totalFields; ++i) {
final int f = (int) flags.get(i);
final int termCount = (int) numTerms.get(i);
for (int j = 0; j < termCount; ++j) {
final int freq = termFreqs[termIndex++];
if ((f & POSITIONS) != 0) {
totalPositions += freq;
}
if ((f & OFFSETS) != 0) {
totalOffsets += freq;
}
if ((f & PAYLOADS) != 0) {
totalPayloads += freq;
}
}
assert i != totalFields - 1 || termIndex == totalTerms : termIndex + " " + totalTerms;
}
final int[][] positionIndex = positionIndex(skip, numFields, numTerms, termFreqs);
final int[][] positions, startOffsets, lengths;
if (totalPositions > 0) {
positions =
readPositions(
skip,
numFields,
flags,
numTerms,
termFreqs,
POSITIONS,
totalPositions,
positionIndex);
} else {
positions = new int[numFields][];
}
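// Start offsets are stored as deltas against position * (average chars per
// term of the field), occurrences after the first are additionally
// delta-coded, and lengths omit the term length itself; decoding reverses
// each of those steps below.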
if (totalOffsets > 0) {
// average number of chars per term
final float[] charsPerTerm = new float[fieldNums.length];
for (int i = 0; i < charsPerTerm.length; ++i) {
charsPerTerm[i] = Float.intBitsToFloat(vectorsStream.readInt());
}
startOffsets =
readPositions(
skip, numFields, flags, numTerms, termFreqs, OFFSETS, totalOffsets, positionIndex);
lengths =
readPositions(
skip, numFields, flags, numTerms, termFreqs, OFFSETS, totalOffsets, positionIndex);
for (int i = 0; i < numFields; ++i) {
final int[] fStartOffsets = startOffsets[i];
final int[] fPositions = positions[i];
// patch offsets from positions
if (fStartOffsets != null && fPositions != null) {
final float fieldCharsPerTerm = charsPerTerm[fieldNumOffs[i]];
for (int j = 0; j < startOffsets[i].length; ++j) {
fStartOffsets[j] += (int) (fieldCharsPerTerm * fPositions[j]);
}
}
if (fStartOffsets != null) {
final int[] fPrefixLengths = prefixLengths[i];
final int[] fSuffixLengths = suffixLengths[i];
final int[] fLengths = lengths[i];
for (int j = 0, end = (int) numTerms.get(skip + i); j < end; ++j) {
// delta-decode start offsets and patch lengths using term lengths
final int termLength = fPrefixLengths[j] + fSuffixLengths[j];
lengths[i][positionIndex[i][j]] += termLength;
for (int k = positionIndex[i][j] + 1; k < positionIndex[i][j + 1]; ++k) {
fStartOffsets[k] += fStartOffsets[k - 1];
fLengths[k] += termLength;
}
}
}
}
} else {
startOffsets = lengths = new int[numFields][];
}
if (totalPositions > 0) {
// delta-decode positions
for (int i = 0; i < numFields; ++i) {
final int[] fPositions = positions[i];
final int[] fpositionIndex = positionIndex[i];
if (fPositions != null) {
for (int j = 0, end = (int) numTerms.get(skip + i); j < end; ++j) {
// delta-decode the positions of this term's occurrences
for (int k = fpositionIndex[j] + 1; k < fpositionIndex[j + 1]; ++k) {
fPositions[k] += fPositions[k - 1];
}
}
}
}
}
// payload lengths
final int[][] payloadIndex = new int[numFields][];
int totalPayloadLength = 0;
int payloadOff = 0;
int payloadLen = 0;
if (totalPayloads > 0) {
reader.reset(vectorsStream, totalPayloads);
// skip
int termIndex = 0;
for (int i = 0; i < skip; ++i) {
final int f = (int) flags.get(i);
final int termCount = (int) numTerms.get(i);
if ((f & PAYLOADS) != 0) {
for (int j = 0; j < termCount; ++j) {
final int freq = termFreqs[termIndex + j];
for (int k = 0; k < freq; ++k) {
final int l = (int) reader.next();
payloadOff += l;
}
}
}
termIndex += termCount;
}
totalPayloadLength = payloadOff;
// read doc payload lengths
for (int i = 0; i < numFields; ++i) {
final int f = (int) flags.get(skip + i);
final int termCount = (int) numTerms.get(skip + i);
if ((f & PAYLOADS) != 0) {
final int totalFreq = positionIndex[i][termCount];
payloadIndex[i] = new int[totalFreq + 1];
int posIdx = 0;
payloadIndex[i][posIdx] = payloadLen;
for (int j = 0; j < termCount; ++j) {
final int freq = termFreqs[termIndex + j];
for (int k = 0; k < freq; ++k) {
final int payloadLength = (int) reader.next();
payloadLen += payloadLength;
payloadIndex[i][posIdx + 1] = payloadLen;
++posIdx;
}
}
assert posIdx == totalFreq;
}
termIndex += termCount;
}
totalPayloadLength += payloadLen;
for (int i = skip + numFields; i < totalFields; ++i) {
final int f = (int) flags.get(i);
final int termCount = (int) numTerms.get(i);
if ((f & PAYLOADS) != 0) {
for (int j = 0; j < termCount; ++j) {
final int freq = termFreqs[termIndex + j];
for (int k = 0; k < freq; ++k) {
totalPayloadLength += reader.next();
}
}
}
termIndex += termCount;
}
assert termIndex == totalTerms : termIndex + " " + totalTerms;
}
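// Only the slice of the chunk that belongs to this document is materialized:
// the decompressor is told the full chunk length but asked for docLen +
// payloadLen bytes starting at docOff + payloadOff.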
// decompress data
final BytesRef suffixBytes = new BytesRef();
decompressor.decompress(
vectorsStream,
totalLen + totalPayloadLength,
docOff + payloadOff,
docLen + payloadLen,
suffixBytes);
suffixBytes.length = docLen;
final BytesRef payloadBytes =
new BytesRef(suffixBytes.bytes, suffixBytes.offset + docLen, payloadLen);
final int[] fieldFlags = new int[numFields];
for (int i = 0; i < numFields; ++i) {
fieldFlags[i] = (int) flags.get(skip + i);
}
final int[] fieldNumTerms = new int[numFields];
for (int i = 0; i < numFields; ++i) {
fieldNumTerms[i] = (int) numTerms.get(skip + i);
}
final int[][] fieldTermFreqs = new int[numFields][];
{
int termIdx = 0;
for (int i = 0; i < skip; ++i) {
termIdx += numTerms.get(i);
}
for (int i = 0; i < numFields; ++i) {
final int termCount = (int) numTerms.get(skip + i);
fieldTermFreqs[i] = new int[termCount];
for (int j = 0; j < termCount; ++j) {
fieldTermFreqs[i][j] = termFreqs[termIdx++];
}
}
}
assert sum(fieldLengths) == docLen : sum(fieldLengths) + " != " + docLen;
return new TVFields(
fieldNums,
fieldFlags,
fieldNumOffs,
fieldNumTerms,
fieldLengths,
prefixLengths,
suffixLengths,
fieldTermFreqs,
positionIndex,
positions,
startOffsets,
lengths,
payloadBytes,
payloadIndex,
suffixBytes);
}
// field -> term index -> position index
private int[][] positionIndex(
int skip, int numFields, PackedInts.Reader numTerms, int[] termFreqs) {
final int[][] positionIndex = new int[numFields][];
int termIndex = 0;
for (int i = 0; i < skip; ++i) {
final int termCount = (int) numTerms.get(i);
termIndex += termCount;
}
for (int i = 0; i < numFields; ++i) {
final int termCount = (int) numTerms.get(skip + i);
positionIndex[i] = new int[termCount + 1];
for (int j = 0; j < termCount; ++j) {
final int freq = termFreqs[termIndex + j];
positionIndex[i][j + 1] = positionIndex[i][j] + freq;
}
termIndex += termCount;
}
return positionIndex;
}
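// Reads the block-packed position-like data (positions, start offsets or
// lengths, depending on the flag) for the fields of the target document,
// skipping over the data that belongs to earlier docs in the chunk.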
private int[][] readPositions(
int skip,
int numFields,
PackedInts.Reader flags,
PackedInts.Reader numTerms,
int[] termFreqs,
int flag,
final int totalPositions,
int[][] positionIndex)
throws IOException {
final int[][] positions = new int[numFields][];
reader.reset(vectorsStream, totalPositions);
// skip
int toSkip = 0;
int termIndex = 0;
for (int i = 0; i < skip; ++i) {
final int f = (int) flags.get(i);
final int termCount = (int) numTerms.get(i);
if ((f & flag) != 0) {
for (int j = 0; j < termCount; ++j) {
final int freq = termFreqs[termIndex + j];
toSkip += freq;
}
}
termIndex += termCount;
}
reader.skip(toSkip);
// read doc positions
for (int i = 0; i < numFields; ++i) {
final int f = (int) flags.get(skip + i);
final int termCount = (int) numTerms.get(skip + i);
if ((f & flag) != 0) {
final int totalFreq = positionIndex[i][termCount];
final int[] fieldPositions = new int[totalFreq];
positions[i] = fieldPositions;
for (int j = 0; j < totalFreq; ) {
final LongsRef nextPositions = reader.next(totalFreq - j);
for (int k = 0; k < nextPositions.length; ++k) {
fieldPositions[j++] = (int) nextPositions.longs[nextPositions.offset + k];
}
}
}
termIndex += termCount;
}
reader.skip(totalPositions - reader.ord());
return positions;
}
private class TVFields extends Fields {
private final int[] fieldNums, fieldFlags, fieldNumOffs, numTerms, fieldLengths;
private final int[][] prefixLengths,
suffixLengths,
termFreqs,
positionIndex,
positions,
startOffsets,
lengths,
payloadIndex;
private final BytesRef suffixBytes, payloadBytes;
public TVFields(
int[] fieldNums,
int[] fieldFlags,
int[] fieldNumOffs,
int[] numTerms,
int[] fieldLengths,
int[][] prefixLengths,
int[][] suffixLengths,
int[][] termFreqs,
int[][] positionIndex,
int[][] positions,
int[][] startOffsets,
int[][] lengths,
BytesRef payloadBytes,
int[][] payloadIndex,
BytesRef suffixBytes) {
this.fieldNums = fieldNums;
this.fieldFlags = fieldFlags;
this.fieldNumOffs = fieldNumOffs;
this.numTerms = numTerms;
this.fieldLengths = fieldLengths;
this.prefixLengths = prefixLengths;
this.suffixLengths = suffixLengths;
this.termFreqs = termFreqs;
this.positionIndex = positionIndex;
this.positions = positions;
this.startOffsets = startOffsets;
this.lengths = lengths;
this.payloadBytes = payloadBytes;
this.payloadIndex = payloadIndex;
this.suffixBytes = suffixBytes;
}
@Override
public Iterator<String> iterator() {
return new Iterator<String>() {
int i = 0;
@Override
public boolean hasNext() {
return i < fieldNumOffs.length;
}
@Override
public String next() {
if (!hasNext()) {
throw new NoSuchElementException();
}
final int fieldNum = fieldNums[fieldNumOffs[i++]];
return fieldInfos.fieldInfo(fieldNum).name;
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
};
}
@Override
public Terms terms(String field) throws IOException {
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
if (fieldInfo == null) {
return null;
}
int idx = -1;
for (int i = 0; i < fieldNumOffs.length; ++i) {
if (fieldNums[fieldNumOffs[i]] == fieldInfo.number) {
idx = i;
break;
}
}
if (idx == -1 || numTerms[idx] == 0) {
// no term
return null;
}
int fieldOff = 0, fieldLen = -1;
for (int i = 0; i < fieldNumOffs.length; ++i) {
if (i < idx) {
fieldOff += fieldLengths[i];
} else {
fieldLen = fieldLengths[i];
break;
}
}
assert fieldLen >= 0;
return new TVTerms(
numTerms[idx],
fieldFlags[idx],
prefixLengths[idx],
suffixLengths[idx],
termFreqs[idx],
positionIndex[idx],
positions[idx],
startOffsets[idx],
lengths[idx],
payloadIndex[idx],
payloadBytes,
new BytesRef(suffixBytes.bytes, suffixBytes.offset + fieldOff, fieldLen));
}
@Override
public int size() {
return fieldNumOffs.length;
}
}
private static class TVTerms extends Terms {
private final int numTerms, flags;
private final long totalTermFreq;
private final int[] prefixLengths,
suffixLengths,
termFreqs,
positionIndex,
positions,
startOffsets,
lengths,
payloadIndex;
private final BytesRef termBytes, payloadBytes;
TVTerms(
int numTerms,
int flags,
int[] prefixLengths,
int[] suffixLengths,
int[] termFreqs,
int[] positionIndex,
int[] positions,
int[] startOffsets,
int[] lengths,
int[] payloadIndex,
BytesRef payloadBytes,
BytesRef termBytes) {
this.numTerms = numTerms;
this.flags = flags;
this.prefixLengths = prefixLengths;
this.suffixLengths = suffixLengths;
this.termFreqs = termFreqs;
this.positionIndex = positionIndex;
this.positions = positions;
this.startOffsets = startOffsets;
this.lengths = lengths;
this.payloadIndex = payloadIndex;
this.payloadBytes = payloadBytes;
this.termBytes = termBytes;
long ttf = 0;
for (int tf : termFreqs) {
ttf += tf;
}
this.totalTermFreq = ttf;
}
@Override
public TermsEnum iterator() throws IOException {
TVTermsEnum termsEnum = new TVTermsEnum();
termsEnum.reset(
numTerms,
flags,
prefixLengths,
suffixLengths,
termFreqs,
positionIndex,
positions,
startOffsets,
lengths,
payloadIndex,
payloadBytes,
new ByteArrayDataInput(termBytes.bytes, termBytes.offset, termBytes.length));
return termsEnum;
}
@Override
public long size() throws IOException {
return numTerms;
}
@Override
public long getSumTotalTermFreq() throws IOException {
return totalTermFreq;
}
@Override
public long getSumDocFreq() throws IOException {
return numTerms;
}
@Override
public int getDocCount() throws IOException {
return 1;
}
@Override
public boolean hasFreqs() {
return true;
}
@Override
public boolean hasOffsets() {
return (flags & OFFSETS) != 0;
}
@Override
public boolean hasPositions() {
return (flags & POSITIONS) != 0;
}
@Override
public boolean hasPayloads() {
return (flags & PAYLOADS) != 0;
}
}
private static class TVTermsEnum extends BaseTermsEnum {
private int numTerms, startPos, ord;
private int[] prefixLengths,
suffixLengths,
termFreqs,
positionIndex,
positions,
startOffsets,
lengths,
payloadIndex;
private ByteArrayDataInput in;
private BytesRef payloads;
private final BytesRef term;
private TVTermsEnum() {
term = new BytesRef(16);
}
void reset(
int numTerms,
int flags,
int[] prefixLengths,
int[] suffixLengths,
int[] termFreqs,
int[] positionIndex,
int[] positions,
int[] startOffsets,
int[] lengths,
int[] payloadIndex,
BytesRef payloads,
ByteArrayDataInput in) {
this.numTerms = numTerms;
this.prefixLengths = prefixLengths;
this.suffixLengths = suffixLengths;
this.termFreqs = termFreqs;
this.positionIndex = positionIndex;
this.positions = positions;
this.startOffsets = startOffsets;
this.lengths = lengths;
this.payloadIndex = payloadIndex;
this.payloads = payloads;
this.in = in;
startPos = in.getPosition();
reset();
}
void reset() {
term.length = 0;
in.setPosition(startPos);
ord = -1;
}
@Override
public BytesRef next() throws IOException {
if (ord == numTerms - 1) {
return null;
} else {
assert ord < numTerms;
++ord;
}
// read term
term.offset = 0;
term.length = prefixLengths[ord] + suffixLengths[ord];
if (term.length > term.bytes.length) {
term.bytes = ArrayUtil.grow(term.bytes, term.length);
}
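// the first prefixLengths[ord] bytes are shared with the previous term and
// already sit in the buffer; only the suffix is read from the stream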
in.readBytes(term.bytes, prefixLengths[ord], suffixLengths[ord]);
return term;
}
@Override
public SeekStatus seekCeil(BytesRef text) throws IOException {
if (ord < numTerms && ord >= 0) {
final int cmp = term().compareTo(text);
if (cmp == 0) {
return SeekStatus.FOUND;
} else if (cmp > 0) {
reset();
}
}
// linear scan
while (true) {
final BytesRef term = next();
if (term == null) {
return SeekStatus.END;
}
final int cmp = term.compareTo(text);
if (cmp > 0) {
return SeekStatus.NOT_FOUND;
} else if (cmp == 0) {
return SeekStatus.FOUND;
}
}
}
@Override
public void seekExact(long ord) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public BytesRef term() throws IOException {
return term;
}
@Override
public long ord() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public int docFreq() throws IOException {
return 1;
}
@Override
public long totalTermFreq() throws IOException {
return termFreqs[ord];
}
@Override
public final PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
final TVPostingsEnum docsEnum;
if (reuse instanceof TVPostingsEnum) {
docsEnum = (TVPostingsEnum) reuse;
} else {
docsEnum = new TVPostingsEnum();
}
docsEnum.reset(
termFreqs[ord],
positionIndex[ord],
positions,
startOffsets,
lengths,
payloads,
payloadIndex);
return docsEnum;
}
@Override
public ImpactsEnum impacts(int flags) throws IOException {
final PostingsEnum delegate = postings(null, PostingsEnum.FREQS);
return new SlowImpactsEnum(delegate);
}
}
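// Term vectors describe a single document, so this postings enum exposes
// exactly one doc: docID 0.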
private static class TVPostingsEnum extends PostingsEnum {
private int doc = -1;
private int termFreq;
private int positionIndex;
private int[] positions;
private int[] startOffsets;
private int[] lengths;
private final BytesRef payload;
private int[] payloadIndex;
private int basePayloadOffset;
private int i;
TVPostingsEnum() {
payload = new BytesRef();
}
public void reset(
int freq,
int positionIndex,
int[] positions,
int[] startOffsets,
int[] lengths,
BytesRef payloads,
int[] payloadIndex) {
this.termFreq = freq;
this.positionIndex = positionIndex;
this.positions = positions;
this.startOffsets = startOffsets;
this.lengths = lengths;
this.basePayloadOffset = payloads.offset;
this.payload.bytes = payloads.bytes;
payload.offset = payload.length = 0;
this.payloadIndex = payloadIndex;
doc = i = -1;
}
private void checkDoc() {
if (doc == NO_MORE_DOCS) {
throw new IllegalStateException("DocsEnum exhausted");
} else if (doc == -1) {
throw new IllegalStateException("DocsEnum not started");
}
}
private void checkPosition() {
checkDoc();
if (i < 0) {
throw new IllegalStateException("Position enum not started");
} else if (i >= termFreq) {
throw new IllegalStateException("Read past last position");
}
}
@Override
public int nextPosition() throws IOException {
if (doc != 0) {
throw new IllegalStateException();
} else if (i >= termFreq - 1) {
throw new IllegalStateException("Read past last position");
}
++i;
if (payloadIndex != null) {
payload.offset = basePayloadOffset + payloadIndex[positionIndex + i];
payload.length = payloadIndex[positionIndex + i + 1] - payloadIndex[positionIndex + i];
}
if (positions == null) {
return -1;
} else {
return positions[positionIndex + i];
}
}
@Override
public int startOffset() throws IOException {
checkPosition();
if (startOffsets == null) {
return -1;
} else {
return startOffsets[positionIndex + i];
}
}
@Override
public int endOffset() throws IOException {
checkPosition();
if (startOffsets == null) {
return -1;
} else {
return startOffsets[positionIndex + i] + lengths[positionIndex + i];
}
}
@Override
public BytesRef getPayload() throws IOException {
checkPosition();
if (payloadIndex == null || payload.length == 0) {
return null;
} else {
return payload;
}
}
@Override
public int freq() throws IOException {
checkDoc();
return termFreq;
}
@Override
public int docID() {
return doc;
}
@Override
public int nextDoc() throws IOException {
if (doc == -1) {
return (doc = 0);
} else {
return (doc = NO_MORE_DOCS);
}
}
@Override
public int advance(int target) throws IOException {
return slowAdvance(target);
}
@Override
public long cost() {
return 1;
}
}
private static int sum(int[] arr) {
int sum = 0;
for (int el : arr) {
sum += el;
}
return sum;
}
@Override
public long ramBytesUsed() {
return indexReader.ramBytesUsed();
}
@Override
public Collection<Accountable> getChildResources() {
return Collections.singleton(Accountables.namedAccountable("term vector index", indexReader));
}
@Override
public void checkIntegrity() throws IOException {
indexReader.checkIntegrity();
CodecUtil.checksumEntireFile(vectorsStream);
}
@Override
public String toString() {
return getClass().getSimpleName()
+ "(mode="
+ compressionMode
+ ",chunksize="
+ chunkSize
+ ")";
}
}