blob: 0d555a59a102a784e167293a38dec71bb97c8862 [file] [log] [blame]
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.compressing;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.NoSuchElementException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.index.BaseTermsEnum;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.SlowImpactsEnum;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LongsRef;
import org.apache.lucene.util.packed.BlockPackedReaderIterator;
import org.apache.lucene.util.packed.PackedInts;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VERSION_OFFHEAP_INDEX;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.FLAGS_BITS;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.META_VERSION_START;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.OFFSETS;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.PACKED_BLOCK_SIZE;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.PAYLOADS;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.POSITIONS;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VECTORS_EXTENSION;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VECTORS_INDEX_CODEC_NAME;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VECTORS_INDEX_EXTENSION;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VECTORS_META_EXTENSION;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VERSION_CURRENT;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VERSION_META;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VERSION_START;
/**
 * {@link TermVectorsReader} for {@link CompressingTermVectorsFormat}.
 *
 * @lucene.experimental
 */
public final class CompressingTermVectorsReader extends TermVectorsReader implements Closeable {
// Field metadata of the segment; used to translate between field numbers and field names.
private final FieldInfos fieldInfos;
// Chunk index: gives the start pointer in the data file for a document.
final FieldsIndex indexReader;
// Input over the term vectors data file.
final IndexInput vectorsStream;
// Format version read from the data file header.
private final int version;
// PackedInts version, read from the meta file or (older formats) from the data file.
private final int packedIntsVersion;
// Compression mode supplied by the caller; the decompressor is created from it.
private final CompressionMode compressionMode;
private final Decompressor decompressor;
// Chunk size, read from the meta file or (older formats) from the data file.
private final int chunkSize;
// maxDoc of the segment; chunks must not extend past it.
private final int numDocs;
// Set by close(); an instance created by the clone constructor starts open.
private boolean closed;
// Block-packed decoder over vectorsStream; reset before each section it decodes.
private final BlockPackedReaderIterator reader;
private final long numDirtyChunks; // number of incomplete compressed blocks written
private final long numDirtyDocs; // cumulative number of missing docs in incomplete chunks
private final long maxPointer; // end of the data section
/**
 * Copy constructor backing {@link #clone()}.
 *
 * <p>Plain values are shared with {@code reader}; the data input, the chunk index and the
 * decompressor are cloned, and a fresh block-packed iterator is created over the cloned input.
 */
private CompressingTermVectorsReader(CompressingTermVectorsReader reader) {
  // per-clone state (clone order kept: data input, index, decompressor)
  this.vectorsStream = reader.vectorsStream.clone();
  this.indexReader = reader.indexReader.clone();
  this.decompressor = reader.decompressor.clone();

  // values copied as-is from the source reader
  this.fieldInfos = reader.fieldInfos;
  this.compressionMode = reader.compressionMode;
  this.packedIntsVersion = reader.packedIntsVersion;
  this.version = reader.version;
  this.chunkSize = reader.chunkSize;
  this.numDocs = reader.numDocs;
  this.numDirtyChunks = reader.numDirtyChunks;
  this.numDirtyDocs = reader.numDirtyDocs;
  this.maxPointer = reader.maxPointer;

  // the iterator must read from this instance's own input
  this.reader = new BlockPackedReaderIterator(this.vectorsStream, this.packedIntsVersion, PACKED_BLOCK_SIZE, 0);
  this.closed = false;
}
/**
 * Sole constructor. Opens the term vectors data file of a segment, validates the file headers,
 * reads the format parameters and loads the chunk index (legacy in-memory index for versions
 * before {@code VERSION_OFFHEAP_INDEX}, {@link FieldsIndexReader} otherwise).
 *
 * @param d directory the segment files are opened from
 * @param si segment info; its name, id and maxDoc are used
 * @param segmentSuffix suffix used to build the segment file names
 * @param fn field infos kept for resolving field numbers and names
 * @param context IO context used to open the data file and the legacy index file
 * @param formatName codec name expected in the data file header
 * @param compressionMode compression mode used to create the decompressor
 * @throws IOException if a file cannot be read or fails header/footer validation
 */
public CompressingTermVectorsReader(Directory d, SegmentInfo si, String segmentSuffix, FieldInfos fn,
    IOContext context, String formatName, CompressionMode compressionMode) throws IOException {
  this.compressionMode = compressionMode;
  final String segment = si.name;
  boolean success = false;
  fieldInfos = fn;
  numDocs = si.maxDoc();

  ChecksumIndexInput metaIn = null;
  try {
    // Open the data file
    final String vectorsStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, VECTORS_EXTENSION);
    vectorsStream = d.openInput(vectorsStreamFN, context);
    version = CodecUtil.checkIndexHeader(vectorsStream, formatName, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
    assert CodecUtil.indexHeaderLength(formatName, segmentSuffix) == vectorsStream.getFilePointer();

    if (version >= VERSION_OFFHEAP_INDEX) {
      final String metaStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, VECTORS_META_EXTENSION);
      metaIn = d.openChecksumInput(metaStreamFN, IOContext.READONCE);
      CodecUtil.checkIndexHeader(metaIn, VECTORS_INDEX_CODEC_NAME + "Meta", META_VERSION_START, version, si.getId(), segmentSuffix);
    }

    if (version >= VERSION_META) {
      packedIntsVersion = metaIn.readVInt();
      chunkSize = metaIn.readVInt();
    } else {
      packedIntsVersion = vectorsStream.readVInt();
      chunkSize = vectorsStream.readVInt();
    }

    // NOTE: data file is too costly to verify checksum against all the bytes on open,
    // but for now we at least verify proper structure of the checksum footer: which looks
    // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
    // such as file truncation.
    CodecUtil.retrieveChecksum(vectorsStream);

    FieldsIndex indexReader = null;
    long maxPointer = -1;

    if (version < VERSION_OFFHEAP_INDEX) {
      // Load the index into memory
      final String indexName = IndexFileNames.segmentFileName(segment, segmentSuffix, "tvx");
      try (ChecksumIndexInput indexStream = d.openChecksumInput(indexName, context)) {
        Throwable priorE = null;
        try {
          assert formatName.endsWith("Data");
          final String codecNameIdx = formatName.substring(0, formatName.length() - "Data".length()) + "Index";
          final int version2 = CodecUtil.checkIndexHeader(indexStream, codecNameIdx, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
          if (version != version2) {
            throw new CorruptIndexException("Version mismatch between term vectors index and data: " + version + " != " + version2, indexStream);
          }
          assert CodecUtil.indexHeaderLength(codecNameIdx, segmentSuffix) == indexStream.getFilePointer();
          indexReader = new LegacyFieldsIndexReader(indexStream, si);
          maxPointer = indexStream.readVLong(); // the end of the data section
        } catch (Throwable exception) {
          priorE = exception;
        } finally {
          // validates the footer and rethrows priorE (if any) with checksum information
          CodecUtil.checkFooter(indexStream, priorE);
        }
      }
    } else {
      FieldsIndexReader fieldsIndexReader = new FieldsIndexReader(d, si.name, segmentSuffix, VECTORS_INDEX_EXTENSION, VECTORS_INDEX_CODEC_NAME, si.getId(), metaIn);
      indexReader = fieldsIndexReader;
      maxPointer = fieldsIndexReader.getMaxPointer();
    }

    this.indexReader = indexReader;
    this.maxPointer = maxPointer;

    if (version >= VERSION_META) {
      numDirtyChunks = metaIn.readVLong();
      numDirtyDocs = metaIn.readVLong();
    } else {
      // Old versions of this format did not record numDirtyDocs. Since bulk
      // merges are disabled on version increments anyway, we make no effort
      // to get valid values of numDirtyChunks and numDirtyDocs.
      numDirtyChunks = numDirtyDocs = -1;
    }

    decompressor = compressionMode.newDecompressor();
    this.reader = new BlockPackedReaderIterator(vectorsStream, packedIntsVersion, PACKED_BLOCK_SIZE, 0);

    if (metaIn != null) {
      CodecUtil.checkFooter(metaIn, null);
      // the meta file is only needed while opening; release it on the success path too
      metaIn.close();
    }

    success = true;
  } catch (Throwable t) {
    if (metaIn != null) {
      // checkFooter rethrows t, enriched with checksum information
      CodecUtil.checkFooter(metaIn, t);
      throw new AssertionError("unreachable");
    } else {
      throw t;
    }
  } finally {
    if (!success) {
      IOUtils.closeWhileHandlingException(this, metaIn);
    }
  }
}
CompressionMode getCompressionMode() {
return compressionMode;
int getChunkSize() {
return chunkSize;
int getPackedIntsVersion() {
return packedIntsVersion;
int getVersion() {
return version;
FieldsIndex getIndexReader() {
return indexReader;
IndexInput getVectorsStream() {
return vectorsStream;
long getMaxPointer() {
return maxPointer;
long getNumDirtyDocs() {
if (version != VERSION_CURRENT) {
throw new IllegalStateException("getNumDirtyDocs should only ever get called when the reader is on the current version");
assert numDirtyDocs >= 0;
return numDirtyDocs;
long getNumDirtyChunks() {
if (version != VERSION_CURRENT) {
throw new IllegalStateException("getNumDirtyChunks should only ever get called when the reader is on the current version");
assert numDirtyChunks >= 0;
return numDirtyChunks;
int getNumDocs() {
return numDocs;
* @throws AlreadyClosedException if this TermVectorsReader is closed
private void ensureOpen() throws AlreadyClosedException {
if (closed) {
throw new AlreadyClosedException("this FieldsReader is closed");
public void close() throws IOException {
if (!closed) {
IOUtils.close(indexReader, vectorsStream);
closed = true;
public TermVectorsReader clone() {
return new CompressingTermVectorsReader(this);
public Fields get(int doc) throws IOException {
// seek to the right place
final long startPointer = indexReader.getStartPointer(doc);;
// decode
// - docBase: first doc ID of the chunk
// - chunkDocs: number of docs of the chunk
final int docBase = vectorsStream.readVInt();
final int chunkDocs = vectorsStream.readVInt();
if (doc < docBase || doc >= docBase + chunkDocs || docBase + chunkDocs > numDocs) {
throw new CorruptIndexException("docBase=" + docBase + ",chunkDocs=" + chunkDocs + ",doc=" + doc, vectorsStream);
final int skip; // number of fields to skip
final int numFields; // number of fields of the document we're looking for
final int totalFields; // total number of fields of the chunk (sum for all docs)
if (chunkDocs == 1) {
skip = 0;
numFields = totalFields = vectorsStream.readVInt();
} else {
reader.reset(vectorsStream, chunkDocs);
int sum = 0;
for (int i = docBase; i < doc; ++i) {
sum +=;
skip = sum;
numFields = (int);
sum += numFields;
for (int i = doc + 1; i < docBase + chunkDocs; ++i) {
sum +=;
totalFields = sum;
if (numFields == 0) {
// no vectors
return null;
// read field numbers that have term vectors
final int[] fieldNums;
final int token = vectorsStream.readByte() & 0xFF;
assert token != 0; // means no term vectors, cannot happen since we checked for numFields == 0
final int bitsPerFieldNum = token & 0x1F;
int totalDistinctFields = token >>> 5;
if (totalDistinctFields == 0x07) {
totalDistinctFields += vectorsStream.readVInt();
final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, totalDistinctFields, bitsPerFieldNum, 1);
fieldNums = new int[totalDistinctFields];
for (int i = 0; i < totalDistinctFields; ++i) {
fieldNums[i] = (int);
// read field numbers and flags
final int[] fieldNumOffs = new int[numFields];
final PackedInts.Reader flags;
final int bitsPerOff = PackedInts.bitsRequired(fieldNums.length - 1);
final PackedInts.Reader allFieldNumOffs = PackedInts.getReaderNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, totalFields, bitsPerOff);
switch (vectorsStream.readVInt()) {
case 0:
final PackedInts.Reader fieldFlags = PackedInts.getReaderNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, fieldNums.length, FLAGS_BITS);
PackedInts.Mutable f = PackedInts.getMutable(totalFields, FLAGS_BITS, PackedInts.COMPACT);
for (int i = 0; i < totalFields; ++i) {
final int fieldNumOff = (int) allFieldNumOffs.get(i);
assert fieldNumOff >= 0 && fieldNumOff < fieldNums.length;
final int fgs = (int) fieldFlags.get(fieldNumOff);
f.set(i, fgs);
flags = f;
case 1:
flags = PackedInts.getReaderNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, totalFields, FLAGS_BITS);
throw new AssertionError();
for (int i = 0; i < numFields; ++i) {
fieldNumOffs[i] = (int) allFieldNumOffs.get(skip + i);
// number of terms per field for all fields
final PackedInts.Reader numTerms;
final int totalTerms;
final int bitsRequired = vectorsStream.readVInt();
numTerms = PackedInts.getReaderNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, totalFields, bitsRequired);
int sum = 0;
for (int i = 0; i < totalFields; ++i) {
sum += numTerms.get(i);
totalTerms = sum;
// term lengths
int docOff = 0, docLen = 0, totalLen;
final int[] fieldLengths = new int[numFields];
final int[][] prefixLengths = new int[numFields][];
final int[][] suffixLengths = new int[numFields][];
reader.reset(vectorsStream, totalTerms);
// skip
int toSkip = 0;
for (int i = 0; i < skip; ++i) {
toSkip += numTerms.get(i);
// read prefix lengths
for (int i = 0; i < numFields; ++i) {
final int termCount = (int) numTerms.get(skip + i);
final int[] fieldPrefixLengths = new int[termCount];
prefixLengths[i] = fieldPrefixLengths;
for (int j = 0; j < termCount; ) {
final LongsRef next = - j);
for (int k = 0; k < next.length; ++k) {
fieldPrefixLengths[j++] = (int) next.longs[next.offset + k];
reader.skip(totalTerms - reader.ord());
reader.reset(vectorsStream, totalTerms);
// skip
toSkip = 0;
for (int i = 0; i < skip; ++i) {
for (int j = 0; j < numTerms.get(i); ++j) {
docOff +=;
for (int i = 0; i < numFields; ++i) {
final int termCount = (int) numTerms.get(skip + i);
final int[] fieldSuffixLengths = new int[termCount];
suffixLengths[i] = fieldSuffixLengths;
for (int j = 0; j < termCount; ) {
final LongsRef next = - j);
for (int k = 0; k < next.length; ++k) {
fieldSuffixLengths[j++] = (int) next.longs[next.offset + k];
fieldLengths[i] = sum(suffixLengths[i]);
docLen += fieldLengths[i];
totalLen = docOff + docLen;
for (int i = skip + numFields; i < totalFields; ++i) {
for (int j = 0; j < numTerms.get(i); ++j) {
totalLen +=;
// term freqs
final int[] termFreqs = new int[totalTerms];
reader.reset(vectorsStream, totalTerms);
for (int i = 0; i < totalTerms; ) {
final LongsRef next = - i);
for (int k = 0; k < next.length; ++k) {
termFreqs[i++] = 1 + (int) next.longs[next.offset + k];
// total number of positions, offsets and payloads
int totalPositions = 0, totalOffsets = 0, totalPayloads = 0;
for (int i = 0, termIndex = 0; i < totalFields; ++i) {
final int f = (int) flags.get(i);
final int termCount = (int) numTerms.get(i);
for (int j = 0; j < termCount; ++j) {
final int freq = termFreqs[termIndex++];
if ((f & POSITIONS) != 0) {
totalPositions += freq;
if ((f & OFFSETS) != 0) {
totalOffsets += freq;
if ((f & PAYLOADS) != 0) {
totalPayloads += freq;
assert i != totalFields - 1 || termIndex == totalTerms : termIndex + " " + totalTerms;
final int[][] positionIndex = positionIndex(skip, numFields, numTerms, termFreqs);
final int[][] positions, startOffsets, lengths;
if (totalPositions > 0) {
positions = readPositions(skip, numFields, flags, numTerms, termFreqs, POSITIONS, totalPositions, positionIndex);
} else {
positions = new int[numFields][];
if (totalOffsets > 0) {
// average number of chars per term
final float[] charsPerTerm = new float[fieldNums.length];
for (int i = 0; i < charsPerTerm.length; ++i) {
charsPerTerm[i] = Float.intBitsToFloat(vectorsStream.readInt());
startOffsets = readPositions(skip, numFields, flags, numTerms, termFreqs, OFFSETS, totalOffsets, positionIndex);
lengths = readPositions(skip, numFields, flags, numTerms, termFreqs, OFFSETS, totalOffsets, positionIndex);
for (int i = 0; i < numFields; ++i) {
final int[] fStartOffsets = startOffsets[i];
final int[] fPositions = positions[i];
// patch offsets from positions
if (fStartOffsets != null && fPositions != null) {
final float fieldCharsPerTerm = charsPerTerm[fieldNumOffs[i]];
for (int j = 0; j < startOffsets[i].length; ++j) {
fStartOffsets[j] += (int) (fieldCharsPerTerm * fPositions[j]);
if (fStartOffsets != null) {
final int[] fPrefixLengths = prefixLengths[i];
final int[] fSuffixLengths = suffixLengths[i];
final int[] fLengths = lengths[i];
for (int j = 0, end = (int) numTerms.get(skip + i); j < end; ++j) {
// delta-decode start offsets and patch lengths using term lengths
final int termLength = fPrefixLengths[j] + fSuffixLengths[j];
lengths[i][positionIndex[i][j]] += termLength;
for (int k = positionIndex[i][j] + 1; k < positionIndex[i][j + 1]; ++k) {
fStartOffsets[k] += fStartOffsets[k - 1];
fLengths[k] += termLength;
} else {
startOffsets = lengths = new int[numFields][];
if (totalPositions > 0) {
// delta-decode positions
for (int i = 0; i < numFields; ++i) {
final int[] fPositions = positions[i];
final int[] fpositionIndex = positionIndex[i];
if (fPositions != null) {
for (int j = 0, end = (int) numTerms.get(skip + i); j < end; ++j) {
// delta-decode start offsets
for (int k = fpositionIndex[j] + 1; k < fpositionIndex[j + 1]; ++k) {
fPositions[k] += fPositions[k - 1];
// payload lengths
final int[][] payloadIndex = new int[numFields][];
int totalPayloadLength = 0;
int payloadOff = 0;
int payloadLen = 0;
if (totalPayloads > 0) {
reader.reset(vectorsStream, totalPayloads);
// skip
int termIndex = 0;
for (int i = 0; i < skip; ++i) {
final int f = (int) flags.get(i);
final int termCount = (int) numTerms.get(i);
if ((f & PAYLOADS) != 0) {
for (int j = 0; j < termCount; ++j) {
final int freq = termFreqs[termIndex + j];
for (int k = 0; k < freq; ++k) {
final int l = (int);
payloadOff += l;
termIndex += termCount;
totalPayloadLength = payloadOff;
// read doc payload lengths
for (int i = 0; i < numFields; ++i) {
final int f = (int) flags.get(skip + i);
final int termCount = (int) numTerms.get(skip + i);
if ((f & PAYLOADS) != 0) {
final int totalFreq = positionIndex[i][termCount];
payloadIndex[i] = new int[totalFreq + 1];
int posIdx = 0;
payloadIndex[i][posIdx] = payloadLen;
for (int j = 0; j < termCount; ++j) {
final int freq = termFreqs[termIndex + j];
for (int k = 0; k < freq; ++k) {
final int payloadLength = (int);
payloadLen += payloadLength;
payloadIndex[i][posIdx+1] = payloadLen;
assert posIdx == totalFreq;
termIndex += termCount;
totalPayloadLength += payloadLen;
for (int i = skip + numFields; i < totalFields; ++i) {
final int f = (int) flags.get(i);
final int termCount = (int) numTerms.get(i);
if ((f & PAYLOADS) != 0) {
for (int j = 0; j < termCount; ++j) {
final int freq = termFreqs[termIndex + j];
for (int k = 0; k < freq; ++k) {
totalPayloadLength +=;
termIndex += termCount;
assert termIndex == totalTerms : termIndex + " " + totalTerms;
// decompress data
final BytesRef suffixBytes = new BytesRef();
decompressor.decompress(vectorsStream, totalLen + totalPayloadLength, docOff + payloadOff, docLen + payloadLen, suffixBytes);
suffixBytes.length = docLen;
final BytesRef payloadBytes = new BytesRef(suffixBytes.bytes, suffixBytes.offset + docLen, payloadLen);
final int[] fieldFlags = new int[numFields];
for (int i = 0; i < numFields; ++i) {
fieldFlags[i] = (int) flags.get(skip + i);
final int[] fieldNumTerms = new int[numFields];
for (int i = 0; i < numFields; ++i) {
fieldNumTerms[i] = (int) numTerms.get(skip + i);
final int[][] fieldTermFreqs = new int[numFields][];
int termIdx = 0;
for (int i = 0; i < skip; ++i) {
termIdx += numTerms.get(i);
for (int i = 0; i < numFields; ++i) {
final int termCount = (int) numTerms.get(skip + i);
fieldTermFreqs[i] = new int[termCount];
for (int j = 0; j < termCount; ++j) {
fieldTermFreqs[i][j] = termFreqs[termIdx++];
assert sum(fieldLengths) == docLen : sum(fieldLengths) + " != " + docLen;
return new TVFields(fieldNums, fieldFlags, fieldNumOffs, fieldNumTerms, fieldLengths,
prefixLengths, suffixLengths, fieldTermFreqs,
positionIndex, positions, startOffsets, lengths,
payloadBytes, payloadIndex,
// field -> term index -> position index
private int[][] positionIndex(int skip, int numFields, PackedInts.Reader numTerms, int[] termFreqs) {
final int[][] positionIndex = new int[numFields][];
int termIndex = 0;
for (int i = 0; i < skip; ++i) {
final int termCount = (int) numTerms.get(i);
termIndex += termCount;
for (int i = 0; i < numFields; ++i) {
final int termCount = (int) numTerms.get(skip + i);
positionIndex[i] = new int[termCount + 1];
for (int j = 0; j < termCount; ++j) {
final int freq = termFreqs[termIndex+j];
positionIndex[i][j + 1] = positionIndex[i][j] + freq;
termIndex += termCount;
return positionIndex;
private int[][] readPositions(int skip, int numFields, PackedInts.Reader flags, PackedInts.Reader numTerms, int[] termFreqs, int flag, final int totalPositions, int[][] positionIndex) throws IOException {
final int[][] positions = new int[numFields][];
reader.reset(vectorsStream, totalPositions);
// skip
int toSkip = 0;
int termIndex = 0;
for (int i = 0; i < skip; ++i) {
final int f = (int) flags.get(i);
final int termCount = (int) numTerms.get(i);
if ((f & flag) != 0) {
for (int j = 0; j < termCount; ++j) {
final int freq = termFreqs[termIndex+j];
toSkip += freq;
termIndex += termCount;
// read doc positions
for (int i = 0; i < numFields; ++i) {
final int f = (int) flags.get(skip + i);
final int termCount = (int) numTerms.get(skip + i);
if ((f & flag) != 0) {
final int totalFreq = positionIndex[i][termCount];
final int[] fieldPositions = new int[totalFreq];
positions[i] = fieldPositions;
for (int j = 0; j < totalFreq; ) {
final LongsRef nextPositions = - j);
for (int k = 0; k < nextPositions.length; ++k) {
fieldPositions[j++] = (int) nextPositions.longs[nextPositions.offset + k];
termIndex += termCount;
reader.skip(totalPositions - reader.ord());
return positions;
private class TVFields extends Fields {
private final int[] fieldNums, fieldFlags, fieldNumOffs, numTerms, fieldLengths;
private final int[][] prefixLengths, suffixLengths, termFreqs, positionIndex, positions, startOffsets, lengths, payloadIndex;
private final BytesRef suffixBytes, payloadBytes;
public TVFields(int[] fieldNums, int[] fieldFlags, int[] fieldNumOffs, int[] numTerms, int[] fieldLengths,
int[][] prefixLengths, int[][] suffixLengths, int[][] termFreqs,
int[][] positionIndex, int[][] positions, int[][] startOffsets, int[][] lengths,
BytesRef payloadBytes, int[][] payloadIndex,
BytesRef suffixBytes) {
this.fieldNums = fieldNums;
this.fieldFlags = fieldFlags;
this.fieldNumOffs = fieldNumOffs;
this.numTerms = numTerms;
this.fieldLengths = fieldLengths;
this.prefixLengths = prefixLengths;
this.suffixLengths = suffixLengths;
this.termFreqs = termFreqs;
this.positionIndex = positionIndex;
this.positions = positions;
this.startOffsets = startOffsets;
this.lengths = lengths;
this.payloadBytes = payloadBytes;
this.payloadIndex = payloadIndex;
this.suffixBytes = suffixBytes;
public Iterator<String> iterator() {
return new Iterator<String>() {
int i = 0;
public boolean hasNext() {
return i < fieldNumOffs.length;
public String next() {
if (!hasNext()) {
throw new NoSuchElementException();
final int fieldNum = fieldNums[fieldNumOffs[i++]];
return fieldInfos.fieldInfo(fieldNum).name;
public void remove() {
throw new UnsupportedOperationException();
public Terms terms(String field) throws IOException {
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
if (fieldInfo == null) {
return null;
int idx = -1;
for (int i = 0; i < fieldNumOffs.length; ++i) {
if (fieldNums[fieldNumOffs[i]] == fieldInfo.number) {
idx = i;
if (idx == -1 || numTerms[idx] == 0) {
// no term
return null;
int fieldOff = 0, fieldLen = -1;
for (int i = 0; i < fieldNumOffs.length; ++i) {
if (i < idx) {
fieldOff += fieldLengths[i];
} else {
fieldLen = fieldLengths[i];
assert fieldLen >= 0;
return new TVTerms(numTerms[idx], fieldFlags[idx],
prefixLengths[idx], suffixLengths[idx], termFreqs[idx],
positionIndex[idx], positions[idx], startOffsets[idx], lengths[idx],
payloadIndex[idx], payloadBytes,
new BytesRef(suffixBytes.bytes, suffixBytes.offset + fieldOff, fieldLen));
public int size() {
return fieldNumOffs.length;
private static class TVTerms extends Terms {
private final int numTerms, flags;
private final long totalTermFreq;
private final int[] prefixLengths, suffixLengths, termFreqs, positionIndex, positions, startOffsets, lengths, payloadIndex;
private final BytesRef termBytes, payloadBytes;
TVTerms(int numTerms, int flags, int[] prefixLengths, int[] suffixLengths, int[] termFreqs,
int[] positionIndex, int[] positions, int[] startOffsets, int[] lengths,
int[] payloadIndex, BytesRef payloadBytes,
BytesRef termBytes) {
this.numTerms = numTerms;
this.flags = flags;
this.prefixLengths = prefixLengths;
this.suffixLengths = suffixLengths;
this.termFreqs = termFreqs;
this.positionIndex = positionIndex;
this.positions = positions;
this.startOffsets = startOffsets;
this.lengths = lengths;
this.payloadIndex = payloadIndex;
this.payloadBytes = payloadBytes;
this.termBytes = termBytes;
long ttf = 0;
for (int tf : termFreqs) {
ttf += tf;
this.totalTermFreq = ttf;
public TermsEnum iterator() throws IOException {
TVTermsEnum termsEnum = new TVTermsEnum();
termsEnum.reset(numTerms, flags, prefixLengths, suffixLengths, termFreqs, positionIndex, positions, startOffsets, lengths,
payloadIndex, payloadBytes,
new ByteArrayDataInput(termBytes.bytes, termBytes.offset, termBytes.length));
return termsEnum;
public long size() throws IOException {
return numTerms;
public long getSumTotalTermFreq() throws IOException {
return totalTermFreq;
public long getSumDocFreq() throws IOException {
return numTerms;
public int getDocCount() throws IOException {
return 1;
public boolean hasFreqs() {
return true;
public boolean hasOffsets() {
return (flags & OFFSETS) != 0;
public boolean hasPositions() {
return (flags & POSITIONS) != 0;
public boolean hasPayloads() {
return (flags & PAYLOADS) != 0;
private static class TVTermsEnum extends BaseTermsEnum {
private int numTerms, startPos, ord;
private int[] prefixLengths, suffixLengths, termFreqs, positionIndex, positions, startOffsets, lengths, payloadIndex;
private ByteArrayDataInput in;
private BytesRef payloads;
private final BytesRef term;
private TVTermsEnum() {
term = new BytesRef(16);
void reset(int numTerms, int flags, int[] prefixLengths, int[] suffixLengths, int[] termFreqs, int[] positionIndex, int[] positions, int[] startOffsets, int[] lengths,
int[] payloadIndex, BytesRef payloads, ByteArrayDataInput in) {
this.numTerms = numTerms;
this.prefixLengths = prefixLengths;
this.suffixLengths = suffixLengths;
this.termFreqs = termFreqs;
this.positionIndex = positionIndex;
this.positions = positions;
this.startOffsets = startOffsets;
this.lengths = lengths;
this.payloadIndex = payloadIndex;
this.payloads = payloads; = in;
startPos = in.getPosition();
void reset() {
term.length = 0;
ord = -1;
public BytesRef next() throws IOException {
if (ord == numTerms - 1) {
return null;
} else {
assert ord < numTerms;
// read term
term.offset = 0;
term.length = prefixLengths[ord] + suffixLengths[ord];
if (term.length > term.bytes.length) {
term.bytes = ArrayUtil.grow(term.bytes, term.length);
in.readBytes(term.bytes, prefixLengths[ord], suffixLengths[ord]);
return term;
public SeekStatus seekCeil(BytesRef text)
throws IOException {
if (ord < numTerms && ord >= 0) {
final int cmp = term().compareTo(text);
if (cmp == 0) {
return SeekStatus.FOUND;
} else if (cmp > 0) {
// linear scan
while (true) {
final BytesRef term = next();
if (term == null) {
return SeekStatus.END;
final int cmp = term.compareTo(text);
if (cmp > 0) {
return SeekStatus.NOT_FOUND;
} else if (cmp == 0) {
return SeekStatus.FOUND;
public void seekExact(long ord) throws IOException {
throw new UnsupportedOperationException();
public BytesRef term() throws IOException {
return term;
public long ord() throws IOException {
throw new UnsupportedOperationException();
public int docFreq() throws IOException {
return 1;
public long totalTermFreq() throws IOException {
return termFreqs[ord];
public final PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
final TVPostingsEnum docsEnum;
if (reuse != null && reuse instanceof TVPostingsEnum) {
docsEnum = (TVPostingsEnum) reuse;
} else {
docsEnum = new TVPostingsEnum();
docsEnum.reset(termFreqs[ord], positionIndex[ord], positions, startOffsets, lengths, payloads, payloadIndex);
return docsEnum;
public ImpactsEnum impacts(int flags) throws IOException {
final PostingsEnum delegate = postings(null, PostingsEnum.FREQS);
return new SlowImpactsEnum(delegate);
private static class TVPostingsEnum extends PostingsEnum {
private int doc = -1;
private int termFreq;
private int positionIndex;
private int[] positions;
private int[] startOffsets;
private int[] lengths;
private final BytesRef payload;
private int[] payloadIndex;
private int basePayloadOffset;
private int i;
TVPostingsEnum() {
payload = new BytesRef();
public void reset(int freq, int positionIndex, int[] positions,
int[] startOffsets, int[] lengths, BytesRef payloads,
int[] payloadIndex) {
this.termFreq = freq;
this.positionIndex = positionIndex;
this.positions = positions;
this.startOffsets = startOffsets;
this.lengths = lengths;
this.basePayloadOffset = payloads.offset;
this.payload.bytes = payloads.bytes;
payload.offset = payload.length = 0;
this.payloadIndex = payloadIndex;
doc = i = -1;
private void checkDoc() {
if (doc == NO_MORE_DOCS) {
throw new IllegalStateException("DocsEnum exhausted");
} else if (doc == -1) {
throw new IllegalStateException("DocsEnum not started");
private void checkPosition() {
if (i < 0) {
throw new IllegalStateException("Position enum not started");
} else if (i >= termFreq) {
throw new IllegalStateException("Read past last position");
public int nextPosition() throws IOException {
if (doc != 0) {
throw new IllegalStateException();
} else if (i >= termFreq - 1) {
throw new IllegalStateException("Read past last position");
if (payloadIndex != null) {
payload.offset = basePayloadOffset + payloadIndex[positionIndex + i];
payload.length = payloadIndex[positionIndex + i + 1] - payloadIndex[positionIndex + i];
if (positions == null) {
return -1;
} else {
return positions[positionIndex + i];
public int startOffset() throws IOException {
if (startOffsets == null) {
return -1;
} else {
return startOffsets[positionIndex + i];
public int endOffset() throws IOException {
if (startOffsets == null) {
return -1;
} else {
return startOffsets[positionIndex + i] + lengths[positionIndex + i];
public BytesRef getPayload() throws IOException {
if (payloadIndex == null || payload.length == 0) {
return null;
} else {
return payload;
public int freq() throws IOException {
return termFreq;
public int docID() {
return doc;
public int nextDoc() throws IOException {
if (doc == -1) {
return (doc = 0);
} else {
return (doc = NO_MORE_DOCS);
public int advance(int target) throws IOException {
return slowAdvance(target);
public long cost() {
return 1;
private static int sum(int[] arr) {
int sum = 0;
for (int el : arr) {
sum += el;
return sum;
public long ramBytesUsed() {
return indexReader.ramBytesUsed();
public Collection<Accountable> getChildResources() {
return Collections.singleton(Accountables.namedAccountable("term vector index", indexReader));
public void checkIntegrity() throws IOException {
public String toString() {
return getClass().getSimpleName() + "(mode=" + compressionMode + ",chunksize=" + chunkSize + ")";