/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.backward_codecs.lucene50.compressing;

import java.io.Closeable;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.NoSuchElementException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.codecs.compressing.CompressionMode;
import org.apache.lucene.codecs.compressing.Decompressor;
import org.apache.lucene.index.BaseTermsEnum;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SlowImpactsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LongsRef;
import org.apache.lucene.util.packed.BlockPackedReaderIterator;
import org.apache.lucene.util.packed.PackedInts;

/**
 * {@link TermVectorsReader} for {@link Lucene50CompressingTermVectorsFormat}.
 *
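 * <p>A typical read path, sketched here with illustrative names (not APIs defined in this file):
 * {@link #get(int)} returns the {@link Fields} view of a single document, from which per-field
 * {@link Terms} and {@link TermsEnum} can be pulled.
 *
 * <pre class="prettyprint">
 * Fields vectors = reader.get(docID); // null if the document wrote no term vectors
 * Terms terms = vectors == null ? null : vectors.terms("body");
 * if (terms != null) {
 *   TermsEnum it = terms.iterator();
 *   for (BytesRef term = it.next(); term != null; term = it.next()) {
 *     PostingsEnum postings = it.postings(null, PostingsEnum.ALL);
 *   }
 * }
 * </pre>
 *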
 * @lucene.experimental
 */
public final class Lucene50CompressingTermVectorsReader extends TermVectorsReader
    implements Closeable {

  // hard limit on the maximum number of documents per chunk
  static final int MAX_DOCUMENTS_PER_CHUNK = 128;

  static final String VECTORS_EXTENSION = "tvd";
  static final String VECTORS_INDEX_EXTENSION = "tvx";
  static final String VECTORS_META_EXTENSION = "tvm";
  static final String VECTORS_INDEX_CODEC_NAME = "Lucene85TermVectorsIndex";

  static final int VERSION_START = 1;
  static final int VERSION_OFFHEAP_INDEX = 2;
  /** Version where all metadata were moved to the meta file. */
  static final int VERSION_META = 3;

  static final int VERSION_CURRENT = VERSION_META;
  static final int META_VERSION_START = 0;

  static final int PACKED_BLOCK_SIZE = 64;

  static final int POSITIONS = 0x01;
  static final int OFFSETS = 0x02;
  static final int PAYLOADS = 0x04;
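  // A field's flags value is the OR of the three bits above: e.g. positions and payloads without
  // offsets is 0x01 | 0x04 = 0x05. The largest possible value is 0x07, so bitsRequired(0x07) = 3
  // packed bits are enough for any flags value.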
  static final int FLAGS_BITS = PackedInts.bitsRequired(POSITIONS | OFFSETS | PAYLOADS);

  private final FieldInfos fieldInfos;
  final FieldsIndex indexReader;
  final IndexInput vectorsStream;
  private final int version;
  private final int packedIntsVersion;
  private final CompressionMode compressionMode;
  private final Decompressor decompressor;
  private final int chunkSize;
  private final int numDocs;
  private boolean closed;
  private final BlockPackedReaderIterator reader;
  private final long numDirtyChunks; // number of incomplete compressed blocks written
  private final long numDirtyDocs; // cumulative number of missing docs in incomplete chunks
  private final long maxPointer; // end of the data section

  // used by clone
  private Lucene50CompressingTermVectorsReader(Lucene50CompressingTermVectorsReader reader) {
    this.fieldInfos = reader.fieldInfos;
    this.vectorsStream = reader.vectorsStream.clone();
    this.indexReader = reader.indexReader.clone();
    this.packedIntsVersion = reader.packedIntsVersion;
    this.compressionMode = reader.compressionMode;
    this.decompressor = reader.decompressor.clone();
    this.chunkSize = reader.chunkSize;
    this.numDocs = reader.numDocs;
    this.reader =
        new BlockPackedReaderIterator(vectorsStream, packedIntsVersion, PACKED_BLOCK_SIZE, 0);
    this.version = reader.version;
    this.numDirtyChunks = reader.numDirtyChunks;
    this.numDirtyDocs = reader.numDirtyDocs;
    this.maxPointer = reader.maxPointer;
    this.closed = false;
  }

  /** Sole constructor. */
  public Lucene50CompressingTermVectorsReader(
      Directory d,
      SegmentInfo si,
      String segmentSuffix,
      FieldInfos fn,
      IOContext context,
      String formatName,
      CompressionMode compressionMode)
      throws IOException {
    this.compressionMode = compressionMode;
    final String segment = si.name;
    boolean success = false;
    fieldInfos = fn;
    numDocs = si.maxDoc();

    ChecksumIndexInput metaIn = null;
    try {
      // Open the data file
      final String vectorsStreamFN =
          IndexFileNames.segmentFileName(segment, segmentSuffix, VECTORS_EXTENSION);
      vectorsStream = d.openInput(vectorsStreamFN, context);
      version =
          CodecUtil.checkIndexHeader(
              vectorsStream, formatName, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
      assert CodecUtil.indexHeaderLength(formatName, segmentSuffix)
          == vectorsStream.getFilePointer();

      if (version >= VERSION_OFFHEAP_INDEX) {
        final String metaStreamFN =
            IndexFileNames.segmentFileName(segment, segmentSuffix, VECTORS_META_EXTENSION);
        metaIn = d.openChecksumInput(metaStreamFN, IOContext.READONCE);
        CodecUtil.checkIndexHeader(
            metaIn,
            VECTORS_INDEX_CODEC_NAME + "Meta",
            META_VERSION_START,
            version,
            si.getId(),
            segmentSuffix);
      }

      if (version >= VERSION_META) {
        packedIntsVersion = metaIn.readVInt();
        chunkSize = metaIn.readVInt();
      } else {
        packedIntsVersion = vectorsStream.readVInt();
        chunkSize = vectorsStream.readVInt();
      }

      // NOTE: data file is too costly to verify checksum against all the bytes on open,
      // but for now we at least verify proper structure of the checksum footer: which looks
      // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
      // such as file truncation.
      CodecUtil.retrieveChecksum(vectorsStream);

      FieldsIndex indexReader = null;
      long maxPointer = -1;

      if (version < VERSION_OFFHEAP_INDEX) {
        // Load the index into memory
        final String indexName = IndexFileNames.segmentFileName(segment, segmentSuffix, "tvx");
        try (ChecksumIndexInput indexStream = d.openChecksumInput(indexName, context)) {
          Throwable priorE = null;
          try {
            assert formatName.endsWith("Data");
            final String codecNameIdx =
                formatName.substring(0, formatName.length() - "Data".length()) + "Index";
            final int version2 =
                CodecUtil.checkIndexHeader(
                    indexStream,
                    codecNameIdx,
                    VERSION_START,
                    VERSION_CURRENT,
                    si.getId(),
                    segmentSuffix);
            if (version != version2) {
              throw new CorruptIndexException(
                  "Version mismatch between term vectors index and data: "
                      + version
                      + " != "
                      + version2,
                  indexStream);
            }
            assert CodecUtil.indexHeaderLength(codecNameIdx, segmentSuffix)
                == indexStream.getFilePointer();
            indexReader = new LegacyFieldsIndexReader(indexStream, si);
            maxPointer = indexStream.readVLong(); // the end of the data section
          } catch (Throwable exception) {
            priorE = exception;
          } finally {
            CodecUtil.checkFooter(indexStream, priorE);
          }
        }
      } else {
        FieldsIndexReader fieldsIndexReader =
            new FieldsIndexReader(
                d,
                si.name,
                segmentSuffix,
                VECTORS_INDEX_EXTENSION,
                VECTORS_INDEX_CODEC_NAME,
                si.getId(),
                metaIn);
        indexReader = fieldsIndexReader;
        maxPointer = fieldsIndexReader.getMaxPointer();
      }

      this.indexReader = indexReader;
      this.maxPointer = maxPointer;

      if (version >= VERSION_META) {
        numDirtyChunks = metaIn.readVLong();
        numDirtyDocs = metaIn.readVLong();
      } else {
        // Old versions of this format did not record numDirtyDocs. Since bulk
        // merges are disabled on version increments anyway, we make no effort
        // to get valid values of numDirtyChunks and numDirtyDocs.
        numDirtyChunks = numDirtyDocs = -1;
      }

      decompressor = compressionMode.newDecompressor();
      this.reader =
          new BlockPackedReaderIterator(vectorsStream, packedIntsVersion, PACKED_BLOCK_SIZE, 0);

      if (metaIn != null) {
        CodecUtil.checkFooter(metaIn, null);
        metaIn.close();
      }

      success = true;
    } catch (Throwable t) {
      if (metaIn != null) {
        CodecUtil.checkFooter(metaIn, t);
        throw new AssertionError("unreachable");
      } else {
        throw t;
      }
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(this, metaIn);
      }
    }
  }

  CompressionMode getCompressionMode() {
    return compressionMode;
  }

  int getChunkSize() {
    return chunkSize;
  }

  int getPackedIntsVersion() {
    return packedIntsVersion;
  }

  int getVersion() {
    return version;
  }

  FieldsIndex getIndexReader() {
    return indexReader;
  }

  IndexInput getVectorsStream() {
    return vectorsStream;
  }

  long getMaxPointer() {
    return maxPointer;
  }

  long getNumDirtyDocs() {
    if (version != VERSION_CURRENT) {
      throw new IllegalStateException(
          "getNumDirtyDocs should only ever get called when the reader is on the current version");
    }
    assert numDirtyDocs >= 0;
    return numDirtyDocs;
  }

  long getNumDirtyChunks() {
    if (version != VERSION_CURRENT) {
      throw new IllegalStateException(
          "getNumDirtyChunks should only ever get called when the reader is on the current version");
    }
    assert numDirtyChunks >= 0;
    return numDirtyChunks;
  }

  int getNumDocs() {
    return numDocs;
  }

  /** @throws AlreadyClosedException if this TermVectorsReader is closed */
  private void ensureOpen() throws AlreadyClosedException {
    if (closed) {
      throw new AlreadyClosedException("this TermVectorsReader is closed");
    }
  }

  @Override
  public void close() throws IOException {
    if (!closed) {
      IOUtils.close(indexReader, vectorsStream);
      closed = true;
    }
  }

  @Override
  public TermVectorsReader clone() {
    return new Lucene50CompressingTermVectorsReader(this);
  }

  @Override
  public Fields get(int doc) throws IOException {
    ensureOpen();

    // seek to the right place
    {
      final long startPointer = indexReader.getStartPointer(doc);
      vectorsStream.seek(startPointer);
    }

    // decode
    // - docBase: first doc ID of the chunk
    // - chunkDocs: number of docs of the chunk
    final int docBase = vectorsStream.readVInt();
    final int chunkDocs = vectorsStream.readVInt();
    if (doc < docBase || doc >= docBase + chunkDocs || docBase + chunkDocs > numDocs) {
      throw new CorruptIndexException(
          "docBase=" + docBase + ",chunkDocs=" + chunkDocs + ",doc=" + doc, vectorsStream);
    }

    final int skip; // number of fields to skip
    final int numFields; // number of fields of the document we're looking for
    final int totalFields; // total number of fields of the chunk (sum for all docs)
    if (chunkDocs == 1) {
      skip = 0;
      numFields = totalFields = vectorsStream.readVInt();
    } else {
      reader.reset(vectorsStream, chunkDocs);
      int sum = 0;
      for (int i = docBase; i < doc; ++i) {
        sum += reader.next();
      }
      skip = sum;
      numFields = (int) reader.next();
      sum += numFields;
      for (int i = doc + 1; i < docBase + chunkDocs; ++i) {
        sum += reader.next();
      }
      totalFields = sum;
    }
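
    // For example (values hypothetical): in a chunk holding docs 40..43, looking up doc 42 with
    // per-doc field counts [2, 0, 3, 1] yields skip = 2 + 0 = 2 fields to skip, numFields = 3
    // for the requested doc, and totalFields = 6 for the whole chunk.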

    if (numFields == 0) {
      // no vectors
      return null;
    }

    // read field numbers that have term vectors
    final int[] fieldNums;
    {
      final int token = vectorsStream.readByte() & 0xFF;
      assert token != 0; // means no term vectors, cannot happen since we checked for numFields == 0
      final int bitsPerFieldNum = token & 0x1F;
      int totalDistinctFields = token >>> 5;
      if (totalDistinctFields == 0x07) {
        totalDistinctFields += vectorsStream.readVInt();
      }
      ++totalDistinctFields;
      final PackedInts.ReaderIterator it =
          PackedInts.getReaderIteratorNoHeader(
              vectorsStream,
              PackedInts.Format.PACKED,
              packedIntsVersion,
              totalDistinctFields,
              bitsPerFieldNum,
              1);
      fieldNums = new int[totalDistinctFields];
      for (int i = 0; i < totalDistinctFields; ++i) {
        fieldNums[i] = (int) it.next();
      }
    }
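
    // The token byte above packs two values: the low 5 bits are the packed-ints width used for
    // field numbers, and the high 3 bits are (totalDistinctFields - 1), with 0x07 acting as an
    // escape that pulls the remainder from a following vint. E.g. (hypothetical) token
    // 0b001_00101 = 0x25 means 2 distinct fields, each field number stored on 5 bits.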

    // read field numbers and flags
    final int[] fieldNumOffs = new int[numFields];
    final PackedInts.Reader flags;
    {
      final int bitsPerOff = PackedInts.bitsRequired(fieldNums.length - 1);
      final PackedInts.Reader allFieldNumOffs =
          PackedInts.getReaderNoHeader(
              vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, totalFields, bitsPerOff);
      switch (vectorsStream.readVInt()) {
        case 0:
          final PackedInts.Reader fieldFlags =
              PackedInts.getReaderNoHeader(
                  vectorsStream,
                  PackedInts.Format.PACKED,
                  packedIntsVersion,
                  fieldNums.length,
                  FLAGS_BITS);
          PackedInts.Mutable f = PackedInts.getMutable(totalFields, FLAGS_BITS, PackedInts.COMPACT);
          for (int i = 0; i < totalFields; ++i) {
            final int fieldNumOff = (int) allFieldNumOffs.get(i);
            assert fieldNumOff >= 0 && fieldNumOff < fieldNums.length;
            final int fgs = (int) fieldFlags.get(fieldNumOff);
            f.set(i, fgs);
          }
          flags = f;
          break;
        case 1:
          flags =
              PackedInts.getReaderNoHeader(
                  vectorsStream,
                  PackedInts.Format.PACKED,
                  packedIntsVersion,
                  totalFields,
                  FLAGS_BITS);
          break;
        default:
          throw new AssertionError();
      }
      for (int i = 0; i < numFields; ++i) {
        fieldNumOffs[i] = (int) allFieldNumOffs.get(skip + i);
      }
    }
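
    // Two flag layouts are possible: case 0 stores one flags value per distinct field number and
    // expands it to every field occurrence (all docs in the chunk index each field with the same
    // options), while case 1 stores one value per field occurrence (options differ across docs).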

    // number of terms per field for all fields
    final PackedInts.Reader numTerms;
    final int totalTerms;
    {
      final int bitsRequired = vectorsStream.readVInt();
      numTerms =
          PackedInts.getReaderNoHeader(
              vectorsStream,
              PackedInts.Format.PACKED,
              packedIntsVersion,
              totalFields,
              bitsRequired);
      int sum = 0;
      for (int i = 0; i < totalFields; ++i) {
        sum += numTerms.get(i);
      }
      totalTerms = sum;
    }

    // term lengths
    int docOff = 0, docLen = 0, totalLen;
    final int[] fieldLengths = new int[numFields];
    final int[][] prefixLengths = new int[numFields][];
    final int[][] suffixLengths = new int[numFields][];
    {
      reader.reset(vectorsStream, totalTerms);
      // skip
      int toSkip = 0;
      for (int i = 0; i < skip; ++i) {
        toSkip += numTerms.get(i);
      }
      reader.skip(toSkip);
      // read prefix lengths
      for (int i = 0; i < numFields; ++i) {
        final int termCount = (int) numTerms.get(skip + i);
        final int[] fieldPrefixLengths = new int[termCount];
        prefixLengths[i] = fieldPrefixLengths;
        for (int j = 0; j < termCount; ) {
          final LongsRef next = reader.next(termCount - j);
          for (int k = 0; k < next.length; ++k) {
            fieldPrefixLengths[j++] = (int) next.longs[next.offset + k];
          }
        }
      }
      reader.skip(totalTerms - reader.ord());

      reader.reset(vectorsStream, totalTerms);
      // skip
      toSkip = 0;
      for (int i = 0; i < skip; ++i) {
        for (int j = 0; j < numTerms.get(i); ++j) {
          docOff += reader.next();
        }
      }
      for (int i = 0; i < numFields; ++i) {
        final int termCount = (int) numTerms.get(skip + i);
        final int[] fieldSuffixLengths = new int[termCount];
        suffixLengths[i] = fieldSuffixLengths;
        for (int j = 0; j < termCount; ) {
          final LongsRef next = reader.next(termCount - j);
          for (int k = 0; k < next.length; ++k) {
            fieldSuffixLengths[j++] = (int) next.longs[next.offset + k];
          }
        }
        fieldLengths[i] = sum(suffixLengths[i]);
        docLen += fieldLengths[i];
      }
      totalLen = docOff + docLen;
      for (int i = skip + numFields; i < totalFields; ++i) {
        for (int j = 0; j < numTerms.get(i); ++j) {
          totalLen += reader.next();
        }
      }
    }
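
    // Terms are shared-prefix encoded within each field: e.g. (hypothetical) the sorted terms
    // "bar", "bard", "bat" are stored as prefixLengths [0, 3, 2] and suffixLengths [3, 1, 1];
    // "bar" is written in full, "bard" reuses 3 leading bytes and appends "d", and "bat" reuses
    // "ba" and appends "t".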

    // term freqs
    final int[] termFreqs = new int[totalTerms];
    {
      reader.reset(vectorsStream, totalTerms);
      for (int i = 0; i < totalTerms; ) {
        final LongsRef next = reader.next(totalTerms - i);
        for (int k = 0; k < next.length; ++k) {
          termFreqs[i++] = 1 + (int) next.longs[next.offset + k];
        }
      }
    }
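
    // Note the "1 +" above: frequencies are written as freq - 1 since every term of a vector
    // occurs at least once, so a stored block of [0, 2, 0] decodes to freqs [1, 3, 1].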

    // total number of positions, offsets and payloads
    int totalPositions = 0, totalOffsets = 0, totalPayloads = 0;
    for (int i = 0, termIndex = 0; i < totalFields; ++i) {
      final int f = (int) flags.get(i);
      final int termCount = (int) numTerms.get(i);
      for (int j = 0; j < termCount; ++j) {
        final int freq = termFreqs[termIndex++];
        if ((f & POSITIONS) != 0) {
          totalPositions += freq;
        }
        if ((f & OFFSETS) != 0) {
          totalOffsets += freq;
        }
        if ((f & PAYLOADS) != 0) {
          totalPayloads += freq;
        }
      }
      assert i != totalFields - 1 || termIndex == totalTerms : termIndex + " " + totalTerms;
    }

    final int[][] positionIndex = positionIndex(skip, numFields, numTerms, termFreqs);
    final int[][] positions, startOffsets, lengths;
    if (totalPositions > 0) {
      positions =
          readPositions(
              skip,
              numFields,
              flags,
              numTerms,
              termFreqs,
              POSITIONS,
              totalPositions,
              positionIndex);
    } else {
      positions = new int[numFields][];
    }

    if (totalOffsets > 0) {
      // average number of chars per term
      final float[] charsPerTerm = new float[fieldNums.length];
      for (int i = 0; i < charsPerTerm.length; ++i) {
        charsPerTerm[i] = Float.intBitsToFloat(vectorsStream.readInt());
      }
      startOffsets =
          readPositions(
              skip, numFields, flags, numTerms, termFreqs, OFFSETS, totalOffsets, positionIndex);
      lengths =
          readPositions(
              skip, numFields, flags, numTerms, termFreqs, OFFSETS, totalOffsets, positionIndex);

      for (int i = 0; i < numFields; ++i) {
        final int[] fStartOffsets = startOffsets[i];
        final int[] fPositions = positions[i];
        // patch offsets from positions
        if (fStartOffsets != null && fPositions != null) {
          final float fieldCharsPerTerm = charsPerTerm[fieldNumOffs[i]];
          for (int j = 0; j < startOffsets[i].length; ++j) {
            fStartOffsets[j] += (int) (fieldCharsPerTerm * fPositions[j]);
          }
        }
        if (fStartOffsets != null) {
          final int[] fPrefixLengths = prefixLengths[i];
          final int[] fSuffixLengths = suffixLengths[i];
          final int[] fLengths = lengths[i];
          for (int j = 0, end = (int) numTerms.get(skip + i); j < end; ++j) {
            // delta-decode start offsets and patch lengths using term lengths
            final int termLength = fPrefixLengths[j] + fSuffixLengths[j];
            lengths[i][positionIndex[i][j]] += termLength;
            for (int k = positionIndex[i][j] + 1; k < positionIndex[i][j + 1]; ++k) {
              fStartOffsets[k] += fStartOffsets[k - 1];
              fLengths[k] += termLength;
            }
          }
        }
      }
    } else {
      startOffsets = lengths = new int[numFields][];
    }
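
    // The writer stored each start offset relative to an expected value (the occurrence's
    // position times the field's average chars per term) and each length minus the term's
    // length (prefix + suffix); the loops above add those components back to recover absolute
    // start offsets and the lengths used for end offsets.
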
    if (totalPositions > 0) {
      // delta-decode positions
      for (int i = 0; i < numFields; ++i) {
        final int[] fPositions = positions[i];
        final int[] fpositionIndex = positionIndex[i];
        if (fPositions != null) {
          for (int j = 0, end = (int) numTerms.get(skip + i); j < end; ++j) {
            // delta-decode the positions of each term occurrence
            for (int k = fpositionIndex[j] + 1; k < fpositionIndex[j + 1]; ++k) {
              fPositions[k] += fPositions[k - 1];
            }
          }
        }
      }
    }

    // payload lengths
    final int[][] payloadIndex = new int[numFields][];
    int totalPayloadLength = 0;
    int payloadOff = 0;
    int payloadLen = 0;
    if (totalPayloads > 0) {
      reader.reset(vectorsStream, totalPayloads);
      // skip
      int termIndex = 0;
      for (int i = 0; i < skip; ++i) {
        final int f = (int) flags.get(i);
        final int termCount = (int) numTerms.get(i);
        if ((f & PAYLOADS) != 0) {
          for (int j = 0; j < termCount; ++j) {
            final int freq = termFreqs[termIndex + j];
            for (int k = 0; k < freq; ++k) {
              final int l = (int) reader.next();
              payloadOff += l;
            }
          }
        }
        termIndex += termCount;
      }
      totalPayloadLength = payloadOff;
      // read doc payload lengths
      for (int i = 0; i < numFields; ++i) {
        final int f = (int) flags.get(skip + i);
        final int termCount = (int) numTerms.get(skip + i);
        if ((f & PAYLOADS) != 0) {
          final int totalFreq = positionIndex[i][termCount];
          payloadIndex[i] = new int[totalFreq + 1];
          int posIdx = 0;
          payloadIndex[i][posIdx] = payloadLen;
          for (int j = 0; j < termCount; ++j) {
            final int freq = termFreqs[termIndex + j];
            for (int k = 0; k < freq; ++k) {
              final int payloadLength = (int) reader.next();
              payloadLen += payloadLength;
              payloadIndex[i][posIdx + 1] = payloadLen;
              ++posIdx;
            }
          }
          assert posIdx == totalFreq;
        }
        termIndex += termCount;
      }
      totalPayloadLength += payloadLen;
      for (int i = skip + numFields; i < totalFields; ++i) {
        final int f = (int) flags.get(i);
        final int termCount = (int) numTerms.get(i);
        if ((f & PAYLOADS) != 0) {
          for (int j = 0; j < termCount; ++j) {
            final int freq = termFreqs[termIndex + j];
            for (int k = 0; k < freq; ++k) {
              totalPayloadLength += reader.next();
            }
          }
        }
        termIndex += termCount;
      }
      assert termIndex == totalTerms : termIndex + " " + totalTerms;
    }

    // decompress data
    final BytesRef suffixBytes = new BytesRef();
    decompressor.decompress(
        vectorsStream,
        totalLen + totalPayloadLength,
        docOff + payloadOff,
        docLen + payloadLen,
        suffixBytes);
    suffixBytes.length = docLen;
    final BytesRef payloadBytes =
        new BytesRef(suffixBytes.bytes, suffixBytes.offset + docLen, payloadLen);

    final int[] fieldFlags = new int[numFields];
    for (int i = 0; i < numFields; ++i) {
      fieldFlags[i] = (int) flags.get(skip + i);
    }

    final int[] fieldNumTerms = new int[numFields];
    for (int i = 0; i < numFields; ++i) {
      fieldNumTerms[i] = (int) numTerms.get(skip + i);
    }

    final int[][] fieldTermFreqs = new int[numFields][];
    {
      int termIdx = 0;
      for (int i = 0; i < skip; ++i) {
        termIdx += numTerms.get(i);
      }
      for (int i = 0; i < numFields; ++i) {
        final int termCount = (int) numTerms.get(skip + i);
        fieldTermFreqs[i] = new int[termCount];
        for (int j = 0; j < termCount; ++j) {
          fieldTermFreqs[i][j] = termFreqs[termIdx++];
        }
      }
    }

    assert sum(fieldLengths) == docLen : sum(fieldLengths) + " != " + docLen;

    return new TVFields(
        fieldNums,
        fieldFlags,
        fieldNumOffs,
        fieldNumTerms,
        fieldLengths,
        prefixLengths,
        suffixLengths,
        fieldTermFreqs,
        positionIndex,
        positions,
        startOffsets,
        lengths,
        payloadBytes,
        payloadIndex,
        suffixBytes);
  }

  // field -> term index -> position index
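  // e.g. (hypothetical) a field with termFreqs [3, 1, 2] gets positionIndex[i] = [0, 3, 4, 6]:
  // entry j is the index of term j's first position, entry termCount the field's total freq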
  private int[][] positionIndex(
      int skip, int numFields, PackedInts.Reader numTerms, int[] termFreqs) {
    final int[][] positionIndex = new int[numFields][];
    int termIndex = 0;
    for (int i = 0; i < skip; ++i) {
      final int termCount = (int) numTerms.get(i);
      termIndex += termCount;
    }
    for (int i = 0; i < numFields; ++i) {
      final int termCount = (int) numTerms.get(skip + i);
      positionIndex[i] = new int[termCount + 1];
      for (int j = 0; j < termCount; ++j) {
        final int freq = termFreqs[termIndex + j];
        positionIndex[i][j + 1] = positionIndex[i][j] + freq;
      }
      termIndex += termCount;
    }
    return positionIndex;
  }

  private int[][] readPositions(
      int skip,
      int numFields,
      PackedInts.Reader flags,
      PackedInts.Reader numTerms,
      int[] termFreqs,
      int flag,
      final int totalPositions,
      int[][] positionIndex)
      throws IOException {
    final int[][] positions = new int[numFields][];
    reader.reset(vectorsStream, totalPositions);
    // skip
    int toSkip = 0;
    int termIndex = 0;
    for (int i = 0; i < skip; ++i) {
      final int f = (int) flags.get(i);
      final int termCount = (int) numTerms.get(i);
      if ((f & flag) != 0) {
        for (int j = 0; j < termCount; ++j) {
          final int freq = termFreqs[termIndex + j];
          toSkip += freq;
        }
      }
      termIndex += termCount;
    }
    reader.skip(toSkip);
    // read doc positions
    for (int i = 0; i < numFields; ++i) {
      final int f = (int) flags.get(skip + i);
      final int termCount = (int) numTerms.get(skip + i);
      if ((f & flag) != 0) {
        final int totalFreq = positionIndex[i][termCount];
        final int[] fieldPositions = new int[totalFreq];
        positions[i] = fieldPositions;
        for (int j = 0; j < totalFreq; ) {
          final LongsRef nextPositions = reader.next(totalFreq - j);
          for (int k = 0; k < nextPositions.length; ++k) {
            fieldPositions[j++] = (int) nextPositions.longs[nextPositions.offset + k];
          }
        }
      }
      termIndex += termCount;
    }
    reader.skip(totalPositions - reader.ord());
    return positions;
  }

  private class TVFields extends Fields {

    private final int[] fieldNums, fieldFlags, fieldNumOffs, numTerms, fieldLengths;
    private final int[][] prefixLengths,
        suffixLengths,
        termFreqs,
        positionIndex,
        positions,
        startOffsets,
        lengths,
        payloadIndex;
    private final BytesRef suffixBytes, payloadBytes;

    public TVFields(
        int[] fieldNums,
        int[] fieldFlags,
        int[] fieldNumOffs,
        int[] numTerms,
        int[] fieldLengths,
        int[][] prefixLengths,
        int[][] suffixLengths,
        int[][] termFreqs,
        int[][] positionIndex,
        int[][] positions,
        int[][] startOffsets,
        int[][] lengths,
        BytesRef payloadBytes,
        int[][] payloadIndex,
        BytesRef suffixBytes) {
      this.fieldNums = fieldNums;
      this.fieldFlags = fieldFlags;
      this.fieldNumOffs = fieldNumOffs;
      this.numTerms = numTerms;
      this.fieldLengths = fieldLengths;
      this.prefixLengths = prefixLengths;
      this.suffixLengths = suffixLengths;
      this.termFreqs = termFreqs;
      this.positionIndex = positionIndex;
      this.positions = positions;
      this.startOffsets = startOffsets;
      this.lengths = lengths;
      this.payloadBytes = payloadBytes;
      this.payloadIndex = payloadIndex;
      this.suffixBytes = suffixBytes;
    }

    @Override
    public Iterator<String> iterator() {
      return new Iterator<String>() {
        int i = 0;

        @Override
        public boolean hasNext() {
          return i < fieldNumOffs.length;
        }

        @Override
        public String next() {
          if (!hasNext()) {
            throw new NoSuchElementException();
          }
          final int fieldNum = fieldNums[fieldNumOffs[i++]];
          return fieldInfos.fieldInfo(fieldNum).name;
        }

        @Override
        public void remove() {
          throw new UnsupportedOperationException();
        }
      };
    }

    @Override
    public Terms terms(String field) throws IOException {
      final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
      if (fieldInfo == null) {
        return null;
      }
      int idx = -1;
      for (int i = 0; i < fieldNumOffs.length; ++i) {
        if (fieldNums[fieldNumOffs[i]] == fieldInfo.number) {
          idx = i;
          break;
        }
      }

      if (idx == -1 || numTerms[idx] == 0) {
        // no term
        return null;
      }
      int fieldOff = 0, fieldLen = -1;
      for (int i = 0; i < fieldNumOffs.length; ++i) {
        if (i < idx) {
          fieldOff += fieldLengths[i];
        } else {
          fieldLen = fieldLengths[i];
          break;
        }
      }
      assert fieldLen >= 0;
      return new TVTerms(
          numTerms[idx],
          fieldFlags[idx],
          prefixLengths[idx],
          suffixLengths[idx],
          termFreqs[idx],
          positionIndex[idx],
          positions[idx],
          startOffsets[idx],
          lengths[idx],
          payloadIndex[idx],
          payloadBytes,
          new BytesRef(suffixBytes.bytes, suffixBytes.offset + fieldOff, fieldLen));
    }

    @Override
    public int size() {
      return fieldNumOffs.length;
    }
  }

  private static class TVTerms extends Terms {

    private final int numTerms, flags;
    private final long totalTermFreq;
    private final int[] prefixLengths,
        suffixLengths,
        termFreqs,
        positionIndex,
        positions,
        startOffsets,
        lengths,
        payloadIndex;
    private final BytesRef termBytes, payloadBytes;

    TVTerms(
        int numTerms,
        int flags,
        int[] prefixLengths,
        int[] suffixLengths,
        int[] termFreqs,
        int[] positionIndex,
        int[] positions,
        int[] startOffsets,
        int[] lengths,
        int[] payloadIndex,
        BytesRef payloadBytes,
        BytesRef termBytes) {
      this.numTerms = numTerms;
      this.flags = flags;
      this.prefixLengths = prefixLengths;
      this.suffixLengths = suffixLengths;
      this.termFreqs = termFreqs;
      this.positionIndex = positionIndex;
      this.positions = positions;
      this.startOffsets = startOffsets;
      this.lengths = lengths;
      this.payloadIndex = payloadIndex;
      this.payloadBytes = payloadBytes;
      this.termBytes = termBytes;
      long ttf = 0;
      for (int tf : termFreqs) {
        ttf += tf;
      }
      this.totalTermFreq = ttf;
    }

    @Override
    public TermsEnum iterator() throws IOException {
      TVTermsEnum termsEnum = new TVTermsEnum();
      termsEnum.reset(
          numTerms,
          flags,
          prefixLengths,
          suffixLengths,
          termFreqs,
          positionIndex,
          positions,
          startOffsets,
          lengths,
          payloadIndex,
          payloadBytes,
          new ByteArrayDataInput(termBytes.bytes, termBytes.offset, termBytes.length));
      return termsEnum;
    }

    @Override
    public long size() throws IOException {
      return numTerms;
    }

    @Override
    public long getSumTotalTermFreq() throws IOException {
      return totalTermFreq;
    }

    @Override
    public long getSumDocFreq() throws IOException {
      return numTerms;
    }

    @Override
    public int getDocCount() throws IOException {
      return 1;
    }

    @Override
    public boolean hasFreqs() {
      return true;
    }

    @Override
    public boolean hasOffsets() {
      return (flags & OFFSETS) != 0;
    }

    @Override
    public boolean hasPositions() {
      return (flags & POSITIONS) != 0;
    }

    @Override
    public boolean hasPayloads() {
      return (flags & PAYLOADS) != 0;
    }
  }

  private static class TVTermsEnum extends BaseTermsEnum {

    private int numTerms, startPos, ord;
    private int[] prefixLengths,
        suffixLengths,
        termFreqs,
        positionIndex,
        positions,
        startOffsets,
        lengths,
        payloadIndex;
    private ByteArrayDataInput in;
    private BytesRef payloads;
    private final BytesRef term;

    private TVTermsEnum() {
      term = new BytesRef(16);
    }

    void reset(
        int numTerms,
        int flags,
        int[] prefixLengths,
        int[] suffixLengths,
        int[] termFreqs,
        int[] positionIndex,
        int[] positions,
        int[] startOffsets,
        int[] lengths,
        int[] payloadIndex,
        BytesRef payloads,
        ByteArrayDataInput in) {
      this.numTerms = numTerms;
      this.prefixLengths = prefixLengths;
      this.suffixLengths = suffixLengths;
      this.termFreqs = termFreqs;
      this.positionIndex = positionIndex;
      this.positions = positions;
      this.startOffsets = startOffsets;
      this.lengths = lengths;
      this.payloadIndex = payloadIndex;
      this.payloads = payloads;
      this.in = in;
      startPos = in.getPosition();
      reset();
    }

    void reset() {
      term.length = 0;
      in.setPosition(startPos);
      ord = -1;
    }

    @Override
    public BytesRef next() throws IOException {
      if (ord == numTerms - 1) {
        return null;
      } else {
        assert ord < numTerms;
        ++ord;
      }

      // read term
      term.offset = 0;
      term.length = prefixLengths[ord] + suffixLengths[ord];
      if (term.length > term.bytes.length) {
        term.bytes = ArrayUtil.grow(term.bytes, term.length);
      }
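      // bytes 0..prefixLengths[ord] are shared with the previous term, so only the suffix is
      // read from the stream; e.g. after "bar", prefix 3 + suffix "d" reconstructs "bard"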
      in.readBytes(term.bytes, prefixLengths[ord], suffixLengths[ord]);

      return term;
    }

    @Override
    public SeekStatus seekCeil(BytesRef text) throws IOException {
      if (ord < numTerms && ord >= 0) {
        final int cmp = term().compareTo(text);
        if (cmp == 0) {
          return SeekStatus.FOUND;
        } else if (cmp > 0) {
          reset();
        }
      }
      // linear scan
      while (true) {
        final BytesRef term = next();
        if (term == null) {
          return SeekStatus.END;
        }
        final int cmp = term.compareTo(text);
        if (cmp > 0) {
          return SeekStatus.NOT_FOUND;
        } else if (cmp == 0) {
          return SeekStatus.FOUND;
        }
      }
    }

    @Override
    public void seekExact(long ord) throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public BytesRef term() throws IOException {
      return term;
    }

    @Override
    public long ord() throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public int docFreq() throws IOException {
      return 1;
    }

    @Override
    public long totalTermFreq() throws IOException {
      return termFreqs[ord];
    }

    @Override
    public final PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
      final TVPostingsEnum docsEnum;
      if (reuse instanceof TVPostingsEnum) {
        docsEnum = (TVPostingsEnum) reuse;
      } else {
        docsEnum = new TVPostingsEnum();
      }

      docsEnum.reset(
          termFreqs[ord],
          positionIndex[ord],
          positions,
          startOffsets,
          lengths,
          payloads,
          payloadIndex);
      return docsEnum;
    }

    @Override
    public ImpactsEnum impacts(int flags) throws IOException {
      final PostingsEnum delegate = postings(null, PostingsEnum.FREQS);
      return new SlowImpactsEnum(delegate);
    }
  }

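  // A term-vector postings list addresses exactly one (pseudo-)document, so this enum only ever
  // visits doc 0: nextDoc() goes -1 -> 0 -> NO_MORE_DOCS and cost() is 1.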
  private static class TVPostingsEnum extends PostingsEnum {

    private int doc = -1;
    private int termFreq;
    private int positionIndex;
    private int[] positions;
    private int[] startOffsets;
    private int[] lengths;
    private final BytesRef payload;
    private int[] payloadIndex;
    private int basePayloadOffset;
    private int i;

    TVPostingsEnum() {
      payload = new BytesRef();
    }

    public void reset(
        int freq,
        int positionIndex,
        int[] positions,
        int[] startOffsets,
        int[] lengths,
        BytesRef payloads,
        int[] payloadIndex) {
      this.termFreq = freq;
      this.positionIndex = positionIndex;
      this.positions = positions;
      this.startOffsets = startOffsets;
      this.lengths = lengths;
      this.basePayloadOffset = payloads.offset;
      this.payload.bytes = payloads.bytes;
      payload.offset = payload.length = 0;
      this.payloadIndex = payloadIndex;

      doc = i = -1;
    }

    private void checkDoc() {
      if (doc == NO_MORE_DOCS) {
        throw new IllegalStateException("DocsEnum exhausted");
      } else if (doc == -1) {
        throw new IllegalStateException("DocsEnum not started");
      }
    }

    private void checkPosition() {
      checkDoc();
      if (i < 0) {
        throw new IllegalStateException("Position enum not started");
      } else if (i >= termFreq) {
        throw new IllegalStateException("Read past last position");
      }
    }

    @Override
    public int nextPosition() throws IOException {
      if (doc != 0) {
        throw new IllegalStateException();
      } else if (i >= termFreq - 1) {
        throw new IllegalStateException("Read past last position");
      }

      ++i;

      if (payloadIndex != null) {
        payload.offset = basePayloadOffset + payloadIndex[positionIndex + i];
        payload.length = payloadIndex[positionIndex + i + 1] - payloadIndex[positionIndex + i];
      }

      if (positions == null) {
        return -1;
      } else {
        return positions[positionIndex + i];
      }
    }

    @Override
    public int startOffset() throws IOException {
      checkPosition();
      if (startOffsets == null) {
        return -1;
      } else {
        return startOffsets[positionIndex + i];
      }
    }

    @Override
    public int endOffset() throws IOException {
      checkPosition();
      if (startOffsets == null) {
        return -1;
      } else {
        return startOffsets[positionIndex + i] + lengths[positionIndex + i];
      }
    }

    @Override
    public BytesRef getPayload() throws IOException {
      checkPosition();
      if (payloadIndex == null || payload.length == 0) {
        return null;
      } else {
        return payload;
      }
    }

    @Override
    public int freq() throws IOException {
      checkDoc();
      return termFreq;
    }

    @Override
    public int docID() {
      return doc;
    }

    @Override
    public int nextDoc() throws IOException {
      if (doc == -1) {
        return (doc = 0);
      } else {
        return (doc = NO_MORE_DOCS);
      }
    }

    @Override
    public int advance(int target) throws IOException {
      return slowAdvance(target);
    }

    @Override
    public long cost() {
      return 1;
    }
  }

  private static int sum(int[] arr) {
    int sum = 0;
    for (int el : arr) {
      sum += el;
    }
    return sum;
  }

  @Override
  public long ramBytesUsed() {
    return indexReader.ramBytesUsed();
  }

  @Override
  public Collection<Accountable> getChildResources() {
    return Collections.singleton(Accountables.namedAccountable("term vector index", indexReader));
  }

  @Override
  public void checkIntegrity() throws IOException {
    indexReader.checkIntegrity();
    CodecUtil.checksumEntireFile(vectorsStream);
  }

  @Override
  public String toString() {
    return getClass().getSimpleName()
        + "(mode="
        + compressionMode
        + ",chunksize="
        + chunkSize
        + ")";
  }
}