LUCENE-9705: Create Lucene90TermVectorsFormat (#2334)
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50TermVectorsFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/Lucene50TermVectorsFormat.java
similarity index 96%
rename from lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50TermVectorsFormat.java
rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/Lucene50TermVectorsFormat.java
index 270767e..1771e7f 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50TermVectorsFormat.java
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/Lucene50TermVectorsFormat.java
@@ -14,11 +14,11 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.lucene.codecs.lucene50;
+package org.apache.lucene.backward_codecs.lucene50;
+import org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50CompressingTermVectorsFormat;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.TermVectorsFormat;
-import org.apache.lucene.codecs.compressing.CompressingTermVectorsFormat;
import org.apache.lucene.codecs.compressing.CompressionMode;
import org.apache.lucene.codecs.compressing.FieldsIndexWriter;
import org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat;
@@ -151,7 +151,7 @@
*
* @lucene.experimental
*/
-public final class Lucene50TermVectorsFormat extends CompressingTermVectorsFormat {
+public final class Lucene50TermVectorsFormat extends Lucene50CompressingTermVectorsFormat {
/** Sole constructor. */
public Lucene50TermVectorsFormat() {
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50TermVectorsFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/FieldsIndex.java
similarity index 61%
copy from lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50TermVectorsFormat.java
copy to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/FieldsIndex.java
index b46f4f7..59ec656 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50TermVectorsFormat.java
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/FieldsIndex.java
@@ -14,15 +14,20 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.lucene.codecs.lucene50;
+package org.apache.lucene.backward_codecs.lucene50.compressing;
-import org.apache.lucene.codecs.Codec;
-import org.apache.lucene.index.BaseTermVectorsFormatTestCase;
-import org.apache.lucene.util.TestUtil;
+import java.io.Closeable;
+import java.io.IOException;
+import org.apache.lucene.util.Accountable;
-public class TestLucene50TermVectorsFormat extends BaseTermVectorsFormatTestCase {
+abstract class FieldsIndex implements Accountable, Cloneable, Closeable {
+
+ /** Get the start pointer for the block that contains the given docID. */
+ abstract long getStartPointer(int docID);
+
+ /** Check the integrity of the index. */
+ abstract void checkIntegrity() throws IOException;
+
@Override
- protected Codec getCodec() {
- return TestUtil.getDefaultCodec();
- }
+ public abstract FieldsIndex clone();
}
diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/FieldsIndexReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/FieldsIndexReader.java
new file mode 100644
index 0000000..05ff4ca
--- /dev/null
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/FieldsIndexReader.java
@@ -0,0 +1,189 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.backward_codecs.lucene50.compressing;
+
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.util.Objects;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.RandomAccessInput;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.packed.DirectMonotonicReader;
+
+/**
+ * {@link FieldsIndex} that resolves chunk start pointers through two {@link
+ * DirectMonotonicReader}s reading from slices of the index file: {@code docs} is binary-searched
+ * to locate the entry for a document, and {@code startPointers} yields the pointer stored at
+ * that entry.
+ */
+final class FieldsIndexReader extends FieldsIndex {
+
+  static final int VERSION_START = 0;
+  static final int VERSION_CURRENT = 0;
+
+  private static final long BASE_RAM_BYTES_USED =
+      RamUsageEstimator.shallowSizeOfInstance(FieldsIndexReader.class);
+
+  private final int maxDoc;
+  private final int blockShift;
+  private final int numChunks;
+  private final DirectMonotonicReader.Meta docsMeta;
+  private final DirectMonotonicReader.Meta startPointersMeta;
+  private final IndexInput indexInput;
+  private final long docsStartPointer,
+      docsEndPointer,
+      startPointersStartPointer,
+      startPointersEndPointer;
+  private final DirectMonotonicReader docs, startPointers;
+  private final long maxPointer;
+
+  /**
+   * Reads the index metadata from {@code metaIn}, then opens the index file and validates its
+   * header.
+   *
+   * <p>Every step that can fail once the index file is open runs inside the guarded section, so
+   * the input is closed whenever this constructor does not complete, including when creating the
+   * slices or the monotonic readers fails.
+   *
+   * @param dir directory holding the index file
+   * @param name segment name used to build the index file name
+   * @param suffix segment suffix, used for both the file name and the header check
+   * @param extension extension of the index file
+   * @param codecName codec name; the header is checked against {@code codecName + "Idx"}
+   * @param id segment ID expected in the index header
+   * @param metaIn input to read this index's metadata from
+   * @throws IOException if reading the metadata, or opening or validating the index file, fails
+   */
+  FieldsIndexReader(
+      Directory dir,
+      String name,
+      String suffix,
+      String extension,
+      String codecName,
+      byte[] id,
+      IndexInput metaIn)
+      throws IOException {
+    maxDoc = metaIn.readInt();
+    blockShift = metaIn.readInt();
+    numChunks = metaIn.readInt();
+    docsStartPointer = metaIn.readLong();
+    docsMeta = DirectMonotonicReader.loadMeta(metaIn, numChunks, blockShift);
+    // A single value marks both the end of the docs section and the start of the next one.
+    docsEndPointer = startPointersStartPointer = metaIn.readLong();
+    startPointersMeta = DirectMonotonicReader.loadMeta(metaIn, numChunks, blockShift);
+    startPointersEndPointer = metaIn.readLong();
+    maxPointer = metaIn.readLong();
+
+    indexInput =
+        dir.openInput(IndexFileNames.segmentFileName(name, suffix, extension), IOContext.READ);
+    boolean success = false;
+    try {
+      CodecUtil.checkIndexHeader(
+          indexInput, codecName + "Idx", VERSION_START, VERSION_CURRENT, id, suffix);
+      CodecUtil.retrieveChecksum(indexInput);
+      final RandomAccessInput docsSlice =
+          indexInput.randomAccessSlice(docsStartPointer, docsEndPointer - docsStartPointer);
+      final RandomAccessInput startPointersSlice =
+          indexInput.randomAccessSlice(
+              startPointersStartPointer, startPointersEndPointer - startPointersStartPointer);
+      docs = DirectMonotonicReader.getInstance(docsMeta, docsSlice);
+      startPointers = DirectMonotonicReader.getInstance(startPointersMeta, startPointersSlice);
+      success = true;
+    } finally {
+      if (success == false) {
+        indexInput.close();
+      }
+    }
+  }
+
+  /** Copy constructor used by {@link #clone()}: reuses the metadata and clones the input. */
+  private FieldsIndexReader(FieldsIndexReader other) throws IOException {
+    maxDoc = other.maxDoc;
+    numChunks = other.numChunks;
+    blockShift = other.blockShift;
+    docsMeta = other.docsMeta;
+    startPointersMeta = other.startPointersMeta;
+    indexInput = other.indexInput.clone();
+    docsStartPointer = other.docsStartPointer;
+    docsEndPointer = other.docsEndPointer;
+    startPointersStartPointer = other.startPointersStartPointer;
+    startPointersEndPointer = other.startPointersEndPointer;
+    maxPointer = other.maxPointer;
+    final RandomAccessInput docsSlice =
+        indexInput.randomAccessSlice(docsStartPointer, docsEndPointer - docsStartPointer);
+    final RandomAccessInput startPointersSlice =
+        indexInput.randomAccessSlice(
+            startPointersStartPointer, startPointersEndPointer - startPointersStartPointer);
+    docs = DirectMonotonicReader.getInstance(docsMeta, docsSlice);
+    startPointers = DirectMonotonicReader.getInstance(startPointersMeta, startPointersSlice);
+  }
+
+  @Override
+  public long ramBytesUsed() {
+    return BASE_RAM_BYTES_USED
+        + docsMeta.ramBytesUsed()
+        + startPointersMeta.ramBytesUsed()
+        + docs.ramBytesUsed()
+        + startPointers.ramBytesUsed();
+  }
+
+  @Override
+  public void close() throws IOException {
+    indexInput.close();
+  }
+
+  /**
+   * Returns the start pointer stored for the entry that covers {@code docID}.
+   *
+   * @throws IndexOutOfBoundsException if {@code docID} is not in {@code [0, maxDoc)}
+   */
+  @Override
+  long getStartPointer(int docID) {
+    Objects.checkIndex(docID, maxDoc);
+    long blockIndex = docs.binarySearch(0, numChunks, docID);
+    if (blockIndex < 0) {
+      // No exact match. Assuming the usual -(insertionPoint) - 1 encoding of a miss (TODO
+      // confirm against DirectMonotonicReader), this selects the entry just before it.
+      blockIndex = -2 - blockIndex;
+    }
+    return startPointers.get(blockIndex);
+  }
+
+  @Override
+  public FieldsIndex clone() {
+    try {
+      return new FieldsIndexReader(this);
+    } catch (IOException e) {
+      // clone() declares no checked exception, so rethrow unchecked and keep the cause.
+      throw new UncheckedIOException(e);
+    }
+  }
+
+  /** Returns the {@code maxPointer} value read from the metadata. */
+  public long getMaxPointer() {
+    return maxPointer;
+  }
+
+  @Override
+  void checkIntegrity() throws IOException {
+    CodecUtil.checksumEntireFile(indexInput);
+  }
+}
diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/LegacyFieldsIndexReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/LegacyFieldsIndexReader.java
new file mode 100644
index 0000000..596ac82
--- /dev/null
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/LegacyFieldsIndexReader.java
@@ -0,0 +1,260 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.backward_codecs.lucene50.compressing;
+
+import static org.apache.lucene.util.BitUtil.zigZagDecode;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.Accountable;
+import org.apache.lucene.util.Accountables;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.packed.PackedInts;
+
+/**
+ * {@link FieldsIndex} whose blocks are all read from the index input at construction time.
+ *
+ * <p>Each block keeps a base doc ID, a base start pointer, the average number of docs and the
+ * average size of its chunks, plus packed, zig-zag encoded per-chunk deltas from those averages.
+ */
+final class LegacyFieldsIndexReader extends FieldsIndex {
+
+  private static final long BASE_RAM_BYTES_USED =
+      RamUsageEstimator.shallowSizeOfInstance(LegacyFieldsIndexReader.class);
+
+  final int maxDoc;
+  final int[] docBases;
+  final long[] startPointers;
+  final int[] avgChunkDocs;
+  final long[] avgChunkSizes;
+  final PackedInts.Reader[] docBasesDeltas; // delta from the avg
+  final PackedInts.Reader[] startPointersDeltas; // delta from the avg
+
+  /**
+   * Reads every block of the index from {@code fieldsIndexIn}.
+   *
+   * <p>It is the responsibility of the caller to close {@code fieldsIndexIn} after this
+   * constructor has been called.
+   *
+   * @param fieldsIndexIn input to read from; reading starts with the packed-ints version
+   * @param si segment info, used for its {@code maxDoc}
+   * @throws CorruptIndexException if a block declares more than 32 bits per doc base delta or
+   *     more than 64 bits per start pointer delta
+   * @throws IOException if reading from {@code fieldsIndexIn} fails
+   */
+  LegacyFieldsIndexReader(IndexInput fieldsIndexIn, SegmentInfo si) throws IOException {
+    maxDoc = si.maxDoc();
+    int[] docBases = new int[16];
+    long[] startPointers = new long[16];
+    int[] avgChunkDocs = new int[16];
+    long[] avgChunkSizes = new long[16];
+    PackedInts.Reader[] docBasesDeltas = new PackedInts.Reader[16];
+    PackedInts.Reader[] startPointersDeltas = new PackedInts.Reader[16];
+
+    final int packedIntsVersion = fieldsIndexIn.readVInt();
+
+    int blockCount = 0;
+
+    for (; ; ) {
+      final int numChunks = fieldsIndexIn.readVInt();
+      if (numChunks == 0) {
+        // a chunk count of zero terminates the list of blocks
+        break;
+      }
+      if (blockCount == docBases.length) {
+        // the six arrays are parallel, so they always grow together
+        final int newSize = ArrayUtil.oversize(blockCount + 1, 8);
+        docBases = ArrayUtil.growExact(docBases, newSize);
+        startPointers = ArrayUtil.growExact(startPointers, newSize);
+        avgChunkDocs = ArrayUtil.growExact(avgChunkDocs, newSize);
+        avgChunkSizes = ArrayUtil.growExact(avgChunkSizes, newSize);
+        docBasesDeltas = ArrayUtil.growExact(docBasesDeltas, newSize);
+        startPointersDeltas = ArrayUtil.growExact(startPointersDeltas, newSize);
+      }
+
+      // doc bases
+      docBases[blockCount] = fieldsIndexIn.readVInt();
+      avgChunkDocs[blockCount] = fieldsIndexIn.readVInt();
+      final int bitsPerDocBase = fieldsIndexIn.readVInt();
+      if (bitsPerDocBase > 32) {
+        throw new CorruptIndexException(
+            "Corrupted bitsPerDocBase: " + bitsPerDocBase, fieldsIndexIn);
+      }
+      docBasesDeltas[blockCount] =
+          PackedInts.getReaderNoHeader(
+              fieldsIndexIn,
+              PackedInts.Format.PACKED,
+              packedIntsVersion,
+              numChunks,
+              bitsPerDocBase);
+
+      // start pointers
+      startPointers[blockCount] = fieldsIndexIn.readVLong();
+      avgChunkSizes[blockCount] = fieldsIndexIn.readVLong();
+      final int bitsPerStartPointer = fieldsIndexIn.readVInt();
+      if (bitsPerStartPointer > 64) {
+        throw new CorruptIndexException(
+            "Corrupted bitsPerStartPointer: " + bitsPerStartPointer, fieldsIndexIn);
+      }
+      startPointersDeltas[blockCount] =
+          PackedInts.getReaderNoHeader(
+              fieldsIndexIn,
+              PackedInts.Format.PACKED,
+              packedIntsVersion,
+              numChunks,
+              bitsPerStartPointer);
+
+      ++blockCount;
+    }
+
+    // keep only the slots that were actually filled
+    this.docBases = ArrayUtil.copyOfSubArray(docBases, 0, blockCount);
+    this.startPointers = ArrayUtil.copyOfSubArray(startPointers, 0, blockCount);
+    this.avgChunkDocs = ArrayUtil.copyOfSubArray(avgChunkDocs, 0, blockCount);
+    this.avgChunkSizes = ArrayUtil.copyOfSubArray(avgChunkSizes, 0, blockCount);
+    this.docBasesDeltas = ArrayUtil.copyOfSubArray(docBasesDeltas, 0, blockCount);
+    this.startPointersDeltas = ArrayUtil.copyOfSubArray(startPointersDeltas, 0, blockCount);
+  }
+
+  /**
+   * Binary-searches {@code docBases} (assumed sorted in ascending order) for {@code docID} and
+   * returns the index of the last block whose doc base is not greater than it, or {@code -1}.
+   */
+  private int block(int docID) {
+    int lo = 0, hi = docBases.length - 1;
+    while (lo <= hi) {
+      final int mid = (lo + hi) >>> 1;
+      final int midValue = docBases[mid];
+      if (midValue == docID) {
+        return mid;
+      } else if (midValue < docID) {
+        lo = mid + 1;
+      } else {
+        hi = mid - 1;
+      }
+    }
+    return hi;
+  }
+
+  /** Doc base of a chunk relative to its block: average-based estimate plus the stored delta. */
+  private int relativeDocBase(int block, int relativeChunk) {
+    final int expected = avgChunkDocs[block] * relativeChunk;
+    final long delta = zigZagDecode(docBasesDeltas[block].get(relativeChunk));
+    return expected + (int) delta;
+  }
+
+  /** Start pointer of a chunk relative to its block: average-based estimate plus the delta. */
+  private long relativeStartPointer(int block, int relativeChunk) {
+    final long expected = avgChunkSizes[block] * relativeChunk;
+    final long delta = zigZagDecode(startPointersDeltas[block].get(relativeChunk));
+    return expected + delta;
+  }
+
+  /**
+   * Binary-searches the chunks of {@code block} and returns the index of the last chunk whose
+   * relative doc base is not greater than {@code relativeDoc}.
+   */
+  private int relativeChunk(int block, int relativeDoc) {
+    int lo = 0, hi = docBasesDeltas[block].size() - 1;
+    while (lo <= hi) {
+      final int mid = (lo + hi) >>> 1;
+      final int midValue = relativeDocBase(block, mid);
+      if (midValue == relativeDoc) {
+        return mid;
+      } else if (midValue < relativeDoc) {
+        lo = mid + 1;
+      } else {
+        hi = mid - 1;
+      }
+    }
+    return hi;
+  }
+
+  /**
+   * Returns the start pointer of the chunk that contains {@code docID}.
+   *
+   * @throws IllegalArgumentException if {@code docID} is not in {@code [0, maxDoc)}
+   */
+  @Override
+  long getStartPointer(int docID) {
+    if (docID < 0 || docID >= maxDoc) {
+      throw new IllegalArgumentException("docID out of range [0, " + maxDoc + "): " + docID);
+    }
+    final int block = block(docID);
+    final int relativeChunk = relativeChunk(block, docID - docBases[block]);
+    return startPointers[block] + relativeStartPointer(block, relativeChunk);
+  }
+
+  /** Returns {@code this}; no per-clone state is created. */
+  @Override
+  public LegacyFieldsIndexReader clone() {
+    return this;
+  }
+
+  @Override
+  public long ramBytesUsed() {
+    return BASE_RAM_BYTES_USED
+        + sizeOfReaders(docBasesDeltas)
+        + sizeOfReaders(startPointersDeltas)
+        + RamUsageEstimator.sizeOf(docBases)
+        + RamUsageEstimator.sizeOf(startPointers)
+        + RamUsageEstimator.sizeOf(avgChunkDocs)
+        + RamUsageEstimator.sizeOf(avgChunkSizes);
+  }
+
+  @Override
+  public Collection<Accountable> getChildResources() {
+    List<Accountable> resources = new ArrayList<>();
+    resources.add(
+        Accountables.namedAccountable("doc base deltas", sizeOfReaders(docBasesDeltas)));
+    resources.add(
+        Accountables.namedAccountable(
+            "start pointer deltas", sizeOfReaders(startPointersDeltas)));
+    return Collections.unmodifiableList(resources);
+  }
+
+  /** Shallow size of the array plus the reported RAM usage of every reader in it. */
+  private static long sizeOfReaders(PackedInts.Reader[] readers) {
+    long bytes = RamUsageEstimator.shallowSizeOf(readers);
+    for (PackedInts.Reader reader : readers) {
+      bytes += reader.ramBytesUsed();
+    }
+    return bytes;
+  }
+
+  @Override
+  public String toString() {
+    return getClass().getSimpleName() + "(blocks=" + docBases.length + ")";
+  }
+
+  @Override
+  public void close() throws IOException {
+    // nothing to do
+  }
+
+  @Override
+  void checkIntegrity() throws IOException {
+    // nothing to do, the index is checked at open time
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50CompressingTermVectorsFormat.java
similarity index 80%
copy from lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsFormat.java
copy to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50CompressingTermVectorsFormat.java
index 15016b8..628b21d 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsFormat.java
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50CompressingTermVectorsFormat.java
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.lucene.codecs.compressing;
+package org.apache.lucene.backward_codecs.lucene50.compressing;
import java.io.IOException;
import org.apache.lucene.codecs.CodecUtil;
@@ -22,6 +22,7 @@
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.codecs.TermVectorsWriter;
+import org.apache.lucene.codecs.compressing.CompressionMode;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.Directory;
@@ -33,16 +34,21 @@
*
* @lucene.experimental
*/
-public class CompressingTermVectorsFormat extends TermVectorsFormat {
+public class Lucene50CompressingTermVectorsFormat extends TermVectorsFormat {
- private final String formatName;
- private final String segmentSuffix;
- private final CompressionMode compressionMode;
- private final int chunkSize;
- private final int blockSize;
+ /** format name */
+ protected final String formatName;
+ /** segment suffix */
+ protected final String segmentSuffix;
+ /** compression mode */
+ protected final CompressionMode compressionMode;
+ /** chunk size */
+ protected final int chunkSize;
+ /** block size */
+ protected final int blockSize;
/**
- * Create a new {@link CompressingTermVectorsFormat}.
+ * Create a new {@link Lucene50CompressingTermVectorsFormat}.
*
* <p><code>formatName</code> is the name of the format. This name will be used in the file
* formats to perform {@link CodecUtil#checkIndexHeader codec header checks}.
@@ -50,7 +56,8 @@
* <p>The <code>compressionMode</code> parameter allows you to choose between compression
* algorithms that have various compression and decompression speeds so that you can pick the one
* that best fits your indexing and searching throughput. You should never instantiate two {@link
- * CompressingTermVectorsFormat}s that have the same name but different {@link CompressionMode}s.
+ * Lucene50CompressingTermVectorsFormat}s that have the same name but different {@link
+ * CompressionMode}s.
*
* <p><code>chunkSize</code> is the minimum byte size of a chunk of documents. Higher values of
* <code>chunkSize</code> should improve the compression ratio but will require more memory at
@@ -64,7 +71,7 @@
* @param blockSize the number of chunks to store in an index block.
* @see CompressionMode
*/
- public CompressingTermVectorsFormat(
+ public Lucene50CompressingTermVectorsFormat(
String formatName,
String segmentSuffix,
CompressionMode compressionMode,
@@ -87,22 +94,14 @@
public final TermVectorsReader vectorsReader(
Directory directory, SegmentInfo segmentInfo, FieldInfos fieldInfos, IOContext context)
throws IOException {
- return new CompressingTermVectorsReader(
+ return new Lucene50CompressingTermVectorsReader(
directory, segmentInfo, segmentSuffix, fieldInfos, context, formatName, compressionMode);
}
@Override
- public final TermVectorsWriter vectorsWriter(
+ public TermVectorsWriter vectorsWriter(
Directory directory, SegmentInfo segmentInfo, IOContext context) throws IOException {
- return new CompressingTermVectorsWriter(
- directory,
- segmentInfo,
- segmentSuffix,
- context,
- formatName,
- compressionMode,
- chunkSize,
- blockSize);
+ throw new UnsupportedOperationException("Old formats can't be used for writing");
}
@Override
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50CompressingTermVectorsReader.java
similarity index 95%
rename from lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java
rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50CompressingTermVectorsReader.java
index a3230fa..6d8a248 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50CompressingTermVectorsReader.java
@@ -14,22 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.lucene.codecs.compressing;
-
-import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.FLAGS_BITS;
-import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.META_VERSION_START;
-import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.OFFSETS;
-import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.PACKED_BLOCK_SIZE;
-import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.PAYLOADS;
-import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.POSITIONS;
-import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VECTORS_EXTENSION;
-import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VECTORS_INDEX_CODEC_NAME;
-import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VECTORS_INDEX_EXTENSION;
-import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VECTORS_META_EXTENSION;
-import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VERSION_CURRENT;
-import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VERSION_META;
-import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VERSION_OFFHEAP_INDEX;
-import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VERSION_START;
+package org.apache.lucene.backward_codecs.lucene50.compressing;
import java.io.Closeable;
import java.io.IOException;
@@ -39,6 +24,8 @@
import java.util.NoSuchElementException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.TermVectorsReader;
+import org.apache.lucene.codecs.compressing.CompressionMode;
+import org.apache.lucene.codecs.compressing.Decompressor;
import org.apache.lucene.index.BaseTermsEnum;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
@@ -67,11 +54,35 @@
import org.apache.lucene.util.packed.PackedInts;
/**
- * {@link TermVectorsReader} for {@link CompressingTermVectorsFormat}.
+ * {@link TermVectorsReader} for {@link Lucene50CompressingTermVectorsFormat}.
*
* @lucene.experimental
*/
-public final class CompressingTermVectorsReader extends TermVectorsReader implements Closeable {
+public final class Lucene50CompressingTermVectorsReader extends TermVectorsReader
+ implements Closeable {
+
+ // hard limit on the maximum number of documents per chunk
+ static final int MAX_DOCUMENTS_PER_CHUNK = 128;
+
+ static final String VECTORS_EXTENSION = "tvd";
+ static final String VECTORS_INDEX_EXTENSION = "tvx";
+ static final String VECTORS_META_EXTENSION = "tvm";
+ static final String VECTORS_INDEX_CODEC_NAME = "Lucene85TermVectorsIndex";
+
+ static final int VERSION_START = 1;
+ static final int VERSION_OFFHEAP_INDEX = 2;
+ /** Version where all metadata were moved to the meta file. */
+ static final int VERSION_META = 3;
+
+ static final int VERSION_CURRENT = VERSION_META;
+ static final int META_VERSION_START = 0;
+
+ static final int PACKED_BLOCK_SIZE = 64;
+
+ static final int POSITIONS = 0x01;
+ static final int OFFSETS = 0x02;
+ static final int PAYLOADS = 0x04;
+ static final int FLAGS_BITS = PackedInts.bitsRequired(POSITIONS | OFFSETS | PAYLOADS);
private final FieldInfos fieldInfos;
final FieldsIndex indexReader;
@@ -89,7 +100,7 @@
private final long maxPointer; // end of the data section
// used by clone
- private CompressingTermVectorsReader(CompressingTermVectorsReader reader) {
+ private Lucene50CompressingTermVectorsReader(Lucene50CompressingTermVectorsReader reader) {
this.fieldInfos = reader.fieldInfos;
this.vectorsStream = reader.vectorsStream.clone();
this.indexReader = reader.indexReader.clone();
@@ -108,7 +119,7 @@
}
/** Sole constructor. */
- public CompressingTermVectorsReader(
+ public Lucene50CompressingTermVectorsReader(
Directory d,
SegmentInfo si,
String segmentSuffix,
@@ -318,7 +329,7 @@
@Override
public TermVectorsReader clone() {
- return new CompressingTermVectorsReader(this);
+ return new Lucene50CompressingTermVectorsReader(this);
}
@Override
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/package-info.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/package-info.java
similarity index 81%
rename from lucene/core/src/java/org/apache/lucene/codecs/lucene50/package-info.java
rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/package-info.java
index af42de0..53d5f4b 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/package-info.java
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/package-info.java
@@ -15,8 +15,5 @@
* limitations under the License.
*/
-/**
- * Components from the Lucene 5.0 index format See {@link org.apache.lucene.codecs.lucene90} for an
- * overview of the index format.
- */
-package org.apache.lucene.codecs.lucene50;
+/** Lucene 5.0 compressing format. */
+package org.apache.lucene.backward_codecs.lucene50.compressing;
diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene70/Lucene70Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene70/Lucene70Codec.java
index e6e1d9e..4b7d8bf 100644
--- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene70/Lucene70Codec.java
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene70/Lucene70Codec.java
@@ -20,6 +20,7 @@
import org.apache.lucene.backward_codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.backward_codecs.lucene50.Lucene50StoredFieldsFormat;
import org.apache.lucene.backward_codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
+import org.apache.lucene.backward_codecs.lucene50.Lucene50TermVectorsFormat;
import org.apache.lucene.backward_codecs.lucene60.Lucene60FieldInfosFormat;
import org.apache.lucene.backward_codecs.lucene60.Lucene60PointsFormat;
import org.apache.lucene.codecs.Codec;
@@ -35,7 +36,6 @@
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.VectorFormat;
-import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
@@ -87,7 +87,7 @@
}
@Override
- public final TermVectorsFormat termVectorsFormat() {
+ public TermVectorsFormat termVectorsFormat() {
return vectorsFormat;
}
diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene80/Lucene80Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene80/Lucene80Codec.java
index 7136276..c745121 100644
--- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene80/Lucene80Codec.java
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene80/Lucene80Codec.java
@@ -19,6 +19,7 @@
import org.apache.lucene.backward_codecs.lucene50.Lucene50CompoundFormat;
import org.apache.lucene.backward_codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.backward_codecs.lucene50.Lucene50StoredFieldsFormat;
+import org.apache.lucene.backward_codecs.lucene50.Lucene50TermVectorsFormat;
import org.apache.lucene.backward_codecs.lucene60.Lucene60FieldInfosFormat;
import org.apache.lucene.backward_codecs.lucene60.Lucene60PointsFormat;
import org.apache.lucene.backward_codecs.lucene70.Lucene70SegmentInfoFormat;
@@ -34,7 +35,6 @@
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.VectorFormat;
-import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
@@ -84,7 +84,7 @@
}
@Override
- public final TermVectorsFormat termVectorsFormat() {
+ public TermVectorsFormat termVectorsFormat() {
return vectorsFormat;
}
diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene84/Lucene84Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene84/Lucene84Codec.java
index 81125f5..57c0d41 100644
--- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene84/Lucene84Codec.java
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene84/Lucene84Codec.java
@@ -21,6 +21,7 @@
import org.apache.lucene.backward_codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.backward_codecs.lucene50.Lucene50StoredFieldsFormat;
import org.apache.lucene.backward_codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
+import org.apache.lucene.backward_codecs.lucene50.Lucene50TermVectorsFormat;
import org.apache.lucene.backward_codecs.lucene60.Lucene60FieldInfosFormat;
import org.apache.lucene.backward_codecs.lucene60.Lucene60PointsFormat;
import org.apache.lucene.backward_codecs.lucene70.Lucene70SegmentInfoFormat;
@@ -38,7 +39,6 @@
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.VectorFormat;
-import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
@@ -99,7 +99,7 @@
}
@Override
- public final TermVectorsFormat termVectorsFormat() {
+ public TermVectorsFormat termVectorsFormat() {
return vectorsFormat;
}
diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene86/Lucene86Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene86/Lucene86Codec.java
index 78fb85ae..1cd5d4c 100644
--- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene86/Lucene86Codec.java
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene86/Lucene86Codec.java
@@ -21,6 +21,7 @@
import org.apache.lucene.backward_codecs.lucene50.Lucene50CompoundFormat;
import org.apache.lucene.backward_codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.backward_codecs.lucene50.Lucene50StoredFieldsFormat;
+import org.apache.lucene.backward_codecs.lucene50.Lucene50TermVectorsFormat;
import org.apache.lucene.backward_codecs.lucene60.Lucene60FieldInfosFormat;
import org.apache.lucene.backward_codecs.lucene80.Lucene80NormsFormat;
import org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat;
@@ -37,7 +38,6 @@
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.VectorFormat;
-import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
import org.apache.lucene.codecs.lucene86.Lucene86PointsFormat;
import org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
@@ -101,7 +101,7 @@
}
@Override
- public final TermVectorsFormat termVectorsFormat() {
+ public TermVectorsFormat termVectorsFormat() {
return vectorsFormat;
}
diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene87/Lucene87Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene87/Lucene87Codec.java
index 181e8fb..5703d5e 100644
--- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene87/Lucene87Codec.java
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene87/Lucene87Codec.java
@@ -20,6 +20,7 @@
import java.util.Objects;
import org.apache.lucene.backward_codecs.lucene50.Lucene50CompoundFormat;
import org.apache.lucene.backward_codecs.lucene50.Lucene50LiveDocsFormat;
+import org.apache.lucene.backward_codecs.lucene50.Lucene50TermVectorsFormat;
import org.apache.lucene.backward_codecs.lucene60.Lucene60FieldInfosFormat;
import org.apache.lucene.backward_codecs.lucene80.Lucene80DocValuesFormat;
import org.apache.lucene.backward_codecs.lucene80.Lucene80NormsFormat;
@@ -37,7 +38,6 @@
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.VectorFormat;
-import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
import org.apache.lucene.codecs.lucene86.Lucene86PointsFormat;
import org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat;
import org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat;
@@ -113,7 +113,7 @@
}
@Override
- public final TermVectorsFormat termVectorsFormat() {
+ public TermVectorsFormat termVectorsFormat() {
return vectorsFormat;
}
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50TermVectorsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/Lucene50RWTermVectorsFormat.java
similarity index 61%
copy from lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50TermVectorsFormat.java
copy to lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/Lucene50RWTermVectorsFormat.java
index b46f4f7..e3fef3f 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50TermVectorsFormat.java
+++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/Lucene50RWTermVectorsFormat.java
@@ -14,15 +14,16 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.lucene.codecs.lucene50;
+package org.apache.lucene.backward_codecs.lucene50;
-import org.apache.lucene.codecs.Codec;
-import org.apache.lucene.index.BaseTermVectorsFormatTestCase;
-import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50RWCompressingTermVectorsFormat;
+import org.apache.lucene.codecs.compressing.CompressionMode;
-public class TestLucene50TermVectorsFormat extends BaseTermVectorsFormatTestCase {
- @Override
- protected Codec getCodec() {
- return TestUtil.getDefaultCodec();
+/** RW impersonation of Lucene50TermVectorsFormat. */
+public final class Lucene50RWTermVectorsFormat extends Lucene50RWCompressingTermVectorsFormat {
+
+ /** Sole constructor. */
+ public Lucene50RWTermVectorsFormat() {
+ super("Lucene50TermVectorsData", "", CompressionMode.FAST, 1 << 12, 10);
}
}
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50TermVectorsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/TestLucene50TermVectorsFormat.java
similarity index 86%
copy from lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50TermVectorsFormat.java
copy to lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/TestLucene50TermVectorsFormat.java
index b46f4f7..4c0f868 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50TermVectorsFormat.java
+++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/TestLucene50TermVectorsFormat.java
@@ -14,15 +14,16 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.lucene.codecs.lucene50;
+package org.apache.lucene.backward_codecs.lucene50;
+import org.apache.lucene.backward_codecs.lucene87.Lucene87RWCodec;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BaseTermVectorsFormatTestCase;
-import org.apache.lucene.util.TestUtil;
public class TestLucene50TermVectorsFormat extends BaseTermVectorsFormatTestCase {
+
@Override
protected Codec getCodec() {
- return TestUtil.getDefaultCodec();
+ return new Lucene87RWCodec();
}
}
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/compressing/FieldsIndexWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/compressing/FieldsIndexWriter.java
new file mode 100644
index 0000000..c7ae5c1
--- /dev/null
+++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/compressing/FieldsIndexWriter.java
@@ -0,0 +1,202 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.backward_codecs.lucene50.compressing;
+
+import static org.apache.lucene.backward_codecs.lucene50.compressing.FieldsIndexReader.VERSION_CURRENT;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.store.ChecksumIndexInput;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.packed.DirectMonotonicReader;
+import org.apache.lucene.util.packed.DirectMonotonicWriter;
+
+/**
+ * Efficient index format for block-based {@link Codec}s.
+ *
+ * <p>For each block of compressed stored fields, this stores the first document of the block and
+ * the start pointer of the block in a {@link DirectMonotonicWriter}. At read time, the docID is
+ * binary-searched in the {@link DirectMonotonicReader} that records doc IDs, and the returned index
+ * is used to look up the start pointer in the {@link DirectMonotonicReader} that records start
+ * pointers.
+ *
+ * @lucene.internal
+ */
+public final class FieldsIndexWriter implements Closeable {
+
+ private final Directory dir;
+ private final String name;
+ private final String suffix;
+ private final String extension;
+ private final String codecName;
+ private final byte[] id;
+ private final int blockShift;
+ private final IOContext ioContext;
+ private IndexOutput docsOut;
+ private IndexOutput filePointersOut;
+ private int totalDocs;
+ private int totalChunks;
+ private long previousFP;
+
+ FieldsIndexWriter(
+ Directory dir,
+ String name,
+ String suffix,
+ String extension,
+ String codecName,
+ byte[] id,
+ int blockShift,
+ IOContext ioContext)
+ throws IOException {
+ this.dir = dir;
+ this.name = name;
+ this.suffix = suffix;
+ this.extension = extension;
+ this.codecName = codecName;
+ this.id = id;
+ this.blockShift = blockShift;
+ this.ioContext = ioContext;
+ this.docsOut = dir.createTempOutput(name, codecName + "-doc_ids", ioContext);
+ boolean success = false;
+ try {
+ CodecUtil.writeHeader(docsOut, codecName + "Docs", VERSION_CURRENT);
+ filePointersOut = dir.createTempOutput(name, codecName + "file_pointers", ioContext);
+ CodecUtil.writeHeader(filePointersOut, codecName + "FilePointers", VERSION_CURRENT);
+ success = true;
+ } finally {
+ if (success == false) {
+ close();
+ }
+ }
+ }
+
+ void writeIndex(int numDocs, long startPointer) throws IOException {
+ assert startPointer >= previousFP;
+ docsOut.writeVInt(numDocs);
+ filePointersOut.writeVLong(startPointer - previousFP);
+ previousFP = startPointer;
+ totalDocs += numDocs;
+ totalChunks++;
+ }
+
+ void finish(int numDocs, long maxPointer, IndexOutput metaOut) throws IOException {
+ if (numDocs != totalDocs) {
+ throw new IllegalStateException("Expected " + numDocs + " docs, but got " + totalDocs);
+ }
+ CodecUtil.writeFooter(docsOut);
+ CodecUtil.writeFooter(filePointersOut);
+ IOUtils.close(docsOut, filePointersOut);
+
+ try (IndexOutput dataOut =
+ dir.createOutput(IndexFileNames.segmentFileName(name, suffix, extension), ioContext)) {
+ CodecUtil.writeIndexHeader(dataOut, codecName + "Idx", VERSION_CURRENT, id, suffix);
+
+ metaOut.writeInt(numDocs);
+ metaOut.writeInt(blockShift);
+ metaOut.writeInt(totalChunks + 1);
+ metaOut.writeLong(dataOut.getFilePointer());
+
+ try (ChecksumIndexInput docsIn =
+ dir.openChecksumInput(docsOut.getName(), IOContext.READONCE)) {
+ CodecUtil.checkHeader(docsIn, codecName + "Docs", VERSION_CURRENT, VERSION_CURRENT);
+ Throwable priorE = null;
+ try {
+ final DirectMonotonicWriter docs =
+ DirectMonotonicWriter.getInstance(metaOut, dataOut, totalChunks + 1, blockShift);
+ long doc = 0;
+ docs.add(doc);
+ for (int i = 0; i < totalChunks; ++i) {
+ doc += docsIn.readVInt();
+ docs.add(doc);
+ }
+ docs.finish();
+ if (doc != totalDocs) {
+ throw new CorruptIndexException("Docs don't add up", docsIn);
+ }
+ } catch (Throwable e) {
+ priorE = e;
+ } finally {
+ CodecUtil.checkFooter(docsIn, priorE);
+ }
+ }
+ dir.deleteFile(docsOut.getName());
+ docsOut = null;
+
+ metaOut.writeLong(dataOut.getFilePointer());
+ try (ChecksumIndexInput filePointersIn =
+ dir.openChecksumInput(filePointersOut.getName(), IOContext.READONCE)) {
+ CodecUtil.checkHeader(
+ filePointersIn, codecName + "FilePointers", VERSION_CURRENT, VERSION_CURRENT);
+ Throwable priorE = null;
+ try {
+ final DirectMonotonicWriter filePointers =
+ DirectMonotonicWriter.getInstance(metaOut, dataOut, totalChunks + 1, blockShift);
+ long fp = 0;
+ for (int i = 0; i < totalChunks; ++i) {
+ fp += filePointersIn.readVLong();
+ filePointers.add(fp);
+ }
+ if (maxPointer < fp) {
+ throw new CorruptIndexException("File pointers don't add up", filePointersIn);
+ }
+ filePointers.add(maxPointer);
+ filePointers.finish();
+ } catch (Throwable e) {
+ priorE = e;
+ } finally {
+ CodecUtil.checkFooter(filePointersIn, priorE);
+ }
+ }
+ dir.deleteFile(filePointersOut.getName());
+ filePointersOut = null;
+
+ metaOut.writeLong(dataOut.getFilePointer());
+ metaOut.writeLong(maxPointer);
+
+ CodecUtil.writeFooter(dataOut);
+ }
+ }
+
+ @Override
+ public void close() throws IOException {
+ try {
+ IOUtils.close(docsOut, filePointersOut);
+ } finally {
+ List<String> fileNames = new ArrayList<>();
+ if (docsOut != null) {
+ fileNames.add(docsOut.getName());
+ }
+ if (filePointersOut != null) {
+ fileNames.add(filePointersOut.getName());
+ }
+ try {
+ IOUtils.deleteFiles(dir, fileNames);
+ } finally {
+ docsOut = filePointersOut = null;
+ }
+ }
+ }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50CompressingTermVectorsWriter.java
similarity index 93%
copy from lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java
copy to lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50CompressingTermVectorsWriter.java
index 95c633e..926a4be 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java
+++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50CompressingTermVectorsWriter.java
@@ -14,7 +14,19 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.lucene.codecs.compressing;
+package org.apache.lucene.backward_codecs.lucene50.compressing;
+
+import static org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50CompressingTermVectorsReader.FLAGS_BITS;
+import static org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50CompressingTermVectorsReader.MAX_DOCUMENTS_PER_CHUNK;
+import static org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50CompressingTermVectorsReader.OFFSETS;
+import static org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50CompressingTermVectorsReader.PACKED_BLOCK_SIZE;
+import static org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50CompressingTermVectorsReader.PAYLOADS;
+import static org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50CompressingTermVectorsReader.POSITIONS;
+import static org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50CompressingTermVectorsReader.VECTORS_EXTENSION;
+import static org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50CompressingTermVectorsReader.VECTORS_INDEX_CODEC_NAME;
+import static org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50CompressingTermVectorsReader.VECTORS_INDEX_EXTENSION;
+import static org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50CompressingTermVectorsReader.VECTORS_META_EXTENSION;
+import static org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50CompressingTermVectorsReader.VERSION_CURRENT;
import java.io.IOException;
import java.util.ArrayDeque;
@@ -28,6 +40,9 @@
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.codecs.TermVectorsWriter;
+import org.apache.lucene.codecs.compressing.CompressionMode;
+import org.apache.lucene.codecs.compressing.Compressor;
+import org.apache.lucene.codecs.compressing.MatchingReaders;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
@@ -51,34 +66,11 @@
import org.apache.lucene.util.packed.PackedInts;
/**
- * {@link TermVectorsWriter} for {@link CompressingTermVectorsFormat}.
+ * {@link TermVectorsWriter} for {@link Lucene50CompressingTermVectorsFormat}.
*
* @lucene.experimental
*/
-public final class CompressingTermVectorsWriter extends TermVectorsWriter {
-
- // hard limit on the maximum number of documents per chunk
- static final int MAX_DOCUMENTS_PER_CHUNK = 128;
-
- static final String VECTORS_EXTENSION = "tvd";
- static final String VECTORS_INDEX_EXTENSION = "tvx";
- static final String VECTORS_META_EXTENSION = "tvm";
- static final String VECTORS_INDEX_CODEC_NAME = "Lucene85TermVectorsIndex";
-
- static final int VERSION_START = 1;
- static final int VERSION_OFFHEAP_INDEX = 2;
- /** Version where all metadata were moved to the meta file. */
- static final int VERSION_META = 3;
-
- static final int VERSION_CURRENT = VERSION_META;
- static final int META_VERSION_START = 0;
-
- static final int PACKED_BLOCK_SIZE = 64;
-
- static final int POSITIONS = 0x01;
- static final int OFFSETS = 0x02;
- static final int PAYLOADS = 0x04;
- static final int FLAGS_BITS = PackedInts.bitsRequired(POSITIONS | OFFSETS | PAYLOADS);
+public final class Lucene50CompressingTermVectorsWriter extends TermVectorsWriter {
private final String segment;
private FieldsIndexWriter indexWriter;
@@ -227,7 +219,7 @@
private final BlockPackedWriter writer;
/** Sole constructor. */
- CompressingTermVectorsWriter(
+ Lucene50CompressingTermVectorsWriter(
Directory directory,
SegmentInfo si,
String segmentSuffix,
@@ -792,7 +784,7 @@
// we try to be extra safe with this impl, but add an escape hatch to
// have a workaround for undiscovered bugs.
static final String BULK_MERGE_ENABLED_SYSPROP =
- CompressingTermVectorsWriter.class.getName() + ".enableBulkMerge";
+ Lucene50CompressingTermVectorsWriter.class.getName() + ".enableBulkMerge";
static final boolean BULK_MERGE_ENABLED;
static {
@@ -818,12 +810,13 @@
MatchingReaders matching = new MatchingReaders(mergeState);
for (int readerIndex = 0; readerIndex < numReaders; readerIndex++) {
- CompressingTermVectorsReader matchingVectorsReader = null;
+ Lucene50CompressingTermVectorsReader matchingVectorsReader = null;
final TermVectorsReader vectorsReader = mergeState.termVectorsReaders[readerIndex];
if (matching.matchingReaders[readerIndex]) {
// we can only bulk-copy if the matching reader is also a CompressingTermVectorsReader
- if (vectorsReader != null && vectorsReader instanceof CompressingTermVectorsReader) {
- matchingVectorsReader = (CompressingTermVectorsReader) vectorsReader;
+ if (vectorsReader != null
+ && vectorsReader instanceof Lucene50CompressingTermVectorsReader) {
+ matchingVectorsReader = (Lucene50CompressingTermVectorsReader) vectorsReader;
}
}
@@ -936,7 +929,7 @@
* some worst-case situations (e.g. frequent reopen with tiny flushes), over time the compression
* ratio can degrade. This is a safety switch.
*/
- boolean tooDirty(CompressingTermVectorsReader candidate) {
+ boolean tooDirty(Lucene50CompressingTermVectorsReader candidate) {
// more than 1% dirty, or more than hard limit of 1024 dirty chunks
return candidate.getNumDirtyChunks() > 1024
|| candidate.getNumDirtyDocs() * 100 > candidate.getNumDocs();
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50RWCompressingTermVectorsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50RWCompressingTermVectorsFormat.java
new file mode 100644
index 0000000..68ef482
--- /dev/null
+++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50RWCompressingTermVectorsFormat.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.backward_codecs.lucene50.compressing;
+
+import java.io.IOException;
+import org.apache.lucene.codecs.TermVectorsWriter;
+import org.apache.lucene.codecs.compressing.CompressionMode;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+
+/** RW impersonation of Lucene50CompressingTermVectorsFormat. */
+public class Lucene50RWCompressingTermVectorsFormat extends Lucene50CompressingTermVectorsFormat {
+
+ /** Sole constructor. */
+ public Lucene50RWCompressingTermVectorsFormat(
+ String formatName,
+ String segmentSuffix,
+ CompressionMode compressionMode,
+ int chunkSize,
+ int blockSize) {
+ super(formatName, segmentSuffix, compressionMode, chunkSize, blockSize);
+ }
+
+ @Override
+ public final TermVectorsWriter vectorsWriter(
+ Directory directory, SegmentInfo segmentInfo, IOContext context) throws IOException {
+ return new Lucene50CompressingTermVectorsWriter(
+ directory,
+ segmentInfo,
+ segmentSuffix,
+ context,
+ formatName,
+ compressionMode,
+ chunkSize,
+ blockSize);
+ }
+}
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene70/Lucene70RWCodec.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene70/Lucene70RWCodec.java
index 7b44821..5a38dd4 100644
--- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene70/Lucene70RWCodec.java
+++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene70/Lucene70RWCodec.java
@@ -19,11 +19,13 @@
import org.apache.lucene.backward_codecs.lucene50.Lucene50RWCompoundFormat;
import org.apache.lucene.backward_codecs.lucene50.Lucene50RWPostingsFormat;
import org.apache.lucene.backward_codecs.lucene50.Lucene50RWStoredFieldsFormat;
+import org.apache.lucene.backward_codecs.lucene50.Lucene50RWTermVectorsFormat;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
+import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
/** RW impersonation of {@link Lucene70Codec}. */
@@ -65,4 +67,9 @@
public CompoundFormat compoundFormat() {
return new Lucene50RWCompoundFormat();
}
+
+ @Override
+ public TermVectorsFormat termVectorsFormat() {
+ return new Lucene50RWTermVectorsFormat();
+ }
}
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene84/Lucene84RWCodec.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene84/Lucene84RWCodec.java
index 3b70b2c..5d578d0 100644
--- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene84/Lucene84RWCodec.java
+++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene84/Lucene84RWCodec.java
@@ -18,6 +18,7 @@
import org.apache.lucene.backward_codecs.lucene50.Lucene50RWCompoundFormat;
import org.apache.lucene.backward_codecs.lucene50.Lucene50RWStoredFieldsFormat;
+import org.apache.lucene.backward_codecs.lucene50.Lucene50RWTermVectorsFormat;
import org.apache.lucene.backward_codecs.lucene60.Lucene60RWPointsFormat;
import org.apache.lucene.backward_codecs.lucene70.Lucene70RWSegmentInfoFormat;
import org.apache.lucene.backward_codecs.lucene80.Lucene80RWNormsFormat;
@@ -27,6 +28,7 @@
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
+import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
/** RW impersonation of {@link Lucene84Codec}. */
@@ -70,4 +72,9 @@
public final CompoundFormat compoundFormat() {
return new Lucene50RWCompoundFormat();
}
+
+ @Override
+ public TermVectorsFormat termVectorsFormat() {
+ return new Lucene50RWTermVectorsFormat();
+ }
}
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene86/Lucene86RWCodec.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene86/Lucene86RWCodec.java
index 97d5633..6a544fd 100644
--- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene86/Lucene86RWCodec.java
+++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene86/Lucene86RWCodec.java
@@ -18,6 +18,7 @@
import org.apache.lucene.backward_codecs.lucene50.Lucene50RWCompoundFormat;
import org.apache.lucene.backward_codecs.lucene50.Lucene50RWStoredFieldsFormat;
+import org.apache.lucene.backward_codecs.lucene50.Lucene50RWTermVectorsFormat;
import org.apache.lucene.backward_codecs.lucene50.Lucene50StoredFieldsFormat;
import org.apache.lucene.backward_codecs.lucene80.Lucene80RWNormsFormat;
import org.apache.lucene.backward_codecs.lucene84.Lucene84RWPostingsFormat;
@@ -25,6 +26,7 @@
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
+import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
/** RW impersonation of {@link Lucene86Codec}. */
@@ -69,4 +71,9 @@
public final CompoundFormat compoundFormat() {
return new Lucene50RWCompoundFormat();
}
+
+ @Override
+ public TermVectorsFormat termVectorsFormat() {
+ return new Lucene50RWTermVectorsFormat();
+ }
}
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene87/Lucene87RWCodec.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene87/Lucene87RWCodec.java
index 8fac2d6..a5af408 100644
--- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene87/Lucene87RWCodec.java
+++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene87/Lucene87RWCodec.java
@@ -17,11 +17,13 @@
package org.apache.lucene.backward_codecs.lucene87;
import org.apache.lucene.backward_codecs.lucene50.Lucene50RWCompoundFormat;
+import org.apache.lucene.backward_codecs.lucene50.Lucene50RWTermVectorsFormat;
import org.apache.lucene.backward_codecs.lucene80.Lucene80RWNormsFormat;
import org.apache.lucene.backward_codecs.lucene84.Lucene84RWPostingsFormat;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
/** RW impersonation of {@link Lucene87Codec}. */
@@ -50,4 +52,9 @@
public PostingsFormat postingsFormat() {
return postingsFormat;
}
+
+ @Override
+ public TermVectorsFormat termVectorsFormat() {
+ return new Lucene50RWTermVectorsFormat();
+ }
}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/Lucene90CompressingTermVectorsFormat.java
similarity index 91%
rename from lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsFormat.java
rename to lucene/core/src/java/org/apache/lucene/codecs/compressing/Lucene90CompressingTermVectorsFormat.java
index 15016b8..13e0708 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/Lucene90CompressingTermVectorsFormat.java
@@ -33,7 +33,7 @@
*
* @lucene.experimental
*/
-public class CompressingTermVectorsFormat extends TermVectorsFormat {
+public class Lucene90CompressingTermVectorsFormat extends TermVectorsFormat {
private final String formatName;
private final String segmentSuffix;
@@ -42,7 +42,7 @@
private final int blockSize;
/**
- * Create a new {@link CompressingTermVectorsFormat}.
+ * Create a new {@link Lucene90CompressingTermVectorsFormat}.
*
* <p><code>formatName</code> is the name of the format. This name will be used in the file
* formats to perform {@link CodecUtil#checkIndexHeader codec header checks}.
@@ -50,7 +50,8 @@
* <p>The <code>compressionMode</code> parameter allows you to choose between compression
* algorithms that have various compression and decompression speeds so that you can pick the one
* that best fits your indexing and searching throughput. You should never instantiate two {@link
- * CompressingTermVectorsFormat}s that have the same name but different {@link CompressionMode}s.
+ * Lucene90CompressingTermVectorsFormat}s that have the same name but different {@link
+ * CompressionMode}s.
*
* <p><code>chunkSize</code> is the minimum byte size of a chunk of documents. Higher values of
* <code>chunkSize</code> should improve the compression ratio but will require more memory at
@@ -64,7 +65,7 @@
* @param blockSize the number of chunks to store in an index block.
* @see CompressionMode
*/
- public CompressingTermVectorsFormat(
+ public Lucene90CompressingTermVectorsFormat(
String formatName,
String segmentSuffix,
CompressionMode compressionMode,
@@ -87,14 +88,14 @@
public final TermVectorsReader vectorsReader(
Directory directory, SegmentInfo segmentInfo, FieldInfos fieldInfos, IOContext context)
throws IOException {
- return new CompressingTermVectorsReader(
+ return new Lucene90CompressingTermVectorsReader(
directory, segmentInfo, segmentSuffix, fieldInfos, context, formatName, compressionMode);
}
@Override
public final TermVectorsWriter vectorsWriter(
Directory directory, SegmentInfo segmentInfo, IOContext context) throws IOException {
- return new CompressingTermVectorsWriter(
+ return new Lucene90CompressingTermVectorsWriter(
directory,
segmentInfo,
segmentSuffix,
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/Lucene90CompressingTermVectorsReader.java
similarity index 88%
copy from lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java
copy to lucene/core/src/java/org/apache/lucene/codecs/compressing/Lucene90CompressingTermVectorsReader.java
index a3230fa..c31a37f 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/Lucene90CompressingTermVectorsReader.java
@@ -16,20 +16,18 @@
*/
package org.apache.lucene.codecs.compressing;
-import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.FLAGS_BITS;
-import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.META_VERSION_START;
-import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.OFFSETS;
-import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.PACKED_BLOCK_SIZE;
-import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.PAYLOADS;
-import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.POSITIONS;
-import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VECTORS_EXTENSION;
-import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VECTORS_INDEX_CODEC_NAME;
-import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VECTORS_INDEX_EXTENSION;
-import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VECTORS_META_EXTENSION;
-import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VERSION_CURRENT;
-import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VERSION_META;
-import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VERSION_OFFHEAP_INDEX;
-import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VERSION_START;
+import static org.apache.lucene.codecs.compressing.Lucene90CompressingTermVectorsWriter.FLAGS_BITS;
+import static org.apache.lucene.codecs.compressing.Lucene90CompressingTermVectorsWriter.META_VERSION_START;
+import static org.apache.lucene.codecs.compressing.Lucene90CompressingTermVectorsWriter.OFFSETS;
+import static org.apache.lucene.codecs.compressing.Lucene90CompressingTermVectorsWriter.PACKED_BLOCK_SIZE;
+import static org.apache.lucene.codecs.compressing.Lucene90CompressingTermVectorsWriter.PAYLOADS;
+import static org.apache.lucene.codecs.compressing.Lucene90CompressingTermVectorsWriter.POSITIONS;
+import static org.apache.lucene.codecs.compressing.Lucene90CompressingTermVectorsWriter.VECTORS_EXTENSION;
+import static org.apache.lucene.codecs.compressing.Lucene90CompressingTermVectorsWriter.VECTORS_INDEX_CODEC_NAME;
+import static org.apache.lucene.codecs.compressing.Lucene90CompressingTermVectorsWriter.VECTORS_INDEX_EXTENSION;
+import static org.apache.lucene.codecs.compressing.Lucene90CompressingTermVectorsWriter.VECTORS_META_EXTENSION;
+import static org.apache.lucene.codecs.compressing.Lucene90CompressingTermVectorsWriter.VERSION_CURRENT;
+import static org.apache.lucene.codecs.compressing.Lucene90CompressingTermVectorsWriter.VERSION_START;
import java.io.Closeable;
import java.io.IOException;
@@ -67,11 +65,12 @@
import org.apache.lucene.util.packed.PackedInts;
/**
- * {@link TermVectorsReader} for {@link CompressingTermVectorsFormat}.
+ * {@link TermVectorsReader} for {@link Lucene90CompressingTermVectorsFormat}.
*
* @lucene.experimental
*/
-public final class CompressingTermVectorsReader extends TermVectorsReader implements Closeable {
+public final class Lucene90CompressingTermVectorsReader extends TermVectorsReader
+ implements Closeable {
private final FieldInfos fieldInfos;
final FieldsIndex indexReader;
@@ -89,7 +88,7 @@
private final long maxPointer; // end of the data section
// used by clone
- private CompressingTermVectorsReader(CompressingTermVectorsReader reader) {
+ private Lucene90CompressingTermVectorsReader(Lucene90CompressingTermVectorsReader reader) {
this.fieldInfos = reader.fieldInfos;
this.vectorsStream = reader.vectorsStream.clone();
this.indexReader = reader.indexReader.clone();
@@ -108,7 +107,7 @@
}
/** Sole constructor. */
- public CompressingTermVectorsReader(
+ public Lucene90CompressingTermVectorsReader(
Directory d,
SegmentInfo si,
String segmentSuffix,
@@ -135,26 +134,19 @@
assert CodecUtil.indexHeaderLength(formatName, segmentSuffix)
== vectorsStream.getFilePointer();
- if (version >= VERSION_OFFHEAP_INDEX) {
- final String metaStreamFN =
- IndexFileNames.segmentFileName(segment, segmentSuffix, VECTORS_META_EXTENSION);
- metaIn = d.openChecksumInput(metaStreamFN, IOContext.READONCE);
- CodecUtil.checkIndexHeader(
- metaIn,
- VECTORS_INDEX_CODEC_NAME + "Meta",
- META_VERSION_START,
- version,
- si.getId(),
- segmentSuffix);
- }
+ final String metaStreamFN =
+ IndexFileNames.segmentFileName(segment, segmentSuffix, VECTORS_META_EXTENSION);
+ metaIn = d.openChecksumInput(metaStreamFN, IOContext.READONCE);
+ CodecUtil.checkIndexHeader(
+ metaIn,
+ VECTORS_INDEX_CODEC_NAME + "Meta",
+ META_VERSION_START,
+ version,
+ si.getId(),
+ segmentSuffix);
- if (version >= VERSION_META) {
- packedIntsVersion = metaIn.readVInt();
- chunkSize = metaIn.readVInt();
- } else {
- packedIntsVersion = vectorsStream.readVInt();
- chunkSize = vectorsStream.readVInt();
- }
+ packedIntsVersion = metaIn.readVInt();
+ chunkSize = metaIn.readVInt();
// NOTE: data file is too costly to verify checksum against all the bytes on open,
// but for now we at least verify proper structure of the checksum footer: which looks
@@ -162,70 +154,21 @@
// such as file truncation.
CodecUtil.retrieveChecksum(vectorsStream);
- FieldsIndex indexReader = null;
- long maxPointer = -1;
+ FieldsIndexReader fieldsIndexReader =
+ new FieldsIndexReader(
+ d,
+ si.name,
+ segmentSuffix,
+ VECTORS_INDEX_EXTENSION,
+ VECTORS_INDEX_CODEC_NAME,
+ si.getId(),
+ metaIn);
- if (version < VERSION_OFFHEAP_INDEX) {
- // Load the index into memory
- final String indexName = IndexFileNames.segmentFileName(segment, segmentSuffix, "tvx");
- try (ChecksumIndexInput indexStream = d.openChecksumInput(indexName, context)) {
- Throwable priorE = null;
- try {
- assert formatName.endsWith("Data");
- final String codecNameIdx =
- formatName.substring(0, formatName.length() - "Data".length()) + "Index";
- final int version2 =
- CodecUtil.checkIndexHeader(
- indexStream,
- codecNameIdx,
- VERSION_START,
- VERSION_CURRENT,
- si.getId(),
- segmentSuffix);
- if (version != version2) {
- throw new CorruptIndexException(
- "Version mismatch between stored fields index and data: "
- + version
- + " != "
- + version2,
- indexStream);
- }
- assert CodecUtil.indexHeaderLength(codecNameIdx, segmentSuffix)
- == indexStream.getFilePointer();
- indexReader = new LegacyFieldsIndexReader(indexStream, si);
- maxPointer = indexStream.readVLong(); // the end of the data section
- } catch (Throwable exception) {
- priorE = exception;
- } finally {
- CodecUtil.checkFooter(indexStream, priorE);
- }
- }
- } else {
- FieldsIndexReader fieldsIndexReader =
- new FieldsIndexReader(
- d,
- si.name,
- segmentSuffix,
- VECTORS_INDEX_EXTENSION,
- VECTORS_INDEX_CODEC_NAME,
- si.getId(),
- metaIn);
- indexReader = fieldsIndexReader;
- maxPointer = fieldsIndexReader.getMaxPointer();
- }
+ this.indexReader = fieldsIndexReader;
+ this.maxPointer = fieldsIndexReader.getMaxPointer();
- this.indexReader = indexReader;
- this.maxPointer = maxPointer;
-
- if (version >= VERSION_META) {
- numDirtyChunks = metaIn.readVLong();
- numDirtyDocs = metaIn.readVLong();
- } else {
- // Old versions of this format did not record numDirtyDocs. Since bulk
- // merges are disabled on version increments anyway, we make no effort
- // to get valid values of numDirtyChunks and numDirtyDocs.
- numDirtyChunks = numDirtyDocs = -1;
- }
+ numDirtyChunks = metaIn.readVLong();
+ numDirtyDocs = metaIn.readVLong();
decompressor = compressionMode.newDecompressor();
this.reader =
@@ -318,7 +261,7 @@
@Override
public TermVectorsReader clone() {
- return new CompressingTermVectorsReader(this);
+ return new Lucene90CompressingTermVectorsReader(this);
}
@Override
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/Lucene90CompressingTermVectorsWriter.java
similarity index 97%
rename from lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java
rename to lucene/core/src/java/org/apache/lucene/codecs/compressing/Lucene90CompressingTermVectorsWriter.java
index 95c633e..4fc88c3 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/Lucene90CompressingTermVectorsWriter.java
@@ -51,11 +51,11 @@
import org.apache.lucene.util.packed.PackedInts;
/**
- * {@link TermVectorsWriter} for {@link CompressingTermVectorsFormat}.
+ * {@link TermVectorsWriter} for {@link Lucene90CompressingTermVectorsFormat}.
*
* @lucene.experimental
*/
-public final class CompressingTermVectorsWriter extends TermVectorsWriter {
+public final class Lucene90CompressingTermVectorsWriter extends TermVectorsWriter {
// hard limit on the maximum number of documents per chunk
static final int MAX_DOCUMENTS_PER_CHUNK = 128;
@@ -63,14 +63,10 @@
static final String VECTORS_EXTENSION = "tvd";
static final String VECTORS_INDEX_EXTENSION = "tvx";
static final String VECTORS_META_EXTENSION = "tvm";
- static final String VECTORS_INDEX_CODEC_NAME = "Lucene85TermVectorsIndex";
+ static final String VECTORS_INDEX_CODEC_NAME = "Lucene90TermVectorsIndex";
- static final int VERSION_START = 1;
- static final int VERSION_OFFHEAP_INDEX = 2;
- /** Version where all metadata were moved to the meta file. */
- static final int VERSION_META = 3;
-
- static final int VERSION_CURRENT = VERSION_META;
+ static final int VERSION_START = 0;
+ static final int VERSION_CURRENT = VERSION_START;
static final int META_VERSION_START = 0;
static final int PACKED_BLOCK_SIZE = 64;
@@ -227,7 +223,7 @@
private final BlockPackedWriter writer;
/** Sole constructor. */
- CompressingTermVectorsWriter(
+ Lucene90CompressingTermVectorsWriter(
Directory directory,
SegmentInfo si,
String segmentSuffix,
@@ -792,7 +788,7 @@
// we try to be extra safe with this impl, but add an escape hatch to
// have a workaround for undiscovered bugs.
static final String BULK_MERGE_ENABLED_SYSPROP =
- CompressingTermVectorsWriter.class.getName() + ".enableBulkMerge";
+ Lucene90CompressingTermVectorsWriter.class.getName() + ".enableBulkMerge";
static final boolean BULK_MERGE_ENABLED;
static {
@@ -818,12 +814,13 @@
MatchingReaders matching = new MatchingReaders(mergeState);
for (int readerIndex = 0; readerIndex < numReaders; readerIndex++) {
- CompressingTermVectorsReader matchingVectorsReader = null;
+ Lucene90CompressingTermVectorsReader matchingVectorsReader = null;
final TermVectorsReader vectorsReader = mergeState.termVectorsReaders[readerIndex];
if (matching.matchingReaders[readerIndex]) {
- // we can only bulk-copy if the matching reader is also a CompressingTermVectorsReader
+ // we can only bulk-copy if the matching reader is a Lucene90CompressingTermVectorsReader
- if (vectorsReader != null && vectorsReader instanceof CompressingTermVectorsReader) {
- matchingVectorsReader = (CompressingTermVectorsReader) vectorsReader;
+ if (vectorsReader != null
+ && vectorsReader instanceof Lucene90CompressingTermVectorsReader) {
+ matchingVectorsReader = (Lucene90CompressingTermVectorsReader) vectorsReader;
}
}
@@ -936,7 +933,7 @@
* some worst-case situations (e.g. frequent reopen with tiny flushes), over time the compression
* ratio can degrade. This is a safety switch.
*/
- boolean tooDirty(CompressingTermVectorsReader candidate) {
+ boolean tooDirty(Lucene90CompressingTermVectorsReader candidate) {
// more than 1% dirty, or more than hard limit of 1024 dirty chunks
return candidate.getNumDirtyChunks() > 1024
|| candidate.getNumDirtyDocs() * 100 > candidate.getNumDocs();
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/MatchingReaders.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/MatchingReaders.java
index 8ec245a..0305ff2 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/MatchingReaders.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/MatchingReaders.java
@@ -23,19 +23,21 @@
/**
* Computes which segments have identical field name to number mappings, which allows stored fields
* and term vectors in this codec to be bulk-merged.
+ *
+ * @lucene.internal
*/
-class MatchingReaders {
+public class MatchingReaders {
/**
* {@link SegmentReader}s that have identical field name/number mapping, so their stored fields
* and term vectors may be bulk merged.
*/
- final boolean[] matchingReaders;
+ public final boolean[] matchingReaders;
/** How many {@link #matchingReaders} are set. */
final int count;
- MatchingReaders(MergeState mergeState) {
+ public MatchingReaders(MergeState mergeState) {
// If the i'th reader is a SegmentReader and has
// identical fieldName -> number mapping, then this
// array will be non-null at position i:
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90Codec.java
index 770f352..616d7d7 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90Codec.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90Codec.java
@@ -30,7 +30,6 @@
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.VectorFormat;
-import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
import org.apache.lucene.codecs.lucene86.Lucene86PointsFormat;
import org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat;
import org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat;
@@ -65,7 +64,7 @@
}
}
- private final TermVectorsFormat vectorsFormat = new Lucene50TermVectorsFormat();
+ private final TermVectorsFormat vectorsFormat = new Lucene90TermVectorsFormat();
private final FieldInfosFormat fieldInfosFormat = new Lucene90FieldInfosFormat();
private final SegmentInfoFormat segmentInfosFormat = new Lucene86SegmentInfoFormat();
private final LiveDocsFormat liveDocsFormat = new Lucene90LiveDocsFormat();
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50TermVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90TermVectorsFormat.java
similarity index 95%
copy from lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50TermVectorsFormat.java
copy to lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90TermVectorsFormat.java
index 270767e..8c894ed 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50TermVectorsFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90TermVectorsFormat.java
@@ -14,20 +14,20 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.lucene.codecs.lucene50;
+package org.apache.lucene.codecs.lucene90;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.TermVectorsFormat;
-import org.apache.lucene.codecs.compressing.CompressingTermVectorsFormat;
import org.apache.lucene.codecs.compressing.CompressionMode;
import org.apache.lucene.codecs.compressing.FieldsIndexWriter;
+import org.apache.lucene.codecs.compressing.Lucene90CompressingTermVectorsFormat;
import org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.packed.BlockPackedWriter;
import org.apache.lucene.util.packed.PackedInts;
/**
- * Lucene 5.0 {@link TermVectorsFormat term vectors format}.
+ * Lucene 9.0 {@link TermVectorsFormat term vectors format}.
*
* <p>Very similarly to {@link Lucene87StoredFieldsFormat}, this format is based on compressed
* chunks of data, with document-level granularity so that a document can never span across distinct
@@ -151,10 +151,10 @@
*
* @lucene.experimental
*/
-public final class Lucene50TermVectorsFormat extends CompressingTermVectorsFormat {
+public final class Lucene90TermVectorsFormat extends Lucene90CompressingTermVectorsFormat {
/** Sole constructor. */
- public Lucene50TermVectorsFormat() {
- super("Lucene50TermVectorsData", "", CompressionMode.FAST, 1 << 12, 10);
+ public Lucene90TermVectorsFormat() {
+ super("Lucene90TermVectorsData", "", CompressionMode.FAST, 1 << 12, 10);
}
}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/package-info.java
index 83a3912..dcab523 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/package-info.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/package-info.java
@@ -165,7 +165,7 @@
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For
* each field in each document, a value is stored that is multiplied into the score for hits
* on that field.
- * <li>{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vectors}. For each
+ * <li>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vectors}. For each
* field in each document, the term vector (sometimes called document vector) may be stored. A
* term vector consists of term text and term frequency. To add Term Vectors to your index see
* the {@link org.apache.lucene.document.Field Field} constructors
@@ -290,12 +290,12 @@
* <td>Encodes additional scoring factors or other per-document information.</td>
* </tr>
* <tr>
- * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Index}</td>
+ * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index}</td>
* <td>.tvx</td>
* <td>Stores offset into the document data file</td>
* </tr>
* <tr>
- * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Data}</td>
+ * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data}</td>
* <td>.tvd</td>
* <td>Contains term vector data.</td>
* </tr>
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingTermVectorsConsumer.java b/lucene/core/src/java/org/apache/lucene/index/SortingTermVectorsConsumer.java
index 2c859c9..d6b72f8 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortingTermVectorsConsumer.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortingTermVectorsConsumer.java
@@ -24,7 +24,7 @@
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.codecs.TermVectorsWriter;
-import org.apache.lucene.codecs.compressing.CompressingTermVectorsFormat;
+import org.apache.lucene.codecs.compressing.Lucene90CompressingTermVectorsFormat;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FlushInfo;
@@ -37,7 +37,7 @@
final class SortingTermVectorsConsumer extends TermVectorsConsumer {
private static final TermVectorsFormat TEMP_TERM_VECTORS_FORMAT =
- new CompressingTermVectorsFormat(
+ new Lucene90CompressingTermVectorsFormat(
"TempTermVectors", "", SortingStoredFieldsConsumer.NO_COMPRESSION, 8 * 1024, 10);
TrackingTmpOutputDirectoryWrapper tmpDirectory;
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50TermVectorsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90TermVectorsFormat.java
similarity index 90%
rename from lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50TermVectorsFormat.java
rename to lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90TermVectorsFormat.java
index b46f4f7..00bf963 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50TermVectorsFormat.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90TermVectorsFormat.java
@@ -14,13 +14,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.lucene.codecs.lucene50;
+package org.apache.lucene.codecs.lucene90;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BaseTermVectorsFormatTestCase;
import org.apache.lucene.util.TestUtil;
-public class TestLucene50TermVectorsFormat extends BaseTermVectorsFormatTestCase {
+public class TestLucene90TermVectorsFormat extends BaseTermVectorsFormatTestCase {
@Override
protected Codec getCodec() {
return TestUtil.getDefaultCodec();
diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/CompressingCodec.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/CompressingCodec.java
index 7539511..cc76011 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/CompressingCodec.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/CompressingCodec.java
@@ -102,7 +102,7 @@
}
private final CompressingStoredFieldsFormat storedFieldsFormat;
- private final CompressingTermVectorsFormat termVectorsFormat;
+ private final Lucene90CompressingTermVectorsFormat termVectorsFormat;
/** Creates a compressing codec with a given segment suffix */
public CompressingCodec(
@@ -117,7 +117,7 @@
new CompressingStoredFieldsFormat(
name, segmentSuffix, compressionMode, chunkSize, maxDocsPerChunk, blockShift);
this.termVectorsFormat =
- new CompressingTermVectorsFormat(
+ new Lucene90CompressingTermVectorsFormat(
name, segmentSuffix, compressionMode, chunkSize, blockShift);
}
diff --git a/lucene/test-framework/src/test/org/apache/lucene/codecs/compressing/TestCompressingTermVectorsFormat.java b/lucene/test-framework/src/test/org/apache/lucene/codecs/compressing/TestCompressingTermVectorsFormat.java
index 18785f0..1c8f1d0 100644
--- a/lucene/test-framework/src/test/org/apache/lucene/codecs/compressing/TestCompressingTermVectorsFormat.java
+++ b/lucene/test-framework/src/test/org/apache/lucene/codecs/compressing/TestCompressingTermVectorsFormat.java
@@ -100,8 +100,8 @@
// examine dirty counts:
for (LeafReaderContext leaf : ir2.leaves()) {
CodecReader sr = (CodecReader) leaf.reader();
- CompressingTermVectorsReader reader =
- (CompressingTermVectorsReader) sr.getTermVectorsReader();
+ Lucene90CompressingTermVectorsReader reader =
+ (Lucene90CompressingTermVectorsReader) sr.getTermVectorsReader();
assertTrue(reader.getNumDirtyDocs() > 0);
assertEquals(1, reader.getNumDirtyChunks());
}
@@ -113,7 +113,8 @@
ir.close();
ir = ir2;
CodecReader sr = (CodecReader) getOnlyLeafReader(ir);
- CompressingTermVectorsReader reader = (CompressingTermVectorsReader) sr.getTermVectorsReader();
+ Lucene90CompressingTermVectorsReader reader =
+ (Lucene90CompressingTermVectorsReader) sr.getTermVectorsReader();
// we could get lucky, and have zero, but typically one.
assertTrue(reader.getNumDirtyChunks() <= 1);
ir.close();