LUCENE-9639: Add unit tests for SimpleTextVector format (#2404)
... and fix the implementation so that the tests pass!
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextVectorReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextVectorReader.java
index 92cf8b3..15603c1 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextVectorReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextVectorReader.java
@@ -39,6 +39,7 @@
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.StringHelper;
/**
@@ -63,6 +64,13 @@
readState.segmentInfo.name,
readState.segmentSuffix,
SimpleTextVectorFormat.META_EXTENSION);
+ String vectorFileName =
+ IndexFileNames.segmentFileName(
+ readState.segmentInfo.name,
+ readState.segmentSuffix,
+ SimpleTextVectorFormat.VECTOR_EXTENSION);
+
+ boolean success = false;
try (ChecksumIndexInput in =
readState.directory.openChecksumInput(metaFileName, IOContext.DEFAULT)) {
int fieldNumber = readInt(in, FIELD_NUMBER);
@@ -86,21 +94,23 @@
fieldNumber = readInt(in, FIELD_NUMBER);
}
SimpleTextUtil.checkFooter(in);
- }
- String vectorFileName =
- IndexFileNames.segmentFileName(
- readState.segmentInfo.name,
- readState.segmentSuffix,
- SimpleTextVectorFormat.VECTOR_EXTENSION);
- dataIn = readState.directory.openInput(vectorFileName, IOContext.DEFAULT);
+ dataIn = readState.directory.openInput(vectorFileName, IOContext.DEFAULT);
+ success = true;
+ } finally {
+ if (success == false) {
+ IOUtils.closeWhileHandlingException(this);
+ }
+ }
}
@Override
public VectorValues getVectorValues(String field) throws IOException {
FieldInfo info = readState.fieldInfos.fieldInfo(field);
if (info == null) {
- throw new IllegalStateException("No vectors indexed for field=\"" + field + "\"");
+ // mirror the handling in Lucene90VectorReader#getVectorValues
+ // needed to pass TestSimpleTextVectorFormat#testDeleteAllVectorDocs
+ return null;
}
int dimension = info.getVectorDimension();
if (dimension == 0) {
@@ -108,7 +118,9 @@
}
FieldEntry fieldEntry = fieldEntries.get(field);
if (fieldEntry == null) {
- throw new IllegalStateException("No entry found for vector field=\"" + field + "\"");
+ // mirror the handling in Lucene90VectorReader#getVectorValues
+ // needed to pass TestSimpleTextVectorFormat#testDeleteAllVectorDocs
+ return null;
}
if (dimension != fieldEntry.dimension) {
throw new IllegalStateException(
@@ -133,6 +145,15 @@
// in SimpleTextUtil.CHECKSUM):
long footerStartPos = dataIn.length() - (SimpleTextUtil.CHECKSUM.length + 21);
ChecksumIndexInput input = new BufferedChecksumIndexInput(clone);
+
+ // when no actual vector data has been written (e.g. as tested in
+ // TestSimpleTextVectorFormat#testDeleteAllVectorDocs), the first line
+ // in dataIn is the footer itself: checksum 00000000000000000000
+ if (footerStartPos == 0) {
+ SimpleTextUtil.checkFooter(input);
+ return;
+ }
+
while (true) {
SimpleTextUtil.readLine(input, scratch);
if (input.getFilePointer() >= footerStartPos) {
@@ -244,7 +265,13 @@
public int docID() {
if (curOrd == -1) {
return -1;
+ } else if (curOrd >= entry.size()) {
+ // once a call to advance / nextDoc below has returned NO_MORE_DOCS, calling docID
+ // immediately afterward should also return NO_MORE_DOCS;
+ // this is needed for the TestSimpleTextVectorFormat#testAdvance test case
+ return NO_MORE_DOCS;
}
+
return entry.ordToDoc[curOrd];
}
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextVectorWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextVectorWriter.java
index 4fa55b8..1699537 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextVectorWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextVectorWriter.java
@@ -50,15 +50,24 @@
SimpleTextVectorWriter(SegmentWriteState state) throws IOException {
assert state.fieldInfos.hasVectorValues();
- String metaFileName =
- IndexFileNames.segmentFileName(
- state.segmentInfo.name, state.segmentSuffix, SimpleTextVectorFormat.META_EXTENSION);
- meta = state.directory.createOutput(metaFileName, state.context);
+ boolean success = false;
+ // exception handling to pass TestSimpleTextVectorFormat#testRandomExceptions
+ try {
+ String metaFileName =
+ IndexFileNames.segmentFileName(
+ state.segmentInfo.name, state.segmentSuffix, SimpleTextVectorFormat.META_EXTENSION);
+ meta = state.directory.createOutput(metaFileName, state.context);
- String vectorDataFileName =
- IndexFileNames.segmentFileName(
- state.segmentInfo.name, state.segmentSuffix, SimpleTextVectorFormat.VECTOR_EXTENSION);
- vectorData = state.directory.createOutput(vectorDataFileName, state.context);
+ String vectorDataFileName =
+ IndexFileNames.segmentFileName(
+ state.segmentInfo.name, state.segmentSuffix, SimpleTextVectorFormat.VECTOR_EXTENSION);
+ vectorData = state.directory.createOutput(vectorDataFileName, state.context);
+ success = true;
+ } finally {
+ if (success == false) {
+ IOUtils.closeWhileHandlingException(this);
+ }
+ }
}
@Override
@@ -71,7 +80,9 @@
docIds.add(docV);
}
long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;
- writeMeta(fieldInfo, vectorDataOffset, vectorDataLength, docIds);
+ if (vectorDataLength > 0) {
+ writeMeta(fieldInfo, vectorDataOffset, vectorDataLength, docIds);
+ }
}
private void writeVectorValue(VectorValues vectors) throws IOException {
diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextVectorFormat.java b/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextVectorFormat.java
new file mode 100644
index 0000000..b8b40e0
--- /dev/null
+++ b/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextVectorFormat.java
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.simpletext;
+
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.index.BaseVectorFormatTestCase;
+
+public class TestSimpleTextVectorFormat extends BaseVectorFormatTestCase {
+ @Override
+ protected Codec getCodec() {
+ return new SimpleTextCodec();
+ }
+}