blob: 86b9b9e7634d7d39c0925be928bb9c7a817cfae7 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.VectorField;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.VectorUtil;
/**
* Base class aiming at testing {@link VectorFormat vectors formats}. To test a new format, all you
* need is to register a new {@link Codec} which uses it and extend this class and override {@link
* #getCodec()}.
*
* @lucene.experimental
*/
public abstract class BaseVectorFormatTestCase extends BaseIndexFileFormatTestCase {
@Override
protected void addRandomFields(Document doc) {
doc.add(new VectorField("v2", randomVector(30), VectorValues.SimilarityFunction.NONE));
}
public void testFieldConstructor() {
float[] v = new float[1];
VectorField field = new VectorField("f", v);
assertEquals(1, field.fieldType().vectorDimension());
assertEquals(
VectorValues.SimilarityFunction.EUCLIDEAN, field.fieldType().vectorSimilarityFunction());
assertSame(v, field.vectorValue());
}
public void testFieldConstructorExceptions() {
expectThrows(IllegalArgumentException.class, () -> new VectorField(null, new float[1]));
expectThrows(IllegalArgumentException.class, () -> new VectorField("f", null));
expectThrows(
IllegalArgumentException.class,
() -> new VectorField("f", new float[1], (VectorValues.SimilarityFunction) null));
expectThrows(IllegalArgumentException.class, () -> new VectorField("f", new float[0]));
expectThrows(
IllegalArgumentException.class,
() -> new VectorField("f", new float[VectorValues.MAX_DIMENSIONS + 1]));
expectThrows(
IllegalArgumentException.class,
() -> new VectorField("f", new float[VectorValues.MAX_DIMENSIONS + 1], (FieldType) null));
}
public void testFieldSetValue() {
VectorField field = new VectorField("f", new float[1]);
float[] v1 = new float[1];
field.setVectorValue(v1);
assertSame(v1, field.vectorValue());
expectThrows(IllegalArgumentException.class, () -> field.setVectorValue(new float[2]));
expectThrows(IllegalArgumentException.class, () -> field.setVectorValue(null));
}
public void testFieldCreateFieldType() {
expectThrows(
IllegalArgumentException.class,
() -> VectorField.createHnswType(0, VectorValues.SimilarityFunction.EUCLIDEAN, 16, 16));
expectThrows(
IllegalArgumentException.class,
() ->
VectorField.createHnswType(
VectorValues.MAX_DIMENSIONS + 1,
VectorValues.SimilarityFunction.EUCLIDEAN,
16,
16));
expectThrows(
IllegalArgumentException.class,
() -> VectorField.createHnswType(VectorValues.MAX_DIMENSIONS + 1, null, 16, 16));
expectThrows(
IllegalArgumentException.class,
() ->
VectorField.createHnswType(
VectorValues.MAX_DIMENSIONS + 1, VectorValues.SimilarityFunction.NONE, 16, 16));
}
// Illegal schema change tests:
public void testIllegalDimChangeTwoDocs() throws Exception {
// illegal change in the same segment
try (Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], VectorValues.SimilarityFunction.DOT_PRODUCT));
w.addDocument(doc);
Document doc2 = new Document();
doc2.add(new VectorField("f", new float[3], VectorValues.SimilarityFunction.DOT_PRODUCT));
IllegalArgumentException expected =
expectThrows(IllegalArgumentException.class, () -> w.addDocument(doc2));
String errMsg =
"Inconsistency of field data structures across documents for field [f] of doc [1].";
assertEquals(errMsg, expected.getMessage());
}
// illegal change in a different segment
try (Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], VectorValues.SimilarityFunction.DOT_PRODUCT));
w.addDocument(doc);
w.commit();
Document doc2 = new Document();
doc2.add(new VectorField("f", new float[3], VectorValues.SimilarityFunction.DOT_PRODUCT));
IllegalArgumentException expected =
expectThrows(IllegalArgumentException.class, () -> w.addDocument(doc2));
String errMsg =
"cannot change field \"f\" from vector dimension=4, vector similarity function=DOT_PRODUCT "
+ "to inconsistent vector dimension=3, vector similarity function=DOT_PRODUCT";
assertEquals(errMsg, expected.getMessage());
}
}
public void testIllegalSimilarityFunctionChange() throws Exception {
// illegal change in the same segment
try (Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], VectorValues.SimilarityFunction.DOT_PRODUCT));
w.addDocument(doc);
Document doc2 = new Document();
doc2.add(new VectorField("f", new float[4], VectorValues.SimilarityFunction.EUCLIDEAN));
IllegalArgumentException expected =
expectThrows(IllegalArgumentException.class, () -> w.addDocument(doc2));
String errMsg =
"Inconsistency of field data structures across documents for field [f] of doc [1].";
assertEquals(errMsg, expected.getMessage());
}
// illegal change a different segment
try (Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], VectorValues.SimilarityFunction.DOT_PRODUCT));
w.addDocument(doc);
w.commit();
Document doc2 = new Document();
doc2.add(new VectorField("f", new float[4], VectorValues.SimilarityFunction.EUCLIDEAN));
IllegalArgumentException expected =
expectThrows(IllegalArgumentException.class, () -> w.addDocument(doc2));
String errMsg =
"cannot change field \"f\" from vector dimension=4, vector similarity function=DOT_PRODUCT "
+ "to inconsistent vector dimension=4, vector similarity function=EUCLIDEAN";
assertEquals(errMsg, expected.getMessage());
}
}
public void testIllegalDimChangeTwoWriters() throws Exception {
try (Directory dir = newDirectory()) {
try (IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], VectorValues.SimilarityFunction.DOT_PRODUCT));
w.addDocument(doc);
}
try (IndexWriter w2 = new IndexWriter(dir, newIndexWriterConfig())) {
Document doc2 = new Document();
doc2.add(new VectorField("f", new float[1], VectorValues.SimilarityFunction.DOT_PRODUCT));
IllegalArgumentException expected =
expectThrows(IllegalArgumentException.class, () -> w2.addDocument(doc2));
assertEquals(
"cannot change field \"f\" from vector dimension=4, vector similarity function=DOT_PRODUCT "
+ "to inconsistent vector dimension=1, vector similarity function=DOT_PRODUCT",
expected.getMessage());
}
}
}
public void testIllegalSimilarityFunctionChangeTwoWriters() throws Exception {
try (Directory dir = newDirectory()) {
try (IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], VectorValues.SimilarityFunction.DOT_PRODUCT));
w.addDocument(doc);
}
try (IndexWriter w2 = new IndexWriter(dir, newIndexWriterConfig())) {
Document doc2 = new Document();
doc2.add(new VectorField("f", new float[4], VectorValues.SimilarityFunction.EUCLIDEAN));
IllegalArgumentException expected =
expectThrows(IllegalArgumentException.class, () -> w2.addDocument(doc2));
assertEquals(
"cannot change field \"f\" from vector dimension=4, vector similarity function=DOT_PRODUCT "
+ "to inconsistent vector dimension=4, vector similarity function=EUCLIDEAN",
expected.getMessage());
}
}
}
public void testAddIndexesDirectory0() throws Exception {
String fieldName = "field";
Document doc = new Document();
doc.add(new VectorField(fieldName, new float[4], VectorValues.SimilarityFunction.DOT_PRODUCT));
try (Directory dir = newDirectory();
Directory dir2 = newDirectory()) {
try (IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {
w.addDocument(doc);
}
try (IndexWriter w2 = new IndexWriter(dir2, newIndexWriterConfig())) {
w2.addIndexes(dir);
w2.forceMerge(1);
try (IndexReader reader = w2.getReader()) {
LeafReader r = getOnlyLeafReader(reader);
VectorValues vectorValues = r.getVectorValues(fieldName);
assertEquals(0, vectorValues.nextDoc());
assertEquals(0, vectorValues.vectorValue()[0], 0);
assertEquals(NO_MORE_DOCS, vectorValues.nextDoc());
}
}
}
}
public void testAddIndexesDirectory1() throws Exception {
String fieldName = "field";
Document doc = new Document();
try (Directory dir = newDirectory();
Directory dir2 = newDirectory()) {
try (IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {
w.addDocument(doc);
}
doc.add(
new VectorField(fieldName, new float[4], VectorValues.SimilarityFunction.DOT_PRODUCT));
try (IndexWriter w2 = new IndexWriter(dir2, newIndexWriterConfig())) {
w2.addDocument(doc);
w2.addIndexes(dir);
w2.forceMerge(1);
try (IndexReader reader = w2.getReader()) {
LeafReader r = getOnlyLeafReader(reader);
VectorValues vectorValues = r.getVectorValues(fieldName);
assertNotEquals(NO_MORE_DOCS, vectorValues.nextDoc());
assertEquals(0, vectorValues.vectorValue()[0], 0);
assertEquals(NO_MORE_DOCS, vectorValues.nextDoc());
}
}
}
}
public void testAddIndexesDirectory01() throws Exception {
String fieldName = "field";
float[] vector = new float[1];
Document doc = new Document();
doc.add(new VectorField(fieldName, vector, VectorValues.SimilarityFunction.DOT_PRODUCT));
try (Directory dir = newDirectory();
Directory dir2 = newDirectory()) {
try (IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {
w.addDocument(doc);
}
try (IndexWriter w2 = new IndexWriter(dir2, newIndexWriterConfig())) {
vector[0] = 1;
w2.addDocument(doc);
w2.addIndexes(dir);
w2.forceMerge(1);
try (IndexReader reader = w2.getReader()) {
LeafReader r = getOnlyLeafReader(reader);
VectorValues vectorValues = r.getVectorValues(fieldName);
assertEquals(0, vectorValues.nextDoc());
// The merge order is randomized, we might get 0 first, or 1
float value = vectorValues.vectorValue()[0];
assertTrue(value == 0 || value == 1);
assertEquals(1, vectorValues.nextDoc());
value += vectorValues.vectorValue()[0];
assertEquals(1, value, 0);
}
}
}
}
public void testIllegalDimChangeViaAddIndexesDirectory() throws Exception {
try (Directory dir = newDirectory();
Directory dir2 = newDirectory()) {
try (IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], VectorValues.SimilarityFunction.DOT_PRODUCT));
w.addDocument(doc);
}
try (IndexWriter w2 = new IndexWriter(dir2, newIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[5], VectorValues.SimilarityFunction.DOT_PRODUCT));
w2.addDocument(doc);
IllegalArgumentException expected =
expectThrows(
IllegalArgumentException.class, () -> w2.addIndexes(new Directory[] {dir}));
assertEquals(
"cannot change field \"f\" from vector dimension=5, vector similarity function=DOT_PRODUCT "
+ "to inconsistent vector dimension=4, vector similarity function=DOT_PRODUCT",
expected.getMessage());
}
}
}
public void testIllegalSimilarityFunctionChangeViaAddIndexesDirectory() throws Exception {
try (Directory dir = newDirectory();
Directory dir2 = newDirectory()) {
try (IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], VectorValues.SimilarityFunction.DOT_PRODUCT));
w.addDocument(doc);
}
try (IndexWriter w2 = new IndexWriter(dir2, newIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], VectorValues.SimilarityFunction.EUCLIDEAN));
w2.addDocument(doc);
IllegalArgumentException expected =
expectThrows(IllegalArgumentException.class, () -> w2.addIndexes(dir));
assertEquals(
"cannot change field \"f\" from vector dimension=4, vector similarity function=EUCLIDEAN "
+ "to inconsistent vector dimension=4, vector similarity function=DOT_PRODUCT",
expected.getMessage());
}
}
}
public void testIllegalDimChangeViaAddIndexesCodecReader() throws Exception {
try (Directory dir = newDirectory();
Directory dir2 = newDirectory()) {
try (IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], VectorValues.SimilarityFunction.DOT_PRODUCT));
w.addDocument(doc);
}
try (IndexWriter w2 = new IndexWriter(dir2, newIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[5], VectorValues.SimilarityFunction.DOT_PRODUCT));
w2.addDocument(doc);
try (DirectoryReader r = DirectoryReader.open(dir)) {
IllegalArgumentException expected =
expectThrows(
IllegalArgumentException.class,
() -> w2.addIndexes(new CodecReader[] {(CodecReader) getOnlyLeafReader(r)}));
assertEquals(
"cannot change field \"f\" from vector dimension=5, vector similarity function=DOT_PRODUCT "
+ "to inconsistent vector dimension=4, vector similarity function=DOT_PRODUCT",
expected.getMessage());
}
}
}
}
public void testIllegalSimilarityFunctionChangeViaAddIndexesCodecReader() throws Exception {
try (Directory dir = newDirectory();
Directory dir2 = newDirectory()) {
try (IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], VectorValues.SimilarityFunction.DOT_PRODUCT));
w.addDocument(doc);
}
try (IndexWriter w2 = new IndexWriter(dir2, newIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], VectorValues.SimilarityFunction.EUCLIDEAN));
w2.addDocument(doc);
try (DirectoryReader r = DirectoryReader.open(dir)) {
IllegalArgumentException expected =
expectThrows(
IllegalArgumentException.class,
() -> w2.addIndexes(new CodecReader[] {(CodecReader) getOnlyLeafReader(r)}));
assertEquals(
"cannot change field \"f\" from vector dimension=4, vector similarity function=EUCLIDEAN "
+ "to inconsistent vector dimension=4, vector similarity function=DOT_PRODUCT",
expected.getMessage());
}
}
}
}
public void testIllegalDimChangeViaAddIndexesSlowCodecReader() throws Exception {
try (Directory dir = newDirectory();
Directory dir2 = newDirectory()) {
try (IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], VectorValues.SimilarityFunction.DOT_PRODUCT));
w.addDocument(doc);
}
try (IndexWriter w2 = new IndexWriter(dir2, newIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[5], VectorValues.SimilarityFunction.DOT_PRODUCT));
w2.addDocument(doc);
try (DirectoryReader r = DirectoryReader.open(dir)) {
IllegalArgumentException expected =
expectThrows(IllegalArgumentException.class, () -> TestUtil.addIndexesSlowly(w2, r));
assertEquals(
"cannot change field \"f\" from vector dimension=5, vector similarity function=DOT_PRODUCT "
+ "to inconsistent vector dimension=4, vector similarity function=DOT_PRODUCT",
expected.getMessage());
}
}
}
}
public void testIllegalSimilarityFunctionChangeViaAddIndexesSlowCodecReader() throws Exception {
try (Directory dir = newDirectory();
Directory dir2 = newDirectory()) {
try (IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], VectorValues.SimilarityFunction.DOT_PRODUCT));
w.addDocument(doc);
}
try (IndexWriter w2 = new IndexWriter(dir2, newIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], VectorValues.SimilarityFunction.EUCLIDEAN));
w2.addDocument(doc);
try (DirectoryReader r = DirectoryReader.open(dir)) {
IllegalArgumentException expected =
expectThrows(IllegalArgumentException.class, () -> TestUtil.addIndexesSlowly(w2, r));
assertEquals(
"cannot change field \"f\" from vector dimension=4, vector similarity function=EUCLIDEAN "
+ "to inconsistent vector dimension=4, vector similarity function=DOT_PRODUCT",
expected.getMessage());
}
}
}
}
public void testIllegalMultipleValues() throws Exception {
try (Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], VectorValues.SimilarityFunction.DOT_PRODUCT));
doc.add(new VectorField("f", new float[4], VectorValues.SimilarityFunction.DOT_PRODUCT));
IllegalArgumentException expected =
expectThrows(IllegalArgumentException.class, () -> w.addDocument(doc));
assertEquals(
"VectorValuesField \"f\" appears more than once in this document (only one value is allowed per field)",
expected.getMessage());
}
}
public void testIllegalDimensionTooLarge() throws Exception {
try (Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {
Document doc = new Document();
expectThrows(
IllegalArgumentException.class,
() ->
doc.add(
new VectorField(
"f",
new float[VectorValues.MAX_DIMENSIONS + 1],
VectorValues.SimilarityFunction.DOT_PRODUCT)));
Document doc2 = new Document();
doc2.add(new VectorField("f", new float[1], VectorValues.SimilarityFunction.EUCLIDEAN));
w.addDocument(doc2);
}
}
public void testIllegalEmptyVector() throws Exception {
try (Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {
Document doc = new Document();
Exception e =
expectThrows(
IllegalArgumentException.class,
() ->
doc.add(
new VectorField("f", new float[0], VectorValues.SimilarityFunction.NONE)));
assertEquals("cannot index an empty vector", e.getMessage());
Document doc2 = new Document();
doc2.add(new VectorField("f", new float[1], VectorValues.SimilarityFunction.NONE));
w.addDocument(doc2);
}
}
// Write vectors, one segment with default codec, another with SimpleText, then forceMerge
public void testDifferentCodecs1() throws Exception {
try (Directory dir = newDirectory()) {
try (IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], VectorValues.SimilarityFunction.DOT_PRODUCT));
w.addDocument(doc);
}
IndexWriterConfig iwc = newIndexWriterConfig();
iwc.setCodec(Codec.forName("SimpleText"));
try (IndexWriter w = new IndexWriter(dir, iwc)) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], VectorValues.SimilarityFunction.DOT_PRODUCT));
w.addDocument(doc);
w.forceMerge(1);
}
}
}
// Write vectors, one segment with with SimpleText, another with default codec, then forceMerge
public void testDifferentCodecs2() throws Exception {
IndexWriterConfig iwc = newIndexWriterConfig();
iwc.setCodec(Codec.forName("SimpleText"));
try (Directory dir = newDirectory()) {
try (IndexWriter w = new IndexWriter(dir, iwc)) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], VectorValues.SimilarityFunction.DOT_PRODUCT));
w.addDocument(doc);
}
try (IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("f", new float[4], VectorValues.SimilarityFunction.DOT_PRODUCT));
w.addDocument(doc);
w.forceMerge(1);
}
}
}
public void testInvalidVectorFieldUsage() {
VectorField field =
new VectorField("field", new float[2], VectorValues.SimilarityFunction.NONE);
expectThrows(IllegalArgumentException.class, () -> field.setIntValue(14));
expectThrows(IllegalArgumentException.class, () -> field.setVectorValue(new float[1]));
assertNull(field.numericValue());
}
public void testDeleteAllVectorDocs() throws Exception {
try (Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {
Document doc = new Document();
doc.add(new StringField("id", "0", Field.Store.NO));
doc.add(
new VectorField("v", new float[] {2, 3, 5}, VectorValues.SimilarityFunction.DOT_PRODUCT));
w.addDocument(doc);
w.addDocument(new Document());
w.commit();
try (DirectoryReader r = w.getReader()) {
assertNotNull(getOnlyLeafReader(r).getVectorValues("v"));
}
w.deleteDocuments(new Term("id", "0"));
w.forceMerge(1);
try (DirectoryReader r = w.getReader()) {
assertNull(getOnlyLeafReader(r).getVectorValues("v"));
}
}
}
public void testVectorFieldMissingFromOneSegment() throws Exception {
try (Directory dir = FSDirectory.open(createTempDir());
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {
Document doc = new Document();
doc.add(new StringField("id", "0", Field.Store.NO));
doc.add(
new VectorField(
"v0", new float[] {2, 3, 5}, VectorValues.SimilarityFunction.DOT_PRODUCT));
w.addDocument(doc);
w.commit();
doc = new Document();
doc.add(
new VectorField(
"v1", new float[] {2, 3, 5}, VectorValues.SimilarityFunction.DOT_PRODUCT));
w.addDocument(doc);
w.forceMerge(1);
}
}
public void testSparseVectors() throws Exception {
int numDocs = atLeast(1000);
int numFields = TestUtil.nextInt(random(), 1, 10);
int[] fieldDocCounts = new int[numFields];
float[] fieldTotals = new float[numFields];
int[] fieldDims = new int[numFields];
VectorValues.SimilarityFunction[] fieldSearchStrategies =
new VectorValues.SimilarityFunction[numFields];
for (int i = 0; i < numFields; i++) {
fieldDims[i] = random().nextInt(20) + 1;
fieldSearchStrategies[i] =
VectorValues.SimilarityFunction.values()[
random().nextInt(VectorValues.SimilarityFunction.values().length)];
}
try (Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig())) {
for (int i = 0; i < numDocs; i++) {
Document doc = new Document();
for (int field = 0; field < numFields; field++) {
String fieldName = "int" + field;
if (random().nextInt(100) == 17) {
float[] v = randomVector(fieldDims[field]);
doc.add(new VectorField(fieldName, v, fieldSearchStrategies[field]));
fieldDocCounts[field]++;
fieldTotals[field] += v[0];
}
}
w.addDocument(doc);
}
try (IndexReader r = w.getReader()) {
for (int field = 0; field < numFields; field++) {
int docCount = 0;
float checksum = 0;
String fieldName = "int" + field;
for (LeafReaderContext ctx : r.leaves()) {
VectorValues vectors = ctx.reader().getVectorValues(fieldName);
if (vectors != null) {
docCount += vectors.size();
while (vectors.nextDoc() != NO_MORE_DOCS) {
checksum += vectors.vectorValue()[0];
}
}
}
assertEquals(fieldDocCounts[field], docCount);
assertEquals(fieldTotals[field], checksum, 1e-5);
}
}
}
}
public void testIndexedValueNotAliased() throws Exception {
// We copy indexed values (as for BinaryDocValues) so the input float[] can be reused across
// calls to IndexWriter.addDocument.
String fieldName = "field";
float[] v = {0};
try (Directory dir = newDirectory();
IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig())) {
Document doc1 = new Document();
doc1.add(new VectorField(fieldName, v, VectorValues.SimilarityFunction.EUCLIDEAN));
v[0] = 1;
Document doc2 = new Document();
doc2.add(new VectorField(fieldName, v, VectorValues.SimilarityFunction.EUCLIDEAN));
iw.addDocument(doc1);
iw.addDocument(doc2);
v[0] = 2;
Document doc3 = new Document();
doc3.add(new VectorField(fieldName, v, VectorValues.SimilarityFunction.EUCLIDEAN));
iw.addDocument(doc3);
iw.forceMerge(1);
try (IndexReader reader = iw.getReader()) {
LeafReader r = getOnlyLeafReader(reader);
VectorValues vectorValues = r.getVectorValues(fieldName);
vectorValues.nextDoc();
assertEquals(1, vectorValues.vectorValue()[0], 0);
vectorValues.nextDoc();
assertEquals(1, vectorValues.vectorValue()[0], 0);
vectorValues.nextDoc();
assertEquals(2, vectorValues.vectorValue()[0], 0);
}
}
}
public void testSortedIndex() throws Exception {
IndexWriterConfig iwc = newIndexWriterConfig();
iwc.setIndexSort(new Sort(new SortField("sortkey", SortField.Type.INT)));
String fieldName = "field";
try (Directory dir = newDirectory();
IndexWriter iw = new IndexWriter(dir, iwc)) {
add(iw, fieldName, 1, 1, new float[] {-1, 0});
add(iw, fieldName, 4, 4, new float[] {0, 1});
add(iw, fieldName, 3, 3, null);
add(iw, fieldName, 2, 2, new float[] {1, 0});
iw.forceMerge(1);
try (IndexReader reader = iw.getReader()) {
LeafReader leaf = getOnlyLeafReader(reader);
VectorValues vectorValues = leaf.getVectorValues(fieldName);
assertEquals(2, vectorValues.dimension());
assertEquals(3, vectorValues.size());
assertEquals("1", leaf.document(vectorValues.nextDoc()).get("id"));
assertEquals(-1f, vectorValues.vectorValue()[0], 0);
assertEquals("2", leaf.document(vectorValues.nextDoc()).get("id"));
assertEquals(1, vectorValues.vectorValue()[0], 0);
assertEquals("4", leaf.document(vectorValues.nextDoc()).get("id"));
assertEquals(0, vectorValues.vectorValue()[0], 0);
assertEquals(NO_MORE_DOCS, vectorValues.nextDoc());
RandomAccessVectorValues ra =
((RandomAccessVectorValuesProducer) vectorValues).randomAccess();
assertEquals(-1f, ra.vectorValue(0)[0], 0);
assertEquals(1f, ra.vectorValue(1)[0], 0);
assertEquals(0f, ra.vectorValue(2)[0], 0);
}
}
}
public void testIndexMultipleVectorFields() throws Exception {
try (Directory dir = newDirectory();
IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig())) {
Document doc = new Document();
float[] v = new float[] {1};
doc.add(new VectorField("field1", v, VectorValues.SimilarityFunction.EUCLIDEAN));
doc.add(
new VectorField("field2", new float[] {1, 2, 3}, VectorValues.SimilarityFunction.NONE));
iw.addDocument(doc);
v[0] = 2;
iw.addDocument(doc);
doc = new Document();
doc.add(
new VectorField(
"field3", new float[] {1, 2, 3}, VectorValues.SimilarityFunction.DOT_PRODUCT));
iw.addDocument(doc);
iw.forceMerge(1);
try (IndexReader reader = iw.getReader()) {
LeafReader leaf = reader.leaves().get(0).reader();
VectorValues vectorValues = leaf.getVectorValues("field1");
assertEquals(1, vectorValues.dimension());
assertEquals(2, vectorValues.size());
vectorValues.nextDoc();
assertEquals(1f, vectorValues.vectorValue()[0], 0);
vectorValues.nextDoc();
assertEquals(2f, vectorValues.vectorValue()[0], 0);
assertEquals(NO_MORE_DOCS, vectorValues.nextDoc());
VectorValues vectorValues2 = leaf.getVectorValues("field2");
assertEquals(3, vectorValues2.dimension());
assertEquals(2, vectorValues2.size());
vectorValues2.nextDoc();
assertEquals(2f, vectorValues2.vectorValue()[1], 0);
vectorValues2.nextDoc();
assertEquals(2f, vectorValues2.vectorValue()[1], 0);
assertEquals(NO_MORE_DOCS, vectorValues2.nextDoc());
VectorValues vectorValues3 = leaf.getVectorValues("field3");
assertEquals(3, vectorValues3.dimension());
assertEquals(1, vectorValues3.size());
vectorValues3.nextDoc();
assertEquals(1f, vectorValues3.vectorValue()[0], 0);
assertEquals(NO_MORE_DOCS, vectorValues3.nextDoc());
}
}
}
/**
* Index random vectors, sometimes skipping documents, sometimes deleting a document, sometimes
* merging, sometimes sorting the index, and verify that the expected values can be read back
* consistently.
*/
public void testRandom() throws Exception {
IndexWriterConfig iwc = newIndexWriterConfig();
if (random().nextBoolean()) {
iwc.setIndexSort(new Sort(new SortField("sortkey", SortField.Type.INT)));
}
String fieldName = "field";
try (Directory dir = newDirectory();
IndexWriter iw = new IndexWriter(dir, iwc)) {
int numDoc = atLeast(100);
int dimension = atLeast(10);
float[] scratch = new float[dimension];
int numValues = 0;
float[][] values = new float[numDoc][];
for (int i = 0; i < numDoc; i++) {
if (random().nextInt(7) != 3) {
// usually index a vector value for a doc
values[i] = randomVector(dimension);
++numValues;
}
if (random().nextBoolean() && values[i] != null) {
// sometimes use a shared scratch array
System.arraycopy(values[i], 0, scratch, 0, scratch.length);
add(iw, fieldName, i, scratch, VectorValues.SimilarityFunction.NONE);
} else {
add(iw, fieldName, i, values[i], VectorValues.SimilarityFunction.NONE);
}
if (random().nextInt(10) == 2) {
// sometimes delete a random document
int idToDelete = random().nextInt(i + 1);
iw.deleteDocuments(new Term("id", Integer.toString(idToDelete)));
// and remember that it was deleted
if (values[idToDelete] != null) {
values[idToDelete] = null;
--numValues;
}
}
if (random().nextInt(10) == 3) {
iw.commit();
}
}
int numDeletes = 0;
try (IndexReader reader = iw.getReader()) {
int valueCount = 0, totalSize = 0;
for (LeafReaderContext ctx : reader.leaves()) {
VectorValues vectorValues = ctx.reader().getVectorValues(fieldName);
if (vectorValues == null) {
continue;
}
totalSize += vectorValues.size();
int docId;
while ((docId = vectorValues.nextDoc()) != NO_MORE_DOCS) {
float[] v = vectorValues.vectorValue();
assertEquals(dimension, v.length);
String idString = ctx.reader().document(docId).getField("id").stringValue();
int id = Integer.parseInt(idString);
if (ctx.reader().getLiveDocs() == null || ctx.reader().getLiveDocs().get(docId)) {
assertArrayEquals(idString, values[id], v, 0);
++valueCount;
} else {
++numDeletes;
assertNull(values[id]);
}
}
}
assertEquals(numValues, valueCount);
assertEquals(numValues, totalSize - numDeletes);
}
}
}
/**
* Index random vectors, sometimes skipping documents, sometimes updating a document, sometimes
* merging, sometimes sorting the index, using an HNSW similarity function so as to also produce a
* graph, and verify that the expected values can be read back consistently.
*/
public void testRandomWithUpdatesAndGraph() throws Exception {
IndexWriterConfig iwc = newIndexWriterConfig();
String fieldName = "field";
try (Directory dir = newDirectory();
IndexWriter iw = new IndexWriter(dir, iwc)) {
int numDoc = atLeast(100);
int dimension = atLeast(10);
float[][] id2value = new float[numDoc][];
int[] id2ord = new int[numDoc];
for (int i = 0; i < numDoc; i++) {
int id = random().nextInt(numDoc);
float[] value;
if (random().nextInt(7) != 3) {
// usually index a vector value for a doc
value = randomVector(dimension);
} else {
value = null;
}
id2value[id] = value;
id2ord[id] = i;
add(iw, fieldName, id, value, VectorValues.SimilarityFunction.EUCLIDEAN);
}
try (IndexReader reader = iw.getReader()) {
for (LeafReaderContext ctx : reader.leaves()) {
Bits liveDocs = ctx.reader().getLiveDocs();
VectorValues vectorValues = ctx.reader().getVectorValues(fieldName);
if (vectorValues == null) {
continue;
}
int docId;
while ((docId = vectorValues.nextDoc()) != NO_MORE_DOCS) {
float[] v = vectorValues.vectorValue();
assertEquals(dimension, v.length);
String idString = ctx.reader().document(docId).getField("id").stringValue();
int id = Integer.parseInt(idString);
if (liveDocs == null || liveDocs.get(docId)) {
assertArrayEquals(
"values differ for id=" + idString + ", docid=" + docId + " leaf=" + ctx.ord,
id2value[id],
v,
0);
} else {
if (id2value[id] != null) {
assertFalse(Arrays.equals(id2value[id], v));
}
}
}
}
}
}
}
private void add(
IndexWriter iw,
String field,
int id,
float[] vector,
VectorValues.SimilarityFunction similarityFunction)
throws IOException {
add(iw, field, id, random().nextInt(100), vector, similarityFunction);
}
private void add(IndexWriter iw, String field, int id, int sortkey, float[] vector)
throws IOException {
add(iw, field, id, sortkey, vector, VectorValues.SimilarityFunction.NONE);
}
private void add(
IndexWriter iw,
String field,
int id,
int sortkey,
float[] vector,
VectorValues.SimilarityFunction similarityFunction)
throws IOException {
Document doc = new Document();
if (vector != null) {
doc.add(new VectorField(field, vector, similarityFunction));
}
doc.add(new NumericDocValuesField("sortkey", sortkey));
String idString = Integer.toString(id);
doc.add(new StringField("id", idString, Field.Store.YES));
Term idTerm = new Term("id", idString);
iw.updateDocument(idTerm, doc);
}
private float[] randomVector(int dim) {
float[] v = new float[dim];
for (int i = 0; i < dim; i++) {
v[i] = random().nextFloat();
}
VectorUtil.l2normalize(v);
return v;
}
public void testCheckIndexIncludesVectors() throws Exception {
try (Directory dir = newDirectory()) {
try (IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {
Document doc = new Document();
doc.add(new VectorField("v1", randomVector(3), VectorValues.SimilarityFunction.NONE));
w.addDocument(doc);
doc.add(new VectorField("v2", randomVector(3), VectorValues.SimilarityFunction.NONE));
w.addDocument(doc);
}
ByteArrayOutputStream output = new ByteArrayOutputStream();
CheckIndex.Status status = TestUtil.checkIndex(dir, false, true, output);
assertEquals(1, status.segmentInfos.size());
CheckIndex.Status.SegmentInfoStatus segStatus = status.segmentInfos.get(0);
// total 3 vector values were indexed:
assertEquals(3, segStatus.vectorValuesStatus.totalVectorValues);
// ... across 2 fields:
assertEquals(2, segStatus.vectorValuesStatus.totalVectorFields);
// Make sure CheckIndex in fact declares that it is testing vectors!
assertTrue(output.toString(IOUtils.UTF_8).contains("test: vectors..."));
}
}
public void testSimilarityFunctionIdentifiers() {
// make sure we don't accidentally mess up similarity function identifiers by re-ordering their
// enumerators
assertEquals(0, VectorValues.SimilarityFunction.NONE.ordinal());
assertEquals(1, VectorValues.SimilarityFunction.EUCLIDEAN.ordinal());
assertEquals(2, VectorValues.SimilarityFunction.DOT_PRODUCT.ordinal());
assertEquals(3, VectorValues.SimilarityFunction.values().length);
}
public void testAdvance() throws Exception {
try (Directory dir = newDirectory()) {
try (IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {
int numdocs = atLeast(1500);
String fieldName = "field";
for (int i = 0; i < numdocs; i++) {
Document doc = new Document();
// randomly add a vector field
if (random().nextInt(4) == 3) {
doc.add(new VectorField(fieldName, new float[4], VectorValues.SimilarityFunction.NONE));
}
w.addDocument(doc);
}
w.forceMerge(1);
try (IndexReader reader = w.getReader()) {
LeafReader r = getOnlyLeafReader(reader);
VectorValues vectorValues = r.getVectorValues(fieldName);
int[] vectorDocs = new int[vectorValues.size() + 1];
int cur = -1;
while (++cur < vectorValues.size() + 1) {
vectorDocs[cur] = vectorValues.nextDoc();
if (cur != 0) {
assertTrue(vectorDocs[cur] > vectorDocs[cur - 1]);
}
}
vectorValues = r.getVectorValues(fieldName);
cur = -1;
for (int i = 0; i < numdocs; i++) {
// randomly advance to i
if (random().nextInt(4) == 3) {
while (vectorDocs[++cur] < i)
;
assertEquals(vectorDocs[cur], vectorValues.advance(i));
assertEquals(vectorDocs[cur], vectorValues.docID());
if (vectorValues.docID() == NO_MORE_DOCS) {
break;
}
// make i equal to docid so that it is greater than docId in the next loop iteration
i = vectorValues.docID();
}
}
}
}
}
}
}