// blob: 1cdab9e03b922d1cf28fcb1784b1c97215bb6459 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.IOException;
import java.nio.BufferUnderflowException;
import java.nio.FloatBuffer;
import java.util.Locale;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.VectorDocValuesField;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Ignore;
/** Tests VectorDocValues */
public class TestVectorDocValues extends LuceneTestCase {
/**
* Basic test of creating, indexing, and retrieving instances of vector doc values.
*/
public void testVectorField() throws Exception {
  try (Directory directory = newDirectory();
      IndexWriter writer = new IndexWriter(directory, newIndexWriterConfig(null))) {
    // A single document carrying a three-dimensional vector in field "foo".
    add(writer, 1, 2, 3);

    try (DirectoryReader reader = DirectoryReader.open(writer)) {
      LeafReader leaf = getOnlyLeafReader(reader);

      // The field can be fetched both as binary doc values and as vector doc values.
      assertNotNull(DocValues.getBinary(leaf, "foo"));
      assertNotNull(VectorDocValues.get(leaf, "foo"));

      // The numeric, sorted, and sorted-set accessors must refuse the field.
      expectThrows(IllegalStateException.class, () -> DocValues.getNumeric(leaf, "foo"));
      expectThrows(IllegalStateException.class, () -> DocValues.getSorted(leaf, "foo"));
      expectThrows(IllegalStateException.class, () -> DocValues.getSortedSet(leaf, "foo"));
    }
  }
}
public void testDimensions() throws Exception {
  try (Directory directory = newDirectory();
      IndexWriter writer = new IndexWriter(directory, newIndexWriterConfig(null))) {
    // Two well-formed three-dimensional vectors separated by a document without the field,
    // followed by a vector that is too short and one that is too long.
    add(writer, 1, 2, 3);
    writer.addDocument(new Document());
    add(writer, -1, 0, 1);
    add(writer, 0);
    add(writer, 0, 0, 0, 0);

    final float[] scratch = new float[3];
    try (DirectoryReader reader = DirectoryReader.open(writer)) {
      LeafReader leaf = getOnlyLeafReader(reader);
      VectorDocValues vectors = VectorDocValues.get(leaf, "foo");

      // First vector lives on doc 0.
      assertEquals(0, vectors.nextDoc());
      assertEquals(3, vectors.dimension());
      vectors.vector(scratch);
      float[] expectedFirst = {1, 2, 3};
      assertArrayEquals(expectedFirst, scratch, 0);

      // Doc 1 had no value, so iteration jumps straight to doc 2.
      assertEquals(2, vectors.nextDoc());
      assertEquals(3, vectors.dimension());
      vectors.vector(scratch);
      float[] expectedSecond = {-1, 0, 1};
      assertArrayEquals(expectedSecond, scratch, 0);

      // The next value holds a single float: filling a three-element array underflows.
      vectors.nextDoc();
      expectThrows(BufferUnderflowException.class, () -> vectors.vector(scratch));

      // The last value holds four floats. We ignore extra dimensions, but we should not.
      vectors.nextDoc();
      vectors.vector(scratch);
      float[] expectedTruncated = {0, 0, 0};
      assertArrayEquals(expectedTruncated, scratch, 0);
    }
  }
}
/*
private void assertArrayEquals(float[] expected, float[] actual, float delta) {
assertEquals("lengths differ", expected.length, actual.length);
for (int i = 0; i < expected.length; i++) {
assertEquals("mismatch at index " + i, actual[i], expected[i], delta);
}
}*/
public void testAdvance() throws Exception {
  // advanceExact() and advance() are overridden to decode values and set the next value state;
  // this test makes sure positioning through either of them yields the right vector.
  try (Directory directory = newDirectory();
      IndexWriter writer = new IndexWriter(directory, newIndexWriterConfig(null))) {
    // Vectors on docs 0 and 3 only; docs 1, 2 and 4 are empty documents.
    add(writer, 1, 2, 3);
    writer.addDocument(new Document());
    writer.addDocument(new Document());
    add(writer, -1, 0, 1);
    writer.addDocument(new Document());

    final float[] scratch = new float[3];
    try (DirectoryReader reader = DirectoryReader.open(writer)) {
      LeafReader leaf = getOnlyLeafReader(reader);
      VectorDocValues vectors = VectorDocValues.get(leaf, "foo");

      // In the initial state we do no checking:
      // expectThrows(ArrayIndexOutOfBoundsException.class, () -> vectors.vector(scratch));

      // Not positioned on any value yet, but the dimension is already known.
      assertEquals(3, vectors.dimension());

      // After a successful advanceExact.
      boolean hasValue = vectors.advanceExact(0);
      assertTrue(hasValue);
      vectors.vector(scratch);
      assertArrayEquals(new float[] {1, 2, 3}, scratch, 0);

      // After an unsuccessful advanceExact we do not check:
      // assertFalse(vectors.advanceExact(1));
      // expectThrows(AssertionError.class, () -> vectors.vector(scratch));

      // After a successful advance: the target (doc 2) has no value, so we land on doc 3.
      int landedOn = vectors.advance(2);
      assertEquals(3, landedOn);
      vectors.vector(scratch);
      assertArrayEquals(new float[] {-1, 0, 1}, scratch, 0);

      // After an unsuccessful advance (no more docs):
      // assertEquals(DocValuesIterator.NO_MORE_DOCS, vectors.advance(4));
      // expectThrows(ArrayIndexOutOfBoundsException.class, () -> vectors.vector(scratch));
    }
  }
}
public void testZeroLengthVector() throws Exception {
  // A vector with no dimensions is not a legal field value: construction must be rejected.
  final float[] noDimensions = new float[0];
  expectThrows(
      IllegalArgumentException.class, () -> new VectorDocValuesField("foo", noDimensions));
}
// Indexes one document whose only field is a "foo" vector doc value holding the given floats.
private void add(IndexWriter iw, float... values) throws IOException {
  VectorDocValuesField vectorField = new VectorDocValuesField("foo", values);
  Document document = new Document();
  document.add(vectorField);
  iw.addDocument(document);
}
// TODO: test performance of writing/reading and compare with an implementation that uses memory-mapped I/O to directly map an array of floats
// TODO: implement vector-based matching using HNSW
/*
testPerf iterate values using nextValue()
[junit4] 1> 100000 docs, dim=10; write time 219ms, read time 51ms
[junit4] 1> 100000 docs, dim=10; write time 130ms, read time 56ms
[junit4] 1> 100000 docs, dim=10; write time 139ms, read time 40ms
[junit4] 1> 100000 docs, dim=100; write time 1016ms, read time 44ms
[junit4] 1> 100000 docs, dim=100; write time 982ms, read time 130ms
[junit4] 1> 100000 docs, dim=100; write time 994ms, read time 46ms
[junit4] 1> 100000 docs, dim=1000; write time 8449ms, read time 254ms
[junit4] 1> 100000 docs, dim=1000; write time 8796ms, read time 238ms
[junit4] 1> 100000 docs, dim=1000; write time 8923ms, read time 230ms
testPerf direct access to vector()
[junit4] 1> 100000 docs, dim=10; write time 142ms, read time 48ms
[junit4] 1> 100000 docs, dim=10; write time 158ms, read time 44ms
[junit4] 1> 100000 docs, dim=10; write time 133ms, read time 40ms
[junit4] 1> 100000 docs, dim=100; write time 909ms, read time 37ms
[junit4] 1> 100000 docs, dim=100; write time 893ms, read time 37ms
[junit4] 1> 100000 docs, dim=100; write time 898ms, read time 39ms
[junit4] 1> 100000 docs, dim=1000; write time 8696ms, read time 154ms
[junit4] 1> 100000 docs, dim=1000; write time 8689ms, read time 153ms
[junit4] 1> 100000 docs, dim=1000; write time 8641ms, read time 149ms
testPerf provide vector() using direct access to IndexInput and its ByteBuffer
to avoid copying bytes, still copying floats into array
[junit4] 1> 100000 docs, dim=10; write time 177ms, read time 45ms
[junit4] 1> 100000 docs, dim=10; write time 151ms, read time 36ms
[junit4] 1> 100000 docs, dim=10; write time 158ms, read time 29ms
[junit4] 1> 100000 docs, dim=100; write time 917ms, read time 41ms
[junit4] 1> 100000 docs, dim=100; write time 943ms, read time 39ms
[junit4] 1> 100000 docs, dim=100; write time 927ms, read time 39ms
[junit4] 1> 100000 docs, dim=1000; write time 9182ms, read time 131ms
[junit4] 1> 100000 docs, dim=1000; write time 8855ms, read time 141ms
[junit4] 1> 100000 docs, dim=1000; write time 8847ms, read time 128ms
testPerf provide vectorBuffer() using direct access to IndexInput and its ByteBuffer
repeated calls to VectorBuffer.get(int) ...
[junit4] 1> 100000 docs, dim=10; write time 168ms, read time 53ms
[junit4] 1> 100000 docs, dim=10; write time 177ms, read time 32ms
[junit4] 1> 100000 docs, dim=10; write time 158ms, read time 38ms
[junit4] 1> 100000 docs, dim=100; write time 947ms, read time 48ms
[junit4] 1> 100000 docs, dim=100; write time 918ms, read time 44ms
[junit4] 1> 100000 docs, dim=100; write time 915ms, read time 42ms
[junit4] 1> 100000 docs, dim=1000; write time 8894ms, read time 202ms
[junit4] 1> 100000 docs, dim=1000; write time 8792ms, read time 199ms
[junit4] 1> 100000 docs, dim=1000; write time 8941ms, read time 200ms
testPerf SlicedVectorValues (plumbing through IndexInput, iterating and converting from int bits to float)
[junit4] 1> 100000 docs, dim=10; write time 165ms, read time 44ms
[junit4] 1> 100000 docs, dim=10; write time 138ms, read time 42ms
[junit4] 1> 100000 docs, dim=10; write time 132ms, read time 37ms
[junit4] 1> 100000 docs, dim=100; write time 931ms, read time 59ms
[junit4] 1> 100000 docs, dim=100; write time 940ms, read time 61ms
[junit4] 1> 100000 docs, dim=100; write time 926ms, read time 60ms
[junit4] 1> 100000 docs, dim=1000; write time 8977ms, read time 624ms
[junit4] 1> 100000 docs, dim=1000; write time 9003ms, read time 623ms
[junit4] 1> 100000 docs, dim=1000; write time 8958ms, read time 624ms
testRawPerf (Just an in-memory float array, not Lucene; as a best-case)
[junit4] 1> 100000 docs, dim=10; write time 39ms, read time 0ms
[junit4] 1> 100000 docs, dim=10; write time 41ms, read time 0ms
[junit4] 1> 100000 docs, dim=10; write time 39ms, read time 0ms
[junit4] 1> 100000 docs, dim=100; write time 392ms, read time 7ms
[junit4] 1> 100000 docs, dim=100; write time 392ms, read time 7ms
[junit4] 1> 100000 docs, dim=100; write time 392ms, read time 6ms
[junit4] 1> 100000 docs, dim=1000; write time 3767ms, read time 70ms
[junit4] 1> 100000 docs, dim=1000; write time 3770ms, read time 70ms
[junit4] 1> 100000 docs, dim=1000; write time 3769ms, read time 70ms
*/
@Ignore
public void testPerf() throws Exception {
  // Write lots of vectors, then read them back, once per vector size below.
  //
  // Disabled alternatives, kept for reference:
  //   int numDocs = atLeast(100_000);
  //   int iters = random().nextInt(5);
  //   perfTest(2, 5, 1);
  final int numDocs = 100_000;
  final int iters = 4;
  final int[] dimensions = {10, 100, 1000};
  for (int dimension : dimensions) {
    perfTest(numDocs, dimension, iters);
  }
}
// Runs `iters` rounds of: index `numDocs` random vectors of the given dimension into a fresh
// filesystem directory, read every vector back through a reader opened on the writer, print the
// write/read timings (skipped for the first round), and check that the per-dimension sums of
// what was read equal the sums of what was written.
private void perfTest(int numDocs, int dimension, int iters) throws IOException {
  final String field = "field";
  for (int iter = 0; iter < iters; iter++) {
    try (Directory directory = FSDirectory.open(createTempDir());
        IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig())) {
      // scratch carries each vector while writing; it is then zeroed and reused as the
      // accumulator for the values read back.
      float[] scratch = new float[dimension];
      float[] expectedSum = new float[dimension];

      final long writeStart = System.nanoTime();
      for (int docId = 0; docId < numDocs; docId++) {
        for (int d = 0; d < dimension; d++) {
          float component = random().nextFloat();
          scratch[d] = component;
          expectedSum[d] += component;
        }
        // debugging aid (disabled):
        //   for (int d = 0; d < dimension; d++) {
        //     System.out.print(scratch[d] + " ");
        //   }
        //   System.out.println(" @" + docId);
        Document document = new Document();
        document.add(new VectorDocValuesField(field, scratch));
        writer.addDocument(document);
      }
      final long writeEnd = System.nanoTime();

      for (int d = 0; d < dimension; d++) {
        scratch[d] = 0;
      }
      try (DirectoryReader reader = DirectoryReader.open(writer)) {
        for (LeafReaderContext leaf : reader.leaves()) {
          VectorDocValues vectors = VectorDocValues.get(leaf.reader(), field);
          // alternative (disabled): VectorDocValues.getSliced(leaf.reader(), field);
          float[] decoded = new float[dimension];
          for (int doc = vectors.nextDoc();
              doc != DocValuesIterator.NO_MORE_DOCS;
              doc = vectors.nextDoc()) {
            vectors.vector(decoded);
            for (int d = 0; d < dimension; d++) {
              scratch[d] += decoded[d];
            }
            // alternative read path (disabled):
            //   FloatBuffer buf = vectors.vectorBuffer();
            //   int pos = buf.position();
            //   for (int d = 0, j = pos; d < dimension; d++, j++) {
            //     // System.out.print(buf.get(j) + " ");
            //     scratch[d] += buf.get(j);
            //   }
            // debugging aid (disabled):
            //   System.out.println("");
          }
        }
      }

      // The first round is never reported (presumably treated as warm-up).
      if (iter > 0) {
        final long readEnd = System.nanoTime();
        System.out.printf(
            Locale.ROOT,
            "%d docs, dim=%d; write time %dms, read time %dms\n",
            numDocs,
            dimension,
            nsToMs(writeEnd - writeStart),
            nsToMs(readEnd - writeEnd));
      }

      for (int d = 0; d < scratch.length; d++) {
        assertEquals("wrong value for dimension " + d, expectedSum[d], scratch[d], 0);
      }
    }
  }
}
/*
public void testRawPerf() throws Exception {
int numDocs = 100000;
int iters = 4;
doRawPerf(numDocs, 10, iters);
doRawPerf(numDocs, 100, iters);
doRawPerf(numDocs, 1000, iters);
}
*/
/**
 * Best-case baseline that does not involve Lucene at all: fills one flat in-memory float array
 * holding {@code numDocs} vectors of {@code dimension} floats each, then sums the vectors
 * component-wise, timing both phases.
 *
 * <p>Timings are printed for every iteration except the first.
 *
 * @param numDocs number of vectors to write and read back
 * @param dimension number of floats per vector
 * @param iters number of write/read rounds to run
 * @throws ArithmeticException if {@code numDocs * dimension} does not fit in an {@code int}
 */
public void doRawPerf(int numDocs, int dimension, int iters) throws Exception {
  // Fail loudly rather than letting the product wrap around to a wrong or negative array size.
  final int totalFloats = Math.multiplyExact(numDocs, dimension);
  float[] buffer = new float[totalFloats];
  for (int iter = 0; iter < iters; iter++) {
    long tStart = System.nanoTime();
    // Fill every slot. The loop header already advances i, so the body must not increment it
    // again (doing so skipped every other slot and halved the measured write work).
    for (int i = 0; i < totalFloats; i++) {
      buffer[i] = random().nextFloat();
    }
    long tWrite = System.nanoTime();
    float[] vector = new float[dimension];
    for (int i = 0; i < numDocs; i++) {
      // k walks the flat buffer across the slice that belongs to vector i
      for (int j = 0, k = i * dimension; j < dimension; j++, k++) {
        vector[j] += buffer[k];
      }
    }
    // NOTE(review): the summed vector is never consumed, so the runtime may be free to optimize
    // the read loop away; treat very small read times with suspicion.
    if (iter != 0) {
      long tRead = System.nanoTime();
      // Locale.ROOT keeps the output independent of the default locale, matching perfTest.
      System.out.printf(Locale.ROOT,
          "%d docs, dim=%d; write time %dms, read time %dms\n",
          numDocs, dimension,
          nsToMs(tWrite - tStart),
          nsToMs(tRead - tWrite));
    }
  }
}
// Converts a duration in nanoseconds to whole milliseconds, discarding the fractional part.
private static long nsToMs(long ns) {
  final long nanosPerMilli = 1_000_000L;
  return ns / nanosPerMilli;
}
}