blob: 3055007bfa102352d0f023e36a3de6ba4b23a097 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.jackrabbit.oak.plugins.index.elastic;
import co.elastic.clients.elasticsearch._types.mapping.FieldMapping;
import co.elastic.clients.elasticsearch._types.mapping.Property;
import co.elastic.clients.elasticsearch.indices.GetFieldMappingResponse;
import co.elastic.clients.elasticsearch.indices.get_field_mapping.TypeFieldMappings;
import jakarta.json.JsonObject;
import org.apache.commons.io.IOUtils;
import org.apache.jackrabbit.oak.api.Blob;
import org.apache.jackrabbit.oak.api.Tree;
import org.apache.jackrabbit.oak.api.Type;
import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
import org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants;
import org.apache.jackrabbit.oak.plugins.index.search.util.IndexDefinitionBuilder;
import org.junit.Test;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.net.URI;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.UUID;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toByteArray;
import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toDoubles;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotEquals;
public class ElasticSimilarQueryTest extends ElasticAbstractQueryTest {
/**
 * Runs two native more-like-this queries seeded from {@code /test/a}: one passing
 * {@code mlt.stopwords=Hello,bye} and one passing {@code mlt.minshouldmatch=20%} instead,
 * and checks that the stop-word variant returns the narrower result set.
 */
@Test
public void repSimilarWithStopWords() throws Exception {
    createIndex(true);

    final String stopWordsQuery = "select [jcr:path] from [nt:base] where " +
            "native('elastic-sim', 'mlt?stream.body=/test/a&mlt.fl=:path&mlt.mindf=0&mlt.mintf=0&mlt.stopwords=Hello,bye')";
    final String noStopWordsQuery = "select [jcr:path] from [nt:base] where " +
            "native('elastic-sim', 'mlt?stream.body=/test/a&mlt.fl=:path&mlt.mindf=0&mlt.mintf=0&mlt.minshouldmatch=20%')";

    // node name -> value of its "text" property, created in this order
    final String[][] documents = {
            {"a", "Hello World. Ok Bye Bye now. See you tomorrow."},
            {"b", "He said Hello and then the she said Hello as well."},
            {"c", "He said Bye."},
            {"d", "Bye Bye World."},
            {"e", "See you Tomorrow"},
            {"f", "Hello Mr X. Let's catch up tomorrow. Bye Bye"},
            {"g", "Random text"},
    };
    final Tree parent = root.getTree("/").addChild("test");
    for (String[] document : documents) {
        parent.addChild(document[0]).setProperty("text", document[1]);
    }
    root.commit();

    // Nodes that only share the terms "Hello" or "Bye" with /test/a must not be returned
    final List<String> expectedWithStopWords = List.of("/test/a", "/test/e", "/test/f");
    assertEventually(() -> assertQuery(stopWordsQuery, expectedWithStopWords));

    final List<String> expectedWithoutStopWords =
            List.of("/test/a", "/test/b", "/test/c", "/test/d", "/test/e", "/test/f");
    assertEventually(() -> assertQuery(noStopWordsQuery, expectedWithoutStopWords));
}
/**
 * Runs two native more-like-this queries seeded from {@code /test/a}, with and without
 * {@code mlt.minwl=6}, and checks that the minimum word length narrows the result set.
 */
@Test
public void repSimilarWithMinWordLength() throws Exception {
    createIndex(true);

    final String minWordLengthQuery = "select [jcr:path] from [nt:base] where " +
            "native('elastic-sim', 'mlt?stream.body=/test/a&mlt.fl=:path&mlt.mindf=0&mlt.mintf=0&mlt.minwl=6')";
    final String unrestrictedQuery = "select [jcr:path] from [nt:base] where " +
            "native('elastic-sim', 'mlt?stream.body=/test/a&mlt.fl=:path&mlt.mindf=0&mlt.mintf=0')";

    // node name -> value of its "text" property, created in this order
    final String[][] documents = {
            {"a", "Hello Worlds."},
            {"b", "He said Hello and then the world said Hello as well."},
            {"c", "War of the worlds is a good movie"},
            {"d", "Hello. How are you? Worlds"},
    };
    final Tree parent = root.getTree("/").addChild("test");
    for (String[] document : documents) {
        parent.addChild(document[0]).setProperty("text", document[1]);
    }
    root.commit();

    // "Hello" is shorter than 6 characters, so matches on it alone are ignored: /test/b must NOT be returned.
    // /test/c and /test/d are still returned because of the term "Worlds".
    final List<String> expectedWithMinLength = List.of("/test/a", "/test/c", "/test/d");
    assertEventually(() -> assertQuery(minWordLengthQuery, expectedWithMinLength));

    final List<String> expectedUnrestricted = List.of("/test/a", "/test/b", "/test/c", "/test/d");
    assertEventually(() -> assertQuery(unrestrictedQuery, expectedUnrestricted));
}
/**
 * Checks that {@code similar(., path)} works when the reference node sits at the end of a
 * chain of 258 nested children below {@code /test/a}.
 */
@Test
public void repSimilarQueryWithLongPath() throws Exception {
    createIndex(false);

    final Tree parent = root.getTree("/").addChild("test");

    // Build /test/a/a0/a1/.../a257 and keep a handle on the deepest node
    Tree deepest = parent.addChild("a");
    int level = 0;
    while (level < 258) {
        deepest = deepest.addChild("a" + level);
        level++;
    }
    deepest.setProperty("text", "Hello World Hello World");

    // sibling name -> value of its "text" property, created in this order
    final String[][] siblings = {
            {"b", "Hello World"},
            {"c", "World"},
            {"d", "Hello"},
            {"e", "Bye Bye"},
            {"f", "Hello"},
            {"g", "World"},
            {"h", "Hello"},
    };
    for (String[] sibling : siblings) {
        parent.addChild(sibling[0]).setProperty("text", sibling[1]);
    }
    root.commit();

    final String p = deepest.getPath();
    final String similarQuery = "select [jcr:path] from [nt:base] where similar(., '" + p + "')";
    // /test/e ("Bye Bye") is the only node absent from the expected list
    final List<String> expectedPaths =
            List.of(p, "/test/b", "/test/c", "/test/d", "/test/f", "/test/g", "/test/h");
    assertEventually(() -> assertQuery(similarQuery, expectedPaths));
}
/**
 * Regression check for the Elasticsearch issue addressed in
 * <a href="https://github.com/elastic/elasticsearch/pull/94518">94518</a>: a similarity
 * query seeded from a node with a 1000-character {@code text} value must still return
 * that node.
 */
@Test
public void repSimilarQueryWithIgnoredMetadataField() throws Exception {
    createIndex(false);

    final Tree node = root.getTree("/").addChild("test").addChild("a");
    // the max keyword length is 256, so this value ends up listed under _ignored
    final String oversizedText = ElasticTestUtils.randomString(1000);
    node.setProperty("text", oversizedText);
    root.commit();

    final String similarQuery = "select [jcr:path] from [nt:base] where similar(., '/test/a')";
    final List<String> expectedPaths = List.of("/test/a");
    assertEventually(() -> assertQuery(similarQuery, expectedPaths));
}
/**
 * Three nodes carry identical {@code text}; {@code /test/a} and {@code /test/c} share the
 * tag {@code foo} while {@code /test/b} is tagged {@code bar}. A similarity query seeded
 * from {@code /test/a} must return all three, with {@code /test/b} ranked last.
 */
@Test
public void similarityTagsAffectRelevance() throws Exception {
    createIndex(false);
    Tree test = root.getTree("/").addChild("test");
    Tree a = test.addChild("a");
    a.setProperty("text", "Hello World Hello World");
    a.setProperty("tags", "foo");
    Tree b = test.addChild("b");
    b.setProperty("text", "Hello World Hello World");
    b.setProperty("tags", "bar");
    Tree c = test.addChild("c");
    c.setProperty("text", "Hello World Hello World");
    c.setProperty("tags", "foo");
    root.commit();
    assertEventually(() -> {
        List<String> paths = executeQuery("select [jcr:path] from [nt:base] where similar(., '/test/a')", SQL2, true, true);
        // JUnit's contract is assertEquals(expected, actual); the swapped order produced
        // misleading "expected:<actual> but was:<expected>" failure messages.
        assertEquals(3, paths.size());
        assertEquals("/test/b", paths.get(2));
    });
}
/**
 * Defines a similarity property with explicit vector size, similarity function, number of
 * hash tables and number of hash functions, then reads the field mapping back from
 * Elasticsearch and checks the {@code elastiknn} section carries those values
 * ({@code dims}, {@code similarity}, {@code L} and {@code k}).
 */
@Test
public void vectorSimilarityElastiknnIndexConfiguration() throws Exception {
    final String indexName = "test1";
    final String fieldName1 = "fv1";
    final String similarityFieldName1 = FieldNames.createSimilarityFieldName(fieldName1);
    IndexDefinitionBuilder builder = createIndex(fieldName1);
    Tree tree = builder.indexRule("nt:base").property(fieldName1).useInSimilarity(true).nodeScopeIndex()
            .similaritySearchDenseVectorSize(2048).getBuilderTree();
    tree.setProperty(ElasticPropertyDefinition.PROP_INDEX_SIMILARITY, "cosine");
    tree.setProperty(ElasticPropertyDefinition.PROP_NUMBER_OF_HASH_TABLES, 10);
    tree.setProperty(ElasticPropertyDefinition.PROP_NUMBER_OF_HASH_FUNCTIONS, 12);
    setIndex(indexName, builder);
    root.commit();
    String alias = ElasticIndexNameHelper.getElasticSafeIndexName(esConnection.getIndexPrefix(), "/oak:index/" + indexName);
    GetFieldMappingResponse mappingsResponse = esConnection.getClient()
            .indices()
            .getFieldMapping(b -> b
                    .index(alias)
                    .fields(similarityFieldName1)
            );
    Map<String, TypeFieldMappings> mappings = mappingsResponse.result();
    // The check also fails when no index is returned, so the message must not claim "more than one"
    assertEquals("Expected exactly one index", 1, mappings.size());
    Map<String, FieldMapping> typeFieldMappings = mappings.values().iterator().next().mappings();
    Property v = typeFieldMappings.get(similarityFieldName1).mapping().get(similarityFieldName1);
    JsonObject map1 = v._custom().toJson().asJsonObject().get("elastiknn").asJsonObject();
    assertEquals("Dense vector size doesn't match", 2048, map1.getInt("dims"));
    assertEquals("Similarity doesn't match", "cosine", map1.getString("similarity"));
    // Each assertion gets its own message so a failure identifies the offending setting
    assertEquals("Number of hash tables doesn't match", 10, map1.getInt("L"));
    assertEquals("Number of hash functions doesn't match", 12, map1.getInt("k"));
}
/**
 * Configures the similarity property with a dense vector size of 100 and then stores the
 * feature vectors from {@code fvs.csv} (noted by the original author as having a different
 * size). Indexing must still succeed: the index is expected to contain 10 documents.
 */
@Test
public void vectorSimilarityWithWrongVectorSizes() throws Exception {
    IndexDefinitionBuilder builder = createIndex("fv");
    builder.indexRule("nt:base").property("fv").useInSimilarity(true).nodeScopeIndex()
            .similaritySearchDenseVectorSize(100);// test FVs have size 1048
    Tree index = setIndex("test1", builder);
    root.commit();
    Tree test = root.getTree("/").addChild("test");
    URI uri = getClass().getResource("/org/apache/jackrabbit/oak/query/fvs.csv").toURI();
    File file = new File(uri);
    // Files.readAllLines closes the file itself; the previous Files.newInputStream(...) was never closed
    for (String line : Files.readAllLines(file.toPath(), Charset.defaultCharset())) {
        // first column is the node name, the remaining columns are the vector components
        String[] split = line.split(",");
        List<Double> values = Stream.of(split).skip(1).map(Double::parseDouble).collect(Collectors.toList());
        byte[] bytes = toByteArray(values);
        // sanity check: the binary encoding must round-trip
        List<Double> actual = toDoubles(bytes);
        assertEquals(values, actual);
        Blob blob = root.createBlob(new ByteArrayInputStream(bytes));
        String name = split[0];
        Tree child = test.addChild(name);
        child.setProperty("fv", blob, Type.BINARY);
    }
    root.commit();
    // regardless of the wrong vectors, we should be able to index
    assertEventually(() -> assertEquals(10, countDocuments(index)));
}
/**
 * Stores the feature vectors from {@code fvs.csv} plus one node without a vector, then
 * runs a similarity query seeded from every node in turn and checks that each result
 * list differs from the one obtained for the previous seed.
 */
@Test
public void vectorSimilarity() throws Exception {
    IndexDefinitionBuilder builder = createIndex("fv");
    builder.indexRule("nt:base").property("fv").useInSimilarity(true).nodeScopeIndex();
    setIndex("test1", builder);
    root.commit();
    Tree test = root.getTree("/").addChild("test");
    URI uri = getClass().getResource("/org/apache/jackrabbit/oak/query/fvs.csv").toURI();
    File file = new File(uri);
    List<String> children = new ArrayList<>();
    // Files.readAllLines closes the file itself; the previous Files.newInputStream(...) was never closed
    for (String line : Files.readAllLines(file.toPath(), Charset.defaultCharset())) {
        // first column is the node name, the remaining columns are the vector components
        String[] split = line.split(",");
        List<Double> values = Stream.of(split).skip(1).map(Double::parseDouble).collect(Collectors.toList());
        byte[] bytes = toByteArray(values);
        // sanity check: the binary encoding must round-trip
        List<Double> actual = toDoubles(bytes);
        assertEquals(values, actual);
        Blob blob = root.createBlob(new ByteArrayInputStream(bytes));
        String name = split[0];
        Tree child = test.addChild(name);
        child.setProperty("fv", blob, Type.BINARY);
        children.add(child.getPath());
    }
    // add a node without FV, the plugin cannot handle it directly
    Tree child = test.addChild("nofv");
    child.setProperty("nofv", "test");
    children.add(child.getPath());
    root.commit();
    // check that similarity changes across different feature vectors
    List<String> baseline = new ArrayList<>();
    for (String similarPath : children) {
        String query = "select [jcr:path] from [nt:base] where similar(., '" + similarPath + "')";
        List<String> current = new ArrayList<>();
        assertEventually(() -> {
            List<String> result = executeQuery(query, "JCR-SQL2", false, true);
            // assertEventually may retry: reset before recording this attempt's result
            current.clear();
            current.addAll(result);
            assertNotEquals(baseline, current);
        });
        // the result for this seed becomes the reference for the next one
        baseline.clear();
        baseline.addAll(current);
    }
}
/**
 * For every entry of {@code expectedResults}, runs a similarity query seeded from
 * {@code /test/<key>} and checks what percentage of the expected node names appear among
 * the first {@code N} results, where {@code N} is the size of the expected list.
 *
 * @param expectedResults node name (relative to {@code /test}) mapped to the node names expected to be similar
 * @param expected        target percentage (0-100) of expected names found
 * @param delta           tolerated absolute deviation from {@code expected}
 */
private void verifyLSHResults(Map<String, List<String>> expectedResults, double expected, double delta) {
    // Iterate entries directly: the previous keySet() + get(substring-of-key) lookup returned
    // null (and then threw NullPointerException) for any key containing '/'.
    for (Map.Entry<String, List<String>> entry : expectedResults.entrySet()) {
        String query = "select [jcr:path] from [nt:base] where similar(., '" + "/test/" + entry.getKey() + "')";
        List<String> expectedList = entry.getValue();
        assertEventually(() -> {
            // Only the first expectedList.size() results are considered, reduced to their node names
            List<String> found = executeQuery(query, "JCR-SQL2", false, true).stream()
                    .limit(expectedList.size())
                    .map(path -> path.substring(path.lastIndexOf("/") + 1))
                    .collect(Collectors.toList());
            double per = (expectedList.stream().filter(found::contains).count() * 100.0) / expectedList.size();
            assertEquals("expected: " + expectedList + " got: " + found, expected, per, delta);
        });
    }
}
/**
 * Indexes 2000 pseudo-random feature vectors of length 1024 (fixed seed), picks 10 of them
 * as query seeds, computes for each seed the 10 closest vectors according to
 * {@link #euclideanDistance(float[], float[])} and hands those expectations to
 * {@code verifyLSHResults} with a target of 65 and a delta of 35.
 */
@Test
public void vectorSimilarityLargeData() throws Exception {
    final int similarImageCount = 10;
    final int featureVectorLength = 1024;

    IndexDefinitionBuilder definition = createIndex("fv");
    definition.indexRule("nt:base").property("fv").useInSimilarity(true).nodeScopeIndex();
    setIndex("test1", definition);
    root.commit();

    Tree parent = root.getTree("/").addChild("test");
    // fixed seed keeps the generated data set and the chosen seeds reproducible
    Random r = new Random(1);
    List<String> names = new ArrayList<>();
    List<float[]> vectors = new ArrayList<>();
    for (int imageIndex = 0; imageIndex < 2000; imageIndex++) {
        String imageName = "img" + imageIndex;
        names.add(imageName);

        float[] vector = new float[featureVectorLength];
        List<Double> components = new ArrayList<>();
        for (int dim = 0; dim < featureVectorLength; dim++) {
            double sample = r.nextDouble() * 0.5;
            double component = 30 * Math.pow(sample, 3);
            components.add(component);
            vector[dim] = (float) component;
        }
        vectors.add(vector);

        byte[] encoded = toByteArray(components);
        // sanity check: the binary encoding must round-trip
        assertEquals(components, toDoubles(encoded));
        Blob blob = root.createBlob(new ByteArrayInputStream(encoded));
        parent.addChild(imageName).setProperty("fv", blob, Type.BINARY);
    }
    root.commit();

    Map<String, List<String>> expectedResults = new HashMap<>();
    for (int testCase = 0; testCase < 10; testCase++) {
        int seedIndex = r.nextInt(vectors.size());
        float[] seedVector = vectors.get(seedIndex);

        // distance from the seed to every stored vector (the seed itself included)
        List<Image> candidates = new ArrayList<>();
        for (int i = 0; i < vectors.size(); i++) {
            Image candidate = new Image();
            candidate.name = names.get(i);
            candidate.distance = euclideanDistance(seedVector, vectors.get(i));
            candidates.add(candidate);
        }

        // stable sort by ascending distance, then keep the closest names
        List<String> nearest = candidates.stream()
                .sorted(Comparator.comparingDouble((Image candidate) -> candidate.distance))
                .limit(similarImageCount)
                .map(candidate -> candidate.name)
                .collect(Collectors.toList());
        expectedResults.put(names.get(seedIndex), nearest);
    }
    verifyLSHResults(expectedResults, 65, 35);
}
/**
 * Returns the squared Euclidean distance between {@code x} and {@code y} (no square root
 * is taken), truncated to a {@code long}.
 *
 * @param x first vector; its length determines how many components are compared
 * @param y second vector; must have at least {@code x.length} elements
 * @return the sum of squared component differences, with the fractional part dropped
 * @throws ArrayIndexOutOfBoundsException if {@code y} is shorter than {@code x}
 */
static long euclideanDistance(float[] x, float[] y) {
    // Accumulate in double and narrow once at the end. With a long accumulator, the compound
    // assignment "sum += diff * diff" cast the running total back to long after every term,
    // silently dropping each fractional contribution (four terms of 0.25 summed to 0).
    double sum = 0;
    for (int i = 0; i < x.length; i++) {
        double diff = (double) y[i] - x[i];
        sum += diff * diff;
    }
    return (long) sum;
}
/**
 * Creates and commits an index definition covering the {@code text} property (analyzed)
 * and the {@code tags} property (used as similarity tags), registered under a random
 * UUID name.
 *
 * @param nativeQuery when {@code true}, the definition's function name is set to
 *                    {@code elastic-sim}
 */
private void createIndex(boolean nativeQuery) throws Exception {
    final IndexDefinitionBuilder definition = createIndex("text", "tags");
    if (nativeQuery) {
        definition.getBuilderTree().setProperty(FulltextIndexConstants.FUNC_NAME, "elastic-sim");
    }
    definition.indexRule("nt:base").property("text").analyzed();
    definition.indexRule("nt:base").property("tags").similarityTags(true);
    // a fresh random name on every call
    setIndex(UUID.randomUUID().toString(), definition);
    root.commit();
}
/**
 * Mutable holder pairing an image node name with its distance from a reference vector;
 * used by vectorSimilarityLargeData to rank candidates by ascending distance.
 */
static class Image {
// value returned by euclideanDistance for this image against the reference vector
double distance;
// node name of the image (e.g. "img" followed by its index)
String name;
}
}