// blob: 59bbe1fdd56159a8b1084f1013476efa4cc79b15 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.jackrabbit.oak.plugins.index.elastic;
import org.apache.jackrabbit.oak.plugins.index.search.IndexDefinition;
import org.apache.jackrabbit.oak.plugins.index.search.PropertyDefinition;
import org.apache.jackrabbit.oak.spi.state.NodeState;
import static org.apache.jackrabbit.oak.plugins.index.search.util.ConfigUtil.getOptionalValue;
/**
 * {@link PropertyDefinition} variant for the Elastic index that additionally carries the
 * similarity search settings of a property.
 * <p>
 * The settings are looked up on the property definition node only when the property is flagged
 * as used in similarity. Each lookup is handed the matching {@code DEFAULT_*} constant of this
 * class as its fallback value.
 */
public class ElasticPropertyDefinition extends PropertyDefinition {

    // Names of the optional settings looked up on the property definition node.
    public static final String PROP_QUERY_MODEL = "queryModel";
    public static final String PROP_NUMBER_OF_HASH_TABLES = "L";
    public static final String PROP_NUMBER_OF_HASH_FUNCTIONS = "k";
    public static final String PROP_NUMBER_OF_BUCKETS = "w";
    public static final String PROP_INDEX_SIMILARITY = "indexSimilarity";
    public static final String PROP_QUERY_SIMILARITY = "querySimilarity";
    public static final String PROP_CANDIDATES = "candidates";
    public static final String PROP_PROBES = "probes";

    // Fallback values handed to the lookups above.
    private static final int DEFAULT_NUMBER_OF_HASH_TABLES = 20;
    private static final int DEFAULT_NO_OF_HASH_FUNCTIONS = 15;
    private static final int DEFAULT_BUCKET_WIDTH = 500;
    private static final String DEFAULT_SIMILARITY_QUERY_MODEL = "lsh";
    private static final String DEFAULT_SIMILARITY_INDEX_FUNCTION = "l2";
    private static final String DEFAULT_SIMILARITY_QUERY_FUNCTION = "l2";
    private static final int DEFAULT_QUERY_CANDIDATES = 500;
    private static final int DEFAULT_QUERY_PROBES = 3;

    /**
     * Similarity settings of this property. The constructor assigns it only when the property
     * is used in similarity; otherwise it keeps its initial {@code null} value.
     */
    SimilaritySearchParameters similaritySearchParameters;

    /**
     * Creates the definition and, when the property is used in similarity, reads its
     * similarity search settings from the definition node.
     *
     * @param idxDefn  indexing rule passed through to {@link PropertyDefinition}
     * @param nodeName node name passed through to {@link PropertyDefinition}
     * @param defn     node state holding the property definition; also the source of the
     *                 similarity settings
     */
    public ElasticPropertyDefinition(IndexDefinition.IndexingRule idxDefn, String nodeName, NodeState defn) {
        super(idxDefn, nodeName, defn);
        if (useInSimilarity) {
            this.similaritySearchParameters = readSimilaritySearchParameters(defn);
        }
    }

    /**
     * Returns the similarity search settings of this property.
     *
     * @return the settings built by the constructor, or {@code null} when the constructor did
     *         not build any because the property is not used in similarity
     */
    public SimilaritySearchParameters getSimilaritySearchParameters() {
        return similaritySearchParameters;
    }

    /**
     * Looks up every similarity setting on the given definition node, each with its
     * {@code DEFAULT_*} fallback, and bundles the results.
     */
    private static SimilaritySearchParameters readSimilaritySearchParameters(NodeState defn) {
        int hashTables = getOptionalValue(defn, PROP_NUMBER_OF_HASH_TABLES, DEFAULT_NUMBER_OF_HASH_TABLES);
        int hashFunctions = getOptionalValue(defn, PROP_NUMBER_OF_HASH_FUNCTIONS, DEFAULT_NO_OF_HASH_FUNCTIONS);
        int bucketWidth = getOptionalValue(defn, PROP_NUMBER_OF_BUCKETS, DEFAULT_BUCKET_WIDTH);
        String model = getOptionalValue(defn, PROP_QUERY_MODEL, DEFAULT_SIMILARITY_QUERY_MODEL);
        String indexSimilarity = getOptionalValue(defn, PROP_INDEX_SIMILARITY, DEFAULT_SIMILARITY_INDEX_FUNCTION);
        String querySimilarity = getOptionalValue(defn, PROP_QUERY_SIMILARITY, DEFAULT_SIMILARITY_QUERY_FUNCTION);
        int candidateCount = getOptionalValue(defn, PROP_CANDIDATES, DEFAULT_QUERY_CANDIDATES);
        int probeCount = getOptionalValue(defn, PROP_PROBES, DEFAULT_QUERY_PROBES);

        return new SimilaritySearchParameters(hashTables, hashFunctions, bucketWidth, model,
                indexSimilarity, querySimilarity, candidateCount, probeCount);
    }

    /**
     * Immutable holder for the similarity search parameters described at
     * https://elastiknn.com/api. The supported combinations of models and queries are listed
     * at https://elastiknn.com/api/#model-and-query-compatibility
     * <p>
     * The class only stores the values it is given; it performs no validation.
     */
    public static class SimilaritySearchParameters {

        /** Number of hash tables. Raising it generally raises recall. */
        private final int L;

        /**
         * Number of hash functions that are combined into a single hash value. Raising it
         * generally raises precision.
         */
        private final int k;

        /** Integer bucket width. */
        private final int w;

        /** Query model; possible values are lsh and exact. */
        private final String queryModel;

        /**
         * Similarity function applied at query time. Possible values: l2 (with the lsh or
         * exact model), l1 (with the exact model), A (angular distance, with the exact model).
         */
        private final String queryTimeSimilarityFunction;

        /**
         * Similarity function applied at index time. Possible values: l2 (with the lsh or
         * exact model), l1 (with the exact model), A (angular distance, with the exact model).
         */
        private final String indexTimeSimilarityFunction;

        /**
         * Number of exact similarity computations per segment: the vectors with the most
         * matching hashes are taken and their exact similarity to the query vector is
         * computed, for the top {@code candidates} vectors in each segment. Every
         * Elasticsearch index has >= 1 shards and every shard has >= 1 segments, so
         * "candidates": 200 on an index with 2 shards of 3 segments each results in
         * 2 * 3 * 200 = 1200 exact similarity computations. The value must be greater than or
         * equal to the number of Elasticsearch results wanted. Higher values generally mean
         * higher recall and higher latency.
         */
        private final int candidates;

        /**
         * Number of probes for the multiprobe search technique. The API description this
         * comment is derived from gives zero as the default and 3^k as the maximum; neither is
         * enforced here, and the enclosing definition falls back to its own
         * {@code DEFAULT_QUERY_PROBES} when the property is not configured. Raising the value
         * generally raises recall and allows a smaller L for comparable recall, at the price
         * of some additional computation at query time.
         */
        private final int probes;

        /**
         * Stores the given parameters unchanged.
         *
         * @param l                           number of hash tables
         * @param k                           number of hash functions per hash value
         * @param w                           bucket width
         * @param queryModel                  query model
         * @param indexTimeSimilarityFunction similarity function used at index time
         * @param queryTimeSimilarityFunction similarity function used at query time
         * @param candidates                  number of candidates per segment
         * @param probes                      number of probes
         */
        public SimilaritySearchParameters(int l, int k, int w, String queryModel, String indexTimeSimilarityFunction,
                                          String queryTimeSimilarityFunction, int candidates, int probes) {
            this.L = l;
            this.k = k;
            this.w = w;
            this.queryModel = queryModel;
            this.indexTimeSimilarityFunction = indexTimeSimilarityFunction;
            this.queryTimeSimilarityFunction = queryTimeSimilarityFunction;
            this.candidates = candidates;
            this.probes = probes;
        }

        /** @return the number of hash tables */
        public int getL() {
            return this.L;
        }

        /** @return the number of hash functions combined into one hash value */
        public int getK() {
            return this.k;
        }

        /** @return the bucket width */
        public int getW() {
            return this.w;
        }

        /** @return the query model */
        public String getQueryModel() {
            return this.queryModel;
        }

        /** @return the similarity function used at index time */
        public String getIndexTimeSimilarityFunction() {
            return this.indexTimeSimilarityFunction;
        }

        /** @return the similarity function used at query time */
        public String getQueryTimeSimilarityFunction() {
            return this.queryTimeSimilarityFunction;
        }

        /** @return the number of candidates per segment */
        public int getCandidates() {
            return this.candidates;
        }

        /** @return the number of probes */
        public int getProbes() {
            return this.probes;
        }
    }
}