blob: 254d3a0e88ba2156eac6fddfd2b7b7158e748642 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.jackrabbit.oak.plugins.index.elastic.index;
import co.elastic.clients.elasticsearch._types.Time;
import co.elastic.clients.elasticsearch._types.mapping.DynamicMapping;
import co.elastic.clients.elasticsearch._types.mapping.Property;
import co.elastic.clients.elasticsearch._types.mapping.TypeMapping;
import co.elastic.clients.elasticsearch.indices.CreateIndexRequest;
import co.elastic.clients.elasticsearch.indices.IndexSettings;
import co.elastic.clients.elasticsearch.indices.IndexSettingsAnalysis;
import co.elastic.clients.elasticsearch.indices.PutIndicesSettingsRequest;
import co.elastic.clients.json.JsonData;
import co.elastic.clients.util.ObjectBuilder;
import org.apache.jackrabbit.oak.api.Type;
import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition;
import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticPropertyDefinition;
import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
import org.apache.jackrabbit.oak.plugins.index.search.PropertyDefinition;
import org.jetbrains.annotations.NotNull;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/**
 * Provides utility functions around Elasticsearch indexing: translating an {@code ElasticIndexDefinition}
 * into the index creation request (settings + mappings) and into the settings update request that makes a
 * freshly loaded index ready to be queried.
 */
class ElasticIndexHelper {

    /**
     * Mapping version that uses <a href="https://semver.org/">SemVer Specification</a> to allow changes without
     * breaking existing queries.
     * Changes breaking compatibility should increment the major version (indicating that a reindex is mandatory).
     * Changes not breaking compatibility should increment the minor version (old queries still work, but they might not
     * use the new feature).
     * Changes that do not affect queries should increment the patch version (eg: bug fixes).
     */
    protected static final String MAPPING_VERSION = "1.0.0";

    // Unset the refresh interval and disable replicas at index creation to optimize for initial loads
    // https://www.elastic.co/guide/en/elasticsearch/reference/current/tune-for-indexing-speed.html
    private static final Time INITIAL_REFRESH_INTERVAL = Time.of(b -> b.time("-1"));

    private static final String INITIAL_NUMBER_OF_REPLICAS = "0";

    // name under which the default word delimiter graph token filter is registered in the analysis settings
    private static final String OAK_WORD_DELIMITER_GRAPH_FILTER = "oak_word_delimiter_graph_filter";

    // name of the text sub-field inside the nested suggest mapping
    protected static final String SUGGEST_NESTED_VALUE = "value";

    // names of the text and boost sub-fields inside each nested dynamic boost mapping
    protected static final String DYNAMIC_BOOST_NESTED_VALUE = "value";

    protected static final String DYNAMIC_BOOST_NESTED_BOOST = "boost";

    /**
     * Returns a {@code CreateIndexRequest} with settings and mappings translated from the specified {@code ElasticIndexDefinition}.
     * The returned object can be used to create an index optimized for bulk loads (eg: reindexing) but not for queries.
     * To make it usable, the request returned by {@link #enableIndexRequest(String, ElasticIndexDefinition)} needs
     * to be performed afterwards.
     *
     * @param remoteIndexName the final index name
     * @param indexDefinition the definition used to read settings/mappings
     * @return a {@code CreateIndexRequest}
     */
    public static CreateIndexRequest createIndexRequest(@NotNull String remoteIndexName,
                                                        @NotNull ElasticIndexDefinition indexDefinition) {
        return new CreateIndexRequest.Builder()
                .index(remoteIndexName)
                .settings(s -> loadSettings(s, indexDefinition))
                .mappings(s -> loadMappings(s, indexDefinition))
                .build();
    }

    /**
     * Populates the type mapping: dynamic mapping mode, internal fields and the fields derived from index rules.
     *
     * @param builder         the mapping builder to populate
     * @param indexDefinition the definition used to read the mappings
     * @return the same {@code builder} instance
     */
    private static ObjectBuilder<TypeMapping> loadMappings(@NotNull TypeMapping.Builder builder,
                                                           @NotNull ElasticIndexDefinition indexDefinition) {
        // pick the DynamicMapping whose json value matches the configured one, falling back to "true"
        builder.dynamic(Arrays
                .stream(DynamicMapping.values())
                .filter(dm -> dm.jsonValue().equals(indexDefinition.dynamicMapping))
                .findFirst().orElse(DynamicMapping.True)
        );
        mapInternalProperties(builder);
        mapIndexRules(builder, indexDefinition);
        return builder;
    }

    /**
     * Maps the internal fields that do not depend on the index rules: path, ancestors, path depth,
     * fulltext and dynamic boost fulltext.
     *
     * @param builder the mapping builder to populate
     */
    private static void mapInternalProperties(@NotNull TypeMapping.Builder builder) {
        builder.properties(FieldNames.PATH,
                        b1 -> b1.keyword(builder3 -> builder3))
                .properties(FieldNames.ANCESTORS,
                        // indexed with the ancestor analyzer, searched as a plain keyword
                        b1 -> b1.text(
                                b2 -> b2.analyzer("ancestor_analyzer")
                                        .searchAnalyzer("keyword")
                                        .searchQuoteAnalyzer("keyword")))
                .properties(FieldNames.PATH_DEPTH,
                        b1 -> b1.integer(
                                b2 -> b2.docValues(false)))
                .properties(FieldNames.FULLTEXT,
                        b1 -> b1.text(
                                b2 -> b2.analyzer("oak_analyzer")))
                .properties(ElasticIndexDefinition.DYNAMIC_BOOST_FULLTEXT,
                        b1 -> b1.text(
                                b2 -> b2.analyzer("oak_analyzer")));
        // TODO: the mapping below is for features currently not supported. These need to be reviewed
        // mappingBuilder.startObject(FieldNames.NOT_NULL_PROPS)
        //        .field("type", "keyword")
        //        .endObject();
        // mappingBuilder.startObject(FieldNames.NULL_PROPS)
        //        .field("type", "keyword")
        //        .endObject();
    }

    /**
     * Returns a {@code PutIndicesSettingsRequest} to make an index ready to be queried and updated in near real time.
     * It overrides the dynamic settings (replicas and refresh interval) chosen at creation time.
     *
     * @param remoteIndexName the final index name (no alias)
     * @param indexDefinition the definition used to read settings/mappings
     * @return a {@code PutIndicesSettingsRequest}
     */
    public static PutIndicesSettingsRequest enableIndexRequest(String remoteIndexName, ElasticIndexDefinition indexDefinition) {
        IndexSettings indexSettings = IndexSettings.of(is -> is
                .numberOfReplicas(Integer.toString(indexDefinition.numberOfReplicas))
                // TODO: we should pass null to reset the refresh interval to the default value but the following bug
                // prevents it. We need to wait for a fix: https://github.com/elastic/elasticsearch-java/issues/283
                .refreshInterval(Time.of(t -> t.time("1s"))));
        return PutIndicesSettingsRequest.of(pisr -> pisr
                .index(remoteIndexName)
                .settings(indexSettings));
    }

    /**
     * Populates the index settings: analysis configuration (analyzers and token filters) plus the index level
     * settings used for the initial load.
     *
     * @param builder         the settings builder to populate
     * @param indexDefinition the definition used to read the settings
     * @return the same {@code builder} instance
     */
    private static ObjectBuilder<IndexSettings> loadSettings(@NotNull IndexSettings.Builder builder,
                                                             @NotNull ElasticIndexDefinition indexDefinition) {
        // the elastiknn setting is only switched on when at least one similarity property is defined
        if (!indexDefinition.getSimilarityProperties().isEmpty()) {
            builder.otherSettings(ElasticIndexDefinition.ELASTIKNN, JsonData.of(true));
        }

        // collect analyzer settings
        IndexSettingsAnalysis.Builder analyzerBuilder =
                ElasticCustomAnalyzer.buildCustomAnalyzers(indexDefinition.getAnalyzersNodeState(), "oak_analyzer");
        if (analyzerBuilder == null) {
            // no custom analyzers available: fall back to the default "oak_analyzer"
            // (standard tokenizer + lowercase + word delimiter graph filter)
            analyzerBuilder = new IndexSettingsAnalysis.Builder()
                    .filter(OAK_WORD_DELIMITER_GRAPH_FILTER,
                            tokenFilter -> tokenFilter.definition(
                                    tokenFilterDef -> tokenFilterDef.wordDelimiterGraph(
                                            wdgBuilder -> wdgBuilder.generateWordParts(true)
                                                    .stemEnglishPossessive(true)
                                                    .generateNumberParts(true)
                                                    .splitOnNumerics(indexDefinition.analyzerConfigSplitOnNumerics())
                                                    .splitOnCaseChange(indexDefinition.analyzerConfigSplitOnCaseChange())
                                                    .preserveOriginal(indexDefinition.analyzerConfigIndexOriginalTerms()))
                            ))
                    .analyzer("oak_analyzer",
                            ab -> ab.custom(
                                    customAnalyzer -> customAnalyzer.tokenizer("standard")
                                            .filter("lowercase", OAK_WORD_DELIMITER_GRAPH_FILTER)));
        }

        // path restrictions support
        analyzerBuilder.analyzer("ancestor_analyzer",
                ab -> ab.custom(customAnalyzer -> customAnalyzer.tokenizer("path_hierarchy")));

        // spellcheck support
        analyzerBuilder.filter("shingle",
                tokenFilter -> tokenFilter.definition(
                        tokenFilterDef -> tokenFilterDef.shingle(
                                shingle -> shingle.minShingleSize("2").maxShingleSize("3"))));
        analyzerBuilder.analyzer("trigram",
                ab -> ab.custom(
                        customAnalyzer -> customAnalyzer.tokenizer("standard").filter("lowercase", "shingle")));

        // set up the index
        builder.index(indexBuilder -> indexBuilder
                        // Make the index more lenient when a field cannot be converted to the mapped type. Without this setting
                        // the entire document will fail to update. Instead, only the specific field won't be updated.
                        .mapping(mf -> mf.ignoreMalformed(true))
                        // static setting: cannot be changed after the index gets created
                        .numberOfShards(Integer.toString(indexDefinition.numberOfShards))
                        // dynamic settings: see #enableIndexRequest
                        .refreshInterval(INITIAL_REFRESH_INTERVAL)
                        .numberOfReplicas(INITIAL_NUMBER_OF_REPLICAS))
                .analysis(analyzerBuilder.build());
        return builder;
    }

    /**
     * Maps the fields derived from the index rules. One field is mapped per indexed property name, followed by
     * the index-wide fields: spellcheck, suggest (only when at least one property is used in suggestions),
     * dynamic boost, similarity vectors and similarity tags.
     * <p>
     * The index-wide fields are registered exactly once, after the per-property loop, so they are neither rebuilt
     * for every property nor skipped when the definition has no named properties.
     *
     * @param builder         the mapping builder to populate
     * @param indexDefinition the definition used to read the index rules
     * @throws IllegalStateException if properties with the same name are defined with different types
     */
    private static void mapIndexRules(@NotNull TypeMapping.Builder builder,
                                      @NotNull ElasticIndexDefinition indexDefinition) {
        checkIndexRules(indexDefinition);

        boolean useInSuggest = false;
        for (Map.Entry<String, List<PropertyDefinition>> entry : indexDefinition.getPropertiesByName().entrySet()) {
            final String name = entry.getKey();
            final List<PropertyDefinition> propertyDefinitions = entry.getValue();

            // checkIndexRules guarantees that all the definitions sharing a name have the same type,
            // so the type of the last definition is representative
            Type<?> type = null;
            for (PropertyDefinition pd : propertyDefinitions) {
                type = Type.fromTag(pd.getType(), false);
                if (pd.useInSuggest) {
                    useInSuggest = true;
                }
            }

            Property.Builder pBuilder = new Property.Builder();
            // https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-types.html
            if (Type.BINARY.equals(type)) {
                pBuilder.binary(b -> b);
            } else if (Type.LONG.equals(type)) {
                pBuilder.long_(b -> b);
            } else if (Type.DOUBLE.equals(type) || Type.DECIMAL.equals(type)) {
                pBuilder.double_(b -> b);
            } else if (Type.DATE.equals(type)) {
                pBuilder.date(b -> b);
            } else if (Type.BOOLEAN.equals(type)) {
                pBuilder.boolean_(b -> b);
            } else {
                if (indexDefinition.isAnalyzed(propertyDefinitions)) {
                    // always add keyword for sorting / faceting as sub-field
                    pBuilder.text(
                            b1 -> b1.analyzer("oak_analyzer")
                                    .fields("keyword",
                                            b2 -> b2.keyword(
                                                    b3 -> b3.ignoreAbove(256))));
                } else {
                    // always add keyword for sorting / faceting
                    pBuilder.keyword(b1 -> b1.ignoreAbove(256));
                }
            }
            builder.properties(name, pBuilder.build());
        }

        // ---- index-wide fields: registered once, after all the per-property fields ----

        builder.properties(FieldNames.SPELLCHECK,
                b1 -> b1.text(
                        b2 -> b2.analyzer("trigram"))
        );

        if (useInSuggest) {
            builder.properties(FieldNames.SUGGEST,
                    b1 -> b1.nested(
                            // TODO: evaluate https://www.elastic.co/guide/en/elasticsearch/reference/current/faster-prefix-queries.html
                            b2 -> b2.properties(SUGGEST_NESTED_VALUE,
                                    b3 -> b3.text(
                                            b4 -> b4.analyzer("oak_analyzer")
                                    )
                            )
                    )
            );
        }

        // each dynamic boost property becomes a nested field holding (value, boost) pairs
        for (PropertyDefinition pd : indexDefinition.getDynamicBoostProperties()) {
            builder.properties(pd.nodeName,
                    b1 -> b1.nested(
                            b2 -> b2.properties(DYNAMIC_BOOST_NESTED_VALUE,
                                            b3 -> b3.text(
                                                    b4 -> b4.analyzer("oak_analyzer")))
                                    .properties(DYNAMIC_BOOST_NESTED_BOOST,
                                            b3 -> b3.double_(f -> f)
                                    )
                    )
            );
        }

        // similarity properties are mapped through raw JSON (elastiknn_dense_float_vector field type)
        for (PropertyDefinition propertyDefinition : indexDefinition.getSimilarityProperties()) {
            ElasticPropertyDefinition pd = (ElasticPropertyDefinition) propertyDefinition;
            int denseVectorSize = pd.getSimilaritySearchDenseVectorSize();
            Reader eknnConfig = new StringReader(
                    "{" +
                            " \"type\": \"elastiknn_dense_float_vector\"," +
                            " \"elastiknn\": {" +
                            " \"dims\": " + denseVectorSize + "," +
                            " \"model\": \"lsh\"," +
                            " \"similarity\": \"" + pd.getSimilaritySearchParameters().getIndexTimeSimilarityFunction() + "\"," +
                            " \"L\": " + pd.getSimilaritySearchParameters().getL() + "," +
                            " \"k\": " + pd.getSimilaritySearchParameters().getK() + "," +
                            " \"w\": " + pd.getSimilaritySearchParameters().getW() +
                            " }" +
                            "}");
            builder.properties(FieldNames.createSimilarityFieldName(pd.name), b1 -> b1.withJson(eknnConfig));
        }

        builder.properties(ElasticIndexDefinition.SIMILARITY_TAGS,
                b1 -> b1.text(
                        b2 -> b2.analyzer("oak_analyzer")
                )
        );
    }

    /**
     * Verifies that the index rules do not define properties with the same name and different types.
     *
     * @param indexDefinition the definition whose rules are checked
     * @throws IllegalStateException if at least one property name is associated with more than one type; the
     *                               message contains the index path and the offending property names
     */
    private static void checkIndexRules(ElasticIndexDefinition indexDefinition) {
        final List<Map.Entry<String, List<PropertyDefinition>>> multiTypesFields = indexDefinition.getPropertiesByName()
                .entrySet()
                .stream()
                // a single definition cannot conflict with itself
                .filter(e -> e.getValue().size() > 1)
                .filter(e -> e.getValue().stream().map(PropertyDefinition::getType).distinct().count() > 1)
                .collect(Collectors.toList());
        if (!multiTypesFields.isEmpty()) {
            String fields = multiTypesFields.stream().map(Map.Entry::getKey).collect(Collectors.joining(", ", "[", "]"));
            throw new IllegalStateException(indexDefinition.getIndexPath() + " has properties with the same name and " +
                    "different types " + fields);
        }
    }
}