// blob: 844c8aa9908d6b974fe1eb1097b2afbb1ea20134 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.jackrabbit.oak.plugins.index.elastic;
import static org.apache.jackrabbit.oak.plugins.index.search.util.ConfigUtil.getOptionalValue;
import static org.apache.jackrabbit.oak.plugins.index.search.util.ConfigUtil.getOptionalValues;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import org.apache.jackrabbit.oak.api.Type;
import org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants;
import org.apache.jackrabbit.oak.plugins.index.search.IndexDefinition;
import org.apache.jackrabbit.oak.plugins.index.search.PropertyDefinition;
import org.apache.jackrabbit.oak.spi.state.NodeState;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
public class ElasticIndexDefinition extends IndexDefinition {
/**
 * Elasticsearch type identifier. Within this class it is only used as the prefix of the
 * {@code elasticsearch.failOnError} system property consulted by the constructor.
 */
public static final String TYPE_ELASTICSEARCH = "elasticsearch";

// Bulk indexing options. Each name is an optional property of the index definition node; the
// constructor passes the matching *_DEFAULT constant as the fallback to getOptionalValue.
public static final String BULK_ACTIONS = "bulkActions";
public static final int BULK_ACTIONS_DEFAULT = 250;
public static final String BULK_SIZE_BYTES = "bulkSizeBytes";
public static final long BULK_SIZE_BYTES_DEFAULT = 1024 * 1024; // 1MB
public static final String BULK_FLUSH_INTERVAL_MS = "bulkFlushIntervalMs";
public static final long BULK_FLUSH_INTERVAL_MS_DEFAULT = 3000;

// Remote index settings, read by the constructor in the same way as the bulk options above.
public static final String NUMBER_OF_SHARDS = "numberOfShards";
public static final int NUMBER_OF_SHARDS_DEFAULT = 1;
public static final String NUMBER_OF_REPLICAS = "numberOfReplicas";
public static final int NUMBER_OF_REPLICAS_DEFAULT = 1;

// Query options. queryFetchSizes is read as a multi-valued LONG property and narrowed to int[]
// by the constructor.
public static final String QUERY_FETCH_SIZES = "queryFetchSizes";
public static final Long[] QUERY_FETCH_SIZES_DEFAULT = new Long[]{10L, 100L, 1000L};
public static final String QUERY_TIMEOUT_MS = "queryTimeoutMs";
public static final long QUERY_TIMEOUT_MS_DEFAULT = 60000;
public static final String TRACK_TOTAL_HITS = "trackTotalHits";
public static final Integer TRACK_TOTAL_HITS_DEFAULT = 10000;
/**
 * Hidden property for storing the index mapping version.
 * {@link #getMappingVersion()} reads it and falls back to {@code 1.0.0}.
 */
public static final String PROP_INDEX_MAPPING_VERSION = ":mappingVersion";
public static final String DYNAMIC_MAPPING = "dynamicMapping";
// possible values are: true, false, runtime, strict. See https://www.elastic.co/guide/en/elasticsearch/reference/current/dynamic.html
public static final String DYNAMIC_MAPPING_DEFAULT = "true";
// when true, fails indexing in case of bulk failures.
// The fallback used by the constructor is the "elasticsearch.failOnError" system property, which itself
// defaults to FAIL_ON_ERROR_DEFAULT.
public static final String FAIL_ON_ERROR = "failOnError";
public static final boolean FAIL_ON_ERROR_DEFAULT = true;
/**
 * When 0, the index name gets dynamically generated by adding a random suffix to the index name.
 */
public static final String INDEX_NAME_SEED = "indexNameSeed";
public static final long INDEX_NAME_SEED_DEFAULT = 0L;
/**
 * Hidden property for storing a seed value to be used as suffix in remote index name.
 */
public static final String PROP_INDEX_NAME_SEED = ":nameSeed";
/**
 * Hidden property to store similarity tags.
 */
public static final String SIMILARITY_TAGS = ":simTags";
/**
 * Hidden property to handle dynamic tags for fulltext queries.
 */
public static final String DYNAMIC_BOOST_FULLTEXT = ":dynamic-boost-ft";
/**
 * Dynamic properties are fields that are not explicitly defined in the index mapping and are added on the fly when a document is indexed.
 * Examples: aggregations with relative nodes, regex properties (to be supported), etc.
 */
public static final String DYNAMIC_PROPERTIES = ":dynamic-properties";
// Optional boolean properties looked up on the analyzers child node (fallback: false); see
// analyzerConfigSplitOnCaseChange() and analyzerConfigSplitOnNumerics().
public static final String SPLIT_ON_CASE_CHANGE = "splitOnCaseChange";
public static final String SPLIT_ON_NUMERICS = "splitOnNumerics";
// NOTE(review): not referenced anywhere else in this class; presumably names the elastiknn plugin — confirm against usages.
public static final String ELASTIKNN = "elastiknn";
// Similarity tags options, read by the constructor with the *_DEFAULT values below as fallbacks.
private static final String SIMILARITY_TAGS_ENABLED = "similarityTagsEnabled";
private static final boolean SIMILARITY_TAGS_ENABLED_DEFAULT = true;
private static final String SIMILARITY_TAGS_FIELDS = "similarityTagsFields";
// MLT queries, when no fields are specified, do not use the entire document but only a maximum of
// max_query_terms (default 25). Even when this value is increased, the query could still produce results that
// are not very relevant (eg: based on the :fulltext content). To work around this, we can specify the
// DYNAMIC_BOOST_FULLTEXT field with overridden mlt params and increased boost since it usually contains relevant
// terms. This makes sure that the MLT queries give more priority to the terms in this field while the rest (*)
// are considered secondary.
// TODO: we can further improve search relevance by using the actual tags combined with the weights using a function query.
// Right now, we are just matching the tags without looking at the weights. Therefore, a tag can be matched in a field with a lower weight.
private static final String[] SIMILARITY_TAGS_FIELDS_DEFAULT = new String[] {
"mlt.fl=" + DYNAMIC_BOOST_FULLTEXT + "&mlt.mintf=1&mlt.qf=3",
"mlt.fl=*&mlt.mintf=2"
};
private static final String SIMILARITY_TAGS_BOOST = "similarityTagsBoost";
private static final float SIMILARITY_TAGS_BOOST_DEFAULT = 0.5f;
/**
 * Answers whether a property type tag belongs to a type that may be analyzed, i.e. any tag
 * other than those of BINARY, LONG, DOUBLE, DECIMAL, DATE and BOOLEAN.
 */
private static final Function<Integer, Boolean> isAnalyzable;

static {
    final int[] nonAnalyzableTags = {
            Type.BINARY.tag(),
            Type.LONG.tag(),
            Type.DOUBLE.tag(),
            Type.DECIMAL.tag(),
            Type.DATE.tag(),
            Type.BOOLEAN.tag()
    };
    // Arrays.binarySearch only works on sorted input
    Arrays.sort(nonAnalyzableTags);
    // a negative search result means the tag is not in the exclusion list
    isAnalyzable = tag -> Arrays.binarySearch(nonAnalyzableTags, tag) < 0;
}
// Prefix handed to the constructor; together with the index path it determines indexAlias.
private final String indexPrefix;
// Result of ElasticIndexNameHelper.getElasticSafeIndexName(indexPrefix, getIndexPath()), computed once in the constructor.
private final String indexAlias;

// The configuration values below are read from the index definition node in the constructor,
// each one with its *_DEFAULT constant passed as the fallback.
public final int bulkActions;
public final long bulkSizeBytes;
public final long bulkFlushIntervalMs;
private final boolean similarityTagsEnabled;
private final float similarityTagsBoost;
public final int numberOfShards;
public final int numberOfReplicas;
// Narrowed from the configured Long values with Long::intValue.
// NOTE(review): the array is public and mutable; callers should treat it as read-only.
public final int[] queryFetchSizes;
public final long queryTimeoutMs;
public final Integer trackTotalHits;
public final String dynamicMapping;
// Fallback comes from the "elasticsearch.failOnError" system property (itself defaulting to FAIL_ON_ERROR_DEFAULT).
public final boolean failOnError;
public final long indexNameSeed;

// Property definitions with index == true from all defined rules (regular properties plus function
// restrictions), grouped by function when one is set and by name otherwise.
private final Map<String, List<PropertyDefinition>> propertiesByName;
// Name-pattern properties of all defined rules that have dynamicBoost set.
private final List<PropertyDefinition> dynamicBoostProperties;
// Concatenation of getSimilarityProperties() over all defined rules.
private final List<PropertyDefinition> similarityProperties;
// Entries of propertiesByName that have similarityTags set.
private final List<PropertyDefinition> similarityTagsProperties;
// Multi-valued string option read from the definition; fallback is SIMILARITY_TAGS_FIELDS_DEFAULT.
private final String[] similarityTagsFields;
public ElasticIndexDefinition(NodeState root, NodeState defn, String indexPath, String indexPrefix) {
super(root, defn, determineIndexFormatVersion(defn), determineUniqueId(defn), indexPath);
this.indexPrefix = indexPrefix;
this.indexAlias = ElasticIndexNameHelper.getElasticSafeIndexName(indexPrefix, getIndexPath());
this.bulkActions = getOptionalValue(defn, BULK_ACTIONS, BULK_ACTIONS_DEFAULT);
this.bulkSizeBytes = getOptionalValue(defn, BULK_SIZE_BYTES, BULK_SIZE_BYTES_DEFAULT);
this.bulkFlushIntervalMs = getOptionalValue(defn, BULK_FLUSH_INTERVAL_MS, BULK_FLUSH_INTERVAL_MS_DEFAULT);
this.numberOfShards = getOptionalValue(defn, NUMBER_OF_SHARDS, NUMBER_OF_SHARDS_DEFAULT);
this.numberOfReplicas = getOptionalValue(defn, NUMBER_OF_REPLICAS, NUMBER_OF_REPLICAS_DEFAULT);
this.similarityTagsEnabled = getOptionalValue(defn, SIMILARITY_TAGS_ENABLED, SIMILARITY_TAGS_ENABLED_DEFAULT);
this.similarityTagsBoost = getOptionalValue(defn, SIMILARITY_TAGS_BOOST, SIMILARITY_TAGS_BOOST_DEFAULT);
this.queryFetchSizes = Arrays.stream(getOptionalValues(defn, QUERY_FETCH_SIZES, Type.LONGS, Long.class, QUERY_FETCH_SIZES_DEFAULT))
.mapToInt(Long::intValue).toArray();
this.queryTimeoutMs = getOptionalValue(defn, QUERY_TIMEOUT_MS, QUERY_TIMEOUT_MS_DEFAULT);
this.trackTotalHits = getOptionalValue(defn, TRACK_TOTAL_HITS, TRACK_TOTAL_HITS_DEFAULT);
this.dynamicMapping = getOptionalValue(defn, DYNAMIC_MAPPING, DYNAMIC_MAPPING_DEFAULT);
this.failOnError = getOptionalValue(defn, FAIL_ON_ERROR,
Boolean.parseBoolean(System.getProperty(TYPE_ELASTICSEARCH + "." + FAIL_ON_ERROR, Boolean.toString(FAIL_ON_ERROR_DEFAULT)))
);
this.indexNameSeed = getOptionalValue(defn, INDEX_NAME_SEED, INDEX_NAME_SEED_DEFAULT);
this.similarityTagsFields = getOptionalValues(defn, SIMILARITY_TAGS_FIELDS, Type.STRINGS, String.class, SIMILARITY_TAGS_FIELDS_DEFAULT);
this.propertiesByName = getDefinedRules()
.stream()
.flatMap(rule -> Stream.concat(StreamSupport.stream(rule.getProperties().spliterator(), false),
rule.getFunctionRestrictions().stream()))
.filter(pd -> pd.index) // keep only properties that can be indexed
.collect(Collectors.groupingBy(pd -> {
if (pd.function != null) {
return pd.function;
} else {
return pd.name;
}
}));
this.dynamicBoostProperties = getDefinedRules()
.stream()
.flatMap(IndexingRule::getNamePatternsProperties)
.filter(pd -> pd.dynamicBoost)
.collect(Collectors.toList());
this.similarityProperties = getDefinedRules()
.stream()
.flatMap(rule -> rule.getSimilarityProperties().stream())
.collect(Collectors.toList());
this.similarityTagsProperties = propertiesByName.values().stream()
.flatMap(List::stream)
.filter(pd -> pd.similarityTags).collect(Collectors.toList());
}
/**
 * Looks up the {@link FulltextIndexConstants#ANALYZERS} child of the index definition node.
 *
 * @return the node state returned by {@code getChildNode} for the analyzers child
 */
@Nullable
public NodeState getAnalyzersNodeState() {
    final NodeState analyzers = definition.getChildNode(FulltextIndexConstants.ANALYZERS);
    return analyzers;
}
/**
 * @return the index prefix this definition was created with
 */
public String getIndexPrefix() {
    return this.indexPrefix;
}
/**
 * Gives the alias of this index on the Elasticsearch cluster. Query related operations should
 * always go through this alias; the actual index name is used only while a reindex is in progress.
 *
 * @return the Elasticsearch index alias
 */
public String getIndexAlias() {
    return this.indexAlias;
}
/**
 * @return the indexable property definitions, grouped by function when one is set and by
 *         property name otherwise
 */
public Map<String, List<PropertyDefinition>> getPropertiesByName() {
    return this.propertiesByName;
}
/**
 * @return the name-pattern property definitions that have dynamic boost set
 */
public List<PropertyDefinition> getDynamicBoostProperties() {
    return this.dynamicBoostProperties;
}
/**
 * @return the similarity property definitions collected from all defined indexing rules
 */
public List<PropertyDefinition> getSimilarityProperties() {
    return this.similarityProperties;
}
/**
 * @return the indexable property definitions that have similarity tags set
 */
public List<PropertyDefinition> getSimilarityTagsProperties() {
    return this.similarityTagsProperties;
}
/**
 * Note that the internal array itself is returned, not a copy; callers should not modify it.
 *
 * @return the configured similarity tags fields
 */
public String[] getSimilarityTagsFields() {
    return this.similarityTagsFields;
}
/**
 * @return the configured value of the similarity tags enabled flag
 */
public boolean areSimilarityTagsEnabled() {
    return this.similarityTagsEnabled;
}
/**
 * @return the configured similarity tags boost
 */
public float getSimilarityTagsBoost() {
    return this.similarityTagsBoost;
}
/**
 * Resolves the Elasticsearch keyword field name for a property of the index rules.
 * <p>
 * The {@code .keyword} sub-field is used when the property has no explicit definition, or when
 * its type can be analyzed and at least one of its definitions is analyzed. In every other case
 * the property name itself is the keyword field.
 *
 * @param propertyName the property name in the index rules
 * @return the field name identifier in Elasticsearch
 */
public String getElasticKeyword(String propertyName) {
    final String keywordSubField = propertyName + ".keyword";

    final List<PropertyDefinition> definitions = propertiesByName.get(propertyName);
    if (definitions == null) {
        // no explicit definition (eg: the property was created through a regex): default keyword name
        return keywordSubField;
    }

    // looking at the first definition is enough since all of them share the same type
    final int typeTag = definitions.get(0).getType();
    final boolean hasKeywordSubField = isAnalyzable.apply(typeTag) && isAnalyzed(definitions);
    return hasKeywordSubField ? keywordSubField : propertyName;
}
/**
 * Checks whether any of the given property definitions is analyzed.
 *
 * @param propertyDefinitions the definitions to inspect
 * @return {@code true} as soon as one analyzed definition is found, {@code false} otherwise
 *         (including for an empty list)
 */
public boolean isAnalyzed(List<PropertyDefinition> propertyDefinitions) {
    for (PropertyDefinition candidate : propertyDefinitions) {
        if (candidate.analyzed) {
            return true;
        }
    }
    return false;
}
/**
 * The returned name is unrelated to the lucene index: while parsing queries, spellCheck queries
 * are handled via PropertyRestriction having native*lucene as key.
 *
 * @return the default function name
 */
@Override
protected String getDefaultFunctionName() {
    final String defaultFunctionName = "lucene";
    return defaultFunctionName;
}
/**
 * Reads {@link FulltextIndexConstants#INDEX_ORIGINAL_TERM} from the analyzers child node,
 * with {@code false} as the fallback.
 *
 * @return {@code true} if original terms need to be preserved at indexing analysis phase
 */
public boolean analyzerConfigIndexOriginalTerms() {
    return getOptionalValue(
            definition.getChildNode(FulltextIndexConstants.ANALYZERS),
            FulltextIndexConstants.INDEX_ORIGINAL_TERM,
            false);
}
/**
 * Reads {@link #SPLIT_ON_CASE_CHANGE} from the analyzers child node, with {@code false} as the fallback.
 *
 * @return the configured split-on-case-change flag
 */
public boolean analyzerConfigSplitOnCaseChange() {
    return getOptionalValue(
            definition.getChildNode(FulltextIndexConstants.ANALYZERS),
            SPLIT_ON_CASE_CHANGE,
            false);
}
/**
 * Reads {@link #SPLIT_ON_NUMERICS} from the analyzers child node, with {@code false} as the fallback.
 *
 * @return the configured split-on-numerics flag
 */
public boolean analyzerConfigSplitOnNumerics() {
    return getOptionalValue(
            definition.getChildNode(FulltextIndexConstants.ANALYZERS),
            SPLIT_ON_NUMERICS,
            false);
}
/**
 * Gives the mapping version stored in the hidden {@link #PROP_INDEX_MAPPING_VERSION} property
 * of this index definition.
 *
 * @return the mapping version, or {@code 1.0.0} when none is specified
 */
public String getMappingVersion() {
    final String fallbackVersion = "1.0.0";
    return getOptionalValue(definition, PROP_INDEX_MAPPING_VERSION, fallbackVersion);
}
/**
 * Creates the Elasticsearch flavour of a property definition.
 *
 * @param rule      the indexing rule the property belongs to
 * @param name      the property name
 * @param nodeState the node state holding the property configuration
 * @return a new {@link ElasticPropertyDefinition}
 */
@Override
protected PropertyDefinition createPropertyDefinition(IndexDefinition.IndexingRule rule, String name, NodeState nodeState) {
    final PropertyDefinition elasticDefinition = new ElasticPropertyDefinition(rule, name, nodeState);
    return elasticDefinition;
}
/**
 * Helper for creating {@link ElasticIndexDefinition} instances.
 * The built object represents the index definition only, without the node structure.
 */
public static class Builder extends IndexDefinition.Builder {

    /** Prefix handed over to every {@link ElasticIndexDefinition} this builder creates. */
    private final String indexPrefix;

    /**
     * @param indexPrefix the index prefix passed on to the created definitions
     */
    public Builder(@NotNull String indexPrefix) {
        this.indexPrefix = indexPrefix;
    }

    /**
     * Narrows the return type of the parent {@code build()}.
     *
     * @return the result of the parent build, cast to {@link ElasticIndexDefinition}
     */
    @Override
    public ElasticIndexDefinition build() {
        final ElasticIndexDefinition built = (ElasticIndexDefinition) super.build();
        return built;
    }

    /**
     * Delegates to the parent {@code reindex()} and narrows the return type for call chaining.
     *
     * @return this builder
     */
    @Override
    public Builder reindex() {
        super.reindex();
        return this;
    }

    /**
     * Instantiates the Elasticsearch definition for the given state, using this builder's
     * {@code root}, {@code indexPath} and index prefix.
     *
     * @param indexDefnStateToUse the index definition node state to build from
     * @return a new {@link ElasticIndexDefinition}
     */
    @Override
    protected IndexDefinition createInstance(NodeState indexDefnStateToUse) {
        final IndexDefinition instance =
                new ElasticIndexDefinition(root, indexDefnStateToUse, indexPath, indexPrefix);
        return instance;
    }
}
}