/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
| package org.apache.jackrabbit.oak.plugins.index.elastic; |
| |
| import static org.apache.jackrabbit.oak.plugins.index.search.util.ConfigUtil.getOptionalValue; |
| import static org.apache.jackrabbit.oak.plugins.index.search.util.ConfigUtil.getOptionalValues; |
| |
| import java.util.Arrays; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.function.Function; |
| import java.util.stream.Collectors; |
| import java.util.stream.Stream; |
| import java.util.stream.StreamSupport; |
| |
| import org.apache.jackrabbit.oak.api.Type; |
| import org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants; |
| import org.apache.jackrabbit.oak.plugins.index.search.IndexDefinition; |
| import org.apache.jackrabbit.oak.plugins.index.search.PropertyDefinition; |
| import org.apache.jackrabbit.oak.spi.state.NodeState; |
| import org.jetbrains.annotations.NotNull; |
| import org.jetbrains.annotations.Nullable; |
| |
| public class ElasticIndexDefinition extends IndexDefinition { |
| |
| public static final String TYPE_ELASTICSEARCH = "elasticsearch"; |
| |
| public static final String BULK_ACTIONS = "bulkActions"; |
| public static final int BULK_ACTIONS_DEFAULT = 250; |
| |
| public static final String BULK_SIZE_BYTES = "bulkSizeBytes"; |
| public static final long BULK_SIZE_BYTES_DEFAULT = 1024 * 1024; // 1MB |
| |
| public static final String BULK_FLUSH_INTERVAL_MS = "bulkFlushIntervalMs"; |
| public static final long BULK_FLUSH_INTERVAL_MS_DEFAULT = 3000; |
| |
| public static final String NUMBER_OF_SHARDS = "numberOfShards"; |
| public static final int NUMBER_OF_SHARDS_DEFAULT = 1; |
| |
| public static final String NUMBER_OF_REPLICAS = "numberOfReplicas"; |
| public static final int NUMBER_OF_REPLICAS_DEFAULT = 1; |
| |
| public static final String QUERY_FETCH_SIZES = "queryFetchSizes"; |
| public static final Long[] QUERY_FETCH_SIZES_DEFAULT = new Long[]{10L, 100L, 1000L}; |
| |
| public static final String QUERY_TIMEOUT_MS = "queryTimeoutMs"; |
| public static final long QUERY_TIMEOUT_MS_DEFAULT = 60000; |
| |
| public static final String TRACK_TOTAL_HITS = "trackTotalHits"; |
| public static final Integer TRACK_TOTAL_HITS_DEFAULT = 10000; |
| |
| /** |
| * Hidden property for storing the index mapping version. |
| */ |
| public static final String PROP_INDEX_MAPPING_VERSION = ":mappingVersion"; |
| |
| public static final String DYNAMIC_MAPPING = "dynamicMapping"; |
| // possible values are: true, false, runtime, strict. See https://www.elastic.co/guide/en/elasticsearch/reference/current/dynamic.html |
| public static final String DYNAMIC_MAPPING_DEFAULT = "true"; |
| |
| // when true, fails indexing in case of bulk failures |
| public static final String FAIL_ON_ERROR = "failOnError"; |
| public static final boolean FAIL_ON_ERROR_DEFAULT = true; |
| |
| /** |
| * When 0, the index name gets dynamically generated by adding a random suffix to the index name. |
| */ |
| public static final String INDEX_NAME_SEED = "indexNameSeed"; |
| public static final long INDEX_NAME_SEED_DEFAULT = 0L; |
| |
| /** |
| * Hidden property for storing a seed value to be used as suffix in remote index name. |
| */ |
| public static final String PROP_INDEX_NAME_SEED = ":nameSeed"; |
| |
| /** |
| * Hidden property to store similarity tags |
| */ |
| public static final String SIMILARITY_TAGS = ":simTags"; |
| |
| /** |
| * Hidden property to handle dynamic tags for fulltext queries |
| */ |
| public static final String DYNAMIC_BOOST_FULLTEXT = ":dynamic-boost-ft"; |
| |
| /** |
| * Dynamic properties are fields that are not explicitly defined in the index mapping and are added on the fly when a document is indexed. |
| * Examples: aggregations with relative nodes, regex properties (to be supported), etc. |
| */ |
| public static final String DYNAMIC_PROPERTIES = ":dynamic-properties"; |
| |
| public static final String SPLIT_ON_CASE_CHANGE = "splitOnCaseChange"; |
| public static final String SPLIT_ON_NUMERICS = "splitOnNumerics"; |
| |
| public static final String ELASTIKNN = "elastiknn"; |
| |
| private static final String SIMILARITY_TAGS_ENABLED = "similarityTagsEnabled"; |
| private static final boolean SIMILARITY_TAGS_ENABLED_DEFAULT = true; |
| |
| private static final String SIMILARITY_TAGS_FIELDS = "similarityTagsFields"; |
| |
| // MLT queries, when no fields are specified, do not use the entire document but only a maximum of |
| // max_query_terms (default 25). Even increasing this value, the query could produce not so relevant |
| // results (eg: based on the :fulltext content). To work this around, we can specify DYNAMIC_BOOST_FULLTEXT |
| // field with overridden mlt params and increased boost since it usually contains relevant terms. This will make sure |
| // that the MLT queries give more priority to the terms in this field while the rest (*) are considered secondary. |
| // TODO: we can further improve search relevance by using the actual tags combined with the weights using a function query. |
| // Right now, we are just matching the tags without looking at the weights. Therefore, a tag can be matched in a field with a lower weight. |
| private static final String[] SIMILARITY_TAGS_FIELDS_DEFAULT = new String[] { |
| "mlt.fl=" + DYNAMIC_BOOST_FULLTEXT + "&mlt.mintf=1&mlt.qf=3", |
| "mlt.fl=*&mlt.mintf=2" |
| }; |
| |
| private static final String SIMILARITY_TAGS_BOOST = "similarityTagsBoost"; |
| private static final float SIMILARITY_TAGS_BOOST_DEFAULT = 0.5f; |
| |
| private static final Function<Integer, Boolean> isAnalyzable; |
| |
| static { |
| int[] NOT_ANALYZED_TYPES = new int[] { |
| Type.BINARY.tag(), Type.LONG.tag(), Type.DOUBLE.tag(), Type.DECIMAL.tag(), Type.DATE.tag(), Type.BOOLEAN.tag() |
| }; |
| Arrays.sort(NOT_ANALYZED_TYPES); // need for binary search |
| isAnalyzable = type -> Arrays.binarySearch(NOT_ANALYZED_TYPES, type) < 0; |
| } |
| |
| private final String indexPrefix; |
| private final String indexAlias; |
| public final int bulkActions; |
| public final long bulkSizeBytes; |
| public final long bulkFlushIntervalMs; |
| private final boolean similarityTagsEnabled; |
| private final float similarityTagsBoost; |
| public final int numberOfShards; |
| public final int numberOfReplicas; |
| public final int[] queryFetchSizes; |
| public final long queryTimeoutMs; |
| public final Integer trackTotalHits; |
| public final String dynamicMapping; |
| public final boolean failOnError; |
| public final long indexNameSeed; |
| |
| private final Map<String, List<PropertyDefinition>> propertiesByName; |
| private final List<PropertyDefinition> dynamicBoostProperties; |
| private final List<PropertyDefinition> similarityProperties; |
| private final List<PropertyDefinition> similarityTagsProperties; |
| private final String[] similarityTagsFields; |
| |
| public ElasticIndexDefinition(NodeState root, NodeState defn, String indexPath, String indexPrefix) { |
| super(root, defn, determineIndexFormatVersion(defn), determineUniqueId(defn), indexPath); |
| this.indexPrefix = indexPrefix; |
| this.indexAlias = ElasticIndexNameHelper.getElasticSafeIndexName(indexPrefix, getIndexPath()); |
| this.bulkActions = getOptionalValue(defn, BULK_ACTIONS, BULK_ACTIONS_DEFAULT); |
| this.bulkSizeBytes = getOptionalValue(defn, BULK_SIZE_BYTES, BULK_SIZE_BYTES_DEFAULT); |
| this.bulkFlushIntervalMs = getOptionalValue(defn, BULK_FLUSH_INTERVAL_MS, BULK_FLUSH_INTERVAL_MS_DEFAULT); |
| this.numberOfShards = getOptionalValue(defn, NUMBER_OF_SHARDS, NUMBER_OF_SHARDS_DEFAULT); |
| this.numberOfReplicas = getOptionalValue(defn, NUMBER_OF_REPLICAS, NUMBER_OF_REPLICAS_DEFAULT); |
| this.similarityTagsEnabled = getOptionalValue(defn, SIMILARITY_TAGS_ENABLED, SIMILARITY_TAGS_ENABLED_DEFAULT); |
| this.similarityTagsBoost = getOptionalValue(defn, SIMILARITY_TAGS_BOOST, SIMILARITY_TAGS_BOOST_DEFAULT); |
| this.queryFetchSizes = Arrays.stream(getOptionalValues(defn, QUERY_FETCH_SIZES, Type.LONGS, Long.class, QUERY_FETCH_SIZES_DEFAULT)) |
| .mapToInt(Long::intValue).toArray(); |
| this.queryTimeoutMs = getOptionalValue(defn, QUERY_TIMEOUT_MS, QUERY_TIMEOUT_MS_DEFAULT); |
| this.trackTotalHits = getOptionalValue(defn, TRACK_TOTAL_HITS, TRACK_TOTAL_HITS_DEFAULT); |
| this.dynamicMapping = getOptionalValue(defn, DYNAMIC_MAPPING, DYNAMIC_MAPPING_DEFAULT); |
| this.failOnError = getOptionalValue(defn, FAIL_ON_ERROR, |
| Boolean.parseBoolean(System.getProperty(TYPE_ELASTICSEARCH + "." + FAIL_ON_ERROR, Boolean.toString(FAIL_ON_ERROR_DEFAULT))) |
| ); |
| this.indexNameSeed = getOptionalValue(defn, INDEX_NAME_SEED, INDEX_NAME_SEED_DEFAULT); |
| this.similarityTagsFields = getOptionalValues(defn, SIMILARITY_TAGS_FIELDS, Type.STRINGS, String.class, SIMILARITY_TAGS_FIELDS_DEFAULT); |
| |
| this.propertiesByName = getDefinedRules() |
| .stream() |
| .flatMap(rule -> Stream.concat(StreamSupport.stream(rule.getProperties().spliterator(), false), |
| rule.getFunctionRestrictions().stream())) |
| .filter(pd -> pd.index) // keep only properties that can be indexed |
| .collect(Collectors.groupingBy(pd -> { |
| if (pd.function != null) { |
| return pd.function; |
| } else { |
| return pd.name; |
| } |
| })); |
| |
| this.dynamicBoostProperties = getDefinedRules() |
| .stream() |
| .flatMap(IndexingRule::getNamePatternsProperties) |
| .filter(pd -> pd.dynamicBoost) |
| .collect(Collectors.toList()); |
| |
| this.similarityProperties = getDefinedRules() |
| .stream() |
| .flatMap(rule -> rule.getSimilarityProperties().stream()) |
| .collect(Collectors.toList()); |
| |
| this.similarityTagsProperties = propertiesByName.values().stream() |
| .flatMap(List::stream) |
| .filter(pd -> pd.similarityTags).collect(Collectors.toList()); |
| } |
| |
| @Nullable |
| public NodeState getAnalyzersNodeState() { |
| return definition.getChildNode(FulltextIndexConstants.ANALYZERS); |
| } |
| |
| public String getIndexPrefix() { |
| return indexPrefix; |
| } |
| |
| /** |
| * Returns the index alias on the Elasticsearch cluster. This alias should be used for any query related operations. |
| * The actual index name is used only when a reindex is in progress. |
| * @return the Elasticsearch index alias |
| */ |
| public String getIndexAlias() { |
| return indexAlias; |
| } |
| |
| public Map<String, List<PropertyDefinition>> getPropertiesByName() { |
| return propertiesByName; |
| } |
| |
| public List<PropertyDefinition> getDynamicBoostProperties() { |
| return dynamicBoostProperties; |
| } |
| |
| public List<PropertyDefinition> getSimilarityProperties() { |
| return similarityProperties; |
| } |
| |
| public List<PropertyDefinition> getSimilarityTagsProperties() { |
| return similarityTagsProperties; |
| } |
| |
| public String[] getSimilarityTagsFields() { |
| return similarityTagsFields; |
| } |
| |
| public boolean areSimilarityTagsEnabled() { |
| return similarityTagsEnabled; |
| } |
| |
| public float getSimilarityTagsBoost() { |
| return similarityTagsBoost; |
| } |
| |
| /** |
| * Returns the keyword field name mapped in Elasticsearch for the specified property name. |
| * @param propertyName the property name in the index rules |
| * @return the field name identifier in Elasticsearch |
| */ |
| public String getElasticKeyword(String propertyName) { |
| List<PropertyDefinition> propertyDefinitions = propertiesByName.get(propertyName); |
| if (propertyDefinitions == null) { |
| // if there are no property definitions we return the default keyword name |
| // this can happen for properties that were not explicitly defined (eg: created with a regex) |
| return propertyName + ".keyword"; |
| } |
| |
| String field = propertyName; |
| // it's ok to look at the first property since we are sure they all have the same type |
| int type = propertyDefinitions.get(0).getType(); |
| if (isAnalyzable.apply(type) && isAnalyzed(propertyDefinitions)) { |
| field += ".keyword"; |
| } |
| return field; |
| } |
| |
| public boolean isAnalyzed(List<PropertyDefinition> propertyDefinitions) { |
| return propertyDefinitions.stream().anyMatch(pd -> pd.analyzed); |
| } |
| |
| @Override |
| protected String getDefaultFunctionName() { |
| /* |
| This has nothing to do with lucene index. While parsing queries, spellCheck queries are handled |
| via PropertyRestriction having native*lucene as key. |
| */ |
| return "lucene"; |
| } |
| |
| /** |
| * Returns {@code true} if original terms need to be preserved at indexing analysis phase |
| */ |
| public boolean analyzerConfigIndexOriginalTerms() { |
| NodeState analyzersTree = definition.getChildNode(FulltextIndexConstants.ANALYZERS); |
| return getOptionalValue(analyzersTree, FulltextIndexConstants.INDEX_ORIGINAL_TERM, false); |
| } |
| |
| public boolean analyzerConfigSplitOnCaseChange() { |
| NodeState analyzersTree = definition.getChildNode(FulltextIndexConstants.ANALYZERS); |
| return getOptionalValue(analyzersTree, SPLIT_ON_CASE_CHANGE, false); |
| } |
| |
| public boolean analyzerConfigSplitOnNumerics() { |
| NodeState analyzersTree = definition.getChildNode(FulltextIndexConstants.ANALYZERS); |
| return getOptionalValue(analyzersTree, SPLIT_ON_NUMERICS, false); |
| } |
| |
| /** |
| * Returns the mapping version for this index definition. |
| * If the version is not specified, the default value is {@code 1.0.0}. |
| */ |
| public String getMappingVersion() { |
| return getOptionalValue(definition, PROP_INDEX_MAPPING_VERSION, "1.0.0"); |
| } |
| |
| @Override |
| protected PropertyDefinition createPropertyDefinition(IndexDefinition.IndexingRule rule, String name, NodeState nodeState) { |
| return new ElasticPropertyDefinition(rule, name, nodeState); |
| } |
| |
| /** |
| * Class to help with {@link ElasticIndexDefinition} creation. |
| * The built object represents the index definition only without the node structure. |
| */ |
| public static class Builder extends IndexDefinition.Builder { |
| |
| private final String indexPrefix; |
| |
| public Builder(@NotNull String indexPrefix) { |
| this.indexPrefix = indexPrefix; |
| } |
| |
| @Override |
| public ElasticIndexDefinition build() { |
| return (ElasticIndexDefinition) super.build(); |
| } |
| |
| @Override |
| public Builder reindex() { |
| super.reindex(); |
| return this; |
| } |
| |
| @Override |
| protected IndexDefinition createInstance(NodeState indexDefnStateToUse) { |
| return new ElasticIndexDefinition(root, indexDefnStateToUse, indexPath, indexPrefix); |
| } |
| } |
| } |