blob: 704556ea744f1093693c6d93b1a96b261f29131f [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.designer;
import java.io.IOException;
import java.math.RoundingMode;
import java.text.NumberFormat;
import java.text.ParsePosition;
import java.time.ZoneId;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeFormatterBuilder;
import java.time.format.ResolverStyle;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.google.common.collect.Sets;
import org.apache.commons.lang3.LocaleUtils;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.ManagedIndexSchema;
import org.apache.solr.schema.NumberType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.TextField;
import org.apache.solr.update.processor.ParseBooleanFieldUpdateProcessorFactory;
import org.apache.solr.update.processor.ParseDateFieldUpdateProcessorFactory;
import org.apache.solr.update.processor.ParseDoubleFieldUpdateProcessorFactory;
import org.apache.solr.update.processor.ParseLongFieldUpdateProcessorFactory;
import static org.apache.solr.common.params.CommonParams.VERSION_FIELD;
import static org.apache.solr.update.processor.ParseDateFieldUpdateProcessorFactory.validateFormatter;
// Just a quick hack to flesh out the design; more intelligence is needed
/**
 * Default {@link SchemaSuggester} that inspects sample values to guess a field type (boolean,
 * int/long, double, date, text or string) and a basic set of field properties.
 *
 * <p>Boolean and date parsing are configured via {@link #init(NamedList)}. Until date formatters
 * have been initialized there, {@link #isDateTime(List)} never reports a match.
 */
public class DefaultSchemaSuggester implements SchemaSuggester {

  /** Date/time patterns used when no {@code format} init arg is supplied. */
  private static final List<String> DEFAULT_DATE_TIME_PATTERNS =
      Arrays.asList(
          "yyyy-MM-dd['T'[HH:mm[:ss[.SSS]][z",
          "yyyy-MM-dd['T'[HH:mm[:ss[,SSS]][z",
          "yyyy-MM-dd HH:mm[:ss[.SSS]][z",
          "yyyy-MM-dd HH:mm[:ss[,SSS]][z",
          "[EEE, ]dd MMM yyyy HH:mm[:ss] z",
          "EEEE, dd-MMM-yy HH:mm:ss z",
          "EEE MMM ppd HH:mm:ss [z ]yyyy");

  // names of the init args read by init(NamedList)
  private static final String FORMATS_PARAM = "format";
  private static final String DEFAULT_TIME_ZONE_PARAM = "defaultTimeZone";
  private static final String LOCALE_PARAM = "locale";
  private static final String TRUE_VALUES_PARAM = "trueValue";
  private static final String FALSE_VALUES_PARAM = "falseValue";
  private static final String CASE_SENSITIVE_PARAM = "caseSensitive";

  private static final String TYPE_CHANGE_ERROR =
      "Failed to parse all sample values as %s for changing type for field %s to %s";

  /**
   * Exclusive bound on the magnitude of sample values for which {@code pint} is suggested instead
   * of {@code plong}.
   */
  private static final long SMALL_INT_THRESHOLD = 10000L;

  // boolean parsing
  private final Set<String> trueValues = new HashSet<>(Arrays.asList("true"));
  private final Set<String> falseValues = new HashSet<>(Arrays.asList("false"));
  private final List<DateTimeFormatter> dateTimeFormatters = new LinkedList<>();
  private boolean caseSensitive = false;

  /**
   * Validates that the sample values of a field are compatible with the requested type. Only
   * target types that report a {@link NumberType} are checked; any other target type is accepted
   * without inspection.
   *
   * @param field the field whose type is being changed
   * @param toType the requested new type
   * @param docs sample documents providing values for the field
   * @throws SolrException (BAD_REQUEST) if a sample value cannot be parsed as the requested type
   */
  @Override
  public void validateTypeChange(SchemaField field, FieldType toType, List<SolrInputDocument> docs) throws IOException {
    final NumberType toNumType = toType.getNumberType();
    if (toNumType != null) {
      validateNumericTypeChange(field, toType, docs, toNumType);
    }
  }

  /**
   * Checks that every non-null sample value of {@code field} parses as the numeric (or date) kind
   * identified by {@code toNumType}.
   *
   * @throws SolrException (BAD_REQUEST) if at least one sample value fails to parse
   */
  protected void validateNumericTypeChange(SchemaField field, FieldType toType, List<SolrInputDocument> docs, final NumberType toNumType) {
    // desired type is numeric, make sure all the sample values are numbers;
    // nulls are dropped both before and after flattening so that a null element inside a
    // multi-valued sample is ignored here just as it is in guessFieldType
    List<Object> fieldValues = docs.stream()
        .map(d -> d.getFieldValue(field.getName()))
        .filter(Objects::nonNull)
        .flatMap(c -> (c instanceof Collection) ? ((Collection<?>) c).stream() : Stream.of(c))
        .filter(Objects::nonNull)
        .collect(Collectors.toList());

    final boolean allParsed;
    switch (toNumType) {
      case DOUBLE:
      case FLOAT:
        allParsed = isFloatOrDouble(fieldValues, Locale.ROOT) != null;
        break;
      case LONG:
      case INTEGER:
        allParsed = isIntOrLong(fieldValues, Locale.ROOT) != null;
        break;
      case DATE:
        allParsed = isDateTime(fieldValues);
        break;
      default:
        // no validation for any other number type
        allParsed = true;
        break;
    }

    if (!allParsed) {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
          String.format(Locale.ROOT, TYPE_CHANGE_ERROR, toNumType.name(), field.getName(), toType.getTypeName()));
    }
  }

  /**
   * Suggests a field definition for {@code fieldName}. If the name already matches a dynamic field
   * in the schema, that field is returned; otherwise a type and properties are guessed from the
   * sample values.
   *
   * @param fieldName name of the field to suggest
   * @param sampleValues one entry per sample document; an entry that is a {@link Collection}
   *     marks the field as multi-valued
   * @param schema schema used to resolve field types and create the field
   * @param langs currently unused
   * @return the suggested field
   * @throws IllegalStateException if the guessed field type does not exist in the schema
   */
  @Override
  public Optional<SchemaField> suggestField(String fieldName, List<Object> sampleValues, IndexSchema schema, List<String> langs) {
    // start by looking at the fieldName and seeing if there is a dynamic field in the schema that already applies
    if (schema.isDynamicField(fieldName)) {
      return Optional.of(schema.getFieldOrNull(fieldName));
    }

    // TODO: use passed in langs
    Locale locale = Locale.ROOT;

    boolean isMV = isMultiValued(sampleValues);
    String fieldTypeName = guessFieldType(fieldName, sampleValues, schema, isMV, locale);
    FieldType fieldType = schema.getFieldTypeByName(fieldTypeName);
    if (fieldType == null) {
      // TODO: construct this field type on-the-fly ...
      throw new IllegalStateException("FieldType '" + fieldTypeName + "' not found in the schema!");
    }

    Map<String, String> fieldProps = guessFieldProps(fieldName, fieldType, sampleValues, isMV, schema);
    SchemaField schemaField = schema.newField(fieldName, fieldTypeName, fieldProps);
    return Optional.of(schemaField);
  }

  /**
   * Adjusts an existing field so it can hold the sample data. Currently the only adjustment is
   * promoting a single-valued field to multi-valued when the samples contain multiple values.
   *
   * @return the schema with the field replaced, or the given schema if no change was needed
   */
  @Override
  public ManagedIndexSchema adaptExistingFieldToData(SchemaField schemaField, List<Object> sampleValues, ManagedIndexSchema schema) {
    // Promote a single-valued to multi-valued if needed
    if (!schemaField.multiValued() && isMultiValued(sampleValues)) {
      // this existing field needs to be promoted to multi-valued
      SimpleOrderedMap<Object> fieldProps = schemaField.getNamedPropertyValues(false);
      fieldProps.add("multiValued", true);
      // name and type are passed to replaceField separately
      fieldProps.remove("name");
      fieldProps.remove("type");
      schema = schema.replaceField(schemaField.getName(), schemaField.getType(), fieldProps.asShallowMap());
    }
    // TODO: other "healing" type operations here ... but we have to be careful about overriding explicit user changes
    // such as a user making a text field a string field, we wouldn't want to revert that field back to text
    return schema;
  }

  /**
   * Groups the values of the given documents by field name, skipping the version field.
   *
   * @return a map from field name to one entry per document that had values for the field; a
   *     document with exactly one value contributes the value itself, a document with several
   *     values contributes the whole collection
   */
  @Override
  public Map<String, List<Object>> transposeDocs(List<SolrInputDocument> docs) {
    Map<String, List<Object>> mapByField = new HashMap<>();
    docs.forEach(doc -> doc.getFieldNames().forEach(f -> {
      // skip the version field on incoming docs
      if (!VERSION_FIELD.equals(f)) {
        List<Object> values = mapByField.computeIfAbsent(f, k -> new LinkedList<>());
        Collection<Object> fieldValues = doc.getFieldValues(f);
        if (fieldValues != null && !fieldValues.isEmpty()) {
          if (fieldValues.size() == 1) {
            // flatten so every field doesn't end up multi-valued
            values.add(fieldValues.iterator().next());
          } else {
            // truly multi-valued
            values.add(fieldValues);
          }
        }
      }
    }));
    return mapByField;
  }

  /**
   * Guesses the name of a field type from the sample values. Checks are tried in this order:
   * boolean, int/long, double, date, text; if none applies the result is a string type.
   *
   * @param isMV whether the multi-valued variant of the type name should be returned
   * @param locale locale used for number parsing and for choosing the text type
   * @return the guessed field type name, never null
   */
  protected String guessFieldType(String fieldName, final List<Object> sampleValues, IndexSchema schema, boolean isMV, Locale locale) {
    // flatten values to a single stream for easier analysis; also remove nulls
    List<Object> flattened = sampleValues.stream()
        .flatMap(c -> (c instanceof Collection) ? ((Collection<?>) c).stream() : Stream.of(c))
        .filter(Objects::nonNull)
        .collect(Collectors.toList());

    String type = null;
    if (isBoolean(flattened)) {
      type = isMV ? "booleans" : "boolean";
    } else {
      // integers first, then fall back to floating point
      String numericType = isIntOrLong(flattened, locale);
      if (numericType == null) {
        numericType = isFloatOrDouble(flattened, locale);
      }
      if (numericType != null) {
        type = isMV ? numericType + "s" : numericType;
      }
    }

    if (type == null) {
      if (isDateTime(flattened)) {
        type = isMV ? "pdates" : "pdate";
      } else if (isText(flattened)) {
        type = "en".equals(locale.getLanguage()) ? "text_en" : "text_general";
      }
    }

    // if we get here and haven't made a decision, it's a string
    if (type == null) {
      type = isMV ? "strings" : "string";
    }

    return type;
  }

  /**
   * Decides whether the values look like free text rather than short string tokens.
   *
   * @return {@code true} only if all values are strings and either one is longer than 60
   *     characters, one has more than 12 whitespace-separated terms, or there are at least 10
   *     values with more than 4 terms in the longest one and over 90% of them distinct;
   *     {@code false} for a null or empty list
   */
  protected boolean isText(List<Object> values) {
    if (values == null || values.isEmpty()) {
      return false;
    }

    int maxLength = -1;
    int maxTerms = -1;
    for (Object next : values) {
      if (!(next instanceof String)) {
        return false;
      }

      String cs = (String) next;
      maxLength = Math.max(maxLength, cs.length());
      maxTerms = Math.max(maxTerms, cs.split("\\s+").length);
    }

    // don't want to choose text for fields where string will do
    // if most of the sample values are unique but only a few terms, then it's likely a text field
    return (maxLength > 60 || maxTerms > 12 || (maxTerms > 4 && values.size() >= 10 && ((float) Sets.newHashSet(values).size() / values.size()) > 0.9f));
  }

  /**
   * Checks whether all values can be parsed as doubles using a number format for the locale.
   *
   * @return {@code "pdouble"} if every value parses (including when the list is empty), otherwise
   *     {@code null}
   */
  protected String isFloatOrDouble(List<Object> values, Locale locale) {
    NumberFormat format = NumberFormat.getInstance(locale);
    format.setParseIntegerOnly(false);
    format.setRoundingMode(RoundingMode.CEILING);
    for (Object next : values) {
      Object parsed = ParseDoubleFieldUpdateProcessorFactory.parsePossibleDouble(next, format);
      if (parsed == null) {
        // not a double ...
        return null;
      }
    }
    // An earlier attempt picked pfloat when double precision was not needed, but
    // ParseDoubleFieldUpdateProcessorFactory doesn't work with pfloat, so there would be no
    // locale sensitive parsing in the URP chain; hence always pdouble.
    return "pdouble";
  }

  /**
   * Checks whether all values parse as booleans using the configured true/false values and case
   * sensitivity.
   *
   * @return {@code true} if no value fails to parse (including when the list is empty)
   */
  protected boolean isBoolean(List<Object> values) {
    for (Object next : values) {
      Object parsed = ParseBooleanFieldUpdateProcessorFactory.parsePossibleBoolean(next, caseSensitive, trueValues, falseValues);
      if (parsed == null) {
        return false;
      }
    }
    // all values are booleans
    return true;
  }

  /**
   * Checks whether all values can be parsed as integral numbers using a number format for the
   * locale, and picks the narrowest suitable type.
   *
   * @return {@code null} if any value fails to parse; {@code "pint"} if every parsed value lies
   *     strictly between -10000 and 10000 (including when the list is empty); otherwise
   *     {@code "plong"}
   */
  protected String isIntOrLong(List<Object> values, Locale locale) {
    NumberFormat format = NumberFormat.getInstance(locale);
    format.setParseIntegerOnly(true);
    long minLong = Long.MAX_VALUE;
    long maxLong = Long.MIN_VALUE;
    for (Object next : values) {
      Object parsed = ParseLongFieldUpdateProcessorFactory.parsePossibleLong(next, format);
      if (parsed == null) {
        // not a long ...
        return null;
      }
      long parsedLong = ((Number) parsed).longValue();
      minLong = Math.min(minLong, parsedLong);
      maxLong = Math.max(maxLong, parsedLong);
    }

    // if all values are within some smallish magnitude, then it's likely this field holds small numbers
    // but be very conservative here as it's simply an optimization and we can always fall back to long;
    // the lower bound matters too: a large negative value must not be suggested as pint
    boolean smallMagnitude = minLong > -SMALL_INT_THRESHOLD && maxLong < SMALL_INT_THRESHOLD;
    return smallMagnitude ? "pint" : "plong";
  }

  /**
   * Checks whether all values parse as dates with the configured formatters.
   *
   * @return {@code false} if no formatters are configured or any value fails to parse; otherwise
   *     {@code true}
   */
  protected boolean isDateTime(List<Object> values) {
    if (dateTimeFormatters.isEmpty()) {
      return false;
    }

    for (Object next : values) {
      Object parsedDate = ParseDateFieldUpdateProcessorFactory.parsePossibleDate(next, dateTimeFormatters, new ParsePosition(0));
      if (parsedDate == null) {
        // not a date value
        return false;
      }
    }
    return true;
  }

  /**
   * Determines whether any of the given documents holds more than one value for the named field.
   *
   * @return {@code true} if the transposed sample values for the field contain a collection
   */
  public boolean isMultiValued(String name, List<SolrInputDocument> docs) {
    Map<String, List<Object>> transposed = transposeDocs(docs);
    List<Object> sampleValues = transposed.get(name);
    return sampleValues != null && isMultiValued(sampleValues);
  }

  /**
   * @return {@code true} if at least one of the sample values is itself a {@link Collection}
   */
  protected boolean isMultiValued(final List<Object> sampleValues) {
    for (Object next : sampleValues) {
      if (next instanceof Collection) {
        return true;
      }
    }
    return false;
  }

  /**
   * Builds the properties for a suggested field: always indexed, multi-valued when the samples
   * require it and the type is not already multi-valued, and docValues enabled unless the type is
   * a {@link TextField} or rejects docValues. Fields without docValues are stored; fields with
   * docValues are not stored and use docValues as stored.
   */
  protected Map<String, String> guessFieldProps(String fieldName, FieldType fieldType, List<Object> sampleValues, boolean isMV, IndexSchema schema) {
    Map<String, String> props = new HashMap<>();
    props.put("indexed", "true");

    if (isMV && !fieldType.isMultiValued()) {
      props.put("multiValued", "true"); // override the mv setting on the type
    }

    boolean docValues = true;
    if (fieldType instanceof TextField) {
      docValues = false;
    } else {
      // Not sure if this field supports docValues, so try creating a SchemaField
      Map<String, String> tmpProps = new HashMap<>(props);
      tmpProps.put("docValues", "true"); // to test if docValues is supported
      try {
        fieldType.checkSchemaField(schema.newField(fieldName, fieldType.getTypeName(), tmpProps));
      } catch (SolrException docValuesNotSupported) {
        // deliberately not rethrown: the failure is the signal that docValues can't be used here
        docValues = false;
      }
    }

    props.put("docValues", String.valueOf(docValues));

    if (!docValues) {
      props.put("stored", "true");
    } else {
      props.put("stored", "false");
      props.put("useDocValuesAsStored", "true");
    }

    return props;
  }

  /**
   * Initializes date/time and boolean parsing from the given init args; the recognized args are
   * removed from {@code args} as they are read.
   */
  @Override
  @SuppressWarnings({"rawtypes"})
  public void init(NamedList args) {
    initDateTimeFormatters(args);
    initBooleanParsing(args);
  }

  /**
   * Builds the date/time formatters from the {@code locale}, {@code defaultTimeZone} and
   * {@code format} init args. Defaults are {@link Locale#US}, UTC and the built-in pattern list.
   * Each formatter is validated and then appended to the formatters used for date detection.
   */
  @SuppressWarnings({"unchecked", "rawtypes"})
  protected void initDateTimeFormatters(NamedList args) {
    Locale locale = Locale.US;
    String localeParam = (String) args.remove(LOCALE_PARAM);
    if (null != localeParam) {
      locale = LocaleUtils.toLocale(localeParam);
    }

    ZoneId defaultTimeZone = ZoneOffset.UTC;
    Object defaultTimeZoneParam = args.remove(DEFAULT_TIME_ZONE_PARAM);
    if (null != defaultTimeZoneParam) {
      defaultTimeZone = ZoneId.of(defaultTimeZoneParam.toString());
    }

    Collection<String> dateTimePatterns = args.removeConfigArgs(FORMATS_PARAM);
    if (dateTimePatterns == null || dateTimePatterns.isEmpty()) {
      dateTimePatterns = DEFAULT_DATE_TIME_PATTERNS;
    }

    for (String pattern : dateTimePatterns) {
      DateTimeFormatter formatter = new DateTimeFormatterBuilder().parseLenient().parseCaseInsensitive()
          .appendPattern(pattern).toFormatter(locale).withResolverStyle(ResolverStyle.LENIENT).withZone(defaultTimeZone);
      validateFormatter(formatter);
      dateTimeFormatters.add(formatter);
    }
  }

  /**
   * Configures boolean parsing from the {@code caseSensitive}, {@code trueValue} and
   * {@code falseValue} init args. Supplied true/false values replace the defaults and are
   * lower-cased unless parsing is case sensitive.
   *
   * @throws SolrException (SERVER_ERROR) if a supplied false value is also a true value
   */
  @SuppressWarnings({"unchecked", "rawtypes"})
  protected void initBooleanParsing(NamedList args) {
    Object caseSensitiveParam = args.remove(CASE_SENSITIVE_PARAM);
    if (null != caseSensitiveParam) {
      if (caseSensitiveParam instanceof Boolean) {
        caseSensitive = (Boolean) caseSensitiveParam;
      } else {
        caseSensitive = Boolean.parseBoolean(caseSensitiveParam.toString());
      }
    }

    Collection<String> trueValuesParam = args.removeConfigArgs(TRUE_VALUES_PARAM);
    if (!trueValuesParam.isEmpty()) {
      trueValues.clear();
      for (String trueVal : trueValuesParam) {
        trueValues.add(caseSensitive ? trueVal : trueVal.toLowerCase(Locale.ROOT));
      }
    }

    Collection<String> falseValuesParam = args.removeConfigArgs(FALSE_VALUES_PARAM);
    if (!falseValuesParam.isEmpty()) {
      falseValues.clear();
      for (String val : falseValuesParam) {
        final String falseVal = caseSensitive ? val : val.toLowerCase(Locale.ROOT);
        if (trueValues.contains(falseVal)) {
          throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
              "Param '" + FALSE_VALUES_PARAM + "' contains a value also in param '" + TRUE_VALUES_PARAM
                  + "': '" + val + "'");
        }
        falseValues.add(falseVal);
      }
    }
  }
}