blob: 0483bb0a59f3ed2067c9a3d6dae66f324645558a [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.component;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.IdentityHashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.queries.function.FunctionQuery;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.queries.function.valuesource.FieldCacheSource;
import org.apache.lucene.queries.function.valuesource.QueryValueSource;
import org.apache.lucene.search.Query;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.StatsParams;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.request.DocValuesStats;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.NumberType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.QParser;
import org.apache.solr.search.QParserPlugin;
import org.apache.solr.search.QueryParsing;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.SyntaxError;
import org.apache.solr.util.hll.HLL;
import org.apache.solr.util.hll.HLLType;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
/**
* Models all of the information associated with a single {@link StatsParams#STATS_FIELD}
* instance.
*
* @see StatsComponent
*/
public class StatsField {
/**
* An enumeration representing the sumer set of all possible stat values that can be computed.
* Each of these enum values can be specified as a local param in a <code>stats.field</code>
* (eg: <code>stats.field={!min=true mean=true}my_field_name</code>) but not all enum values
* are valid for all field types (eg: <code>mean</code> is meaningless for String fields)
*
* @lucene.internal
* @lucene.experimental
*/
public static enum Stat {
min(true),
max(true),
missing(true),
sum(true),
count(true),
mean(false, sum, count),
sumOfSquares(true),
stddev(false, sum, count, sumOfSquares),
distinctValues(true),
countDistinct(false, distinctValues),
percentiles(true){
/** special for percentiles **/
boolean parseParams(StatsField sf) {
String percentileParas = sf.localParams.get(this.name());
if (percentileParas != null) {
List<Double> percentiles = new ArrayList<Double>();
try {
for (String percentile : StrUtils.splitSmart(percentileParas, ',')) {
percentiles.add(Double.parseDouble(percentile));
}
if (!percentiles.isEmpty()) {
sf.percentilesList.addAll(percentiles);
sf.tdigestCompression = sf.localParams.getDouble("tdigestCompression",
sf.tdigestCompression);
return true;
}
} catch (NumberFormatException e) {
throw new SolrException(ErrorCode.BAD_REQUEST, "Unable to parse "
+ StatsParams.STATS_FIELD + " local params: " + sf.localParams + " due to: "
+ e.getMessage(), e);
}
}
return false;
}
},
cardinality(true) {
/** special for percentiles **/
boolean parseParams(StatsField sf) {
try {
sf.hllOpts = HllOptions.parseHllOptions(sf.localParams, sf.schemaField);
return (null != sf.hllOpts);
} catch (Exception e) {
throw new SolrException(ErrorCode.BAD_REQUEST, "Unable to parse "
+ StatsParams.STATS_FIELD + " local params: " + sf.localParams + " due to: "
+ e.getMessage(), e);
}
}
};
private final List<Stat> distribDeps;
/**
* Sole constructor for Stat enum values
* @param deps the set of stat values, other then this one, which are a distributed
* dependency and must be computed and returned by each individual shards in
* order to compute <i>this</i> stat over the entire distributed result set.
* @param selfDep indicates that when computing this stat across a distributed result
* set, each shard must compute this stat <i>in addition to</i> any other
* distributed dependencies.
* @see #getDistribDeps
*/
Stat(boolean selfDep, Stat... deps) {
distribDeps = new ArrayList<Stat>(deps.length+1);
distribDeps.addAll(Arrays.asList(deps));
if (selfDep) {
distribDeps.add(this);
}
}
/**
* Given a String, returns the corresponding Stat enum value if any, otherwise returns null.
*/
public static Stat forName(String paramKey) {
try {
return Stat.valueOf(paramKey);
} catch (IllegalArgumentException e) {
return null;
}
}
/**
* The stats that must be computed and returned by each shard involved in a distributed
* request in order to compute the overall value for this stat across the entire distributed
* result set. A Stat instance may include itself in the <code>getDistribDeps()</code> result,
* but that is not always the case.
*/
public EnumSet<Stat> getDistribDeps() {
return EnumSet.copyOf(this.distribDeps);
}
/**
* Called when the name of a stat is found as a local param on this {@link StatsField}
* @return true if the user is requesting this stat, else false
*/
boolean parseParams(StatsField sf) {
return sf.localParams.getBool(this.name(), false);
}
}
/**
* the equivalent stats if "calcdistinct" is specified
* @see Stat#countDistinct
* @see Stat#distinctValues
*/
private static final EnumSet<Stat> CALCDISTINCT_PSUEDO_STAT = EnumSet.of(Stat.countDistinct, Stat.distinctValues);
/**
* The set of stats computed by default when no localparams are used to specify explicit stats
*/
public final static Set<Stat> DEFAULT_STATS = Collections.<Stat>unmodifiableSet
(EnumSet.of(Stat.min, Stat.max, Stat.missing, Stat.sum, Stat.count, Stat.mean, Stat.sumOfSquares, Stat.stddev));
private final SolrIndexSearcher searcher;
private final ResponseBuilder rb;
private final String originalParam; // for error messages
private final SolrParams localParams;
private final ValueSource valueSource; // may be null if simple field stats
private final SchemaField schemaField; // may be null if function/query stats
private final String key;
private final boolean topLevelCalcDistinct;
private final String[] facets;
private final List<String> tagList;
private final List<String> excludeTagList;
private final EnumSet<Stat> statsToCalculate = EnumSet.noneOf(Stat.class);
private final EnumSet<Stat> statsInResponse = EnumSet.noneOf(Stat.class);
private final List<Double> percentilesList= new ArrayList<Double>();
private final boolean isShard;
private double tdigestCompression = 100.0D;
private HllOptions hllOpts;
/**
* @param rb the current request/response
* @param statsParam the raw {@link StatsParams#STATS_FIELD} string
*/
public StatsField(ResponseBuilder rb, String statsParam) {
this.rb = rb;
this.searcher = rb.req.getSearcher();
this.originalParam = statsParam;
SolrParams params = rb.req.getParams();
try {
isShard = params.getBool("isShard", false);
SolrParams localParams = QueryParsing.getLocalParams(originalParam, params);
if (null == localParams) {
// simplest possible input: bare string (field name)
ModifiableSolrParams customParams = new ModifiableSolrParams();
customParams.add(QueryParsing.V, originalParam);
localParams = customParams;
}
this.localParams = localParams;
String parserName = localParams.get(QueryParsing.TYPE);
SchemaField sf = null;
ValueSource vs = null;
if ( StringUtils.isBlank(parserName) ) {
// basic request for field stats
sf = searcher.getSchema().getField(localParams.get(QueryParsing.V));
} else {
// we have a non trivial request to compute stats over a query (or function)
// NOTE we could use QParser.getParser(...) here, but that would redundantly
// reparse everything. ( TODO: refactor a common method in QParser ?)
QParserPlugin qplug = rb.req.getCore().getQueryPlugin(parserName);
if (qplug == null) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "invalid query parser '" + parserName +
(originalParam == null? "'": "' for query '" + originalParam + "'"));
}
QParser qp = qplug.createParser(localParams.get(QueryParsing.V),
localParams, params, rb.req);
// figure out what type of query we are dealing, get the most direct ValueSource
vs = extractValueSource(qp.parse());
// if this ValueSource directly corresponds to a SchemaField, act as if
// we were asked to compute stats on it directly
// ie: "stats.field={!func key=foo}field(foo)" == "stats.field=foo"
sf = extractSchemaField(vs, searcher.getSchema());
if (null != sf) {
vs = null;
}
}
assert ( (null == vs) ^ (null == sf) ) : "exactly one of vs & sf must be null";
this.schemaField = sf;
this.valueSource = vs;
} catch (SyntaxError e) {
throw new SolrException(ErrorCode.BAD_REQUEST, "Unable to parse " +
StatsParams.STATS_FIELD + ": " + originalParam + " due to: "
+ e.getMessage(), e);
}
// allow explicit setting of the response key via localparams...
this.key = localParams.get(CommonParams.OUTPUT_KEY,
// default to the main param value...
localParams.get(CommonParams.VALUE,
// default to entire original param str.
originalParam));
this.topLevelCalcDistinct = null == schemaField
? params.getBool(StatsParams.STATS_CALC_DISTINCT, false)
: params.getFieldBool(schemaField.getName(), StatsParams.STATS_CALC_DISTINCT, false);
populateStatsSets();
String[] facets = params.getFieldParams(key, StatsParams.STATS_FACET);
this.facets = (null == facets) ? new String[0] : facets;
String tagStr = localParams.get(CommonParams.TAG);
this.tagList = (null == tagStr)
? Collections.<String>emptyList()
: StrUtils.splitSmart(tagStr,',');
// figure out if we need a special base DocSet
String excludeStr = localParams.get(CommonParams.EXCLUDE);
this.excludeTagList = (null == excludeStr)
? Collections.<String>emptyList()
: StrUtils.splitSmart(excludeStr,',');
assert ( (null == this.valueSource) ^ (null == this.schemaField) )
: "exactly one of valueSource & schemaField must be null";
}
/**
* Inspects a {@link Query} to see if it directly maps to a {@link ValueSource},
* and if so returns it -- otherwise wraps it as needed.
*
* @param q Query whose scores we have been asked to compute stats of
* @returns a ValueSource to use for computing the stats
*/
private static ValueSource extractValueSource(Query q) {
return (q instanceof FunctionQuery) ?
// Common case: we're wrapping a func, so we can directly pull out ValueSource
((FunctionQuery) q).getValueSource() :
// asked to compute stats over a query, wrap it up as a ValueSource
new QueryValueSource(q, 0.0F);
}
/**
* Inspects a {@link ValueSource} to see if it directly maps to a {@link SchemaField},
* and if so returns it.
*
* @param vs ValueSource we've been asked to compute stats of
* @param schema The Schema to use
* @returns Corrisponding {@link SchemaField} or null if the ValueSource is more complex
* @see FieldCacheSource
*/
private static SchemaField extractSchemaField(ValueSource vs, IndexSchema schema) {
if (vs instanceof FieldCacheSource) {
String fieldName = ((FieldCacheSource)vs).getField();
return schema.getField(fieldName);
}
return null;
}
/**
* The key to be used when refering to this {@link StatsField} instance in the
* response tp clients.
*/
public String getOutputKey() {
return key;
}
/**
* Computes a base {@link DocSet} for the current request to be used
* when computing global stats for the local index.
*
* This is typically the same as the main DocSet for the {@link ResponseBuilder}
* unless {@link CommonParams#TAG tag}ged filter queries have been excluded using
* the {@link CommonParams#EXCLUDE ex} local param
*/
public DocSet computeBaseDocSet() throws IOException {
DocSet docs = rb.getResults().docSet;
Map<?,?> tagMap = (Map<?,?>) rb.req.getContext().get("tags");
if (excludeTagList.isEmpty() || null == tagMap) {
// either the exclude list is empty, or there
// aren't any tagged filters to exclude anyway.
return docs;
}
IdentityHashMap<Query,Boolean> excludeSet = new IdentityHashMap<Query,Boolean>();
for (String excludeTag : excludeTagList) {
Object olst = tagMap.get(excludeTag);
// tagMap has entries of List<String,List<QParser>>, but subject to change in the future
if (!(olst instanceof Collection)) continue;
for (Object o : (Collection<?>)olst) {
if (!(o instanceof QParser)) continue;
QParser qp = (QParser)o;
try {
excludeSet.put(qp.getQuery(), Boolean.TRUE);
} catch (SyntaxError e) {
// this shouldn't be possible since the request should have already
// failed when attempting to execute the query, but just in case...
throw new SolrException(ErrorCode.BAD_REQUEST, "Excluded query can't be parsed: " +
originalParam + " due to: " + e.getMessage(), e);
}
}
}
if (excludeSet.size() == 0) return docs;
List<Query> qlist = new ArrayList<Query>();
// add the base query
if (!excludeSet.containsKey(rb.getQuery())) {
qlist.add(rb.getQuery());
}
// add the filters
if (rb.getFilters() != null) {
for (Query q : rb.getFilters()) {
if (!excludeSet.containsKey(q)) {
qlist.add(q);
}
}
}
// get the new base docset for this facet
return searcher.getDocSet(qlist);
}
/**
* Computes the {@link StatsValues} for this {@link StatsField} relative to the
* specified {@link DocSet}
* @see #computeBaseDocSet
*/
public StatsValues computeLocalStatsValues(DocSet base) throws IOException {
if (statsToCalculate.isEmpty()) {
// perf optimization for the case where we compute nothing
// ie: stats.field={!min=$domin}myfield&domin=false
return StatsValuesFactory.createStatsValues(this);
}
if (null != schemaField && !schemaField.getType().isPointField()
&& (schemaField.multiValued() || schemaField.getType().multiValuedFieldCache())) {
// TODO: should this also be used for single-valued string fields? (should work fine)
return DocValuesStats.getCounts(searcher, this, base, facets);
} else {
// either a single valued field we pull from FieldCache, or an explicit
// function ValueSource
return computeLocalValueSourceStats(base);
}
}
private StatsValues computeLocalValueSourceStats(DocSet base) throws IOException {
IndexSchema schema = searcher.getSchema();
final StatsValues allstats = StatsValuesFactory.createStatsValues(this);
List<FieldFacetStats> facetStats = new ArrayList<>();
for( String facetField : facets ) {
SchemaField fsf = schema.getField(facetField);
if ( fsf.multiValued()) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"Stats can only facet on single-valued fields, not: " + facetField );
}
facetStats.add(new FieldFacetStats(searcher, fsf, this));
}
final Iterator<LeafReaderContext> ctxIt = searcher.getIndexReader().leaves().iterator();
LeafReaderContext ctx = null;
for (DocIterator docsIt = base.iterator(); docsIt.hasNext(); ) {
final int doc = docsIt.nextDoc();
if (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc()) {
// advance
do {
ctx = ctxIt.next();
} while (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc());
assert doc >= ctx.docBase;
// propagate the context among accumulators.
allstats.setNextReader(ctx);
for (FieldFacetStats f : facetStats) {
f.setNextReader(ctx);
}
}
// accumulate
allstats.accumulate(doc - ctx.docBase);
for (FieldFacetStats f : facetStats) {
f.facet(doc - ctx.docBase);
}
}
for (FieldFacetStats f : facetStats) {
allstats.addFacet(f.name, f.facetStatsValues);
}
return allstats;
}
/**
* The searcher that should be used for processing local stats
* @see SolrQueryRequest#getSearcher
*/
public SolrIndexSearcher getSearcher() {
// see AbstractStatsValues.setNextReader
return searcher;
}
/**
* The {@link SchemaField} whose results these stats are computed over, may be null
* if the stats are computed over the results of a function or query
*
* @see #getValueSource
*/
public SchemaField getSchemaField() {
return schemaField;
}
/**
* The {@link ValueSource} of a function or query whose results these stats are computed
* over, may be null if the stats are directly over a {@link SchemaField}
*
* @see #getValueSource
*/
public ValueSource getValueSource() {
return valueSource;
}
public List<String> getTagList() {
return tagList;
}
public String toString() {
return "StatsField<" + originalParam + ">";
}
/**
* A helper method which inspects the {@link #localParams} associated with this StatsField,
* and uses them to populate the {@link #statsInResponse} and {@link #statsToCalculate} data
* structures
*/
private void populateStatsSets() {
boolean statSpecifiedByLocalParam = false;
// local individual stat
Iterator<String> itParams = localParams.getParameterNamesIterator();
while (itParams.hasNext()) {
String paramKey = itParams.next();
Stat stat = Stat.forName(paramKey);
if (stat != null) {
statSpecifiedByLocalParam = true;
if (stat.parseParams(this)) {
statsInResponse.add(stat);
}
}
}
// if no individual stat setting use the default set
if ( ! ( statSpecifiedByLocalParam
// calcdistinct (as a local param) is a psuedo-stat, prevents default set
|| localParams.getBool("calcdistinct", false) ) ) {
statsInResponse.addAll(DEFAULT_STATS);
}
// calcDistinct is a psuedo-stat with optional top level param default behavior
// if not overridden by the specific individual stats
if (localParams.getBool("calcdistinct", topLevelCalcDistinct)) {
for (Stat stat : CALCDISTINCT_PSUEDO_STAT) {
// assume true, but don't include if specific stat overrides
if (localParams.getBool(stat.name(), true)) {
statsInResponse.add(stat);
}
}
}
for (Stat stat : statsInResponse) {
statsToCalculate.addAll(stat.getDistribDeps());
}
}
public boolean calculateStats(Stat stat) {
return statsToCalculate.contains(stat);
}
public boolean includeInResponse(Stat stat) {
if (isShard) {
return statsToCalculate.contains(stat);
}
if (statsInResponse.contains(stat)) {
return true;
}
return false;
}
public List<Double> getPercentilesList() {
return percentilesList;
}
public boolean getIsShard() {
return isShard;
}
public double getTdigestCompression() {
return tdigestCompression;
}
public HllOptions getHllOptions() {
return hllOpts;
}
/**
* Helper Struct for parsing and encapsulating all of the options relaed to building a {@link HLL}
*
* @see Stat#cardinality
* @lucene.internal
*/
public static final class HllOptions {
final HashFunction hasher;
// NOTE: this explanation linked to from the java-hll jdocs...
// https://github.com/aggregateknowledge/postgresql-hll/blob/master/README.markdown#explanation-of-parameters-and-tuning
// ..if i'm understanding the regwidth chart correctly, a value of 6 should be a enough
// to support any max cardinality given that we're always dealing with hashes and
// the cardinality of the set of all long values is 2**64 == 1.9e19
//
// But i guess that assumes a *perfect* hash and high log2m? ... if the hash algo is imperfect
// and/or log2m is low (ie: user is less concerned about accuracy), then many diff hash values
// might fall in the same register (ie: bucket) and having a wider register to count more of
// them may be useful
final int log2m;
final int regwidth;
final static String ERR = "cardinality must be specified as 'true' (for default tunning) or decimal number between 0 and 1 to adjust accuracy vs memory usage (large number is more memory and more accuracy)";
private HllOptions(int log2m, int regwidth, HashFunction hasher) {
this.log2m = log2m;
this.regwidth = regwidth;
this.hasher = hasher;
}
/**
* Creates an HllOptions based on the (local) params specified (if appropriate).
*
* @param localParams the LocalParams for this {@link StatsField}
* @param field the field corresponding to this {@link StatsField}, may be null if these stats are over a value source
* @return the {@link HllOptions} to use based on the params, or null if no {@link HLL} should be computed
* @throws SolrException if there are invalid options
*/
public static HllOptions parseHllOptions(SolrParams localParams, SchemaField field)
throws SolrException {
String cardinalityOpt = localParams.get(Stat.cardinality.name());
if (StringUtils.isBlank(cardinalityOpt)) {
return null;
}
final NumberType hashableNumType = getHashableNumericType(field);
// some sane defaults
int log2m = 13; // roughly equivalent to "cardinality='0.33'"
int regwidth = 6; // with decent hash, this is plenty for all valid long hashes
if (NumberType.FLOAT.equals(hashableNumType) || NumberType.INTEGER.equals(hashableNumType)) {
// for 32bit values, we can adjust our default regwidth down a bit
regwidth--;
// NOTE: EnumField uses LegacyNumericType.INT, and in theory we could be super conservative
// with it, but there's no point - just let the EXPLICIT HLL handle it
}
// TODO: we could attempt additional reductions in the default regwidth based on index
// statistics -- but thta doesn't seem worth the effort. for tiny indexes, the
// EXPLICIT and SPARSE HLL representations have us nicely covered, and in general we don't
// want to be too aggresive about lowering regwidth or we could really poor results if
// log2m is also low and there is heavy hashkey collision
try {
// NFE will short out here if it's not a number
final double accuracyOpt = Double.parseDouble(cardinalityOpt);
// if a float between 0 and 1 is specified, treat it as a prefrence of accuracy
// - 0 means accuracy is not a concern, save RAM
// - 1 means be as accurate as possible, using as much RAM as needed.
if (accuracyOpt < 0D || 1.0D < accuracyOpt) {
throw new SolrException(ErrorCode.BAD_REQUEST, ERR);
}
// use accuracyOpt as a scaling factor between min & max legal log2m values
log2m = HLL.MINIMUM_LOG2M_PARAM
+ (int) Math.round(accuracyOpt * (HLL.MAXIMUM_LOG2M_PARAM - HLL.MINIMUM_LOG2M_PARAM));
// use accuracyOpt as a scaling factor for regwidth as well, BUT...
// be more conservative -- HLL.MIN_REGWIDTH_PARAM is too absurdly low to be useful
// use previously computed (hashableNumType) default regwidth -1 as lower bound for scaling
final int MIN_HUERISTIC_REGWIDTH = regwidth-1;
regwidth = MIN_HUERISTIC_REGWIDTH
+ (int) Math.round(accuracyOpt * (HLL.MAXIMUM_REGWIDTH_PARAM - MIN_HUERISTIC_REGWIDTH));
} catch (NumberFormatException nfe) {
// param value isn't a number -- let's check for simple true/false
if (! localParams.getBool(Stat.cardinality.name(), false)) {
return null;
}
}
// let explicit params override both the default and/or any accuracy specification
log2m = localParams.getInt("hllLog2m", log2m);
regwidth = localParams.getInt("hllRegwidth", regwidth);
// validate legal values
if (log2m < HLL.MINIMUM_LOG2M_PARAM || HLL.MAXIMUM_LOG2M_PARAM < log2m) {
throw new SolrException(ErrorCode.BAD_REQUEST, "hllLog2m must be at least " +
HLL.MINIMUM_LOG2M_PARAM + " and at most " + HLL.MAXIMUM_LOG2M_PARAM
+ " (" + log2m +")");
}
if (regwidth < HLL.MINIMUM_REGWIDTH_PARAM || HLL.MAXIMUM_REGWIDTH_PARAM < regwidth) {
throw new SolrException(ErrorCode.BAD_REQUEST, "hllRegwidth must be at least " +
HLL.MINIMUM_REGWIDTH_PARAM + " and at most " + HLL.MAXIMUM_REGWIDTH_PARAM);
}
HashFunction hasher = localParams.getBool("hllPreHashed", false) ? null : Hashing.murmur3_128();
if (null == hasher) {
// if this is a function, or a non Long field, pre-hashed is invalid
// NOTE: we ignore hashableNumType - it's LONG for non numerics like Strings
if (null == field || !(NumberType.LONG.equals(field.getType().getNumberType()) || NumberType.DATE.equals(field.getType().getNumberType()))) {
throw new SolrException(ErrorCode.BAD_REQUEST, "hllPreHashed is only supported with Long based fields");
}
}
// if we're still here, then we need an HLL...
return new HllOptions(log2m, regwidth, hasher);
}
/** @see HLL */
public int getLog2m() {
return log2m;
}
/** @see HLL */
public int getRegwidth() {
return regwidth;
}
/** May be null if user has indicated that field values are pre-hashed */
public HashFunction getHasher() {
return hasher;
}
public HLL newHLL() {
// Although it (in theory) saves memory for "medium" size sets, the SPARSE type seems to have
// some nasty impacts on response time as it gets larger - particularly in distrib requests.
// Merging large SPARSE HLLs is much much slower then merging FULL HLLs with the same num docs
//
// TODO: add more tunning options for this.
return new HLL(getLog2m(), getRegwidth(), -1 /* auto explict threshold */,
false /* no sparse representation */, HLLType.EMPTY);
}
}
/**
* Returns the effective {@link NumberType} for the field for the purposes of hash values.
* ie: If the field has an explict NumberType that is returned; If the field has no explicit
* NumberType then {@link NumberType#LONG} is returned; If field is null, then
* {@link NumberType#FLOAT} is assumed for ValueSource.
*/
private static NumberType getHashableNumericType(SchemaField field) {
if (null == field) {
return NumberType.FLOAT;
}
final NumberType result = field.getType().getNumberType();
return null == result ? NumberType.LONG : result;
}
}