| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.solr.handler.component; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.EnumSet; |
| import java.util.IdentityHashMap; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| |
| import org.apache.commons.lang.StringUtils; |
| import org.apache.lucene.document.FieldType.NumericType; |
| import org.apache.lucene.index.LeafReaderContext; |
| import org.apache.lucene.queries.function.FunctionQuery; |
| import org.apache.lucene.queries.function.ValueSource; |
| import org.apache.lucene.queries.function.valuesource.FieldCacheSource; |
| import org.apache.lucene.queries.function.valuesource.QueryValueSource; |
| import org.apache.lucene.search.Query; |
| import org.apache.solr.common.SolrException; |
| import org.apache.solr.common.SolrException.ErrorCode; |
| import org.apache.solr.common.params.CommonParams; |
| import org.apache.solr.common.params.ModifiableSolrParams; |
| import org.apache.solr.common.params.SolrParams; |
| import org.apache.solr.common.params.StatsParams; |
| import org.apache.solr.common.util.StrUtils; |
| import org.apache.solr.request.DocValuesStats; |
| import org.apache.solr.request.SolrQueryRequest; |
| import org.apache.solr.schema.IndexSchema; |
| import org.apache.solr.schema.SchemaField; |
| import org.apache.solr.search.DocIterator; |
| import org.apache.solr.search.DocSet; |
| import org.apache.solr.search.QParser; |
| import org.apache.solr.search.QParserPlugin; |
| import org.apache.solr.search.QueryParsing; |
| import org.apache.solr.search.SolrIndexSearcher; |
| import org.apache.solr.search.SyntaxError; |
| |
| import net.agkn.hll.HLL; |
| import net.agkn.hll.HLLType; |
| import com.google.common.hash.Hashing; |
| import com.google.common.hash.HashFunction; |
| |
| /** |
| * Models all of the information associated with a single {@link StatsParams#STATS_FIELD} |
| * instance. |
| * |
| * @see StatsComponent |
| */ |
| public class StatsField { |
| |
  /**
   * An enumeration representing the superset of all possible stat values that can be computed.
   * Each of these enum values can be specified as a local param in a <code>stats.field</code>
   * (eg: <code>stats.field={!min=true mean=true}my_field_name</code>) but not all enum values
   * are valid for all field types (eg: <code>mean</code> is meaningless for String fields)
   *
   * @lucene.internal
   * @lucene.experimental
   */
  public static enum Stat {
    min(true),
    max(true),
    missing(true),
    sum(true),
    count(true),
    // mean & stddev are not computed directly by shards; they are derived on the
    // coordinating node from the distributed sum/count/sumOfSquares (hence selfDep=false)
    mean(false, sum, count),
    sumOfSquares(true),
    stddev(false, sum, count, sumOfSquares),
    distinctValues(true),
    countDistinct(false, distinctValues),
    percentiles(true){
      /** special parsing for percentiles: the param value is a comma separated list of cut-off points **/
      boolean parseParams(StatsField sf) {
        String percentileParas = sf.localParams.get(this.name());
        if (percentileParas != null) {
          List<Double> percentiles = new ArrayList<Double>();
          try {
            for (String percentile : StrUtils.splitSmart(percentileParas, ',')) {
              percentiles.add(Double.parseDouble(percentile));
            }
            if (!percentiles.isEmpty()) {
              sf.percentilesList.addAll(percentiles);
              // optional tuning knob for the underlying digest structure
              sf.tdigestCompression = sf.localParams.getDouble("tdigestCompression",
                                                               sf.tdigestCompression);
              return true;
            }
          } catch (NumberFormatException e) {
            throw new SolrException(ErrorCode.BAD_REQUEST, "Unable to parse "
                + StatsParams.STATS_FIELD + " local params: " + sf.localParams + " due to: "
                + e.getMessage(), e);
          }

        }
        return false;
      }
    },
    cardinality(true) {
      /** special parsing for cardinality: value may be true/false or a 0..1 accuracy/RAM tradeoff **/
      boolean parseParams(StatsField sf) {
        try {
          sf.hllOpts = HllOptions.parseHllOptions(sf.localParams, sf.schemaField);
          return (null != sf.hllOpts);
        } catch (Exception e) {
          throw new SolrException(ErrorCode.BAD_REQUEST, "Unable to parse "
              + StatsParams.STATS_FIELD + " local params: " + sf.localParams + " due to: "
              + e.getMessage(), e);
        }
      }
    };

    // the stats each shard must compute & return so this stat can be merged globally
    private final List<Stat> distribDeps;

    /**
     * Sole constructor for Stat enum values
     * @param deps the set of stat values, other than this one, which are a distributed
     *        dependency and must be computed and returned by each individual shards in
     *        order to compute <i>this</i> stat over the entire distributed result set.
     * @param selfDep indicates that when computing this stat across a distributed result
     *        set, each shard must compute this stat <i>in addition to</i> any other
     *        distributed dependences.
     * @see #getDistribDeps
     */
    Stat(boolean selfDep, Stat... deps) {
      distribDeps = new ArrayList<Stat>(deps.length+1);
      distribDeps.addAll(Arrays.asList(deps));
      if (selfDep) {
        distribDeps.add(this);
      }
    }

    /**
     * Given a String, returns the corresponding Stat enum value if any, otherwise returns null.
     */
    public static Stat forName(String paramKey) {
      try {
        return Stat.valueOf(paramKey);
      } catch (IllegalArgumentException e) {
        // not a recognized stat name -- caller treats null as "not a stat param"
        return null;
      }
    }

    /**
     * The stats that must be computed and returned by each shard involved in a distributed
     * request in order to compute the overall value for this stat across the entire distributed
     * result set. A Stat instance may include itself in the <code>getDistribDeps()</code> result,
     * but that is not always the case.
     */
    public EnumSet<Stat> getDistribDeps() {
      // defensive copy so callers can't mutate our internal dependency list
      return EnumSet.copyOf(this.distribDeps);
    }

    /**
     * Called when the name of a stat is found as a local param on this {@link StatsField}
     * @return true if the user is requesting this stat, else false
     */
    boolean parseParams(StatsField sf) {
      return sf.localParams.getBool(this.name(), false);
    }

  }
| |
  /**
   * the equivalent stats if "calcdistinct" is specified
   * @see Stat#countDistinct
   * @see Stat#distinctValues
   */
  private static final EnumSet<Stat> CALCDISTINCT_PSUEDO_STAT = EnumSet.of(Stat.countDistinct, Stat.distinctValues);

  /**
   * The set of stats computed by default when no localparams are used to specify explicit stats
   */
  public final static Set<Stat> DEFAULT_STATS = Collections.<Stat>unmodifiableSet
    (EnumSet.of(Stat.min, Stat.max, Stat.missing, Stat.sum, Stat.count, Stat.mean, Stat.sumOfSquares, Stat.stddev));

  private final SolrIndexSearcher searcher;
  private final ResponseBuilder rb;
  private final String originalParam; // for error messages
  private final SolrParams localParams;
  private final ValueSource valueSource; // may be null if simple field stats
  private final SchemaField schemaField; // may be null if function/query stats
  private final String key; // output key used for this stat in the response
  private final boolean topLevelCalcDistinct; // request-level stats.calcdistinct default
  private final String[] facets; // sub-facet fields requested via stats.facet (never null)
  private final List<String> tagList; // tags on this stats.field (from the "tag" local param)
  private final List<String> excludeTagList; // filter tags to exclude when building the base DocSet
  private final EnumSet<Stat> statsToCalculate = EnumSet.noneOf(Stat.class); // all stats to compute (incl. distrib deps)
  private final EnumSet<Stat> statsInResponse = EnumSet.noneOf(Stat.class); // stats the user asked to see
  private final List<Double> percentilesList= new ArrayList<Double>(); // requested percentile cut-offs, if any
  private final boolean isShard; // true when this is a per-shard sub-request of a distributed search

  private double tdigestCompression = 100.0D; // digest compression for percentiles (set via "tdigestCompression")
  private HllOptions hllOpts; // non-null iff cardinality was requested
| |
  /**
   * Parses a single stats.field value (with optional local params) and initializes
   * either a {@link SchemaField} or a {@link ValueSource} (exactly one) to compute
   * stats over, along with the output key, facets, tags, and requested stats.
   *
   * @param rb the current request/response
   * @param statsParam the raw {@link StatsParams#STATS_FIELD} string
   */
  public StatsField(ResponseBuilder rb, String statsParam) {
    this.rb = rb;
    this.searcher = rb.req.getSearcher();
    this.originalParam = statsParam;

    SolrParams params = rb.req.getParams();
    try {
      isShard = params.getBool("isShard", false);
      // parse any leading "{!...}" local params off the stats.field value
      SolrParams localParams = QueryParsing.getLocalParams(originalParam, params);
      if (null == localParams) {
        // simplest possible input: bare string (field name)
        ModifiableSolrParams customParams = new ModifiableSolrParams();
        customParams.add(QueryParsing.V, originalParam);
        localParams = customParams;
      }

      this.localParams = localParams;

      String parserName = localParams.get(QueryParsing.TYPE);
      SchemaField sf = null;
      ValueSource vs = null;

      if ( StringUtils.isBlank(parserName) ) {

        // basic request for field stats
        sf = searcher.getSchema().getField(localParams.get(QueryParsing.V));

      } else {
        // we have a non trivial request to compute stats over a query (or function)

        // NOTE we could use QParser.getParser(...) here, but that would redundently
        // reparse everything. ( TODO: refactor a common method in QParser ?)
        QParserPlugin qplug = rb.req.getCore().getQueryPlugin(parserName);
        QParser qp = qplug.createParser(localParams.get(QueryParsing.V),
                                        localParams, params, rb.req);

        // figure out what type of query we are dealing, get the most direct ValueSource
        vs = extractValueSource(qp.parse());

        // if this ValueSource directly corresponds to a SchemaField, act as if
        // we were asked to compute stats on it directly
        // ie: "stats.field={!func key=foo}field(foo)" == "stats.field=foo"
        sf = extractSchemaField(vs, searcher.getSchema());
        if (null != sf) {
          vs = null;
        }
      }

      assert ( (null == vs) ^ (null == sf) ) : "exactly one of vs & sf must be null";

      this.schemaField = sf;
      this.valueSource = vs;

    } catch (SyntaxError e) {
      throw new SolrException(ErrorCode.BAD_REQUEST, "Unable to parse " +
          StatsParams.STATS_FIELD + ": " + originalParam + " due to: "
          + e.getMessage(), e);
    }

    // allow explicit setting of the response key via localparams...
    this.key = localParams.get(CommonParams.OUTPUT_KEY,
                               // default to the main param value...
                               localParams.get(CommonParams.VALUE,
                                               // default to entire original param str.
                                               originalParam));

    // calcdistinct may default on at the top level of the request (globally or per-field)
    this.topLevelCalcDistinct = null == schemaField
        ? params.getBool(StatsParams.STATS_CALC_DISTINCT, false)
        : params.getFieldBool(schemaField.getName(), StatsParams.STATS_CALC_DISTINCT, false);

    populateStatsSets();

    String[] facets = params.getFieldParams(key, StatsParams.STATS_FACET);
    this.facets = (null == facets) ? new String[0] : facets;
    String tagStr = localParams.get(CommonParams.TAG);
    this.tagList = (null == tagStr)
        ? Collections.<String>emptyList()
        : StrUtils.splitSmart(tagStr,',');

    // figure out if we need a special base DocSet
    String excludeStr = localParams.get(CommonParams.EXCLUDE);
    this.excludeTagList = (null == excludeStr)
        ? Collections.<String>emptyList()
        : StrUtils.splitSmart(excludeStr,',');

    assert ( (null == this.valueSource) ^ (null == this.schemaField) )
      : "exactly one of valueSource & schemaField must be null";
  }
| |
| /** |
| * Inspects a {@link Query} to see if it directly maps to a {@link ValueSource}, |
| * and if so returns it -- otherwise wraps it as needed. |
| * |
| * @param q Query whose scores we have been asked to compute stats of |
| * @returns a ValueSource to use for computing the stats |
| */ |
| private static ValueSource extractValueSource(Query q) { |
| return (q instanceof FunctionQuery) ? |
| // Common case: we're wrapping a func, so we can directly pull out ValueSource |
| ((FunctionQuery) q).getValueSource() : |
| // asked to compute stats over a query, wrap it up as a ValueSource |
| new QueryValueSource(q, 0.0F); |
| } |
| |
| /** |
| * Inspects a {@link ValueSource} to see if it directly maps to a {@link SchemaField}, |
| * and if so returns it. |
| * |
| * @param vs ValueSource we've been asked to compute stats of |
| * @param schema The Schema to use |
| * @returns Corrisponding {@link SchemaField} or null if the ValueSource is more complex |
| * @see FieldCacheSource |
| */ |
| private static SchemaField extractSchemaField(ValueSource vs, IndexSchema schema) { |
| if (vs instanceof FieldCacheSource) { |
| String fieldName = ((FieldCacheSource)vs).getField(); |
| return schema.getField(fieldName); |
| } |
| return null; |
| } |
| |
  /**
   * The key to be used when referring to this {@link StatsField} instance in the
   * response to clients.
   */
  public String getOutputKey() {
    return key;
  }
| |
  /**
   * Computes a base {@link DocSet} for the current request to be used
   * when computing global stats for the local index.
   *
   * This is typically the same as the main DocSet for the {@link ResponseBuilder}
   * unless {@link CommonParams#TAG tag}ged filter queries have been excluded using
   * the {@link CommonParams#EXCLUDE ex} local param
   */
  public DocSet computeBaseDocSet() throws IOException {

    DocSet docs = rb.getResults().docSet;
    Map<?,?> tagMap = (Map<?,?>) rb.req.getContext().get("tags");

    if (excludeTagList.isEmpty() || null == tagMap) {
      // either the exclude list is empty, or there
      // aren't any tagged filters to exclude anyway.
      return docs;
    }

    // identity map: we only want to exclude the exact Query instances produced by
    // the tagged QParsers, not any other query that happens to be equals() to them
    IdentityHashMap<Query,Boolean> excludeSet = new IdentityHashMap<Query,Boolean>();
    for (String excludeTag : excludeTagList) {
      Object olst = tagMap.get(excludeTag);
      // tagMap has entries of List<String,List<QParser>>, but subject to change in the future
      if (!(olst instanceof Collection)) continue;
      for (Object o : (Collection<?>)olst) {
        if (!(o instanceof QParser)) continue;
        QParser qp = (QParser)o;
        try {
          excludeSet.put(qp.getQuery(), Boolean.TRUE);
        } catch (SyntaxError e) {
          // this shouldn't be possible since the request should have already
          // failed when attempting to execute the query, but just in case...
          throw new SolrException(ErrorCode.BAD_REQUEST, "Excluded query can't be parsed: " +
              originalParam + " due to: " + e.getMessage(), e);
        }
      }
    }
    if (excludeSet.size() == 0) return docs;

    // rebuild the base DocSet from the main query plus all non-excluded filters
    List<Query> qlist = new ArrayList<Query>();

    // add the base query
    if (!excludeSet.containsKey(rb.getQuery())) {
      qlist.add(rb.getQuery());
    }

    // add the filters
    if (rb.getFilters() != null) {
      for (Query q : rb.getFilters()) {
        if (!excludeSet.containsKey(q)) {
          qlist.add(q);
        }
      }
    }

    // get the new base docset for this facet
    return searcher.getDocSet(qlist);
  }
| |
| /** |
| * Computes the {@link StatsValues} for this {@link StatsField} relative to the |
| * specified {@link DocSet} |
| * @see #computeBaseDocSet |
| */ |
| public StatsValues computeLocalStatsValues(DocSet base) throws IOException { |
| |
| if (statsToCalculate.isEmpty()) { |
| // perf optimization for the case where we compute nothing |
| // ie: stats.field={!min=$domin}myfield&domin=false |
| return StatsValuesFactory.createStatsValues(this); |
| } |
| |
| if (null != schemaField |
| && (schemaField.multiValued() || schemaField.getType().multiValuedFieldCache())) { |
| |
| // TODO: should this also be used for single-valued string fields? (should work fine) |
| return DocValuesStats.getCounts(searcher, this, base, facets); |
| } else { |
| // either a single valued field we pull from FieldCache, or an explicit |
| // function ValueSource |
| return computeLocalValueSourceStats(base); |
| } |
| } |
| |
  /**
   * Accumulates stats by walking the base DocSet in (global) docid order, advancing
   * through the index leaves as needed and delegating per-doc accumulation to the
   * {@link StatsValues} (and any per-facet accumulators).
   */
  private StatsValues computeLocalValueSourceStats(DocSet base) throws IOException {

    IndexSchema schema = searcher.getSchema();

    final StatsValues allstats = StatsValuesFactory.createStatsValues(this);

    List<FieldFacetStats> facetStats = new ArrayList<>();
    for( String facetField : facets ) {
      SchemaField fsf = schema.getField(facetField);

      if ( fsf.multiValued()) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
            "Stats can only facet on single-valued fields, not: " + facetField );
      }

      facetStats.add(new FieldFacetStats(searcher, fsf, this));
    }

    final Iterator<LeafReaderContext> ctxIt = searcher.getIndexReader().leaves().iterator();
    LeafReaderContext ctx = null;
    for (DocIterator docsIt = base.iterator(); docsIt.hasNext(); ) {
      final int doc = docsIt.nextDoc();
      if (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc()) {
        // advance to the leaf containing this global doc id
        // (doc ids arrive in increasing order, so we never need to rewind)
        do {
          ctx = ctxIt.next();
        } while (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc());
        assert doc >= ctx.docBase;

        // propagate the context among accumulators.
        allstats.setNextReader(ctx);
        for (FieldFacetStats f : facetStats) {
          f.setNextReader(ctx);
        }
      }

      // accumulate, using the leaf-relative doc id
      allstats.accumulate(doc - ctx.docBase);
      for (FieldFacetStats f : facetStats) {
        f.facet(doc - ctx.docBase);
      }
    }

    for (FieldFacetStats f : facetStats) {
      allstats.addFacet(f.name, f.facetStatsValues);
    }
    return allstats;
  }
| |
  /**
   * The searcher that should be used for processing local stats
   * @see SolrQueryRequest#getSearcher
   */
  public SolrIndexSearcher getSearcher() {
    // see AbstractStatsValues.setNextReader

    return searcher;
  }
| |
  /**
   * The {@link SchemaField} whose results these stats are computed over, may be null
   * if the stats are computed over the results of a function or query
   *
   * @see #getValueSource
   */
  public SchemaField getSchemaField() {
    return schemaField;
  }

  /**
   * The {@link ValueSource} of a function or query whose results these stats are computed
   * over, may be null if the stats are directly over a {@link SchemaField}
   *
   * @see #getSchemaField
   */
  public ValueSource getValueSource() {
    return valueSource;
  }
| |
  /**
   * Tags specified on this stats.field instance via the {@link CommonParams#TAG}
   * local param; empty if none were specified.
   */
  public List<String> getTagList() {
    return tagList;
  }
| |
| public String toString() { |
| return "StatsField<" + originalParam + ">"; |
| } |
| |
  /**
   * A helper method which inspects the {@link #localParams} associated with this StatsField,
   * and uses them to populate the {@link #statsInResponse} and {@link #statsToCalculate} data
   * structures
   */
  private void populateStatsSets() {
    boolean statSpecifiedByLocalParam = false;
    // local individual stat
    Iterator<String> itParams = localParams.getParameterNamesIterator();

    while (itParams.hasNext()) {
      String paramKey = itParams.next();
      Stat stat = Stat.forName(paramKey);
      if (stat != null) {
        // any recognized stat name (even e.g. "min=false") suppresses the default set
        statSpecifiedByLocalParam = true;
        if (stat.parseParams(this)) {
          statsInResponse.add(stat);
        }
      }
    }

    // if no individual stat setting use the default set
    if ( ! ( statSpecifiedByLocalParam
             // calcdistinct (as a local param) is a pseudo-stat, prevents default set
             || localParams.getBool("calcdistinct", false) ) ) {
      statsInResponse.addAll(DEFAULT_STATS);
    }

    // calcDistinct is a pseudo-stat with optional top level param default behavior
    // if not overridden by the specific individual stats
    if (localParams.getBool("calcdistinct", topLevelCalcDistinct)) {
      for (Stat stat : CALCDISTINCT_PSUEDO_STAT) {
        // assume true, but don't include if specific stat overrides
        if (localParams.getBool(stat.name(), true)) {
          statsInResponse.add(stat);
        }
      }
    }

    // everything we return to the user, plus its distributed dependencies, must be computed
    for (Stat stat : statsInResponse) {
      statsToCalculate.addAll(stat.getDistribDeps());
    }
  }
| |
  /**
   * Whether the given stat must be computed locally -- either because it was requested
   * directly, or because it is a distributed dependency of a requested stat.
   */
  public boolean calculateStats(Stat stat) {
    return statsToCalculate.contains(stat);
  }
| |
| public boolean includeInResponse(Stat stat) { |
| if (isShard) { |
| return statsToCalculate.contains(stat); |
| } |
| |
| if (statsInResponse.contains(stat)) { |
| return true; |
| } |
| return false; |
| } |
| |
  /** The percentile cut-off points requested via the "percentiles" local param; empty if none. */
  public List<Double> getPercentilesList() {
    return percentilesList;
  }

  /** True when this is a per-shard sub-request of a distributed search. */
  public boolean getIsShard() {
    return isShard;
  }

  /** The digest compression to use when computing percentiles (set via "tdigestCompression"). */
  public double getTdigestCompression() {
    return tdigestCompression;
  }

  /** Options for cardinality (HLL) computation, or null if cardinality was not requested. */
  public HllOptions getHllOptions() {
    return hllOpts;
  }
| |
| /** |
| * Helper Struct for parsing and encapsulating all of the options relaed to building a {@link HLL} |
| * |
| * @see Stat#cardinality |
| * @lucene.internal |
| */ |
| public static final class HllOptions { |
| final HashFunction hasher; |
| |
| // NOTE: this explanation linked to from the java-hll jdocs... |
| // https://github.com/aggregateknowledge/postgresql-hll/blob/master/README.markdown#explanation-of-parameters-and-tuning |
| // ..if i'm understanding the regwidth chart correctly, a value of 6 should be a enough |
| // to support any max cardinality given that we're always dealing with hashes and |
| // the cardinality of the set of all long values is 2**64 == 1.9e19 |
| // |
| // But i guess that assumes a *perfect* hash and high log2m? ... if the hash algo is imperfect |
| // and/or log2m is low (ie: user is less concerned about accuracy), then many diff hash values |
| // might fall in the same register (ie: bucket) and having a wider register to count more of |
| // them may be useful |
| |
| final int log2m; |
| final int regwidth; |
| |
| final static String ERR = "cardinality must be specified as 'true' (for default tunning) or decimal number between 0 and 1 to adjust accuracy vs memory usage (large number is more memory and more accuracy)"; |
| |
| private HllOptions(int log2m, int regwidth, HashFunction hasher) { |
| this.log2m = log2m; |
| this.regwidth = regwidth; |
| this.hasher = hasher; |
| } |
| /** |
| * Creates an HllOptions based on the (local) params specified (if appropriate). |
| * |
| * @param localParams the LocalParams for this {@link StatsField} |
| * @param field the field corrisponding to this {@link StatsField}, may be null if these stats are over a value source |
| * @return the {@link HllOptions} to use basd on the params, or null if no {@link HLL} should be computed |
| * @throws SolrException if there are invalid options |
| */ |
| public static HllOptions parseHllOptions(SolrParams localParams, SchemaField field) |
| throws SolrException { |
| |
| String cardinalityOpt = localParams.get(Stat.cardinality.name()); |
| if (StringUtils.isBlank(cardinalityOpt)) { |
| return null; |
| } |
| |
| final NumericType hashableNumType = getHashableNumericType(field); |
| |
| // some sane defaults |
| int log2m = 13; // roughly equivilent to "cardinality='0.33'" |
| int regwidth = 6; // with decent hash, this is plenty for all valid long hashes |
| |
| if (NumericType.FLOAT.equals(hashableNumType) || NumericType.INT.equals(hashableNumType)) { |
| // for 32bit values, we can adjust our default regwidth down a bit |
| regwidth--; |
| |
| // NOTE: EnumField uses NumericType.INT, and in theory we could be super conservative |
| // with it, but there's no point - just let the EXPLICIT HLL handle it |
| } |
| |
| // TODO: we could attempt additional reductions in the default regwidth based on index |
| // statistics -- but thta doesn't seem worth the effort. for tiny indexes, the |
| // EXPLICIT and SPARSE HLL representations have us nicely covered, and in general we don't |
| // want to be too aggresive about lowering regwidth or we could really poor results if |
| // log2m is also low and there is heavy hashkey collision |
| |
| try { |
| // NFE will short out here if it's not a number |
| final double accuracyOpt = Double.parseDouble(cardinalityOpt); |
| |
| // if a float between 0 and 1 is specified, treat it as a prefrence of accuracy |
| // - 0 means accuracy is not a concern, save RAM |
| // - 1 means be as accurate as possible, using as much RAM as needed. |
| |
| if (accuracyOpt < 0D || 1.0D < accuracyOpt) { |
| throw new SolrException(ErrorCode.BAD_REQUEST, ERR); |
| } |
| |
| // use accuracyOpt as a scaling factor between min & max legal log2m values |
| log2m = HLL.MINIMUM_LOG2M_PARAM |
| + (int) Math.round(accuracyOpt * (HLL.MAXIMUM_LOG2M_PARAM - HLL.MINIMUM_LOG2M_PARAM)); |
| |
| // use accuracyOpt as a scaling factor for regwidth as well, BUT... |
| // be more conservative -- HLL.MIN_REGWIDTH_PARAM is too absurdly low to be useful |
| // use previously computed (hashableNumType) default regwidth -1 as lower bound for scaling |
| final int MIN_HUERISTIC_REGWIDTH = regwidth-1; |
| regwidth = MIN_HUERISTIC_REGWIDTH |
| + (int) Math.round(accuracyOpt * (HLL.MAXIMUM_REGWIDTH_PARAM - MIN_HUERISTIC_REGWIDTH)); |
| |
| } catch (NumberFormatException nfe) { |
| // param value isn't a number -- let's check for simple true/false |
| if (! localParams.getBool(Stat.cardinality.name(), false)) { |
| return null; |
| } |
| } |
| |
| // let explicit params override both the default and/or any accuracy specification |
| log2m = localParams.getInt("hllLog2m", log2m); |
| regwidth = localParams.getInt("hllRegwidth", regwidth); |
| |
| // validate legal values |
| if (log2m < HLL.MINIMUM_LOG2M_PARAM || HLL.MAXIMUM_LOG2M_PARAM < log2m) { |
| throw new SolrException(ErrorCode.BAD_REQUEST, "hllLog2m must be at least " + |
| HLL.MINIMUM_LOG2M_PARAM + " and at most " + HLL.MAXIMUM_LOG2M_PARAM |
| + " (" + log2m +")"); |
| } |
| if (regwidth < HLL.MINIMUM_REGWIDTH_PARAM || HLL.MAXIMUM_REGWIDTH_PARAM < regwidth) { |
| throw new SolrException(ErrorCode.BAD_REQUEST, "hllRegwidth must be at least " + |
| HLL.MINIMUM_REGWIDTH_PARAM + " and at most " + HLL.MAXIMUM_REGWIDTH_PARAM); |
| } |
| |
| HashFunction hasher = localParams.getBool("hllPreHashed", false) ? null : Hashing.murmur3_128(); |
| |
| if (null == hasher) { |
| // if this is a function, or a non Long field, pre-hashed is invalid |
| // NOTE: we ignore hashableNumType - it's LONG for non numerics like Strings |
| if (null == field || !NumericType.LONG.equals(field.getType().getNumericType())) { |
| throw new SolrException(ErrorCode.BAD_REQUEST, "hllPreHashed is only supported with Long based fields"); |
| } |
| } |
| |
| // if we're still here, then we need an HLL... |
| return new HllOptions(log2m, regwidth, hasher); |
| } |
| /** @see HLL */ |
| public int getLog2m() { |
| return log2m; |
| } |
| /** @see HLL */ |
| public int getRegwidth() { |
| return regwidth; |
| } |
| /** May be null if user has indicated that field values are pre-hashed */ |
| public HashFunction getHasher() { |
| return hasher; |
| } |
| public HLL newHLL() { |
| // Although it (in theory) saves memory for "medium" size sets, the SPARSE type seems to have |
| // some nasty impacts on response time as it gets larger - particularly in distrib requests. |
| // Merging large SPARSE HLLs is much much slower then merging FULL HLLs with the same num docs |
| // |
| // TODO: add more tunning options for this. |
| return new HLL(getLog2m(), getRegwidth(), -1 /* auto explict threshold */, |
| false /* no sparse representation */, HLLType.EMPTY); |
| |
| } |
| } |
| |
| /** |
| * Returns the effective {@link NumericType} for the field for the purposes of hash values. |
| * ie: If the field has an explict NumericType that is returned; If the field has no explicit |
| * NumericType then {@link NumericType#LONG} is returned; If field is null, then |
| * {@link NumericType#FLOAT} is assumed for ValueSource. |
| */ |
| private static NumericType getHashableNumericType(SchemaField field) { |
| if (null == field) { |
| return NumericType.FLOAT; |
| } |
| final NumericType result = field.getType().getNumericType(); |
| return null == result ? NumericType.LONG : result; |
| } |
| } |