| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.solr.handler.component; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.EnumSet; |
| import java.util.IdentityHashMap; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| |
| import org.apache.commons.lang.StringUtils; |
| import org.apache.lucene.document.FieldType.NumericType; |
| import org.apache.lucene.index.LeafReaderContext; |
| import org.apache.lucene.queries.function.FunctionQuery; |
| import org.apache.lucene.queries.function.ValueSource; |
| import org.apache.lucene.queries.function.valuesource.FieldCacheSource; |
| import org.apache.lucene.queries.function.valuesource.QueryValueSource; |
| import org.apache.lucene.search.Query; |
| import org.apache.solr.common.SolrException; |
| import org.apache.solr.common.SolrException.ErrorCode; |
| import org.apache.solr.common.params.CommonParams; |
| import org.apache.solr.common.params.ModifiableSolrParams; |
| import org.apache.solr.common.params.SolrParams; |
| import org.apache.solr.common.params.StatsParams; |
| import org.apache.solr.common.util.StrUtils; |
| import org.apache.solr.request.DocValuesStats; |
| import org.apache.solr.request.SolrQueryRequest; |
| import org.apache.solr.schema.IndexSchema; |
| import org.apache.solr.schema.SchemaField; |
| import org.apache.solr.search.DocIterator; |
| import org.apache.solr.search.DocSet; |
| import org.apache.solr.search.QParser; |
| import org.apache.solr.search.QParserPlugin; |
| import org.apache.solr.search.QueryParsing; |
| import org.apache.solr.search.SolrIndexSearcher; |
| import org.apache.solr.search.SyntaxError; |
| |
| import net.agkn.hll.HLL; |
| import net.agkn.hll.HLLType; |
| import com.google.common.hash.Hashing; |
| import com.google.common.hash.HashFunction; |
| |
| /** |
| * Models all of the information associated with a single {@link StatsParams#STATS_FIELD} |
| * instance. |
| * |
| * @see StatsComponent |
| */ |
| public class StatsField { |
| |
  /**
   * An enumeration representing the superset of all possible stat values that can be computed.
   * Each of these enum values can be specified as a local param in a <code>stats.field</code>
   * (eg: <code>stats.field={!min=true mean=true}my_field_name</code>) but not all enum values
   * are valid for all field types (eg: <code>mean</code> is meaningless for String fields)
   *
   * @lucene.internal
   * @lucene.experimental
   */
  public static enum Stat {
    min(true),
    max(true),
    missing(true),
    sum(true),
    count(true),
    // mean & stddev are not computed directly by shards; they are derived on the
    // coordinating node from the distributed sum/count/sumOfSquares (hence selfDep=false)
    mean(false, sum, count),
    sumOfSquares(true),
    stddev(false, sum, count, sumOfSquares),
    distinctValues(true),
    countDistinct(false, distinctValues),
    percentiles(true){
      /** special parsing for percentiles: the param value is a comma separated list of cut-off points **/
      boolean parseParams(StatsField sf) {
        String percentileParas = sf.localParams.get(this.name());
        if (percentileParas != null) {
          List<Double> percentiles = new ArrayList<Double>();
          try {
            for (String percentile : StrUtils.splitSmart(percentileParas, ',')) {
              percentiles.add(Double.parseDouble(percentile));
            }
            if (!percentiles.isEmpty()) {
              sf.percentilesList.addAll(percentiles);
              // optional tuning knob for the underlying digest structure
              sf.tdigestCompression = sf.localParams.getDouble("tdigestCompression",
                                                               sf.tdigestCompression);
              return true;
            }
          } catch (NumberFormatException e) {
            throw new SolrException(ErrorCode.BAD_REQUEST, "Unable to parse "
                + StatsParams.STATS_FIELD + " local params: " + sf.localParams + " due to: "
                + e.getMessage(), e);
          }

        }
        return false;
      }
    },
    cardinality(true) {
      /** special parsing for cardinality: value may be true/false or a 0..1 accuracy/RAM tradeoff **/
      boolean parseParams(StatsField sf) {
        try {
          sf.hllOpts = HllOptions.parseHllOptions(sf.localParams, sf.schemaField);
          return (null != sf.hllOpts);
        } catch (Exception e) {
          throw new SolrException(ErrorCode.BAD_REQUEST, "Unable to parse "
              + StatsParams.STATS_FIELD + " local params: " + sf.localParams + " due to: "
              + e.getMessage(), e);
        }
      }
    };

    // the stats each shard must compute & return so this stat can be merged globally
    private final List<Stat> distribDeps;

    /**
     * Sole constructor for Stat enum values
     * @param deps the set of stat values, other than this one, which are a distributed
     *        dependency and must be computed and returned by each individual shards in
     *        order to compute <i>this</i> stat over the entire distributed result set.
     * @param selfDep indicates that when computing this stat across a distributed result
     *        set, each shard must compute this stat <i>in addition to</i> any other
     *        distributed dependences.
     * @see #getDistribDeps
     */
    Stat(boolean selfDep, Stat... deps) {
      distribDeps = new ArrayList<Stat>(deps.length+1);
      distribDeps.addAll(Arrays.asList(deps));
      if (selfDep) {
        distribDeps.add(this);
      }
    }

    /**
     * Given a String, returns the corresponding Stat enum value if any, otherwise returns null.
     */
    public static Stat forName(String paramKey) {
      try {
        return Stat.valueOf(paramKey);
      } catch (IllegalArgumentException e) {
        // not a recognized stat name -- caller treats null as "not a stat param"
        return null;
      }
    }

    /**
     * The stats that must be computed and returned by each shard involved in a distributed
     * request in order to compute the overall value for this stat across the entire distributed
     * result set. A Stat instance may include itself in the <code>getDistribDeps()</code> result,
     * but that is not always the case.
     */
    public EnumSet<Stat> getDistribDeps() {
      // defensive copy so callers can't mutate our internal dependency list
      return EnumSet.copyOf(this.distribDeps);
    }

    /**
     * Called when the name of a stat is found as a local param on this {@link StatsField}
     * @return true if the user is requesting this stat, else false
     */
    boolean parseParams(StatsField sf) {
      return sf.localParams.getBool(this.name(), false);
    }

  }
| |
  /**
   * the equivalent stats if "calcdistinct" is specified
   * @see Stat#countDistinct
   * @see Stat#distinctValues
   */
  private static final EnumSet<Stat> CALCDISTINCT_PSUEDO_STAT = EnumSet.of(Stat.countDistinct, Stat.distinctValues);

  /**
   * The set of stats computed by default when no localparams are used to specify explicit stats
   */
  public final static Set<Stat> DEFAULT_STATS = Collections.<Stat>unmodifiableSet
    (EnumSet.of(Stat.min, Stat.max, Stat.missing, Stat.sum, Stat.count, Stat.mean, Stat.sumOfSquares, Stat.stddev));

  private final SolrIndexSearcher searcher;
  private final ResponseBuilder rb;
  private final String originalParam; // for error messages
  private final SolrParams localParams;
  private final ValueSource valueSource; // may be null if simple field stats
  private final SchemaField schemaField; // may be null if function/query stats
  private final String key; // output key used for this stat in the response
  private final boolean topLevelCalcDistinct; // request-level stats.calcdistinct default
  private final String[] facets; // sub-facet fields requested via stats.facet (never null)
  private final List<String> tagList; // tags on this stats.field (from the "tag" local param)
  private final List<String> excludeTagList; // filter tags to exclude when building the base DocSet
  private final EnumSet<Stat> statsToCalculate = EnumSet.noneOf(Stat.class); // all stats to compute (incl. distrib deps)
  private final EnumSet<Stat> statsInResponse = EnumSet.noneOf(Stat.class); // stats the user asked to see
  private final List<Double> percentilesList= new ArrayList<Double>(); // requested percentile cut-offs, if any
  private final boolean isShard; // true when this is a per-shard sub-request of a distributed search

  private double tdigestCompression = 100.0D; // digest compression for percentiles (set via "tdigestCompression")
  private HllOptions hllOpts; // non-null iff cardinality was requested
| |
  /**
   * Parses a single stats.field value (with optional local params) and initializes
   * either a {@link SchemaField} or a {@link ValueSource} (exactly one) to compute
   * stats over, along with the output key, facets, tags, and requested stats.
   *
   * @param rb the current request/response
   * @param statsParam the raw {@link StatsParams#STATS_FIELD} string
   */
  public StatsField(ResponseBuilder rb, String statsParam) {
    this.rb = rb;
    this.searcher = rb.req.getSearcher();
    this.originalParam = statsParam;

    SolrParams params = rb.req.getParams();
    try {
      isShard = params.getBool("isShard", false);
      // parse any leading "{!...}" local params off the stats.field value
      SolrParams localParams = QueryParsing.getLocalParams(originalParam, params);
      if (null == localParams) {
        // simplest possible input: bare string (field name)
        ModifiableSolrParams customParams = new ModifiableSolrParams();
        customParams.add(QueryParsing.V, originalParam);
        localParams = customParams;
      }

      this.localParams = localParams;

      String parserName = localParams.get(QueryParsing.TYPE);
      SchemaField sf = null;
      ValueSource vs = null;

      if ( StringUtils.isBlank(parserName) ) {

        // basic request for field stats
        sf = searcher.getSchema().getField(localParams.get(QueryParsing.V));

      } else {
        // we have a non trivial request to compute stats over a query (or function)

        // NOTE we could use QParser.getParser(...) here, but that would redundently
        // reparse everything. ( TODO: refactor a common method in QParser ?)
        QParserPlugin qplug = rb.req.getCore().getQueryPlugin(parserName);
        QParser qp = qplug.createParser(localParams.get(QueryParsing.V),
                                        localParams, params, rb.req);

        // figure out what type of query we are dealing, get the most direct ValueSource
        vs = extractValueSource(qp.parse());

        // if this ValueSource directly corresponds to a SchemaField, act as if
        // we were asked to compute stats on it directly
        // ie: "stats.field={!func key=foo}field(foo)" == "stats.field=foo"
        sf = extractSchemaField(vs, searcher.getSchema());
        if (null != sf) {
          vs = null;
        }
      }

      assert ( (null == vs) ^ (null == sf) ) : "exactly one of vs & sf must be null";

      this.schemaField = sf;
      this.valueSource = vs;

    } catch (SyntaxError e) {
      throw new SolrException(ErrorCode.BAD_REQUEST, "Unable to parse " +
          StatsParams.STATS_FIELD + ": " + originalParam + " due to: "
          + e.getMessage(), e);
    }

    // allow explicit setting of the response key via localparams...
    this.key = localParams.get(CommonParams.OUTPUT_KEY,
                               // default to the main param value...
                               localParams.get(CommonParams.VALUE,
                                               // default to entire original param str.
                                               originalParam));

    // calcdistinct may default on at the top level of the request (globally or per-field)
    this.topLevelCalcDistinct = null == schemaField
        ? params.getBool(StatsParams.STATS_CALC_DISTINCT, false)
        : params.getFieldBool(schemaField.getName(), StatsParams.STATS_CALC_DISTINCT, false);

    populateStatsSets();

    String[] facets = params.getFieldParams(key, StatsParams.STATS_FACET);
    this.facets = (null == facets) ? new String[0] : facets;
    String tagStr = localParams.get(CommonParams.TAG);
    this.tagList = (null == tagStr)
        ? Collections.<String>emptyList()
        : StrUtils.splitSmart(tagStr,',');

    // figure out if we need a special base DocSet
    String excludeStr = localParams.get(CommonParams.EXCLUDE);
    this.excludeTagList = (null == excludeStr)
        ? Collections.<String>emptyList()
        : StrUtils.splitSmart(excludeStr,',');

    assert ( (null == this.valueSource) ^ (null == this.schemaField) )
      : "exactly one of valueSource & schemaField must be null";
  }
| |
| /** |
| * Inspects a {@link Query} to see if it directly maps to a {@link ValueSource}, |
| * and if so returns it -- otherwise wraps it as needed. |
| * |
| * @param q Query whose scores we have been asked to compute stats of |
| * @returns a ValueSource to use for computing the stats |
| */ |
| private static ValueSource extractValueSource(Query q) { |
| return (q instanceof FunctionQuery) ? |
| // Common case: we're wrapping a func, so we can directly pull out ValueSource |
| ((FunctionQuery) q).getValueSource() : |
| // asked to compute stats over a query, wrap it up as a ValueSource |
| new QueryValueSource(q, 0.0F); |
| } |
| |
| /** |
| * Inspects a {@link ValueSource} to see if it directly maps to a {@link SchemaField}, |
| * and if so returns it. |
| * |
| * @param vs ValueSource we've been asked to compute stats of |
| * @param schema The Schema to use |
| * @returns Corrisponding {@link SchemaField} or null if the ValueSource is more complex |
| * @see FieldCacheSource |
| */ |
| private static SchemaField extractSchemaField(ValueSource vs, IndexSchema schema) { |
| if (vs instanceof FieldCacheSource) { |
| String fieldName = ((FieldCacheSource)vs).getField(); |
| return schema.getField(fieldName); |
| } |
| return null; |
| } |
| |
  /**
   * The key to be used when referring to this {@link StatsField} instance in the
   * response to clients.
   */
  public String getOutputKey() {
    return key;
  }
| |
  /**
   * Computes a base {@link DocSet} for the current request to be used
   * when computing global stats for the local index.
   *
   * This is typically the same as the main DocSet for the {@link ResponseBuilder}
   * unless {@link CommonParams#TAG tag}ged filter queries have been excluded using
   * the {@link CommonParams#EXCLUDE ex} local param
   */
  public DocSet computeBaseDocSet() throws IOException {

    DocSet docs = rb.getResults().docSet;
    Map<?,?> tagMap = (Map<?,?>) rb.req.getContext().get("tags");

    if (excludeTagList.isEmpty() || null == tagMap) {
      // either the exclude list is empty, or there
      // aren't any tagged filters to exclude anyway.
      return docs;
    }

    // identity map: we only want to exclude the exact Query instances produced by
    // the tagged QParsers, not any other query that happens to be equals() to them
    IdentityHashMap<Query,Boolean> excludeSet = new IdentityHashMap<Query,Boolean>();
    for (String excludeTag : excludeTagList) {
      Object olst = tagMap.get(excludeTag);
      // tagMap has entries of List<String,List<QParser>>, but subject to change in the future
      if (!(olst instanceof Collection)) continue;
      for (Object o : (Collection<?>)olst) {
        if (!(o instanceof QParser)) continue;
        QParser qp = (QParser)o;
        try {
          excludeSet.put(qp.getQuery(), Boolean.TRUE);
        } catch (SyntaxError e) {
          // this shouldn't be possible since the request should have already
          // failed when attempting to execute the query, but just in case...
          throw new SolrException(ErrorCode.BAD_REQUEST, "Excluded query can't be parsed: " +
              originalParam + " due to: " + e.getMessage(), e);
        }
      }
    }
    if (excludeSet.size() == 0) return docs;

    // rebuild the base DocSet from the main query plus all non-excluded filters
    List<Query> qlist = new ArrayList<Query>();

    // add the base query
    if (!excludeSet.containsKey(rb.getQuery())) {
      qlist.add(rb.getQuery());
    }

    // add the filters
    if (rb.getFilters() != null) {
      for (Query q : rb.getFilters()) {
        if (!excludeSet.containsKey(q)) {
          qlist.add(q);
        }
      }
    }

    // get the new base docset for this facet
    return searcher.getDocSet(qlist);
  }
| |
| /** |
| * Computes the {@link StatsValues} for this {@link StatsField} relative to the |
| * specified {@link DocSet} |
| * @see #computeBaseDocSet |
| */ |
| public StatsValues computeLocalStatsValues(DocSet base) throws IOException { |
| |
| if (statsToCalculate.isEmpty()) { |
| // perf optimization for the case where we compute nothing |
| // ie: stats.field={!min=$domin}myfield&domin=false |
| return StatsValuesFactory.createStatsValues(this); |
| } |
| |
| if (null != schemaField |
| && (schemaField.multiValued() || schemaField.getType().multiValuedFieldCache())) { |
| |
| // TODO: should this also be used for single-valued string fields? (should work fine) |
| return DocValuesStats.getCounts(searcher, this, base, facets); |
| } else { |
| // either a single valued field we pull from FieldCache, or an explicit |
| // function ValueSource |
| return computeLocalValueSourceStats(base); |
| } |
| } |
| |
  /**
   * Accumulates stats by walking the base DocSet in (global) docid order, advancing
   * through the index leaves as needed and delegating per-doc accumulation to the
   * {@link StatsValues} (and any per-facet accumulators).
   */
  private StatsValues computeLocalValueSourceStats(DocSet base) throws IOException {

    IndexSchema schema = searcher.getSchema();

    final StatsValues allstats = StatsValuesFactory.createStatsValues(this);

    List<FieldFacetStats> facetStats = new ArrayList<>();
    for( String facetField : facets ) {
      SchemaField fsf = schema.getField(facetField);

      if ( fsf.multiValued()) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
            "Stats can only facet on single-valued fields, not: " + facetField );
      }

      facetStats.add(new FieldFacetStats(searcher, fsf, this));
    }

    final Iterator<LeafReaderContext> ctxIt = searcher.getIndexReader().leaves().iterator();
    LeafReaderContext ctx = null;
    for (DocIterator docsIt = base.iterator(); docsIt.hasNext(); ) {
      final int doc = docsIt.nextDoc();
      if (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc()) {
        // advance to the leaf containing this global doc id
        // (doc ids arrive in increasing order, so we never need to rewind)
        do {
          ctx = ctxIt.next();
        } while (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc());
        assert doc >= ctx.docBase;

        // propagate the context among accumulators.
        allstats.setNextReader(ctx);
        for (FieldFacetStats f : facetStats) {
          f.setNextReader(ctx);
        }
      }

      // accumulate, using the leaf-relative doc id
      allstats.accumulate(doc - ctx.docBase);
      for (FieldFacetStats f : facetStats) {
        f.facet(doc - ctx.docBase);
      }
    }

    for (FieldFacetStats f : facetStats) {
      allstats.addFacet(f.name, f.facetStatsValues);
    }
    return allstats;
  }
| |
  /**
   * The searcher that should be used for processing local stats
   * @see SolrQueryRequest#getSearcher
   */
  public SolrIndexSearcher getSearcher() {
    // see AbstractStatsValues.setNextReader

    return searcher;
  }
| |
  /**
   * The {@link SchemaField} whose results these stats are computed over, may be null
   * if the stats are computed over the results of a function or query
   *
   * @see #getValueSource
   */
  public SchemaField getSchemaField() {
    return schemaField;
  }

  /**
   * The {@link ValueSource} of a function or query whose results these stats are computed
   * over, may be null if the stats are directly over a {@link SchemaField}
   *
   * @see #getSchemaField
   */
  public ValueSource getValueSource() {
    return valueSource;
  }
| |
  /**
   * Tags specified on this stats.field instance via the {@link CommonParams#TAG}
   * local param; empty if none were specified.
   */
  public List<String> getTagList() {
    return tagList;
  }
| |
| public String toString() { |
| return "StatsField<" + originalParam + ">"; |
| } |
| |
  /**
   * A helper method which inspects the {@link #localParams} associated with this StatsField,
   * and uses them to populate the {@link #statsInResponse} and {@link #statsToCalculate} data
   * structures
   */
  private void populateStatsSets() {
    boolean statSpecifiedByLocalParam = false;
    // local individual stat
    Iterator<String> itParams = localParams.getParameterNamesIterator();

    while (itParams.hasNext()) {
      String paramKey = itParams.next();
      Stat stat = Stat.forName(paramKey);
      if (stat != null) {
        // any recognized stat name (even e.g. "min=false") suppresses the default set
        statSpecifiedByLocalParam = true;
        if (stat.parseParams(this)) {
          statsInResponse.add(stat);
        }
      }
    }

    // if no individual stat setting use the default set
    if ( ! ( statSpecifiedByLocalParam
             // calcdistinct (as a local param) is a pseudo-stat, prevents default set
             || localParams.getBool("calcdistinct", false) ) ) {
      statsInResponse.addAll(DEFAULT_STATS);
    }

    // calcDistinct is a pseudo-stat with optional top level param default behavior
    // if not overridden by the specific individual stats
    if (localParams.getBool("calcdistinct", topLevelCalcDistinct)) {
      for (Stat stat : CALCDISTINCT_PSUEDO_STAT) {
        // assume true, but don't include if specific stat overrides
        if (localParams.getBool(stat.name(), true)) {
          statsInResponse.add(stat);
        }
      }
    }

    // everything we return to the user, plus its distributed dependencies, must be computed
    for (Stat stat : statsInResponse) {
      statsToCalculate.addAll(stat.getDistribDeps());
    }
  }
| |
  /**
   * Whether the given stat must be computed locally -- either because it was requested
   * directly, or because it is a distributed dependency of a requested stat.
   */
  public boolean calculateStats(Stat stat) {
    return statsToCalculate.contains(stat);
  }
| |
| public boolean includeInResponse(Stat stat) { |
| if (isShard) { |
| return statsToCalculate.contains(stat); |
| } |
| |
| if (statsInResponse.contains(stat)) { |
| return true; |
| } |
| return false; |
| } |
| |
  /** The percentile cut-off points requested via the "percentiles" local param; empty if none. */
  public List<Double> getPercentilesList() {
    return percentilesList;
  }

  /** True when this is a per-shard sub-request of a distributed search. */
  public boolean getIsShard() {
    return isShard;
  }

  /** The digest compression to use when computing percentiles (set via "tdigestCompression"). */
  public double getTdigestCompression() {
    return tdigestCompression;
  }

  /** Options for cardinality (HLL) computation, or null if cardinality was not requested. */
  public HllOptions getHllOptions() {
    return hllOpts;
  }
| |
| /** |
| * Helper Struct for parsing and encapsulating all of the options relaed to building a {@link HLL} |
| * |
| * @see Stat#cardinality |
| * @lucene.internal |
| */ |
| public static final class HllOptions { |
| final HashFunction hasher; |
| |
| // NOTE: this explanation linked to from the java-hll jdocs... |
| // https://github.com/aggregateknowledge/postgresql-hll/blob/master/README.markdown#explanation-of-parameters-and-tuning |
| // ..if i'm understanding the regwidth chart correctly, a value of 6 should be a enough |
| // to support any max cardinality given that we're always dealing with hashes and |
| // the cardinality of the set of all long values is 2**64 == 1.9e19 |
| // |
| // But i guess that assumes a *perfect* hash and high log2m? ... if the hash algo is imperfect |
| // and/or log2m is low (ie: user is less concerned about accuracy), then many diff hash values |
| // might fall in the same register (ie: bucket) and having a wider register to count more of |
| // them may be useful |
| |
| final int log2m; |
| final int regwidth; |
| |
| final static String ERR = "cardinality must be specified as 'true' (for default tunning) or decimal number between 0 and 1 to adjust accuracy vs memory usage (large number is more memory and more accuracy)"; |
| |
| private HllOptions(int log2m, int regwidth, HashFunction hasher) { |
| this.log2m = log2m; |
| this.regwidth = regwidth; |
| this.hasher = hasher; |
| } |
| /** |
| * Creates an HllOptions based on the (local) params specified (if appropriate). |
| * |
| * @param localParams the LocalParams for this {@link StatsField} |
| * @param field the field corrisponding to this {@link StatsField}, may be null if these stats are over a value source |
| * @return the {@link HllOptions} to use basd on the params, or null if no {@link HLL} should be computed |
| * @throws SolrException if there are invalid options |
| */ |
| public static HllOptions parseHllOptions(SolrParams localParams, SchemaField field) |
| throws SolrException { |
| |
| String cardinalityOpt = localParams.get(Stat.cardinality.name()); |
| if (StringUtils.isBlank(cardinalityOpt)) { |
| return null; |
| } |
| |
| final NumericType hashableNumType = getHashableNumericType(field); |
| |
| // some sane defaults |
| int log2m = 13; // roughly equivilent to "cardinality='0.33'" |
| int regwidth = 6; // with decent hash, this is plenty for all valid long hashes |
| |
| if (NumericType.FLOAT.equals(hashableNumType) || NumericType.INT.equals(hashableNumType)) { |
| // for 32bit values, we can adjust our default regwidth down a bit |
| regwidth--; |
| |
| // NOTE: EnumField uses NumericType.INT, and in theory we could be super conservative |
| // with it, but there's no point - just let the EXPLICIT HLL handle it |
| } |
| |
| // TODO: we could attempt additional reductions in the default regwidth based on index |
| // statistics -- but thta doesn't seem worth the effort. for tiny indexes, the |
| // EXPLICIT and SPARSE HLL representations have us nicely covered, and in general we don't |
| // want to be too aggresive about lowering regwidth or we could really poor results if |
| // log2m is also low and there is heavy hashkey collision |
| |
| try { |
| // NFE will short out here if it's not a number |
| final double accuracyOpt = Double.parseDouble(cardinalityOpt); |
| |
| // if a float between 0 and 1 is specified, treat it as a prefrence of accuracy |
| // - 0 means accuracy is not a concern, save RAM |
| // - 1 means be as accurate as possible, using as much RAM as needed. |
| |
| if (accuracyOpt < 0D || 1.0D < accuracyOpt) { |
| throw new SolrException(ErrorCode.BAD_REQUEST, ERR); |
| } |
| |
| // use accuracyOpt as a scaling factor between min & max legal log2m values |
| log2m = HLL.MINIMUM_LOG2M_PARAM |
| + (int) Math.round(accuracyOpt * (HLL.MAXIMUM_LOG2M_PARAM - HLL.MINIMUM_LOG2M_PARAM)); |
| |
| // use accuracyOpt as a scaling factor for regwidth as well, BUT... |
| // be more conservative -- HLL.MIN_REGWIDTH_PARAM is too absurdly low to be useful |
| // use previously computed (hashableNumType) default regwidth -1 as lower bound for scaling |
| final int MIN_HUERISTIC_REGWIDTH = regwidth-1; |
| regwidth = MIN_HUERISTIC_REGWIDTH |
| + (int) Math.round(accuracyOpt * (HLL.MAXIMUM_REGWIDTH_PARAM - MIN_HUERISTIC_REGWIDTH)); |
| |
| } catch (NumberFormatException nfe) { |
| // param value isn't a number -- let's check for simple true/false |
| if (! localParams.getBool(Stat.cardinality.name(), false)) { |
| return null; |
| } |
| } |
| |
| // let explicit params override both the default and/or any accuracy specification |
| log2m = localParams.getInt("hllLog2m", log2m); |
| regwidth = localParams.getInt("hllRegwidth", regwidth); |
| |
| // validate legal values |
| if (log2m < HLL.MINIMUM_LOG2M_PARAM || HLL.MAXIMUM_LOG2M_PARAM < log2m) { |
| throw new SolrException(ErrorCode.BAD_REQUEST, "hllLog2m must be at least " + |
| HLL.MINIMUM_LOG2M_PARAM + " and at most " + HLL.MAXIMUM_LOG2M_PARAM |
| + " (" + log2m +")"); |
| } |
| if (regwidth < HLL.MINIMUM_REGWIDTH_PARAM || HLL.MAXIMUM_REGWIDTH_PARAM < regwidth) { |
| throw new SolrException(ErrorCode.BAD_REQUEST, "hllRegwidth must be at least " + |
| HLL.MINIMUM_REGWIDTH_PARAM + " and at most " + HLL.MAXIMUM_REGWIDTH_PARAM); |
| } |
| |
| HashFunction hasher = localParams.getBool("hllPreHashed", false) ? null : Hashing.murmur3_128(); |
| |
| if (null == hasher) { |
| // if this is a function, or a non Long field, pre-hashed is invalid |
| // NOTE: we ignore hashableNumType - it's LONG for non numerics like Strings |
| if (null == field || !NumericType.LONG.equals(field.getType().getNumericType())) { |
| throw new SolrException(ErrorCode.BAD_REQUEST, "hllPreHashed is only supported with Long based fields"); |
| } |
| } |
| |
| // if we're still here, then we need an HLL... |
| return new HllOptions(log2m, regwidth, hasher); |
| } |
| /** @see HLL */ |
| public int getLog2m() { |
| return log2m; |
| } |
| /** @see HLL */ |
| public int getRegwidth() { |
| return regwidth; |
| } |
| /** May be null if user has indicated that field values are pre-hashed */ |
| public HashFunction getHasher() { |
| return hasher; |
| } |
| public HLL newHLL() { |
| // Although it (in theory) saves memory for "medium" size sets, the SPARSE type seems to have |
| // some nasty impacts on response time as it gets larger - particularly in distrib requests. |
| // Merging large SPARSE HLLs is much much slower then merging FULL HLLs with the same num docs |
| // |
| // TODO: add more tunning options for this. |
| return new HLL(getLog2m(), getRegwidth(), -1 /* auto explict threshold */, |
| false /* no sparse representation */, HLLType.EMPTY); |
| |
| } |
| } |
| |
| /** |
| * Returns the effective {@link NumericType} for the field for the purposes of hash values. |
| * ie: If the field has an explict NumericType that is returned; If the field has no explicit |
| * NumericType then {@link NumericType#LONG} is returned; If field is null, then |
| * {@link NumericType#FLOAT} is assumed for ValueSource. |
| */ |
| private static NumericType getHashableNumericType(SchemaField field) { |
| if (null == field) { |
| return NumericType.FLOAT; |
| } |
| final NumericType result = field.getType().getNumericType(); |
| return null == result ? NumericType.LONG : result; |
| } |
| } |