| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.solr.handler.component; |
| |
| import java.lang.invoke.MethodHandles; |
| import java.nio.charset.StandardCharsets; |
| |
| import java.util.Arrays; |
| import java.util.Collections; |
| import java.util.List; |
| import java.util.Map; |
| |
| import org.apache.lucene.util.TestUtil; |
| import org.apache.lucene.util.LuceneTestCase.Slow; |
| |
| import org.apache.solr.BaseDistributedSearchTestCase; |
| import org.apache.solr.SolrTestCaseJ4.SuppressSSL; |
| import org.apache.solr.client.solrj.response.FieldStatsInfo; |
| import org.apache.solr.client.solrj.response.QueryResponse; |
| import org.apache.solr.common.params.SolrParams; |
| import org.apache.solr.common.params.ModifiableSolrParams; |
| |
| import org.apache.solr.util.hll.HLL; |
| import com.google.common.hash.Hashing; |
| import com.google.common.hash.HashFunction; |
| |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| @Slow |
| @SuppressSSL(bugUrl = "https://issues.apache.org/jira/browse/SOLR-9062") |
| public class TestDistributedStatsComponentCardinality extends BaseDistributedSearchTestCase { |
| |
| private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); |
| |
| final static HashFunction HASHER = Hashing.murmur3_128(); |
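  // (murmur3_128 is assumed to match the hash Solr applies internally when
  // hllPreHashed is not specified -- that's what lets the hashed vs prehashed
  // assertions below expect exact equality)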
| |
| final static long BIG_PRIME = 982451653L; |
| |
| final static int MIN_NUM_DOCS = 10000; |
| final static int MAX_NUM_DOCS = MIN_NUM_DOCS * 2; |
| |
| final static List<String> STAT_FIELDS = |
| Collections.unmodifiableList(Arrays.asList( "int_i", "long_l", "string_s" )); |
| |
| final int NUM_DOCS; |
| final long MAX_LONG; |
| final long MIN_LONG; |
| |
| public TestDistributedStatsComponentCardinality() { |
| // we need DVs on point fields to compute stats |
| if (Boolean.getBoolean(NUMERIC_POINTS_SYSPROP)) System.setProperty(NUMERIC_DOCVALUES_SYSPROP,"true"); |
| |
| // we want some randomness in the shard number, but we don't want multiple iterations |
| fixShardCount(TEST_NIGHTLY ? 7 : random().nextInt(3) + 1); |
| |
| handle.put("maxScore", SKIPVAL); |
    NUM_DOCS = TestUtil.nextInt(random(), MIN_NUM_DOCS, MAX_NUM_DOCS);
| MAX_LONG = TestUtil.nextLong(random(), 0, NUM_DOCS * BIG_PRIME); |
| MIN_LONG = MAX_LONG - (((long)NUM_DOCS-1) * BIG_PRIME); |
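    // (i.e. the NUM_DOCS indexed long values MAX_LONG, MAX_LONG - BIG_PRIME, ...,
    // MIN_LONG form an evenly spaced sequence of distinct values)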
| } |
| |
| /** CAUTION: this builds a very large index */ |
| public void buildIndex() throws Exception { |
| log.info("Building an index of {} docs", NUM_DOCS); |
| |
    // we want a big spread in the long values we use, so decrement by BIG_PRIME as we index
| long longValue = MAX_LONG; |
| |
| for (int i = 1; i <= NUM_DOCS; i++) { |
      // with these values, every doc indexed has a unique value in each of the
      // fields we will compute cardinality against, which means the number of docs
      // matching a query is the true cardinality for each field
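      // (doc i gets int_i=i, long_l=MAX_LONG-(i-1)*BIG_PRIME, and string_s="s"+long_l,
      // each distinct across all docs)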
| |
| final String strValue = "s"+longValue; |
| indexDoc(sdoc("id","" + i, |
| "int_i", ""+i, |
| "int_i_prehashed_l", ""+HASHER.hashInt(i).asLong(), |
| "long_l", ""+longValue, |
| "long_l_prehashed_l", ""+HASHER.hashLong(longValue).asLong(), |
| "string_s", strValue, |
| "string_s_prehashed_l", ""+HASHER.hashString(strValue, StandardCharsets.UTF_8).asLong())); |
| |
| longValue -= BIG_PRIME; |
| } |
| |
    commit();
  }
| |
| public void test() throws Exception { |
| buildIndex(); |
| |
| { // simple sanity checks - don't leak variables |
      QueryResponse rsp = query(params("rows", "0", "q", "id:42"));
| assertEquals(1, rsp.getResults().getNumFound()); |
| |
| rsp = query(params("rows", "0", "q", "*:*", |
| "stats","true", "stats.field", "{!min=true max=true}long_l")); |
| assertEquals(NUM_DOCS, rsp.getResults().getNumFound()); |
| assertEquals(MIN_LONG, Math.round((double) rsp.getFieldStatsInfo().get("long_l").getMin())); |
| assertEquals(MAX_LONG, Math.round((double) rsp.getFieldStatsInfo().get("long_l").getMax())); |
| } |
| |
| final int NUM_QUERIES = atLeast(100); |
| |
| // Some Randomized queries with randomized log2m and max regwidth |
| for (int i = 0; i < NUM_QUERIES; i++) { |
| |
      // testing shows that on random data, at the sizes we're dealing with,
      // MINIMUM_LOG2M_PARAM is just too absurdly small to give anything remotely close to
      // the theoretically expected relative error.
      //
      // So we have to use a slightly higher lower bound on what log2m values we randomly test
| final int log2m = TestUtil.nextInt(random(), |
| 2 + HLL.MINIMUM_LOG2M_PARAM, |
| HLL.MAXIMUM_LOG2M_PARAM); |
| |
| // use max regwidth to try and prevent hash collisions from introducing problems |
| final int regwidth = HLL.MAXIMUM_REGWIDTH_PARAM; |
| |
| final int lowId = TestUtil.nextInt(random(), 1, NUM_DOCS-2000); |
| final int highId = TestUtil.nextInt(random(), lowId+1000, NUM_DOCS); |
| final int numMatches = 1+highId-lowId; |
| |
| SolrParams p = buildCardinalityQ(lowId, highId, log2m, regwidth); |
| QueryResponse rsp = query(p); |
| assertEquals("sanity check num matches, p="+p, numMatches, rsp.getResults().getNumFound()); |
| |
| Map<String,FieldStatsInfo> stats = rsp.getFieldStatsInfo(); |
| |
| if (Boolean.getBoolean(NUMERIC_POINTS_SYSPROP)) { |
| log.warn("SOLR-10918: can't relying on exact match with pre-hashed values when using points"); |
| } else { |
| for (String f : STAT_FIELDS) { |
| // regardless of log2m and regwidth, the estimated cardinality of the |
| // hashed vs prehashed values should be exactly the same for each field |
| |
| assertEquals(f + ": hashed vs prehashed, real="+ numMatches + ", p=" + p, |
| stats.get(f).getCardinality().longValue(), |
| stats.get(f+"_prehashed_l").getCardinality().longValue()); |
| } |
| } |
| |
| for (String f : STAT_FIELDS) { |
| // check the relative error of the estimate returned against the known truth |
| |
| final double relErr = expectedRelativeError(log2m); |
| final long estimate = stats.get(f).getCardinality().longValue(); |
| assertTrue(f + ": relativeErr="+relErr+", estimate="+estimate+", real="+numMatches+", p=" + p, |
| (Math.abs(numMatches - estimate) / numMatches) < relErr); |
| |
| } |
| } |
| |
| // Some Randomized queries with both low and high accuracy options |
| for (int i = 0; i < NUM_QUERIES; i++) { |
| |
| final int lowId = TestUtil.nextInt(random(), 1, NUM_DOCS-2000); |
| final int highId = TestUtil.nextInt(random(), lowId+1000, NUM_DOCS); |
| final int numMatches = 1+highId-lowId; |
| |
      // WTF? - https://github.com/aggregateknowledge/java-hll/issues/15
      //
      // apparently we can't rely on estimates always being more accurate with higher log2m values?
      // so for now, just try testing accuracy values that differ by at least 0.5
      //
      // (that should give us a significant enough log2m diff that the "highAccuracy" is always
      // more accurate -- if not, then the entire premise of the float value is fundamentally bogus)
| // |
| final double lowAccuracy = random().nextDouble() / 2; |
| // final double highAccuracy = Math.min(1.0D, lowAccuracy + (random().nextDouble() / 2)); |
| final double highAccuracy = Math.min(1.0D, lowAccuracy + 0.5D); |
| |
| SolrParams p = buildCardinalityQ(lowId, highId, lowAccuracy, highAccuracy); |
| QueryResponse rsp = query(p); |
| assertEquals("sanity check num matches, p="+p, numMatches, rsp.getResults().getNumFound()); |
| |
| Map<String,FieldStatsInfo> stats = rsp.getFieldStatsInfo(); |
| |
| // can't use STAT_FIELDS here ... |
| // |
      // heuristic differences for regwidth on 32 bit values mean we get differences
| // between estimates for the normal field vs the prehashed (long) field |
| // |
| // so we settle for only testing things where the regwidth is consistent |
| // w/the prehashed long... |
| for (String f : new String[] { "long_l", "string_s" }) { |
| |
| // regardless of accuracy, the estimated cardinality of the |
| // hashed vs prehashed values should be exactly the same for each field |
| |
| assertEquals(f + ": hashed vs prehashed (low), real="+ numMatches + ", p=" + p, |
| stats.get("low_"+f).getCardinality().longValue(), |
| stats.get("low_"+f+"_prehashed_l").getCardinality().longValue()); |
| assertEquals(f + ": hashed vs prehashed (high), real="+ numMatches + ", p=" + p, |
| stats.get("high_"+f).getCardinality().longValue(), |
| stats.get("high_"+f+"_prehashed_l").getCardinality().longValue()); |
| } |
| |
| for (String f : STAT_FIELDS) { |
| for (String ff : new String[] { f, f+"_prehashed_l"}) { |
| // for both the prehashed and regular fields, the high accuracy option |
| // should always produce an estimate at least as good as the low accuracy option |
| |
| long poorEst = stats.get("low_"+ff).getCardinality(); |
| long goodEst = stats.get("high_"+ff).getCardinality(); |
| assertTrue(ff + ": goodEst="+goodEst+", poorEst="+poorEst+", real="+numMatches+", p=" + p, |
| Math.abs(numMatches - goodEst) <= Math.abs(numMatches - poorEst)); |
| } |
| } |
| } |
| } |
| |
| /** |
   * Returns the (max) expected relative error according to the HLL algorithm docs
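   *
   * For example: log2m=13 gives m=8192 registers, so the theoretical relative error is
   * 1.04/sqrt(8192) ~= 0.0115, and the padded bound returned is 1.1/sqrt(8192) ~= 0.0122.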
| */ |
| private static double expectedRelativeError(final int log2m) { |
| final long m = 1 << log2m; |
    // the theoretical error is 1.04 / sqrt(m)
| // fudge slightly to account for variance in random data |
| return 1.1D / Math.sqrt(m); |
| } |
| |
| /** |
| * Helper utility for building up a set of query params. |
| * |
| * The main query is a simple range query against the id field (using lowId TO highId). |
| * 2 stats.field params are generated for every field in {@link #STAT_FIELDS} -- |
| * both with and w/o a prehashed_l suffix -- using the specified log2m and regwidth. |
| * |
| * The response keys will be the full field names |
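   *
   * For example, log2m=13 and regwidth=6 produce (for the long_l field):
   *   stats.field={!cardinality=true hllLog2m=13 hllRegwidth=6}long_l
   *   stats.field={!cardinality=true hllLog2m=13 hllRegwidth=6 hllPreHashed=true}long_l_prehashed_l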
| */ |
| private static SolrParams buildCardinalityQ(final int lowId, |
| final int highId, |
| final int log2m, |
| final int regwidth) { |
| ModifiableSolrParams p = params("q", "id_i1:["+lowId+" TO "+highId+"]", |
| "rows", "0", "stats", "true"); |
| final String prefix = "{!cardinality=true hllLog2m="+log2m+" hllRegwidth="+regwidth; |
| for (String f : STAT_FIELDS) { |
| p.add("stats.field", prefix+"}"+f); |
| p.add("stats.field", prefix+" hllPreHashed=true}"+f+"_prehashed_l"); |
| } |
| return p; |
| } |
| |
| /** |
| * Helper utility for building up a set of query params. |
| * |
| * The main query is a simple range query against the id field (using lowId TO highId). |
| * 4 stats.field params are generated for every field in {@link #STAT_FIELDS} -- |
| * both with and w/o a prehashed_l suffix, and using both the low and high accuracy values |
| * |
| * The response keys will be the full field names with either a "low_" or "high_" prefix |
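   *
   * For example, lowAccuracy=0.2 and highAccuracy=0.7 produce (for the long_l field):
   *   stats.field={!cardinality=0.2 key=low_long_l}long_l
   *   stats.field={!cardinality=0.2 key=low_long_l_prehashed_l hllPreHashed=true}long_l_prehashed_l
   *   stats.field={!cardinality=0.7 key=high_long_l}long_l
   *   stats.field={!cardinality=0.7 key=high_long_l_prehashed_l hllPreHashed=true}long_l_prehashed_l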
| */ |
| private static SolrParams buildCardinalityQ(final int lowId, |
| final int highId, |
| final double lowAccuracy, |
| final double highAccuracy) { |
| ModifiableSolrParams p = params("q", "id_i1:["+lowId+" TO "+highId+"]", |
| "rows", "0", "stats", "true"); |
| final String[] prefixes = new String[] { |
| "{!cardinality=" + lowAccuracy + " key=low_", |
| "{!cardinality=" + highAccuracy + " key=high_" |
| }; |
| |
| for (String f : STAT_FIELDS) { |
| for (String prefix : prefixes) { |
| p.add("stats.field", prefix+f+"}"+f); |
| p.add("stats.field", prefix+f+"_prehashed_l hllPreHashed=true}"+f+"_prehashed_l"); |
| } |
| } |
| return p; |
| } |
| } |