blob: 2b2a6f101fadf8c58562b0db8755b4720577afce [file] [log] [blame]
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.solr.BaseDistributedSearchTestCase;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.junit.Test;
* A test that demonstrates some of the expected behavior of "long tail" terms when using <code>refine:simple</code>
* <p>
* <b>NOTE:</b> This test ignores the control collection (in single node mode, there is no
* need for the overrequesting, all the data is local -- so comparisons with it wouldn't
* be valid in the cases we are testing here)
* </p>
* <p>
* <b>NOTE:</b> This test is heavily inspired by (and uses the same indexed documents as)
* {@link org.apache.solr.handler.component.DistributedFacetPivotLongTailTest} -- however the behavior of
* <code>refine:simple</code> is "simpler" than the refinement logic used by
* <code>facet.pivot</code> so the assertions in this test vary from that test.
* </p>
@LuceneTestCase.Nightly // can be slow
public class DistributedFacetSimpleRefinementLongTailTest extends BaseDistributedSearchTestCase {
/** Names of the stat functions requested under every facet bucket; assigned by the constructor, cleared in distribTearDown. */
private static List<String> ALL_STATS;
/** Name of the field the stats are computed over; the constructor randomly picks "stat_is" or "stat_i". */
private final String STAT_FIELD;
/** json.facet fragment of the form <code>name:'name(field)',</code> for each entry of ALL_STATS (note the trailing comma); built by the constructor. */
private String ALL_STATS_JSON = "";
public DistributedFacetSimpleRefinementLongTailTest() {
// we need DVs on point fields to compute stats & facets
if (Boolean.getBoolean(NUMERIC_POINTS_SYSPROP)) System.setProperty(NUMERIC_DOCVALUES_SYSPROP,"true");
STAT_FIELD = random().nextBoolean() ? "stat_is" : "stat_i";
ALL_STATS = Arrays.asList("min", "max", "sum", "stddev", "avg", "sumsq", "unique",
"missing", "countvals", "percentile", "variance", "hll");
for (String stat : ALL_STATS) {
String val = stat.equals("percentile")? STAT_FIELD+",90": STAT_FIELD;
ALL_STATS_JSON += stat + ":'" + stat + "(" + val + ")',";
public void distribTearDown() throws Exception {
ALL_STATS = null;
@ShardsFixed(num = 3)
public void test() throws Exception {
buildIndexes(clients, STAT_FIELD);
public static void buildIndexes(final List<SolrClient> clients, final String statField) throws Exception {
assertEquals("This indexing code assumes exactly 3 shards/clients", 3, clients.size());
final AtomicInteger docNum = new AtomicInteger();
final SolrClient shard0 = clients.get(0);
final SolrClient shard1 = clients.get(1);
final SolrClient shard2 = clients.get(2);
// the 5 top foo_s terms have 100 docs each on every shard
for (int i = 0; i < 100; i++) {
for (int j = 0; j < 5; j++) {
shard0.add(sdoc("id", docNum.incrementAndGet(), "foo_s", "aaa"+j, statField, j * 13 - i));
shard1.add(sdoc("id", docNum.incrementAndGet(), "foo_s", "aaa"+j, statField, j * 3 + i));
shard2.add(sdoc("id", docNum.incrementAndGet(), "foo_s", "aaa"+j, statField, i * 7 + j));
// 20 foo_s terms that come in "second" with 50 docs each
// on both shard0 & shard1 ("bbb_")
for (int i = 0; i < 50; i++) {
for (int j = 0; j < 20; j++) {
shard0.add(sdoc("id", docNum.incrementAndGet(), "foo_s", "bbb"+j, statField, 0));
shard1.add(sdoc("id", docNum.incrementAndGet(), "foo_s", "bbb"+j, statField, 1));
// distracting term appears on only on shard2 50 times
shard2.add(sdoc("id", docNum.incrementAndGet(), "foo_s", "junkA"));
// put "bbb0" on shard2 exactly once to sanity check refinement
shard2.add(sdoc("id", docNum.incrementAndGet(), "foo_s", "bbb0", statField, -2));
// long 'tail' foo_s term appears in 45 docs on every shard
// foo_s:tail is the only term with bar_s sub-pivot terms
for (int i = 0; i < 45; i++) {
// for sub-pivot, shard0 & shard1 have 6 docs each for "tailB"
// but the top 5 terms are ccc(0-4) -- 7 on each shard
// (4 docs each have junk terms)
String sub_term = (i < 35) ? "ccc"+(i % 5) : ((i < 41) ? "tailB" : "junkA");
shard0.add(sdoc("id", docNum.incrementAndGet(), "foo_s", "tail", "bar_s", sub_term, statField, i));
shard1.add(sdoc("id", docNum.incrementAndGet(), "foo_s", "tail", "bar_s", sub_term, statField, i));
// shard2's top 5 sub-pivot terms are junk only it has with 8 docs each
// and 5 docs that use "tailB"
// NOTE: none of these get statField ! !
sub_term = (i < 40) ? "junkB"+(i % 5) : "tailB";
shard2.add(sdoc("id", docNum.incrementAndGet(), "foo_s", "tail", "bar_s", sub_term));
// really long tail uncommon foo_s terms on shard2
for (int i = 0; i < (TEST_NIGHTLY ? 30 : 10); i++) {
// NOTE: using "Z" here so these sort before bbb0 when they tie for '1' instance each on shard2
shard2.add(sdoc("id", docNum.incrementAndGet(), "foo_s", "ZZZ"+i));
private void sanityCheckIndividualShards() throws Exception {
// sanity check that our expectations about each shard (non-distrib) are correct
SolrParams req = params( "q", "*:*", "distrib", "false", "json.facet",
" { foo:{ type:terms, limit:10, field:foo_s, facet:{ bar:{ type:terms, limit:10, field:bar_s }}}}");
List<NamedList>[] shardFooBuckets = new List[clients.size()];
for (int i = 0; i < clients.size(); i++) {
shardFooBuckets[i] = (List<NamedList>)
((NamedList<NamedList>)clients.get(i).query( req ).getResponse().get("facets")).get("foo").get("buckets");
// top 5 same on all shards
for (int i = 0; i < 3; i++) {
assertEquals(10, shardFooBuckets[i].size());
for (int j = 0; j < 5; j++) {
NamedList bucket = shardFooBuckets[i].get(j);
assertEquals(bucket.toString(), "aaa"+j, bucket.get("val"));
assertEquals(bucket.toString(), 100L, bucket.get("count"));
// top 6-10 same on shard0 & shard1
for (int i = 0; i < 2; i++) {
for (int j = 5; j < 10; j++) {
NamedList bucket = shardFooBuckets[i].get(j);
assertTrue(bucket.toString(), bucket.get("val").toString().startsWith("bbb"));
assertEquals(bucket.toString(), 50L, bucket.get("count"));
// 6-10 on shard2
assertEquals("junkA", shardFooBuckets[2].get(5).get("val"));
assertEquals(50L, shardFooBuckets[2].get(5).get("count"));
assertEquals("tail", shardFooBuckets[2].get(6).get("val"));
assertEquals(45L, shardFooBuckets[2].get(6).get("count"));
for (int j = 7; j < 10; j++) {
NamedList bucket = shardFooBuckets[2].get(j);
assertTrue(bucket.toString(), bucket.get("val").toString().startsWith("ZZZ"));
assertEquals(bucket.toString(), 1L, bucket.get("count"));
// check 'bar' sub buckets on "tail" from shard2
{ List<NamedList> bar_buckets = (List<NamedList>) ((NamedList<NamedList>) shardFooBuckets[2].get(6).get("bar")).get("buckets");
assertEquals(6, bar_buckets.size());
for (int j = 0; j < 5; j++) {
NamedList bucket = bar_buckets.get(j);
assertTrue(bucket.toString(), bucket.get("val").toString().startsWith("junkB"));
assertEquals(bucket.toString(), 8L, bucket.get("count"));
NamedList bucket = bar_buckets.get(5);
assertEquals("tailB", bucket.get("val"));
assertEquals(5L, bucket.get("count"));
private void checkRefinementAndOverrequesting() throws Exception {
// // distributed queries // //
{ // w/o refinement, the default overrequest isn't enough to find the long 'tail' *OR* the correct count for 'bbb0'...
List<NamedList> foo_buckets = (List<NamedList>)
queryServer( params( "q", "*:*", "shards", getShardsString(), "json.facet",
"{ foo: { type:terms, refine:none, limit:6, field:foo_s } }"
) ).getResponse().get("facets")).get("foo").get("buckets");
assertEquals(6, foo_buckets.size());
for (int i = 0; i < 5; i++) {
NamedList bucket = foo_buckets.get(i);
assertTrue(bucket.toString(), bucket.get("val").toString().startsWith("aaa"));
assertEquals(bucket.toString(), 300L, bucket.get("count"));
// this will be short the "+1" fo the doc added to shard2...
NamedList bucket = foo_buckets.get(5);
assertTrue(bucket.toString(), bucket.get("val").equals("bbb0")); // 'tail' is missed
assertEquals(bucket.toString(), 100L, bucket.get("count")); // will not include the "+1" for the doc added to shard2
// even if we enable refinement, we still won't find the long 'tail' ...
// regardless of wether we use either the default overrequest, or disable overrequesting...
for (String over : Arrays.asList( "", "overrequest:0,")) {
List<NamedList> foo_buckets = (List<NamedList>)
queryServer( params( "q", "*:*", "shards", getShardsString(), "json.facet",
"{ foo: { type:terms, refine:simple, limit:6, "+ over +" field:foo_s, facet:{ " + ALL_STATS_JSON +
" bar: { type:terms, refine:simple, limit:6, "+ over +" field:bar_s, facet:{"+ALL_STATS_JSON+"}}}}}"
) ).getResponse().get("facets")).get("foo").get("buckets");
assertEquals(6, foo_buckets.size());
for (int i = 0; i < 5; i++) {
NamedList bucket = foo_buckets.get(i);
assertTrue(bucket.toString(), bucket.get("val").toString().startsWith("aaa"));
assertEquals(bucket.toString(), 300L, bucket.get("count"));
// ...but it should have correctly asked shard2 to refine bbb0
NamedList bucket = foo_buckets.get(5);
assertTrue(bucket.toString(), bucket.get("val").equals("bbb0"));
assertEquals(bucket.toString(), 101L, bucket.get("count"));
// ...and the status under bbb0 should be correct to include the refinement
assertEquals(ALL_STATS.size() + 3, bucket.size()); // val,count,facet
assertEquals(-2L, bucket.get("min")); // this min only exists on shard2
assertEquals(1L, bucket.get("max"));
assertEquals(101L, bucket.get("countvals"));
assertEquals(0L, bucket.get("missing"));
assertEquals(48.0D, bucket.get("sum"));
assertEquals(1.0D, bucket.get("percentile"));
assertEquals(0.475247524752475D, (double) bucket.get("avg"), 0.1E-7);
assertEquals(54.0D, (double) bucket.get("sumsq"), 0.1E-7);
assertEquals(0.55846323792D, (double) bucket.get("stddev"), 0.1E-7);
assertEquals(0.3118811881D, (double) bucket.get("variance"), 0.1E-7);
assertEquals(3L, bucket.get("unique"));
assertEquals(3L, bucket.get("hll"));
// with a limit==6, we have to "overrequest >= 20" in order to ensure that 'tail' is included in the top 6
// this is because of how the "simple" refinement process works: the "top buckets" are determined based
// on the info available in the first pass request.
// Even though 'tail' is returned in the top6 for shard2, the cumulative total for 'bbb0' from shard0 and shard1 is
// high enough that the simple facet refinement ignores 'tail' because it assumes 'bbb0's final total will be greater.
// Meanwhile, for the sub-facet on 'bar', a limit==6 means we should correctly find 'tailB' as the top sub-term of 'tail',
// regardless of how much overrequest is used (or even if we don't have any refinement) since it's always in the top6...
for (String bar_opts : Arrays.asList( "refine:none,",
"refine:none, overrequest:0,",
"refine:simple, overrequest:0," )) {
List<NamedList> buckets = (List<NamedList>)
queryServer( params( "q", "*:*", "shards", getShardsString(), "json.facet",
"{ foo: { type:terms, limit:6, overrequest:20, refine:simple, field:foo_s, facet:{ " +
" bar: { type:terms, limit:6, " + bar_opts + " field:bar_s }}}}"
) ).getResponse().get("facets")).get("foo").get("buckets");
assertEquals(6, buckets.size());
for (int i = 0; i < 5; i++) {
NamedList bucket = buckets.get(i);
assertTrue(bucket.toString(), bucket.get("val").toString().startsWith("aaa"));
assertEquals(bucket.toString(), 300L, bucket.get("count"));
NamedList bucket = buckets.get(5);
assertEquals(bucket.toString(), "tail", bucket.get("val"));
assertEquals(bucket.toString(), 135L, bucket.get("count"));
// check the sub buckets
buckets = ((NamedList<NamedList<List<NamedList>>>) bucket).get("bar").get("buckets");
assertEquals(6, buckets.size());
bucket = buckets.get(0);
assertEquals(bucket.toString(), "tailB", bucket.get("val"));
assertEquals(bucket.toString(), 17L, bucket.get("count"));
for (int i = 1; i < 6; i++) { // ccc(0-4)
bucket = buckets.get(i);
assertTrue(bucket.toString(), bucket.get("val").toString().startsWith("ccc"));
assertEquals(bucket.toString(), 14L, bucket.get("count"));
// if we lower the limit on the sub-bucket to '5', overrequesting of at least 1 should still ensure
// that we get the correct top5 including "tailB" -- even w/o refinement
for (String bar_opts : Arrays.asList( "refine:none,",
"refine:none, overrequest:1,",
"refine:simple, overrequest:1," )) {
List<NamedList> buckets = (List<NamedList>)
queryServer( params( "q", "*:*", "shards", getShardsString(), "json.facet",
"{ foo: { type:terms, limit:6, overrequest:20, refine:simple, field:foo_s, facet:{ " +
" bar: { type:terms, limit:5, " + bar_opts + " field:bar_s }}}}"
) ).getResponse().get("facets")).get("foo").get("buckets");
assertEquals(6, buckets.size());
for (int i = 0; i < 5; i++) {
NamedList bucket = buckets.get(i);
assertTrue(bucket.toString(), bucket.get("val").toString().startsWith("aaa"));
assertEquals(bucket.toString(), 300L, bucket.get("count"));
NamedList bucket = buckets.get(5);
assertEquals(bucket.toString(), "tail", bucket.get("val"));
assertEquals(bucket.toString(), 135L, bucket.get("count"));
// check the sub buckets
buckets = ((NamedList<NamedList<List<NamedList>>>) bucket).get("bar").get("buckets");
assertEquals(5, buckets.size());
bucket = buckets.get(0);
assertEquals(bucket.toString(), "tailB", bucket.get("val"));
assertEquals(bucket.toString(), 17L, bucket.get("count"));
for (int i = 1; i < 5; i++) { // ccc(0-3)
bucket = buckets.get(i);
assertTrue(bucket.toString(), bucket.get("val").toString().startsWith("ccc"));
assertEquals(bucket.toString(), 14L, bucket.get("count"));
// however: with a lower sub-facet limit==5, and overrequesting disabled,
// we're going to miss out on tailB even if we have refinement
for (String bar_opts : Arrays.asList( "refine:none, overrequest:0,",
"refine:simple, overrequest:0," )) {
List<NamedList> buckets = (List<NamedList>)
queryServer( params( "q", "*:*", "shards", getShardsString(), "json.facet",
"{ foo: { type:terms, limit:6, overrequest:20, refine:simple, field:foo_s, facet:{ " +
" bar: { type:terms, limit:5, " + bar_opts + " field:bar_s }}}}"
) ).getResponse().get("facets")).get("foo").get("buckets");
assertEquals(6, buckets.size());
for (int i = 0; i < 5; i++) {
NamedList bucket = buckets.get(i);
assertTrue(bucket.toString(), bucket.get("val").toString().startsWith("aaa"));
assertEquals(bucket.toString(), 300L, bucket.get("count"));
NamedList bucket = buckets.get(5);
assertEquals(bucket.toString(), "tail", bucket.get("val"));
assertEquals(bucket.toString(), 135L, bucket.get("count"));
// check the sub buckets
buckets = ((NamedList<NamedList<List<NamedList>>>) bucket).get("bar").get("buckets");
assertEquals(5, buckets.size());
for (int i = 0; i < 5; i++) { // ccc(0-4)
bucket = buckets.get(i);
assertTrue(bucket.toString(), bucket.get("val").toString().startsWith("ccc"));
assertEquals(bucket.toString(), 14L, bucket.get("count"));
private void checkSubFacetStats() throws Exception {
// Deep checking of some Facet stats
// the assertions only care about the first 5 results of each facet, but to get the long tail more are needed
// from the sub-shards. results should be the same regardless of: "high limit" vs "low limit + high overrequest"
checkSubFacetStats("refine:simple, limit: 100,");
checkSubFacetStats("refine:simple, overrequest: 100,");
// and the results shouldn't change if we explicitly disable refinement
checkSubFacetStats("refine:none, limit: 100,");
checkSubFacetStats("refine:none, overrequest: 100,");
private void checkSubFacetStats(String extraJson) throws Exception {
String commonJson = "type: terms, " + extraJson;
NamedList<NamedList> all_facets = (NamedList) queryServer
( params( "q", "*:*", "shards", getShardsString(), "rows" , "0", "json.facet",
"{ foo : { " + commonJson + " field: foo_s, facet: { " +
ALL_STATS_JSON + " bar: { " + commonJson + " field: bar_s, facet: { " + ALL_STATS_JSON +
// under bar, in addition to "ALL" simple stats, we also ask for skg...
", skg : 'relatedness($skg_fore,$skg_back)' } } } } }",
"skg_fore", STAT_FIELD+":[0 TO 40]", "skg_back", STAT_FIELD+":[-10000 TO 10000]"
) ).getResponse().get("facets");
List<NamedList> foo_buckets = (List) ((NamedList)all_facets.get("foo")).get("buckets");
NamedList aaa0_Bucket = foo_buckets.get(0);
assertEquals(ALL_STATS.size() + 3, aaa0_Bucket.size()); // val,count,facet
assertEquals("aaa0", aaa0_Bucket.get("val"));
assertEquals(300L, aaa0_Bucket.get("count"));
assertEquals(-99L, aaa0_Bucket.get("min"));
assertEquals(693L, aaa0_Bucket.get("max"));
assertEquals(300L, aaa0_Bucket.get("countvals"));
assertEquals(0L, aaa0_Bucket.get("missing"));
assertEquals(34650.0D, aaa0_Bucket.get("sum"));
assertEquals(483.70000000000016D, (double)aaa0_Bucket.get("percentile"), 0.1E-7);
assertEquals(115.5D, (double) aaa0_Bucket.get("avg"), 0.1E-7);
assertEquals(1.674585E7D, (double) aaa0_Bucket.get("sumsq"), 0.1E-7);
assertEquals(206.4493184076D, (double) aaa0_Bucket.get("stddev"), 0.1E-7);
assertEquals(42621.32107023412D, (double) aaa0_Bucket.get("variance"), 0.1E-7);
assertEquals(284L, aaa0_Bucket.get("unique"));
assertEquals(284L, aaa0_Bucket.get("hll"));
NamedList tail_Bucket = foo_buckets.get(5);
assertEquals(ALL_STATS.size() + 3, tail_Bucket.size()); // val,count,facet
assertEquals("tail", tail_Bucket.get("val"));
assertEquals(135L, tail_Bucket.get("count"));
assertEquals(0L, tail_Bucket.get("min"));
assertEquals(44L, tail_Bucket.get("max"));
assertEquals(90L, tail_Bucket.get("countvals"));
assertEquals(40.0D, tail_Bucket.get("percentile"));
assertEquals(45L, tail_Bucket.get("missing"));
assertEquals(1980.0D, tail_Bucket.get("sum"));
assertEquals(22.0D, (double) tail_Bucket.get("avg"), 0.1E-7);
assertEquals(58740.0D, (double) tail_Bucket.get("sumsq"), 0.1E-7);
assertEquals(13.0599310011D, (double) tail_Bucket.get("stddev"), 0.1E-7);
assertEquals(170.5617977535D, (double) tail_Bucket.get("variance"), 0.1E-7);
assertEquals(45L, tail_Bucket.get("unique"));
assertEquals(45L, tail_Bucket.get("hll"));
List<NamedList> tail_bar_buckets = (List) ((NamedList)tail_Bucket.get("bar")).get("buckets");
NamedList tailB_Bucket = tail_bar_buckets.get(0);
assertEquals(ALL_STATS.size() + 3, tailB_Bucket.size()); // val,count,skg ... NO SUB FACETS
assertEquals("tailB", tailB_Bucket.get("val"));
assertEquals(17L, tailB_Bucket.get("count"));
assertEquals(35L, tailB_Bucket.get("min"));
assertEquals(40L, tailB_Bucket.get("max"));
assertEquals(12L, tailB_Bucket.get("countvals"));
assertEquals(39.9D, tailB_Bucket.get("percentile"));
assertEquals(5L, tailB_Bucket.get("missing"));
assertEquals(450.0D, tailB_Bucket.get("sum"));
assertEquals(37.5D, (double) tailB_Bucket.get("avg"), 0.1E-7);
assertEquals(16910.0D, (double) tailB_Bucket.get("sumsq"), 0.1E-7);
assertEquals(1.78376517D, (double) tailB_Bucket.get("stddev"), 0.1E-7);
assertEquals(3.1818181817D, (double) tailB_Bucket.get("variance"), 0.1E-7);
assertEquals(6L, tailB_Bucket.get("unique"));
assertEquals(6L, tailB_Bucket.get("hll"));
// check the SKG stats on our tailB bucket
NamedList tailB_skg = (NamedList) tailB_Bucket.get("skg");
3, tailB_skg.size());
assertEquals(0.19990D, tailB_skg.get("relatedness"));
assertEquals(0.00334D, tailB_skg.get("foreground_popularity"));
assertEquals(0.00334D, tailB_skg.get("background_popularity"));
//assertEquals(12L, tailB_skg.get("foreground_count"));
//assertEquals(82L, tailB_skg.get("foreground_size"));
//assertEquals(12L, tailB_skg.get("background_count"));
//assertEquals(3591L, tailB_skg.get("background_size"));