/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.stream.Collectors;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.facet.DrillSideways.DrillSidewaysResult;
import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.CollectorManager;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.ConstantScoreWeight;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryCachingPolicy;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.Scorable;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TotalHits;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.Weight;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.InfoStream;
import org.apache.lucene.util.NamedThreadFactory;
import org.apache.lucene.util.TestUtil;
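
/**
 * Tests {@link DrillSideways}: drill-down + drill-sideways faceting over both taxonomy and
 * SortedSetDocValues facet fields, including randomized verification of hits, scores and
 * facet counts against a slow linear-scan implementation.
 */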
public class TestDrillSideways extends FacetTestCase {
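// Protected factories so subclasses can exercise alternative
// DrillSideways implementations: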
protected DrillSideways getNewDrillSideways(IndexSearcher searcher, FacetsConfig config,
SortedSetDocValuesReaderState state) {
return new DrillSideways(searcher, config, state);
}
protected DrillSideways getNewDrillSideways(IndexSearcher searcher, FacetsConfig config, TaxonomyReader taxoReader) {
return new DrillSideways(searcher, config, taxoReader);
}
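// Variant that forces all subdocs to be scored at once (see
// DrillSideways#scoreSubDocsAtOnce):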
protected DrillSideways getNewDrillSidewaysScoreSubdocsAtOnce(IndexSearcher searcher, FacetsConfig config,
TaxonomyReader taxoReader) {
return new DrillSideways(searcher, config, taxoReader) {
@Override
protected boolean scoreSubDocsAtOnce() {
return true;
}
};
}
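// Variant that overrides buildFacetsResult to combine drill-down counts
// with per-dimension drill-sideways counts in a MultiFacets: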
protected DrillSideways getNewDrillSidewaysBuildFacetsResult(IndexSearcher searcher, FacetsConfig config,
TaxonomyReader taxoReader) {
return new DrillSideways(searcher, config, taxoReader) {
@Override
protected Facets buildFacetsResult(FacetsCollector drillDowns, FacetsCollector[] drillSideways,
String[] drillSidewaysDims) throws IOException {
Map<String, Facets> drillSidewaysFacets = new HashMap<>();
Facets drillDownFacets = getTaxonomyFacetCounts(taxoReader, config, drillDowns);
if (drillSideways != null) {
for (int i = 0; i < drillSideways.length; i++) {
drillSidewaysFacets.put(drillSidewaysDims[i], getTaxonomyFacetCounts(taxoReader, config, drillSideways[i]));
}
}
if (drillSidewaysFacets.isEmpty()) {
return drillDownFacets;
} else {
return new MultiFacets(drillSidewaysFacets, drillDownFacets);
}
}
};
}
private IndexSearcher getNewSearcher(IndexReader reader) {
// Do not wrap with an asserting searcher, since DrillSidewaysQuery doesn't
// implement all the required components like Weight#scorer.
return newSearcher(reader, true, false, random().nextBoolean());
}
// See LUCENE-10060:
public void testNoCaching() throws Exception {
Directory dir = newDirectory();
Directory taxoDir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
DirectoryTaxonomyWriter taxoWriter =
new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE);
FacetsConfig config = new FacetsConfig();
// Setup some basic test documents:
Document doc = new Document();
doc.add(new StringField("id", "1", Field.Store.NO));
doc.add(new SortedDocValuesField("id", new BytesRef("1")));
doc.add(new FacetField("Color", "Red"));
doc.add(new FacetField("Size", "Small"));
writer.addDocument(config.build(taxoWriter, doc));
doc = new Document();
doc.add(new StringField("id", "2", Field.Store.NO));
doc.add(new SortedDocValuesField("id", new BytesRef("2")));
doc.add(new FacetField("Color", "Green"));
doc.add(new FacetField("Size", "Small"));
writer.addDocument(config.build(taxoWriter, doc));
doc = new Document();
doc.add(new StringField("id", "3", Field.Store.NO));
doc.add(new SortedDocValuesField("id", new BytesRef("3")));
doc.add(new FacetField("Color", "Blue"));
doc.add(new FacetField("Size", "Small"));
writer.addDocument(config.build(taxoWriter, doc));
doc = new Document();
doc.add(new StringField("id", "4", Field.Store.NO));
doc.add(new SortedDocValuesField("id", new BytesRef("4")));
doc.add(new FacetField("Color", "Red"));
doc.add(new FacetField("Size", "Medium"));
writer.addDocument(config.build(taxoWriter, doc));
doc = new Document();
doc.add(new StringField("id", "5", Field.Store.NO));
doc.add(new SortedDocValuesField("id", new BytesRef("5")));
doc.add(new FacetField("Color", "Blue"));
doc.add(new FacetField("Size", "Medium"));
writer.addDocument(config.build(taxoWriter, doc));
doc = new Document();
doc.add(new StringField("id", "6", Field.Store.NO));
doc.add(new SortedDocValuesField("id", new BytesRef("6")));
doc.add(new FacetField("Color", "Blue"));
doc.add(new FacetField("Size", "Large"));
writer.addDocument(config.build(taxoWriter, doc));
// Delete a couple documents. We'll want to make sure they don't get counted in binning:
writer.deleteDocuments(new Term("id", "4"));
writer.deleteDocuments(new Term("id", "6"));
// Simple DDQ that just filters all results by Color == Blue:
DrillDownQuery ddq = new DrillDownQuery(config);
ddq.add("Color", "Blue");
// Setup an IndexSearcher that will try to cache queries aggressively:
IndexSearcher searcher = getNewSearcher(writer.getReader());
searcher.setQueryCachingPolicy(
new QueryCachingPolicy() {
@Override
public void onUse(Query query) {}
@Override
public boolean shouldCache(Query query) {
return true;
}
});
// Setup a DS instance for searching:
TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
DrillSideways ds = getNewDrillSideways(searcher, config, taxoReader);
// We'll use a CollectorManager to trigger the trickiest caching behavior:
SimpleCollectorManager collectorManager =
new SimpleCollectorManager(
10, Comparator.comparing(cr -> cr.id), ScoreMode.COMPLETE_NO_SCORES);
// Make sure our CM produces Collectors that _do not_ need scores to ensure IndexSearcher tries
// to cache:
assertFalse(collectorManager.newCollector().scoreMode().needsScores());
// If we incorrectly cache here, the "sideways" FacetsCollectors will get populated with counts
// for the deleted
// docs. Make sure they don't:
DrillSideways.ConcurrentDrillSidewaysResult<List<DocAndScore>> concurrentResult =
ds.search(ddq, collectorManager);
assertEquals(2, concurrentResult.collectorResult.size());
assertEquals(
"dim=Color path=[] value=4 childCount=3\n Blue (2)\n Red (1)\n Green (1)\n",
concurrentResult.facets.getTopChildren(10, "Color").toString());
assertEquals(
"dim=Size path=[] value=2 childCount=2\n Small (1)\n Medium (1)\n",
concurrentResult.facets.getTopChildren(10, "Size").toString());
// Now do the same thing but use a Collector directly:
SimpleCollector collector = new SimpleCollector(ScoreMode.COMPLETE_NO_SCORES);
// Make sure our Collector _does not_ need scores to ensure IndexSearcher tries to cache:
assertFalse(collector.scoreMode().needsScores());
// If we incorrectly cache here, the "sideways" FacetsCollectors will get populated with counts
// for the deleted
// docs. Make sure they don't:
DrillSidewaysResult result = ds.search(ddq, collector);
assertEquals(2, collector.hits.size());
assertEquals(
"dim=Color path=[] value=4 childCount=3\n Blue (2)\n Red (1)\n Green (1)\n",
result.facets.getTopChildren(10, "Color").toString());
assertEquals(
"dim=Size path=[] value=2 childCount=2\n Small (1)\n Medium (1)\n",
result.facets.getTopChildren(10, "Size").toString());
writer.close();
IOUtils.close(searcher.getIndexReader(), taxoReader, taxoWriter, dir, taxoDir);
}
public void testBasic() throws Exception {
Directory dir = newDirectory();
Directory taxoDir = newDirectory();
// Writes facet ords to a separate directory from the
// main index:
DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE);
FacetsConfig config = new FacetsConfig();
config.setHierarchical("Publish Date", true);
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
Document doc = new Document();
doc.add(new FacetField("Author", "Bob"));
doc.add(new FacetField("Publish Date", "2010", "10", "15"));
writer.addDocument(config.build(taxoWriter, doc));
doc = new Document();
doc.add(new FacetField("Author", "Lisa"));
doc.add(new FacetField("Publish Date", "2010", "10", "20"));
writer.addDocument(config.build(taxoWriter, doc));
doc = new Document();
doc.add(new FacetField("Author", "Lisa"));
doc.add(new FacetField("Publish Date", "2012", "1", "1"));
writer.addDocument(config.build(taxoWriter, doc));
doc = new Document();
doc.add(new FacetField("Author", "Susan"));
doc.add(new FacetField("Publish Date", "2012", "1", "7"));
writer.addDocument(config.build(taxoWriter, doc));
doc = new Document();
doc.add(new FacetField("Author", "Frank"));
doc.add(new FacetField("Publish Date", "1999", "5", "5"));
writer.addDocument(config.build(taxoWriter, doc));
// NRT open
IndexSearcher searcher = getNewSearcher(writer.getReader());
//System.out.println("searcher=" + searcher);
// NRT open
TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
DrillSideways ds = getNewDrillSideways(searcher, config, taxoReader);
// Simple case: drill-down on a single field; in this
// case the drill-sideways + drill-down counts ==
// drill-down of just the query:
DrillDownQuery ddq = new DrillDownQuery(config);
ddq.add("Author", "Lisa");
DrillSidewaysResult r = ds.search(null, ddq, 10);
assertEquals(2, r.hits.totalHits.value);
// Publish Date is only drill-down, and Lisa published
// one in 2012 and one in 2010:
assertEquals("dim=Publish Date path=[] value=2 childCount=2\n 2010 (1)\n 2012 (1)\n",
r.facets.getTopChildren(10, "Publish Date").toString());
// Author is drill-sideways + drill-down: Lisa
// (drill-down) published twice, and Frank/Susan/Bob
// published once:
assertEquals("dim=Author path=[] value=5 childCount=4\n Lisa (2)\n Bob (1)\n Susan (1)\n Frank (1)\n",
r.facets.getTopChildren(10, "Author").toString());
// Same simple case, but no baseQuery (pure browse):
// drill-down on a single field; in this case the
// drill-sideways + drill-down counts == drill-down of
// just the query:
ddq = new DrillDownQuery(config);
ddq.add("Author", "Lisa");
r = ds.search(null, ddq, 10);
assertEquals(2, r.hits.totalHits.value);
// Publish Date is only drill-down, and Lisa published
// one in 2012 and one in 2010:
assertEquals("dim=Publish Date path=[] value=2 childCount=2\n 2010 (1)\n 2012 (1)\n",
r.facets.getTopChildren(10, "Publish Date").toString());
// Author is drill-sideways + drill-down: Lisa
// (drill-down) published twice, and Frank/Susan/Bob
// published once:
assertEquals("dim=Author path=[] value=5 childCount=4\n Lisa (2)\n Bob (1)\n Susan (1)\n Frank (1)\n",
r.facets.getTopChildren(10, "Author").toString());
// Another simple case: drill-down on single fields
// but OR of two values
ddq = new DrillDownQuery(config);
ddq.add("Author", "Lisa");
ddq.add("Author", "Bob");
r = ds.search(null, ddq, 10);
assertEquals(3, r.hits.totalHits.value);
// Publish Date is only drill-down: Lisa and Bob
// (drill-down) published twice in 2010 and once in 2012:
assertEquals("dim=Publish Date path=[] value=3 childCount=2\n 2010 (2)\n 2012 (1)\n",
r.facets.getTopChildren(10, "Publish Date").toString());
// Author is drill-sideways + drill-down: Lisa
// (drill-down) published twice, and Frank/Susan/Bob
// published once:
assertEquals("dim=Author path=[] value=5 childCount=4\n Lisa (2)\n Bob (1)\n Susan (1)\n Frank (1)\n",
r.facets.getTopChildren(10, "Author").toString());
assertTrue(r.facets instanceof MultiFacets);
List<FacetResult> allResults = r.facets.getAllDims(10);
assertEquals(2, allResults.size());
assertEquals("dim=Author path=[] value=5 childCount=4\n Lisa (2)\n Bob (1)\n Susan (1)\n Frank (1)\n",
allResults.get(0).toString());
assertEquals("dim=Publish Date path=[] value=3 childCount=2\n 2010 (2)\n 2012 (1)\n",
allResults.get(1).toString());
// More interesting case: drill-down on two fields
ddq = new DrillDownQuery(config);
ddq.add("Author", "Lisa");
ddq.add("Publish Date", "2010");
r = ds.search(null, ddq, 10);
assertEquals(1, r.hits.totalHits.value);
// Publish Date is drill-sideways + drill-down: Lisa
// (drill-down) published once in 2010 and once in 2012:
assertEquals("dim=Publish Date path=[] value=2 childCount=2\n 2010 (1)\n 2012 (1)\n",
r.facets.getTopChildren(10, "Publish Date").toString());
// Author is drill-sideways + drill-down:
// only Lisa & Bob published (once each) in 2010:
assertEquals("dim=Author path=[] value=2 childCount=2\n Bob (1)\n Lisa (1)\n",
r.facets.getTopChildren(10, "Author").toString());
// Even more interesting case: drill down on two fields,
// but one of them is OR
ddq = new DrillDownQuery(config);
// Drill down on Lisa or Bob:
ddq.add("Author", "Lisa");
ddq.add("Publish Date", "2010");
ddq.add("Author", "Bob");
r = ds.search(null, ddq, 10);
assertEquals(2, r.hits.totalHits.value);
// Publish Date is both drill-sideways + drill-down:
// Lisa or Bob published twice in 2010 and once in 2012:
assertEquals("dim=Publish Date path=[] value=3 childCount=2\n 2010 (2)\n 2012 (1)\n",
r.facets.getTopChildren(10, "Publish Date").toString());
// Author is drill-sideways + drill-down:
// only Lisa & Bob published (once each) in 2010:
assertEquals("dim=Author path=[] value=2 childCount=2\n Bob (1)\n Lisa (1)\n",
r.facets.getTopChildren(10, "Author").toString());
// Test drilling down on invalid field:
ddq = new DrillDownQuery(config);
ddq.add("Foobar", "Baz");
r = ds.search(null, ddq, 10);
assertEquals(0, r.hits.totalHits.value);
assertNull(r.facets.getTopChildren(10, "Publish Date"));
assertNull(r.facets.getTopChildren(10, "Foobar"));
// Test drilling down on valid term or'd with invalid term:
ddq = new DrillDownQuery(config);
ddq.add("Author", "Lisa");
ddq.add("Author", "Tom");
r = ds.search(null, ddq, 10);
assertEquals(2, r.hits.totalHits.value);
// Publish Date is only drill-down, and Lisa published
// one in 2012 and one in 2010:
assertEquals("dim=Publish Date path=[] value=2 childCount=2\n 2010 (1)\n 2012 (1)\n",
r.facets.getTopChildren(10, "Publish Date").toString());
// Author is drill-sideways + drill-down: Lisa
// (drill-down) published twice, and Frank/Susan/Bob
// published once:
assertEquals("dim=Author path=[] value=5 childCount=4\n Lisa (2)\n Bob (1)\n Susan (1)\n Frank (1)\n",
r.facets.getTopChildren(10, "Author").toString());
// LUCENE-4915: test drilling down on a dimension but
// NOT facet counting it:
ddq = new DrillDownQuery(config);
ddq.add("Author", "Lisa");
ddq.add("Author", "Tom");
r = ds.search(null, ddq, 10);
assertEquals(2, r.hits.totalHits.value);
// Publish Date is only drill-down, and Lisa published
// one in 2012 and one in 2010:
assertEquals("dim=Publish Date path=[] value=2 childCount=2\n 2010 (1)\n 2012 (1)\n",
r.facets.getTopChildren(10, "Publish Date").toString());
// Test main query gets null scorer:
ddq = new DrillDownQuery(config, new TermQuery(new Term("foobar", "baz")));
ddq.add("Author", "Lisa");
r = ds.search(null, ddq, 10);
assertEquals(0, r.hits.totalHits.value);
assertNull(r.facets.getTopChildren(10, "Publish Date"));
assertNull(r.facets.getTopChildren(10, "Author"));
writer.close();
IOUtils.close(searcher.getIndexReader(), taxoReader, taxoWriter, dir, taxoDir);
}
public void testBasicWithCollectorManager() throws Exception {
Directory dir = newDirectory();
Directory taxoDir = newDirectory();
// Writes facet ords to a separate directory from the
// main index:
DirectoryTaxonomyWriter taxoWriter =
new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE);
FacetsConfig config = new FacetsConfig();
config.setHierarchical("Publish Date", true);
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
Document doc = new Document();
doc.add(newStringField("content", "foo", Field.Store.NO));
doc.add(new FacetField("Author", "Bob"));
doc.add(new FacetField("Publish Date", "2010", "10", "15"));
writer.addDocument(config.build(taxoWriter, doc));
doc = new Document();
doc.add(newStringField("content", "foo", Field.Store.NO));
doc.add(new FacetField("Author", "Lisa"));
doc.add(new FacetField("Publish Date", "2010", "10", "20"));
writer.addDocument(config.build(taxoWriter, doc));
doc = new Document();
doc.add(newStringField("content", "foo", Field.Store.NO));
doc.add(new FacetField("Author", "Lisa"));
doc.add(new FacetField("Publish Date", "2012", "1", "1"));
writer.addDocument(config.build(taxoWriter, doc));
doc = new Document();
doc.add(newStringField("content", "bar", Field.Store.NO));
doc.add(new FacetField("Author", "Susan"));
doc.add(new FacetField("Publish Date", "2012", "1", "7"));
writer.addDocument(config.build(taxoWriter, doc));
doc = new Document();
doc.add(newStringField("content", "bar", Field.Store.NO));
doc.add(new FacetField("Author", "Frank"));
doc.add(new FacetField("Publish Date", "1999", "5", "5"));
writer.addDocument(config.build(taxoWriter, doc));
// NRT open
IndexSearcher searcher = getNewSearcher(writer.getReader());
// NRT open
TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
DrillSideways ds = getNewDrillSideways(searcher, config, taxoReader);
Comparator<CollectedResult> comparator =
(a, b) -> {
int cmp = Float.compare(b.docAndScore.score, a.docAndScore.score); // highest-to-lowest
if (cmp == 0) {
cmp = Integer.compare(a.docAndScore.doc, b.docAndScore.doc); // tie-break
}
return cmp;
};
SimpleCollectorManager manager = new SimpleCollectorManager(10, comparator);
// Simple case: drill-down on a single field; in this
// case the drill-sideways + drill-down counts ==
// drill-down of just the query:
DrillDownQuery ddq = new DrillDownQuery(config);
ddq.add("Author", "Lisa");
DrillSideways.ConcurrentDrillSidewaysResult<List<DocAndScore>> r = ds.search(ddq, manager);
assertEquals(2, r.collectorResult.size());
// Publish Date is only drill-down, and Lisa published
// one in 2012 and one in 2010:
assertEquals(
"dim=Publish Date path=[] value=2 childCount=2\n 2010 (1)\n 2012 (1)\n",
r.facets.getTopChildren(10, "Publish Date").toString());
// Author is drill-sideways + drill-down: Lisa
// (drill-down) published twice, and Frank/Susan/Bob
// published once:
assertEquals(
"dim=Author path=[] value=5 childCount=4\n Lisa (2)\n Bob (1)\n Susan (1)\n Frank (1)\n",
r.facets.getTopChildren(10, "Author").toString());
// Same simple case, but no baseQuery (pure browse):
// drill-down on a single field; in this case the
// drill-sideways + drill-down counts == drill-down of
// just the query:
ddq = new DrillDownQuery(config);
ddq.add("Author", "Lisa");
r = ds.search(ddq, manager);
assertEquals(2, r.collectorResult.size());
// Publish Date is only drill-down, and Lisa published
// one in 2012 and one in 2010:
assertEquals(
"dim=Publish Date path=[] value=2 childCount=2\n 2010 (1)\n 2012 (1)\n",
r.facets.getTopChildren(10, "Publish Date").toString());
// Author is drill-sideways + drill-down: Lisa
// (drill-down) published twice, and Frank/Susan/Bob
// published once:
assertEquals(
"dim=Author path=[] value=5 childCount=4\n Lisa (2)\n Bob (1)\n Susan (1)\n Frank (1)\n",
r.facets.getTopChildren(10, "Author").toString());
// Another simple case: drill-down on single fields
// but OR of two values
ddq = new DrillDownQuery(config);
ddq.add("Author", "Lisa");
ddq.add("Author", "Bob");
r = ds.search(ddq, manager);
assertEquals(3, r.collectorResult.size());
// Publish Date is only drill-down: Lisa and Bob
// (drill-down) published twice in 2010 and once in 2012:
assertEquals(
"dim=Publish Date path=[] value=3 childCount=2\n 2010 (2)\n 2012 (1)\n",
r.facets.getTopChildren(10, "Publish Date").toString());
// Author is drill-sideways + drill-down: Lisa
// (drill-down) published twice, and Frank/Susan/Bob
// published once:
assertEquals(
"dim=Author path=[] value=5 childCount=4\n Lisa (2)\n Bob (1)\n Susan (1)\n Frank (1)\n",
r.facets.getTopChildren(10, "Author").toString());
assertTrue(r.facets instanceof MultiFacets);
List<FacetResult> allResults = r.facets.getAllDims(10);
assertEquals(2, allResults.size());
assertEquals(
"dim=Author path=[] value=5 childCount=4\n Lisa (2)\n Bob (1)\n Susan (1)\n Frank (1)\n",
allResults.get(0).toString());
assertEquals(
"dim=Publish Date path=[] value=3 childCount=2\n 2010 (2)\n 2012 (1)\n",
allResults.get(1).toString());
// More interesting case: drill-down on two fields
ddq = new DrillDownQuery(config);
ddq.add("Author", "Lisa");
ddq.add("Publish Date", "2010");
r = ds.search(ddq, manager);
assertEquals(1, r.collectorResult.size());
// Publish Date is drill-sideways + drill-down: Lisa
// (drill-down) published once in 2010 and once in 2012:
assertEquals(
"dim=Publish Date path=[] value=2 childCount=2\n 2010 (1)\n 2012 (1)\n",
r.facets.getTopChildren(10, "Publish Date").toString());
// Author is drill-sideways + drill-down:
// only Lisa & Bob published (once each) in 2010:
assertEquals(
"dim=Author path=[] value=2 childCount=2\n Bob (1)\n Lisa (1)\n",
r.facets.getTopChildren(10, "Author").toString());
// Even more interesting case: drill down on two fields,
// but one of them is OR
ddq = new DrillDownQuery(config);
// Drill down on Lisa or Bob:
ddq.add("Author", "Lisa");
ddq.add("Publish Date", "2010");
ddq.add("Author", "Bob");
r = ds.search(ddq, manager);
assertEquals(2, r.collectorResult.size());
// Publish Date is both drill-sideways + drill-down:
// Lisa or Bob published twice in 2010 and once in 2012:
assertEquals(
"dim=Publish Date path=[] value=3 childCount=2\n 2010 (2)\n 2012 (1)\n",
r.facets.getTopChildren(10, "Publish Date").toString());
// Author is drill-sideways + drill-down:
// only Lisa & Bob published (once each) in 2010:
assertEquals(
"dim=Author path=[] value=2 childCount=2\n Bob (1)\n Lisa (1)\n",
r.facets.getTopChildren(10, "Author").toString());
// Test drilling down on invalid field:
ddq = new DrillDownQuery(config);
ddq.add("Foobar", "Baz");
r = ds.search(ddq, manager);
assertEquals(0, r.collectorResult.size());
assertNull(r.facets.getTopChildren(10, "Publish Date"));
assertNull(r.facets.getTopChildren(10, "Foobar"));
// Test drilling down on valid term or'd with invalid term:
ddq = new DrillDownQuery(config);
ddq.add("Author", "Lisa");
ddq.add("Author", "Tom");
r = ds.search(ddq, manager);
assertEquals(2, r.collectorResult.size());
// Publish Date is only drill-down, and Lisa published
// one in 2012 and one in 2010:
assertEquals(
"dim=Publish Date path=[] value=2 childCount=2\n 2010 (1)\n 2012 (1)\n",
r.facets.getTopChildren(10, "Publish Date").toString());
// Author is drill-sideways + drill-down: Lisa
// (drill-down) published twice, and Frank/Susan/Bob
// published once:
assertEquals(
"dim=Author path=[] value=5 childCount=4\n Lisa (2)\n Bob (1)\n Susan (1)\n Frank (1)\n",
r.facets.getTopChildren(10, "Author").toString());
// LUCENE-4915: test drilling down on a dimension but
// NOT facet counting it:
ddq = new DrillDownQuery(config);
ddq.add("Author", "Lisa");
ddq.add("Author", "Tom");
r = ds.search(ddq, manager);
assertEquals(2, r.collectorResult.size());
// Publish Date is only drill-down, and Lisa published
// one in 2012 and one in 2010:
assertEquals(
"dim=Publish Date path=[] value=2 childCount=2\n 2010 (1)\n 2012 (1)\n",
r.facets.getTopChildren(10, "Publish Date").toString());
// Test main query gets null scorer:
ddq = new DrillDownQuery(config, new TermQuery(new Term("foobar", "baz")));
ddq.add("Author", "Lisa");
r = ds.search(ddq, manager);
assertEquals(0, r.collectorResult.size());
assertNull(r.facets.getTopChildren(10, "Publish Date"));
assertNull(r.facets.getTopChildren(10, "Author"));
// Test no drill down dims:
ddq = new DrillDownQuery(config, new MatchAllDocsQuery());
r = ds.search(ddq, manager);
assertEquals(5, r.collectorResult.size());
assertEquals(
"dim=Publish Date path=[] value=5 childCount=3\n 2010 (2)\n 2012 (2)\n 1999 (1)\n",
r.facets.getTopChildren(10, "Publish Date").toString());
assertEquals(
"dim=Author path=[] value=5 childCount=4\n Lisa (2)\n Bob (1)\n Susan (1)\n Frank (1)\n",
r.facets.getTopChildren(10, "Author").toString());
// Test no drill down dims with null FacetsCollectorManager for the main query:
ddq = new DrillDownQuery(config, new MatchAllDocsQuery());
r =
new DrillSideways(searcher, config, taxoReader) {
@Override
protected FacetsCollectorManager createDrillDownFacetsCollectorManager() {
return null;
}
}.search(ddq, manager);
assertEquals(5, r.collectorResult.size());
// Expect null facets since we provided a null FacetsCollectorManager
assertNull(r.facets);
// Test the case where the base query rewrites itself. See LUCENE-9988:
Query baseQuery =
new TermQuery(new Term("content", "foo")) {
@Override
public Query rewrite(IndexReader reader) {
// return a new instance, forcing the DrillDownQuery to also rewrite itself, exposing
// the bug in LUCENE-9988:
return new TermQuery(getTerm());
}
};
ddq = new DrillDownQuery(config, baseQuery);
ddq.add("Author", "Lisa");
r = ds.search(ddq, manager);
assertEquals(2, r.collectorResult.size());
assertEquals(
"dim=Publish Date path=[] value=2 childCount=2\n 2010 (1)\n 2012 (1)\n",
r.facets.getTopChildren(10, "Publish Date").toString());
assertEquals(
"dim=Author path=[] value=3 childCount=2\n Lisa (2)\n Bob (1)\n",
r.facets.getTopChildren(10, "Author").toString());
writer.close();
IOUtils.close(searcher.getIndexReader(), taxoReader, taxoWriter, dir, taxoDir);
}
public void testSometimesInvalidDrillDown() throws Exception {
Directory dir = newDirectory();
Directory taxoDir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
// Writes facet ords to a separate directory from the
// main index:
DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE);
FacetsConfig config = new FacetsConfig();
config.setHierarchical("Publish Date", true);
Document doc = new Document();
doc.add(new FacetField("Author", "Bob"));
doc.add(new FacetField("Publish Date", "2010", "10", "15"));
writer.addDocument(config.build(taxoWriter, doc));
doc = new Document();
doc.add(new FacetField("Author", "Lisa"));
doc.add(new FacetField("Publish Date", "2010", "10", "20"));
writer.addDocument(config.build(taxoWriter, doc));
writer.commit();
// 2nd segment has no Author:
doc = new Document();
doc.add(new FacetField("Foobar", "Lisa"));
doc.add(new FacetField("Publish Date", "2012", "1", "1"));
writer.addDocument(config.build(taxoWriter, doc));
// NRT open
IndexSearcher searcher = getNewSearcher(writer.getReader());
//System.out.println("searcher=" + searcher);
// NRT open
TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
DrillDownQuery ddq = new DrillDownQuery(config);
ddq.add("Author", "Lisa");
DrillSidewaysResult r = getNewDrillSideways(searcher, config, taxoReader).search(null, ddq, 10);
assertEquals(1, r.hits.totalHits.value);
// Publish Date is only drill-down, and Lisa published
// one in 2012 and one in 2010:
assertEquals("dim=Publish Date path=[] value=1 childCount=1\n 2010 (1)\n",
r.facets.getTopChildren(10, "Publish Date").toString());
// Author is drill-sideways + drill-down: Lisa
// (drill-down) published once, and Bob
// published once:
assertEquals("dim=Author path=[] value=2 childCount=2\n Bob (1)\n Lisa (1)\n",
r.facets.getTopChildren(10, "Author").toString());
writer.close();
IOUtils.close(searcher.getIndexReader(), taxoReader, taxoWriter, dir, taxoDir);
}
public void testMultipleRequestsPerDim() throws Exception {
Directory dir = newDirectory();
Directory taxoDir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
// Writes facet ords to a separate directory from the
// main index:
DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE);
FacetsConfig config = new FacetsConfig();
config.setHierarchical("dim", true);
Document doc = new Document();
doc.add(new FacetField("dim", "a", "x"));
writer.addDocument(config.build(taxoWriter, doc));
doc = new Document();
doc.add(new FacetField("dim", "a", "y"));
writer.addDocument(config.build(taxoWriter, doc));
doc = new Document();
doc.add(new FacetField("dim", "a", "z"));
writer.addDocument(config.build(taxoWriter, doc));
doc = new Document();
doc.add(new FacetField("dim", "b"));
writer.addDocument(config.build(taxoWriter, doc));
doc = new Document();
doc.add(new FacetField("dim", "c"));
writer.addDocument(config.build(taxoWriter, doc));
doc = new Document();
doc.add(new FacetField("dim", "d"));
writer.addDocument(config.build(taxoWriter, doc));
// NRT open
IndexSearcher searcher = getNewSearcher(writer.getReader());
//System.out.println("searcher=" + searcher);
// NRT open
TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
DrillDownQuery ddq = new DrillDownQuery(config);
ddq.add("dim", "a");
DrillSidewaysResult r = getNewDrillSideways(searcher, config, taxoReader).search(null, ddq, 10);
assertEquals(3, r.hits.totalHits.value);
assertEquals("dim=dim path=[] value=6 childCount=4\n a (3)\n b (1)\n c (1)\n d (1)\n",
r.facets.getTopChildren(10, "dim").toString());
assertEquals("dim=dim path=[a] value=3 childCount=3\n x (1)\n y (1)\n z (1)\n",
r.facets.getTopChildren(10, "dim", "a").toString());
writer.close();
IOUtils.close(searcher.getIndexReader(), taxoReader, taxoWriter, dir, taxoDir);
}
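// Simple in-memory model of an indexed doc, used to compute the
// expected ("gold") results for the randomized test: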
private static class Doc implements Comparable<Doc> {
String id;
String contentToken;
// -1 if the doc is missing this dim, else the index
// into the values for this dim:
int[] dims;
// 2nd value per dim for the doc (so we test
// multi-valued fields):
int[] dims2;
boolean deleted;
@Override
public int compareTo(Doc other) {
return id.compareTo(other.id);
}
}
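// Skewed probabilities for the random content tokens a/b/c: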
private double aChance, bChance, cChance;
private String randomContentToken(boolean isQuery) {
double d = random().nextDouble();
if (isQuery) {
if (d < 0.33) {
return "a";
} else if (d < 0.66) {
return "b";
} else {
return "c";
}
} else {
if (d <= aChance) {
return "a";
} else if (d < aChance + bChance) {
return "b";
} else {
return "c";
}
}
}
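// Randomized test: index random docs/dims, drill down on random values,
// then verify hits, scores and facet counts against a slow linear-scan
// computation (see slowDrillSidewaysSearch):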
public void testRandom() throws Exception {
while (aChance == 0.0) {
aChance = random().nextDouble();
}
while (bChance == 0.0) {
bChance = random().nextDouble();
}
while (cChance == 0.0) {
cChance = random().nextDouble();
}
//aChance = .01;
//bChance = 0.5;
//cChance = 1.0;
double sum = aChance + bChance + cChance;
aChance /= sum;
bChance /= sum;
cChance /= sum;
int numDims = TestUtil.nextInt(random(), 2, 5);
//int numDims = 3;
int numDocs = atLeast(300);
//int numDocs = 20;
if (VERBOSE) {
System.out.println(
"numDims=" + numDims + " numDocs=" + numDocs + " aChance=" + aChance + " bChance=" + bChance + " cChance="
+ cChance);
}
String[][] dimValues = new String[numDims][];
int valueCount = 2;
for (int dim = 0; dim < numDims; dim++) {
Set<String> values = new HashSet<>();
while (values.size() < valueCount) {
String s = TestUtil.randomRealisticUnicodeString(random());
//String s = _TestUtil.randomString(random());
if (s.length() > 0) {
values.add(s);
}
}
dimValues[dim] = values.toArray(new String[values.size()]);
valueCount *= 2;
}
List<Doc> docs = new ArrayList<>();
for (int i = 0; i < numDocs; i++) {
Doc doc = new Doc();
doc.id = "" + i;
doc.contentToken = randomContentToken(false);
doc.dims = new int[numDims];
doc.dims2 = new int[numDims];
for (int dim = 0; dim < numDims; dim++) {
if (random().nextInt(5) == 3) {
// This doc is missing this dim:
doc.dims[dim] = -1;
} else if (dimValues[dim].length <= 4) {
int dimUpto = 0;
doc.dims[dim] = dimValues[dim].length - 1;
while (dimUpto < dimValues[dim].length) {
if (random().nextBoolean()) {
doc.dims[dim] = dimUpto;
break;
}
dimUpto++;
}
} else {
doc.dims[dim] = random().nextInt(dimValues[dim].length);
}
if (random().nextInt(5) == 3) {
// 2nd value:
doc.dims2[dim] = random().nextInt(dimValues[dim].length);
} else {
doc.dims2[dim] = -1;
}
}
docs.add(doc);
}
Directory d = newDirectory();
Directory td = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
iwc.setInfoStream(InfoStream.NO_OUTPUT);
RandomIndexWriter w = new RandomIndexWriter(random(), d, iwc);
DirectoryTaxonomyWriter tw = new DirectoryTaxonomyWriter(td, IndexWriterConfig.OpenMode.CREATE);
FacetsConfig config = new FacetsConfig();
for (int i = 0; i < numDims; i++) {
config.setMultiValued("dim" + i, true);
}
boolean doUseDV = random().nextBoolean();
for (Doc rawDoc : docs) {
Document doc = new Document();
doc.add(newStringField("id", rawDoc.id, Field.Store.YES));
doc.add(new SortedDocValuesField("id", new BytesRef(rawDoc.id)));
doc.add(newStringField("content", rawDoc.contentToken, Field.Store.NO));
if (VERBOSE) {
System.out.println(" doc id=" + rawDoc.id + " token=" + rawDoc.contentToken);
}
for (int dim = 0; dim < numDims; dim++) {
int dimValue = rawDoc.dims[dim];
if (dimValue != -1) {
if (doUseDV) {
doc.add(new SortedSetDocValuesFacetField("dim" + dim, dimValues[dim][dimValue]));
} else {
doc.add(new FacetField("dim" + dim, dimValues[dim][dimValue]));
}
doc.add(new StringField("dim" + dim, dimValues[dim][dimValue], Field.Store.YES));
if (VERBOSE) {
System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue]));
}
}
int dimValue2 = rawDoc.dims2[dim];
if (dimValue2 != -1) {
if (doUseDV) {
doc.add(new SortedSetDocValuesFacetField("dim" + dim, dimValues[dim][dimValue2]));
} else {
doc.add(new FacetField("dim" + dim, dimValues[dim][dimValue2]));
}
doc.add(new StringField("dim" + dim, dimValues[dim][dimValue2], Field.Store.YES));
if (VERBOSE) {
System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue2]));
}
}
}
w.addDocument(config.build(tw, doc));
}
if (random().nextBoolean()) {
// Randomly delete a few docs:
int numDel = TestUtil.nextInt(random(), 1, (int) (numDocs * 0.05));
if (VERBOSE) {
System.out.println("delete " + numDel);
}
int delCount = 0;
while (delCount < numDel) {
Doc doc = docs.get(random().nextInt(docs.size()));
if (!doc.deleted) {
if (VERBOSE) {
System.out.println(" delete id=" + doc.id);
}
doc.deleted = true;
w.deleteDocuments(new Term("id", doc.id));
delCount++;
}
}
}
if (random().nextBoolean()) {
if (VERBOSE) {
System.out.println("TEST: forceMerge(1)...");
}
w.forceMerge(1);
}
IndexReader r = w.getReader();
final SortedSetDocValuesReaderState sortedSetDVState;
IndexSearcher s = getNewSearcher(r);
if (doUseDV) {
sortedSetDVState = new DefaultSortedSetDocValuesReaderState(s.getIndexReader());
} else {
sortedSetDVState = null;
}
if (VERBOSE) {
System.out.println("r.numDocs() = " + r.numDocs());
}
// NRT open
TaxonomyReader tr = new DirectoryTaxonomyReader(tw);
int numIters = atLeast(10);
for (int iter = 0; iter < numIters; iter++) {
String contentToken = random().nextInt(30) == 17 ? null : randomContentToken(true);
int numDrillDown = TestUtil.nextInt(random(), 1, Math.min(4, numDims));
if (VERBOSE) {
System.out.println("\nTEST: iter=" + iter + " baseQuery=" + contentToken + " numDrillDown=" + numDrillDown
+ " useSortedSetDV=" + doUseDV);
}
String[][] drillDowns = new String[numDims][];
int count = 0;
boolean anyMultiValuedDrillDowns = false;
while (count < numDrillDown) {
int dim = random().nextInt(numDims);
if (drillDowns[dim] == null) {
if (random().nextBoolean()) {
// Drill down on one value:
drillDowns[dim] = new String[] { dimValues[dim][random().nextInt(dimValues[dim].length)] };
} else {
int orCount = TestUtil.nextInt(random(), 1, Math.min(5, dimValues[dim].length));
drillDowns[dim] = new String[orCount];
anyMultiValuedDrillDowns |= orCount > 1;
for (int i = 0; i < orCount; i++) {
while (true) {
String value = dimValues[dim][random().nextInt(dimValues[dim].length)];
for (int j = 0; j < i; j++) {
if (value.equals(drillDowns[dim][j])) {
value = null;
break;
}
}
if (value != null) {
drillDowns[dim][i] = value;
break;
}
}
}
}
if (VERBOSE) {
BytesRef[] values = new BytesRef[drillDowns[dim].length];
for (int i = 0; i < values.length; i++) {
values[i] = new BytesRef(drillDowns[dim][i]);
}
System.out.println(" dim" + dim + "=" + Arrays.toString(values));
}
count++;
}
}
Query baseQuery;
if (contentToken == null) {
baseQuery = new MatchAllDocsQuery();
} else {
baseQuery = new TermQuery(new Term("content", contentToken));
}
DrillDownQuery ddq = new DrillDownQuery(config, baseQuery);
for (int dim = 0; dim < numDims; dim++) {
if (drillDowns[dim] != null) {
for (String value : drillDowns[dim]) {
ddq.add("dim" + dim, value);
}
}
}
Query filter;
if (random().nextInt(7) == 6) {
if (VERBOSE) {
System.out.println(" only-even filter");
}
filter = new Query() {
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
return new ConstantScoreWeight(this, boost) {
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
DocIdSetIterator approximation = DocIdSetIterator.all(context.reader().maxDoc());
return new ConstantScoreScorer(this, score(), scoreMode, new TwoPhaseIterator(approximation) {
@Override
public boolean matches() throws IOException {
int docID = approximation.docID();
return (Integer.parseInt(context.reader().document(docID).get("id")) & 1) == 0;
}
@Override
public float matchCost() {
return 1000f;
}
});
}
@Override
public boolean isCacheable(LeafReaderContext ctx) {
return false;
}
};
}
@Override
public void visit(QueryVisitor visitor) {
}
@Override
public String toString(String field) {
return "drillSidewaysTestFilter";
}
@Override
public boolean equals(Object o) {
return o == this;
}
@Override
public int hashCode() {
return System.identityHashCode(this);
}
};
} else {
filter = null;
}
// Verify docs are always collected in order. If we
// had an AssertingScorer it could catch it when
// Weight.scoresDocsOutOfOrder lies!:
getNewDrillSideways(s, config, tr)
.search(
ddq,
new org.apache.lucene.search.SimpleCollector() {
int lastDocID;
@Override
public void collect(int doc) {
assert doc > lastDocID;
lastDocID = doc;
}
@Override
protected void doSetNextReader(LeafReaderContext context) throws IOException {
lastDocID = -1;
}
@Override
public ScoreMode scoreMode() {
return ScoreMode.COMPLETE_NO_SCORES;
}
});
// Also separately verify that DS respects the
// scoreSubDocsAtOnce method, to ensure that all
// subScorers are on the same docID:
if (!anyMultiValuedDrillDowns) {
// Can only do this test when there are no OR'd
// drill-down values, because in that case it's
// easily possible for one of the DD terms to be on
// a future docID:
getNewDrillSidewaysScoreSubdocsAtOnce(s, config, tr).search(ddq, new AssertingSubDocsAtOnceCollector());
}
Sort sort = new Sort(new SortField("id", SortField.Type.STRING));
DrillSideways ds;
if (doUseDV) {
ds = getNewDrillSideways(s, config, sortedSetDVState);
} else {
ds = getNewDrillSidewaysBuildFacetsResult(s, config, tr);
}
DSResults actual = new DSResults();
if (random().nextBoolean()) {
DrillSidewaysResult dsr = ds.search(ddq, filter, null, numDocs, sort, true);
actual.resultCount = dsr.hits.totalHits;
for (ScoreDoc scoreDoc : dsr.hits.scoreDocs) {
actual.results.add(new DocAndScore(scoreDoc.doc, scoreDoc.score));
}
actual.facets = dsr.facets;
} else {
// can't support filter queries with this impl
filter = null;
CollectorManager<SimpleCollector, List<DocAndScore>> manager =
new SimpleCollectorManager(numDocs, Comparator.comparing(cr -> cr.id));
// Because we validate the scores computed through DrillSideways against those found through
// a direct search against IndexSearcher, make sure our Collectors announce themselves as
// requiring scores. See the conversation in LUCENE-10060 where this bug was introduced and
// then discovered:
assertTrue(manager.newCollector().scoreMode().needsScores());
DrillSideways.ConcurrentDrillSidewaysResult<List<DocAndScore>> cr = ds.search(ddq, manager);
actual.results = cr.collectorResult;
actual.resultCount = new TotalHits(actual.results.size(), TotalHits.Relation.EQUAL_TO);
actual.facets = cr.facets;
}
TestFacetResult expected =
slowDrillSidewaysSearch(s, docs, contentToken, drillDowns, dimValues, filter);
TopDocs hits = s.search(baseQuery, numDocs);
Map<String, Float> scores = new HashMap<>();
for (ScoreDoc sd : hits.scoreDocs) {
scores.put(s.doc(sd.doc).get("id"), sd.score);
}
if (VERBOSE) {
System.out.println(" verify all facets");
}
verifyEquals(dimValues, s, expected, actual, scores, doUseDV);
// Make sure drill down doesn't change score:
Query q = ddq;
if (filter != null) {
q = new BooleanQuery.Builder().add(q, Occur.MUST).add(filter, Occur.FILTER).build();
}
TopDocs ddqHits = s.search(q, numDocs);
assertEquals(expected.hits.size(), ddqHits.totalHits.value);
for (int i = 0; i < expected.hits.size(); i++) {
// Score should be IDENTICAL:
assertEquals(scores.get(expected.hits.get(i).id), ddqHits.scoreDocs[i].score, 0.0f);
}
}
w.close();
IOUtils.close(r, tr, tw, d, td);
}
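// Per-dimension facet counters for the expected results; a doc's two
// values in the same dim are only counted once when they're equal: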
private static class Counters {
int[][] counts;
public Counters(String[][] dimValues) {
counts = new int[dimValues.length][];
for (int dim = 0; dim < dimValues.length; dim++) {
counts[dim] = new int[dimValues[dim].length];
}
}
public void inc(int[] dims, int[] dims2) {
inc(dims, dims2, -1);
}
public void inc(int[] dims, int[] dims2, int onlyDim) {
assert dims.length == counts.length;
assert dims2.length == counts.length;
for (int dim = 0; dim < dims.length; dim++) {
if (onlyDim == -1 || dim == onlyDim) {
if (dims[dim] != -1) {
counts[dim][dims[dim]]++;
}
if (dims2[dim] != -1 && dims2[dim] != dims[dim]) {
counts[dim][dims2[dim]]++;
}
}
}
}
}
private static class TestFacetResult {
List<Doc> hits;
int[][] counts;
int[] uniqueCounts;
public TestFacetResult() {}
}
private static class DocAndScore {
final int doc;
final float score;
DocAndScore(int doc, float score) {
this.doc = doc;
this.score = score;
}
DocAndScore(DocAndScore other) {
this.doc = other.doc;
this.score = other.score;
}
}
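// Actual results produced by DrillSideways (hit count, hits and facets):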
private static class DSResults {
TotalHits resultCount;
List<DocAndScore> results = new ArrayList<>();
Facets facets;
}
private static class CollectedResult {
final DocAndScore docAndScore;
final String id;
CollectedResult(DocAndScore docAndScore, String id) {
this.docAndScore = docAndScore;
this.id = id;
}
}
private abstract static class SimpleLeafCollector implements LeafCollector {
protected Scorable scorer;
}
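// Minimal Collector that records each hit's global docID, score and
// its "id" doc value: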
private static class SimpleCollector implements Collector {
private final ScoreMode scoreMode;
final List<CollectedResult> hits = new ArrayList<>();
SimpleCollector(ScoreMode scoreMode) {
this.scoreMode = scoreMode;
}
@Override
public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
int docBase = context.docBase;
SortedDocValues idVals = DocValues.getSorted(context.reader(), "id");
return new SimpleLeafCollector() {
@Override
public void setScorer(Scorable scorer) {
super.scorer = scorer;
}
@Override
public void collect(int doc) throws IOException {
float score = scorer.score();
String id = null;
if (idVals.advanceExact(doc)) {
id = idVals.lookupOrd(idVals.ordValue()).utf8ToString();
}
hits.add(new CollectedResult(new DocAndScore(docBase + doc, score), id));
}
};
}
@Override
public ScoreMode scoreMode() {
return scoreMode;
}
}
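// Merges hits from all SimpleCollectors and returns up to 'numDocs' of
// them, ordered by the provided comparator: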
private static class SimpleCollectorManager
implements CollectorManager<SimpleCollector, List<DocAndScore>> {
private final int numDocs;
private final Comparator<CollectedResult> comparator;
private final ScoreMode scoreMode;
SimpleCollectorManager(int numDocs, Comparator<CollectedResult> comparator) {
this(numDocs, comparator, ScoreMode.COMPLETE);
}
SimpleCollectorManager(
int numDocs, Comparator<CollectedResult> comparator, ScoreMode scoreMode) {
this.numDocs = numDocs;
this.comparator = comparator;
this.scoreMode = scoreMode;
}
@Override
public SimpleCollector newCollector() {
return new SimpleCollector(scoreMode);
}
@Override
public List<DocAndScore> reduce(Collection<SimpleCollector> collectors) {
List<CollectedResult> all = new ArrayList<>();
for (SimpleCollector c : collectors) {
all.addAll(c.hits);
}
// Just sorts using the provided comparator and returns up to 'numDocs' results.
// This is an inefficient way to go about doing this, but it's only for unit testing.
return all.stream()
.sorted(comparator)
.map(cr -> new DocAndScore(cr.docAndScore))
.limit(numDocs)
.collect(Collectors.toList());
}
}
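// Expected top-N ordinals: sort all ords by count (descending), then by
// label (ascending), and truncate at the first zero count: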
private int[] getTopNOrds(final int[] counts, final String[] values, int topN) {
final int[] ids = new int[counts.length];
for (int i = 0; i < ids.length; i++) {
ids[i] = i;
}
// Naive (on purpose, to reduce the chance of bugs in the tester/gold):
// sort all ids, then return the top-N slice:
new InPlaceMergeSorter() {
@Override
protected void swap(int i, int j) {
int id = ids[i];
ids[i] = ids[j];
ids[j] = id;
}
@Override
protected int compare(int i, int j) {
int counti = counts[ids[i]];
int countj = counts[ids[j]];
// Sort by count descending...
if (counti > countj) {
return -1;
} else if (counti < countj) {
return 1;
} else {
// ... then by label ascending:
return new BytesRef(values[ids[i]]).compareTo(new BytesRef(values[ids[j]]));
}
}
}.sort(0, ids.length);
if (topN > ids.length) {
topN = ids.length;
}
int numSet = topN;
for (int i = 0; i < topN; i++) {
if (counts[ids[i]] == 0) {
numSet = i;
break;
}
}
int[] topNIDs = new int[numSet];
System.arraycopy(ids, 0, topNIDs, 0, topNIDs.length);
return topNIDs;
}
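// Slow "gold" implementation: linearly scan all docs, apply the base
// query, filter and drill-downs, and compute expected hits plus
// drill-down and drill-sideways counts: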
private TestFacetResult slowDrillSidewaysSearch(IndexSearcher s, List<Doc> docs, String contentToken,
String[][] drillDowns, String[][] dimValues, Query onlyEven) throws Exception {
int numDims = dimValues.length;
List<Doc> hits = new ArrayList<>();
Counters drillDownCounts = new Counters(dimValues);
Counters[] drillSidewaysCounts = new Counters[dimValues.length];
for (int dim = 0; dim < numDims; dim++) {
drillSidewaysCounts[dim] = new Counters(dimValues);
}
if (VERBOSE) {
System.out.println(" compute expected");
}
nextDoc:
for (Doc doc : docs) {
if (doc.deleted) {
continue;
}
if (onlyEven != null & (Integer.parseInt(doc.id) & 1) != 0) {
continue;
}
if (contentToken == null || doc.contentToken.equals(contentToken)) {
int failDim = -1;
for (int dim = 0; dim < numDims; dim++) {
if (drillDowns[dim] != null) {
String docValue = doc.dims[dim] == -1 ? null : dimValues[dim][doc.dims[dim]];
String docValue2 = doc.dims2[dim] == -1 ? null : dimValues[dim][doc.dims2[dim]];
boolean matches = false;
for (String value : drillDowns[dim]) {
if (value.equals(docValue) || value.equals(docValue2)) {
matches = true;
break;
}
}
if (!matches) {
if (failDim == -1) {
// Doc could be a near-miss, if no other dim fails
failDim = dim;
} else {
// Doc isn't a hit nor a near-miss
continue nextDoc;
}
}
}
}
if (failDim == -1) {
if (VERBOSE) {
System.out.println(" exp: id=" + doc.id + " is a hit");
}
// Hit:
hits.add(doc);
drillDownCounts.inc(doc.dims, doc.dims2);
for (int dim = 0; dim < dimValues.length; dim++) {
drillSidewaysCounts[dim].inc(doc.dims, doc.dims2);
}
} else {
if (VERBOSE) {
System.out.println(" exp: id=" + doc.id + " is a near-miss on dim=" + failDim);
}
drillSidewaysCounts[failDim].inc(doc.dims, doc.dims2, failDim);
}
}
}
Map<String, Integer> idToDocID = new HashMap<>();
for (int i = 0; i < s.getIndexReader().maxDoc(); i++) {
idToDocID.put(s.doc(i).get("id"), i);
}
Collections.sort(hits);
TestFacetResult res = new TestFacetResult();
res.hits = hits;
res.counts = new int[numDims][];
res.uniqueCounts = new int[numDims];
for (int dim = 0; dim < numDims; dim++) {
if (drillDowns[dim] != null) {
res.counts[dim] = drillSidewaysCounts[dim].counts[dim];
} else {
res.counts[dim] = drillDownCounts.counts[dim];
}
int uniqueCount = 0;
for (int j = 0; j < res.counts[dim].length; j++) {
if (res.counts[dim][j] != 0) {
uniqueCount++;
}
}
res.uniqueCounts[dim] = uniqueCount;
}
return res;
}
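// Asserts the DrillSideways hits, scores and per-dim facet counts match
// the expected (slow) computation: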
void verifyEquals(
String[][] dimValues,
IndexSearcher s,
TestFacetResult expected,
DSResults actual,
Map<String, Float> scores, boolean isSortedSetDV) throws Exception {
if (VERBOSE) {
System.out.println(" verify totHits=" + expected.hits.size());
}
assertEquals(expected.hits.size(), actual.resultCount.value);
assertEquals(expected.hits.size(), actual.results.size());
for (int i = 0; i < expected.hits.size(); i++) {
if (VERBOSE) {
System.out.println(" hit " + i + " expected=" + expected.hits.get(i).id);
}
assertEquals(expected.hits.get(i).id, s.doc(actual.results.get(i).doc).get("id"));
// Score should be IDENTICAL:
assertEquals(scores.get(expected.hits.get(i).id), actual.results.get(i).score, 0.0f);
}
for (int dim = 0; dim < expected.counts.length; dim++) {
int topN = random().nextBoolean() ? dimValues[dim].length : TestUtil.nextInt(random(), 1, dimValues[dim].length);
FacetResult fr = actual.facets.getTopChildren(topN, "dim" + dim);
if (VERBOSE) {
System.out.println(" dim" + dim + " topN=" + topN + " (vs " + dimValues[dim].length + " unique values)");
System.out.println(" actual");
}
int idx = 0;
Map<String, Integer> actualValues = new HashMap<>();
if (fr != null) {
for (LabelAndValue labelValue : fr.labelValues) {
actualValues.put(labelValue.label, labelValue.value.intValue());
if (VERBOSE) {
System.out.println(" " + idx + ": " + new BytesRef(labelValue.label) + ": " + labelValue.value);
idx++;
}
}
assertEquals("dim=" + dim, expected.uniqueCounts[dim], fr.childCount);
}
if (topN < dimValues[dim].length) {
int[] topNIDs = getTopNOrds(expected.counts[dim], dimValues[dim], topN);
if (VERBOSE) {
idx = 0;
System.out.println(" expected (sorted)");
for (int i = 0; i < topNIDs.length; i++) {
int expectedOrd = topNIDs[i];
String value = dimValues[dim][expectedOrd];
System.out.println(
" " + idx + ": " + new BytesRef(value) + ": " + expected.counts[dim][expectedOrd]);
idx++;
}
}
if (VERBOSE) {
System.out.println(" topN=" + topN + " expectedTopN=" + topNIDs.length);
}
if (fr != null) {
assertEquals(topNIDs.length, fr.labelValues.length);
} else {
assertEquals(0, topNIDs.length);
}
for (int i = 0; i < topNIDs.length; i++) {
int expectedOrd = topNIDs[i];
assertEquals(expected.counts[dim][expectedOrd], fr.labelValues[i].value.intValue());
if (isSortedSetDV) {
// Tie-break facet labels are only in unicode
// order with SortedSetDVFacets:
assertEquals("value @ idx=" + i, dimValues[dim][expectedOrd], fr.labelValues[i].label);
}
}
} else {
if (VERBOSE) {
idx = 0;
System.out.println(" expected (unsorted)");
for (int i = 0; i < dimValues[dim].length; i++) {
String value = dimValues[dim][i];
if (expected.counts[dim][i] != 0) {
System.out.println(" " + idx + ": " + new BytesRef(value) + ": " + expected.counts[dim][i]);
idx++;
}
}
}
int setCount = 0;
for (int i = 0; i < dimValues[dim].length; i++) {
String value = dimValues[dim][i];
if (expected.counts[dim][i] != 0) {
assertTrue(actualValues.containsKey(value));
assertEquals(expected.counts[dim][i], actualValues.get(value).intValue());
setCount++;
} else {
assertFalse(actualValues.containsKey(value));
}
}
assertEquals(setCount, actualValues.size());
}
}
}
public void testEmptyIndex() throws Exception {
// LUCENE-5045: make sure DrillSideways works with an empty index
Directory dir = newDirectory();
Directory taxoDir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE);
IndexSearcher searcher = getNewSearcher(writer.getReader());
TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
// Count "Author"
FacetsConfig config = new FacetsConfig();
DrillSideways ds = getNewDrillSideways(searcher, config, taxoReader);
DrillDownQuery ddq = new DrillDownQuery(config);
ddq.add("Author", "Lisa");
DrillSidewaysResult r = ds.search(ddq, 10); // this used to fail on IllegalArgEx
assertEquals(0, r.hits.totalHits.value);
r = ds.search(ddq, null, null, 10, new Sort(new SortField("foo", SortField.Type.INT)), false); // this used to fail on IllegalArgEx
assertEquals(0, r.hits.totalHits.value);
writer.close();
IOUtils.close(taxoWriter, searcher.getIndexReader(), taxoReader, dir, taxoDir);
}
public void testNoDrillDownFacetCollection() throws Exception {
Directory dir = newDirectory();
Directory taxoDir = newDirectory();
// Writes facet ords to a separate directory from the
// main index:
DirectoryTaxonomyWriter taxoWriter =
new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE);
FacetsConfig config = new FacetsConfig();
config.setHierarchical("Publish Date", true);
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
Document doc = new Document();
doc.add(new FacetField("Author", "Bob"));
doc.add(new FacetField("Publish Date", "2010", "10", "15"));
writer.addDocument(config.build(taxoWriter, doc));
doc = new Document();
doc.add(new FacetField("Author", "Lisa"));
doc.add(new FacetField("Publish Date", "2010", "10", "20"));
writer.addDocument(config.build(taxoWriter, doc));
doc = new Document();
doc.add(new FacetField("Author", "Lisa"));
doc.add(new FacetField("Publish Date", "2012", "1", "1"));
writer.addDocument(config.build(taxoWriter, doc));
doc = new Document();
doc.add(new FacetField("Author", "Susan"));
doc.add(new FacetField("Publish Date", "2012", "1", "7"));
writer.addDocument(config.build(taxoWriter, doc));
doc = new Document();
doc.add(new FacetField("Author", "Frank"));
doc.add(new FacetField("Publish Date", "1999", "5", "5"));
writer.addDocument(config.build(taxoWriter, doc));
// NRT open
IndexSearcher searcher = getNewSearcher(writer.getReader());
// NRT open
TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
// Sometimes pass an ExecutorService to test both paths for providing null drill down collectors
// The actual configuration of the ExecutorService doesn't matter at all:
ExecutorService executorService = null;
if (random().nextBoolean()) {
executorService = Executors.newSingleThreadExecutor(new NamedThreadFactory("ds_test"));
}
// Don't collect for drill down faceting
DrillSideways ds =
new DrillSideways(searcher, config, taxoReader, null, executorService) {
@Override
protected FacetsCollector createDrillDownFacetsCollector() {
return null;
}
@Override
protected FacetsCollectorManager createDrillDownFacetsCollectorManager() {
return null;
}
};
// Another simple case: drill-down on single fields
// but OR of two values
DrillDownQuery ddq = new DrillDownQuery(config);
ddq.add("Author", "Lisa");
ddq.add("Author", "Bob");
DrillSidewaysResult r = ds.search(null, ddq, 10);
Facets facets = r.facets;
assertEquals(3, r.hits.totalHits.value);
assertEquals(
"dim=Author path=[] value=5 childCount=4\n Lisa (2)\n Bob (1)\n Susan (1)\n Frank (1)\n",
facets.getTopChildren(10, "Author").toString());
// Because we don't collect drill-downs, we shouldn't be able to get counts for Publish Date
expectThrows(IllegalArgumentException.class, () -> facets.getTopChildren(10, "Publish Date"));
assertTrue(facets instanceof MultiFacets);
List<FacetResult> allResults = facets.getAllDims(10);
// Should only have the one dimension because we didn't collect for drill down
assertEquals(1, allResults.size());
assertEquals(
"dim=Author path=[] value=5 childCount=4\n Lisa (2)\n Bob (1)\n Susan (1)\n Frank (1)\n",
allResults.get(0).toString());
// More interesting case: drill-down on two fields
ddq = new DrillDownQuery(config);
ddq.add("Author", "Lisa");
ddq.add("Publish Date", "2010");
r = ds.search(null, ddq, 10);
assertEquals(1, r.hits.totalHits.value);
// Should be able to count on both fields since they're both drill sideways cases
assertEquals(
"dim=Publish Date path=[] value=2 childCount=2\n 2010 (1)\n 2012 (1)\n",
r.facets.getTopChildren(10, "Publish Date").toString());
assertEquals(
"dim=Author path=[] value=2 childCount=2\n Bob (1)\n Lisa (1)\n",
r.facets.getTopChildren(10, "Author").toString());
if (executorService != null) {
executorService.shutdown();
}
writer.close();
IOUtils.close(searcher.getIndexReader(), taxoReader, taxoWriter, dir, taxoDir);
}
public void testScorer() throws Exception {
// LUCENE-6001: some scorers, e.g. ReqExclScorer, can hit NPE if cost is called after nextDoc
Directory dir = newDirectory();
Directory taxoDir = newDirectory();
// Writes facet ords to a separate directory from the
// main index:
DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE);
FacetsConfig config = new FacetsConfig();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
Document doc = new Document();
doc.add(newTextField("field", "foo bar", Field.Store.NO));
doc.add(new FacetField("Author", "Bob"));
doc.add(new FacetField("dim", "a"));
writer.addDocument(config.build(taxoWriter, doc));
// NRT open
IndexSearcher searcher = getNewSearcher(writer.getReader());
// NRT open
TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
DrillSideways ds = getNewDrillSideways(searcher, config, taxoReader);
BooleanQuery.Builder bq = new BooleanQuery.Builder();
bq.add(new TermQuery(new Term("field", "foo")), BooleanClause.Occur.MUST);
bq.add(new TermQuery(new Term("field", "bar")), BooleanClause.Occur.MUST_NOT);
DrillDownQuery ddq = new DrillDownQuery(config, bq.build());
ddq.add("field", "foo");
ddq.add("author", bq.build());
ddq.add("dim", bq.build());
DrillSidewaysResult r = ds.search(null, ddq, 10);
assertEquals(0, r.hits.totalHits.value);
writer.close();
IOUtils.close(searcher.getIndexReader(), taxoReader, taxoWriter, dir, taxoDir);
}
public void testExtendedDrillSidewaysResult() throws Exception {
// LUCENE-9945: Extend DrillSideways to support exposing FacetCollectors directly
Directory dir = newDirectory();
Directory taxoDir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
DirectoryTaxonomyWriter taxoWriter =
new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE);
FacetsConfig config = new FacetsConfig();
config.setHierarchical("dim", true);
Document doc = new Document();
doc.add(new FacetField("dim", "a"));
writer.addDocument(config.build(taxoWriter, doc));
Document doc2 = new Document();
doc2.add(new FacetField("dim", "x"));
writer.addDocument(config.build(taxoWriter, doc2));
// open NRT
IndexSearcher searcher = getNewSearcher(writer.getReader());
TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
DrillDownQuery ddq = new DrillDownQuery(config);
ddq.add("dim", "x");
DrillSideways ds = getNewDrillSidewaysBuildFacetsResult(searcher, config, taxoReader);
SimpleCollectorManager manager =
new SimpleCollectorManager(
10, (a, b) -> Float.compare(b.docAndScore.score, a.docAndScore.score));
SimpleCollector collector = manager.newCollector();
// Sometimes pass in a Collector and sometimes CollectorManager
// so that we can test both DrillSidewaysResult and ConcurrentDrillSidewaysResult
DrillSidewaysResult r;
if (random().nextBoolean()) {
r = ds.search(ddq, collector);
} else {
r = ds.search(ddq, manager);
}
// compute Facets using exposed FacetCollectors from DrillSidewaysResult
Map<String, Facets> drillSidewaysFacets = new HashMap<>();
Facets drillDownFacets = getTaxonomyFacetCounts(taxoReader, config, r.drillDownFacetsCollector);
if (r.drillSidewaysFacetsCollector != null) {
for (int i = 0; i < r.drillSidewaysFacetsCollector.length; i++) {
drillSidewaysFacets.put(
r.drillSidewaysDims[i],
getTaxonomyFacetCounts(taxoReader, config, r.drillSidewaysFacetsCollector[i]));
}
}
Facets facets;
if (drillSidewaysFacets.isEmpty()) {
facets = drillDownFacets;
} else {
facets = new MultiFacets(drillSidewaysFacets, drillDownFacets);
}
// Facets computed using the FacetsCollectors exposed in DrillSidewaysResult
// should match the Facets computed by {@link DrillSideways#buildFacetsResult}:
FacetResult facetResultActual = facets.getTopChildren(2, "dim");
FacetResult facetResultExpected = r.facets.getTopChildren(2, "dim");
assertEquals(facetResultExpected.dim, facetResultActual.dim);
assertEquals(facetResultExpected.path.length, facetResultActual.path.length);
assertEquals(facetResultExpected.value, facetResultActual.value);
assertEquals(facetResultExpected.childCount, facetResultActual.childCount);
writer.close();
IOUtils.close(searcher.getIndexReader(), taxoReader, taxoWriter, dir, taxoDir);
}
}