blob: 4c530e7812215d4844e27aaef1a1de8f82ed31ef [file] [log] [blame]
package org.apache.lucene.facet.search;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.facet.params.FacetSearchParams;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MultiCollector;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldCollector;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.Weight;
/**
* Computes drill down and sideways counts for the provided
* {@link DrillDownQuery}. Drill sideways counts include
* alternative values/aggregates for the drill-down
* dimensions so that a dimension does not disappear after
* the user drills down into it.
*
* <p> Use one of the static search
* methods to do the search, and then get the hits and facet
* results from the returned {@link DrillSidewaysResult}.
*
* <p><b>NOTE</b>: this allocates one {@link
* FacetsCollector} for each drill-down, plus one. If your
* index has high number of facet labels then this will
* multiply your memory usage.
*
* @lucene.experimental
*/
public class DrillSideways {
protected final IndexSearcher searcher;
protected final TaxonomyReader taxoReader;
/** Create a new {@code DrillSideways} instance. */
public DrillSideways(IndexSearcher searcher, TaxonomyReader taxoReader) {
this.searcher = searcher;
this.taxoReader = taxoReader;
}
/** Moves any drill-downs that don't have a corresponding
* facet request into the baseQuery. This is unusual,
* yet allowed, because typically the added drill-downs are because
* the user has clicked on previously presented facets,
* and those same facets would be computed this time
* around. */
private static DrillDownQuery moveDrillDownOnlyClauses(DrillDownQuery in, FacetSearchParams fsp) {
Set<String> facetDims = new HashSet<String>();
for(FacetRequest fr : fsp.facetRequests) {
if (fr.categoryPath.length == 0) {
throw new IllegalArgumentException("all FacetRequests must have CategoryPath with length > 0");
}
facetDims.add(fr.categoryPath.components[0]);
}
BooleanClause[] clauses = in.getBooleanQuery().getClauses();
Map<String,Integer> drillDownDims = in.getDims();
String[] dimsByIndex = new String[drillDownDims.size()];
for(Map.Entry<String,Integer> ent : drillDownDims.entrySet()) {
dimsByIndex[ent.getValue()] = ent.getKey();
}
int startClause;
if (clauses.length == drillDownDims.size()) {
startClause = 0;
} else {
assert clauses.length == 1+drillDownDims.size();
startClause = 1;
}
// Break out drill-down clauses that have no
// corresponding facet request and move them inside the
// baseQuery:
List<Query> nonFacetClauses = new ArrayList<Query>();
List<Query> facetClauses = new ArrayList<Query>();
Map<String,Integer> dimToIndex = new LinkedHashMap<String,Integer>();
for(int i=startClause;i<clauses.length;i++) {
Query q = clauses[i].getQuery();
String dim = dimsByIndex[i-startClause];
if (!facetDims.contains(dim)) {
nonFacetClauses.add(q);
} else {
facetClauses.add(q);
dimToIndex.put(dim, dimToIndex.size());
}
}
if (!nonFacetClauses.isEmpty()) {
BooleanQuery newBaseQuery = new BooleanQuery(true);
if (startClause == 1) {
// Add original basaeQuery:
newBaseQuery.add(clauses[0].getQuery(), BooleanClause.Occur.MUST);
}
for(Query q : nonFacetClauses) {
newBaseQuery.add(q, BooleanClause.Occur.MUST);
}
return new DrillDownQuery(fsp.indexingParams, newBaseQuery, facetClauses, dimToIndex);
} else {
// No change:
return in;
}
}
/**
* Search, collecting hits with a {@link Collector}, and
* computing drill down and sideways counts.
*/
@SuppressWarnings({"rawtypes","unchecked"})
public DrillSidewaysResult search(DrillDownQuery query,
Collector hitCollector, FacetSearchParams fsp) throws IOException {
if (query.fip != fsp.indexingParams) {
throw new IllegalArgumentException("DrillDownQuery's FacetIndexingParams should match FacetSearchParams'");
}
query = moveDrillDownOnlyClauses(query, fsp);
Map<String,Integer> drillDownDims = query.getDims();
if (drillDownDims.isEmpty()) {
// Just do ordinary search when there are no drill-downs:
FacetsCollector c = FacetsCollector.create(getDrillDownAccumulator(fsp));
searcher.search(query, MultiCollector.wrap(hitCollector, c));
return new DrillSidewaysResult(c.getFacetResults(), null);
}
List<FacetRequest> ddRequests = new ArrayList<FacetRequest>();
for(FacetRequest fr : fsp.facetRequests) {
assert fr.categoryPath.length > 0;
if (!drillDownDims.containsKey(fr.categoryPath.components[0])) {
ddRequests.add(fr);
}
}
FacetSearchParams fsp2;
if (!ddRequests.isEmpty()) {
fsp2 = new FacetSearchParams(fsp.indexingParams, ddRequests);
} else {
fsp2 = null;
}
BooleanQuery ddq = query.getBooleanQuery();
BooleanClause[] clauses = ddq.getClauses();
Query baseQuery;
int startClause;
if (clauses.length == drillDownDims.size()) {
// TODO: we could optimize this pure-browse case by
// making a custom scorer instead:
baseQuery = new MatchAllDocsQuery();
startClause = 0;
} else {
assert clauses.length == 1+drillDownDims.size();
baseQuery = clauses[0].getQuery();
startClause = 1;
}
FacetsCollector drillDownCollector = fsp2 == null ? null : FacetsCollector.create(getDrillDownAccumulator(fsp2));
FacetsCollector[] drillSidewaysCollectors = new FacetsCollector[drillDownDims.size()];
int idx = 0;
for(String dim : drillDownDims.keySet()) {
List<FacetRequest> requests = new ArrayList<FacetRequest>();
for(FacetRequest fr : fsp.facetRequests) {
assert fr.categoryPath.length > 0;
if (fr.categoryPath.components[0].equals(dim)) {
requests.add(fr);
}
}
if (requests.isEmpty()) {
throw new IllegalArgumentException("could not find FacetRequest for drill-sideways dimension \"" + dim + "\"");
}
drillSidewaysCollectors[idx++] = FacetsCollector.create(getDrillSidewaysAccumulator(dim, new FacetSearchParams(fsp.indexingParams, requests)));
}
boolean useCollectorMethod = scoreSubDocsAtOnce();
Term[][] drillDownTerms = null;
if (!useCollectorMethod) {
// Optimistic: assume subQueries of the DDQ are either
// TermQuery or BQ OR of TermQuery; if this is wrong
// then we detect it and fallback to the mome general
// but slower DrillSidewaysCollector:
drillDownTerms = new Term[clauses.length-startClause][];
for(int i=startClause;i<clauses.length;i++) {
Query q = clauses[i].getQuery();
// DrillDownQuery always wraps each subQuery in
// ConstantScoreQuery:
assert q instanceof ConstantScoreQuery;
q = ((ConstantScoreQuery) q).getQuery();
if (q instanceof TermQuery) {
drillDownTerms[i-startClause] = new Term[] {((TermQuery) q).getTerm()};
} else if (q instanceof BooleanQuery) {
BooleanQuery q2 = (BooleanQuery) q;
BooleanClause[] clauses2 = q2.getClauses();
drillDownTerms[i-startClause] = new Term[clauses2.length];
for(int j=0;j<clauses2.length;j++) {
if (clauses2[j].getQuery() instanceof TermQuery) {
drillDownTerms[i-startClause][j] = ((TermQuery) clauses2[j].getQuery()).getTerm();
} else {
useCollectorMethod = true;
break;
}
}
} else {
useCollectorMethod = true;
}
}
}
if (useCollectorMethod) {
// TODO: maybe we could push the "collector method"
// down into the optimized scorer to have a tighter
// integration ... and so TermQuery clauses could
// continue to run "optimized"
collectorMethod(query, baseQuery, startClause, hitCollector, drillDownCollector, drillSidewaysCollectors);
} else {
DrillSidewaysQuery dsq = new DrillSidewaysQuery(baseQuery, drillDownCollector, drillSidewaysCollectors, drillDownTerms);
searcher.search(dsq, hitCollector);
}
int numDims = drillDownDims.size();
List<FacetResult>[] drillSidewaysResults = new List[numDims];
List<FacetResult> drillDownResults = null;
List<FacetResult> mergedResults = new ArrayList<FacetResult>();
int[] requestUpto = new int[drillDownDims.size()];
int ddUpto = 0;
for(int i=0;i<fsp.facetRequests.size();i++) {
FacetRequest fr = fsp.facetRequests.get(i);
assert fr.categoryPath.length > 0;
Integer dimIndex = drillDownDims.get(fr.categoryPath.components[0]);
if (dimIndex == null) {
// Pure drill down dim (the current query didn't
// drill down on this dim):
if (drillDownResults == null) {
// Lazy init, in case all requests were against
// drill-sideways dims:
drillDownResults = drillDownCollector.getFacetResults();
//System.out.println("get DD results");
}
//System.out.println("add dd results " + i);
mergedResults.add(drillDownResults.get(ddUpto++));
} else {
// Drill sideways dim:
int dim = dimIndex.intValue();
List<FacetResult> sidewaysResult = drillSidewaysResults[dim];
if (sidewaysResult == null) {
// Lazy init, in case no facet request is against
// a given drill down dim:
sidewaysResult = drillSidewaysCollectors[dim].getFacetResults();
drillSidewaysResults[dim] = sidewaysResult;
}
mergedResults.add(sidewaysResult.get(requestUpto[dim]));
requestUpto[dim]++;
}
}
return new DrillSidewaysResult(mergedResults, null);
}
/** Uses the more general but slower method of sideways
* counting. This method allows an arbitrary subQuery to
* implement the drill down for a given dimension. */
private void collectorMethod(DrillDownQuery ddq, Query baseQuery, int startClause, Collector hitCollector, Collector drillDownCollector, Collector[] drillSidewaysCollectors) throws IOException {
BooleanClause[] clauses = ddq.getBooleanQuery().getClauses();
Map<String,Integer> drillDownDims = ddq.getDims();
BooleanQuery topQuery = new BooleanQuery(true);
final DrillSidewaysCollector collector = new DrillSidewaysCollector(hitCollector, drillDownCollector, drillSidewaysCollectors,
drillDownDims);
// TODO: if query is already a BQ we could copy that and
// add clauses to it, instead of doing BQ inside BQ
// (should be more efficient)? Problem is this can
// affect scoring (coord) ... too bad we can't disable
// coord on a clause by clause basis:
topQuery.add(baseQuery, BooleanClause.Occur.MUST);
// NOTE: in theory we could just make a single BQ, with
// +query a b c minShouldMatch=2, but in this case,
// annoyingly, BS2 wraps a sub-scorer that always
// returns 2 as the .freq(), not how many of the
// SHOULD clauses matched:
BooleanQuery subQuery = new BooleanQuery(true);
Query wrappedSubQuery = new QueryWrapper(subQuery,
new SetWeight() {
@Override
public void set(Weight w) {
collector.setWeight(w, -1);
}
});
Query constantScoreSubQuery = new ConstantScoreQuery(wrappedSubQuery);
// Don't impact score of original query:
constantScoreSubQuery.setBoost(0.0f);
topQuery.add(constantScoreSubQuery, BooleanClause.Occur.MUST);
// Unfortunately this sub-BooleanQuery
// will never get BS1 because today BS1 only works
// if topScorer=true... and actually we cannot use BS1
// anyways because we need subDocsScoredAtOnce:
int dimIndex = 0;
for(int i=startClause;i<clauses.length;i++) {
Query q = clauses[i].getQuery();
// DrillDownQuery always wraps each subQuery in
// ConstantScoreQuery:
assert q instanceof ConstantScoreQuery;
q = ((ConstantScoreQuery) q).getQuery();
final int finalDimIndex = dimIndex;
subQuery.add(new QueryWrapper(q,
new SetWeight() {
@Override
public void set(Weight w) {
collector.setWeight(w, finalDimIndex);
}
}),
BooleanClause.Occur.SHOULD);
dimIndex++;
}
// TODO: we could better optimize the "just one drill
// down" case w/ a separate [specialized]
// collector...
int minShouldMatch = drillDownDims.size()-1;
if (minShouldMatch == 0) {
// Must add another "fake" clause so BQ doesn't erase
// itself by rewriting to the single clause:
Query end = new MatchAllDocsQuery();
end.setBoost(0.0f);
subQuery.add(end, BooleanClause.Occur.SHOULD);
minShouldMatch++;
}
subQuery.setMinimumNumberShouldMatch(minShouldMatch);
// System.out.println("EXE " + topQuery);
// Collects against the passed-in
// drillDown/SidewaysCollectors as a side effect:
searcher.search(topQuery, collector);
}
/**
* Search, sorting by {@link Sort}, and computing
* drill down and sideways counts.
*/
public DrillSidewaysResult search(DrillDownQuery query,
Filter filter, FieldDoc after, int topN, Sort sort, boolean doDocScores,
boolean doMaxScore, FacetSearchParams fsp) throws IOException {
if (filter != null) {
query = new DrillDownQuery(filter, query);
}
if (sort != null) {
int limit = searcher.getIndexReader().maxDoc();
if (limit == 0) {
limit = 1; // the collector does not alow numHits = 0
}
topN = Math.min(topN, limit);
final TopFieldCollector hitCollector = TopFieldCollector.create(sort,
topN,
after,
true,
doDocScores,
doMaxScore,
true);
DrillSidewaysResult r = search(query, hitCollector, fsp);
return new DrillSidewaysResult(r.facetResults, hitCollector.topDocs());
} else {
return search(after, query, topN, fsp);
}
}
/**
* Search, sorting by score, and computing
* drill down and sideways counts.
*/
public DrillSidewaysResult search(ScoreDoc after,
DrillDownQuery query, int topN, FacetSearchParams fsp) throws IOException {
int limit = searcher.getIndexReader().maxDoc();
if (limit == 0) {
limit = 1; // the collector does not alow numHits = 0
}
topN = Math.min(topN, limit);
TopScoreDocCollector hitCollector = TopScoreDocCollector.create(topN, after, true);
DrillSidewaysResult r = search(query, hitCollector, fsp);
return new DrillSidewaysResult(r.facetResults, hitCollector.topDocs());
}
/** Override this to use a custom drill-down {@link
* FacetsAccumulator}. */
protected FacetsAccumulator getDrillDownAccumulator(FacetSearchParams fsp) throws IOException {
return FacetsAccumulator.create(fsp, searcher.getIndexReader(), taxoReader);
}
/** Override this to use a custom drill-sideways {@link
* FacetsAccumulator}. */
protected FacetsAccumulator getDrillSidewaysAccumulator(String dim, FacetSearchParams fsp) throws IOException {
return FacetsAccumulator.create(fsp, searcher.getIndexReader(), taxoReader);
}
/** Override this and return true if your collector
* (e.g., ToParentBlockJoinCollector) expects all
* sub-scorers to be positioned on the document being
* collected. This will cause some performance loss;
* default is false. Note that if you return true from
* this method (in a subclass) be sure your collector
* also returns false from {@link
* Collector#acceptsDocsOutOfOrder}: this will trick
* BooleanQuery into also scoring all subDocs at once. */
protected boolean scoreSubDocsAtOnce() {
return false;
}
/**
* Represents the returned result from a drill sideways search. Note that if
* you called
* {@link DrillSideways#search(DrillDownQuery, Collector, FacetSearchParams)},
* then {@link #hits} will be {@code null}.
*/
public static class DrillSidewaysResult {
/** Combined drill down & sideways results. */
public final List<FacetResult> facetResults;
/** Hits. */
public final TopDocs hits;
public DrillSidewaysResult(List<FacetResult> facetResults, TopDocs hits) {
this.facetResults = facetResults;
this.hits = hits;
}
}
private interface SetWeight {
public void set(Weight w);
}
/** Just records which Weight was given out for the
* (possibly rewritten) Query. */
private static class QueryWrapper extends Query {
private final Query originalQuery;
private final SetWeight setter;
public QueryWrapper(Query originalQuery, SetWeight setter) {
this.originalQuery = originalQuery;
this.setter = setter;
}
@Override
public Weight createWeight(final IndexSearcher searcher) throws IOException {
Weight w = originalQuery.createWeight(searcher);
setter.set(w);
return w;
}
@Override
public Query rewrite(IndexReader reader) throws IOException {
Query rewritten = originalQuery.rewrite(reader);
if (rewritten != originalQuery) {
return new QueryWrapper(rewritten, setter);
} else {
return this;
}
}
@Override
public String toString(String s) {
return originalQuery.toString(s);
}
@Override
public boolean equals(Object o) {
if (!(o instanceof QueryWrapper)) return false;
final QueryWrapper other = (QueryWrapper) o;
return super.equals(o) && originalQuery.equals(other.originalQuery);
}
@Override
public int hashCode() {
return super.hashCode() * 31 + originalQuery.hashCode();
}
}
}