blob: 365431c32b7e041987c2ec332719f1f5a07e87ee [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.BulkScorer;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.Scorable;
import org.apache.lucene.search.ScoreCachingWrappingScorer;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
class DrillSidewaysScorer extends BulkScorer {
//private static boolean DEBUG = false;
private final Collector drillDownCollector;
private LeafCollector drillDownLeafCollector;
private final DocsAndCost[] dims;
// DrillDown DocsEnums:
private final Scorer baseScorer;
private final DocIdSetIterator baseIterator;
private final LeafReaderContext context;
final boolean scoreSubDocsAtOnce;
private static final int CHUNK = 2048;
private static final int MASK = CHUNK-1;
private int collectDocID = -1;
private float collectScore;
DrillSidewaysScorer(LeafReaderContext context, Scorer baseScorer, Collector drillDownCollector,
DocsAndCost[] dims, boolean scoreSubDocsAtOnce) {
this.dims = dims;
this.context = context;
this.baseScorer = baseScorer;
this.baseIterator = baseScorer.iterator();
this.drillDownCollector = drillDownCollector;
this.scoreSubDocsAtOnce = scoreSubDocsAtOnce;
}
@Override
public long cost() {
return baseIterator.cost();
}
@Override
public int score(LeafCollector collector, Bits acceptDocs, int min, int maxDoc) throws IOException {
if (min != 0) {
throw new IllegalArgumentException("min must be 0, got " + min);
}
if (maxDoc != Integer.MAX_VALUE) {
throw new IllegalArgumentException("maxDoc must be Integer.MAX_VALUE");
}
//if (DEBUG) {
// System.out.println("\nscore: reader=" + context.reader());
//}
//System.out.println("score r=" + context.reader());
if (drillDownCollector != null) {
drillDownLeafCollector = drillDownCollector.getLeafCollector(context);
} else {
drillDownLeafCollector = null;
}
for (DocsAndCost dim : dims) {
dim.sidewaysLeafCollector = dim.sidewaysCollector.getLeafCollector(context);
}
// some scorers, eg ReqExlScorer, can hit NPE if cost is called after nextDoc
long baseQueryCost = baseIterator.cost();
final int numDims = dims.length;
long drillDownCost = 0;
for (int dim=0;dim<numDims;dim++) {
drillDownCost += dims[dim].approximation.cost();
}
long drillDownAdvancedCost = 0;
if (numDims > 1) {
drillDownAdvancedCost = dims[1].approximation.cost();
}
// Position all scorers to their first matching doc:
baseIterator.nextDoc();
for (DocsAndCost dim : dims) {
dim.approximation.nextDoc();
}
/*
System.out.println("\nbaseDocID=" + baseScorer.docID() + " est=" + estBaseHitCount);
System.out.println(" maxDoc=" + context.reader().maxDoc());
System.out.println(" maxCost=" + maxCost);
System.out.println(" dims[0].freq=" + dims[0].freq);
if (numDims > 1) {
System.out.println(" dims[1].freq=" + dims[1].freq);
}
*/
if (scoreSubDocsAtOnce || baseQueryCost < drillDownCost/10) {
//System.out.println("queryFirst: baseScorer=" + baseScorer + " disis.length=" + disis.length + " bits.length=" + bits.length);
doQueryFirstScoring(acceptDocs, collector, dims);
} else if (numDims > 1 && drillDownAdvancedCost < baseQueryCost/10) {
//System.out.println("drillDownAdvance");
doDrillDownAdvanceScoring(acceptDocs, collector, dims);
} else {
//System.out.println("union");
doUnionScoring(acceptDocs, collector, dims);
}
return Integer.MAX_VALUE;
}
/** Used when base query is highly constraining vs the
* drilldowns, or when the docs must be scored at once
* (i.e., like BooleanScorer2, not BooleanScorer). In
* this case we just .next() on base and .advance() on
* the dim filters. */
private void doQueryFirstScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims) throws IOException {
//if (DEBUG) {
// System.out.println(" doQueryFirstScoring");
//}
setScorer(collector, ScoreCachingWrappingScorer.wrap(baseScorer));
int docID = baseScorer.docID();
nextDoc: while (docID != PostingsEnum.NO_MORE_DOCS) {
if (acceptDocs != null && acceptDocs.get(docID) == false) {
docID = baseIterator.nextDoc();
continue;
}
LeafCollector failedCollector = null;
for (DocsAndCost dim : dims) {
// TODO: should we sort this 2nd dimension of
// docsEnums from most frequent to least?
if (dim.approximation.docID() < docID) {
dim.approximation.advance(docID);
}
boolean matches = false;
if (dim.approximation.docID() == docID) {
if (dim.twoPhase == null) {
matches = true;
} else {
matches = dim.twoPhase.matches();
}
}
if (matches == false) {
if (failedCollector != null) {
// More than one dim fails on this document, so
// it's neither a hit nor a near-miss; move to
// next doc:
docID = baseIterator.nextDoc();
continue nextDoc;
} else {
failedCollector = dim.sidewaysLeafCollector;
}
}
}
collectDocID = docID;
if (failedCollector == null) {
// Hit passed all filters, so it's "real":
collectHit(collector, dims);
} else {
// Hit missed exactly one filter:
collectNearMiss(failedCollector);
}
docID = baseIterator.nextDoc();
}
}
/** Used when drill downs are highly constraining vs
* baseQuery. */
private void doDrillDownAdvanceScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims) throws IOException {
setScorer(collector, new ScoreAndDoc());
final int maxDoc = context.reader().maxDoc();
final int numDims = dims.length;
//if (DEBUG) {
// System.out.println(" doDrillDownAdvanceScoring");
//}
// TODO: maybe a class like BS, instead of parallel arrays
int[] filledSlots = new int[CHUNK];
int[] docIDs = new int[CHUNK];
float[] scores = new float[CHUNK];
int[] missingDims = new int[CHUNK];
int[] counts = new int[CHUNK];
docIDs[0] = -1;
int nextChunkStart = CHUNK;
final FixedBitSet seen = new FixedBitSet(CHUNK);
while (true) {
//if (DEBUG) {
// System.out.println("\ncycle nextChunkStart=" + nextChunkStart + " docIds[0]=" + docIDs[0]);
//}
// First dim:
//if (DEBUG) {
// System.out.println(" dim0");
//}
DocsAndCost dc = dims[0];
int docID = dc.approximation.docID();
while (docID < nextChunkStart) {
if (acceptDocs == null || acceptDocs.get(docID)) {
int slot = docID & MASK;
if (docIDs[slot] != docID && (dc.twoPhase == null || dc.twoPhase.matches())) {
seen.set(slot);
// Mark slot as valid:
//if (DEBUG) {
// System.out.println(" set docID=" + docID + " id=" + context.reader().document(docID).get("id"));
//}
docIDs[slot] = docID;
missingDims[slot] = 1;
counts[slot] = 1;
}
}
docID = dc.approximation.nextDoc();
}
// Second dim:
//if (DEBUG) {
// System.out.println(" dim1");
//}
dc = dims[1];
docID = dc.approximation.docID();
while (docID < nextChunkStart) {
if (acceptDocs == null || acceptDocs.get(docID)
&& (dc.twoPhase == null || dc.twoPhase.matches())) {
int slot = docID & MASK;
if (docIDs[slot] != docID) {
// Mark slot as valid:
seen.set(slot);
//if (DEBUG) {
// System.out.println(" set docID=" + docID + " missingDim=0 id=" + context.reader().document(docID).get("id"));
//}
docIDs[slot] = docID;
missingDims[slot] = 0;
counts[slot] = 1;
} else {
// TODO: single-valued dims will always be true
// below; we could somehow specialize
if (missingDims[slot] >= 1) {
missingDims[slot] = 2;
counts[slot] = 2;
//if (DEBUG) {
// System.out.println(" set docID=" + docID + " missingDim=2 id=" + context.reader().document(docID).get("id"));
//}
} else {
counts[slot] = 1;
//if (DEBUG) {
// System.out.println(" set docID=" + docID + " missingDim=" + missingDims[slot] + " id=" + context.reader().document(docID).get("id"));
//}
}
}
}
docID = dc.approximation.nextDoc();
}
// After this we can "upgrade" to conjunction, because
// any doc not seen by either dim 0 or dim 1 cannot be
// a hit or a near miss:
//if (DEBUG) {
// System.out.println(" baseScorer");
//}
// Fold in baseScorer, using advance:
int filledCount = 0;
int slot0 = 0;
while (slot0 < CHUNK && (slot0 = seen.nextSetBit(slot0)) != DocIdSetIterator.NO_MORE_DOCS) {
int ddDocID = docIDs[slot0];
assert ddDocID != -1;
int baseDocID = baseIterator.docID();
if (baseDocID < ddDocID) {
baseDocID = baseIterator.advance(ddDocID);
}
if (baseDocID == ddDocID) {
//if (DEBUG) {
// System.out.println(" keep docID=" + ddDocID + " id=" + context.reader().document(ddDocID).get("id"));
//}
scores[slot0] = baseScorer.score();
filledSlots[filledCount++] = slot0;
counts[slot0]++;
} else {
//if (DEBUG) {
// System.out.println(" no docID=" + ddDocID + " id=" + context.reader().document(ddDocID).get("id"));
//}
docIDs[slot0] = -1;
// TODO: we could jump slot0 forward to the
// baseDocID ... but we'd need to set docIDs for
// intervening slots to -1
}
slot0++;
}
seen.clear(0, CHUNK);
if (filledCount == 0) {
if (nextChunkStart >= maxDoc) {
break;
}
nextChunkStart += CHUNK;
continue;
}
// TODO: factor this out & share w/ union scorer,
// except we start from dim=2 instead:
for (int dim=2;dim<numDims;dim++) {
//if (DEBUG) {
// System.out.println(" dim=" + dim + " [" + dims[dim].dim + "]");
//}
dc = dims[dim];
docID = dc.approximation.docID();
while (docID < nextChunkStart) {
int slot = docID & MASK;
if (docIDs[slot] == docID
&& counts[slot] >= dim
&& (dc.twoPhase == null || dc.twoPhase.matches())) {
// TODO: single-valued dims will always be true
// below; we could somehow specialize
if (missingDims[slot] >= dim) {
//if (DEBUG) {
// System.out.println(" set docID=" + docID + " count=" + (dim+2));
//}
missingDims[slot] = dim+1;
counts[slot] = dim+2;
} else {
//if (DEBUG) {
// System.out.println(" set docID=" + docID + " missing count=" + (dim+1));
//}
counts[slot] = dim+1;
}
}
// TODO: sometimes use advance?
docID = dc.approximation.nextDoc();
}
}
// Collect:
//if (DEBUG) {
// System.out.println(" now collect: " + filledCount + " hits");
//}
for (int i=0;i<filledCount;i++) {
int slot = filledSlots[i];
collectDocID = docIDs[slot];
collectScore = scores[slot];
//if (DEBUG) {
// System.out.println(" docID=" + docIDs[slot] + " count=" + counts[slot]);
//}
if (counts[slot] == 1+numDims) {
collectHit(collector, dims);
} else if (counts[slot] == numDims) {
collectNearMiss(dims[missingDims[slot]].sidewaysLeafCollector);
}
}
if (nextChunkStart >= maxDoc) {
break;
}
nextChunkStart += CHUNK;
}
}
private void doUnionScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims) throws IOException {
//if (DEBUG) {
// System.out.println(" doUnionScoring");
//}
setScorer(collector, new ScoreAndDoc());
final int maxDoc = context.reader().maxDoc();
final int numDims = dims.length;
// TODO: maybe a class like BS, instead of parallel arrays
int[] filledSlots = new int[CHUNK];
int[] docIDs = new int[CHUNK];
float[] scores = new float[CHUNK];
int[] missingDims = new int[CHUNK];
int[] counts = new int[CHUNK];
docIDs[0] = -1;
// NOTE: this is basically a specialized version of
// BooleanScorer, to the minShouldMatch=N-1 case, but
// carefully tracking which dimension failed to match
int nextChunkStart = CHUNK;
while (true) {
//if (DEBUG) {
// System.out.println("\ncycle nextChunkStart=" + nextChunkStart + " docIds[0]=" + docIDs[0]);
//}
int filledCount = 0;
int docID = baseIterator.docID();
//if (DEBUG) {
// System.out.println(" base docID=" + docID);
//}
while (docID < nextChunkStart) {
if (acceptDocs == null || acceptDocs.get(docID)) {
int slot = docID & MASK;
//if (DEBUG) {
// System.out.println(" docIDs[slot=" + slot + "]=" + docID + " id=" + context.reader().document(docID).get("id"));
//}
// Mark slot as valid:
assert docIDs[slot] != docID: "slot=" + slot + " docID=" + docID;
docIDs[slot] = docID;
scores[slot] = baseScorer.score();
filledSlots[filledCount++] = slot;
missingDims[slot] = 0;
counts[slot] = 1;
}
docID = baseIterator.nextDoc();
}
if (filledCount == 0) {
if (nextChunkStart >= maxDoc) {
break;
}
nextChunkStart += CHUNK;
continue;
}
// First drill-down dim, basically adds SHOULD onto
// the baseQuery:
//if (DEBUG) {
// System.out.println(" dim=0 [" + dims[0].dim + "]");
//}
{
DocsAndCost dc = dims[0];
docID = dc.approximation.docID();
//if (DEBUG) {
// System.out.println(" start docID=" + docID);
//}
while (docID < nextChunkStart) {
int slot = docID & MASK;
if (docIDs[slot] == docID // this also checks that the doc is not deleted
&& (dc.twoPhase == null || dc.twoPhase.matches())) {
//if (DEBUG) {
// System.out.println(" set docID=" + docID + " count=2");
//}
missingDims[slot] = 1;
counts[slot] = 2;
}
docID = dc.approximation.nextDoc();
}
}
for (int dim=1;dim<numDims;dim++) {
//if (DEBUG) {
// System.out.println(" dim=" + dim + " [" + dims[dim].dim + "]");
//}
DocsAndCost dc = dims[dim];
docID = dc.approximation.docID();
//if (DEBUG) {
// System.out.println(" start docID=" + docID);
//}
while (docID < nextChunkStart) {
int slot = docID & MASK;
if (docIDs[slot] == docID // also means that the doc is not deleted
&& counts[slot] >= dim
&& (dc.twoPhase == null || dc.twoPhase.matches())) {
// This doc is still in the running...
// TODO: single-valued dims will always be true
// below; we could somehow specialize
if (missingDims[slot] >= dim) {
//if (DEBUG) {
// System.out.println(" set docID=" + docID + " count=" + (dim+2));
//}
missingDims[slot] = dim+1;
counts[slot] = dim+2;
} else {
//if (DEBUG) {
// System.out.println(" set docID=" + docID + " missing count=" + (dim+1));
//}
counts[slot] = dim+1;
}
}
docID = dc.approximation.nextDoc();
}
}
// Collect:
//System.out.println(" now collect: " + filledCount + " hits");
for (int i=0;i<filledCount;i++) {
// NOTE: This is actually in-order collection,
// because we only accept docs originally returned by
// the baseScorer (ie that Scorer is AND'd)
int slot = filledSlots[i];
collectDocID = docIDs[slot];
collectScore = scores[slot];
//if (DEBUG) {
// System.out.println(" docID=" + docIDs[slot] + " count=" + counts[slot]);
//}
//System.out.println(" collect doc=" + collectDocID + " main.freq=" + (counts[slot]-1) + " main.doc=" + collectDocID + " exactCount=" + numDims);
if (counts[slot] == 1+numDims) {
//System.out.println(" hit");
collectHit(collector, dims);
} else if (counts[slot] == numDims) {
//System.out.println(" sw");
collectNearMiss(dims[missingDims[slot]].sidewaysLeafCollector);
}
}
if (nextChunkStart >= maxDoc) {
break;
}
nextChunkStart += CHUNK;
}
}
private void collectHit(LeafCollector collector, DocsAndCost[] dims) throws IOException {
//if (DEBUG) {
// System.out.println(" hit");
//}
collector.collect(collectDocID);
if (drillDownCollector != null) {
drillDownLeafCollector.collect(collectDocID);
}
// TODO: we could "fix" faceting of the sideways counts
// to do this "union" (of the drill down hits) in the
// end instead:
// Tally sideways counts:
for (DocsAndCost dim : dims) {
dim.sidewaysLeafCollector.collect(collectDocID);
}
}
private void collectNearMiss(LeafCollector sidewaysCollector) throws IOException {
//if (DEBUG) {
// System.out.println(" missingDim=" + dim);
//}
sidewaysCollector.collect(collectDocID);
}
private void setScorer(LeafCollector mainCollector, Scorable scorer) throws IOException {
mainCollector.setScorer(scorer);
if (drillDownLeafCollector != null) {
drillDownLeafCollector.setScorer(scorer);
}
for (DocsAndCost dim : dims) {
dim.sidewaysLeafCollector.setScorer(scorer);
}
}
private final class ScoreAndDoc extends Scorable {
@Override
public int docID() {
return collectDocID;
}
@Override
public float score() {
return collectScore;
}
@Override
public Collection<ChildScorable> getChildren() {
return Collections.singletonList(new ChildScorable(baseScorer, "MUST"));
}
}
static class DocsAndCost {
// approximation of matching docs, or the scorer itself
final DocIdSetIterator approximation;
// two-phase confirmation, or null if the approximation is accurate
final TwoPhaseIterator twoPhase;
final Collector sidewaysCollector;
LeafCollector sidewaysLeafCollector;
DocsAndCost(Scorer scorer, Collector sidewaysCollector) {
final TwoPhaseIterator twoPhase = scorer.twoPhaseIterator();
if (twoPhase == null) {
this.approximation = scorer.iterator();
this.twoPhase = null;
} else {
this.approximation = twoPhase.approximation();
this.twoPhase = twoPhase;
}
this.sidewaysCollector = sidewaysCollector;
}
}
}