blob: c470290388b7ee0d406c4239910d6bf31523cfcf [file] [log] [blame]
package org.apache.lucene.search;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.search.BooleanQuery.BooleanWeight;
/* Description from Doug Cutting (excerpted from
* LUCENE-1483):
*
* BooleanScorer uses an array to score windows of
* 2K docs. So it scores docs 0-2K first, then docs 2K-4K,
* etc. For each window it iterates through all query terms
* and accumulates a score in table[doc%2K]. It also stores
* in the table a bitmask representing which terms
* contributed to the score. Non-zero scores are chained in
* a linked list. At the end of scoring each window it then
* iterates through the linked list and, if the bitmask
* matches the boolean constraints, collects a hit. For
* boolean queries with lots of frequent terms this can be
* much faster, since it does not need to update a priority
* queue for each posting, instead performing constant-time
* operations per posting. The only downside is that it
* results in hits being delivered out-of-order within the
* window, which means it cannot be nested within other
* scorers. But it works well as a top-level scorer.
*
* The new BooleanScorer2 implementation instead works by
* merging priority queues of postings, albeit with some
* clever tricks. For example, a pure conjunction (all terms
* required) does not require a priority queue. Instead it
* sorts the posting streams at the start, then repeatedly
* skips the first to to the last. If the first ever equals
* the last, then there's a hit. When some terms are
* required and some terms are optional, the conjunction can
* be evaluated first, then the optional terms can all skip
* to the match and be added to the score. Thus the
* conjunction can reduce the number of priority queue
* updates for the optional terms. */
final class BooleanScorer extends Scorer {
private static final class BooleanScorerCollector extends Collector {
private BucketTable bucketTable;
private int mask;
private Scorer scorer;
public BooleanScorerCollector(int mask, BucketTable bucketTable) {
this.mask = mask;
this.bucketTable = bucketTable;
}
@Override
public void collect(final int doc) throws IOException {
final BucketTable table = bucketTable;
final int i = doc & BucketTable.MASK;
final Bucket bucket = table.buckets[i];
if (bucket.doc != doc) { // invalid bucket
bucket.doc = doc; // set doc
bucket.score = scorer.score(); // initialize score
bucket.bits = mask; // initialize mask
bucket.coord = 1; // initialize coord
bucket.next = table.first; // push onto valid list
table.first = bucket;
} else { // valid bucket
bucket.score += scorer.score(); // increment score
bucket.bits |= mask; // add bits in mask
bucket.coord++; // increment coord
}
}
@Override
public void setNextReader(AtomicReaderContext context) {
// not needed by this implementation
}
@Override
public void setScorer(Scorer scorer) {
this.scorer = scorer;
}
@Override
public boolean acceptsDocsOutOfOrder() {
return true;
}
}
// An internal class which is used in score(Collector, int) for setting the
// current score. This is required since Collector exposes a setScorer method
// and implementations that need the score will call scorer.score().
// Therefore the only methods that are implemented are score() and doc().
private static final class BucketScorer extends Scorer {
double score;
int doc = NO_MORE_DOCS;
int freq;
public BucketScorer(Weight weight) { super(weight); }
@Override
public int advance(int target) { return NO_MORE_DOCS; }
@Override
public int docID() { return doc; }
@Override
public int freq() { return freq; }
@Override
public int nextDoc() { return NO_MORE_DOCS; }
@Override
public float score() { return (float)score; }
@Override
public long cost() { return 1; }
}
static final class Bucket {
int doc = -1; // tells if bucket is valid
double score; // incremental score
// TODO: break out bool anyProhibited, int
// numRequiredMatched; then we can remove 32 limit on
// required clauses
int bits; // used for bool constraints
int coord; // count of terms in score
Bucket next; // next valid bucket
}
/** A simple hash table of document scores within a range. */
static final class BucketTable {
public static final int SIZE = 1 << 11;
public static final int MASK = SIZE - 1;
final Bucket[] buckets = new Bucket[SIZE];
Bucket first = null; // head of valid list
public BucketTable() {
// Pre-fill to save the lazy init when collecting
// each sub:
for(int idx=0;idx<SIZE;idx++) {
buckets[idx] = new Bucket();
}
}
public Collector newCollector(int mask) {
return new BooleanScorerCollector(mask, this);
}
public int size() { return SIZE; }
}
static final class SubScorer {
public Scorer scorer;
// TODO: re-enable this if BQ ever sends us required clauses
//public boolean required = false;
public boolean prohibited;
public Collector collector;
public SubScorer next;
public SubScorer(Scorer scorer, boolean required, boolean prohibited,
Collector collector, SubScorer next) {
if (required) {
throw new IllegalArgumentException("this scorer cannot handle required=true");
}
this.scorer = scorer;
// TODO: re-enable this if BQ ever sends us required clauses
//this.required = required;
this.prohibited = prohibited;
this.collector = collector;
this.next = next;
}
}
private SubScorer scorers = null;
private BucketTable bucketTable = new BucketTable();
private final float[] coordFactors;
// TODO: re-enable this if BQ ever sends us required clauses
//private int requiredMask = 0;
private final int minNrShouldMatch;
private int end;
private Bucket current;
// Any time a prohibited clause matches we set bit 0:
private static final int PROHIBITED_MASK = 1;
BooleanScorer(BooleanWeight weight, boolean disableCoord, int minNrShouldMatch,
List<Scorer> optionalScorers, List<Scorer> prohibitedScorers, int maxCoord) throws IOException {
super(weight);
this.minNrShouldMatch = minNrShouldMatch;
if (optionalScorers != null && optionalScorers.size() > 0) {
for (Scorer scorer : optionalScorers) {
if (scorer.nextDoc() != NO_MORE_DOCS) {
scorers = new SubScorer(scorer, false, false, bucketTable.newCollector(0), scorers);
}
}
}
if (prohibitedScorers != null && prohibitedScorers.size() > 0) {
for (Scorer scorer : prohibitedScorers) {
if (scorer.nextDoc() != NO_MORE_DOCS) {
scorers = new SubScorer(scorer, false, true, bucketTable.newCollector(PROHIBITED_MASK), scorers);
}
}
}
coordFactors = new float[optionalScorers.size() + 1];
for (int i = 0; i < coordFactors.length; i++) {
coordFactors[i] = disableCoord ? 1.0f : weight.coord(i, maxCoord);
}
}
// firstDocID is ignored since nextDoc() initializes 'current'
@Override
public boolean score(Collector collector, int max, int firstDocID) throws IOException {
// Make sure it's only BooleanScorer that calls us:
assert firstDocID == -1;
boolean more;
Bucket tmp;
BucketScorer bs = new BucketScorer(weight);
// The internal loop will set the score and doc before calling collect.
collector.setScorer(bs);
do {
bucketTable.first = null;
while (current != null) { // more queued
// check prohibited & required
if ((current.bits & PROHIBITED_MASK) == 0) {
// TODO: re-enable this if BQ ever sends us required
// clauses
//&& (current.bits & requiredMask) == requiredMask) {
// NOTE: Lucene always passes max =
// Integer.MAX_VALUE today, because we never embed
// a BooleanScorer inside another (even though
// that should work)... but in theory an outside
// app could pass a different max so we must check
// it:
if (current.doc >= max){
tmp = current;
current = current.next;
tmp.next = bucketTable.first;
bucketTable.first = tmp;
continue;
}
if (current.coord >= minNrShouldMatch) {
bs.score = current.score * coordFactors[current.coord];
bs.doc = current.doc;
bs.freq = current.coord;
collector.collect(current.doc);
}
}
current = current.next; // pop the queue
}
if (bucketTable.first != null){
current = bucketTable.first;
bucketTable.first = current.next;
return true;
}
// refill the queue
more = false;
end += BucketTable.SIZE;
for (SubScorer sub = scorers; sub != null; sub = sub.next) {
int subScorerDocID = sub.scorer.docID();
if (subScorerDocID != NO_MORE_DOCS) {
more |= sub.scorer.score(sub.collector, end, subScorerDocID);
}
}
current = bucketTable.first;
} while (current != null || more);
return false;
}
@Override
public int advance(int target) {
throw new UnsupportedOperationException();
}
@Override
public int docID() {
throw new UnsupportedOperationException();
}
@Override
public int nextDoc() {
throw new UnsupportedOperationException();
}
@Override
public float score() {
throw new UnsupportedOperationException();
}
@Override
public int freq() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public long cost() {
return Integer.MAX_VALUE;
}
@Override
public void score(Collector collector) throws IOException {
score(collector, Integer.MAX_VALUE, -1);
}
@Override
public String toString() {
StringBuilder buffer = new StringBuilder();
buffer.append("boolean(");
for (SubScorer sub = scorers; sub != null; sub = sub.next) {
buffer.append(sub.scorer.toString());
buffer.append(" ");
}
buffer.append(")");
return buffer.toString();
}
@Override
public Collection<ChildScorer> getChildren() {
throw new UnsupportedOperationException();
}
}