blob: d26abe16c216b792f03f70cda4031fd36f6b9822 [file] [log] [blame]
package org.apache.lucene.search;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
// Defaults derived from rough tests with a 20.0 million
// doc Wikipedia index. With more than 350 terms in the
// query, the filter method is fastest:
public static int DEFAULT_TERM_COUNT_CUTOFF = 350;
// If the query will hit more than 1 in 1000 of the docs
// in the index (0.1%), the filter method is fastest:
public static double DEFAULT_DOC_COUNT_PERCENT = 0.1;
private int termCountCutoff = DEFAULT_TERM_COUNT_CUTOFF;
private double docCountPercent = DEFAULT_DOC_COUNT_PERCENT;
/** If the number of terms in this query is equal to or
* larger than this setting then {@link
* MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE} is used. */
public void setTermCountCutoff(int count) {
termCountCutoff = count;
}
/** @see #setTermCountCutoff */
public int getTermCountCutoff() {
return termCountCutoff;
}
/** If the number of documents to be visited in the
* postings exceeds this specified percentage of the
* maxDoc() for the index, then {@link
* MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE} is used.
* @param percent 0.0 to 100.0 */
public void setDocCountPercent(double percent) {
docCountPercent = percent;
}
/** @see #setDocCountPercent */
public double getDocCountPercent() {
return docCountPercent;
}
@Override
protected BooleanQuery getTopLevelQuery() {
return new BooleanQuery(true);
}
@Override
protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost /*ignored*/, TermContext states) {
topLevel.add(new TermQuery(term, states), BooleanClause.Occur.SHOULD);
}
@Override
public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
// Get the enum and start visiting terms. If we
// exhaust the enum before hitting either of the
// cutoffs, we use ConstantBooleanQueryRewrite; else,
// ConstantFilterRewrite:
final int docCountCutoff = (int) ((docCountPercent / 100.) * reader.maxDoc());
final int termCountLimit = Math.min(BooleanQuery.getMaxClauseCount(), termCountCutoff);
final CutOffTermCollector col = new CutOffTermCollector(docCountCutoff, termCountLimit);
collectTerms(reader, query, col);
final int size = col.pendingTerms.size();
if (col.hasCutOff) {
return MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE.rewrite(reader, query);
} else {
final BooleanQuery bq = getTopLevelQuery();
if (size > 0) {
final BytesRefHash pendingTerms = col.pendingTerms;
final int sort[] = pendingTerms.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
for(int i = 0; i < size; i++) {
final int pos = sort[i];
// docFreq is not used for constant score here, we pass 1
// to explicitely set a fake value, so it's not calculated
addClause(bq, new Term(query.field, pendingTerms.get(pos, new BytesRef())), 1, 1.0f, col.array.termState[pos]);
}
}
// Strip scores
final Query result = new ConstantScoreQuery(bq);
result.setBoost(query.getBoost());
return result;
}
}
static final class CutOffTermCollector extends TermCollector {
CutOffTermCollector(int docCountCutoff, int termCountLimit) {
this.docCountCutoff = docCountCutoff;
this.termCountLimit = termCountLimit;
}
@Override
public void setNextEnum(TermsEnum termsEnum) {
this.termsEnum = termsEnum;
}
@Override
public boolean collect(BytesRef bytes) throws IOException {
int pos = pendingTerms.add(bytes);
docVisitCount += termsEnum.docFreq();
if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) {
hasCutOff = true;
return false;
}
final TermState termState = termsEnum.termState();
assert termState != null;
if (pos < 0) {
pos = (-pos)-1;
array.termState[pos].register(termState, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
} else {
array.termState[pos] = new TermContext(topReaderContext, termState, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
}
return true;
}
int docVisitCount = 0;
boolean hasCutOff = false;
TermsEnum termsEnum;
final int docCountCutoff, termCountLimit;
final TermStateByteStart array = new TermStateByteStart(16);
final BytesRefHash pendingTerms = new BytesRefHash(new ByteBlockPool(new ByteBlockPool.DirectAllocator()), 16, array);
}
@Override
public int hashCode() {
final int prime = 1279;
return (int) (prime * termCountCutoff + Double.doubleToLongBits(docCountPercent));
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (getClass() != obj.getClass())
return false;
ConstantScoreAutoRewrite other = (ConstantScoreAutoRewrite) obj;
if (other.termCountCutoff != termCountCutoff) {
return false;
}
if (Double.doubleToLongBits(other.docCountPercent) != Double.doubleToLongBits(docCountPercent)) {
return false;
}
return true;
}
/** Special implementation of BytesStartArray that keeps parallel arrays for {@link TermContext} */
static final class TermStateByteStart extends DirectBytesStartArray {
TermContext[] termState;
public TermStateByteStart(int initSize) {
super(initSize);
}
@Override
public int[] init() {
final int[] ord = super.init();
termState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
assert termState.length >= ord.length;
return ord;
}
@Override
public int[] grow() {
final int[] ord = super.grow();
if (termState.length < ord.length) {
TermContext[] tmpTermState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
System.arraycopy(termState, 0, tmpTermState, 0, termState.length);
termState = tmpTermState;
}
assert termState.length >= ord.length;
return ord;
}
@Override
public int[] clear() {
termState = null;
return super.clear();
}
}
}