blob: 86aff36b0ba52317c223fdc77a2836860d98e617 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.comparators;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldComparator;
import org.apache.lucene.search.LeafFieldComparator;
import org.apache.lucene.search.Scorable;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.util.DocIdSetBuilder;
import org.apache.lucene.util.FutureArrays;
import java.io.IOException;
/**
* Abstract numeric comparator for comparing numeric values.
* This comparator provides a skipping functionality – an iterator that can skip over non-competitive documents.
*
* <p>Parameter {@code field} provided in the constructor is used as a field name in the default
* implementations of the methods {@code getNumericDocValues} and {@code getPointValues} to retrieve
* doc values and points. You can pass a dummy value for a field name (e.g. when sorting by script),
* but in this case you must override both of these methods.
*/
public abstract class NumericComparator<T extends Number> extends FieldComparator<T> {
protected final T missingValue;
protected final String field;
protected final boolean reverse;
private final int bytesCount; // how many bytes are used to encode this number
protected boolean topValueSet;
protected boolean singleSort; // singleSort is true, if sort is based on a single sort field.
protected boolean hitsThresholdReached;
protected boolean queueFull;
private boolean canSkipDocuments;
protected NumericComparator(String field, T missingValue, boolean reverse, int sortPos, int bytesCount) {
this.field = field;
this.missingValue = missingValue;
this.reverse = reverse;
this.canSkipDocuments = (sortPos == 0); // skipping functionality is only relevant for primary sort
this.bytesCount = bytesCount;
}
@Override
public void disableSkipping() {
canSkipDocuments = false;
}
@Override
public void setTopValue(T value) {
topValueSet = true;
}
@Override
public void setSingleSort() {
singleSort = true;
}
/**
* Leaf comparator for {@link NumericComparator} that provides skipping functionality
*/
public abstract class NumericLeafComparator implements LeafFieldComparator {
protected final NumericDocValues docValues;
private final PointValues pointValues;
private final boolean enableSkipping; // if skipping functionality should be enabled on this segment
private final int maxDoc;
private final byte[] minValueAsBytes;
private final byte[] maxValueAsBytes;
private DocIdSetIterator competitiveIterator;
private long iteratorCost;
private int maxDocVisited = -1;
private int updateCounter = 0;
public NumericLeafComparator(LeafReaderContext context) throws IOException {
this.docValues = getNumericDocValues(context, field);
this.pointValues = canSkipDocuments ? getPointValues(context, field) : null;
if (pointValues != null) {
FieldInfo info = context.reader().getFieldInfos().fieldInfo(field);
if (info == null || info.getPointDimensionCount() == 0) {
throw new IllegalStateException(
"Field "
+ field
+ " doesn't index points according to FieldInfos yet returns non-null PointValues");
} else if (info.getPointDimensionCount() > 1) {
throw new IllegalArgumentException(
"Field " + field + " is indexed with multiple dimensions, sorting is not supported");
} else if (info.getPointNumBytes() != bytesCount) {
throw new IllegalArgumentException(
"Field "
+ field
+ " is indexed with "
+ info.getPointNumBytes()
+ " bytes per dimension, but "
+ NumericComparator.this
+ " expected "
+ bytesCount);
}
this.enableSkipping = true; // skipping is enabled when points are available
this.maxDoc = context.reader().maxDoc();
this.maxValueAsBytes = reverse == false ? new byte[bytesCount] : topValueSet ? new byte[bytesCount] : null;
this.minValueAsBytes = reverse ? new byte[bytesCount] : topValueSet ? new byte[bytesCount] : null;
this.competitiveIterator = DocIdSetIterator.all(maxDoc);
this.iteratorCost = maxDoc;
} else {
this.enableSkipping = false;
this.maxDoc = 0;
this.maxValueAsBytes = null;
this.minValueAsBytes = null;
}
}
/**
* Retrieves the NumericDocValues for the field in this segment
*
* <p>If you override this method, you must also override {@link
* #getPointValues(LeafReaderContext, String)} This class uses sort optimization that leverages
* points to filter out non-competitive matches, which relies on the assumption that points and
* doc values record the same information.
*
* @param context – reader context
* @param field - field name
* @return numeric doc values for the field in this segment.
* @throws IOException If there is a low-level I/O error
*/
protected NumericDocValues getNumericDocValues(LeafReaderContext context, String field) throws IOException {
return DocValues.getNumeric(context.reader(), field);
}
/**
* Retrieves point values for the field in this segment
*
* <p>If you override this method, you must also override {@link
* #getNumericDocValues(LeafReaderContext, String)} This class uses sort optimization that
* leverages points to filter out non-competitive matches, which relies on the assumption that
* points and doc values record the same information. Return {@code null} even if no points
* implementation is available, in this case sort optimization with points will be disabled.
*
* @param context – reader context
* @param field - field name
* @return point values for the field in this segment if they are available or {@code null} if
* sort optimization with points should be disabled.
* @throws IOException If there is a low-level I/O error
*/
protected PointValues getPointValues(LeafReaderContext context, String field)
throws IOException {
return context.reader().getPointValues(field);
}
@Override
public void setBottom(int slot) throws IOException {
queueFull = true; // if we are setting bottom, it means that we have collected enough hits
updateCompetitiveIterator(); // update an iterator if we set a new bottom
}
@Override
public void copy(int slot, int doc) throws IOException {
maxDocVisited = doc;
}
@Override
public void setScorer(Scorable scorer) throws IOException {
if (scorer instanceof Scorer) {
iteratorCost = ((Scorer) scorer).iterator().cost(); // starting iterator cost is the scorer's cost
updateCompetitiveIterator(); // update an iterator when we have a new segment
}
}
@Override
public void setHitsThresholdReached() throws IOException {
hitsThresholdReached = true;
updateCompetitiveIterator();
}
// update its iterator to include possibly only docs that are "stronger" than the current bottom entry
private void updateCompetitiveIterator() throws IOException {
if (enableSkipping == false || hitsThresholdReached == false || queueFull == false) return;
// if some documents have missing points, check that missing values prohibits optimization
if ((pointValues.getDocCount() < maxDoc) && isMissingValueCompetitive()) {
return; // we can't filter out documents, as documents with missing values are competitive
}
updateCounter++;
if (updateCounter > 256 && (updateCounter & 0x1f) != 0x1f) { // Start sampling if we get called too much
return;
}
if (reverse == false) {
encodeBottom(maxValueAsBytes);
if (topValueSet) {
encodeTop(minValueAsBytes);
}
} else {
encodeBottom(minValueAsBytes);
if (topValueSet) {
encodeTop(maxValueAsBytes);
}
}
DocIdSetBuilder result = new DocIdSetBuilder(maxDoc);
PointValues.IntersectVisitor visitor = new PointValues.IntersectVisitor() {
DocIdSetBuilder.BulkAdder adder;
@Override
public void grow(int count) {
adder = result.grow(count);
}
@Override
public void visit(int docID) {
if (docID <= maxDocVisited) {
return; // Already visited or skipped
}
adder.add(docID);
}
@Override
public void visit(int docID, byte[] packedValue) {
if (docID <= maxDocVisited) {
return; // already visited or skipped
}
if (maxValueAsBytes != null) {
int cmp = FutureArrays.compareUnsigned(packedValue, 0, bytesCount, maxValueAsBytes, 0, bytesCount);
// if doc's value is too high or for single sort even equal, it is not competitive and the doc can be skipped
if (cmp > 0 || (singleSort && cmp == 0)) return;
}
if (minValueAsBytes != null) {
int cmp = FutureArrays.compareUnsigned(packedValue, 0, bytesCount, minValueAsBytes, 0, bytesCount);
// if doc's value is too low or for single sort even equal, it is not competitive and the doc can be skipped
if (cmp < 0 || (singleSort && cmp == 0)) return;
}
adder.add(docID); // doc is competitive
}
@Override
public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
if (maxValueAsBytes != null) {
int cmp = FutureArrays.compareUnsigned(minPackedValue, 0, bytesCount, maxValueAsBytes, 0, bytesCount);
if (cmp > 0 || (singleSort && cmp == 0)) return PointValues.Relation.CELL_OUTSIDE_QUERY;
}
if (minValueAsBytes != null) {
int cmp = FutureArrays.compareUnsigned(maxPackedValue, 0, bytesCount, minValueAsBytes, 0, bytesCount);
if (cmp < 0 || (singleSort && cmp == 0)) return PointValues.Relation.CELL_OUTSIDE_QUERY;
}
if ((maxValueAsBytes != null &&
FutureArrays.compareUnsigned(maxPackedValue, 0, bytesCount, maxValueAsBytes, 0, bytesCount) > 0) ||
(minValueAsBytes != null &&
FutureArrays.compareUnsigned(minPackedValue, 0, bytesCount, minValueAsBytes, 0, bytesCount) < 0)) {
return PointValues.Relation.CELL_CROSSES_QUERY;
}
return PointValues.Relation.CELL_INSIDE_QUERY;
}
};
final long threshold = iteratorCost >>> 3;
long estimatedNumberOfMatches = pointValues.estimatePointCount(visitor); // runs in O(log(numPoints))
if (estimatedNumberOfMatches >= threshold) {
// the new range is not selective enough to be worth materializing, it doesn't reduce number of docs at least 8x
return;
}
pointValues.intersect(visitor);
competitiveIterator = result.build().iterator();
iteratorCost = competitiveIterator.cost();
}
@Override
public DocIdSetIterator competitiveIterator() {
if (enableSkipping == false) return null;
return new DocIdSetIterator() {
private int docID = competitiveIterator.docID();
@Override
public int nextDoc() throws IOException {
return advance(docID + 1);
}
@Override
public int docID() {
return docID;
}
@Override
public long cost() {
return competitiveIterator.cost();
}
@Override
public int advance(int target) throws IOException {
return docID = competitiveIterator.advance(target);
}
};
}
protected abstract boolean isMissingValueCompetitive();
protected abstract void encodeBottom(byte[] packedValue);
protected abstract void encodeTop(byte[] packedValue);
}
}