lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.lucene.search.comparators;

 import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.index.PointValues;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.FieldComparator;
 import org.apache.lucene.search.LeafFieldComparator;
 import org.apache.lucene.search.Scorable;
 import org.apache.lucene.search.Scorer;
 import org.apache.lucene.util.DocIdSetBuilder;
 import org.apache.lucene.util.FutureArrays;

 import java.io.IOException;

 /**
  * Abstract numeric comparator for comparing numeric values.
  * This comparator provides a skipping functionality – an iterator that can skip over non-competitive documents.
  *
  * <p>Parameter {@code field} provided in the constructor is used as a field name in the default
  * implementations of the methods {@code getNumericDocValues} and {@code getPointValues} to retrieve
  * doc values and points. You can pass a dummy value for a field name (e.g. when sorting by script),
  * but in this case you must override both of these methods.
  */
 public abstract class NumericComparator<T extends Number> extends FieldComparator<T> {
   protected final T missingValue;
   protected final String field;
   protected final boolean reverse;
   private final int bytesCount; // how many bytes are used to encode this number

   protected boolean topValueSet;
   protected boolean singleSort; // singleSort is true, if sort is based on a single sort field.
   protected boolean hitsThresholdReached;
   protected boolean queueFull;
   private boolean canSkipDocuments;

   protected NumericComparator(String field, T missingValue, boolean reverse, int sortPos, int bytesCount) {
     this.field = field;
     this.missingValue = missingValue;
     this.reverse = reverse;
     this.canSkipDocuments = (sortPos == 0); // skipping functionality is only relevant for primary sort
     this.bytesCount = bytesCount;
   }

   @Override
   public void disableSkipping() {
     canSkipDocuments = false;
   }

   @Override
   public void setTopValue(T value) {
     topValueSet = true;
   }

   @Override
   public void setSingleSort() {
     singleSort = true;
   }

   /**
    * Leaf comparator for {@link NumericComparator} that provides skipping functionality
    */
   public abstract class NumericLeafComparator implements LeafFieldComparator {
     protected final NumericDocValues docValues;
     private final PointValues pointValues;
     private final boolean enableSkipping; // if skipping functionality should be enabled on this segment
     private final int maxDoc;
     private final byte[] minValueAsBytes;
     private final byte[] maxValueAsBytes;

     private DocIdSetIterator competitiveIterator;
     private long iteratorCost;
     private int maxDocVisited = -1;
     private int updateCounter = 0;

     public NumericLeafComparator(LeafReaderContext context) throws IOException {
       this.docValues = getNumericDocValues(context, field);
       this.pointValues = canSkipDocuments ? getPointValues(context, field) : null;
       if (pointValues != null) {
         FieldInfo info = context.reader().getFieldInfos().fieldInfo(field);
         if (info == null || info.getPointDimensionCount() == 0) {
           throw new IllegalStateException(
               "Field "
                   + field
                   + " doesn't index points according to FieldInfos yet returns non-null PointValues");
         } else if (info.getPointDimensionCount() > 1) {
           throw new IllegalArgumentException(
               "Field " + field + " is indexed with multiple dimensions, sorting is not supported");
         } else if (info.getPointNumBytes() != bytesCount) {
           throw new IllegalArgumentException(
               "Field "
                   + field
                   + " is indexed with "
                   + info.getPointNumBytes()
                   + " bytes per dimension, but "
                   + NumericComparator.this
                   + " expected "
                   + bytesCount);
         }
         this.enableSkipping = true; // skipping is enabled when points are available
         this.maxDoc = context.reader().maxDoc();
         this.maxValueAsBytes = reverse == false ? new byte[bytesCount] : topValueSet ? new byte[bytesCount] : null;
         this.minValueAsBytes = reverse ? new byte[bytesCount] : topValueSet ? new byte[bytesCount] : null;
         this.competitiveIterator = DocIdSetIterator.all(maxDoc);
         this.iteratorCost = maxDoc;
       } else {
         this.enableSkipping = false;
         this.maxDoc = 0;
         this.maxValueAsBytes = null;
         this.minValueAsBytes = null;
       }
     }

     /**
      * Retrieves the NumericDocValues for the field in this segment
      *
      * <p>If you override this method, you must also override {@link
      * #getPointValues(LeafReaderContext, String)} This class uses sort optimization that leverages
      * points to filter out non-competitive matches, which relies on the assumption that points and
      * doc values record the same information.
      *
      * @param context – reader context
      * @param field - field name
      * @return numeric doc values for the field in this segment.
      * @throws IOException If there is a low-level I/O error
      */
     protected NumericDocValues getNumericDocValues(LeafReaderContext context, String field) throws IOException {
       return DocValues.getNumeric(context.reader(), field);
     }

     /**
      * Retrieves point values for the field in this segment
      *
      * <p>If you override this method, you must also override {@link
      * #getNumericDocValues(LeafReaderContext, String)} This class uses sort optimization that
      * leverages points to filter out non-competitive matches, which relies on the assumption that
      * points and doc values record the same information. Return {@code null} even if no points
      * implementation is available, in this case sort optimization with points will be disabled.
      *
      * @param context – reader context
      * @param field - field name
      * @return point values for the field in this segment if they are available or {@code null} if
      *     sort optimization with points should be disabled.
      * @throws IOException If there is a low-level I/O error
      */
     protected PointValues getPointValues(LeafReaderContext context, String field)
         throws IOException {
       return context.reader().getPointValues(field);
     }

     @Override
     public void setBottom(int slot) throws IOException {
       queueFull = true; // if we are setting bottom, it means that we have collected enough hits
       updateCompetitiveIterator(); // update an iterator if we set a new bottom
     }

     @Override
     public void copy(int slot, int doc) throws IOException {
       maxDocVisited = doc;
     }

     @Override
     public void setScorer(Scorable scorer) throws IOException {
       if (scorer instanceof Scorer) {
         iteratorCost = ((Scorer) scorer).iterator().cost(); // starting iterator cost is the scorer's cost
         updateCompetitiveIterator(); // update an iterator when we have a new segment
       }
     }

     @Override
     public void setHitsThresholdReached() throws IOException {
       hitsThresholdReached = true;
       updateCompetitiveIterator();
     }

     // update its iterator to include possibly only docs that are "stronger" than the current bottom entry
     private void updateCompetitiveIterator() throws IOException {
       if (enableSkipping == false || hitsThresholdReached == false || queueFull == false) return;
       // if some documents have missing points, check that missing values prohibits optimization
       if ((pointValues.getDocCount() < maxDoc) && isMissingValueCompetitive()) {
         return; // we can't filter out documents, as documents with missing values are competitive
       }

       updateCounter++;
       if (updateCounter > 256 && (updateCounter & 0x1f) != 0x1f) { // Start sampling if we get called too much
         return;
       }
       if (reverse == false) {
         encodeBottom(maxValueAsBytes);
         if (topValueSet) {
           encodeTop(minValueAsBytes);
         }
       } else {
         encodeBottom(minValueAsBytes);
         if (topValueSet) {
           encodeTop(maxValueAsBytes);
         }
       }

       DocIdSetBuilder result = new DocIdSetBuilder(maxDoc);
       PointValues.IntersectVisitor visitor = new PointValues.IntersectVisitor() {
         DocIdSetBuilder.BulkAdder adder;

         @Override
         public void grow(int count) {
           adder = result.grow(count);
         }

         @Override
         public void visit(int docID) {
           if (docID <= maxDocVisited) {
             return; // Already visited or skipped
           }
           adder.add(docID);
         }

         @Override
         public void visit(int docID, byte[] packedValue) {
           if (docID <= maxDocVisited) {
             return;  // already visited or skipped
           }
           if (maxValueAsBytes != null) {
             int cmp = FutureArrays.compareUnsigned(packedValue, 0, bytesCount, maxValueAsBytes, 0, bytesCount);
             // if doc's value is too high or for single sort even equal, it is not competitive and the doc can be skipped
             if (cmp > 0 || (singleSort && cmp == 0)) return;
           }
           if (minValueAsBytes != null) {
             int cmp = FutureArrays.compareUnsigned(packedValue, 0, bytesCount, minValueAsBytes, 0, bytesCount);
             // if doc's value is too low or for single sort even equal, it is not competitive and the doc can be skipped
             if (cmp < 0 || (singleSort && cmp == 0)) return;
           }
           adder.add(docID); // doc is competitive
         }

         @Override
         public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
           if (maxValueAsBytes != null) {
             int cmp = FutureArrays.compareUnsigned(minPackedValue, 0, bytesCount, maxValueAsBytes, 0, bytesCount);
             if (cmp > 0 || (singleSort && cmp == 0)) return PointValues.Relation.CELL_OUTSIDE_QUERY;
           }
           if (minValueAsBytes != null) {
             int cmp = FutureArrays.compareUnsigned(maxPackedValue, 0, bytesCount, minValueAsBytes, 0, bytesCount);
             if (cmp < 0 || (singleSort && cmp == 0)) return PointValues.Relation.CELL_OUTSIDE_QUERY;
           }
           if ((maxValueAsBytes != null &&
               FutureArrays.compareUnsigned(maxPackedValue, 0, bytesCount, maxValueAsBytes, 0, bytesCount) > 0) ||
               (minValueAsBytes != null &&
                   FutureArrays.compareUnsigned(minPackedValue, 0, bytesCount, minValueAsBytes, 0, bytesCount) < 0)) {
             return PointValues.Relation.CELL_CROSSES_QUERY;
           }
           return PointValues.Relation.CELL_INSIDE_QUERY;
         }
       };
       final long threshold = iteratorCost >>> 3;
       long estimatedNumberOfMatches = pointValues.estimatePointCount(visitor); // runs in O(log(numPoints))
       if (estimatedNumberOfMatches >= threshold) {
         // the new range is not selective enough to be worth materializing, it doesn't reduce number of docs at least 8x
         return;
       }
       pointValues.intersect(visitor);
       competitiveIterator = result.build().iterator();
       iteratorCost = competitiveIterator.cost();
     }

     @Override
     public DocIdSetIterator competitiveIterator() {
       if (enableSkipping == false) return null;
       return new DocIdSetIterator() {
         private int docID = competitiveIterator.docID();

         @Override
         public int nextDoc() throws IOException {
           return advance(docID + 1);
         }

         @Override
         public int docID() {
           return docID;
         }

         @Override
         public long cost() {
           return competitiveIterator.cost();
         }

         @Override
         public int advance(int target) throws IOException {
           return docID = competitiveIterator.advance(target);
         }
       };
     }

     protected abstract boolean isMissingValueCompetitive();

     protected abstract void encodeBottom(byte[] packedValue);

     protected abstract void encodeTop(byte[] packedValue);
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.lucene.search.comparators;

	import org.apache.lucene.index.DocValues;
	import org.apache.lucene.index.FieldInfo;
	import org.apache.lucene.index.LeafReaderContext;
	import org.apache.lucene.index.NumericDocValues;
	import org.apache.lucene.index.PointValues;
	import org.apache.lucene.search.DocIdSetIterator;
	import org.apache.lucene.search.FieldComparator;
	import org.apache.lucene.search.LeafFieldComparator;
	import org.apache.lucene.search.Scorable;
	import org.apache.lucene.search.Scorer;
	import org.apache.lucene.util.DocIdSetBuilder;
	import org.apache.lucene.util.FutureArrays;

	import java.io.IOException;

	/**
	* Abstract numeric comparator for comparing numeric values.
	* This comparator provides a skipping functionality – an iterator that can skip over non-competitive documents.
	*
	* <p>Parameter {@code field} provided in the constructor is used as a field name in the default
	* implementations of the methods {@code getNumericDocValues} and {@code getPointValues} to retrieve
	* doc values and points. You can pass a dummy value for a field name (e.g. when sorting by script),
	* but in this case you must override both of these methods.
	*/
	public abstract class NumericComparator<T extends Number> extends FieldComparator<T> {
	protected final T missingValue;
	protected final String field;
	protected final boolean reverse;
	private final int bytesCount; // how many bytes are used to encode this number

	protected boolean topValueSet;
	protected boolean singleSort; // singleSort is true, if sort is based on a single sort field.
	protected boolean hitsThresholdReached;
	protected boolean queueFull;
	private boolean canSkipDocuments;

	protected NumericComparator(String field, T missingValue, boolean reverse, int sortPos, int bytesCount) {
	this.field = field;
	this.missingValue = missingValue;
	this.reverse = reverse;
	this.canSkipDocuments = (sortPos == 0); // skipping functionality is only relevant for primary sort
	this.bytesCount = bytesCount;
	}

	@Override
	public void disableSkipping() {
	canSkipDocuments = false;
	}

	@Override
	public void setTopValue(T value) {
	topValueSet = true;
	}

	@Override
	public void setSingleSort() {
	singleSort = true;
	}

	/**
	* Leaf comparator for {@link NumericComparator} that provides skipping functionality
	*/
	public abstract class NumericLeafComparator implements LeafFieldComparator {
	protected final NumericDocValues docValues;
	private final PointValues pointValues;
	private final boolean enableSkipping; // if skipping functionality should be enabled on this segment
	private final int maxDoc;
	private final byte[] minValueAsBytes;
	private final byte[] maxValueAsBytes;

	private DocIdSetIterator competitiveIterator;
	private long iteratorCost;
	private int maxDocVisited = -1;
	private int updateCounter = 0;

	public NumericLeafComparator(LeafReaderContext context) throws IOException {
	this.docValues = getNumericDocValues(context, field);
	this.pointValues = canSkipDocuments ? getPointValues(context, field) : null;
	if (pointValues != null) {
	FieldInfo info = context.reader().getFieldInfos().fieldInfo(field);
	if (info == null \|\| info.getPointDimensionCount() == 0) {
	throw new IllegalStateException(
	"Field "
	+ field
	+ " doesn't index points according to FieldInfos yet returns non-null PointValues");
	} else if (info.getPointDimensionCount() > 1) {
	throw new IllegalArgumentException(
	"Field " + field + " is indexed with multiple dimensions, sorting is not supported");
	} else if (info.getPointNumBytes() != bytesCount) {
	throw new IllegalArgumentException(
	"Field "
	+ field
	+ " is indexed with "
	+ info.getPointNumBytes()
	+ " bytes per dimension, but "
	+ NumericComparator.this
	+ " expected "
	+ bytesCount);
	}
	this.enableSkipping = true; // skipping is enabled when points are available
	this.maxDoc = context.reader().maxDoc();
	this.maxValueAsBytes = reverse == false ? new byte[bytesCount] : topValueSet ? new byte[bytesCount] : null;
	this.minValueAsBytes = reverse ? new byte[bytesCount] : topValueSet ? new byte[bytesCount] : null;
	this.competitiveIterator = DocIdSetIterator.all(maxDoc);
	this.iteratorCost = maxDoc;
	} else {
	this.enableSkipping = false;
	this.maxDoc = 0;
	this.maxValueAsBytes = null;
	this.minValueAsBytes = null;
	}
	}

	/**
	* Retrieves the NumericDocValues for the field in this segment
	*
	* <p>If you override this method, you must also override {@link
	* #getPointValues(LeafReaderContext, String)} This class uses sort optimization that leverages
	* points to filter out non-competitive matches, which relies on the assumption that points and
	* doc values record the same information.
	*
	* @param context – reader context
	* @param field - field name
	* @return numeric doc values for the field in this segment.
	* @throws IOException If there is a low-level I/O error
	*/
	protected NumericDocValues getNumericDocValues(LeafReaderContext context, String field) throws IOException {
	return DocValues.getNumeric(context.reader(), field);
	}

	/**
	* Retrieves point values for the field in this segment
	*
	* <p>If you override this method, you must also override {@link
	* #getNumericDocValues(LeafReaderContext, String)} This class uses sort optimization that
	* leverages points to filter out non-competitive matches, which relies on the assumption that
	* points and doc values record the same information. Return {@code null} even if no points
	* implementation is available, in this case sort optimization with points will be disabled.
	*
	* @param context – reader context
	* @param field - field name
	* @return point values for the field in this segment if they are available or {@code null} if
	* sort optimization with points should be disabled.
	* @throws IOException If there is a low-level I/O error
	*/
	protected PointValues getPointValues(LeafReaderContext context, String field)
	throws IOException {
	return context.reader().getPointValues(field);
	}

	@Override
	public void setBottom(int slot) throws IOException {
	queueFull = true; // if we are setting bottom, it means that we have collected enough hits
	updateCompetitiveIterator(); // update an iterator if we set a new bottom
	}

	@Override
	public void copy(int slot, int doc) throws IOException {
	maxDocVisited = doc;
	}

	@Override
	public void setScorer(Scorable scorer) throws IOException {
	if (scorer instanceof Scorer) {
	iteratorCost = ((Scorer) scorer).iterator().cost(); // starting iterator cost is the scorer's cost
	updateCompetitiveIterator(); // update an iterator when we have a new segment
	}
	}

	@Override
	public void setHitsThresholdReached() throws IOException {
	hitsThresholdReached = true;
	updateCompetitiveIterator();
	}

	// update its iterator to include possibly only docs that are "stronger" than the current bottom entry
	private void updateCompetitiveIterator() throws IOException {
	if (enableSkipping == false \|\| hitsThresholdReached == false \|\| queueFull == false) return;
	// if some documents have missing points, check that missing values prohibits optimization
	if ((pointValues.getDocCount() < maxDoc) && isMissingValueCompetitive()) {
	return; // we can't filter out documents, as documents with missing values are competitive
	}

	updateCounter++;
	if (updateCounter > 256 && (updateCounter & 0x1f) != 0x1f) { // Start sampling if we get called too much
	return;
	}
	if (reverse == false) {
	encodeBottom(maxValueAsBytes);
	if (topValueSet) {
	encodeTop(minValueAsBytes);
	}
	} else {
	encodeBottom(minValueAsBytes);
	if (topValueSet) {
	encodeTop(maxValueAsBytes);
	}
	}

	DocIdSetBuilder result = new DocIdSetBuilder(maxDoc);
	PointValues.IntersectVisitor visitor = new PointValues.IntersectVisitor() {
	DocIdSetBuilder.BulkAdder adder;

	@Override
	public void grow(int count) {
	adder = result.grow(count);
	}

	@Override
	public void visit(int docID) {
	if (docID <= maxDocVisited) {
	return; // Already visited or skipped
	}
	adder.add(docID);
	}

	@Override
	public void visit(int docID, byte[] packedValue) {
	if (docID <= maxDocVisited) {
	return; // already visited or skipped
	}
	if (maxValueAsBytes != null) {
	int cmp = FutureArrays.compareUnsigned(packedValue, 0, bytesCount, maxValueAsBytes, 0, bytesCount);
	// if doc's value is too high or for single sort even equal, it is not competitive and the doc can be skipped
	if (cmp > 0 \|\| (singleSort && cmp == 0)) return;
	}
	if (minValueAsBytes != null) {
	int cmp = FutureArrays.compareUnsigned(packedValue, 0, bytesCount, minValueAsBytes, 0, bytesCount);
	// if doc's value is too low or for single sort even equal, it is not competitive and the doc can be skipped
	if (cmp < 0 \|\| (singleSort && cmp == 0)) return;
	}
	adder.add(docID); // doc is competitive
	}

	@Override
	public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
	if (maxValueAsBytes != null) {
	int cmp = FutureArrays.compareUnsigned(minPackedValue, 0, bytesCount, maxValueAsBytes, 0, bytesCount);
	if (cmp > 0 \|\| (singleSort && cmp == 0)) return PointValues.Relation.CELL_OUTSIDE_QUERY;
	}
	if (minValueAsBytes != null) {
	int cmp = FutureArrays.compareUnsigned(maxPackedValue, 0, bytesCount, minValueAsBytes, 0, bytesCount);
	if (cmp < 0 \|\| (singleSort && cmp == 0)) return PointValues.Relation.CELL_OUTSIDE_QUERY;
	}
	if ((maxValueAsBytes != null &&
	FutureArrays.compareUnsigned(maxPackedValue, 0, bytesCount, maxValueAsBytes, 0, bytesCount) > 0) \|\|
	(minValueAsBytes != null &&
	FutureArrays.compareUnsigned(minPackedValue, 0, bytesCount, minValueAsBytes, 0, bytesCount) < 0)) {
	return PointValues.Relation.CELL_CROSSES_QUERY;
	}
	return PointValues.Relation.CELL_INSIDE_QUERY;
	}
	};
	final long threshold = iteratorCost >>> 3;
	long estimatedNumberOfMatches = pointValues.estimatePointCount(visitor); // runs in O(log(numPoints))
	if (estimatedNumberOfMatches >= threshold) {
	// the new range is not selective enough to be worth materializing, it doesn't reduce number of docs at least 8x
	return;
	}
	pointValues.intersect(visitor);
	competitiveIterator = result.build().iterator();
	iteratorCost = competitiveIterator.cost();
	}

	@Override
	public DocIdSetIterator competitiveIterator() {
	if (enableSkipping == false) return null;
	return new DocIdSetIterator() {
	private int docID = competitiveIterator.docID();

	@Override
	public int nextDoc() throws IOException {
	return advance(docID + 1);
	}

	@Override
	public int docID() {
	return docID;
	}

	@Override
	public long cost() {
	return competitiveIterator.cost();
	}

	@Override
	public int advance(int target) throws IOException {
	return docID = competitiveIterator.advance(target);
	}
	};
	}

	protected abstract boolean isMissingValueCompetitive();

	protected abstract void encodeBottom(byte[] packedValue);

	protected abstract void encodeTop(byte[] packedValue);
	}
	}