blob: ad78edf9b8002ffb5f4e9b3400aa9d8a505e45eb [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.IOException;
import java.math.BigInteger;
import java.net.InetAddress;
import java.util.Arrays;
import org.apache.lucene.document.BinaryPoint;
import org.apache.lucene.document.DoublePoint;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FloatPoint;
import org.apache.lucene.document.InetAddressPoint;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.LatLonPoint;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.bkd.BKDConfig;
/**
* Access to indexed numeric values.
*
* <p>Points represent numeric values and are indexed differently than ordinary text. Instead of an
* inverted index, points are indexed with datastructures such as <a
* href="https://en.wikipedia.org/wiki/K-d_tree">KD-trees</a>. These structures are optimized for
* operations such as <i>range</i>, <i>distance</i>, <i>nearest-neighbor</i>, and
* <i>point-in-polygon</i> queries.
*
* <h2>Basic Point Types</h2>
*
* <table>
* <caption>Basic point types in Java and Lucene</caption>
* <tr><th>Java type</th><th>Lucene class</th></tr>
* <tr><td>{@code int}</td><td>{@link IntPoint}</td></tr>
* <tr><td>{@code long}</td><td>{@link LongPoint}</td></tr>
* <tr><td>{@code float}</td><td>{@link FloatPoint}</td></tr>
* <tr><td>{@code double}</td><td>{@link DoublePoint}</td></tr>
* <tr><td>{@code byte[]}</td><td>{@link BinaryPoint}</td></tr>
* <tr><td>{@link InetAddress}</td><td>{@link InetAddressPoint}</td></tr>
* <tr><td>{@link BigInteger}</td><td><a href="{@docRoot}/../sandbox/org/apache/lucene/sandbox/document/BigIntegerPoint.html">BigIntegerPoint</a>*</td></tr>
* </table>
*
* * in the <i>lucene-sandbox</i> jar<br>
*
* <p>Basic Lucene point types behave like their java peers: for example {@link IntPoint} represents
* a signed 32-bit {@link Integer}, supporting values ranging from {@link Integer#MIN_VALUE} to
* {@link Integer#MAX_VALUE}, ordered consistent with {@link Integer#compareTo(Integer)}. In
* addition to indexing support, point classes also contain static methods (such as {@link
* IntPoint#newRangeQuery(String, int, int)}) for creating common queries. For example:
*
* <pre class="prettyprint">
* // add year 1970 to document
* document.add(new IntPoint("year", 1970));
* // index document
* writer.addDocument(document);
* ...
* // issue range query of 1960-1980
* Query query = IntPoint.newRangeQuery("year", 1960, 1980);
* TopDocs docs = searcher.search(query, ...);
* </pre>
*
* <h2>Geospatial Point Types</h2>
*
* Although basic point types such as {@link DoublePoint} support points in multi-dimensional space
* too, Lucene has specialized classes for location data. These classes are optimized for location
* data: they are more space-efficient and support special operations such as <i>distance</i> and
* <i>polygon</i> queries. There are currently two implementations: <br>
*
* <ol>
* <li>{@link LatLonPoint}: indexes {@code (latitude,longitude)} as {@code (x,y)} in
* two-dimensional space.
* <li><a
* href="{@docRoot}/../spatial3d/org/apache/lucene/spatial3d/Geo3DPoint.html">Geo3DPoint</a>*
* in <i>lucene-spatial3d</i>: indexes {@code (latitude,longitude)} as {@code (x,y,z)} in
* three-dimensional space.
* </ol>
*
* * does <b>not</b> support altitude, 3D here means "uses three dimensions under-the-hood"<br>
*
* <h2>Advanced usage</h2>
*
* Custom structures can be created on top of single- or multi- dimensional basic types, on top of
* {@link BinaryPoint} for more flexibility, or via custom {@link Field} subclasses.
*
* @lucene.experimental
*/
public abstract class PointValues {
/** Maximum number of bytes for each dimension */
public static final int MAX_NUM_BYTES = 16;
/** Maximum number of dimensions */
public static final int MAX_DIMENSIONS = BKDConfig.MAX_DIMS;
/** Maximum number of index dimensions */
public static final int MAX_INDEX_DIMENSIONS = BKDConfig.MAX_INDEX_DIMS;
/**
* Return the cumulated number of points across all leaves of the given {@link IndexReader}.
* Leaves that do not have points for the given field are ignored.
*
* @see PointValues#size()
*/
public static long size(IndexReader reader, String field) throws IOException {
long size = 0;
for (LeafReaderContext ctx : reader.leaves()) {
PointValues values = ctx.reader().getPointValues(field);
if (values != null) {
size += values.size();
}
}
return size;
}
/**
* Return the cumulated number of docs that have points across all leaves of the given {@link
* IndexReader}. Leaves that do not have points for the given field are ignored.
*
* @see PointValues#getDocCount()
*/
public static int getDocCount(IndexReader reader, String field) throws IOException {
int count = 0;
for (LeafReaderContext ctx : reader.leaves()) {
PointValues values = ctx.reader().getPointValues(field);
if (values != null) {
count += values.getDocCount();
}
}
return count;
}
/**
* Return the minimum packed values across all leaves of the given {@link IndexReader}. Leaves
* that do not have points for the given field are ignored.
*
* @see PointValues#getMinPackedValue()
*/
public static byte[] getMinPackedValue(IndexReader reader, String field) throws IOException {
byte[] minValue = null;
for (LeafReaderContext ctx : reader.leaves()) {
PointValues values = ctx.reader().getPointValues(field);
if (values == null) {
continue;
}
byte[] leafMinValue = values.getMinPackedValue();
if (leafMinValue == null) {
continue;
}
if (minValue == null) {
minValue = leafMinValue.clone();
} else {
final int numDimensions = values.getNumIndexDimensions();
final int numBytesPerDimension = values.getBytesPerDimension();
for (int i = 0; i < numDimensions; ++i) {
int offset = i * numBytesPerDimension;
if (Arrays.compareUnsigned(
leafMinValue,
offset,
offset + numBytesPerDimension,
minValue,
offset,
offset + numBytesPerDimension)
< 0) {
System.arraycopy(leafMinValue, offset, minValue, offset, numBytesPerDimension);
}
}
}
}
return minValue;
}
/**
* Return the maximum packed values across all leaves of the given {@link IndexReader}. Leaves
* that do not have points for the given field are ignored.
*
* @see PointValues#getMaxPackedValue()
*/
public static byte[] getMaxPackedValue(IndexReader reader, String field) throws IOException {
byte[] maxValue = null;
for (LeafReaderContext ctx : reader.leaves()) {
PointValues values = ctx.reader().getPointValues(field);
if (values == null) {
continue;
}
byte[] leafMaxValue = values.getMaxPackedValue();
if (leafMaxValue == null) {
continue;
}
if (maxValue == null) {
maxValue = leafMaxValue.clone();
} else {
final int numDimensions = values.getNumIndexDimensions();
final int numBytesPerDimension = values.getBytesPerDimension();
for (int i = 0; i < numDimensions; ++i) {
int offset = i * numBytesPerDimension;
if (Arrays.compareUnsigned(
leafMaxValue,
offset,
offset + numBytesPerDimension,
maxValue,
offset,
offset + numBytesPerDimension)
> 0) {
System.arraycopy(leafMaxValue, offset, maxValue, offset, numBytesPerDimension);
}
}
}
}
return maxValue;
}
/** Default constructor */
protected PointValues() {}
/** Used by {@link #intersect} to check how each recursive cell corresponds to the query. */
public enum Relation {
/** Return this if the cell is fully contained by the query */
CELL_INSIDE_QUERY,
/** Return this if the cell and query do not overlap */
CELL_OUTSIDE_QUERY,
/** Return this if the cell partially overlaps the query */
CELL_CROSSES_QUERY
};
/**
* We recurse the BKD tree, using a provided instance of this to guide the recursion.
*
* @lucene.experimental
*/
public interface IntersectVisitor {
/**
* Called for all documents in a leaf cell that's fully contained by the query. The consumer
* should blindly accept the docID.
*/
void visit(int docID) throws IOException;
/**
* Called for all documents in a leaf cell that crosses the query. The consumer should
* scrutinize the packedValue to decide whether to accept it. In the 1D case, values are visited
* in increasing order, and in the case of ties, in increasing docID order.
*/
void visit(int docID, byte[] packedValue) throws IOException;
/**
* Similar to {@link IntersectVisitor#visit(int, byte[])} but in this case the packedValue can
* have more than one docID associated to it. The provided iterator should not escape the scope
* of this method so that implementations of PointValues are free to reuse it,
*/
default void visit(DocIdSetIterator iterator, byte[] packedValue) throws IOException {
int docID;
while ((docID = iterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
visit(docID, packedValue);
}
}
/**
* Called for non-leaf cells to test how the cell relates to the query, to determine how to
* further recurse down the tree.
*/
Relation compare(byte[] minPackedValue, byte[] maxPackedValue);
/** Notifies the caller that this many documents are about to be visited */
default void grow(int count) {}
;
}
/**
* Finds all documents and points matching the provided visitor. This method does not enforce live
* documents, so it's up to the caller to test whether each document is deleted, if necessary.
*/
public abstract void intersect(IntersectVisitor visitor) throws IOException;
/**
* Estimate the number of points that would be visited by {@link #intersect} with the given {@link
* IntersectVisitor}. This should run many times faster than {@link #intersect(IntersectVisitor)}.
*/
public abstract long estimatePointCount(IntersectVisitor visitor);
/**
* Estimate the number of documents that would be matched by {@link #intersect} with the given
* {@link IntersectVisitor}. This should run many times faster than {@link
* #intersect(IntersectVisitor)}.
*
* @see DocIdSetIterator#cost
*/
public long estimateDocCount(IntersectVisitor visitor) {
long estimatedPointCount = estimatePointCount(visitor);
int docCount = getDocCount();
double size = size();
if (estimatedPointCount >= size) {
// math all docs
return docCount;
} else if (size == docCount || estimatedPointCount == 0L) {
// if the point count estimate is 0 or we have only single values
// return this estimate
return estimatedPointCount;
} else {
// in case of multi values estimate the number of docs using the solution provided in
// https://math.stackexchange.com/questions/1175295/urn-problem-probability-of-drawing-balls-of-k-unique-colors
// then approximate the solution for points per doc << size() which results in the expression
// D * (1 - ((N - n) / N)^(N/D))
// where D is the total number of docs, N the total number of points and n the estimated point
// count
long docEstimate =
(long) (docCount * (1d - Math.pow((size - estimatedPointCount) / size, size / docCount)));
return docEstimate == 0L ? 1L : docEstimate;
}
}
/**
* Returns minimum value for each dimension, packed, or null if {@link #size} is <code>0</code>
*/
public abstract byte[] getMinPackedValue() throws IOException;
/**
* Returns maximum value for each dimension, packed, or null if {@link #size} is <code>0</code>
*/
public abstract byte[] getMaxPackedValue() throws IOException;
/** Returns how many dimensions are represented in the values */
public abstract int getNumDimensions() throws IOException;
/** Returns how many dimensions are used for the index */
public abstract int getNumIndexDimensions() throws IOException;
/** Returns the number of bytes per dimension */
public abstract int getBytesPerDimension() throws IOException;
/** Returns the total number of indexed points across all documents. */
public abstract long size();
/** Returns the total number of documents that have indexed at least one point. */
public abstract int getDocCount();
}