| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.index; |
| |
| import java.io.IOException; |
| import java.math.BigInteger; |
| import java.net.InetAddress; |
| |
| import org.apache.lucene.document.BinaryPoint; |
| import org.apache.lucene.document.DoublePoint; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.document.FloatPoint; |
| import org.apache.lucene.document.IntPoint; |
| import org.apache.lucene.document.LatLonPoint; |
| import org.apache.lucene.document.LongPoint; |
| import org.apache.lucene.search.DocIdSetIterator; |
| import org.apache.lucene.util.FutureArrays; |
| import org.apache.lucene.util.bkd.BKDConfig; |
| |
| /** |
| * Access to indexed numeric values. |
| * <p> |
| * Points represent numeric values and are indexed differently than ordinary text. Instead of an inverted index, |
| * points are indexed with datastructures such as <a href="https://en.wikipedia.org/wiki/K-d_tree">KD-trees</a>. |
| * These structures are optimized for operations such as <i>range</i>, <i>distance</i>, <i>nearest-neighbor</i>, |
| * and <i>point-in-polygon</i> queries. |
| * <h2>Basic Point Types</h2> |
| * <table> |
| * <caption>Basic point types in Java and Lucene</caption> |
| * <tr><th>Java type</th><th>Lucene class</th></tr> |
| * <tr><td>{@code int}</td><td>{@link IntPoint}</td></tr> |
| * <tr><td>{@code long}</td><td>{@link LongPoint}</td></tr> |
| * <tr><td>{@code float}</td><td>{@link FloatPoint}</td></tr> |
| * <tr><td>{@code double}</td><td>{@link DoublePoint}</td></tr> |
| * <tr><td>{@code byte[]}</td><td>{@link BinaryPoint}</td></tr> |
| * <tr><td>{@link BigInteger}</td><td><a href="{@docRoot}/../sandbox/org/apache/lucene/document/BigIntegerPoint.html">BigIntegerPoint</a>*</td></tr> |
| * <tr><td>{@link InetAddress}</td><td><a href="{@docRoot}/../misc/org/apache/lucene/document/InetAddressPoint.html">InetAddressPoint</a>*</td></tr> |
| * </table> |
| * * in the <i>lucene-sandbox</i> jar<br> |
| * <p> |
| * Basic Lucene point types behave like their java peers: for example {@link IntPoint} represents a signed 32-bit |
| * {@link Integer}, supporting values ranging from {@link Integer#MIN_VALUE} to {@link Integer#MAX_VALUE}, ordered |
| * consistent with {@link Integer#compareTo(Integer)}. In addition to indexing support, point classes also contain |
| * static methods (such as {@link IntPoint#newRangeQuery(String, int, int)}) for creating common queries. For example: |
| * <pre class="prettyprint"> |
| * // add year 1970 to document |
| * document.add(new IntPoint("year", 1970)); |
| * // index document |
| * writer.addDocument(document); |
| * ... |
| * // issue range query of 1960-1980 |
| * Query query = IntPoint.newRangeQuery("year", 1960, 1980); |
| * TopDocs docs = searcher.search(query, ...); |
| * </pre> |
| * <h2>Geospatial Point Types</h2> |
| * Although basic point types such as {@link DoublePoint} support points in multi-dimensional space too, Lucene has |
| * specialized classes for location data. These classes are optimized for location data: they are more space-efficient and |
| * support special operations such as <i>distance</i> and <i>polygon</i> queries. There are currently two implementations: |
| * <br> |
| * <ol> |
| * <li>{@link LatLonPoint}: indexes {@code (latitude,longitude)} as {@code (x,y)} in two-dimensional space. |
| * <li><a href="{@docRoot}/../spatial3d/org/apache/lucene/spatial3d/Geo3DPoint.html">Geo3DPoint</a>* in <i>lucene-spatial3d</i>: indexes {@code (latitude,longitude)} as {@code (x,y,z)} in three-dimensional space. |
| * </ol> |
| * * does <b>not</b> support altitude, 3D here means "uses three dimensions under-the-hood"<br> |
| * <h2>Advanced usage</h2> |
| * Custom structures can be created on top of single- or multi- dimensional basic types, on top of |
| * {@link BinaryPoint} for more flexibility, or via custom {@link Field} subclasses. |
| * |
| * @lucene.experimental */ |
| public abstract class PointValues { |
| |
| /** Maximum number of bytes for each dimension */ |
| public static final int MAX_NUM_BYTES = 16; |
| |
| /** Maximum number of dimensions */ |
| public static final int MAX_DIMENSIONS = BKDConfig.MAX_DIMS; |
| |
| /** Maximum number of index dimensions */ |
| public static final int MAX_INDEX_DIMENSIONS = BKDConfig.MAX_INDEX_DIMS; |
| |
| /** Return the cumulated number of points across all leaves of the given |
| * {@link IndexReader}. Leaves that do not have points for the given field |
| * are ignored. |
| * @see PointValues#size() */ |
| public static long size(IndexReader reader, String field) throws IOException { |
| long size = 0; |
| for (LeafReaderContext ctx : reader.leaves()) { |
| PointValues values = ctx.reader().getPointValues(field); |
| if (values != null) { |
| size += values.size(); |
| } |
| } |
| return size; |
| } |
| |
| /** Return the cumulated number of docs that have points across all leaves |
| * of the given {@link IndexReader}. Leaves that do not have points for the |
| * given field are ignored. |
| * @see PointValues#getDocCount() */ |
| public static int getDocCount(IndexReader reader, String field) throws IOException { |
| int count = 0; |
| for (LeafReaderContext ctx : reader.leaves()) { |
| PointValues values = ctx.reader().getPointValues(field); |
| if (values != null) { |
| count += values.getDocCount(); |
| } |
| } |
| return count; |
| } |
| |
| /** Return the minimum packed values across all leaves of the given |
| * {@link IndexReader}. Leaves that do not have points for the given field |
| * are ignored. |
| * @see PointValues#getMinPackedValue() */ |
| public static byte[] getMinPackedValue(IndexReader reader, String field) throws IOException { |
| byte[] minValue = null; |
| for (LeafReaderContext ctx : reader.leaves()) { |
| PointValues values = ctx.reader().getPointValues(field); |
| if (values == null) { |
| continue; |
| } |
| byte[] leafMinValue = values.getMinPackedValue(); |
| if (leafMinValue == null) { |
| continue; |
| } |
| if (minValue == null) { |
| minValue = leafMinValue.clone(); |
| } else { |
| final int numDimensions = values.getNumIndexDimensions(); |
| final int numBytesPerDimension = values.getBytesPerDimension(); |
| for (int i = 0; i < numDimensions; ++i) { |
| int offset = i * numBytesPerDimension; |
| if (FutureArrays.compareUnsigned(leafMinValue, offset, offset + numBytesPerDimension, minValue, offset, offset + numBytesPerDimension) < 0) { |
| System.arraycopy(leafMinValue, offset, minValue, offset, numBytesPerDimension); |
| } |
| } |
| } |
| } |
| return minValue; |
| } |
| |
| /** Return the maximum packed values across all leaves of the given |
| * {@link IndexReader}. Leaves that do not have points for the given field |
| * are ignored. |
| * @see PointValues#getMaxPackedValue() */ |
| public static byte[] getMaxPackedValue(IndexReader reader, String field) throws IOException { |
| byte[] maxValue = null; |
| for (LeafReaderContext ctx : reader.leaves()) { |
| PointValues values = ctx.reader().getPointValues(field); |
| if (values == null) { |
| continue; |
| } |
| byte[] leafMaxValue = values.getMaxPackedValue(); |
| if (leafMaxValue == null) { |
| continue; |
| } |
| if (maxValue == null) { |
| maxValue = leafMaxValue.clone(); |
| } else { |
| final int numDimensions = values.getNumIndexDimensions(); |
| final int numBytesPerDimension = values.getBytesPerDimension(); |
| for (int i = 0; i < numDimensions; ++i) { |
| int offset = i * numBytesPerDimension; |
| if (FutureArrays.compareUnsigned(leafMaxValue, offset, offset + numBytesPerDimension, maxValue, offset, offset + numBytesPerDimension) > 0) { |
| System.arraycopy(leafMaxValue, offset, maxValue, offset, numBytesPerDimension); |
| } |
| } |
| } |
| } |
| return maxValue; |
| } |
| |
| /** Default constructor */ |
| protected PointValues() { |
| } |
| |
| /** Used by {@link #intersect} to check how each recursive cell corresponds to the query. */ |
| public enum Relation { |
| /** Return this if the cell is fully contained by the query */ |
| CELL_INSIDE_QUERY, |
| /** Return this if the cell and query do not overlap */ |
| CELL_OUTSIDE_QUERY, |
| /** Return this if the cell partially overlaps the query */ |
| CELL_CROSSES_QUERY |
| }; |
| |
| /** We recurse the BKD tree, using a provided instance of this to guide the recursion. |
| * |
| * @lucene.experimental */ |
| public interface IntersectVisitor { |
| /** Called for all documents in a leaf cell that's fully contained by the query. The |
| * consumer should blindly accept the docID. */ |
| void visit(int docID) throws IOException; |
| |
| /** Called for all documents in a leaf cell that crosses the query. The consumer |
| * should scrutinize the packedValue to decide whether to accept it. In the 1D case, |
| * values are visited in increasing order, and in the case of ties, in increasing |
| * docID order. */ |
| void visit(int docID, byte[] packedValue) throws IOException; |
| |
| /** Similar to {@link IntersectVisitor#visit(int, byte[])} but in this case the packedValue |
| * can have more than one docID associated to it. The provided iterator should not escape the |
| * scope of this method so that implementations of PointValues are free to reuse it,*/ |
| default void visit(DocIdSetIterator iterator, byte[] packedValue) throws IOException { |
| int docID; |
| while ((docID = iterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { |
| visit(docID, packedValue); |
| } |
| } |
| |
| /** Called for non-leaf cells to test how the cell relates to the query, to |
| * determine how to further recurse down the tree. */ |
| Relation compare(byte[] minPackedValue, byte[] maxPackedValue); |
| |
| /** Notifies the caller that this many documents are about to be visited */ |
| default void grow(int count) {}; |
| } |
| |
| /** Finds all documents and points matching the provided visitor. |
| * This method does not enforce live documents, so it's up to the caller |
| * to test whether each document is deleted, if necessary. */ |
| public abstract void intersect(IntersectVisitor visitor) throws IOException; |
| |
| /** Estimate the number of points that would be visited by {@link #intersect} |
| * with the given {@link IntersectVisitor}. This should run many times faster |
| * than {@link #intersect(IntersectVisitor)}. */ |
| public abstract long estimatePointCount(IntersectVisitor visitor); |
| |
| /** Estimate the number of documents that would be matched by {@link #intersect} |
| * with the given {@link IntersectVisitor}. This should run many times faster |
| * than {@link #intersect(IntersectVisitor)}. |
| * @see DocIdSetIterator#cost */ |
| public long estimateDocCount(IntersectVisitor visitor) { |
| long estimatedPointCount = estimatePointCount(visitor); |
| int docCount = getDocCount(); |
| double size = size(); |
| if (estimatedPointCount >= size) { |
| // math all docs |
| return docCount; |
| } else if (size == docCount || estimatedPointCount == 0L ) { |
| // if the point count estimate is 0 or we have only single values |
| // return this estimate |
| return estimatedPointCount; |
| } else { |
| // in case of multi values estimate the number of docs using the solution provided in |
| // https://math.stackexchange.com/questions/1175295/urn-problem-probability-of-drawing-balls-of-k-unique-colors |
| // then approximate the solution for points per doc << size() which results in the expression |
| // D * (1 - ((N - n) / N)^(N/D)) |
| // where D is the total number of docs, N the total number of points and n the estimated point count |
| long docEstimate = (long) (docCount * (1d - Math.pow((size - estimatedPointCount) / size, size / docCount))); |
| return docEstimate == 0L ? 1L : docEstimate; |
| } |
| } |
| |
| |
| /** Returns minimum value for each dimension, packed, or null if {@link #size} is <code>0</code> */ |
| public abstract byte[] getMinPackedValue() throws IOException; |
| |
| /** Returns maximum value for each dimension, packed, or null if {@link #size} is <code>0</code> */ |
| public abstract byte[] getMaxPackedValue() throws IOException; |
| |
| /** Returns how many dimensions are represented in the values */ |
| public abstract int getNumDimensions() throws IOException; |
| |
| /** Returns how many dimensions are used for the index */ |
| public abstract int getNumIndexDimensions() throws IOException; |
| |
| /** Returns the number of bytes per dimension */ |
| public abstract int getBytesPerDimension() throws IOException; |
| |
| /** Returns the total number of indexed points across all documents. */ |
| public abstract long size(); |
| |
| /** Returns the total number of documents that have indexed at least one point. */ |
| public abstract int getDocCount(); |
| } |