/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.Closeable;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.WeakHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DocumentStoredFieldVisitor;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.util.Bits; // javadocs
/**
* IndexReader is an abstract class, providing an interface for accessing a point-in-time view of an
* index. Any changes made to the index via {@link IndexWriter} will not be visible until a new
* {@code IndexReader} is opened. It's best to use {@link DirectoryReader#open(IndexWriter)} to
* obtain an {@code IndexReader}, if your {@link IndexWriter} is in-process. When you need to
* re-open to see changes to the index, it's best to use {@link
* DirectoryReader#openIfChanged(DirectoryReader)} since the new reader will share resources with
* the previous one when possible. Search of an index is done entirely through this abstract
* interface, so that any subclass which implements it is searchable.
*
* <p>There are two different types of IndexReaders:
*
* <ul>
 *   <li>{@link LeafReader}: These readers are atomic; they do not consist of several sub-readers.
 *       They support retrieval of stored fields, doc values, terms, and postings.
* <li>{@link CompositeReader}: Instances (like {@link DirectoryReader}) of this reader can only
* be used to get stored fields from the underlying LeafReaders, but it is not possible to
* directly retrieve postings. To do that, get the sub-readers via {@link
* CompositeReader#getSequentialSubReaders}.
* </ul>
*
* <p>IndexReader instances for indexes on disk are usually constructed with a call to one of the
* static <code>DirectoryReader.open()</code> methods, e.g. {@link
 * DirectoryReader#open(org.apache.lucene.store.Directory)}. Since {@link DirectoryReader} is a
 * {@link CompositeReader}, it is not possible to directly get postings from it.
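 *
 * <p>For example, a minimal (illustrative) open/re-open cycle, assuming {@code directory} is an
 * existing {@link org.apache.lucene.store.Directory}, might look like:
 *
 * <pre class="prettyprint">
 * DirectoryReader reader = DirectoryReader.open(directory);
 * // ... run searches against this point-in-time view ...
 * DirectoryReader newReader = DirectoryReader.openIfChanged(reader);
 * if (newReader != null) { // null means nothing has changed since the reader was opened
 *   reader.close();
 *   reader = newReader;
 * }
 * // ... run searches against the refreshed view ...
 * reader.close();
 * </pre>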
*
* <p>For efficiency, in this API documents are often referred to via <i>document numbers</i>,
* non-negative integers which each name a unique document in the index. These document numbers are
* ephemeral -- they may change as documents are added to and deleted from an index. Clients should
* thus not rely on a given document having the same number between sessions.
*
* <p><a id="thread-safety"></a>
*
 * <p><b>NOTE</b>: {@link IndexReader} instances are completely thread-safe, meaning multiple
 * threads can call any of its methods concurrently. If your application requires external
* synchronization, you should <b>not</b> synchronize on the <code>IndexReader</code> instance; use
* your own (non-Lucene) objects instead.
*/
public abstract class IndexReader implements Closeable {
private boolean closed = false;
private boolean closedByChild = false;
private final AtomicInteger refCount = new AtomicInteger(1);
IndexReader() {
if (!(this instanceof CompositeReader || this instanceof LeafReader))
throw new Error(
"IndexReader should never be directly extended, subclass LeafReader or CompositeReader instead.");
}
/**
* A utility class that gives hooks in order to help build a cache based on the data that is
* contained in this index.
*
* @lucene.experimental
*/
public static interface CacheHelper {
/**
 * Get a key that the resource can be cached on. The returned key can be compared using identity,
 * i.e. {@link Object#equals} is implemented as {@code ==} and {@link Object#hashCode} is
* implemented as {@link System#identityHashCode}.
*/
CacheKey getKey();
/**
* Add a {@link ClosedListener} which will be called when the resource guarded by {@link
* #getKey()} is closed.
*/
void addClosedListener(ClosedListener listener);
}
/** A cache key identifying a resource that is being cached on. */
public static final class CacheKey {
CacheKey() {} // only instantiable by core impls
}
/**
* A listener that is called when a resource gets closed.
*
* @lucene.experimental
*/
@FunctionalInterface
public static interface ClosedListener {
/**
* Invoked when the resource (segment core, or index reader) that is being cached on is closed.
*/
void onClose(CacheKey key) throws IOException;
}
private final Set<IndexReader> parentReaders =
Collections.synchronizedSet(
Collections.newSetFromMap(new WeakHashMap<IndexReader, Boolean>()));
/**
* Expert: This method is called by {@code IndexReader}s which wrap other readers (e.g. {@link
 * CompositeReader} or {@link FilterLeafReader}) to register the parent with the child (this reader)
 * when the parent is constructed. When this reader is closed, it will mark all registered parents
* as closed, too. The references to parent readers are weak only, so they can be GCed once they
* are no longer in use.
*
* @lucene.experimental
*/
public final void registerParentReader(IndexReader reader) {
ensureOpen();
parentReaders.add(reader);
}
// overridden by StandardDirectoryReader and SegmentReader
void notifyReaderClosedListeners() throws IOException {
// nothing to notify in the base impl
}
private void reportCloseToParentReaders() throws IOException {
synchronized (parentReaders) {
for (IndexReader parent : parentReaders) {
parent.closedByChild = true;
// cross memory barrier by a fake write:
parent.refCount.addAndGet(0);
// recurse:
parent.reportCloseToParentReaders();
}
}
}
/** Expert: returns the current refCount for this reader */
public final int getRefCount() {
// NOTE: don't ensureOpen, so that callers can see
// refCount is 0 (reader is closed)
return refCount.get();
}
/**
* Expert: increments the refCount of this IndexReader instance. RefCounts are used to determine
* when a reader can be closed safely, i.e. as soon as there are no more references. Be sure to
* always call a corresponding {@link #decRef}, in a finally clause; otherwise the reader may
* never be closed. Note that {@link #close} simply calls decRef(), which means that the
* IndexReader will not really be closed until {@link #decRef} has been called for all outstanding
* references.
*
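 * <p>A minimal sketch of the intended pattern (purely illustrative):
 *
 * <pre class="prettyprint">
 * reader.incRef();
 * try {
 *   // ... use the reader ...
 * } finally {
 *   reader.decRef();
 * }
 * </pre>
 *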
* @see #decRef
* @see #tryIncRef
*/
public final void incRef() {
if (!tryIncRef()) {
ensureOpen();
}
}
/**
* Expert: increments the refCount of this IndexReader instance only if the IndexReader has not
* been closed yet and returns <code>true</code> iff the refCount was successfully incremented,
* otherwise <code>false</code>. If this method returns <code>false</code> the reader is either
* already closed or is currently being closed. Either way this reader instance shouldn't be used
* by an application unless <code>true</code> is returned.
*
* <p>RefCounts are used to determine when a reader can be closed safely, i.e. as soon as there
* are no more references. Be sure to always call a corresponding {@link #decRef}, in a finally
* clause; otherwise the reader may never be closed. Note that {@link #close} simply calls
* decRef(), which means that the IndexReader will not really be closed until {@link #decRef} has
* been called for all outstanding references.
*
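 * <p>For illustration only, a typical guarded usage might be:
 *
 * <pre class="prettyprint">
 * if (reader.tryIncRef()) {
 *   try {
 *     // ... use the reader ...
 *   } finally {
 *     reader.decRef();
 *   }
 * } else {
 *   // the reader is closed (or closing); obtain a fresh reader instead
 * }
 * </pre>
 *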
* @see #decRef
* @see #incRef
*/
public final boolean tryIncRef() {
int count;
while ((count = refCount.get()) > 0) {
if (refCount.compareAndSet(count, count + 1)) {
return true;
}
}
return false;
}
/**
* Expert: decreases the refCount of this IndexReader instance. If the refCount drops to 0, then
* this reader is closed. If an exception is hit, the refCount is unchanged.
*
* @throws IOException in case an IOException occurs in doClose()
* @see #incRef
*/
@SuppressWarnings("try")
public final void decRef() throws IOException {
// only check refcount here (don't call ensureOpen()), so we can
// still close the reader if it was made invalid by a child:
if (refCount.get() <= 0) {
throw new AlreadyClosedException("this IndexReader is closed");
}
final int rc = refCount.decrementAndGet();
if (rc == 0) {
closed = true;
try (Closeable finalizer = this::reportCloseToParentReaders;
Closeable finalizer1 = this::notifyReaderClosedListeners) {
doClose();
}
} else if (rc < 0) {
throw new IllegalStateException(
"too many decRef calls: refCount is " + rc + " after decrement");
}
}
/**
* Throws AlreadyClosedException if this IndexReader or any of its child readers is closed,
* otherwise returns.
*/
protected final void ensureOpen() throws AlreadyClosedException {
if (refCount.get() <= 0) {
throw new AlreadyClosedException("this IndexReader is closed");
}
// the happens before rule on reading the refCount, which must be after the fake write,
// ensures that we see the value:
if (closedByChild) {
throw new AlreadyClosedException(
"this IndexReader cannot be used anymore as one of its child readers was closed");
}
}
/**
* {@inheritDoc}
*
* <p>{@code IndexReader} subclasses are not allowed to implement equals/hashCode, so methods are
* declared final.
*/
@Override
public final boolean equals(Object obj) {
return (this == obj);
}
/**
* {@inheritDoc}
*
* <p>{@code IndexReader} subclasses are not allowed to implement equals/hashCode, so methods are
* declared final.
*/
@Override
public final int hashCode() {
return System.identityHashCode(this);
}
/**
* Retrieve term vectors for this document, or null if term vectors were not indexed. The returned
* Fields instance acts like a single-document inverted index (the docID will be 0).
*/
public abstract Fields getTermVectors(int docID) throws IOException;
/**
 * Retrieve the term vector for this document and field, or null if term vectors were not indexed.
 * The returned Terms instance acts like a single-document inverted index (the docID will be 0).
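 *
 * <p>An illustrative sketch of consuming the returned terms (the field name {@code "body"} is an
 * assumption):
 *
 * <pre class="prettyprint">
 * Terms terms = reader.getTermVector(docID, "body");
 * if (terms != null) { // null if no term vector was indexed for this field
 *   TermsEnum termsEnum = terms.iterator();
 *   BytesRef term;
 *   while ((term = termsEnum.next()) != null) {
 *     long freqInDoc = termsEnum.totalTermFreq(); // frequency within this single document
 *   }
 * }
 * </pre>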
*/
public final Terms getTermVector(int docID, String field) throws IOException {
Fields vectors = getTermVectors(docID);
if (vectors == null) {
return null;
}
return vectors.terms(field);
}
/**
* Returns the number of documents in this index.
*
* <p><b>NOTE</b>: This operation may run in O(maxDoc). Implementations that can't return this
* number in constant-time should cache it.
*/
public abstract int numDocs();
/**
* Returns one greater than the largest possible document number. This may be used to, e.g.,
* determine how big to allocate an array which will have an element for every document number in
* an index.
*/
public abstract int maxDoc();
/**
* Returns the number of deleted documents.
*
* <p><b>NOTE</b>: This operation may run in O(maxDoc).
*/
public final int numDeletedDocs() {
return maxDoc() - numDocs();
}
/**
* Expert: visits the fields of a stored document, for custom processing/loading of each field. If
* you simply want to load all fields, use {@link #document(int)}. If you want to load a subset,
* use {@link DocumentStoredFieldVisitor}.
*/
public abstract void document(int docID, StoredFieldVisitor visitor) throws IOException;
/**
* Returns the stored fields of the <code>n</code><sup>th</sup> <code>Document</code> in this
* index. This is just sugar for using {@link DocumentStoredFieldVisitor}.
*
* <p><b>NOTE:</b> for performance reasons, this method does not check if the requested document
* is deleted, and therefore asking for a deleted document may yield unspecified results. Usually
* this is not required, however you can test if the doc is deleted by checking the {@link Bits}
* returned from {@link MultiBits#getLiveDocs}.
*
* <p><b>NOTE:</b> only the content of a field is returned, if that field was stored during
* indexing. Metadata like boost, omitNorm, IndexOptions, tokenized, etc., are not preserved.
*
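 * <p>A minimal sketch of typical usage (the {@code "title"} field name is an assumption):
 *
 * <pre class="prettyprint">
 * Document doc = reader.document(docID);
 * String title = doc.get("title"); // null if the field was not stored
 * </pre>
 *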
* @throws CorruptIndexException if the index is corrupt
* @throws IOException if there is a low-level IO error
*/
// TODO: we need a separate StoredField, so that the
// Document returned here contains that class not
// IndexableField
public final Document document(int docID) throws IOException {
final DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor();
document(docID, visitor);
return visitor.getDocument();
}
/**
* Like {@link #document(int)} but only loads the specified fields. Note that this is simply sugar
* for {@link DocumentStoredFieldVisitor#DocumentStoredFieldVisitor(Set)}.
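 *
 * <p>For example (illustrative field names), to load only two stored fields:
 *
 * <pre class="prettyprint">
 * Document doc = reader.document(docID, Set.of("id", "title"));
 * </pre>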
*/
public final Document document(int docID, Set<String> fieldsToLoad) throws IOException {
final DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor(fieldsToLoad);
document(docID, visitor);
return visitor.getDocument();
}
/**
* Returns true if any documents have been deleted. Implementers should consider overriding this
* method if {@link #maxDoc()} or {@link #numDocs()} are not constant-time operations.
*/
public boolean hasDeletions() {
return numDeletedDocs() > 0;
}
/**
* Closes files associated with this index. Also saves any new deletions to disk. No other methods
* should be called after this has been called.
*
* @throws IOException if there is a low-level IO error
*/
@Override
public final synchronized void close() throws IOException {
if (!closed) {
decRef();
closed = true;
}
}
/** Implements close. */
protected abstract void doClose() throws IOException;
/**
* Expert: Returns the root {@link IndexReaderContext} for this {@link IndexReader}'s sub-reader
* tree.
*
 * <p>Iff this reader is composed of sub-readers, i.e. this reader is a composite reader, this
 * method returns a {@link CompositeReaderContext} holding the reader's direct children as well as
 * a view of the reader tree's atomic leaf contexts. All sub-{@link IndexReaderContext} instances
 * referenced from this reader's top-level context are private to this reader and are not shared
 * with another context tree. For example, IndexSearcher uses this API to drive searching by one
 * atomic leaf reader at a time. If this reader is not composed of child readers, this method
 * returns a {@link LeafReaderContext}.
*
* <p>Note: Any of the sub-{@link CompositeReaderContext} instances referenced from this top-level
* context do not support {@link CompositeReaderContext#leaves()}. Only the top-level context
* maintains the convenience leaf-view for performance reasons.
*/
public abstract IndexReaderContext getContext();
/**
* Returns the reader's leaves, or itself if this reader is atomic. This is a convenience method
* calling {@code this.getContext().leaves()}.
*
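 * <p>A minimal sketch of per-leaf processing (illustrative only):
 *
 * <pre class="prettyprint">
 * for (LeafReaderContext ctx : reader.leaves()) {
 *   LeafReader leaf = ctx.reader();
 *   // ctx.docBase maps leaf-local doc IDs back to top-level doc IDs
 * }
 * </pre>
 *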
* @see IndexReaderContext#leaves()
*/
public final List<LeafReaderContext> leaves() {
return getContext().leaves();
}
/**
* Optional method: Return a {@link CacheHelper} that can be used to cache based on the content of
* this reader. Two readers that have different data or different sets of deleted documents will
* be considered different.
*
* <p>A return value of {@code null} indicates that this reader is not suited for caching, which
* is typically the case for short-lived wrappers that alter the content of the wrapped reader.
*
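 * <p>An illustrative sketch (the cache map and the cached value are assumptions, not Lucene APIs)
 * of combining the key with a close listener to build a per-reader cache:
 *
 * <pre class="prettyprint">{@code
 * Map<IndexReader.CacheKey, Object> cache = new ConcurrentHashMap<>();
 * IndexReader.CacheHelper helper = reader.getReaderCacheHelper();
 * if (helper != null) {
 *   cache.put(helper.getKey(), value);       // value computed from this reader (hypothetical)
 *   helper.addClosedListener(cache::remove); // evict once the underlying resource is closed
 * }
 * }</pre>
 *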
* @lucene.experimental
*/
public abstract CacheHelper getReaderCacheHelper();
/**
* Returns the number of documents containing the <code>term</code>. This method returns 0 if the
 * term or field does not exist. This method does not take into account deleted documents that
* have not yet been merged away.
*
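 * <p>For example (illustrative field and term):
 *
 * <pre class="prettyprint">
 * int df = reader.docFreq(new Term("body", "lucene"));
 * </pre>
 *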
* @see TermsEnum#docFreq()
*/
public abstract int docFreq(Term term) throws IOException;
/**
* Returns the total number of occurrences of {@code term} across all documents (the sum of the
* freq() for each doc that has this term). Note that, like other term measures, this measure does
* not take deleted documents into account.
*/
public abstract long totalTermFreq(Term term) throws IOException;
/**
* Returns the sum of {@link TermsEnum#docFreq()} for all terms in this field. Note that, just
* like other term measures, this measure does not take deleted documents into account.
*
* @see Terms#getSumDocFreq()
*/
public abstract long getSumDocFreq(String field) throws IOException;
/**
* Returns the number of documents that have at least one term for this field. Note that, just
* like other term measures, this measure does not take deleted documents into account.
*
* @see Terms#getDocCount()
*/
public abstract int getDocCount(String field) throws IOException;
/**
* Returns the sum of {@link TermsEnum#totalTermFreq} for all terms in this field. Note that, just
* like other term measures, this measure does not take deleted documents into account.
*
* @see Terms#getSumTotalTermFreq()
*/
public abstract long getSumTotalTermFreq(String field) throws IOException;
}