/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.PrintStream;
import java.util.Arrays;
import java.util.stream.Collectors;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter.IndexReaderWarmer;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.InfoStream;
import org.apache.lucene.util.PrintStreamInfoStream;
import org.apache.lucene.util.SetOnce;
import org.apache.lucene.util.Version;
/**
* Holds all the configuration that is used to create an {@link IndexWriter}. Once {@link
* IndexWriter} has been created with this object, changes to this object will not affect the {@link
* IndexWriter} instance. For that, use {@link LiveIndexWriterConfig} that is returned from {@link
* IndexWriter#getConfig()}.
*
* <p>All setter methods return {@link IndexWriterConfig} to allow chaining settings conveniently,
* for example:
*
* <pre class="prettyprint">
* IndexWriterConfig conf = new IndexWriterConfig(analyzer);
* conf.setOpenMode(OpenMode.CREATE).setRAMBufferSizeMB(64.0);
* </pre>
*
* @see IndexWriter#getConfig()
* @since 3.1
*/
public final class IndexWriterConfig extends LiveIndexWriterConfig {
/** Specifies the open mode for {@link IndexWriter}. */
public static enum OpenMode {
/** Creates a new index or overwrites an existing one. */
CREATE,
/** Opens an existing index. */
APPEND,
/**
* Creates a new index if one does not exist, otherwise it opens the index and documents will be
* appended.
*/
CREATE_OR_APPEND
}
/** Denotes that a flush trigger is disabled. */
public static final int DISABLE_AUTO_FLUSH = -1;
/** Disabled by default (because IndexWriter flushes by RAM usage by default). */
public static final int DEFAULT_MAX_BUFFERED_DELETE_TERMS = DISABLE_AUTO_FLUSH;
/** Disabled by default (because IndexWriter flushes by RAM usage by default). */
public static final int DEFAULT_MAX_BUFFERED_DOCS = DISABLE_AUTO_FLUSH;
/**
* Default value is 16 MB (which means flush when buffered docs consume approximately 16 MB RAM).
*/
public static final double DEFAULT_RAM_BUFFER_SIZE_MB = 16.0;
/** Default setting (true) for {@link #setReaderPooling}. */
// We changed this default to true with concurrent deletes/updates (LUCENE-7868),
// because we will otherwise need to open and close segment readers more frequently.
// False is still supported, but will have worse performance since readers will
// be forced to aggressively move all state to disk.
public static final boolean DEFAULT_READER_POOLING = true;
/** Default value is 1945. Change using {@link #setRAMPerThreadHardLimitMB(int)}. */
public static final int DEFAULT_RAM_PER_THREAD_HARD_LIMIT_MB = 1945;
/**
* Default value for compound file system for newly written segments (set to <code>true</code>).
* For batch indexing with very large RAM buffers, use <code>false</code>.
*/
public static final boolean DEFAULT_USE_COMPOUND_FILE_SYSTEM = true;
/** Default value for whether calls to {@link IndexWriter#close()} include a commit. */
public static final boolean DEFAULT_COMMIT_ON_CLOSE = true;
/**
* Default value for time to wait for merges on commit or getReader (when using a {@link
* MergePolicy} that implements {@link MergePolicy#findFullFlushMerges}).
*/
public static final long DEFAULT_MAX_FULL_FLUSH_MERGE_WAIT_MILLIS = 0;
// indicates whether this config instance is already attached to a writer.
// not final so that it can be cloned properly.
private SetOnce<IndexWriter> writer = new SetOnce<>();
/**
* Sets the {@link IndexWriter} this config is attached to.
*
* @throws IllegalStateException if this config is already attached to a writer.
*/
IndexWriterConfig setIndexWriter(IndexWriter writer) {
if (this.writer.get() != null) {
throw new IllegalStateException(
"do not share IndexWriterConfig instances across IndexWriters");
}
this.writer.set(writer);
return this;
}
/**
* Creates a new config, using {@link StandardAnalyzer} as the analyzer. By default, {@link
* TieredMergePolicy} is used for merging. Note that {@link TieredMergePolicy} is free to select
* non-contiguous merges, which means docIDs may not remain monotonic over time. If this is a
* problem, you should switch to {@link LogByteSizeMergePolicy} or {@link LogDocMergePolicy}.
*/
public IndexWriterConfig() {
this(new StandardAnalyzer());
}
/**
* Creates a new config with the provided {@link Analyzer}. By default, {@link
* TieredMergePolicy} is used for merging. Note that {@link TieredMergePolicy} is free to select
* non-contiguous merges, which means docIDs may not remain monotonic over time. If this is a
* problem, you should switch to {@link LogByteSizeMergePolicy} or {@link LogDocMergePolicy}.
*/
public IndexWriterConfig(Analyzer analyzer) {
super(analyzer);
}
/**
* Specifies {@link OpenMode} of the index.
*
* <p>Only takes effect when IndexWriter is first created.
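*
* <p>For example, a minimal sketch (assuming an existing Directory {@code dir}):
*
* <pre class="prettyprint">
* IndexWriterConfig conf = new IndexWriterConfig(new StandardAnalyzer());
* conf.setOpenMode(OpenMode.CREATE_OR_APPEND);
* IndexWriter writer = new IndexWriter(dir, conf);
* </pre>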
*/
public IndexWriterConfig setOpenMode(OpenMode openMode) {
if (openMode == null) {
throw new IllegalArgumentException("openMode must not be null");
}
this.openMode = openMode;
return this;
}
@Override
public OpenMode getOpenMode() {
return openMode;
}
/**
* Expert: set the compatibility version to use for this index. If the index is newly created, it
* will use the given major version for compatibility. It is sometimes useful to set the previous
* major version for compatibility due to the fact that {@link IndexWriter#addIndexes} only
* accepts indices that have been written with the same major version as the current index. If the
* index already exists, then this value is ignored. Default value is the {@link Version#major
* major} of the {@link Version#LATEST latest version}.
*
* <p><b>NOTE</b>: Changing the creation version reduces backward compatibility guarantees. For
* instance an index created with Lucene 8 with a compatibility version of 7 can't be read with
* Lucene 9 due to the fact that Lucene only supports reading indices created with the current or
* previous major release.
*
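* <p>For example, a minimal sketch that keeps a newly created index compatible with the previous
* major version:
*
* <pre class="prettyprint">
* conf.setIndexCreatedVersionMajor(Version.LATEST.major - 1);
* </pre>
*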
* @param indexCreatedVersionMajor the major version to use for compatibility
*/
public IndexWriterConfig setIndexCreatedVersionMajor(int indexCreatedVersionMajor) {
if (indexCreatedVersionMajor > Version.LATEST.major) {
throw new IllegalArgumentException(
"indexCreatedVersionMajor may not be in the future: current major version is "
+ Version.LATEST.major
+ ", but got: "
+ indexCreatedVersionMajor);
}
if (indexCreatedVersionMajor < Version.LATEST.major - 1) {
throw new IllegalArgumentException(
"indexCreatedVersionMajor may not be less than the minimum supported version: "
+ (Version.LATEST.major - 1)
+ ", but got: "
+ indexCreatedVersionMajor);
}
this.createdVersionMajor = indexCreatedVersionMajor;
return this;
}
/**
* Expert: allows an optional {@link IndexDeletionPolicy} implementation to be specified. You can
* use this to control when prior commits are deleted from the index. The default policy is {@link
* KeepOnlyLastCommitDeletionPolicy} which removes all prior commits as soon as a new commit is
* done (this matches behavior before 2.2). Creating your own policy can allow you to explicitly
* keep previous "point in time" commits alive in the index for some time, to allow readers to
* refresh to the new commit without having the old commit deleted out from under them. This is
* necessary on filesystems like NFS that do not support "delete on last close" semantics, which
* Lucene's "point in time" search normally relies on.
*
* <p><b>NOTE:</b> the deletion policy must not be null.
*
* <p>Only takes effect when IndexWriter is first created.
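*
* <p>For example, a sketch that keeps snapshotted commits alive (e.g. for backups):
*
* <pre class="prettyprint">
* conf.setIndexDeletionPolicy(
*     new SnapshotDeletionPolicy(new KeepOnlyLastCommitDeletionPolicy()));
* </pre>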
*/
public IndexWriterConfig setIndexDeletionPolicy(IndexDeletionPolicy delPolicy) {
if (delPolicy == null) {
throw new IllegalArgumentException("indexDeletionPolicy must not be null");
}
this.delPolicy = delPolicy;
return this;
}
@Override
public IndexDeletionPolicy getIndexDeletionPolicy() {
return delPolicy;
}
/**
* Expert: allows opening a certain commit point. The default is null, which opens the latest
* commit point. This can also be used to open {@link IndexWriter} from a near-real-time reader,
* if you pass the reader's {@link DirectoryReader#getIndexCommit}.
*
* <p>Only takes effect when IndexWriter is first created.
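*
* <p>For example, a sketch of opening a writer at the commit point of a near-real-time reader
* (assuming an existing {@code reader}):
*
* <pre class="prettyprint">
* conf.setIndexCommit(reader.getIndexCommit());
* </pre>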
*/
public IndexWriterConfig setIndexCommit(IndexCommit commit) {
this.commit = commit;
return this;
}
@Override
public IndexCommit getIndexCommit() {
return commit;
}
/**
* Expert: set the {@link Similarity} implementation used by this IndexWriter.
*
* <p><b>NOTE:</b> the similarity must not be null.
*
* <p>Only takes effect when IndexWriter is first created.
*/
public IndexWriterConfig setSimilarity(Similarity similarity) {
if (similarity == null) {
throw new IllegalArgumentException("similarity must not be null");
}
this.similarity = similarity;
return this;
}
@Override
public Similarity getSimilarity() {
return similarity;
}
/**
* Expert: sets the merge scheduler used by this writer. The default is {@link
* ConcurrentMergeScheduler}.
*
* <p><b>NOTE:</b> the merge scheduler must not be null.
*
* <p>Only takes effect when IndexWriter is first created.
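*
* <p>For example, a sketch that bounds concurrent merge activity (the values are illustrative,
* not recommendations):
*
* <pre class="prettyprint">
* ConcurrentMergeScheduler cms = new ConcurrentMergeScheduler();
* cms.setMaxMergesAndThreads(4, 2); // maxMergeCount, maxThreadCount
* conf.setMergeScheduler(cms);
* </pre>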
*/
public IndexWriterConfig setMergeScheduler(MergeScheduler mergeScheduler) {
if (mergeScheduler == null) {
throw new IllegalArgumentException("mergeScheduler must not be null");
}
this.mergeScheduler = mergeScheduler;
return this;
}
@Override
public MergeScheduler getMergeScheduler() {
return mergeScheduler;
}
/**
* Set the {@link Codec}.
*
* <p>Only takes effect when IndexWriter is first created.
*/
public IndexWriterConfig setCodec(Codec codec) {
if (codec == null) {
throw new IllegalArgumentException("codec must not be null");
}
this.codec = codec;
return this;
}
@Override
public Codec getCodec() {
return codec;
}
@Override
public MergePolicy getMergePolicy() {
return mergePolicy;
}
/**
* Controls whether IndexWriter pools the SegmentReaders it must open for deletions and merging.
* Pooling is enabled by default (see {@link #DEFAULT_READER_POOLING}). NOTE: even if you set
* this to false, IndexWriter will still pool readers once a near-real-time reader has been
* obtained by calling {@link DirectoryReader#open(IndexWriter)}.
*
* <p>Only takes effect when IndexWriter is first created.
*/
public IndexWriterConfig setReaderPooling(boolean readerPooling) {
this.readerPooling = readerPooling;
return this;
}
@Override
public boolean getReaderPooling() {
return readerPooling;
}
/**
* Expert: Controls when segments are flushed to disk during indexing. The {@link FlushPolicy} is
* initialized during {@link IndexWriter} instantiation; once initialized, the given instance is
* bound to that {@link IndexWriter} and should not be used with another writer.
*
* @see #setMaxBufferedDocs(int)
* @see #setRAMBufferSizeMB(double)
*/
IndexWriterConfig setFlushPolicy(FlushPolicy flushPolicy) {
if (flushPolicy == null) {
throw new IllegalArgumentException("flushPolicy must not be null");
}
this.flushPolicy = flushPolicy;
return this;
}
/**
* Expert: Sets the maximum memory consumption per thread, triggering a forced flush if exceeded.
* A {@link DocumentsWriterPerThread} is forcefully flushed once it exceeds this limit, even if
* {@link #getRAMBufferSizeMB()} has not been exceeded. This is a safety limit that prevents a
* {@link DocumentsWriterPerThread} from exhausting its address space due to its internal 32-bit
* signed integer based memory addressing. The given value must be less than 2GB (2048MB).
*
* @see #DEFAULT_RAM_PER_THREAD_HARD_LIMIT_MB
*/
public IndexWriterConfig setRAMPerThreadHardLimitMB(int perThreadHardLimitMB) {
if (perThreadHardLimitMB <= 0 || perThreadHardLimitMB >= 2048) {
throw new IllegalArgumentException(
"PerThreadHardLimit must be greater than 0 and less than 2048MB");
}
this.perThreadHardLimitMB = perThreadHardLimitMB;
return this;
}
@Override
public int getRAMPerThreadHardLimitMB() {
return perThreadHardLimitMB;
}
@Override
FlushPolicy getFlushPolicy() {
return flushPolicy;
}
@Override
public InfoStream getInfoStream() {
return infoStream;
}
@Override
public Analyzer getAnalyzer() {
return super.getAnalyzer();
}
@Override
public int getMaxBufferedDocs() {
return super.getMaxBufferedDocs();
}
@Override
public IndexReaderWarmer getMergedSegmentWarmer() {
return super.getMergedSegmentWarmer();
}
@Override
public double getRAMBufferSizeMB() {
return super.getRAMBufferSizeMB();
}
/**
* Information about merges, deletes, and a message when maxFieldLength is reached will be printed
* to this stream. Must not be null; {@link InfoStream#NO_OUTPUT} may be used to suppress output.
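*
* <p>For example, a sketch that prints diagnostics to stdout:
*
* <pre class="prettyprint">
* conf.setInfoStream(new PrintStreamInfoStream(System.out));
* </pre>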
*/
public IndexWriterConfig setInfoStream(InfoStream infoStream) {
if (infoStream == null) {
throw new IllegalArgumentException(
"Cannot set InfoStream implementation to null. "
+ "To disable logging use InfoStream.NO_OUTPUT");
}
this.infoStream = infoStream;
return this;
}
/** Convenience method that uses {@link PrintStreamInfoStream}. Must not be null. */
public IndexWriterConfig setInfoStream(PrintStream printStream) {
if (printStream == null) {
throw new IllegalArgumentException("printStream must not be null");
}
return setInfoStream(new PrintStreamInfoStream(printStream));
}
@Override
public IndexWriterConfig setMergePolicy(MergePolicy mergePolicy) {
return (IndexWriterConfig) super.setMergePolicy(mergePolicy);
}
@Override
public IndexWriterConfig setMaxBufferedDocs(int maxBufferedDocs) {
return (IndexWriterConfig) super.setMaxBufferedDocs(maxBufferedDocs);
}
@Override
public IndexWriterConfig setMergedSegmentWarmer(IndexReaderWarmer mergeSegmentWarmer) {
return (IndexWriterConfig) super.setMergedSegmentWarmer(mergeSegmentWarmer);
}
@Override
public IndexWriterConfig setRAMBufferSizeMB(double ramBufferSizeMB) {
return (IndexWriterConfig) super.setRAMBufferSizeMB(ramBufferSizeMB);
}
@Override
public IndexWriterConfig setUseCompoundFile(boolean useCompoundFile) {
return (IndexWriterConfig) super.setUseCompoundFile(useCompoundFile);
}
/**
* Sets whether calls to {@link IndexWriter#close()} should first commit before closing. Use
* <code>true</code> to match the behavior of Lucene 4.x.
*/
public IndexWriterConfig setCommitOnClose(boolean commitOnClose) {
this.commitOnClose = commitOnClose;
return this;
}
/**
* Expert: sets the amount of time to wait for merges (during {@link IndexWriter#commit} or {@link
* IndexWriter#getReader(boolean, boolean)}) returned by MergePolicy.findFullFlushMerges(...). If
* this time is reached, we proceed with the commit based on segments merged up to that point. The
* merges are not aborted, and will still run to completion independent of the commit or getReader
* call, like natural segment merges. The default is <code>
* {@value IndexWriterConfig#DEFAULT_MAX_FULL_FLUSH_MERGE_WAIT_MILLIS}</code>.
*
* <p>Note: This setting has no effect unless {@link
* MergePolicy#findFullFlushMerges(MergeTrigger, SegmentInfos, MergePolicy.MergeContext)} has an
* implementation that actually returns merges; the default implementation returns none.
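*
* <p>For example, a sketch that waits up to half a second for such merges (the value is
* illustrative):
*
* <pre class="prettyprint">
* conf.setMaxFullFlushMergeWaitMillis(500);
* </pre>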
*/
public IndexWriterConfig setMaxFullFlushMergeWaitMillis(long maxFullFlushMergeWaitMillis) {
this.maxFullFlushMergeWaitMillis = maxFullFlushMergeWaitMillis;
return this;
}
/**
* Set the {@link Sort} order to use for all (flushed and merged) segments.
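*
* <p>For example, a sketch of sorting all segments by a long {@code timestamp} doc-values field
* in descending order (the field name is an assumption for illustration):
*
* <pre class="prettyprint">
* conf.setIndexSort(new Sort(new SortField("timestamp", SortField.Type.LONG, true)));
* </pre>
*/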
public IndexWriterConfig setIndexSort(Sort sort) {
for (SortField sortField : sort.getSort()) {
if (sortField.getIndexSorter() == null) {
throw new IllegalArgumentException("Cannot sort index with sort field " + sortField);
}
}
this.indexSort = sort;
this.indexSortFields =
Arrays.stream(sort.getSort()).map(SortField::getField).collect(Collectors.toSet());
return this;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder(super.toString());
sb.append("writer=").append(writer.get()).append("\n");
return sb.toString();
}
@Override
public IndexWriterConfig setCheckPendingFlushUpdate(boolean checkPendingFlushOnUpdate) {
return (IndexWriterConfig) super.setCheckPendingFlushUpdate(checkPendingFlushOnUpdate);
}
/**
* Sets the soft deletes field. A soft deletes field in Lucene is a doc-values field that marks a
* document as soft-deleted if the document has at least one value in that field. If a document is
* marked as soft-deleted, the document is treated as if it had been hard-deleted through the
* IndexWriter API ({@link IndexWriter#deleteDocuments(Term...)}). Merges will reclaim soft-deleted
* as well as hard-deleted documents, and index readers obtained from the IndexWriter will reflect
* all deleted documents in their live docs. If soft-deletes are used, documents must be indexed
* via {@link IndexWriter#softUpdateDocument(Term, Iterable, Field...)}. Deletes are applied via
* {@link IndexWriter#updateDocValues(Term, Field...)}.
*
* <p>Soft deletes allow documents to be retained across merges if the merge policy modifies the
* live docs of a merge reader. {@link SoftDeletesRetentionMergePolicy}, for instance, allows
* specifying an arbitrary query to mark all documents that should survive the merge. This can be
* used, for example, to keep all document modifications for a certain time interval, or the last
* N operations if some kind of sequence ID is available in the index.
*
* <p>Currently there is no API support to un-delete a soft-deleted document. In order to
* un-delete a document, it must be re-indexed using {@link IndexWriter#softUpdateDocument(Term,
* Iterable, Field...)}.
*
* <p>The default value for this is <code>null</code>, which disables soft-deletes. If
* soft-deletes are enabled, documents can still be hard-deleted. Hard-deleted documents won't be
* considered soft-deleted, even if they have a value in the soft-deletes field.
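*
* <p>For example, a minimal sketch, assuming an existing {@code writer} and document {@code doc}
* (the field name {@code "__soft_deletes"} and the {@code id} term are arbitrary choices made
* here for illustration):
*
* <pre class="prettyprint">
* IndexWriterConfig conf = new IndexWriterConfig(analyzer).setSoftDeletesField("__soft_deletes");
* // later, instead of a hard delete, mark the previous version of the document as soft-deleted:
* writer.softUpdateDocument(
*     new Term("id", "1"), doc, new NumericDocValuesField("__soft_deletes", 1));
* </pre>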
*
* @see #getSoftDeletesField()
*/
public IndexWriterConfig setSoftDeletesField(String softDeletesField) {
this.softDeletesField = softDeletesField;
return this;
}
/** Sets an event listener to record key events in IndexWriter. */
public IndexWriterConfig setIndexWriterEventListener(
final IndexWriterEventListener eventListener) {
this.eventListener = eventListener;
return this;
}
}