/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.PrintStream;
import java.util.Arrays;
import java.util.Comparator;
import java.util.stream.Collectors;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter.IndexReaderWarmer;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.InfoStream;
import org.apache.lucene.util.PrintStreamInfoStream;
import org.apache.lucene.util.SetOnce;
import org.apache.lucene.util.Version;
/**
* Holds all the configuration that is used to create an {@link IndexWriter}.
* Once {@link IndexWriter} has been created with this object, changes to this
* object will not affect the {@link IndexWriter} instance. For that, use
* {@link LiveIndexWriterConfig} that is returned from {@link IndexWriter#getConfig()}.
*
* <p>
* All setter methods return {@link IndexWriterConfig} to allow chaining
* settings conveniently, for example:
*
 * <pre class="prettyprint">
 * IndexWriterConfig conf = new IndexWriterConfig(analyzer)
 *     .setOpenMode(OpenMode.CREATE_OR_APPEND)
 *     .setRAMBufferSizeMB(256.0);
 * </pre>
*
* @see IndexWriter#getConfig()
*
* @since 3.1
*/
public final class IndexWriterConfig extends LiveIndexWriterConfig {
/**
* Specifies the open mode for {@link IndexWriter}.
*/
public enum OpenMode {
/**
* Creates a new index or overwrites an existing one.
*/
CREATE,
/**
* Opens an existing index.
*/
APPEND,
/**
* Creates a new index if one does not exist,
* otherwise it opens the index and documents will be appended.
*/
CREATE_OR_APPEND
}
/** Denotes that a flush trigger is disabled. */
public final static int DISABLE_AUTO_FLUSH = -1;
/** Disabled by default (because IndexWriter flushes by RAM usage by default). */
public final static int DEFAULT_MAX_BUFFERED_DELETE_TERMS = DISABLE_AUTO_FLUSH;
/** Disabled by default (because IndexWriter flushes by RAM usage by default). */
public final static int DEFAULT_MAX_BUFFERED_DOCS = DISABLE_AUTO_FLUSH;
/**
* Default value is 16 MB (which means flush when buffered docs consume
* approximately 16 MB RAM).
*/
public final static double DEFAULT_RAM_BUFFER_SIZE_MB = 16.0;
/** Default setting (true) for {@link #setReaderPooling}. */
// We changed this default to true with concurrent deletes/updates (LUCENE-7868),
// because we will otherwise need to open and close segment readers more frequently.
// False is still supported, but will have worse performance since readers will
// be forced to aggressively move all state to disk.
public final static boolean DEFAULT_READER_POOLING = true;
/** Default value is 1945. Change using {@link #setRAMPerThreadHardLimitMB(int)} */
public static final int DEFAULT_RAM_PER_THREAD_HARD_LIMIT_MB = 1945;
/** Default value for compound file system for newly written segments
* (set to <code>true</code>). For batch indexing with very large
 * RAM buffers, use <code>false</code>. */
public final static boolean DEFAULT_USE_COMPOUND_FILE_SYSTEM = true;
/** Default value for whether calls to {@link IndexWriter#close()} include a commit. */
public final static boolean DEFAULT_COMMIT_ON_CLOSE = true;
/** Default value for time to wait for merges on commit or getReader (when using a {@link MergePolicy} that implements {@link MergePolicy#findFullFlushMerges}). */
public static final long DEFAULT_MAX_FULL_FLUSH_MERGE_WAIT_MILLIS = 0;
// indicates whether this config instance is already attached to a writer.
// not final so that it can be cloned properly.
private SetOnce<IndexWriter> writer = new SetOnce<>();
/**
* Sets the {@link IndexWriter} this config is attached to.
*
 * @throws IllegalStateException
 *           if this config is already attached to a writer.
*/
IndexWriterConfig setIndexWriter(IndexWriter writer) {
if (this.writer.get() != null) {
throw new IllegalStateException("do not share IndexWriterConfig instances across IndexWriters");
}
this.writer.set(writer);
return this;
}
/**
* Creates a new config, using {@link StandardAnalyzer} as the
* analyzer. By default, {@link TieredMergePolicy} is used
 * for merging.
* Note that {@link TieredMergePolicy} is free to select
* non-contiguous merges, which means docIDs may not
* remain monotonic over time. If this is a problem you
* should switch to {@link LogByteSizeMergePolicy} or
* {@link LogDocMergePolicy}.
*/
public IndexWriterConfig() {
this(new StandardAnalyzer());
}
/**
 * Creates a new config with the provided {@link
 * Analyzer}. By default, {@link TieredMergePolicy} is used
 * for merging.
* Note that {@link TieredMergePolicy} is free to select
* non-contiguous merges, which means docIDs may not
* remain monotonic over time. If this is a problem you
* should switch to {@link LogByteSizeMergePolicy} or
* {@link LogDocMergePolicy}.
*/
public IndexWriterConfig(Analyzer analyzer) {
super(analyzer);
}
/** Specifies {@link OpenMode} of the index.
*
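 * <p>For example, a minimal sketch (assuming an existing {@link Analyzer}
 * named <code>analyzer</code>):
 *
 * <pre class="prettyprint">
 * IndexWriterConfig conf = new IndexWriterConfig(analyzer)
 *     .setOpenMode(OpenMode.CREATE_OR_APPEND);
 * </pre>
 *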
* <p>Only takes effect when IndexWriter is first created. */
public IndexWriterConfig setOpenMode(OpenMode openMode) {
if (openMode == null) {
throw new IllegalArgumentException("openMode must not be null");
}
this.openMode = openMode;
return this;
}
@Override
public OpenMode getOpenMode() {
return openMode;
}
/**
 * Expert: set the compatibility version to use for this index. If a new
 * index is created, it will use the given major version for compatibility.
 * It is sometimes useful to set the previous major version because
 * {@link IndexWriter#addIndexes} only accepts indices that have been
 * written with the same major version as the current index.
 * If the index already exists, then this value is ignored.
* Default value is the {@link Version#major major} of the
* {@link Version#LATEST latest version}.
* <p><b>NOTE</b>: Changing the creation version reduces backward
 * compatibility guarantees. For instance, an index created with Lucene 8 with
 * a compatibility version of 7 can't be read with Lucene 9, because Lucene
 * only supports reading indices created with the current or previous
 * major release.
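 *
 * <p>For example, a sketch that pins the compatibility version to the previous
 * major release, so that indices written with that release can still be added
 * via {@link IndexWriter#addIndexes} (assuming <code>analyzer</code> exists):
 *
 * <pre class="prettyprint">
 * IndexWriterConfig conf = new IndexWriterConfig(analyzer)
 *     .setIndexCreatedVersionMajor(Version.LATEST.major - 1);
 * </pre>
 *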
* @param indexCreatedVersionMajor the major version to use for compatibility
*/
public IndexWriterConfig setIndexCreatedVersionMajor(int indexCreatedVersionMajor) {
if (indexCreatedVersionMajor > Version.LATEST.major) {
throw new IllegalArgumentException("indexCreatedVersionMajor may not be in the future: current major version is " +
Version.LATEST.major + ", but got: " + indexCreatedVersionMajor);
}
if (indexCreatedVersionMajor < Version.LATEST.major - 1) {
throw new IllegalArgumentException("indexCreatedVersionMajor may not be less than the minimum supported version: " +
(Version.LATEST.major-1) + ", but got: " + indexCreatedVersionMajor);
}
this.createdVersionMajor = indexCreatedVersionMajor;
return this;
}
/**
* Expert: allows an optional {@link IndexDeletionPolicy} implementation to be
* specified. You can use this to control when prior commits are deleted from
* the index. The default policy is {@link KeepOnlyLastCommitDeletionPolicy}
* which removes all prior commits as soon as a new commit is done (this
* matches behavior before 2.2). Creating your own policy can allow you to
* explicitly keep previous "point in time" commits alive in the index for
* some time, to allow readers to refresh to the new commit without having the
* old commit deleted out from under them. This is necessary on filesystems
* like NFS that do not support "delete on last close" semantics, which
* Lucene's "point in time" search normally relies on.
* <p>
* <b>NOTE:</b> the deletion policy must not be null.
*
* <p>Only takes effect when IndexWriter is first created.
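 *
 * <p>For example, a sketch that keeps prior commits alive so they can be
 * snapshotted, using {@link SnapshotDeletionPolicy} (assuming
 * <code>analyzer</code> exists):
 *
 * <pre class="prettyprint">
 * SnapshotDeletionPolicy snapshotter =
 *     new SnapshotDeletionPolicy(new KeepOnlyLastCommitDeletionPolicy());
 * IndexWriterConfig conf = new IndexWriterConfig(analyzer)
 *     .setIndexDeletionPolicy(snapshotter);
 * </pre>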
*/
public IndexWriterConfig setIndexDeletionPolicy(IndexDeletionPolicy delPolicy) {
if (delPolicy == null) {
throw new IllegalArgumentException("indexDeletionPolicy must not be null");
}
this.delPolicy = delPolicy;
return this;
}
@Override
public IndexDeletionPolicy getIndexDeletionPolicy() {
return delPolicy;
}
/**
 * Expert: allows opening a certain commit point. The default is null which
* opens the latest commit point. This can also be used to open {@link IndexWriter}
* from a near-real-time reader, if you pass the reader's
* {@link DirectoryReader#getIndexCommit}.
*
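 * <p>For example, a sketch that starts a new writer from the commit point of a
 * near-real-time reader (assuming an open {@link DirectoryReader} named
 * <code>reader</code> and an analyzer <code>analyzer</code>):
 *
 * <pre class="prettyprint">
 * IndexWriterConfig conf = new IndexWriterConfig(analyzer)
 *     .setIndexCommit(reader.getIndexCommit());
 * </pre>
 *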
* <p>Only takes effect when IndexWriter is first created. */
public IndexWriterConfig setIndexCommit(IndexCommit commit) {
this.commit = commit;
return this;
}
@Override
public IndexCommit getIndexCommit() {
return commit;
}
/**
* Expert: set the {@link Similarity} implementation used by this IndexWriter.
* <p>
* <b>NOTE:</b> the similarity must not be null.
*
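 * <p>For example, a sketch that tunes BM25 parameters (assuming
 * <code>analyzer</code> exists):
 *
 * <pre class="prettyprint">
 * IndexWriterConfig conf = new IndexWriterConfig(analyzer)
 *     .setSimilarity(new BM25Similarity(1.2f, 0.75f));
 * </pre>
 *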
* <p>Only takes effect when IndexWriter is first created. */
public IndexWriterConfig setSimilarity(Similarity similarity) {
if (similarity == null) {
throw new IllegalArgumentException("similarity must not be null");
}
this.similarity = similarity;
return this;
}
@Override
public Similarity getSimilarity() {
return similarity;
}
/**
* Expert: sets the merge scheduler used by this writer. The default is
* {@link ConcurrentMergeScheduler}.
* <p>
* <b>NOTE:</b> the merge scheduler must not be null.
*
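 * <p>For example, a sketch that caps concurrent merge work (assuming
 * <code>analyzer</code> exists):
 *
 * <pre class="prettyprint">
 * ConcurrentMergeScheduler cms = new ConcurrentMergeScheduler();
 * cms.setMaxMergesAndThreads(4, 2); // at most 4 queued merges, 2 merge threads
 * IndexWriterConfig conf = new IndexWriterConfig(analyzer)
 *     .setMergeScheduler(cms);
 * </pre>
 *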
* <p>Only takes effect when IndexWriter is first created. */
public IndexWriterConfig setMergeScheduler(MergeScheduler mergeScheduler) {
if (mergeScheduler == null) {
throw new IllegalArgumentException("mergeScheduler must not be null");
}
this.mergeScheduler = mergeScheduler;
return this;
}
@Override
public MergeScheduler getMergeScheduler() {
return mergeScheduler;
}
/**
* Set the {@link Codec}.
*
* <p>
* Only takes effect when IndexWriter is first created.
*/
public IndexWriterConfig setCodec(Codec codec) {
if (codec == null) {
throw new IllegalArgumentException("codec must not be null");
}
this.codec = codec;
return this;
}
@Override
public Codec getCodec() {
return codec;
}
@Override
public MergePolicy getMergePolicy() {
return mergePolicy;
}
/** By default, IndexWriter pools the
 * SegmentReaders it must open for deletions and
 * merging (see {@link #DEFAULT_READER_POOLING}).
 * This method lets you disable that pooling.
 * NOTE: if you set this to
 * false, IndexWriter will still pool readers once
 * {@link DirectoryReader#open(IndexWriter)} is called.
*
* <p>Only takes effect when IndexWriter is first created. */
public IndexWriterConfig setReaderPooling(boolean readerPooling) {
this.readerPooling = readerPooling;
return this;
}
@Override
public boolean getReaderPooling() {
return readerPooling;
}
/**
* Expert: Controls when segments are flushed to disk during indexing.
 * The {@link FlushPolicy} is initialized during {@link IndexWriter} instantiation; once initialized,
 * the given instance is bound to this {@link IndexWriter} and should not be used with another writer.
* @see #setMaxBufferedDocs(int)
* @see #setRAMBufferSizeMB(double)
*/
IndexWriterConfig setFlushPolicy(FlushPolicy flushPolicy) {
if (flushPolicy == null) {
throw new IllegalArgumentException("flushPolicy must not be null");
}
this.flushPolicy = flushPolicy;
return this;
}
/**
* Expert: Sets the maximum memory consumption per thread triggering a forced
* flush if exceeded. A {@link DocumentsWriterPerThread} is forcefully flushed
* once it exceeds this limit even if the {@link #getRAMBufferSizeMB()} has
 * not been exceeded. This is a safety limit to prevent a
 * {@link DocumentsWriterPerThread} from exhausting its address space, since it
 * uses 32-bit signed integers for internal memory addressing.
 * The given value must be less than 2GB (2048MB).
*
* @see #DEFAULT_RAM_PER_THREAD_HARD_LIMIT_MB
*/
public IndexWriterConfig setRAMPerThreadHardLimitMB(int perThreadHardLimitMB) {
if (perThreadHardLimitMB <= 0 || perThreadHardLimitMB >= 2048) {
throw new IllegalArgumentException("PerThreadHardLimit must be greater than 0 and less than 2048MB");
}
this.perThreadHardLimitMB = perThreadHardLimitMB;
return this;
}
@Override
public int getRAMPerThreadHardLimitMB() {
return perThreadHardLimitMB;
}
@Override
FlushPolicy getFlushPolicy() {
return flushPolicy;
}
@Override
public InfoStream getInfoStream() {
return infoStream;
}
@Override
public Analyzer getAnalyzer() {
return super.getAnalyzer();
}
@Override
public int getMaxBufferedDocs() {
return super.getMaxBufferedDocs();
}
@Override
public IndexReaderWarmer getMergedSegmentWarmer() {
return super.getMergedSegmentWarmer();
}
@Override
public double getRAMBufferSizeMB() {
return super.getRAMBufferSizeMB();
}
/**
 * Information about merges, deletes and flushes will be printed
 * to this InfoStream. Must not be null, but {@link InfoStream#NO_OUTPUT}
 * may be used to suppress output.
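 *
 * <p>For example, to echo diagnostics to the console while debugging (assuming
 * an {@code IndexWriterConfig} named <code>conf</code>):
 *
 * <pre class="prettyprint">
 * conf.setInfoStream(new PrintStreamInfoStream(System.out));
 * </pre>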
*/
public IndexWriterConfig setInfoStream(InfoStream infoStream) {
if (infoStream == null) {
throw new IllegalArgumentException("Cannot set InfoStream implementation to null. "+
"To disable logging use InfoStream.NO_OUTPUT");
}
this.infoStream = infoStream;
return this;
}
/**
* Convenience method that uses {@link PrintStreamInfoStream}. Must not be null.
*/
public IndexWriterConfig setInfoStream(PrintStream printStream) {
if (printStream == null) {
throw new IllegalArgumentException("printStream must not be null");
}
return setInfoStream(new PrintStreamInfoStream(printStream));
}
@Override
public IndexWriterConfig setMergePolicy(MergePolicy mergePolicy) {
return (IndexWriterConfig) super.setMergePolicy(mergePolicy);
}
@Override
public IndexWriterConfig setMaxBufferedDocs(int maxBufferedDocs) {
return (IndexWriterConfig) super.setMaxBufferedDocs(maxBufferedDocs);
}
@Override
public IndexWriterConfig setMergedSegmentWarmer(IndexReaderWarmer mergeSegmentWarmer) {
return (IndexWriterConfig) super.setMergedSegmentWarmer(mergeSegmentWarmer);
}
@Override
public IndexWriterConfig setRAMBufferSizeMB(double ramBufferSizeMB) {
return (IndexWriterConfig) super.setRAMBufferSizeMB(ramBufferSizeMB);
}
@Override
public IndexWriterConfig setUseCompoundFile(boolean useCompoundFile) {
return (IndexWriterConfig) super.setUseCompoundFile(useCompoundFile);
}
/**
 * Sets whether calls to {@link IndexWriter#close()} should first commit
 * before closing. Use <code>true</code> to match the behavior of Lucene 4.x.
*/
public IndexWriterConfig setCommitOnClose(boolean commitOnClose) {
this.commitOnClose = commitOnClose;
return this;
}
/**
* Expert: sets the amount of time to wait for merges (during {@link IndexWriter#commit}
* or {@link IndexWriter#getReader(boolean, boolean)}) returned by
* MergePolicy.findFullFlushMerges(...).
* If this time is reached, we proceed with the commit based on segments merged up to that point.
* The merges are not aborted, and will still run to completion independent of the commit or getReader call,
* like natural segment merges. The default is <code>{@value IndexWriterConfig#DEFAULT_MAX_FULL_FLUSH_MERGE_WAIT_MILLIS}</code>.
*
 * Note: This setting has no effect unless {@link MergePolicy#findFullFlushMerges(MergeTrigger, SegmentInfos, MergePolicy.MergeContext)}
 * is overridden to actually return merges; the default implementation returns none.
*/
public IndexWriterConfig setMaxFullFlushMergeWaitMillis(long maxFullFlushMergeWaitMillis) {
this.maxFullFlushMergeWaitMillis = maxFullFlushMergeWaitMillis;
return this;
}
/**
* Set the {@link Sort} order to use for all (flushed and merged) segments.
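 *
 * <p>For example, a sketch that sorts all segments by a numeric
 * <code>timestamp</code> doc-values field, newest first (the field name is
 * illustrative; assumes an {@code IndexWriterConfig} named <code>conf</code>):
 *
 * <pre class="prettyprint">
 * conf.setIndexSort(new Sort(new SortField("timestamp", SortField.Type.LONG, true)));
 * </pre>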
*/
public IndexWriterConfig setIndexSort(Sort sort) {
for (SortField sortField : sort.getSort()) {
if (sortField.getIndexSorter() == null) {
throw new IllegalArgumentException("Cannot sort index with sort field " + sortField);
}
}
this.indexSort = sort;
this.indexSortFields = Arrays.stream(sort.getSort()).map(SortField::getField).collect(Collectors.toSet());
return this;
}
/**
 * Set the comparator for sorting leaf readers. A DirectoryReader opened from an IndexWriter with
* this configuration will have its leaf readers sorted with the provided leaf sorter.
*
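 * <p>For example, a sketch that orders leaves from largest to smallest by
 * document count (assuming an {@code IndexWriterConfig} named <code>conf</code>):
 *
 * <pre class="prettyprint">
 * conf.setLeafSorter(Comparator.comparingInt(LeafReader::maxDoc).reversed());
 * </pre>
 *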
 * @param leafSorter a comparator for sorting leaf readers
 * @return this IndexWriterConfig with the leaf sorter set
*/
public IndexWriterConfig setLeafSorter(Comparator<LeafReader> leafSorter) {
this.leafSorter = leafSorter;
return this;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder(super.toString());
sb.append("writer=").append(writer.get()).append("\n");
return sb.toString();
}
@Override
public IndexWriterConfig setCheckPendingFlushUpdate(boolean checkPendingFlushOnUpdate) {
return (IndexWriterConfig) super.setCheckPendingFlushUpdate(checkPendingFlushOnUpdate);
}
/**
 * Sets the soft-deletes field. A soft-deletes field in Lucene is a doc-values field that marks a document as soft-deleted if the
 * document has at least one value in that field. If a document is marked as soft-deleted, the document is treated as
 * if it has been hard-deleted through the IndexWriter API ({@link IndexWriter#deleteDocuments(Term...)}).
 * Merges will reclaim soft-deleted as well as hard-deleted documents, and index readers obtained from the IndexWriter
 * will reflect all deleted documents in their live docs. If soft-deletes are used, documents must be indexed via
 * {@link IndexWriter#softUpdateDocument(Term, Iterable, Field...)}. Deletes are applied via
 * {@link IndexWriter#updateDocValues(Term, Field...)}.
 *
 * Soft deletes allow documents to be retained across merges if the merge policy modifies the live docs of a merge reader.
 * {@link SoftDeletesRetentionMergePolicy}, for instance, allows specifying an arbitrary query to mark all documents
 * that should survive the merge. This can be used, for example, to keep all document modifications for a certain time
 * interval, or the last N operations if some kind of sequence ID is available in the index.
 *
 * Currently there is no API support to un-delete a soft-deleted document. In order to un-delete it, the document must be
 * re-indexed using {@link IndexWriter#softUpdateDocument(Term, Iterable, Field...)}.
 *
 * The default value for this is <code>null</code>, which disables soft-deletes. If soft-deletes are enabled, documents
 * can still be hard-deleted. Hard-deleted documents won't be considered soft-deleted even if they have
 * a value in the soft-deletes field.
*
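 * <p>For example, a minimal sketch (assuming <code>dir</code>, <code>doc</code>, and
 * <code>analyzer</code> exist; the field name is illustrative):
 *
 * <pre class="prettyprint">
 * IndexWriterConfig conf = new IndexWriterConfig(analyzer)
 *     .setSoftDeletesField("__soft_deletes");
 * IndexWriter writer = new IndexWriter(dir, conf);
 * // soft-delete the previous version of the document instead of hard-deleting it
 * writer.softUpdateDocument(new Term("id", "1"), doc,
 *     new NumericDocValuesField("__soft_deletes", 1));
 * </pre>
 *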
* @see #getSoftDeletesField()
*/
public IndexWriterConfig setSoftDeletesField(String softDeletesField) {
this.softDeletesField = softDeletesField;
return this;
}
}