| package org.apache.lucene.index; |
| |
| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.index.DocumentsWriter.IndexingChain; |
| import org.apache.lucene.index.IndexWriter.IndexReaderWarmer; |
| import org.apache.lucene.search.Similarity; |
| import org.apache.lucene.util.Version; |
| |
| /** |
 * Holds all the configuration of {@link IndexWriter}. This object is only used
 * while constructing a new IndexWriter. The settings cannot be changed
 * afterwards; to change them, a new IndexWriter must be instantiated.
 * <p>
 * All setter methods return {@link IndexWriterConfig} to allow chaining
 * settings conveniently, so one can write:
 *
 * <pre>
 * IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_31, analyzer);
 * conf.setter1().setter2();
 * </pre>
| * |
| * @since 3.1 |
| */ |
| public final class IndexWriterConfig implements Cloneable { |
| |
/**
 * Denotes a field length without limit.
 *
 * @see #setMaxFieldLength(int)
 */
public static final int UNLIMITED_FIELD_LENGTH = Integer.MAX_VALUE;
| |
| /** |
| * Specifies the open mode for {@link IndexWriter}: |
 * <ul>
 * <li>{@link #CREATE} - creates a new index or overwrites an existing one.</li>
 * <li>{@link #CREATE_OR_APPEND} - creates a new index if one does not exist,
 * otherwise it opens the index and documents will be appended.</li>
 * <li>{@link #APPEND} - opens an existing index.</li>
| * </ul> |
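 * <p>
 * For example, to always create a fresh index (an illustrative sketch,
 * assuming an existing <code>analyzer</code>):
 * <pre>
 * IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_31, analyzer);
 * conf.setOpenMode(OpenMode.CREATE);
 * </pre>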
| */ |
| public static enum OpenMode { CREATE, APPEND, CREATE_OR_APPEND } |
| |
| /** Default value is 128. Change using {@link #setTermIndexInterval(int)}. */ |
| public static final int DEFAULT_TERM_INDEX_INTERVAL = 128; |
| |
| /** Denotes a flush trigger is disabled. */ |
| public final static int DISABLE_AUTO_FLUSH = -1; |
| |
| /** Disabled by default (because IndexWriter flushes by RAM usage by default). */ |
| public final static int DEFAULT_MAX_BUFFERED_DELETE_TERMS = DISABLE_AUTO_FLUSH; |
| |
| /** Disabled by default (because IndexWriter flushes by RAM usage by default). */ |
| public final static int DEFAULT_MAX_BUFFERED_DOCS = DISABLE_AUTO_FLUSH; |
| |
| /** |
| * Default value is 16 MB (which means flush when buffered docs consume |
| * approximately 16 MB RAM). |
| */ |
| public final static double DEFAULT_RAM_BUFFER_SIZE_MB = 16.0; |
| |
| /** |
| * Default value for the write lock timeout (1,000 ms). |
| * |
| * @see #setDefaultWriteLockTimeout(long) |
| */ |
| public static long WRITE_LOCK_TIMEOUT = 1000; |
| |
| /** The maximum number of simultaneous threads that may be |
| * indexing documents at once in IndexWriter; if more |
| * than this many threads arrive they will wait for |
| * others to finish. */ |
| public final static int DEFAULT_MAX_THREAD_STATES = 8; |
| |
| /** |
| * Sets the default (for any instance) maximum time to wait for a write lock |
| * (in milliseconds). |
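 * <p>
 * For example (an illustrative call; only configs created afterwards are
 * affected, since each config reads the default at construction time):
 * <pre>
 * IndexWriterConfig.setDefaultWriteLockTimeout(2000); // 2 seconds
 * </pre>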
| */ |
| public static void setDefaultWriteLockTimeout(long writeLockTimeout) { |
| WRITE_LOCK_TIMEOUT = writeLockTimeout; |
| } |
| |
| /** |
| * Returns the default write lock timeout for newly instantiated |
| * IndexWriterConfigs. |
| * |
| * @see #setDefaultWriteLockTimeout(long) |
| */ |
| public static long getDefaultWriteLockTimeout() { |
| return WRITE_LOCK_TIMEOUT; |
| } |
| |
| private Analyzer analyzer; |
| private IndexDeletionPolicy delPolicy; |
| private IndexCommit commit; |
| private OpenMode openMode; |
| private int maxFieldLength; |
| private Similarity similarity; |
| private int termIndexInterval; |
| private MergeScheduler mergeScheduler; |
| private long writeLockTimeout; |
| private int maxBufferedDeleteTerms; |
| private double ramBufferSizeMB; |
| private int maxBufferedDocs; |
| private IndexingChain indexingChain; |
| private IndexReaderWarmer mergedSegmentWarmer; |
| private MergePolicy mergePolicy; |
| private int maxThreadStates; |
| |
| // required for clone |
| private Version matchVersion; |
| |
| /** |
 * Creates a new config, using the provided {@link Analyzer}, with defaults
 * that match the specified {@link Version}. {@link Version} is a placeholder
 * for future changes. The default settings are relevant to 3.1 and before.
 * In the future, if different settings apply to different versions, they
 * will be documented here.
| */ |
| public IndexWriterConfig(Version matchVersion, Analyzer analyzer) { |
| this.matchVersion = matchVersion; |
| this.analyzer = analyzer; |
| delPolicy = new KeepOnlyLastCommitDeletionPolicy(); |
| commit = null; |
| openMode = OpenMode.CREATE_OR_APPEND; |
| maxFieldLength = UNLIMITED_FIELD_LENGTH; |
| similarity = Similarity.getDefault(); |
| termIndexInterval = DEFAULT_TERM_INDEX_INTERVAL; |
| mergeScheduler = new ConcurrentMergeScheduler(); |
| writeLockTimeout = WRITE_LOCK_TIMEOUT; |
| maxBufferedDeleteTerms = DEFAULT_MAX_BUFFERED_DELETE_TERMS; |
| ramBufferSizeMB = DEFAULT_RAM_BUFFER_SIZE_MB; |
| maxBufferedDocs = DEFAULT_MAX_BUFFERED_DOCS; |
| indexingChain = DocumentsWriter.defaultIndexingChain; |
| mergedSegmentWarmer = null; |
| mergePolicy = new LogByteSizeMergePolicy(); |
| maxThreadStates = DEFAULT_MAX_THREAD_STATES; |
| } |
| |
| @Override |
| public Object clone() { |
// Shallow clone is the only thing that's possible, since parameters like
// analyzer, index commit etc. do not implement Cloneable.
| try { |
| return super.clone(); |
| } catch (CloneNotSupportedException e) { |
| // should not happen |
| throw new RuntimeException(e); |
| } |
| } |
| |
| /** Returns the default analyzer to use for indexing documents. */ |
| public Analyzer getAnalyzer() { |
| return analyzer; |
| } |
| |
/** Specifies the {@link OpenMode} of the index. */
| public IndexWriterConfig setOpenMode(OpenMode openMode) { |
| this.openMode = openMode; |
| return this; |
| } |
| |
| /** Returns the {@link OpenMode} set by {@link #setOpenMode(OpenMode)}. */ |
| public OpenMode getOpenMode() { |
| return openMode; |
| } |
| |
| /** |
| * Expert: allows an optional {@link IndexDeletionPolicy} implementation to be |
| * specified. You can use this to control when prior commits are deleted from |
| * the index. The default policy is {@link KeepOnlyLastCommitDeletionPolicy} |
| * which removes all prior commits as soon as a new commit is done (this |
| * matches behavior before 2.2). Creating your own policy can allow you to |
| * explicitly keep previous "point in time" commits alive in the index for |
| * some time, to allow readers to refresh to the new commit without having the |
| * old commit deleted out from under them. This is necessary on filesystems |
| * like NFS that do not support "delete on last close" semantics, which |
| * Lucene's "point in time" search normally relies on. |
| * <p> |
| * <b>NOTE:</b> the deletion policy cannot be null. If <code>null</code> is |
| * passed, the deletion policy will be set to the default. |
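 * <p>
 * A minimal sketch of a custom policy that keeps the two most recent commits
 * alive (illustrative only; assumes <code>java.util.List</code> is imported):
 * <pre>
 * class KeepLastTwoCommitsDeletionPolicy implements IndexDeletionPolicy {
 *   public void onInit(List&lt;? extends IndexCommit&gt; commits) {
 *     onCommit(commits);
 *   }
 *   public void onCommit(List&lt;? extends IndexCommit&gt; commits) {
 *     // commits are sorted by age; the 0th entry is the oldest commit
 *     for (int i = 0; i &lt; commits.size() - 2; i++) {
 *       commits.get(i).delete();
 *     }
 *   }
 * }
 * </pre>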
| */ |
| public IndexWriterConfig setIndexDeletionPolicy(IndexDeletionPolicy delPolicy) { |
| this.delPolicy = delPolicy == null ? new KeepOnlyLastCommitDeletionPolicy() : delPolicy; |
| return this; |
| } |
| |
| /** |
| * Returns the {@link IndexDeletionPolicy} specified in |
| * {@link #setIndexDeletionPolicy(IndexDeletionPolicy)} or the default |
 * {@link KeepOnlyLastCommitDeletionPolicy}.
| */ |
| public IndexDeletionPolicy getIndexDeletionPolicy() { |
| return delPolicy; |
| } |
| |
| /** |
| * The maximum number of terms that will be indexed for a single field in a |
| * document. This limits the amount of memory required for indexing, so that |
| * collections with very large files will not crash the indexing process by |
| * running out of memory. This setting refers to the number of running terms, |
| * not to the number of different terms. |
| * <p> |
| * <b>NOTE:</b> this silently truncates large documents, excluding from the |
| * index all terms that occur further in the document. If you know your source |
 * documents are large, be sure to set this value high enough to accommodate
| * the expected size. If you set it to {@link #UNLIMITED_FIELD_LENGTH}, then |
| * the only limit is your memory, but you should anticipate an |
| * OutOfMemoryError. |
| * <p> |
| * By default it is set to {@link #UNLIMITED_FIELD_LENGTH}. |
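 * <p>
 * For example, to index at most the first 10,000 terms per field (an
 * illustrative value):
 * <pre>
 * conf.setMaxFieldLength(10000);
 * </pre>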
| */ |
| public IndexWriterConfig setMaxFieldLength(int maxFieldLength) { |
| this.maxFieldLength = maxFieldLength; |
| return this; |
| } |
| |
| /** |
| * Returns the maximum number of terms that will be indexed for a single field |
| * in a document. |
| * |
| * @see #setMaxFieldLength(int) |
| */ |
| public int getMaxFieldLength() { |
| return maxFieldLength; |
| } |
| |
| /** |
 * Expert: allows opening a certain commit point. The default is null, which
| * opens the latest commit point. |
| */ |
| public IndexWriterConfig setIndexCommit(IndexCommit commit) { |
| this.commit = commit; |
| return this; |
| } |
| |
| /** |
| * Returns the {@link IndexCommit} as specified in |
| * {@link #setIndexCommit(IndexCommit)} or the default, <code>null</code> |
| * which specifies to open the latest index commit point. |
| */ |
| public IndexCommit getIndexCommit() { |
| return commit; |
| } |
| |
| /** |
| * Expert: set the {@link Similarity} implementation used by this IndexWriter. |
| * <p> |
| * <b>NOTE:</b> the similarity cannot be null. If <code>null</code> is passed, |
| * the similarity will be set to the default. |
| * |
| * @see Similarity#setDefault(Similarity) |
| */ |
| public IndexWriterConfig setSimilarity(Similarity similarity) { |
| this.similarity = similarity == null ? Similarity.getDefault() : similarity; |
| return this; |
| } |
| |
| /** |
| * Expert: returns the {@link Similarity} implementation used by this |
| * IndexWriter. This defaults to the current value of |
| * {@link Similarity#getDefault()}. |
| */ |
| public Similarity getSimilarity() { |
| return similarity; |
| } |
| |
| /** |
| * Expert: set the interval between indexed terms. Large values cause less |
| * memory to be used by IndexReader, but slow random-access to terms. Small |
| * values cause more memory to be used by an IndexReader, and speed |
| * random-access to terms. |
| * <p> |
| * This parameter determines the amount of computation required per query |
| * term, regardless of the number of documents that contain that term. In |
| * particular, it is the maximum number of other terms that must be scanned |
| * before a term is located and its frequency and position information may be |
| * processed. In a large index with user-entered query terms, query processing |
| * time is likely to be dominated not by term lookup but rather by the |
| * processing of frequency and positional data. In a small index or when many |
| * uncommon query terms are generated (e.g., by wildcard queries) term lookup |
| * may become a dominant cost. |
| * <p> |
| * In particular, <code>numUniqueTerms/interval</code> terms are read into |
| * memory by an IndexReader, and, on average, <code>interval/2</code> terms |
| * must be scanned for each random term access. |
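 * <p>
 * For example, with 1,000,000 unique terms and the default interval of 128,
 * an IndexReader holds roughly 1,000,000/128, or about 7,800 terms in
 * memory, and each random term access scans about 64 terms on average.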
| * |
| * @see #DEFAULT_TERM_INDEX_INTERVAL |
| */ |
| public IndexWriterConfig setTermIndexInterval(int interval) { |
| this.termIndexInterval = interval; |
| return this; |
| } |
| |
| /** |
| * Returns the interval between indexed terms. |
| * |
| * @see #setTermIndexInterval(int) |
| */ |
| public int getTermIndexInterval() { |
| return termIndexInterval; |
| } |
| |
| /** |
| * Expert: sets the merge scheduler used by this writer. The default is |
| * {@link ConcurrentMergeScheduler}. |
| * <p> |
| * <b>NOTE:</b> the merge scheduler cannot be null. If <code>null</code> is |
| * passed, the merge scheduler will be set to the default. |
| */ |
| public IndexWriterConfig setMergeScheduler(MergeScheduler mergeScheduler) { |
| this.mergeScheduler = mergeScheduler == null ? new ConcurrentMergeScheduler() : mergeScheduler; |
| return this; |
| } |
| |
| /** |
| * Returns the {@link MergeScheduler} that was set by |
| * {@link #setMergeScheduler(MergeScheduler)} |
| */ |
| public MergeScheduler getMergeScheduler() { |
| return mergeScheduler; |
| } |
| |
| /** |
| * Sets the maximum time to wait for a write lock (in milliseconds) for this |
| * instance. You can change the default value for all instances by calling |
| * {@link #setDefaultWriteLockTimeout(long)}. |
| */ |
| public IndexWriterConfig setWriteLockTimeout(long writeLockTimeout) { |
| this.writeLockTimeout = writeLockTimeout; |
| return this; |
| } |
| |
| /** |
| * Returns allowed timeout when acquiring the write lock. |
| * |
| * @see #setWriteLockTimeout(long) |
| */ |
| public long getWriteLockTimeout() { |
| return writeLockTimeout; |
| } |
| |
| /** |
| * Determines the minimal number of delete terms required before the buffered |
| * in-memory delete terms are applied and flushed. If there are documents |
| * buffered in memory at the time, they are merged and a new segment is |
 * created.
 *
 * <p>Disabled by default (writer flushes by RAM usage).
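 * <p>
 * For example, to apply and flush buffered deletes after every 1,000 delete
 * terms (an illustrative value):
 * <pre>
 * conf.setMaxBufferedDeleteTerms(1000);
 * </pre>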
| * |
| * @throws IllegalArgumentException if maxBufferedDeleteTerms |
| * is enabled but smaller than 1 |
| * @see #setRAMBufferSizeMB |
| */ |
| public IndexWriterConfig setMaxBufferedDeleteTerms(int maxBufferedDeleteTerms) { |
| if (maxBufferedDeleteTerms != DISABLE_AUTO_FLUSH |
| && maxBufferedDeleteTerms < 1) |
| throw new IllegalArgumentException( |
| "maxBufferedDeleteTerms must at least be 1 when enabled"); |
| this.maxBufferedDeleteTerms = maxBufferedDeleteTerms; |
| return this; |
| } |
| |
| /** |
| * Returns the number of buffered deleted terms that will trigger a flush if |
| * enabled. |
| * |
| * @see #setMaxBufferedDeleteTerms(int) |
| */ |
| public int getMaxBufferedDeleteTerms() { |
| return maxBufferedDeleteTerms; |
| } |
| |
| /** |
| * Determines the amount of RAM that may be used for buffering added documents |
| * and deletions before they are flushed to the Directory. Generally for |
| * faster indexing performance it's best to flush by RAM usage instead of |
| * document count and use as large a RAM buffer as you can. |
| * |
| * <p> |
| * When this is set, the writer will flush whenever buffered documents and |
| * deletions use this much RAM. Pass in {@link #DISABLE_AUTO_FLUSH} to prevent |
| * triggering a flush due to RAM usage. Note that if flushing by document |
| * count is also enabled, then the flush will be triggered by whichever comes |
| * first. |
| * |
| * <p> |
 * <b>NOTE</b>: the accounting of RAM usage for pending deletions is only
| * approximate. Specifically, if you delete by Query, Lucene currently has no |
| * way to measure the RAM usage of individual Queries so the accounting will |
| * under-estimate and you should compensate by either calling commit() |
| * periodically yourself, or by using {@link #setMaxBufferedDeleteTerms(int)} |
| * to flush by count instead of RAM usage (each buffered delete Query counts |
| * as one). |
| * |
| * <p> |
| * <b>NOTE</b>: because IndexWriter uses <code>int</code>s when managing its |
| * internal storage, the absolute maximum value for this setting is somewhat |
| * less than 2048 MB. The precise limit depends on various factors, such as |
| * how large your documents are, how many fields have norms, etc., so it's |
| * best to set this value comfortably under 2048. |
| * |
| * <p> |
| * The default value is {@link #DEFAULT_RAM_BUFFER_SIZE_MB}. |
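 * <p>
 * For example, to flush by RAM usage only, with a 48 MB buffer (illustrative
 * values, assuming a config <code>conf</code>):
 * <pre>
 * conf.setRAMBufferSizeMB(48.0).setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH);
 * </pre>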
| * |
| * @throws IllegalArgumentException |
 *           if ramBufferSizeMB is enabled but non-positive, or if it disables
 *           ramBufferSizeMB when maxBufferedDocs is already disabled
| */ |
| public IndexWriterConfig setRAMBufferSizeMB(double ramBufferSizeMB) { |
| if (ramBufferSizeMB > 2048.0) { |
| throw new IllegalArgumentException("ramBufferSize " + ramBufferSizeMB |
| + " is too large; should be comfortably less than 2048"); |
| } |
| if (ramBufferSizeMB != DISABLE_AUTO_FLUSH && ramBufferSizeMB <= 0.0) |
| throw new IllegalArgumentException( |
| "ramBufferSize should be > 0.0 MB when enabled"); |
| if (ramBufferSizeMB == DISABLE_AUTO_FLUSH && maxBufferedDocs == DISABLE_AUTO_FLUSH) |
| throw new IllegalArgumentException( |
| "at least one of ramBufferSize and maxBufferedDocs must be enabled"); |
| this.ramBufferSizeMB = ramBufferSizeMB; |
| return this; |
| } |
| |
| /** Returns the value set by {@link #setRAMBufferSizeMB(double)} if enabled. */ |
| public double getRAMBufferSizeMB() { |
| return ramBufferSizeMB; |
| } |
| |
| /** |
| * Determines the minimal number of documents required before the buffered |
| * in-memory documents are flushed as a new Segment. Large values generally |
| * give faster indexing. |
| * |
| * <p> |
| * When this is set, the writer will flush every maxBufferedDocs added |
| * documents. Pass in {@link #DISABLE_AUTO_FLUSH} to prevent triggering a |
| * flush due to number of buffered documents. Note that if flushing by RAM |
| * usage is also enabled, then the flush will be triggered by whichever comes |
| * first. |
| * |
| * <p> |
| * Disabled by default (writer flushes by RAM usage). |
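 * <p>
 * For example, to flush every 1,000 added documents instead of by RAM usage
 * (an illustrative value):
 * <pre>
 * conf.setMaxBufferedDocs(1000).setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH);
 * </pre>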
| * |
| * @see #setRAMBufferSizeMB(double) |
| * |
| * @throws IllegalArgumentException |
 *           if maxBufferedDocs is enabled but smaller than 2, or if it disables
 *           maxBufferedDocs when ramBufferSizeMB is already disabled
| */ |
| public IndexWriterConfig setMaxBufferedDocs(int maxBufferedDocs) { |
| if (maxBufferedDocs != DISABLE_AUTO_FLUSH && maxBufferedDocs < 2) |
| throw new IllegalArgumentException( |
| "maxBufferedDocs must at least be 2 when enabled"); |
| if (maxBufferedDocs == DISABLE_AUTO_FLUSH |
| && ramBufferSizeMB == DISABLE_AUTO_FLUSH) |
| throw new IllegalArgumentException( |
| "at least one of ramBufferSize and maxBufferedDocs must be enabled"); |
| this.maxBufferedDocs = maxBufferedDocs; |
| return this; |
| } |
| |
| /** |
| * Returns the number of buffered added documents that will trigger a flush if |
| * enabled. |
| * |
| * @see #setMaxBufferedDocs(int) |
| */ |
| public int getMaxBufferedDocs() { |
| return maxBufferedDocs; |
| } |
| |
| /** Set the merged segment warmer. See {@link IndexReaderWarmer}. */ |
public IndexWriterConfig setMergedSegmentWarmer(IndexReaderWarmer mergedSegmentWarmer) {
  this.mergedSegmentWarmer = mergedSegmentWarmer;
| return this; |
| } |
| |
| /** Returns the current merged segment warmer. See {@link IndexReaderWarmer}. */ |
| public IndexReaderWarmer getMergedSegmentWarmer() { |
| return mergedSegmentWarmer; |
| } |
| |
| /** |
| * Expert: {@link MergePolicy} is invoked whenever there are changes to the |
| * segments in the index. Its role is to select which merges to do, if any, |
| * and return a {@link MergePolicy.MergeSpecification} describing the merges. |
 * It also selects merges to do for optimize(). The default is
 * {@link LogByteSizeMergePolicy}.
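 * <p>
 * For example, to merge segments by document count rather than byte size
 * (an illustrative choice):
 * <pre>
 * conf.setMergePolicy(new LogDocMergePolicy());
 * </pre>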
| */ |
| public IndexWriterConfig setMergePolicy(MergePolicy mergePolicy) { |
| this.mergePolicy = mergePolicy == null ? new LogByteSizeMergePolicy() : mergePolicy; |
| return this; |
| } |
| |
| /** |
| * Sets the max number of simultaneous threads that may be indexing documents |
 * at once in IndexWriter. Values &lt; 1 are invalid and if passed
| * <code>maxThreadStates</code> will be set to |
| * {@link #DEFAULT_MAX_THREAD_STATES}. |
| */ |
| public IndexWriterConfig setMaxThreadStates(int maxThreadStates) { |
| this.maxThreadStates = maxThreadStates < 1 ? DEFAULT_MAX_THREAD_STATES : maxThreadStates; |
| return this; |
| } |
| |
| /** Returns the max number of simultaneous threads that |
| * may be indexing documents at once in IndexWriter. */ |
| public int getMaxThreadStates() { |
| return maxThreadStates; |
| } |
| |
| /** |
| * Returns the current MergePolicy in use by this writer. |
| * |
| * @see #setMergePolicy(MergePolicy) |
| */ |
| public MergePolicy getMergePolicy() { |
| return mergePolicy; |
| } |
| |
| /** Expert: sets the {@link DocConsumer} chain to be used to process documents. */ |
| IndexWriterConfig setIndexingChain(IndexingChain indexingChain) { |
| this.indexingChain = indexingChain == null ? DocumentsWriter.defaultIndexingChain : indexingChain; |
| return this; |
| } |
| |
/** Returns the indexing chain set by {@link #setIndexingChain(IndexingChain)}. */
| IndexingChain getIndexingChain() { |
| return indexingChain; |
| } |
| |
| @Override |
| public String toString() { |
| StringBuilder sb = new StringBuilder(); |
| sb.append("matchVersion=").append(matchVersion).append("\n"); |
| sb.append("analyzer=").append(analyzer.getClass().getName()).append("\n"); |
| sb.append("delPolicy=").append(delPolicy.getClass().getName()).append("\n"); |
| sb.append("commit=").append(commit == null ? "null" : commit.getClass().getName()).append("\n"); |
| sb.append("openMode=").append(openMode).append("\n"); |
| sb.append("maxFieldLength=").append(maxFieldLength).append("\n"); |
| sb.append("similarity=").append(similarity.getClass().getName()).append("\n"); |
| sb.append("termIndexInterval=").append(termIndexInterval).append("\n"); |
| sb.append("mergeScheduler=").append(mergeScheduler.getClass().getName()).append("\n"); |
| sb.append("default WRITE_LOCK_TIMEOUT=").append(WRITE_LOCK_TIMEOUT).append("\n"); |
| sb.append("writeLockTimeout=").append(writeLockTimeout).append("\n"); |
| sb.append("maxBufferedDeleteTerms=").append(maxBufferedDeleteTerms).append("\n"); |
| sb.append("ramBufferSizeMB=").append(ramBufferSizeMB).append("\n"); |
| sb.append("maxBufferedDocs=").append(maxBufferedDocs).append("\n"); |
| sb.append("mergedSegmentWarmer=").append(mergedSegmentWarmer).append("\n"); |
| sb.append("mergePolicy=").append(mergePolicy).append("\n"); |
| sb.append("maxThreadStates=").append(maxThreadStates).append("\n"); |
| return sb.toString(); |
| } |
| } |