Index: common-build.xml
===================================================================
--- common-build.xml (revision 414705)
+++ common-build.xml (working copy)
@@ -28,8 +28,8 @@
<property name="javac.deprecation" value="off"/>
<property name="javac.debug" value="on"/>
- <property name="javac.source" value="1.4"/>
- <property name="javac.target" value="1.4"/>
+ <property name="javac.source" value="1.5"/>
+ <property name="javac.target" value="1.5"/>
<property name="project.name" value="site"/> <!-- todo: is this used by anakia or something else? -->
<property name="build.encoding" value="utf-8"/>
Index: src/test/org/apache/lucene/index/TestDocumentWriter.java
===================================================================
--- src/test/org/apache/lucene/index/TestDocumentWriter.java (revision 414705)
+++ src/test/org/apache/lucene/index/TestDocumentWriter.java (working copy)
@@ -16,11 +16,15 @@
* limitations under the License.
*/
+import java.util.LinkedList;
+import java.util.List;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.TokenSelector;
import org.apache.lucene.document.*;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.RAMDirectory;
@@ -54,6 +58,16 @@
Analyzer analyzer = new WhitespaceAnalyzer();
Similarity similarity = Similarity.getDefault();
DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50);
+ writer.setTermVectorTokenSelector(new TokenSelector(){
+ public boolean accept(String field, Token t) {
+ return Character.isLowerCase(t.termText().charAt(0));
+ }
+ });
+ writer.setPositionsTokenSelector(new TokenSelector(){
+ public boolean accept(String field, Token t) {
+ return Character.isLowerCase(t.termText().charAt(0));
+ }
+ });
String segName = "test";
writer.addDocument(segName, testDoc);
//After adding the document, we should be able to read it back in
@@ -84,6 +98,31 @@
fields = doc.getFields(DocHelper.TEXT_FIELD_3_KEY);
assertTrue(fields != null && fields.length == 1);
assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_3_TEXT));
+
+ fields = doc.getFields(DocHelper.TEXT_FIELD_UTF2_KEY);
+ assertTrue(fields != null && fields.length == 1);
+ assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_UTF2_TEXT));
+ assertTrue(fields[0].isTermVectorStored());
+ TermFreqVector tv = reader.getTermFreqVector(0, DocHelper.TEXT_FIELD_UTF2_KEY);
+ assertTrue(tv != null);
+ String[] words = DocHelper.FIELD_UTF2_TEXT.split("\\s+");
+ String[] tvwords = tv.getTerms();
+ List uniques = new LinkedList();
+ int omitted = 0;
+ for (int i=0; i<words.length; i++)
+ if (!uniques.contains(words[i])) {
+ uniques.add(words[i]);
+ if (!Character.isLowerCase(words[i].charAt(0)))
+ omitted++;
+ }
+ assertTrue(omitted!=0);
+ assertTrue(omitted!=uniques.size());
+ assertEquals(uniques.size()-omitted, tvwords.length);
+ for (int i=0; i<uniques.size(); i++) {
+ for (int j=0; j<tvwords.length; j++)
+ if (uniques.get(i).equals(tvwords[j]))
+ assertTrue(Character.isLowerCase(((String)uniques.get(i)).charAt(0)));
+ }
// test that the norm file is not present if omitNorms is true
for (int i = 0; i < reader.fieldInfos.size(); i++) {
Index: src/test/org/apache/lucene/index/TestParallelWriter.java
===================================================================
--- src/test/org/apache/lucene/index/TestParallelWriter.java (revision 0)
+++ src/test/org/apache/lucene/index/TestParallelWriter.java (revision 0)
@@ -0,0 +1,151 @@
+/*
+ * TestParallelWriter.java
+ * JUnit based test
+ *
+ * Created on April 30, 2006, 12:34 PM
+ */
+
+package org.apache.lucene.index;
+
+import java.util.Arrays;
+import junit.framework.*;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Enumeration;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.RAMDirectory;
+
+/**
+ *
+ * @author Chuck Williams
+ */
+public class TestParallelWriter extends TestCase {
+
+ ParallelWriter writer;
+ Directory[] directories;
+ Map<Directory, List<String>> fieldDirectories = new HashMap<Directory, List<String>>();
+ ParallelReader reader;
+ IndexSearcher searcher;
+
+ public TestParallelWriter(String testName) {
+ super(testName);
+ }
+
+ protected void setUp() throws Exception {
+ directories = new Directory[] { new RAMDirectory(), new RAMDirectory(), new RAMDirectory() };
+ fieldDirectories.put(directories[0], Arrays.asList("title", "body"));
+ fieldDirectories.put(directories[1], Arrays.asList("markup"));
+ fieldDirectories.put(directories[2], Arrays.asList("meta"));
+
+ openWriter(true);
+
+ Document doc1 = new Document();
+ doc1.add(new Field("title", "Foxes", Field.Store.YES, Field.Index.TOKENIZED));
+ doc1.add(new Field("body", "The quick brown fox jumped over the lazy dog", Field.Store.NO, Field.Index.TOKENIZED));
+ doc1.add(new Field("meta", "Animals", Field.Store.YES, Field.Index.UN_TOKENIZED));
+ writer.addDocument(doc1);
+
+ Document doc2 = new Document();
+ doc2.add(new Field("title", "Galaxies", Field.Store.YES, Field.Index.TOKENIZED));
+ doc2.add(new Field("body", "Once upon a time in a galaxy far far away", Field.Store.NO, Field.Index.TOKENIZED));
+ doc2.add(new Field("meta", "Space", Field.Store.YES, Field.Index.UN_TOKENIZED));
+ writer.addDocument(doc2);
+
+ closeWriter();
+
+ openWriter(false);
+ openReader();
+ }
+
+ private void openWriter(boolean create) throws IOException {
+ writer = new ParallelWriter(fieldDirectories, new StandardAnalyzer(), create);
+ }
+
+ private void closeWriter() throws IOException {
+ writer.close();
+ }
+
+ private void openReader() throws IOException {
+ reader = new ParallelReader();
+ for (Directory dir : directories)
+ reader.add(IndexReader.open(dir));
+ searcher = new IndexSearcher(reader);
+ }
+
+ private void closeReader() throws IOException {
+ searcher.close();
+ reader.close();
+ }
+
+ protected void tearDown() throws Exception {
+ writer.close();
+ reader.close();
+ for (Directory dir : directories)
+ dir.close();
+ }
+
+ public static Test suite() {
+ TestSuite suite = new TestSuite(TestParallelWriter.class);
+
+ return suite;
+ }
+
+ /**
+ * Test of addDocument method, of class org.apache.lucene.index.ParallelWriter.
+ */
+ public void test() throws Exception {
+ System.out.println("Test ParallelWriter");
+
+ assertEquals(2, writer.docCount());
+ assertEquals(2, reader.numDocs());
+
+ Hits hits = searcher.search(new TermQuery(new Term("title", "foxes")));
+ assertEquals(1, hits.length());
+ Document doc = hits.doc(0);
+ assertEquals("Animals", doc.get("meta"));
+
+ hits = searcher.search(new TermQuery(new Term("body", "galaxy")));
+ assertEquals(1, hits.length());
+ doc = hits.doc(0);
+ assertEquals("Galaxies", doc.get("title"));
+ assertEquals("Space", doc.get("meta"));
+
+ closeWriter();
+ reader.deleteDocuments(new Term("title", "foxes"));
+ closeReader();
+
+ openWriter(false);
+ doc = new Document();
+ doc.add(new Field("title", "Foxes", Field.Store.YES, Field.Index.TOKENIZED));
+ doc.add(new Field("body", "The quick brown fox jumped over the lazy dog", Field.Store.NO, Field.Index.TOKENIZED));
+ doc.add(new Field("meta", "Animals", Field.Store.YES, Field.Index.UN_TOKENIZED));
+ doc.add(new Field("markup", "Interesting", Field.Store.YES, Field.Index.UN_TOKENIZED));
+ writer.addDocument(doc);
+
+ closeWriter();
+ openWriter(false);
+ openReader();
+
+ hits = searcher.search(new TermQuery(new Term("markup", "Interesting")));
+ assertEquals(1, hits.length());
+ doc = hits.doc(0);
+ assertEquals("Animals", doc.get("meta"));
+ assertEquals("Foxes", doc.get("title"));
+ assertEquals("Interesting", doc.get("markup"));
+ }
+
+}
Index: src/java/org/apache/lucene/analysis/TokenSelector.java
===================================================================
--- src/java/org/apache/lucene/analysis/TokenSelector.java (revision 0)
+++ src/java/org/apache/lucene/analysis/TokenSelector.java (revision 0)
@@ -0,0 +1,24 @@
+/*
+ * TokenSelector.java
+ *
+ * Created on June 13, 2006, 12:18 PM
+ *
+ */
+
+package org.apache.lucene.analysis;
+
+/**
+ * An interface for selecting a subset of a token stream
+ *
+ * @author Chuck Williams
+ */
+public interface TokenSelector {
+
+ /** Determine if a token should be selected
+ * @param fieldName field in which token was found
+ * @param token a token
+ * @return true iff token should be selected
+ */
+ public boolean accept(String fieldName, Token token);
+
+}
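
As a usage sketch (not part of the patch): any selection policy can be expressed by implementing accept(). The MinLengthTokenSelector class below is illustrative only; it assumes Token.termText() as in current Lucene.

    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenSelector;

    // Illustrative selector: keep only tokens with at least minLength characters.
    public class MinLengthTokenSelector implements TokenSelector {
        private final int minLength;

        public MinLengthTokenSelector(int minLength) {
            this.minLength = minLength;
        }

        public boolean accept(String fieldName, Token token) {
            // The field name is ignored here; selection depends only on token length.
            return token.termText().length() >= minLength;
        }
    }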
Index: src/java/org/apache/lucene/analysis/PerFieldTokenSelectorWrapper.java
===================================================================
--- src/java/org/apache/lucene/analysis/PerFieldTokenSelectorWrapper.java (revision 0)
+++ src/java/org/apache/lucene/analysis/PerFieldTokenSelectorWrapper.java (revision 0)
@@ -0,0 +1,44 @@
+/*
+ * PerFieldTokenSelectorWrapper.java
+ *
+ * Created on June 13, 2006, 4:09 PM
+ *
+ */
+
+package org.apache.lucene.analysis;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Expert: TokenSelector that implements a mapping from field names to TokenSelectors
+ *
+ * @author Chuck Williams
+ */
+public class PerFieldTokenSelectorWrapper implements TokenSelector {
+
+ private Map selectors = new HashMap();
+ private TokenSelector defaultSelector;
+
+ /** Expert: create a PerFieldTokenSelectorWrapper with the given default selector (null means select all) */
+ public PerFieldTokenSelectorWrapper(TokenSelector defaultSelector) {
+ this.defaultSelector = defaultSelector;
+ }
+
+ /** Add a token selector for the named field */
+ public void addSelector(String fieldName, TokenSelector selector) {
+ selectors.put(fieldName, selector);
+ }
+
+ /** Determine if the token is accepted by the selector registered for fieldName, falling back to the default selector */
+ public boolean accept(String fieldName, Token token) {
+ TokenSelector selector = (TokenSelector) selectors.get(fieldName);
+ if (selector!=null)
+ return selector.accept(fieldName, token);
+ else if (defaultSelector!=null)
+ return defaultSelector.accept(fieldName, token);
+ else
+ return true;
+ }
+
+}
\ No newline at end of file
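
For reference, a hedged sketch of wiring up per-field selection, reusing the illustrative MinLengthTokenSelector from the sketch above; the field name "body" is arbitrary.

    import org.apache.lucene.analysis.PerFieldTokenSelectorWrapper;

    public class SelectorConfig {
        // Build a wrapper whose null default selects all tokens, overriding only "body".
        public static PerFieldTokenSelectorWrapper bodyOnlySelector() {
            PerFieldTokenSelectorWrapper perField = new PerFieldTokenSelectorWrapper(null);
            perField.addSelector("body", new MinLengthTokenSelector(3));
            return perField;
        }
    }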
Index: src/java/org/apache/lucene/index/IndexWriter.java
===================================================================
--- src/java/org/apache/lucene/index/IndexWriter.java (revision 414705)
+++ src/java/org/apache/lucene/index/IndexWriter.java (working copy)
@@ -17,6 +17,7 @@
*/
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenSelector;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.Directory;
@@ -56,7 +57,7 @@
@see IndexModifier IndexModifier supports the important methods of IndexWriter plus deletion
*/
-public class IndexWriter {
+public class IndexWriter implements Writable {
/**
* Default value for the write lock timeout (1,000).
@@ -100,8 +101,10 @@
*/
public final static int DEFAULT_TERM_INDEX_INTERVAL = 128;
- private Directory directory; // where this index resides
- private Analyzer analyzer; // how to analyze text
+ private Directory directory; // where this index resides
+ private Analyzer analyzer; // how to analyze text
+ private TokenSelector termVectorTokenSelector; // subset of token stream stored in term vectors
+ private TokenSelector positionsTokenSelector; // subset of token stream for which positions are stored
private Similarity similarity = Similarity.getDefault(); // how to normalize
@@ -153,6 +156,38 @@
return this.similarity;
}
+ /** Expert: Set the TokenSelector used to determine subset of tokens stored in term vectors.
+ * @param selector the term vector TokenSelector
+ */
+ public void setTermVectorTokenSelector(TokenSelector selector) {
+ this.termVectorTokenSelector = selector;
+ }
+
+ /** Expert: Get the TokenSelector used to determine the subset of tokens stored in term vectors.
+ * @return the TokenSelector used to determine term vector tokens
+ */
+ public TokenSelector getTermVectorTokenSelector() {
+ return termVectorTokenSelector;
+ }
+
+ /** Expert: Set the TokenSelector used to determine subset of tokens for which positions are stored.
+ * (At least one position is always stored for each term in each doc to ensure the term stays in
+ * the index so long as any docs reference it)
+ * @param selector the positions TokenSelector
+ */
+ public void setPositionsTokenSelector(TokenSelector selector) {
+ this.positionsTokenSelector = selector;
+ }
+
+ /** Expert: Get the TokenSelector used to determine the subset of tokens for which freq and positions are stored.
+ * (At least one position is always stored for each term in each doc to ensure the term stays in
+ * the index so long as any docs reference it)
+ * @return the positions TokenSelector
+ */
+ public TokenSelector getPositionsTokenSelector() {
+ return positionsTokenSelector;
+ }
+
/** Expert: Set the interval between indexed terms. Large values cause less
* memory to be used by IndexReader, but slow random-access to terms. Small
* values cause more memory to be used by an IndexReader, and speed
@@ -471,6 +506,8 @@
public void addDocument(Document doc, Analyzer analyzer) throws IOException {
DocumentWriter dw =
new DocumentWriter(ramDirectory, analyzer, this);
+ dw.setTermVectorTokenSelector(termVectorTokenSelector);
+ dw.setPositionsTokenSelector(positionsTokenSelector);
dw.setInfoStream(infoStream);
String segmentName = newSegmentName();
dw.addDocument(segmentName, doc);
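
A minimal, hedged sketch of using the new IndexWriter setters together; the directory, analyzer and the illustrative MinLengthTokenSelector are assumptions, not part of the patch.

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.RAMDirectory;

    public class SelectorIndexingSketch {
        public static void main(String[] args) throws Exception {
            IndexWriter writer = new IndexWriter(new RAMDirectory(), new StandardAnalyzer(), true);
            // Restrict term vectors and stored positions to longer tokens.
            writer.setTermVectorTokenSelector(new MinLengthTokenSelector(3));
            writer.setPositionsTokenSelector(new MinLengthTokenSelector(3));
            // ... writer.addDocument(doc) as usual; the selectors are applied per token ...
            writer.close();
        }
    }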
Index: src/java/org/apache/lucene/index/Writable.java
===================================================================
--- src/java/org/apache/lucene/index/Writable.java (revision 0)
+++ src/java/org/apache/lucene/index/Writable.java (revision 0)
@@ -0,0 +1,262 @@
+/*
+ * Writable.java
+ *
+ * Created on April 28, 2006, 6:10 PM
+ *
+ */
+
+package org.apache.lucene.index;
+
+import java.io.IOException;
+import java.io.PrintStream;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenSelector;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.store.Directory;
+
+/**
+ * An interface that abstracts index writers (e.g., IndexWriter, ParallelWriter)
+ *
+ * @author Chuck Williams
+ */
+public interface Writable {
+
+ /**
+ * Adds a document to this index. If the document contains more than
+ * {@link #setMaxFieldLength(int)} terms for a given field, the remainder are
+ * discarded.
+ */
+ public void addDocument(Document doc) throws IOException;
+
+ /**
+ * Adds a document to this index, using the provided analyzer instead of the
+ * value of {@link #getAnalyzer()}. If the document contains more than
+ * {@link #setMaxFieldLength(int)} terms for a given field, the remainder are
+ * discarded.
+ */
+ public void addDocument(Document doc, Analyzer analyzer) throws IOException;
+
+ /**
+ * Returns the number of documents currently in this index.
+ */
+ public int docCount();
+
+ /**
+ * Merges all segments together into a single segment, optimizing an index
+ * for search.
+ */
+ public void optimize() throws IOException;
+
+ /**
+ * Flushes all changes to an index and closes all associated files.
+ */
+ public void close() throws IOException;
+
+ /**
+ * Returns the analyzer used by this index.
+ */
+ public Analyzer getAnalyzer();
+
+
+ /**
+ * Setting to turn on usage of a compound file. When on, multiple files
+ * for each segment are merged into a single file once the segment creation
+ * is finished. This is done regardless of what directory is in use.
+ */
+ public void setUseCompoundFile(boolean value);
+
+ /**
+ * Get the current setting of whether to use the compound file format.
+ * Note that this just returns the value you set with setUseCompoundFile(boolean)
+ * or the default. You cannot use this to query the status of an existing index.
+ *
+ * @see #setUseCompoundFile(boolean)
+ */
+ public boolean getUseCompoundFile();
+
+ /**
+ * Expert: Set the Similarity implementation used by this IndexWriter.
+ *
+ * @see Similarity#setDefault(Similarity)
+ */
+ public void setSimilarity(Similarity similarity);
+
+ /**
+ * Expert: Return the Similarity implementation used by this IndexWriter.
+ *
+ * <p>This defaults to the current value of {@link Similarity#getDefault()}.
+ */
+ public Similarity getSimilarity();
+
+ /**
+ * Expert: Set the interval between indexed terms. Large values cause less
+ * memory to be used by IndexReader, but slow random-access to terms. Small
+ * values cause more memory to be used by an IndexReader, and speed
+ * random-access to terms.
+ *
+ * This parameter determines the amount of computation required per query
+ * term, regardless of the number of documents that contain that term. In
+ * particular, it is the maximum number of other terms that must be
+ * scanned before a term is located and its frequency and position information
+ * may be processed. In a large index with user-entered query terms, query
+ * processing time is likely to be dominated not by term lookup but rather
+ * by the processing of frequency and positional data. In a small index
+ * or when many uncommon query terms are generated (e.g., by wildcard
+ * queries) term lookup may become a dominant cost.
+ *
+ * In particular, <code>numUniqueTerms/interval</code> terms are read into
+ * memory by an IndexReader, and, on average, <code>interval/2</code> terms
+ * must be scanned for each random term access.
+ *
+ * @see #DEFAULT_TERM_INDEX_INTERVAL
+ */
+ public void setTermIndexInterval(int interval);
+
+ /**
+ * Expert: Return the interval between indexed terms.
+ *
+ * @see #setTermIndexInterval(int)
+ */
+ public int getTermIndexInterval();
+
+ /**
+ * Determines the minimal number of documents required before the buffered
+ * in-memory documents are merged and a new Segment is created.
+ * Since Documents are merged in a {@link org.apache.lucene.store.RAMDirectory},
+ * a large value gives faster indexing. At the same time, mergeFactor limits
+ * the number of files open in a FSDirectory.
+ *
+ * <p> The default value is 10.
+ *
+ *
+ * @throws IllegalArgumentException if maxBufferedDocs is smaller than 2
+ */
+ public void setMaxBufferedDocs(int maxBufferedDocs);
+
+ /**
+ *
+ *
+ * @see #setMaxBufferedDocs
+ */
+ public int getMaxBufferedDocs();
+
+ /**
+ * The maximum number of terms that will be indexed for a single field in a
+ * document. This limits the amount of memory required for indexing, so that
+ * collections with very large files will not crash the indexing process by
+ * running out of memory.<p/>
+ * Note that this effectively truncates large documents, excluding from the
+ * index terms that occur further in the document. If you know your source
+ * documents are large, be sure to set this value high enough to accommodate
+ * the expected size. If you set it to Integer.MAX_VALUE, then the only limit
+ * is your memory, but you should anticipate an OutOfMemoryError.<p/>
+ * By default, no more than 10,000 terms will be indexed for a field.
+ */
+ public void setMaxFieldLength(int maxFieldLength);
+
+ /**
+ *
+ *
+ * @see #setMaxFieldLength
+ */
+ public int getMaxFieldLength();
+
+ /**
+ * Determines the largest number of documents ever merged by addDocument().
+ * Small values (e.g., less than 10,000) are best for interactive indexing,
+ * as this limits the length of pauses while indexing to a few seconds.
+ * Larger values are best for batched indexing and speedier searches.
+ *
+ * <p>The default value is {@link Integer#MAX_VALUE}.
+ */
+ public void setMaxMergeDocs(int maxMergeDocs);
+
+ /**
+ *
+ *
+ * @see #setMaxMergeDocs
+ */
+ public int getMaxMergeDocs();
+
+ /**
+ * Determines how often segment indices are merged by addDocument(). With
+ * smaller values, less RAM is used while indexing, and searches on
+ * unoptimized indices are faster, but indexing speed is slower. With larger
+ * values, more RAM is used during indexing, and while searches on unoptimized
+ * indices are slower, indexing is faster. Thus larger values (> 10) are best
+ * for batch index creation, and smaller values (< 10) for indices that are
+ * interactively maintained.
+ *
+ * <p>This must never be less than 2. The default value is 10.
+ */
+ public void setMergeFactor(int mergeFactor);
+
+ /**
+ *
+ *
+ * @see #setMergeFactor
+ */
+ public int getMergeFactor();
+
+ /**
+ * Sets the maximum time to wait for a write lock (in milliseconds).
+ */
+ public void setWriteLockTimeout(long writeLockTimeout);
+
+ /**
+ *
+ *
+ * @see #setWriteLockTimeout
+ */
+ public long getWriteLockTimeout();
+
+ /**
+ * Sets the maximum time to wait for a commit lock (in milliseconds).
+ */
+ public void setCommitLockTimeout(long commitLockTimeout);
+
+ /**
+ *
+ *
+ * @see #setCommitLockTimeout
+ */
+ public long getCommitLockTimeout();
+
+ /** Expert: Set the TokenSelector used to determine subset of tokens stored in term vectors.
+ * @param selector the term vector TokenSelector
+ */
+ public void setTermVectorTokenSelector(TokenSelector selector);
+
+ /** Expert: Get the TokenSelector used to determine the subset of tokens stored in term vectors.
+ * @return the TokenSelector used to determine term vector tokens
+ */
+ public TokenSelector getTermVectorTokenSelector();
+
+ /** Expert: Set the TokenSelector used to determine subset of tokens for which positions are stored.
+ * (At least one position is always stored for each term in each doc to ensure the term stays in
+ * the index so long as any docs reference it)
+ * @param selector the positions TokenSelector
+ */
+ public void setPositionsTokenSelector(TokenSelector selector);
+
+ /** Expert: Get the TokenSelector used to determine the subset of tokens for which freq and positions are stored.
+ * (At least one position is always stored for each term in each doc to ensure the term stays in
+ * the index so long as any docs reference it)
+ * @return the positions TokenSelector
+ */
+ public TokenSelector getPositionsTokenSelector();
+
+ /** If non-null, information about merges and a message when
+ * maxFieldLength is reached will be printed to this.
+ */
+ public void setInfoStream(PrintStream infoStream);
+
+ /**
+ *
+ *
+ * @see #setInfoStream
+ */
+ public PrintStream getInfoStream();
+
+}
Index: src/java/org/apache/lucene/index/DocumentWriter.java
===================================================================
--- src/java/org/apache/lucene/index/DocumentWriter.java (revision 414705)
+++ src/java/org/apache/lucene/index/DocumentWriter.java (working copy)
@@ -17,6 +17,7 @@
*/
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenSelector;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
@@ -35,6 +36,8 @@
final class DocumentWriter {
private Analyzer analyzer;
+ private TokenSelector termVectorTokenSelector;
+ private TokenSelector positionsTokenSelector;
private Directory directory;
private Similarity similarity;
private FieldInfos fieldInfos;
@@ -142,9 +145,9 @@
if (!field.isTokenized()) { // un-tokenized field
String stringValue = field.stringValue();
if(field.isStoreOffsetWithTermVector())
- addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length()));
+ addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length()), false, false);
else
- addPosition(fieldName, stringValue, position++, null);
+ addPosition(fieldName, stringValue, position++, null, false, false);
offset += stringValue.length();
length++;
} else
@@ -165,10 +168,16 @@
for (Token t = stream.next(); t != null; t = stream.next()) {
position += (t.getPositionIncrement() - 1);
- if(field.isStoreOffsetWithTermVector())
- addPosition(fieldName, t.termText(), position++, new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset()));
- else
- addPosition(fieldName, t.termText(), position++, null);
+ boolean omittv = false, omitpos = false;
+ if (termVectorTokenSelector != null && !termVectorTokenSelector.accept(field.name(), t))
+ omittv = true;
+ if (positionsTokenSelector != null && !positionsTokenSelector.accept(field.name(), t))
+ omitpos = true;
+
+ addPosition(fieldName, t.termText(), position++,
+ field.isStoreOffsetWithTermVector() && !omittv ? new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset())
+ : null,
+ omittv, omitpos);
lastToken = t;
if (++length > maxFieldLength) {
@@ -196,20 +205,24 @@
private final Term termBuffer = new Term("", ""); // avoid consing
- private final void addPosition(String field, String text, int position, TermVectorOffsetInfo offset) {
+ private final void addPosition(String field, String text, int position, TermVectorOffsetInfo offset,
+ boolean omitFromTermVector, boolean omitPosition) {
termBuffer.set(field, text);
//System.out.println("Offset: " + offset);
Posting ti = (Posting) postingTable.get(termBuffer);
if (ti != null) { // word seen before
int freq = ti.freq;
- if (ti.positions.length == freq) { // positions array is full
- int[] newPositions = new int[freq * 2]; // double size
- int[] positions = ti.positions;
- for (int i = 0; i < freq; i++) // copy old positions to new
- newPositions[i] = positions[i];
- ti.positions = newPositions;
+
+ if (!omitPosition) {
+ if (ti.positions.length == freq) { // positions array is full
+ int[] newPositions = new int[freq * 2]; // double size
+ int[] positions = ti.positions;
+ for (int i = 0; i < freq; i++) // copy old positions to new
+ newPositions[i] = positions[i];
+ ti.positions = newPositions;
+ }
+ ti.positions[freq] = position; // add new position
}
- ti.positions[freq] = position; // add new position
if (offset != null) {
if (ti.offsets.length == freq){
@@ -223,10 +236,12 @@
}
ti.offsets[freq] = offset;
}
- ti.freq = freq + 1; // update frequency
- } else { // word not seen before
+
+ if (!omitPosition)
+ ti.freq = freq + 1; // update frequency
+ } else { // word not seen before
Term term = new Term(field, text, false);
- postingTable.put(term, new Posting(term, position, offset));
+ postingTable.put(term, new Posting(term, position, offset, omitFromTermVector));
}
}
@@ -351,7 +366,7 @@
termVectorWriter.closeField();
}
}
- if (termVectorWriter != null && termVectorWriter.isFieldOpen()) {
+ if (termVectorWriter != null && termVectorWriter.isFieldOpen() && !posting.omitFromTermVector) {
termVectorWriter.addTerm(posting.term.text(), postingFreq, posting.positions, posting.offsets);
}
}
@@ -390,6 +405,16 @@
this.infoStream = infoStream;
}
+ /** If non-null, this will be used to select which tokens are stored in term vectors */
+ void setTermVectorTokenSelector(TokenSelector selector) {
+ this.termVectorTokenSelector = selector;
+ }
+
+ /** If non-null, this will be used to select which tokens have positions stored in the index. */
+ void setPositionsTokenSelector(TokenSelector selector) {
+ this.positionsTokenSelector = selector;
+ }
+
}
final class Posting { // info about a Term in a doc
@@ -397,17 +422,17 @@
int freq; // its frequency in doc
int[] positions; // positions it occurs at
TermVectorOffsetInfo [] offsets;
+ boolean omitFromTermVector; // if true, omit from term vector
- Posting(Term t, int position, TermVectorOffsetInfo offset) {
+ Posting(Term t, int position, TermVectorOffsetInfo offset, boolean omitFromTermVector) {
term = t;
freq = 1;
positions = new int[1];
positions[0] = position;
- if(offset != null){
- offsets = new TermVectorOffsetInfo[1];
- offsets[0] = offset;
+ if(offset != null) {
+ offsets = new TermVectorOffsetInfo[1];
+ offsets[0] = offset;
}
- else
- offsets = null;
+ this.omitFromTermVector = omitFromTermVector;
}
}
Index: src/java/org/apache/lucene/index/ParallelWriter.java
===================================================================
--- src/java/org/apache/lucene/index/ParallelWriter.java (revision 0)
+++ src/java/org/apache/lucene/index/ParallelWriter.java (revision 0)
@@ -0,0 +1,345 @@
+/*
+ * ParallelWriter.java
+ *
+ * Created on April 28, 2006, 7:07 PM
+ *
+ */
+
+package org.apache.lucene.index;
+
+import java.io.IOException;
+import java.io.PrintStream;
+import java.util.Enumeration;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.CountDownLatch;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenSelector;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.store.Directory;
+
+/**
+ * ParallelWriter is a companion to ParallelReader, although as with IndexWriter it only supports indexes stored in a Directory.
+ * The interface is at the field level. A map from directories to lists of fields is provided to create the ParallelWriter,
+ * which then creates an IndexWriter for each specified directory and operates on each field of a document using the IndexWriter
+ * for the directory to which that field is mapped. This mapping allows an application to configure its use of parallel sub-
+ * indexes independently from the rest of its processing.
+ *
+ * This implementation single-threads calls to addDocument(), but does the sub-document writes in parallel. Users of this class
+ * must ensure that the ParallelReader is never reopened while adding a new document, and must deal with recovery if exceptions
+ * occur while adding a document.
+ *
+ * @author Chuck Williams
+ */
+public class ParallelWriter implements Writable {
+
+ IndexWriter[] writers; // All IndexWriters
+ IOException exception; // If any writer gets an exception, this is stored here (only one needed)
+ Map<String,IndexWriter> writerMap; // Field name --> IndexWriter that stores that field
+ IndexWriter oneWriter; // An arbitrarily chosen IndexWriter -- used to get config info which is the same for all IndexWriters
+ Analyzer analyzer; // The Analyzer applied to all tokenized field content
+
+ private static final Document EMPTY_DOCUMENT = new Document(); // Empty document used to sync doc id's when a document is added without fields for all indexes
+
+ /**
+ * Create a new ParallelWriter
+ *
+ * @param directoryFieldsMap specifies the directory to use to store each field, multiple directories creating parallel indexes
+ * @param analyzer applied to all tokenized field content
+ * @param create create new indexes in directories iff true
+ * @throws IOException if the IndexWriters cannot be created
+ */
+ public ParallelWriter(Map<Directory,List<String>> directoryFieldsMap, Analyzer analyzer, boolean create) throws IOException {
+ this.analyzer = analyzer;
+ writers = new IndexWriter[directoryFieldsMap.size()];
+ writerMap = new HashMap<String,IndexWriter>(directoryFieldsMap.size()*5/3);
+ int i=0;
+ for (Map.Entry<Directory,List<String>> entry : directoryFieldsMap.entrySet()) {
+ IndexWriter writer = new IndexWriter(entry.getKey(), analyzer, create);
+ writers[i++] = oneWriter = writer;
+ for (String field : entry.getValue())
+ writerMap.put(field, writer);
+ }
+ }
+
+ /** Invert a directoryFieldsMap
+ * @param directoryFieldsMap a map from directories to lists of the fields they contain
+ * @return a map from each field to its directory
+ */
+ public static Map<String, Directory> invertDirectoryFieldsMap(Map<Directory,List<String>> directoryFieldsMap) {
+ Map<String, Directory> fieldDirectoryMap = new HashMap<String, Directory>();
+ for (Map.Entry<Directory, List<String>> entry : directoryFieldsMap.entrySet())
+ for (String field : entry.getValue())
+ fieldDirectoryMap.put(field, entry.getKey());
+ return fieldDirectoryMap;
+ }
+
+ /** Add document to this index by adding subdocuments with the mapped fields for each parallel index. This method is synchronized because
+ * the parallel indexes must be maintained such that equal doc id's in different indexes hold fields for the same document.
+ * This synchronization could have a negative effect on batch indexing performance. Users of this method must ensure that the ParallelReader
+ * is not re-opened within the scope of this method as it would likely find the sub-indexes out of sync.
+ * @param doc the document to add
+ * @throws IOException if there are problems writing the indexes. <strong>WARNING: If this happens it is bad.</strong> The doc-id's in the
+ * indexes are likely out of sync. This situation requires repair to resync the doc ids in each document set. Possible
+ * repair actions include rebuilding the indexes or deleting documents at the end to restore equal document sets and then
+ * optimizing to restore equal doc ids.
+ * @throws RuntimeException if the threads writing to the sub-indexes are interrupted.
+ */
+ public void addDocument(Document doc) throws IOException {
+ addDocument(doc, analyzer);
+ }
+
+ /** Add document to this index by adding subdocuments with the mapped fields for each parallel index. This method is synchronized because
+ * the parallel indexes must be maintained such that equal doc id's in different indexes hold fields for the same document.
+ * This synchronization could have a negative effect on batch indexing performance. Users of this method must ensure that the ParallelReader
+ * is not re-opened within the scope of this method as it would likely find the sub-indexes out of sync.
+ * @param doc the document to add
+ * @param analyzer apply special analyzer to this document rather than the one for the index (discouraged -- use addDocument(doc))
+ * @throws IOException if there are problems writing the indexes. <strong>WARNING: If this happens it is bad.</strong> The doc-id's in the
+ * indexes are likely out of sync. This situation requires repair to resync the doc ids in each document set. Possible
+ * repair actions include rebuilding the indexes or deleting documents at the end to restore equal document sets and then
+ * optimizing to restore equal doc ids.
+ * @throws RuntimeException if the threads writing to the sub-indexes are interrupted.
+ */
+ public synchronized void addDocument(Document doc, Analyzer analyzer) throws IOException {
+ Map<IndexWriter,Document> documentMap = new HashMap<IndexWriter,Document>(writers.length*5/3);
+ Enumeration<Field> fields = doc.fields();
+ while (fields.hasMoreElements()) {
+ Field field = fields.nextElement();
+ IndexWriter writer = writerMap.get(field.name());
+ if (writer==null)
+ throw new RuntimeException(new UnknownFieldException("Unregistered field: " + field.name()));
+ Document subdoc = documentMap.get(writer);
+ if (subdoc==null)
+ documentMap.put(writer, subdoc = new Document());
+ subdoc.add(field);
+ }
+ CountDownLatch latch = new CountDownLatch(writers.length);
+ exception = null;
+ for (IndexWriter writer : writers) {
+ Document subdoc = documentMap.get(writer);
+ if (subdoc==null) // Must have a document in each parallel index to sync doc id's
+ subdoc = EMPTY_DOCUMENT;
+ new Thread(new WriterWorker(writer, subdoc, latch)).start(); // start(), not run(), so sub-documents are written in parallel
+ }
+ try {
+ latch.await();
+ } catch (InterruptedException e) {
+ throw new RuntimeException("Interrupted while writing subdocuments!", e);
+ }
+ if (exception != null)
+ throw exception;
+ }
+
+ // Writes a sub-document to a sub-index and records any exception
+ private class WriterWorker implements Runnable {
+
+ private IndexWriter writer;
+ private Document document;
+ private CountDownLatch latch;
+
+ private WriterWorker(IndexWriter writer, Document document, CountDownLatch latch) {
+ this.writer = writer;
+ this.document = document;
+ this.latch = latch;
+ }
+
+ public void run() {
+ try {
+ writer.addDocument(document);
+ } catch (IOException e) {
+ exception = e;
+ } finally {
+ latch.countDown();
+ }
+ }
+
+ }
+
+ /** Obtain the number of documents in this index, which is the same for each parallel index. */
+ public int docCount() {
+ return oneWriter.docCount();
+ }
+
+ /** Optimize all parallel indexes. This is synchronized to keep all index doc-id's synced up */
+ public synchronized void optimize() throws IOException {
+ for (IndexWriter writer : writers)
+ writer.optimize();
+ }
+
+ /** Close all parallel indexes. Note that the provided directories are not closed. Synchronized. */
+ public synchronized void close() throws IOException {
+ for (IndexWriter writer : writers)
+ writer.close();
+ }
+
+ /** Getter for analyzer provided to the constructor */
+ public Analyzer getAnalyzer() {
+ return analyzer;
+ }
+
+ /** Set whether or not to use compound file format in every parallel index */
+ public void setUseCompoundFile(boolean value) {
+ for (IndexWriter writer : writers)
+ writer.setUseCompoundFile(value);
+ }
+
+ /** Get the compound file usage decision, same for every parallel index */
+ public boolean getUseCompoundFile() {
+ return oneWriter.getUseCompoundFile();
+ }
+
+ /** Set similarity to use for every parallel index */
+ public void setSimilarity(Similarity similarity) {
+ for (IndexWriter writer : writers)
+ writer.setSimilarity(similarity);
+ }
+
+ /** Get similarity, which is used by every parallel index */
+ public Similarity getSimilarity() {
+ return oneWriter.getSimilarity();
+ }
+
+ /** Set the termIndexInterval used for every parallel index */
+ public void setTermIndexInterval(int interval) {
+ for (IndexWriter writer : writers)
+ writer.setTermIndexInterval(interval);
+ }
+
+ /** Get the termIndexInterval, which is used by every parallel index */
+ public int getTermIndexInterval() {
+ return oneWriter.getTermIndexInterval();
+ }
+
+ /** Set maxBufferedDocs for every parallel index */
+ public void setMaxBufferedDocs(int maxBufferedDocs) {
+ for (IndexWriter writer : writers)
+ writer.setMaxBufferedDocs(maxBufferedDocs);
+ }
+
+ /** Get maxBufferedDocs, same for every parallel index */
+ public int getMaxBufferedDocs() {
+ return oneWriter.getMaxBufferedDocs();
+ }
+
+ /** Set maxFieldLength to use for every parallel index */
+ public void setMaxFieldLength(int maxFieldLength) {
+ for (IndexWriter writer : writers)
+ writer.setMaxFieldLength(maxFieldLength);
+ }
+
+ /** Get maxFieldLength, same for every parallel index */
+ public int getMaxFieldLength() {
+ return oneWriter.getMaxFieldLength();
+ }
+
+ /** Set maxMergeDocs for every parallel index */
+ public void setMaxMergeDocs(int maxMergeDocs) {
+ for (IndexWriter writer : writers)
+ writer.setMaxMergeDocs(maxMergeDocs);
+ }
+
+ /** Get max merge docs, same for every parallel index */
+ public int getMaxMergeDocs() {
+ return oneWriter.getMaxMergeDocs();
+ }
+
+ /** Set merge factor for every parallel index */
+ public void setMergeFactor(int mergeFactor) {
+ for (IndexWriter writer : writers)
+ writer.setMergeFactor(mergeFactor);
+ }
+
+ /** Get merge factor, same for every parallel index */
+ public int getMergeFactor() {
+ return oneWriter.getMergeFactor();
+ }
+
+ /** Set write lock timeout (millis) for every parallel index */
+ public void setWriteLockTimeout(long writeLockTimeout) {
+ for (IndexWriter writer : writers)
+ writer.setWriteLockTimeout(writeLockTimeout);
+ }
+
+ /** Get write lock timeout, same for every parallel index */
+ public long getWriteLockTimeout() {
+ return oneWriter.getWriteLockTimeout();
+ }
+
+ /** Set commit lock timeout for every parallel index */
+ public void setCommitLockTimeout(long commitLockTimeout) {
+ for (IndexWriter writer : writers)
+ writer.setCommitLockTimeout(commitLockTimeout);
+ }
+
+ /** Get commit lock timeout, same for every parallel index */
+ public long getCommitLockTimeout() {
+ return oneWriter.getCommitLockTimeout();
+ }
+
+ /** Set the term vector TokenSelector for every parallel index */
+ public void setTermVectorTokenSelector(TokenSelector selector) {
+ for (IndexWriter writer : writers)
+ writer.setTermVectorTokenSelector(selector);
+ }
+
+ /** Get the term vector TokenSelector, same for every parallel index */
+ public TokenSelector getTermVectorTokenSelector() {
+ return oneWriter.getTermVectorTokenSelector();
+ }
+
+ /** Set positions TokenSelector for every parallel index */
+ public void setPositionsTokenSelector(TokenSelector selector) {
+ for (IndexWriter writer : writers)
+ writer.setPositionsTokenSelector(selector);
+ }
+
+ /** Get positions TokenSelector, same for every parallel index */
+ public TokenSelector getPositionsTokenSelector() {
+ return oneWriter.getPositionsTokenSelector();
+ }
+
+ /** Unsupported. use setInfoStream(field, infoStream) */
+ public void setInfoStream(PrintStream infoStream) {
+ throw new UnsupportedOperationException();
+ }
+
+ /** Set an info stream for the IndexWriter managing a specified field. The info stream receives information about field truncations, merges, etc.
+ * @param field the field whose writer to assign the info stream to
+ * @param infoStream the info stream
+ * @throws UnknownFieldException if field has not been associated with an IndexWriter in this index
+ */
+ public void setInfoStream(String field, PrintStream infoStream) throws UnknownFieldException {
+ IndexWriter writer = writerMap.get(field);
+ if (writer==null)
+ throw new UnknownFieldException("Unregistered field: " + field);
+ writer.setInfoStream(infoStream);
+
+ }
+
+ /** Unsupported. use getInfoStream(field) */
+ public PrintStream getInfoStream() {
+ throw new UnsupportedOperationException();
+ }
+
+ /** Getter for the info stream associated with field. See setInfoStream(field, infoStream).
+ */
+ public PrintStream getInfoStream(String field) throws UnknownFieldException {
+ IndexWriter writer = writerMap.get(field);
+ if (writer==null)
+ throw new UnknownFieldException("Unregistered field: " + field);
+ return writer.getInfoStream();
+ }
+
+ private static class UnknownFieldException extends Exception {
+
+ private UnknownFieldException(String message) {
+ super(message);
+ }
+
+ }
+
+}
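
For orientation, a minimal, hedged sketch of driving ParallelWriter; the directories, field names and values are arbitrary, and TestParallelWriter above gives a fuller example.

    import java.util.Arrays;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.ParallelWriter;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.RAMDirectory;

    public class ParallelWriterSketch {
        public static void main(String[] args) throws Exception {
            // Map each sub-index directory to the fields it will hold.
            Map<Directory, List<String>> map = new HashMap<Directory, List<String>>();
            map.put(new RAMDirectory(), Arrays.asList("title", "body"));
            map.put(new RAMDirectory(), Arrays.asList("meta"));

            ParallelWriter writer = new ParallelWriter(map, new StandardAnalyzer(), true);
            Document doc = new Document();
            doc.add(new Field("title", "Foxes", Field.Store.YES, Field.Index.TOKENIZED));
            doc.add(new Field("meta", "Animals", Field.Store.YES, Field.Index.UN_TOKENIZED));
            writer.addDocument(doc); // each field is routed to its mapped sub-index
            writer.close();
        }
    }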