| Index: common-build.xml |
| =================================================================== |
| --- common-build.xml (revision 414705) |
| +++ common-build.xml (working copy) |
| @@ -28,8 +28,8 @@ |
| |
| <property name="javac.deprecation" value="off"/> |
| <property name="javac.debug" value="on"/> |
| - <property name="javac.source" value="1.4"/> |
| - <property name="javac.target" value="1.4"/> |
| + <property name="javac.source" value="1.5"/> |
| + <property name="javac.target" value="1.5"/> |
| |
| <property name="project.name" value="site"/> <!-- todo: is this used by anakia or something else? --> |
| <property name="build.encoding" value="utf-8"/> |
| Index: src/test/org/apache/lucene/index/TestDocumentWriter.java |
| =================================================================== |
| --- src/test/org/apache/lucene/index/TestDocumentWriter.java (revision 414705) |
| +++ src/test/org/apache/lucene/index/TestDocumentWriter.java (working copy) |
| @@ -16,11 +16,15 @@ |
| * limitations under the License. |
| */ |
| |
| +import java.util.LinkedList; |
| +import java.util.List; |
| import junit.framework.TestCase; |
| import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.WhitespaceAnalyzer; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.WhitespaceTokenizer; |
| +import org.apache.lucene.analysis.TokenSelector; |
| import org.apache.lucene.document.*; |
| import org.apache.lucene.search.Similarity; |
| import org.apache.lucene.store.RAMDirectory; |
| @@ -54,6 +58,16 @@ |
| Analyzer analyzer = new WhitespaceAnalyzer(); |
| Similarity similarity = Similarity.getDefault(); |
| DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50); |
| + writer.setTermVectorTokenSelector(new TokenSelector(){ |
| + public boolean accept(String field, Token t) { |
| + return Character.isLowerCase(t.termText().charAt(0)); |
| + } |
| + }); |
| + writer.setPositionsTokenSelector(new TokenSelector(){ |
| + public boolean accept(String field, Token t) { |
| + return Character.isLowerCase(t.termText().charAt(0)); |
| + } |
| + }); |
| String segName = "test"; |
| writer.addDocument(segName, testDoc); |
| //After adding the document, we should be able to read it back in |
| @@ -84,6 +98,31 @@ |
| fields = doc.getFields(DocHelper.TEXT_FIELD_3_KEY); |
| assertTrue(fields != null && fields.length == 1); |
| assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_3_TEXT)); |
| + |
| + fields = doc.getFields(DocHelper.TEXT_FIELD_UTF2_KEY); |
| + assertTrue(fields != null && fields.length == 1); |
| + assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_UTF2_TEXT)); |
| + assertTrue(fields[0].isTermVectorStored()); |
| + TermFreqVector tv = reader.getTermFreqVector(0, DocHelper.TEXT_FIELD_UTF2_KEY); |
| + assertTrue(tv != null); |
| + String[] words = DocHelper.FIELD_UTF2_TEXT.split("\\s+"); |
| + String[] tvwords = tv.getTerms(); |
| + List uniques = new LinkedList(); |
| + int omitted = 0; |
| + for (int i=0; i<words.length; i++) |
| + if (!uniques.contains(words[i])) { |
| + uniques.add(words[i]); |
| + if (!Character.isLowerCase(words[i].charAt(0))) |
| + omitted++; |
| + } |
| + assertTrue(omitted!=0); |
| + assertTrue(omitted!=uniques.size()); |
| + assertEquals(uniques.size()-omitted, tvwords.length); |
| + for (int i=0; i<uniques.size(); i++) { |
| + for (int j=0; j<tvwords.length; j++) |
| + if (uniques.get(i).equals(tvwords[j])) |
| + assertTrue(Character.isLowerCase(((String)uniques.get(i)).charAt(0))); |
| + } |
| |
| // test that the norm file is not present if omitNorms is true |
| for (int i = 0; i < reader.fieldInfos.size(); i++) { |
| Index: src/test/org/apache/lucene/index/TestParallelWriter.java |
| =================================================================== |
| --- src/test/org/apache/lucene/index/TestParallelWriter.java (revision 0) |
| +++ src/test/org/apache/lucene/index/TestParallelWriter.java (revision 0) |
| @@ -0,0 +1,151 @@ |
| +/* |
| + * TestParallelWriter.java |
| + * JUnit based test |
| + * |
| + * Created on April 30, 2006, 12:34 PM |
| + */ |
| + |
| +package org.apache.lucene.index; |
| + |
| +import java.util.Arrays; |
| +import junit.framework.*; |
| +import java.io.IOException; |
| +import java.io.PrintStream; |
| +import java.util.ArrayList; |
| +import java.util.Enumeration; |
| +import java.util.HashMap; |
| +import java.util.HashSet; |
| +import java.util.List; |
| +import java.util.Map; |
| +import java.util.Set; |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.standard.StandardAnalyzer; |
| +import org.apache.lucene.document.Document; |
| +import org.apache.lucene.document.Field; |
| +import org.apache.lucene.search.Hits; |
| +import org.apache.lucene.search.IndexSearcher; |
| +import org.apache.lucene.search.Similarity; |
| +import org.apache.lucene.search.TermQuery; |
| +import org.apache.lucene.store.Directory; |
| +import org.apache.lucene.store.RAMDirectory; |
| + |
| +/** |
| + * |
| + * @author Chuck Williams |
| + */ |
| +public class TestParallelWriter extends TestCase { |
| + |
| + ParallelWriter writer; |
| + Directory[] directories; |
| + Map<Directory, List<String>> fieldDirectories = new HashMap<Directory, List<String>>(); |
| + ParallelReader reader; |
| + IndexSearcher searcher; |
| + |
| + public TestParallelWriter(String testName) { |
| + super(testName); |
| + } |
| + |
| + protected void setUp() throws Exception { |
| + directories = new Directory[] { new RAMDirectory(), new RAMDirectory(), new RAMDirectory() }; |
| + fieldDirectories.put(directories[0], Arrays.asList("title", "body")); |
| + fieldDirectories.put(directories[1], Arrays.asList("markup")); |
| + fieldDirectories.put(directories[2], Arrays.asList("meta")); |
| + |
| + openWriter(true); |
| + |
| + Document doc1 = new Document(); |
| + doc1.add(new Field("title", "Foxes", Field.Store.YES, Field.Index.TOKENIZED)); |
| + doc1.add(new Field("body", "The quick brown fox jumped over the lazy dog", Field.Store.NO, Field.Index.TOKENIZED)); |
| + doc1.add(new Field("meta", "Animals", Field.Store.YES, Field.Index.UN_TOKENIZED)); |
| + writer.addDocument(doc1); |
| + |
| + Document doc2 = new Document(); |
| + doc2.add(new Field("title", "Galaxies", Field.Store.YES, Field.Index.TOKENIZED)); |
| + doc2.add(new Field("body", "Once upon a time in a galaxy far far away", Field.Store.NO, Field.Index.TOKENIZED)); |
| + doc2.add(new Field("meta", "Space", Field.Store.YES, Field.Index.UN_TOKENIZED)); |
| + writer.addDocument(doc2); |
| + |
| + closeWriter(); |
| + |
| + openWriter(false); |
| + openReader(); |
| + } |
| + |
| + private void openWriter(boolean create) throws IOException { |
| + writer = new ParallelWriter(fieldDirectories, new StandardAnalyzer(), create); |
| + } |
| + |
| + private void closeWriter() throws IOException { |
| + writer.close(); |
| + } |
| + |
| + private void openReader() throws IOException { |
| + reader = new ParallelReader(); |
| + for (Directory dir : directories) |
| + reader.add(IndexReader.open(dir)); |
| + searcher = new IndexSearcher(reader); |
| + } |
| + |
| + private void closeReader() throws IOException { |
| + searcher.close(); |
| + reader.close(); |
| + } |
| + |
| + protected void tearDown() throws Exception { |
| + writer.close(); |
| + reader.close(); |
| + for (Directory dir : directories) |
| + dir.close(); |
| + } |
| + |
| + public static Test suite() { |
| + TestSuite suite = new TestSuite(TestParallelWriter.class); |
| + |
| + return suite; |
| + } |
| + |
| + /** |
| + * Test of addDocument method, of class org.apache.lucene.index.ParallelWriter. |
| + */ |
| + public void test() throws Exception { |
| + System.out.println("Test ParallelWriter"); |
| + |
| + assertEquals(2, writer.docCount()); |
| + assertEquals(2, reader.numDocs()); |
| + |
| + Hits hits = searcher.search(new TermQuery(new Term("title", "foxes"))); |
| + assertEquals(1, hits.length()); |
| + Document doc = hits.doc(0); |
| + assertEquals("Animals", doc.get("meta")); |
| + |
| + hits = searcher.search(new TermQuery(new Term("body", "galaxy"))); |
| + assertEquals(1, hits.length()); |
| + doc = hits.doc(0); |
| + assertEquals("Galaxies", doc.get("title")); |
| + assertEquals("Space", doc.get("meta")); |
| + |
| + closeWriter(); |
| + reader.deleteDocuments(new Term("title", "foxes")); |
| + closeReader(); |
| + |
| + openWriter(false); |
| + doc = new Document(); |
| + doc.add(new Field("title", "Foxes", Field.Store.YES, Field.Index.TOKENIZED)); |
| + doc.add(new Field("body", "The quick brown fox jumped over the lazy dog", Field.Store.NO, Field.Index.TOKENIZED)); |
| + doc.add(new Field("meta", "Animals", Field.Store.YES, Field.Index.UN_TOKENIZED)); |
| + doc.add(new Field("markup", "Interesting", Field.Store.YES, Field.Index.UN_TOKENIZED)); |
| + writer.addDocument(doc); |
| + |
| + closeWriter(); |
| + openWriter(false); |
| + openReader(); |
| + |
| + hits = searcher.search(new TermQuery(new Term("markup", "Interesting"))); |
| + assertEquals(1, hits.length()); |
| + doc = hits.doc(0); |
| + assertEquals("Animals", doc.get("meta")); |
| + assertEquals("Foxes", doc.get("title")); |
| + assertEquals("Interesting", doc.get("markup")); |
| + } |
| + |
| +} |
| Index: src/java/org/apache/lucene/analysis/TokenSelector.java |
| =================================================================== |
| --- src/java/org/apache/lucene/analysis/TokenSelector.java (revision 0) |
| +++ src/java/org/apache/lucene/analysis/TokenSelector.java (revision 0) |
| @@ -0,0 +1,24 @@ |
| +/* |
| + * TokenSelector.java |
| + * |
| + * Created on June 13, 2006, 12:18 PM |
| + * |
| + */ |
| + |
| +package org.apache.lucene.analysis; |
| + |
| +/** |
| + * An interface for selecting a subset of a token stream |
| + * |
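| + * <p>For example (an illustrative sketch), a selector that accepts only tokens whose |
| + * text begins with a lower-case character: |
| + * <pre> |
| + *   TokenSelector lowerCaseOnly = new TokenSelector() { |
| + *     public boolean accept(String fieldName, Token token) { |
| + *       return Character.isLowerCase(token.termText().charAt(0)); |
| + *     } |
| + *   }; |
| + * </pre> |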
| + * @author Chuck Williams |
| + */ |
| +public interface TokenSelector { |
| + |
| + /** Determine if a token should be selected |
| + * @param fieldName field in which token was found |
| + * @param token a token |
| + * @return true iff token should be selected |
| + */ |
| + public boolean accept(String fieldName, Token token); |
| + |
| +} |
| Index: src/java/org/apache/lucene/analysis/PerFieldTokenSelectorWrapper.java |
| =================================================================== |
| --- src/java/org/apache/lucene/analysis/PerFieldTokenSelectorWrapper.java (revision 0) |
| +++ src/java/org/apache/lucene/analysis/PerFieldTokenSelectorWrapper.java (revision 0) |
| @@ -0,0 +1,44 @@ |
| +/* |
| + * PerFieldTokenSelectorWrapper.java |
| + * |
| + * Created on June 13, 2006, 4:09 PM |
| + * |
| + */ |
| + |
| +package org.apache.lucene.analysis; |
| + |
| +import java.util.HashMap; |
| +import java.util.Map; |
| + |
| +/** |
| + * Expert: TokenSelector that implements a mapping from field names to TokenSelectors |
| + * |
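| + * <p>A minimal usage sketch (the field name and selectors are illustrative): |
| + * <pre> |
| + *   PerFieldTokenSelectorWrapper wrapper = new PerFieldTokenSelectorWrapper(null); // null default accepts all tokens |
| + *   wrapper.addSelector("body", bodySelector);  // bodySelector is some existing TokenSelector |
| + *   indexWriter.setTermVectorTokenSelector(wrapper); |
| + * </pre> |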
| + * @author Chuck Williams |
| + */ |
| +public class PerFieldTokenSelectorWrapper implements TokenSelector { |
| + |
| + private Map selectors = new HashMap(); |
| + private TokenSelector defaultSelector; |
| + |
| + /** Expert: create a PerFieldTokenSelector with given default selector (null means select all) */ |
| + public PerFieldTokenSelectorWrapper(TokenSelector defaultSelector) { |
| + this.defaultSelector = defaultSelector; |
| + } |
| + |
| + /** Add a token selector for the named field */ |
| + public void addSelector(String fieldName, TokenSelector selector) { |
| + selectors.put(fieldName, selector); |
| + } |
| + |
| + /** Determine if token is accepted by fieldName */ |
| + public boolean accept(String fieldName, Token token) { |
| + TokenSelector selector = (TokenSelector) selectors.get(fieldName); |
| + if (selector!=null) |
| + return selector.accept(fieldName, token); |
| + else if (defaultSelector!=null) |
| + return defaultSelector.accept(fieldName, token); |
| + else |
| + return true; |
| + } |
| + |
| +} |
| \ No newline at end of file |
| Index: src/java/org/apache/lucene/index/IndexWriter.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/IndexWriter.java (revision 414705) |
| +++ src/java/org/apache/lucene/index/IndexWriter.java (working copy) |
| @@ -17,6 +17,7 @@ |
| */ |
| |
| import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.TokenSelector; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.search.Similarity; |
| import org.apache.lucene.store.Directory; |
| @@ -56,7 +57,7 @@ |
| @see IndexModifier IndexModifier supports the important methods of IndexWriter plus deletion |
| */ |
| |
| -public class IndexWriter { |
| +public class IndexWriter implements Writable { |
| |
| /** |
| * Default value for the write lock timeout (1,000). |
| @@ -100,8 +101,10 @@ |
| */ |
| public final static int DEFAULT_TERM_INDEX_INTERVAL = 128; |
| |
| - private Directory directory; // where this index resides |
| - private Analyzer analyzer; // how to analyze text |
| + private Directory directory; // where this index resides |
| + private Analyzer analyzer; // how to analyze text |
| + private TokenSelector termVectorTokenSelector; // subset of token stream stored in term vectors |
| + private TokenSelector positionsTokenSelector; // subset of token stream for which positions are stored |
| |
| private Similarity similarity = Similarity.getDefault(); // how to normalize |
| |
| @@ -153,6 +156,38 @@ |
| return this.similarity; |
| } |
| |
| + /** Expert: Set the TokenSelector used to determine subset of tokens stored in term vectors. |
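| + * <p>For example (illustrative), to restrict term vectors to tokens that begin with a |
| + * lower-case character: |
| + * <pre> |
| + *   writer.setTermVectorTokenSelector(new TokenSelector() { |
| + *     public boolean accept(String field, Token t) { |
| + *       return Character.isLowerCase(t.termText().charAt(0)); |
| + *     } |
| + *   }); |
| + * </pre> |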
| + * @param selector the term vector TokenSelector |
| + */ |
| + public void setTermVectorTokenSelector(TokenSelector selector) { |
| + this.termVectorTokenSelector = selector; |
| + } |
| + |
| + /** Expert: Get the TokenSelector used to determine the subset of tokens stored in term vectors. |
| + * @return the TokenSelector used to determine term vector tokens |
| + */ |
| + public TokenSelector getTermVectorTokenSelector() { |
| + return termVectorTokenSelector; |
| + } |
| + |
| + /** Expert: Set the TokenSelector used to determine subset of tokens for which positions are stored. |
| + * (At least one position is always stored for each term in each doc to ensure the term stays in |
| + * the index so long as any docs reference it) |
| + * @param selector the positions TokenSelector |
| + */ |
| + public void setPositionsTokenSelector(TokenSelector selector) { |
| + this.positionsTokenSelector = selector; |
| + } |
| + |
| + /** Expert: Get the TokenSelector used to determine the subset of tokens for which freq and positions are stored. |
| + * (At least one position is always stored for each term in each doc to ensure the term stays in |
| + * the index so long as any docs reference it) |
| + * @return the positions TokenSelector |
| + */ |
| + public TokenSelector getPositionsTokenSelector() { |
| + return positionsTokenSelector; |
| + } |
| + |
| /** Expert: Set the interval between indexed terms. Large values cause less |
| * memory to be used by IndexReader, but slow random-access to terms. Small |
| * values cause more memory to be used by an IndexReader, and speed |
| @@ -471,6 +506,8 @@ |
| public void addDocument(Document doc, Analyzer analyzer) throws IOException { |
| DocumentWriter dw = |
| new DocumentWriter(ramDirectory, analyzer, this); |
| + dw.setTermVectorTokenSelector(termVectorTokenSelector); |
| + dw.setPositionsTokenSelector(positionsTokenSelector); |
| dw.setInfoStream(infoStream); |
| String segmentName = newSegmentName(); |
| dw.addDocument(segmentName, doc); |
| Index: src/java/org/apache/lucene/index/Writable.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/Writable.java (revision 0) |
| +++ src/java/org/apache/lucene/index/Writable.java (revision 0) |
| @@ -0,0 +1,262 @@ |
| +/* |
| + * Writable.java |
| + * |
| + * Created on April 28, 2006, 6:10 PM |
| + * |
| + */ |
| + |
| +package org.apache.lucene.index; |
| + |
| +import java.io.IOException; |
| +import java.io.PrintStream; |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.TokenSelector; |
| +import org.apache.lucene.document.Document; |
| +import org.apache.lucene.search.Similarity; |
| +import org.apache.lucene.store.Directory; |
| + |
| +/** |
| + * An interface that abstracts index writers (e.g., IndexWriter, ParallelWriter) |
| + * |
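| + * <p>An illustrative sketch of writing against this interface rather than a concrete class |
| + * (the directory, analyzer, and document variables are assumed to exist): |
| + * <pre> |
| + *   Writable writer = new IndexWriter(directory, analyzer, true);  // or a ParallelWriter |
| + *   writer.setUseCompoundFile(true); |
| + *   writer.addDocument(doc); |
| + *   writer.optimize(); |
| + *   writer.close(); |
| + * </pre> |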
| + * @author Chuck Williams |
| + */ |
| +public interface Writable { |
| + |
| + /** |
| + * Adds a document to this index. If the document contains more than |
| + * {@link #setMaxFieldLength(int)} terms for a given field, the remainder are |
| + * discarded. |
| + */ |
| + public void addDocument(Document doc) throws IOException; |
| + |
| + /** |
| + * Adds a document to this index, using the provided analyzer instead of the |
| + * value of {@link #getAnalyzer()}. If the document contains more than |
| + * {@link #setMaxFieldLength(int)} terms for a given field, the remainder are |
| + * discarded. |
| + */ |
| + public void addDocument(Document doc, Analyzer analyzer) throws IOException; |
| + |
| + /** |
| + * Returns the number of documents currently in this index. |
| + */ |
| + public int docCount(); |
| + |
| + /** |
| + * Merges all segments together into a single segment, optimizing an index |
| + * for search. |
| + */ |
| + public void optimize() throws IOException; |
| + |
| + /** |
| + * Flushes all changes to an index and closes all associated files. |
| + */ |
| + public void close() throws IOException; |
| + |
| + /** |
| + * Returns the analyzer used by this index. |
| + */ |
| + public Analyzer getAnalyzer(); |
| + |
| + |
| + /** |
| + * Setting to turn on usage of a compound file. When on, multiple files |
| + * for each segment are merged into a single file once the segment creation |
| + * is finished. This is done regardless of what directory is in use. |
| + */ |
| + public void setUseCompoundFile(boolean value); |
| + |
| + /** |
| + * Get the current setting of whether to use the compound file format. |
| + * Note that this just returns the value you set with setUseCompoundFile(boolean) |
| + * or the default. You cannot use this to query the status of an existing index. |
| + * |
| + * @see #setUseCompoundFile(boolean) |
| + */ |
| + public boolean getUseCompoundFile(); |
| + |
| + /** |
| + * Expert: Set the Similarity implementation used by this IndexWriter. |
| + * |
| + * @see Similarity#setDefault(Similarity) |
| + */ |
| + public void setSimilarity(Similarity similarity); |
| + |
| + /** |
| + * Expert: Return the Similarity implementation used by this IndexWriter. |
| + * |
| + * <p>This defaults to the current value of {@link Similarity#getDefault()}. |
| + */ |
| + public Similarity getSimilarity(); |
| + |
| + /** |
| + * Expert: Set the interval between indexed terms. Large values cause less |
| + * memory to be used by IndexReader, but slow random-access to terms. Small |
| + * values cause more memory to be used by an IndexReader, and speed |
| + * random-access to terms. |
| + * |
| + * This parameter determines the amount of computation required per query |
| + * term, regardless of the number of documents that contain that term. In |
| + * particular, it is the maximum number of other terms that must be |
| + * scanned before a term is located and its frequency and position information |
| + * may be processed. In a large index with user-entered query terms, query |
| + * processing time is likely to be dominated not by term lookup but rather |
| + * by the processing of frequency and positional data. In a small index |
| + * or when many uncommon query terms are generated (e.g., by wildcard |
| + * queries) term lookup may become a dominant cost. |
| + * |
| + * In particular, <code>numUniqueTerms/interval</code> terms are read into |
| + * memory by an IndexReader, and, on average, <code>interval/2</code> terms |
| + * must be scanned for each random term access. |
| + * |
| + * @see #DEFAULT_TERM_INDEX_INTERVAL |
| + */ |
| + public void setTermIndexInterval(int interval); |
| + |
| + /** |
| + * Expert: Return the interval between indexed terms. |
| + * |
| + * @see #setTermIndexInterval(int) |
| + */ |
| + public int getTermIndexInterval(); |
| + |
| + /** |
| + * Determines the minimal number of documents required before the buffered |
| + * in-memory documents are merged and a new Segment is created. |
| + * Since Documents are merged in a {@link org.apache.lucene.store.RAMDirectory}, |
| + * a large value gives faster indexing. At the same time, mergeFactor limits |
| + * the number of files open in a FSDirectory. |
| + * |
| + * <p> The default value is 10. |
| + * |
| + * |
| + * @throws IllegalArgumentException if maxBufferedDocs is smaller than 2 |
| + */ |
| + public void setMaxBufferedDocs(int maxBufferedDocs); |
| + |
| + /** |
| + * |
| + * |
| + * @see #setMaxBufferedDocs |
| + */ |
| + public int getMaxBufferedDocs(); |
| + |
| + /** |
| + * The maximum number of terms that will be indexed for a single field in a |
| + * document. This limits the amount of memory required for indexing, so that |
| + * collections with very large files will not crash the indexing process by |
| + * running out of memory.<p/> |
| + * Note that this effectively truncates large documents, excluding from the |
| + * index terms that occur further in the document. If you know your source |
| + * documents are large, be sure to set this value high enough to accommodate |
| + * the expected size. If you set it to Integer.MAX_VALUE, then the only limit |
| + * is your memory, but you should anticipate an OutOfMemoryError.<p/> |
| + * By default, no more than 10,000 terms will be indexed for a field. |
| + */ |
| + public void setMaxFieldLength(int maxFieldLength); |
| + |
| + /** |
| + * |
| + * |
| + * @see #setMaxFieldLength |
| + */ |
| + public int getMaxFieldLength(); |
| + |
| + /** |
| + * Determines the largest number of documents ever merged by addDocument(). |
| + * Small values (e.g., less than 10,000) are best for interactive indexing, |
| + * as this limits the length of pauses while indexing to a few seconds. |
| + * Larger values are best for batched indexing and speedier searches. |
| + * |
| + * <p>The default value is {@link Integer#MAX_VALUE}. |
| + */ |
| + public void setMaxMergeDocs(int maxMergeDocs); |
| + |
| + /** |
| + * |
| + * |
| + * @see #setMaxMergeDocs |
| + */ |
| + public int getMaxMergeDocs(); |
| + |
| + /** |
| + * Determines how often segment indices are merged by addDocument(). With |
| + * smaller values, less RAM is used while indexing, and searches on |
| + * unoptimized indices are faster, but indexing speed is slower. With larger |
| + * values, more RAM is used during indexing, and while searches on unoptimized |
| + * indices are slower, indexing is faster. Thus larger values (> 10) are best |
| + * for batch index creation, and smaller values (< 10) for indices that are |
| + * interactively maintained. |
| + * |
| + * <p>This must never be less than 2. The default value is 10. |
| + */ |
| + public void setMergeFactor(int mergeFactor); |
| + |
| + /** |
| + * |
| + * |
| + * @see #setMergeFactor |
| + */ |
| + public int getMergeFactor(); |
| + |
| + /** |
| + * Sets the maximum time to wait for a write lock (in milliseconds). |
| + */ |
| + public void setWriteLockTimeout(long writeLockTimeout); |
| + |
| + /** |
| + * |
| + * |
| + * @see #setWriteLockTimeout |
| + */ |
| + public long getWriteLockTimeout(); |
| + |
| + /** |
| + * Sets the maximum time to wait for a commit lock (in milliseconds). |
| + */ |
| + public void setCommitLockTimeout(long commitLockTimeout); |
| + |
| + /** |
| + * |
| + * |
| + * @see #setCommitLockTimeout |
| + */ |
| + public long getCommitLockTimeout(); |
| + |
| + /** Expert: Set the TokenSelector used to determine subset of tokens stored in term vectors. |
| + * @param selector the term vector TokenSelector |
| + */ |
| + public void setTermVectorTokenSelector(TokenSelector selector); |
| + |
| + /** Expert: Get the TokenSelector used to determine the subset of tokens stored in term vectors. |
| + * @return the TokenSelector used to determine term vector tokens |
| + */ |
| + public TokenSelector getTermVectorTokenSelector(); |
| + |
| + /** Expert: Set the TokenSelector used to determine subset of tokens for which positions are stored. |
| + * (At least one position is always stored for each term in each doc to ensure the term stays in |
| + * the index so long as any docs reference it) |
| + * @param selector the positions TokenSelector |
| + */ |
| + public void setPositionsTokenSelector(TokenSelector selector); |
| + |
| + /** Expert: Get the TokenSelector used to determine the subset of tokens for which freq and positions are stored. |
| + * (At least one position is always stored for each term in each doc to ensure the term stays in |
| + * the index so long as any docs reference it) |
| + * @return the positions TokenSelector |
| + */ |
| + public TokenSelector getPositionsTokenSelector(); |
| + |
| + /** If non-null, information about merges and a message when |
| + * maxFieldLength is reached will be printed to this. |
| + */ |
| + public void setInfoStream(PrintStream infoStream); |
| + |
| + /** |
| + * |
| + * |
| + * @see #setInfoStream |
| + */ |
| + public PrintStream getInfoStream(); |
| + |
| +} |
| Index: src/java/org/apache/lucene/index/DocumentWriter.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/DocumentWriter.java (revision 414705) |
| +++ src/java/org/apache/lucene/index/DocumentWriter.java (working copy) |
| @@ -17,6 +17,7 @@ |
| */ |
| |
| import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.TokenSelector; |
| import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.document.Document; |
| @@ -35,6 +36,8 @@ |
| |
| final class DocumentWriter { |
| private Analyzer analyzer; |
| + private TokenSelector termVectorTokenSelector; |
| + private TokenSelector positionsTokenSelector; |
| private Directory directory; |
| private Similarity similarity; |
| private FieldInfos fieldInfos; |
| @@ -142,9 +145,9 @@ |
| if (!field.isTokenized()) { // un-tokenized field |
| String stringValue = field.stringValue(); |
| if(field.isStoreOffsetWithTermVector()) |
| - addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length())); |
| + addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length()), false, false); |
| else |
| - addPosition(fieldName, stringValue, position++, null); |
| + addPosition(fieldName, stringValue, position++, null, false, false); |
| offset += stringValue.length(); |
| length++; |
| } else |
| @@ -165,10 +168,16 @@ |
| for (Token t = stream.next(); t != null; t = stream.next()) { |
| position += (t.getPositionIncrement() - 1); |
| |
| - if(field.isStoreOffsetWithTermVector()) |
| - addPosition(fieldName, t.termText(), position++, new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset())); |
| - else |
| - addPosition(fieldName, t.termText(), position++, null); |
| + boolean omittv = false, omitpos = false; |
| + if (termVectorTokenSelector!=null && !termVectorTokenSelector.accept(field.name(), t)) |
| + omittv = true; |
| + if (positionsTokenSelector != null && !positionsTokenSelector.accept(field.name(), t)) |
| + omitpos = true; |
| + |
| + addPosition(fieldName, t.termText(), position++, |
| + field.isStoreOffsetWithTermVector() && !omittv ? new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset()) |
| + : null, |
| + omittv, omitpos); |
| |
| lastToken = t; |
| if (++length > maxFieldLength) { |
| @@ -196,20 +205,24 @@ |
| |
| private final Term termBuffer = new Term("", ""); // avoid consing |
| |
| - private final void addPosition(String field, String text, int position, TermVectorOffsetInfo offset) { |
| + private final void addPosition(String field, String text, int position, TermVectorOffsetInfo offset, |
| + boolean omitFromTermVector, boolean omitPosition) { |
| termBuffer.set(field, text); |
| //System.out.println("Offset: " + offset); |
| Posting ti = (Posting) postingTable.get(termBuffer); |
| if (ti != null) { // word seen before |
| int freq = ti.freq; |
| - if (ti.positions.length == freq) { // positions array is full |
| - int[] newPositions = new int[freq * 2]; // double size |
| - int[] positions = ti.positions; |
| - for (int i = 0; i < freq; i++) // copy old positions to new |
| - newPositions[i] = positions[i]; |
| - ti.positions = newPositions; |
| + |
| + if (!omitPosition) { |
| + if (ti.positions.length == freq) { // positions array is full |
| + int[] newPositions = new int[freq * 2]; // double size |
| + int[] positions = ti.positions; |
| + for (int i = 0; i < freq; i++) // copy old positions to new |
| + newPositions[i] = positions[i]; |
| + ti.positions = newPositions; |
| + } |
| + ti.positions[freq] = position; // add new position |
| } |
| - ti.positions[freq] = position; // add new position |
| |
| if (offset != null) { |
| if (ti.offsets.length == freq){ |
| @@ -223,10 +236,12 @@ |
| } |
| ti.offsets[freq] = offset; |
| } |
| - ti.freq = freq + 1; // update frequency |
| - } else { // word not seen before |
| + |
| + if (!omitPosition) |
| + ti.freq = freq + 1; // update frequency |
| + } else { // word not seen before |
| Term term = new Term(field, text, false); |
| - postingTable.put(term, new Posting(term, position, offset)); |
| + postingTable.put(term, new Posting(term, position, offset, omitFromTermVector)); |
| } |
| } |
| |
| @@ -351,7 +366,7 @@ |
| termVectorWriter.closeField(); |
| } |
| } |
| - if (termVectorWriter != null && termVectorWriter.isFieldOpen()) { |
| + if (termVectorWriter != null && termVectorWriter.isFieldOpen() && !posting.omitFromTermVector) { |
| termVectorWriter.addTerm(posting.term.text(), postingFreq, posting.positions, posting.offsets); |
| } |
| } |
| @@ -390,6 +405,16 @@ |
| this.infoStream = infoStream; |
| } |
| |
| + /** If non-null, this will be used to select which tokens are stored in term vectors */ |
| + void setTermVectorTokenSelector(TokenSelector selector) { |
| + this.termVectorTokenSelector = selector; |
| + } |
| + |
| + /** If non-null, this will be used to select which tokens have positions stored in the index. */ |
| + void setPositionsTokenSelector(TokenSelector selector) { |
| + this.positionsTokenSelector = selector; |
| + } |
| + |
| } |
| |
| final class Posting { // info about a Term in a doc |
| @@ -397,17 +422,17 @@ |
| int freq; // its frequency in doc |
| int[] positions; // positions it occurs at |
| TermVectorOffsetInfo [] offsets; |
| + boolean omitFromTermVector; // if true, omit from term vector |
| |
| - Posting(Term t, int position, TermVectorOffsetInfo offset) { |
| + Posting(Term t, int position, TermVectorOffsetInfo offset, boolean omitFromTermVector) { |
| term = t; |
| freq = 1; |
| positions = new int[1]; |
| positions[0] = position; |
| - if(offset != null){ |
| - offsets = new TermVectorOffsetInfo[1]; |
| - offsets[0] = offset; |
| + if(offset != null) { |
| + offsets = new TermVectorOffsetInfo[1]; |
| + offsets[0] = offset; |
| } |
| - else |
| - offsets = null; |
| + this.omitFromTermVector = omitFromTermVector; |
| } |
| } |
| Index: src/java/org/apache/lucene/index/ParallelWriter.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/ParallelWriter.java (revision 0) |
| +++ src/java/org/apache/lucene/index/ParallelWriter.java (revision 0) |
| @@ -0,0 +1,345 @@ |
| +/* |
| + * ParallelWriter.java |
| + * |
| + * Created on April 28, 2006, 7:07 PM |
| + * |
| + */ |
| + |
| +package org.apache.lucene.index; |
| + |
| +import java.io.IOException; |
| +import java.io.PrintStream; |
| +import java.util.Enumeration; |
| +import java.util.HashMap; |
| +import java.util.List; |
| +import java.util.Map; |
| +import java.util.concurrent.CountDownLatch; |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.TokenSelector; |
| +import org.apache.lucene.document.Document; |
| +import org.apache.lucene.document.Field; |
| +import org.apache.lucene.search.Similarity; |
| +import org.apache.lucene.store.Directory; |
| + |
| +/** |
| + * ParallelWriter is a companion to ParallelReader, although as with IndexWriter it only supports indexes stored in a Directory. |
| + * The interface is at the field level. A map from directories to lists of fields is provided to create the ParallelWriter, |
| + * which then creates an IndexWriter for each specified directory and operates on each field of a document using the IndexWriter |
| + * for the directory to which that field is mapped. This mapping allows an application to configure its use of parallel sub- |
| + * indexes independently from the rest of its processing. |
| + * |
| + * This implementation single-threads calls to addDocument(), but does the sub-document writes in parallel. Users of this class |
| + * must ensure that the ParallelReader is never reopened while adding a new document, and must deal with recovery if exceptions |
| + * occur while adding a document. |
| + * |
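| + * <p>A minimal usage sketch (the directories, fields, and document are illustrative): |
| + * <pre> |
| + *   Map&lt;Directory,List&lt;String&gt;&gt; directoryFieldsMap = new HashMap&lt;Directory,List&lt;String&gt;&gt;(); |
| + *   directoryFieldsMap.put(dir1, Arrays.asList("title", "body")); |
| + *   directoryFieldsMap.put(dir2, Arrays.asList("meta")); |
| + *   ParallelWriter writer = new ParallelWriter(directoryFieldsMap, new StandardAnalyzer(), true); |
| + *   writer.addDocument(doc); |
| + *   writer.close(); |
| + * </pre> |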
| + * @author Chuck Williams |
| + */ |
| +public class ParallelWriter implements Writable { |
| + |
| + IndexWriter[] writers; // All IndexWriters |
| + IOException exception; // If any writer gets an exception, this is stored here (only one needed) |
| + Map<String,IndexWriter> writerMap; // Field name --> IndexWriter that stores that field |
| + IndexWriter oneWriter; // An arbitrarily chosen IndexWriter -- used to get config info which is the same for all IndexWriters |
| + Analyzer analyzer; // The Analyzer applied to all tokenized field content |
| + |
| + private static final Document EMPTY_DOCUMENT = new Document(); // Empty document used to sync doc id's when a document is added without fields for all indexes |
| + |
| + /** |
| + * Create a new ParallelWriter |
| + * |
| + * @param directoryFieldsMap specifies the directory to use to store each field, multiple directories creating parallel indexes |
| + * @param analyzer applied to all tokenized field content |
| + * @param create create new indexes in directories iff true |
| + * @throws IOException if the IndexWriters cannot be created |
| + */ |
| + public ParallelWriter(Map<Directory,List<String>> directoryFieldsMap, Analyzer analyzer, boolean create) throws IOException { |
| + this.analyzer = analyzer; |
| + writers = new IndexWriter[directoryFieldsMap.size()]; |
| + writerMap = new HashMap<String,IndexWriter>(directoryFieldsMap.size()*5/3); |
| + int i=0; |
| + for (Map.Entry<Directory,List<String>> entry : directoryFieldsMap.entrySet()) { |
| + IndexWriter writer = new IndexWriter(entry.getKey(), analyzer, create); |
| + writers[i++] = oneWriter = writer; |
| + for (String field : entry.getValue()) |
| + writerMap.put(field, writer); |
| + } |
| + } |
| + |
| + /** Invert a directoryFieldsMap |
| + * @param directoryFieldsMap a map from directories to lists of the fields they contain |
| + * @return a map from each field to its directory |
| + */ |
| + public static Map<String, Directory> invertDirectoryFieldsMap(Map<Directory,List<String>> directoryFieldsMap) { |
| + Map<String, Directory> fieldDirectoryMap = new HashMap<String, Directory>(); |
| + for (Map.Entry<Directory, List<String>> entry : directoryFieldsMap.entrySet()) |
| + for (String field : entry.getValue()) |
| + fieldDirectoryMap.put(field, entry.getKey()); |
| + return fieldDirectoryMap; |
| + } |
| + |
| + /** Add document to this index by adding subdocuments with the mapped fields for each parallel index. This method is synchronized because the |
| + * parallel indexes must be maintained such that equal doc id's in different indexes hold fields for the same document. |
| + * This synchronization could have a negative effect on batch indexing performance. Users of this method must ensure that the ParallelReader |
| + * is not re-opened within the scope of this method as it would likely find the sub-indexes out of sync. |
| + * @param doc the document to add |
| + * @throws IOException if there are problems writing the indexes. <strong>WARNING: If this happens it is bad.</strong> The doc-id's in the |
| + * indexes are likely out of sync. This situation requires repair to resync the doc ids in each document set. Possible |
| + * repair actions include rebuilding the indexes or deleting documents at the end to restore equal document sets and then |
| + * optimizing to restore equal doc ids. |
| + * @throws RuntimeException if the threads writing to the sub-indexes are interrupted. |
| + */ |
| + public void addDocument(Document doc) throws IOException { |
| + addDocument(doc, analyzer); |
| + } |
| + |
| + /** Add document to this index by adding subdocuments with the mapped fields for each parallel index. This method is synchronized because the |
| + * parallel indexes must be maintained such that equal doc id's in different indexes hold fields for the same document. |
| + * This synchronization could have a negative effect on batch indexing performance. Users of this method must ensure that the ParallelReader |
| + * is not re-opened within the scope of this method as it would likely find the sub-indexes out of sync. |
| + * @param doc the document to add |
| + * @param analyzer apply special analyzer to this document rather than the one for the index (discouraged -- use addDocument(doc)) |
| + * @throws IOException if there are problems writing the indexes. <strong>WARNING: If this happens it is bad.</strong> The doc-id's in the |
| + * indexes are likely out of sync. This situation requires repair to resync the doc ids in each document set. Possible |
| + * repair actions include rebuilding the indexes or deleting documents at the end to restore equal document sets and then |
| + * optimizing to restore equal doc ids. |
| + * @throws RuntimeException if the threads writing to the sub-indexes are interrupted. |
| + */ |
| + public synchronized void addDocument(Document doc, Analyzer analyzer) throws IOException { |
| + Map<IndexWriter,Document> documentMap = new HashMap<IndexWriter,Document>(writers.length*5/3); |
| + Enumeration<Field> fields = doc.fields(); |
| + while (fields.hasMoreElements()) { |
| + Field field = fields.nextElement(); |
| + IndexWriter writer = writerMap.get(field.name()); |
| + if (writer==null) |
| + throw new RuntimeException(new UnknownFieldException("Unregistered field: " + field.name())); |
| + Document subdoc = documentMap.get(writer); |
| + if (subdoc==null) |
| + documentMap.put(writer, subdoc = new Document()); |
| + subdoc.add(field); |
| + } |
| + CountDownLatch latch = new CountDownLatch(writers.length); |
| + exception = null; |
| + for (IndexWriter writer : writers) { |
| + Document subdoc = documentMap.get(writer); |
| + if (subdoc==null) // Must have a document in each parallel index to sync doc id's |
| + subdoc = EMPTY_DOCUMENT; |
| + new Thread(new WriterWorker(writer, subdoc, latch)).start(); |
| + } |
| + try { |
| + latch.await(); |
| + } catch (InterruptedException e) { |
| + throw new RuntimeException("Interrupted while writing subdocuments!", e); |
| + } |
| + if (exception != null) |
| + throw exception; |
| + } |
| + |
| + // Write a sub-document to a sub-index and record any exception |
| + private class WriterWorker implements Runnable { |
| + |
| + private IndexWriter writer; |
| + private Document document; |
| + private CountDownLatch latch; |
| + |
| + private WriterWorker(IndexWriter writer, Document document, CountDownLatch latch) { |
| + this.writer = writer; |
| + this.document = document; |
| + this.latch = latch; |
| + } |
| + |
| + public void run() { |
| + try { |
| + writer.addDocument(document); |
| + } catch (IOException e) { |
| + exception = e; |
| + } finally { |
| + latch.countDown(); |
| + } |
| + } |
| + |
| + } |
| + |
| + /** Obtain the number of documents in this index, which is the same for each parallel index. */ |
| + public int docCount() { |
| + return oneWriter.docCount(); |
| + } |
| + |
| + /** Optimize all parallel indexes. This is synchronized to keep all index doc-id's synced up */ |
| + public synchronized void optimize() throws IOException { |
| + for (IndexWriter writer : writers) |
| + writer.optimize(); |
| + } |
| + |
| + /** Close all parallel indexes. Note that the provided directories are not closed. Synchronized. */ |
| + public synchronized void close() throws IOException { |
| + for (IndexWriter writer : writers) |
| + writer.close(); |
| + } |
| + |
| + /** Getter for analyzer provided to the constructor */ |
| + public Analyzer getAnalyzer() { |
| + return analyzer; |
| + } |
| + |
| + /** Set whether or not to use compound file format in every parallel index */ |
| + public void setUseCompoundFile(boolean value) { |
| + for (IndexWriter writer : writers) |
| + writer.setUseCompoundFile(value); |
| + } |
| + |
| + /** Get the compound file usage decision, same for every parallel index */ |
| + public boolean getUseCompoundFile() { |
| + return oneWriter.getUseCompoundFile(); |
| + } |
| + |
| + /** Set similarity to use for every parallel index */ |
| + public void setSimilarity(Similarity similarity) { |
| + for (IndexWriter writer : writers) |
| + writer.setSimilarity(similarity); |
| + } |
| + |
| + /** Get similarity, which is used by every parallel index */ |
| + public Similarity getSimilarity() { |
| + return oneWriter.getSimilarity(); |
| + } |
| + |
| + /** Set the termIndexInterval used for every parallel index */ |
| + public void setTermIndexInterval(int interval) { |
| + for (IndexWriter writer : writers) |
| + writer.setTermIndexInterval(interval); |
| + } |
| + |
| + /** Get the termIndexInterval, which is used by every parallel index */ |
| + public int getTermIndexInterval() { |
| + return oneWriter.getTermIndexInterval(); |
| + } |
| + |
| + /** Set maxBufferedDocs for every parallel index */ |
| + public void setMaxBufferedDocs(int maxBufferedDocs) { |
| + for (IndexWriter writer : writers) |
| + writer.setMaxBufferedDocs(maxBufferedDocs); |
| + } |
| + |
| + /** get maxBufferedDocs, same for every parallel index */ |
| + public int getMaxBufferedDocs() { |
| + return oneWriter.getMaxBufferedDocs(); |
| + } |
| + |
| + /** Set maxFieldLength to use for every parallel index */ |
| + public void setMaxFieldLength(int maxFieldLength) { |
| + for (IndexWriter writer : writers) |
| + writer.setMaxFieldLength(maxFieldLength); |
| + } |
| + |
| + /** Get maxFieldLength, same for every parallel index */ |
| + public int getMaxFieldLength() { |
| + return oneWriter.getMaxFieldLength(); |
| + } |
| + |
| + /** Set maxMergeDocs for every parallel index */ |
| + public void setMaxMergeDocs(int maxMergeDocs) { |
| + for (IndexWriter writer : writers) |
| + writer.setMaxMergeDocs(maxMergeDocs); |
| + } |
| + |
| + /** Get max merge docs, same for every parallel index */ |
| + public int getMaxMergeDocs() { |
| + return oneWriter.getMaxMergeDocs(); |
| + } |
| + |
| + /** Set merge factor for every parallel index */ |
| + public void setMergeFactor(int mergeFactor) { |
| + for (IndexWriter writer : writers) |
| + writer.setMergeFactor(mergeFactor); |
| + } |
| + |
| + /** Get merge factor, same for every parallel index */ |
| + public int getMergeFactor() { |
| + return oneWriter.getMergeFactor(); |
| + } |
| + |
| + /** Set write lock timeout (millis) for every parallel index */ |
| + public void setWriteLockTimeout(long writeLockTimeout) { |
| + for (IndexWriter writer : writers) |
| + writer.setWriteLockTimeout(writeLockTimeout); |
| + } |
| + |
| + /** Get write lock timeout, same for every parallel index */ |
| + public long getWriteLockTimeout() { |
| + return oneWriter.getWriteLockTimeout(); |
| + } |
| + |
| + /** Set commit lock timeout for every parallel index */ |
| + public void setCommitLockTimeout(long commitLockTimeout) { |
| + for (IndexWriter writer : writers) |
| + writer.setCommitLockTimeout(commitLockTimeout); |
| + } |
| + |
| + /** Get commit lock timeout, same for every parallel index */ |
| + public long getCommitLockTimeout() { |
| + return oneWriter.getCommitLockTimeout(); |
| + } |
| + |
| + /** Set the term vector TokenSelector for every parallel index */ |
| + public void setTermVectorTokenSelector(TokenSelector selector) { |
| + for (IndexWriter writer : writers) |
| + writer.setTermVectorTokenSelector(selector); |
| + } |
| + |
| + /** Get the term vector TokenSelector, which is the same for every parallel index */ |
| + public TokenSelector getTermVectorTokenSelector() { |
| + return oneWriter.getTermVectorTokenSelector(); |
| + } |
| + |
| + /** Set positions TokenSelector for every parallel index */ |
| + public void setPositionsTokenSelector(TokenSelector selector) { |
| + for (IndexWriter writer : writers) |
| + writer.setPositionsTokenSelector(selector); |
| + } |
| + |
| + /** Get positions TokenSelector, same for every parallel index */ |
| + public TokenSelector getPositionsTokenSelector() { |
| + return oneWriter.getPositionsTokenSelector(); |
| + } |
| + |
| + /** Unsupported. use setInfoStream(field, infoStream) */ |
| + public void setInfoStream(PrintStream infoStream) { |
| + throw new UnsupportedOperationException(); |
| + } |
| + |
| + /** Set an info stream for the IndexWriter managing a specified field. The info stream receives information about field truncations, merges, etc. |
| + * @param field the field whose writer to assign the info stream to |
| + * @param infoStream the info stream |
| + * @throws UnknownFieldException if field has not been associated with an IndexWriter in this index |
| + */ |
| + public void setInfoStream(String field, PrintStream infoStream) throws UnknownFieldException { |
| + IndexWriter writer = writerMap.get(field); |
| + if (writer==null) |
| + throw new UnknownFieldException("Unregistered field: " + field); |
| + writer.setInfoStream(infoStream); |
| + |
| + } |
| + |
| + /** Unsupported. use getInfoStream(field) */ |
| + public PrintStream getInfoStream() { |
| + throw new UnsupportedOperationException(); |
| + } |
| + |
| + /** Getter for info stream associated with field. See setInfoStream(field, infoStream). |
| + */ |
| + public PrintStream getInfoStream(String field) throws UnknownFieldException { |
| + IndexWriter writer = writerMap.get(field); |
| + if (writer==null) |
| + throw new UnknownFieldException("Unregistered field: " + field); |
| + return writer.getInfoStream(); |
| + } |
| + |
| + public static class UnknownFieldException extends Exception { |
| + |
| + private UnknownFieldException(String message) { |
| + super(message); |
| + } |
| + |
| + } |
| + |
| +} |