| Index: CHANGES.txt |
| =================================================================== |
| --- CHANGES.txt (revision 786867) |
| +++ CHANGES.txt (working copy) |
| @@ -424,7 +424,12 @@ |
| |
| 28. LUCENE-1405: Added support for Ant resource collections in contrib/ant |
| <index> task. (Przemyslaw Sztoch via Erik Hatcher) |
| + |
| +29. LUCENE-1699: Allow setting a TokenStream on Field/Fieldable for indexing |
| + in conjunction with any other ways to specify stored field values, |
| + currently binary or string values. (yonik) |
| |
| + |
| Optimizations |
| |
| 1. LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing |
| Index: src/java/org/apache/lucene/document/Fieldable.java |
| =================================================================== |
| --- src/java/org/apache/lucene/document/Fieldable.java (revision 786867) |
| +++ src/java/org/apache/lucene/document/Fieldable.java (working copy) |
| @@ -74,36 +74,41 @@ |
| */ |
| String name(); |
| |
| - /** The value of the field as a String, or null. If null, the Reader value, |
| - * binary value, or TokenStream value is used. Exactly one of stringValue(), |
| - * readerValue(), binaryValue(), and tokenStreamValue() must be set. */ |
| + /** The value of the field as a String, or null. |
| + * <p> |
| + * For indexing, if isStored()==true, the stringValue() will be used as the stored field value |
| + * unless isBinary()==true, in which case binaryValue() will be used. |
| + * |
| + * If isIndexed()==true and isTokenized()==false, this String value will be indexed as a single token. |
| + * If isIndexed()==true and isTokenized()==true, then tokenStreamValue() will be used to generate indexed tokens if not null, |
| + * else readerValue() will be used to generate indexed tokens if not null, else stringValue() will be used to generate tokens. |
| + */ |
| public String stringValue(); |
| |
| - /** The value of the field as a Reader, or null. If null, the String value, |
| - * binary value, or TokenStream value is used. Exactly one of stringValue(), |
| - * readerValue(), binaryValue(), and tokenStreamValue() must be set. */ |
| + /** The value of the field as a Reader, which can be used at index time to generate indexed tokens. |
| + * @see #stringValue() |
| + */ |
| public Reader readerValue(); |
| |
| - /** The value of the field in Binary, or null. If null, the Reader value, |
| - * String value, or TokenStream value is used. Exactly one of stringValue(), |
| - * readerValue(), binaryValue(), and tokenStreamValue() must be set. */ |
| + /** The value of the field in Binary, or null. |
| + * @see #stringValue() |
| + */ |
| public byte[] binaryValue(); |
| |
| - /** The value of the field as a TokenStream, or null. If null, the Reader value, |
| - * String value, or binary value is used. Exactly one of stringValue(), |
| - * readerValue(), binaryValue(), and tokenStreamValue() must be set. */ |
| + /** The TokenStream for this field to be used when indexing, or null. |
| + * @see #stringValue() |
| + */ |
| public TokenStream tokenStreamValue(); |
| |
| - /** True iff the value of the field is to be stored in the index for return |
| - with search hits. It is an error for this to be true if a field is |
| - Reader-valued. */ |
| + /** True if the value of the field is to be stored in the index for return |
| + with search hits. */ |
| boolean isStored(); |
| |
| - /** True iff the value of the field is to be indexed, so that it may be |
| + /** True if the value of the field is to be indexed, so that it may be |
| searched on. */ |
| boolean isIndexed(); |
| |
| - /** True iff the value of the field should be tokenized as text prior to |
| + /** True if the value of the field should be tokenized as text prior to |
| indexing. Un-tokenized fields are indexed as a single word and may not be |
| Reader-valued. */ |
| boolean isTokenized(); |
| @@ -111,7 +116,7 @@ |
| /** True if the value of the field is stored and compressed within the index */ |
| boolean isCompressed(); |
| |
| - /** True iff the term or terms used to index this field are stored as a term |
| + /** True if the term or terms used to index this field are stored as a term |
| * vector, available from {@link org.apache.lucene.index.IndexReader#getTermFreqVector(int,String)}. |
| * These methods do not provide access to the original content of the field, |
| * only to terms used to index it. If the original content must be |
| @@ -122,17 +127,17 @@ |
| boolean isTermVectorStored(); |
| |
| /** |
| - * True iff terms are stored as term vector together with their offsets |
| + * True if terms are stored as term vector together with their offsets |
| * (start and end positon in source text). |
| */ |
| boolean isStoreOffsetWithTermVector(); |
| |
| /** |
| - * True iff terms are stored as term vector together with their token positions. |
| + * True if terms are stored as term vector together with their token positions. |
| */ |
| boolean isStorePositionWithTermVector(); |
| |
| - /** True iff the value of the filed is stored as binary */ |
| + /** True if the value of the field is stored as binary */ |
| boolean isBinary(); |
| |
| /** True if norms are omitted for this indexed field */ |
| Index: src/java/org/apache/lucene/document/AbstractField.java |
| =================================================================== |
| --- src/java/org/apache/lucene/document/AbstractField.java (revision 786867) |
| +++ src/java/org/apache/lucene/document/AbstractField.java (working copy) |
| @@ -16,7 +16,8 @@ |
| */ |
| |
| import org.apache.lucene.search.PhraseQuery; // for javadocs |
| -import org.apache.lucene.search.spans.SpanQuery; // for javadocs |
| +import org.apache.lucene.search.spans.SpanQuery; // for javadocs |
| +import org.apache.lucene.analysis.TokenStream; |
| |
| |
| /** |
| @@ -38,9 +39,11 @@ |
| protected boolean lazy = false; |
| protected boolean omitTermFreqAndPositions = false; |
| protected float boost = 1.0f; |
| - // the one and only data object for all different kind of field values |
| + // the data object for all different kind of field values |
| protected Object fieldsData = null; |
| - //length/offset for all primitive types |
| + // pre-analyzed tokenStream for indexed fields |
| + protected TokenStream tokenStream; |
| + // length/offset for all primitive types |
| protected int binaryLength; |
| protected int binaryOffset; |
| |
| Index: src/java/org/apache/lucene/document/Field.java |
| =================================================================== |
| --- src/java/org/apache/lucene/document/Field.java (revision 786867) |
| +++ src/java/org/apache/lucene/document/Field.java (working copy) |
| @@ -94,7 +94,7 @@ |
| /** Expert: Index the field's value without an Analyzer, |
| * and also disable the storing of norms. Note that you |
| * can also separately enable/disable norms by calling |
| - * {@link #setOmitNorms}. No norms means that |
| + * {@link Field#setOmitNorms}. No norms means that |
| * index-time field and document boosting and field |
| * length normalization are disabled. The benefit is |
| * less memory usage as norms take up one byte of RAM |
| @@ -159,19 +159,19 @@ |
| } |
| |
| |
| - /** The value of the field as a String, or null. If null, the Reader value, |
| - * binary value, or TokenStream value is used. Exactly one of stringValue(), |
| - * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */ |
| + /** The value of the field as a String, or null. If null, the Reader value or |
| + * binary value is used. Exactly one of stringValue(), |
| + * readerValue(), and getBinaryValue() must be set. */ |
| public String stringValue() { return fieldsData instanceof String ? (String)fieldsData : null; } |
| |
| - /** The value of the field as a Reader, or null. If null, the String value, |
| - * binary value, or TokenStream value is used. Exactly one of stringValue(), |
| - * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */ |
| + /** The value of the field as a Reader, or null. If null, the String value or |
| + * binary value is used. Exactly one of stringValue(), |
| + * readerValue(), and getBinaryValue() must be set. */ |
| public Reader readerValue() { return fieldsData instanceof Reader ? (Reader)fieldsData : null; } |
| |
| /** The value of the field in Binary, or null. If null, the Reader value, |
| - * String value, or TokenStream value is used. Exactly one of stringValue(), |
| - * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. |
| + * or String value is used. Exactly one of stringValue(), |
| + * readerValue(), and getBinaryValue() must be set. |
| * @deprecated This method must allocate a new byte[] if |
| * the {@link AbstractField#getBinaryOffset()} is non-zero |
| * or {@link AbstractField#getBinaryLength()} is not the |
| @@ -191,10 +191,9 @@ |
| return ret; |
| } |
| |
| - /** The value of the field as a TokesStream, or null. If null, the Reader value, |
| - * String value, or binary value is used. Exactly one of stringValue(), |
| - * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */ |
| - public TokenStream tokenStreamValue() { return fieldsData instanceof TokenStream ? (TokenStream)fieldsData : null; } |
| + /** The TokenStream for this field to be used when indexing, or null. If null, the Reader value |
| + * or String value is analyzed to produce the indexed tokens. */ |
| + public TokenStream tokenStreamValue() { return tokenStream; } |
| |
| |
| /** <p>Expert: change the value of this field. This can |
| @@ -204,10 +203,7 @@ |
| * a single {@link Document} instance is re-used as |
| * well. This helps most on small documents.</p> |
| * |
| - * <p>Note that you should only use this method after the |
| - * Field has been consumed (ie, the {@link Document} |
| - * containing this Field has been added to the index). |
| - * Also, each Field instance should only be used once |
| + * <p>Each Field instance should only be used once |
| * within a single {@link Document} instance. See <a |
| * href="http://wiki.apache.org/lucene-java/ImproveIndexingSpeed">ImproveIndexingSpeed</a> |
| * for details.</p> */ |
| @@ -250,7 +246,8 @@ |
| } |
| |
| |
| - /** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>. */ |
| + /** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>. |
| + * @deprecated use {@link #setTokenStream} */ |
| public void setValue(TokenStream value) { |
| if (isBinary) { |
| throw new IllegalArgumentException("cannot set a TokenStream value on a binary field"); |
| @@ -258,9 +255,18 @@ |
| if (isStored) { |
| throw new IllegalArgumentException("cannot set a TokenStream value on a stored field"); |
| } |
| - fieldsData = value; |
| + fieldsData = null; |
| + tokenStream = value; |
| } |
| |
| + /** Expert: supplies the pre-analyzed token stream used at index time; as a side effect isIndexed() and isTokenized() |
| + * become true. Stored values from stringValue() or binaryValue() may still be present alongside it. */ |
| + public void setTokenStream(TokenStream tokenStream) { |
| + this.tokenStream = tokenStream; |
| + isTokenized = true; |
| + isIndexed = true; |
| + } |
| + |
| /** |
| * Create a field by specifying its name, value and how it will |
| * be saved in the index. Term vectors will not be stored in the index. |
| @@ -459,8 +465,9 @@ |
| throw new NullPointerException("tokenStream cannot be null"); |
| |
| this.name = name.intern(); // field names are interned |
| - this.fieldsData = tokenStream; |
| - |
| + this.fieldsData = null; |
| + this.tokenStream = tokenStream; |
| + |
| this.isStored = false; |
| this.isCompressed = false; |
| |
| Index: src/test/org/apache/lucene/index/TestIndexWriter.java |
| =================================================================== |
| --- src/test/org/apache/lucene/index/TestIndexWriter.java (revision 786867) |
| +++ src/test/org/apache/lucene/index/TestIndexWriter.java (working copy) |
| @@ -17,11 +17,7 @@ |
| * limitations under the License. |
| */ |
| |
| -import java.io.ByteArrayOutputStream; |
| -import java.io.File; |
| -import java.io.IOException; |
| -import java.io.PrintStream; |
| -import java.io.Reader; |
| +import java.io.*; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.List; |
| @@ -4350,4 +4346,66 @@ |
| t.join(); |
| assertFalse(t.failed); |
| } |
| + |
| + |
| + public void testIndexStoreCombos() throws Exception { |
| + MockRAMDirectory dir = new MockRAMDirectory(); |
| + IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED); |
| + byte[] b = new byte[50]; |
| + for(int i=0;i<50;i++) |
| + b[i] = (byte) (i+77); |
| + |
| + Document doc = new Document(); |
| + Field f = new Field("binary", b, 10, 17, Field.Store.YES); |
| + f.setTokenStream(new WhitespaceTokenizer(new StringReader("doc1field1"))); |
| + Field f2 = new Field("string", "value", Field.Store.YES,Field.Index.ANALYZED); |
| + f2.setTokenStream(new WhitespaceTokenizer(new StringReader("doc1field2"))); |
| + doc.add(f); |
| + doc.add(f2); |
| + w.addDocument(doc); |
| + |
| + // add 2 docs to test in-memory merging |
| + f.setTokenStream(new WhitespaceTokenizer(new StringReader("doc2field1"))); |
| + f2.setTokenStream(new WhitespaceTokenizer(new StringReader("doc2field2"))); |
| + w.addDocument(doc); |
| + |
| + // force segment flush so we can force a segment merge with doc3 later. |
| + w.commit(); |
| + |
| + f.setTokenStream(new WhitespaceTokenizer(new StringReader("doc3field1"))); |
| + f2.setTokenStream(new WhitespaceTokenizer(new StringReader("doc3field2"))); |
| + |
| + w.addDocument(doc); |
| + w.commit(); |
| + w.optimize(); // force segment merge. |
| + |
| + IndexReader ir = IndexReader.open(dir); |
| + doc = ir.document(0); |
| + f = doc.getField("binary"); |
| + b = f.getBinaryValue(); |
| + assertTrue(b != null); |
| + assertEquals(17, b.length); // two-arg form: a third numeric argument would be taken as a tolerance, not an expected value |
| + assertEquals(87, b[0]); |
| + |
| + assertTrue(ir.document(0).getFieldable("binary").isBinary()); |
| + assertTrue(ir.document(1).getFieldable("binary").isBinary()); |
| + assertTrue(ir.document(2).getFieldable("binary").isBinary()); |
| + |
| + assertEquals("value", ir.document(0).get("string")); |
| + assertEquals("value", ir.document(1).get("string")); |
| + assertEquals("value", ir.document(2).get("string")); |
| + |
| + |
| + // test that the terms were indexed. |
| + assertTrue(ir.termDocs(new Term("binary","doc1field1")).next()); |
| + assertTrue(ir.termDocs(new Term("binary","doc2field1")).next()); |
| + assertTrue(ir.termDocs(new Term("binary","doc3field1")).next()); |
| + assertTrue(ir.termDocs(new Term("string","doc1field2")).next()); |
| + assertTrue(ir.termDocs(new Term("string","doc2field2")).next()); |
| + assertTrue(ir.termDocs(new Term("string","doc3field2")).next()); |
| + |
| + ir.close(); |
| + dir.close(); |
| + |
| + } |
| } |