Index: CHANGES.txt
===================================================================
--- CHANGES.txt (revision 786867)
+++ CHANGES.txt (working copy)
@@ -424,7 +424,12 @@
28. LUCENE-1405: Added support for Ant resource collections in contrib/ant
<index> task. (Przemyslaw Sztoch via Erik Hatcher)
+
+29. LUCENE-1699: Allow setting a TokenStream on Field/Fieldable for indexing
+ in conjunction with any other ways to specify stored field values,
+ currently binary or string values. (yonik)
+
Optimizations
1. LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing
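A minimal sketch of the new combination the LUCENE-1699 entry describes, written against the Field API as patched below; the field name "data", the byte values, and the token text are illustrative, not part of the patch:

    import java.io.StringReader;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;

    // Illustrative: store binary bytes while indexing caller-supplied tokens,
    // a combination the old "exactly one value" contract ruled out.
    byte[] stored = new byte[] {1, 2, 3};                 // assumed payload
    Field f = new Field("data", stored, Field.Store.YES); // stored, not indexed yet
    f.setTokenStream(new WhitespaceTokenizer(new StringReader("tokens to index")));
    // setTokenStream() also flips isIndexed()/isTokenized() to true

    Document doc = new Document();
    doc.add(f);                                           // ready for IndexWriter.addDocument
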
Index: src/java/org/apache/lucene/document/Fieldable.java
===================================================================
--- src/java/org/apache/lucene/document/Fieldable.java (revision 786867)
+++ src/java/org/apache/lucene/document/Fieldable.java (working copy)
@@ -74,36 +74,41 @@
*/
String name();
- /** The value of the field as a String, or null. If null, the Reader value,
- * binary value, or TokenStream value is used. Exactly one of stringValue(),
- * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+ /** The value of the field as a String, or null.
+ * <p>
+ * For indexing, if isStored()==true, the stringValue() will be used as the stored field value
+ * unless isBinary()==true, in which case binaryValue() will be used.
+ * <p>
+ * If isIndexed()==true and isTokenized()==false, this String value will be indexed as a single token.
+ * If isIndexed()==true and isTokenized()==true, indexed tokens are generated from tokenStreamValue() if it is
+ * non-null, else from readerValue() if it is non-null, else by analyzing stringValue().
+ */
public String stringValue();
- /** The value of the field as a Reader, or null. If null, the String value,
- * binary value, or TokenStream value is used. Exactly one of stringValue(),
- * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+ /** The value of the field as a Reader, which can be used at index time to generate indexed tokens.
+ * @see #stringValue()
+ */
public Reader readerValue();
- /** The value of the field in Binary, or null. If null, the Reader value,
- * String value, or TokenStream value is used. Exactly one of stringValue(),
- * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+ /** The value of the field in binary form, or null.
+ * @see #stringValue()
+ */
public byte[] binaryValue();
- /** The value of the field as a TokenStream, or null. If null, the Reader value,
- * String value, or binary value is used. Exactly one of stringValue(),
- * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+ /** The TokenStream to be used when indexing this field, or null.
+ * @see #stringValue()
+ */
public TokenStream tokenStreamValue();
- /** True iff the value of the field is to be stored in the index for return
- with search hits. It is an error for this to be true if a field is
- Reader-valued. */
+ /** True if the value of the field is to be stored in the index for return
+ with search hits. */
boolean isStored();
- /** True iff the value of the field is to be indexed, so that it may be
+ /** True if the value of the field is to be indexed, so that it may be
searched on. */
boolean isIndexed();
- /** True iff the value of the field should be tokenized as text prior to
+ /** True if the value of the field should be tokenized as text prior to
indexing. Un-tokenized fields are indexed as a single word and may not be
Reader-valued. */
boolean isTokenized();
@@ -111,7 +116,7 @@
/** True if the value of the field is stored and compressed within the index */
boolean isCompressed();
- /** True iff the term or terms used to index this field are stored as a term
+ /** True if the term or terms used to index this field are stored as a term
* vector, available from {@link org.apache.lucene.index.IndexReader#getTermFreqVector(int,String)}.
* These methods do not provide access to the original content of the field,
* only to terms used to index it. If the original content must be
@@ -122,17 +127,17 @@
boolean isTermVectorStored();
/**
- * True iff terms are stored as term vector together with their offsets
+ * True if terms are stored as term vector together with their offsets
* (start and end positon in source text).
*/
boolean isStoreOffsetWithTermVector();
/**
- * True iff terms are stored as term vector together with their token positions.
+ * True if terms are stored as term vector together with their token positions.
*/
boolean isStorePositionWithTermVector();
- /** True iff the value of the filed is stored as binary */
+ /** True if the value of the field is stored as binary */
boolean isBinary();
/** True if norms are omitted for this indexed field */
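The precedence spelled out in the reworked stringValue() javadoc above amounts to the following; this is a sketch of the documented rules using a hypothetical consume() helper, not the code path Lucene's indexer actually uses:

    // Sketch of the documented precedence; illustrative only.
    void consume(Fieldable field, Analyzer analyzer) {
      if (field.isIndexed()) {
        if (!field.isTokenized()) {
          // index field.stringValue() verbatim as a single token
        } else if (field.tokenStreamValue() != null) {
          // use the pre-analyzed TokenStream as-is
        } else if (field.readerValue() != null) {
          // tokenize: analyzer.tokenStream(field.name(), field.readerValue())
        } else {
          // tokenize: analyzer.tokenStream(field.name(), new StringReader(field.stringValue()))
        }
      }
      if (field.isStored()) {
        // store field.binaryValue() when isBinary(), else field.stringValue()
      }
    }

Note that the stored value and the indexed token source are now resolved independently, which is what allows the stored-plus-TokenStream combinations this patch introduces.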
Index: src/java/org/apache/lucene/document/AbstractField.java
===================================================================
--- src/java/org/apache/lucene/document/AbstractField.java (revision 786867)
+++ src/java/org/apache/lucene/document/AbstractField.java (working copy)
@@ -16,7 +16,8 @@
*/
import org.apache.lucene.search.PhraseQuery; // for javadocs
import org.apache.lucene.search.spans.SpanQuery; // for javadocs
+import org.apache.lucene.analysis.TokenStream;
/**
@@ -38,9 +39,11 @@
protected boolean lazy = false;
protected boolean omitTermFreqAndPositions = false;
protected float boost = 1.0f;
- // the one and only data object for all different kind of field values
+ // the data object for all different kinds of field values
protected Object fieldsData = null;
- //length/offset for all primitive types
+ // pre-analyzed tokenStream for indexed fields
+ protected TokenStream tokenStream;
+ // length/offset for all primitive types
protected int binaryLength;
protected int binaryOffset;
Index: src/java/org/apache/lucene/document/Field.java
===================================================================
--- src/java/org/apache/lucene/document/Field.java (revision 786867)
+++ src/java/org/apache/lucene/document/Field.java (working copy)
@@ -94,7 +94,7 @@
/** Expert: Index the field's value without an Analyzer,
* and also disable the storing of norms. Note that you
* can also separately enable/disable norms by calling
- * {@link #setOmitNorms}. No norms means that
+ * {@link Field#setOmitNorms}. No norms means that
* index-time field and document boosting and field
* length normalization are disabled. The benefit is
* less memory usage as norms take up one byte of RAM
@@ -159,19 +159,19 @@
}
- /** The value of the field as a String, or null. If null, the Reader value,
- * binary value, or TokenStream value is used. Exactly one of stringValue(),
- * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
+ /** The value of the field as a String, or null. If null, the Reader value or
+ * binary value is used. Exactly one of stringValue(),
+ * readerValue(), and getBinaryValue() must be set. */
public String stringValue() { return fieldsData instanceof String ? (String)fieldsData : null; }
- /** The value of the field as a Reader, or null. If null, the String value,
- * binary value, or TokenStream value is used. Exactly one of stringValue(),
- * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
+ /** The value of the field as a Reader, or null. If null, the String value or
+ * binary value is used. Exactly one of stringValue(),
+ * readerValue(), and getBinaryValue() must be set. */
public Reader readerValue() { return fieldsData instanceof Reader ? (Reader)fieldsData : null; }
/** The value of the field in Binary, or null. If null, the Reader value,
- * String value, or TokenStream value is used. Exactly one of stringValue(),
- * readerValue(), getBinaryValue(), and tokenStreamValue() must be set.
+ * or String value is used. Exactly one of stringValue(),
+ * readerValue(), and getBinaryValue() must be set.
* @deprecated This method must allocate a new byte[] if
* the {@link AbstractField#getBinaryOffset()} is non-zero
* or {@link AbstractField#getBinaryLength()} is not the
@@ -191,10 +191,9 @@
return ret;
}
- /** The value of the field as a TokesStream, or null. If null, the Reader value,
- * String value, or binary value is used. Exactly one of stringValue(),
- * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
- public TokenStream tokenStreamValue() { return fieldsData instanceof TokenStream ? (TokenStream)fieldsData : null; }
+ /** The TokenStream to be used when indexing this field, or null. If null, the Reader value
+ * or String value is analyzed to produce the indexed tokens. */
+ public TokenStream tokenStreamValue() { return tokenStream; }
/** <p>Expert: change the value of this field. This can
@@ -204,10 +203,7 @@
* a single {@link Document} instance is re-used as
* well. This helps most on small documents.</p>
*
- * <p>Note that you should only use this method after the
- * Field has been consumed (ie, the {@link Document}
- * containing this Field has been added to the index).
- * Also, each Field instance should only be used once
+ * <p>Each Field instance should only be used once
* within a single {@link Document} instance. See <a
* href="http://wiki.apache.org/lucene-java/ImproveIndexingSpeed">ImproveIndexingSpeed</a>
* for details.</p> */
@@ -250,7 +246,8 @@
}
- /** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
+ /** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>.
+ * @deprecated use {@link #setTokenStream} */
public void setValue(TokenStream value) {
if (isBinary) {
throw new IllegalArgumentException("cannot set a TokenStream value on a binary field");
@@ -258,9 +255,18 @@
if (isStored) {
throw new IllegalArgumentException("cannot set a TokenStream value on a stored field");
}
- fieldsData = value;
+ fieldsData = null;
+ tokenStream = value;
}
+ /** Expert: sets the token stream to be used for indexing and causes isIndexed() and isTokenized() to return true.
+ * May be combined with stored values from stringValue() or binaryValue(). */
+ public void setTokenStream(TokenStream tokenStream) {
+ this.isIndexed = true;
+ this.isTokenized = true;
+ this.tokenStream = tokenStream;
+ }
+
/**
* Create a field by specifying its name, value and how it will
* be saved in the index. Term vectors will not be stored in the index.
@@ -459,8 +465,9 @@
throw new NullPointerException("tokenStream cannot be null");
this.name = name.intern(); // field names are interned
- this.fieldsData = tokenStream;
-
+ this.fieldsData = null;
+ this.tokenStream = tokenStream;
+
this.isStored = false;
this.isCompressed = false;
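With setValue(TokenStream) deprecated in favor of setTokenStream, per-document Field reuse looks roughly like this; `texts` and `writer` are assumed to exist, and unlike the deprecated setter, setTokenStream() is legal on a stored field:

    // Illustrative sketch: one Field reused across documents, each with its
    // own pre-analyzed TokenStream. setValue(TokenStream) would have thrown
    // on this stored field; setTokenStream() does not.
    Field f = new Field("body", "stored value", Field.Store.YES, Field.Index.ANALYZED);
    Document doc = new Document();
    doc.add(f);
    for (int i = 0; i < texts.length; i++) {   // texts: assumed String[]
      f.setTokenStream(new WhitespaceTokenizer(new StringReader(texts[i])));
      writer.addDocument(doc);                 // writer: assumed open IndexWriter
    }

The test added below exercises this same pattern with both binary-stored and string-stored fields.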
Index: src/test/org/apache/lucene/index/TestIndexWriter.java
===================================================================
--- src/test/org/apache/lucene/index/TestIndexWriter.java (revision 786867)
+++ src/test/org/apache/lucene/index/TestIndexWriter.java (working copy)
@@ -17,11 +17,7 @@
* limitations under the License.
*/
-import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.IOException;
-import java.io.PrintStream;
-import java.io.Reader;
+import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@@ -4350,4 +4346,66 @@
t.join();
assertFalse(t.failed);
}
+
+
+ public void testIndexStoreCombos() throws Exception {
+ MockRAMDirectory dir = new MockRAMDirectory();
+ IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED);
+ byte[] b = new byte[50];
+ for(int i=0;i<50;i++)
+ b[i] = (byte) (i+77);
+
+ Document doc = new Document();
+ Field f = new Field("binary", b, 10, 17, Field.Store.YES);
+ f.setTokenStream(new WhitespaceTokenizer(new StringReader("doc1field1")));
+ Field f2 = new Field("string", "value", Field.Store.YES,Field.Index.ANALYZED);
+ f2.setTokenStream(new WhitespaceTokenizer(new StringReader("doc1field2")));
+ doc.add(f);
+ doc.add(f2);
+ w.addDocument(doc);
+
+ // add 2 docs to test in-memory merging
+ f.setTokenStream(new WhitespaceTokenizer(new StringReader("doc2field1")));
+ f2.setTokenStream(new WhitespaceTokenizer(new StringReader("doc2field2")));
+ w.addDocument(doc);
+
+ // force segment flush so we can force a segment merge with doc3 later.
+ w.commit();
+
+ f.setTokenStream(new WhitespaceTokenizer(new StringReader("doc3field1")));
+ f2.setTokenStream(new WhitespaceTokenizer(new StringReader("doc3field2")));
+
+ w.addDocument(doc);
+ w.commit();
+ w.optimize(); // force segment merge.
+
+ IndexReader ir = IndexReader.open(dir);
+ doc = ir.document(0);
+ f = doc.getField("binary");
+ b = f.getBinaryValue();
+ assertTrue(b != null);
+ assertEquals(17, b.length);
+ assertEquals(87, b[0]);
+
+ assertTrue(ir.document(0).getFieldable("binary").isBinary());
+ assertTrue(ir.document(1).getFieldable("binary").isBinary());
+ assertTrue(ir.document(2).getFieldable("binary").isBinary());
+
+ assertEquals("value", ir.document(0).get("string"));
+ assertEquals("value", ir.document(1).get("string"));
+ assertEquals("value", ir.document(2).get("string"));
+
+
+ // test that the terms were indexed.
+ assertTrue(ir.termDocs(new Term("binary","doc1field1")).next());
+ assertTrue(ir.termDocs(new Term("binary","doc2field1")).next());
+ assertTrue(ir.termDocs(new Term("binary","doc3field1")).next());
+ assertTrue(ir.termDocs(new Term("string","doc1field2")).next());
+ assertTrue(ir.termDocs(new Term("string","doc2field2")).next());
+ assertTrue(ir.termDocs(new Term("string","doc3field2")).next());
+
+ ir.close();
+ dir.close();
+
+ }
}