blob: 4c9c334989549fede92a8c3fc7a5e2284f022b6c [file] [log] [blame]
Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java
===================================================================
--- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java (revision 1591787)
+++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java (working copy)
@@ -81,7 +81,7 @@
continue;
}
- final TokenStream stream = field.tokenStream(analyzer);
+ final TokenStream stream = field.tokenStream(analyzer, null);
// reset the TokenStream to the first token
stream.reset();
Index: lucene/core/src/java/org/apache/lucene/document/Field.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/document/Field.java (revision 1591787)
+++ lucene/core/src/java/org/apache/lucene/document/Field.java (working copy)
@@ -74,8 +74,6 @@
* customize how it's tokenized */
protected TokenStream tokenStream;
- private transient TokenStream internalTokenStream;
-
/**
* Field's boost
* @see #boost()
@@ -494,7 +492,7 @@
}
@Override
- public TokenStream tokenStream(Analyzer analyzer) throws IOException {
+ public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) throws IOException {
if (!fieldType().indexed()) {
return null;
}
@@ -501,12 +499,12 @@
final NumericType numericType = fieldType().numericType();
if (numericType != null) {
- if (!(internalTokenStream instanceof NumericTokenStream)) {
+ if (!(reuse instanceof NumericTokenStream && ((NumericTokenStream)reuse).getPrecisionStep() == type.numericPrecisionStep())) {
// lazy init the TokenStream as it is heavy to instantiate
// (attributes,...) if not needed (stored field loading)
- internalTokenStream = new NumericTokenStream(type.numericPrecisionStep());
+ reuse = new NumericTokenStream(type.numericPrecisionStep());
}
- final NumericTokenStream nts = (NumericTokenStream) internalTokenStream;
+ final NumericTokenStream nts = (NumericTokenStream) reuse;
// initialize value in TokenStream
final Number val = (Number) fieldsData;
switch (numericType) {
@@ -525,7 +523,7 @@
default:
throw new AssertionError("Should never get here");
}
- return internalTokenStream;
+ return reuse;
}
if (!fieldType().tokenized()) {
@@ -532,13 +530,13 @@
if (stringValue() == null) {
throw new IllegalArgumentException("Non-Tokenized Fields must have a String value");
}
- if (!(internalTokenStream instanceof StringTokenStream)) {
+ if (!(reuse instanceof StringTokenStream)) {
// lazy init the TokenStream as it is heavy to instantiate
// (attributes,...) if not needed (stored field loading)
- internalTokenStream = new StringTokenStream();
+ reuse = new StringTokenStream();
}
- ((StringTokenStream) internalTokenStream).setValue(stringValue());
- return internalTokenStream;
+ ((StringTokenStream) reuse).setValue(stringValue());
+ return reuse;
}
if (tokenStream != null) {
Index: lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java (revision 1591807)
+++ lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java (working copy)
@@ -511,6 +511,9 @@
// Lazy init'd:
NumericDocValuesWriter norms;
+
+ // reused
+ TokenStream tokenStream;
public PerField(FieldInfo fieldInfo, boolean invert) {
this.fieldInfo = fieldInfo;
@@ -574,7 +577,7 @@
*/
boolean aborting = false;
boolean succeededInProcessingField = false;
- try (TokenStream stream = field.tokenStream(docState.analyzer)) {
+ try (TokenStream stream = tokenStream = field.tokenStream(docState.analyzer, tokenStream)) {
// reset the TokenStream to the first token
stream.reset();
invertState.setAttributeSource(stream);
Index: lucene/core/src/java/org/apache/lucene/index/IndexDocument.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/index/IndexDocument.java (revision 1591787)
+++ lucene/core/src/java/org/apache/lucene/index/IndexDocument.java (working copy)
@@ -24,8 +24,8 @@
public interface IndexDocument {
/** Obtains all indexable fields in document */
- public Iterable<IndexableField> indexableFields();
+ public Iterable<? extends IndexableField> indexableFields();
/** Obtains all storable fields in document */
- public Iterable<StorableField> storableFields();
+ public Iterable<? extends StorableField> storableFields();
}
Index: lucene/core/src/java/org/apache/lucene/index/IndexableField.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/index/IndexableField.java (revision 1591787)
+++ lucene/core/src/java/org/apache/lucene/index/IndexableField.java (working copy)
@@ -42,11 +42,17 @@
* implementations should use the given Analyzer to create the TokenStreams.
*
* @param analyzer Analyzer that should be used to create the TokenStreams from
+ * @param reuse TokenStream for a previous instance of this field <b>name</b>. This allows
+ * custom field types (like StringField and NumericField) that do not use
+ * the analyzer to still have good performance. Note: the passed-in type
+ * may be inappropriate, for example if you mix up different types of Fields
+ *               for the same field name. So it's the responsibility of the implementation to
+ * check.
* @return TokenStream value for indexing the document. Should always return
* a non-null value if the field is to be indexed
* @throws IOException Can be thrown while creating the TokenStream
*/
- public TokenStream tokenStream(Analyzer analyzer) throws IOException;
+ public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) throws IOException;
/**
* Returns the field's index-time boost.
Index: lucene/core/src/test/org/apache/lucene/index/TestFieldReuse.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/index/TestFieldReuse.java (revision 0)
+++ lucene/core/src/test/org/apache/lucene/index/TestFieldReuse.java (working copy)
@@ -0,0 +1,178 @@
+package org.apache.lucene.index;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collections;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CannedTokenStream;
+import org.apache.lucene.analysis.NumericTokenStream;
+import org.apache.lucene.analysis.NumericTokenStream.NumericTermAttribute;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.IntField;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.NumericUtils;
+
+/** test tokenstream reuse by DefaultIndexingChain */
+public class TestFieldReuse extends BaseTokenStreamTestCase {
+
+ public void testStringField() throws IOException {
+ StringField stringField = new StringField("foo", "bar", Field.Store.NO);
+
+ // passing null
+ TokenStream ts = stringField.tokenStream(null, null);
+ assertTokenStreamContents(ts,
+ new String[] { "bar" },
+ new int[] { 0 },
+ new int[] { 3 }
+ );
+
+ // now reuse previous stream
+ stringField = new StringField("foo", "baz", Field.Store.NO);
+ TokenStream ts2 = stringField.tokenStream(null, ts);
+    assertSame(ts, ts2);
+ assertTokenStreamContents(ts,
+ new String[] { "baz" },
+ new int[] { 0 },
+ new int[] { 3 }
+ );
+
+ // pass a bogus stream and ensure its still ok
+ stringField = new StringField("foo", "beer", Field.Store.NO);
+ TokenStream bogus = new NumericTokenStream();
+ ts = stringField.tokenStream(null, bogus);
+ assertNotSame(ts, bogus);
+ assertTokenStreamContents(ts,
+ new String[] { "beer" },
+ new int[] { 0 },
+ new int[] { 4 }
+ );
+ }
+
+ public void testNumericReuse() throws IOException {
+ IntField intField = new IntField("foo", 5, Field.Store.NO);
+
+ // passing null
+ TokenStream ts = intField.tokenStream(null, null);
+ assertTrue(ts instanceof NumericTokenStream);
+ assertEquals(NumericUtils.PRECISION_STEP_DEFAULT, ((NumericTokenStream)ts).getPrecisionStep());
+ assertNumericContents(5, ts);
+
+ // now reuse previous stream
+ intField = new IntField("foo", 20, Field.Store.NO);
+ TokenStream ts2 = intField.tokenStream(null, ts);
+ assertSame(ts, ts2);
+ assertNumericContents(20, ts);
+
+ // pass a bogus stream and ensure its still ok
+ intField = new IntField("foo", 2343, Field.Store.NO);
+ TokenStream bogus = new CannedTokenStream(new Token("bogus", 0, 5));
+ ts = intField.tokenStream(null, bogus);
+ assertNotSame(bogus, ts);
+ assertNumericContents(2343, ts);
+
+ // pass another bogus stream (numeric, but different precision step!)
+ intField = new IntField("foo", 42, Field.Store.NO);
+ assert 3 != NumericUtils.PRECISION_STEP_DEFAULT;
+ bogus = new NumericTokenStream(3);
+ ts = intField.tokenStream(null, bogus);
+ assertNotSame(bogus, ts);
+ assertNumericContents(42, ts);
+ }
+
+ static class MyField implements IndexableField {
+ TokenStream lastSeen;
+ TokenStream lastReturned;
+
+ @Override
+ public String name() {
+ return "foo";
+ }
+
+ @Override
+ public IndexableFieldType fieldType() {
+ return StringField.TYPE_NOT_STORED;
+ }
+
+ @Override
+ public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) throws IOException {
+ lastSeen = reuse;
+ return lastReturned = new CannedTokenStream(new Token("unimportant", 0, 10));
+ }
+
+ @Override
+ public float boost() {
+ return 1;
+ }
+ }
+
+ public void testIndexWriterActuallyReuses() throws IOException {
+ Directory dir = newDirectory();
+ IndexWriterConfig iwc = new IndexWriterConfig(TEST_VERSION_CURRENT, null);
+ IndexWriter iw = new IndexWriter(dir, iwc);
+ final MyField field1 = new MyField();
+ iw.addDocument(new IndexDocument() {
+ @Override
+ public Iterable<? extends IndexableField> indexableFields() {
+ return Collections.singletonList(field1);
+ }
+ @Override
+ public Iterable<StorableField> storableFields() {
+ return Collections.emptyList();
+ }
+ });
+ TokenStream previous = field1.lastReturned;
+ assertNotNull(previous);
+
+ final MyField field2 = new MyField();
+ iw.addDocument(new IndexDocument() {
+ @Override
+ public Iterable<? extends IndexableField> indexableFields() {
+ return Collections.singletonList(field2);
+ }
+ @Override
+ public Iterable<StorableField> storableFields() {
+ return Collections.emptyList();
+ }
+ });
+ assertSame(previous, field2.lastSeen);
+ iw.shutdown();
+ dir.close();
+ }
+
+ private void assertNumericContents(int value, TokenStream ts) throws IOException {
+ assertTrue(ts instanceof NumericTokenStream);
+ NumericTermAttribute numericAtt = ts.getAttribute(NumericTermAttribute.class);
+ ts.reset();
+ boolean seen = false;
+ while (ts.incrementToken()) {
+ if (numericAtt.getShift() == 0) {
+ assertEquals(value, numericAtt.getRawValue());
+ seen = true;
+ }
+ }
+ ts.end();
+ ts.close();
+ assertTrue(seen);
+ }
+}
Property changes on: lucene/core/src/test/org/apache/lucene/index/TestFieldReuse.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: lucene/core/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java (revision 1591787)
+++ lucene/core/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java (working copy)
@@ -1602,7 +1602,7 @@
}
@Override
- public TokenStream tokenStream(Analyzer analyzer) throws IOException {
+ public TokenStream tokenStream(Analyzer analyzer, TokenStream previous) throws IOException {
return null;
}
});
Index: lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java (revision 1591787)
+++ lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java (working copy)
@@ -154,7 +154,7 @@
}
@Override
- public TokenStream tokenStream(Analyzer analyzer) throws IOException {
+ public TokenStream tokenStream(Analyzer analyzer, TokenStream previous) throws IOException {
return readerValue() != null ? analyzer.tokenStream(name(), readerValue()) :
analyzer.tokenStream(name(), new StringReader(stringValue()));
}