Index: lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (revision 1645984)
+++ lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (revision )
@@ -17,6 +17,8 @@
* limitations under the License.
*/
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
@@ -28,10 +30,17 @@
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
-import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CachingTokenFilter;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockPayloadAnalyzer;
+import org.apache.lucene.analysis.MockTokenFilter;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@@ -44,20 +53,43 @@
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.CommonTermsQuery;
-import org.apache.lucene.search.*;
import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.ConstantScoreQuery;
+import org.apache.lucene.search.FilteredQuery;
+import org.apache.lucene.search.FuzzyQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MultiPhraseQuery;
+import org.apache.lucene.search.MultiTermQuery;
+import org.apache.lucene.search.NumericRangeQuery;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.PrefixQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.QueryWrapperFilter;
+import org.apache.lucene.search.RegexpQuery;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TermRangeFilter;
+import org.apache.lucene.search.TermRangeQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.highlight.SynonymTokenizer.TestHighlightRunner;
import org.apache.lucene.search.join.BitDocIdSetCachingWrapperFilter;
import org.apache.lucene.search.join.BitDocIdSetFilter;
import org.apache.lucene.search.join.ScoreMode;
import org.apache.lucene.search.join.ToChildBlockJoinQuery;
import org.apache.lucene.search.join.ToParentBlockJoinQuery;
-import org.apache.lucene.search.spans.*;
+import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanNotQuery;
+import org.apache.lucene.search.spans.SpanOrQuery;
+import org.apache.lucene.search.spans.SpanPayloadCheckQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
@@ -1891,7 +1923,7 @@
reader.close();
}
- /** If we have term vectors, we can highlight based on payloads */
+ /** We can highlight based on payloads. Since Lucene 5 this is supported both via term vectors and via MemoryIndex. */
public void testPayloadQuery() throws IOException, InvalidTokenOffsetsException {
final String text = "random words and words";//"words" at positions 1 & 4
@@ -1900,7 +1932,7 @@
writer.deleteAll();
Document doc = new Document();
- doc.add(new Field(FIELD_NAME, text, FIELD_TYPE_TV));
+ doc.add(new Field(FIELD_NAME, text, fieldType));
writer.addDocument(doc);
writer.commit();
}
@@ -1908,12 +1940,17 @@
Query query = new SpanPayloadCheckQuery(new SpanTermQuery(new Term(FIELD_NAME, "words")),
Collections.singleton("pos: 1".getBytes("UTF-8")));//just match the first "word" occurrence
IndexSearcher searcher = newSearcher(reader);
- Scorer scorer = new QueryScorer(query, searcher.getIndexReader(), FIELD_NAME);
+ QueryScorer scorer = new QueryScorer(query, searcher.getIndexReader(), FIELD_NAME);
+ scorer.setUsePayloads(true);
Highlighter h = new Highlighter(scorer);
TopDocs hits = searcher.search(query, null, 10);
assertEquals(1, hits.scoreDocs.length);
TokenStream stream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), 0, FIELD_NAME, analyzer);
+ if (random().nextBoolean()) {
+ stream.reset();
+ stream = new CachingTokenFilter(stream);//conceals detection of TokenStreamFromTermVector
+ }
String result = h.getBestFragment(stream, text);
assertEquals("random <B>words</B> and words", result);//only highlight first "word"
}
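
For reference, turning on payload-sensitive highlighting with the API exercised above would look roughly like this sketch (reader, docId, analyzer and text are assumed to already be in scope; the field name "contents" is illustrative):

Query query = new SpanPayloadCheckQuery(new SpanTermQuery(new Term("contents", "words")),
    Collections.singleton("pos: 1".getBytes(StandardCharsets.UTF_8)));
QueryScorer scorer = new QueryScorer(query, reader, "contents");
scorer.setUsePayloads(true); // index payloads into the highlighter's internal MemoryIndex
Highlighter highlighter = new Highlighter(scorer);
TokenStream stream = TokenSources.getAnyTokenStream(reader, docId, "contents", analyzer);
String fragment = highlighter.getBestFragment(stream, text); // only payload-matching occurrences are wrapped in <B>
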
Index: lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndexAgainstRAMDir.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndexAgainstRAMDir.java (revision 1645984)
+++ lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndexAgainstRAMDir.java (revision )
@@ -68,8 +68,8 @@
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
-import org.apache.lucene.util.ByteBlockPool.Allocator;
import org.apache.lucene.util.ByteBlockPool;
+import org.apache.lucene.util.ByteBlockPool.Allocator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LineFileDocs;
@@ -116,7 +116,7 @@
* runs random tests, up to ITERATIONS times.
*/
public void testRandomQueries() throws Exception {
- MemoryIndex index = new MemoryIndex(random().nextBoolean(), random().nextInt(50) * 1024 * 1024);
+ MemoryIndex index = randomMemoryIndex();
for (int i = 0; i < ITERATIONS; i++) {
assertAgainstRAMDirectory(index);
}
@@ -148,7 +148,8 @@
Directory ramdir = new RAMDirectory();
Analyzer analyzer = randomAnalyzer();
IndexWriter writer = new IndexWriter(ramdir,
- new IndexWriterConfig(analyzer).setCodec(TestUtil.alwaysPostingsFormat(TestUtil.getDefaultPostingsFormat())));
+ new IndexWriterConfig(analyzer).setCodec(
+ TestUtil.alwaysPostingsFormat(TestUtil.getDefaultPostingsFormat())));
Document doc = new Document();
Field field1 = newTextField("foo", fooField.toString(), Field.Store.NO);
Field field2 = newTextField("term", termField.toString(), Field.Store.NO);
@@ -209,7 +210,11 @@
assertEquals(iwDocsAndPos.startOffset(), memDocsAndPos.startOffset());
assertEquals(iwDocsAndPos.endOffset(), memDocsAndPos.endOffset());
}
+
+ if (iwTerms.hasPayloads()) {
+ assertEquals(iwDocsAndPos.getPayload(), memDocsAndPos.getPayload());
- }
+ }
+ }
}
@@ -311,7 +316,7 @@
public void testDocsEnumStart() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
- MemoryIndex memory = new MemoryIndex(random().nextBoolean(), random().nextInt(50) * 1024 * 1024);
+ MemoryIndex memory = new MemoryIndex(random().nextBoolean(), false, random().nextInt(50) * 1024 * 1024);
memory.addField("foo", "bar", analyzer);
LeafReader reader = (LeafReader) memory.createSearcher().getIndexReader();
DocsEnum disi = TestUtil.docs(random(), reader, "foo", new BytesRef("bar"), null, null, DocsEnum.FLAG_NONE);
@@ -336,11 +341,15 @@
return new ByteBlockPool.DirectAllocator();
}
}
-
+
+ private MemoryIndex randomMemoryIndex() {
+ return new MemoryIndex(random().nextBoolean(), random().nextBoolean(), random().nextInt(50) * 1024 * 1024);
+ }
+
public void testDocsAndPositionsEnumStart() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
int numIters = atLeast(3);
- MemoryIndex memory = new MemoryIndex(true, random().nextInt(50) * 1024 * 1024);
+ MemoryIndex memory = new MemoryIndex(true, false, random().nextInt(50) * 1024 * 1024);
for (int i = 0; i < numIters; i++) { // check reuse
memory.addField("foo", "bar", analyzer);
LeafReader reader = (LeafReader) memory.createSearcher().getIndexReader();
@@ -370,7 +379,7 @@
RegexpQuery regex = new RegexpQuery(new Term("field", "worl."));
SpanQuery wrappedquery = new SpanMultiTermQueryWrapper<>(regex);
- MemoryIndex mindex = new MemoryIndex(random().nextBoolean(), random().nextInt(50) * 1024 * 1024);
+ MemoryIndex mindex = randomMemoryIndex();
mindex.addField("field", new MockAnalyzer(random()).tokenStream("field", "hello there"));
// This throws an NPE
@@ -382,7 +391,7 @@
RegexpQuery regex = new RegexpQuery(new Term("field", "worl."));
SpanQuery wrappedquery = new SpanOrQuery(new SpanMultiTermQueryWrapper<>(regex));
- MemoryIndex mindex = new MemoryIndex(random().nextBoolean(), random().nextInt(50) * 1024 * 1024);
+ MemoryIndex mindex = randomMemoryIndex();
mindex.addField("field", new MockAnalyzer(random()).tokenStream("field", "hello there"));
// This passes though
@@ -390,7 +399,7 @@
}
public void testSameFieldAddedMultipleTimes() throws IOException {
- MemoryIndex mindex = new MemoryIndex(random().nextBoolean(), random().nextInt(50) * 1024 * 1024);
+ MemoryIndex mindex = randomMemoryIndex();
MockAnalyzer mockAnalyzer = new MockAnalyzer(random());
mindex.addField("field", "the quick brown fox", mockAnalyzer);
mindex.addField("field", "jumps over the", mockAnalyzer);
@@ -409,8 +418,8 @@
assertTrue("posGap" + mockAnalyzer.getPositionIncrementGap("field") , mindex.search(query) > 0.0001);
}
- public void testNonExistingsField() throws IOException {
- MemoryIndex mindex = new MemoryIndex(random().nextBoolean(), random().nextInt(50) * 1024 * 1024);
+ public void testNonExistentField() throws IOException {
+ MemoryIndex mindex = randomMemoryIndex();
MockAnalyzer mockAnalyzer = new MockAnalyzer(random());
mindex.addField("field", "the quick brown fox", mockAnalyzer);
LeafReader reader = (LeafReader) mindex.createSearcher().getIndexReader();
@@ -420,11 +429,11 @@
assertNull(reader.termPositionsEnum(new Term("not-in-index", "foo")));
assertNull(reader.terms("not-in-index"));
}
-
+
public void testDuellMemIndex() throws IOException {
LineFileDocs lineFileDocs = new LineFileDocs(random());
int numDocs = atLeast(10);
- MemoryIndex memory = new MemoryIndex(random().nextBoolean(), random().nextInt(50) * 1024 * 1024);
+ MemoryIndex memory = randomMemoryIndex();
for (int i = 0; i < numDocs; i++) {
Directory dir = newDirectory();
MockAnalyzer mockAnalyzer = new MockAnalyzer(random());
@@ -535,7 +544,7 @@
assertThat("Position test failed" + failDesc, memPos, equalTo(pos));
assertThat("Start offset test failed" + failDesc, memDocsPosEnum.startOffset(), equalTo(docsPosEnum.startOffset()));
assertThat("End offset test failed" + failDesc, memDocsPosEnum.endOffset(), equalTo(docsPosEnum.endOffset()));
- assertThat("Missing payload test failed" + failDesc, docsPosEnum.getPayload(), equalTo(null));
+ assertThat("Missing payload test failed" + failDesc, docsPosEnum.getPayload(), equalTo(docsPosEnum.getPayload()));
}
}
assertNull("Still some tokens not processed", memTermEnum.next());
Index: lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java (revision 1645984)
+++ lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java (revision )
@@ -21,18 +21,24 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PackedTokenAttributeImpl;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefArray;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.Counter;
import org.apache.lucene.util.UnicodeUtil;
/**
* TokenStream created from a term vector field. The term vector requires positions and/or offsets (either). If you
* want payloads add PayloadAttributeImpl (as you would normally) but don't assume the attribute is already added just
- * because you know the term vector has payloads. This TokenStream supports an efficient {@link #reset()}, so there's
+ * because you know the term vector has payloads, since the first call to incrementToken() checks whether you asked
+ * for them and, if you didn't, won't fetch them. This TokenStream supports an efficient {@link #reset()}, so there's
* no need to wrap with a caching impl.
* <p />
* The implementation will create an array of tokens indexed by token position. As long as there aren't massive jumps
@@ -47,6 +53,11 @@
//TODO add a maxStartOffset filter, which highlighters will find handy
+ //This attribute factory uses less memory when captureState() is called.
+ public static final AttributeFactory ATTRIBUTE_FACTORY =
+ AttributeFactory.getStaticImplementation(
+ AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, PackedTokenAttributeImpl.class);
+
private final Terms vector;
private final CharTermAttribute termAttribute;
@@ -56,11 +67,15 @@
private OffsetAttribute offsetAttribute;//maybe null
private PayloadAttribute payloadAttribute;//maybe null
+ private BytesRefArray payloadsBytesRefArray;//only used when payloadAttribute is non-null
+ private BytesRefBuilder spareBytesRefBuilder;//only used when payloadAttribute is non-null
private TokenLL firstToken = null; // the head of a linked-list
private TokenLL incrementToken = null;
+ private boolean initialized = false;//lazy
+
/**
* Constructor.
*
@@ -68,6 +83,8 @@
* creating the TokenStream. Must have positions and/or offsets.
*/
public TokenStreamFromTermVector(Terms vector) throws IOException {
+ super(ATTRIBUTE_FACTORY);
+ assert !hasAttribute(PayloadAttribute.class) : "AttributeFactory shouldn't have payloads *yet*";
if (!vector.hasPositions() && !vector.hasOffsets()) {
throw new IllegalArgumentException("The term vector needs positions and/or offsets.");
}
@@ -81,20 +98,20 @@
@Override
public void reset() throws IOException {
- if (firstToken == null) {//just the first time
- init();
- }
incrementToken = null;
super.reset();
}
- //We initialize in reset() because we can see which attributes the consumer wants, particularly payloads
+ //We delay initialization so we can see which attributes the consumer wants, particularly payloads
private void init() throws IOException {
+ assert !initialized;
if (vector.hasOffsets()) {
offsetAttribute = addAttribute(OffsetAttribute.class);
}
if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
payloadAttribute = getAttribute(PayloadAttribute.class);
+ payloadsBytesRefArray = new BytesRefArray(Counter.newCounter());
+ spareBytesRefBuilder = new BytesRefBuilder();
}
// Step 1: iterate termsEnum and create a token, placing into an array of tokens by position
@@ -132,14 +149,9 @@
}
if (payloadAttribute != null) {
- // Must make a deep copy of the returned payload,
- // since D&PEnum API is allowed to re-use on every
- // call:
final BytesRef payload = dpEnum.getPayload();
- if (payload != null) {
- token.payload = BytesRef.deepCopyOf(payload);//TODO share a ByteBlockPool & re-use BytesRef
+ token.payloadIndex = payload == null ? -1 : payloadsBytesRefArray.append(payload);
- }
+ }
- }
//Add token to an array indexed by position
if (positionedTokens.length <= pos) {
@@ -198,6 +210,8 @@
prevTokenPos = pos;
prevToken = token;
}
+
+ initialized = true;
}
private TokenLL[] initTokensArray() throws IOException {
@@ -216,8 +230,12 @@
}
@Override
- public boolean incrementToken() {
+ public boolean incrementToken() throws IOException {
if (incrementToken == null) {
+ if (!initialized) {
+ init();
+ assert initialized;
+ }
incrementToken = firstToken;
if (incrementToken == null) {
return false;
@@ -234,8 +252,12 @@
offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.endOffset);
}
if (payloadAttribute != null) {
- payloadAttribute.setPayload(incrementToken.payload);
+ if (incrementToken.payloadIndex == -1) {
+ payloadAttribute.setPayload(null);
+ } else {
+ payloadAttribute.setPayload(payloadsBytesRefArray.get(spareBytesRefBuilder, incrementToken.payloadIndex));
- }
+ }
+ }
return true;
}
@@ -245,7 +267,7 @@
int positionIncrement;
int startOffset;
int endOffset;
- BytesRef payload;
+ int payloadIndex;
TokenLL next;
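
As the updated class javadoc notes, a consumer that wants payloads must add the attribute before the first incrementToken(); a minimal sketch, assuming vectorTerms is a term vector Terms instance with positions and/or offsets:

TokenStream ts = new TokenStreamFromTermVector(vectorTerms);
PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class); // ask before the first incrementToken()
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
  BytesRef payload = payloadAtt.getPayload(); // null at positions that had no payload
  // ... consume termAtt and payload ...
}
ts.end();
ts.close();
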
Index: lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (revision 1645984)
+++ lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (revision )
@@ -83,9 +83,9 @@
private boolean cachedTokenStream;
private boolean wrapToCaching = true;
private int maxDocCharsToAnalyze;
+ private boolean usePayloads = false;
private LeafReader internalReader = null;
-
public WeightedSpanTermExtractor() {
}
@@ -384,7 +384,7 @@
// Use MemoryIndex (index/invert this tokenStream now)
if (internalReader == null) {
- final MemoryIndex indexer = new MemoryIndex(true);
+ final MemoryIndex indexer = new MemoryIndex(true, usePayloads);//offsets and payloads
if (cacheIt) {
assert !cachedTokenStream;
tokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
@@ -652,8 +652,16 @@
public void setExpandMultiTermQuery(boolean expandMultiTermQuery) {
this.expandMultiTermQuery = expandMultiTermQuery;
+ }
+
+ public boolean isUsePayloads() {
+ return usePayloads;
+ }
+
+ public void setUsePayloads(boolean usePayloads) {
+ this.usePayloads = usePayloads;
}
-
+
public boolean isCachedTokenStream() {
return cachedTokenStream;
}
Index: lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java (revision 1645984)
+++ lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java (revision )
@@ -54,6 +54,7 @@
private boolean skipInitExtractor;
private boolean wrapToCaching = true;
private int maxCharsToAnalyze;
+ private boolean usePayloads = false;
/**
* @param query Query to use for highlighting
@@ -213,6 +214,7 @@
qse.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
qse.setExpandMultiTermQuery(expandMultiTermQuery);
qse.setWrapIfNotCachingTokenFilter(wrapToCaching);
+ qse.setUsePayloads(usePayloads);
if (reader == null) {
this.fieldWeightedSpanTerms = qse.getWeightedSpanTerms(query,
tokenStream, field);
@@ -258,8 +260,20 @@
*/
public void setExpandMultiTermQuery(boolean expandMultiTermQuery) {
this.expandMultiTermQuery = expandMultiTermQuery;
+ }
+
+ /**
+ * Whether or not we should capture payloads in {@link MemoryIndex} at each position so that queries can access them.
+ * This does not apply to term vector based TokenStreams, which support payloads only when the term vector has them.
+ */
+ public boolean isUsePayloads() {
+ return usePayloads;
+ }
+
+ public void setUsePayloads(boolean usePayloads) {
+ this.usePayloads = usePayloads;
}
-
+
/**
* By default, {@link TokenStream}s that are not of the type
* {@link CachingTokenFilter} are wrapped in a {@link CachingTokenFilter} to
Index: lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (revision 1645984)
+++ lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (revision )
@@ -29,6 +29,7 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.BinaryDocValues;
@@ -60,6 +61,8 @@
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefArray;
+import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
import org.apache.lucene.util.Counter;
@@ -187,17 +190,19 @@
*/
public class MemoryIndex {
+ private static final boolean DEBUG = false;
+
/** info for each field: Map&lt;String fieldName, Info field&gt; */
private final SortedMap<String,Info> fields = new TreeMap<>();
private final boolean storeOffsets;
+ private final boolean storePayloads;
-
+
- private static final boolean DEBUG = false;
-
private final ByteBlockPool byteBlockPool;
private final IntBlockPool intBlockPool;
// private final IntBlockPool.SliceReader postingsReader;
private final IntBlockPool.SliceWriter postingsWriter;
+ private final BytesRefArray payloadsBytesRefs;//non null only when storePayloads
private Counter bytesUsed;
@@ -206,7 +211,7 @@
private Similarity normSimilarity = IndexSearcher.getDefaultSimilarity();
/**
- * Constructs an empty instance.
+ * Constructs an empty instance that will not store offsets or payloads.
*/
public MemoryIndex() {
this(false);
@@ -215,25 +220,36 @@
/**
* Constructs an empty instance that can optionally store the start and end
* character offset of each token term in the text. This can be useful for
- * highlighting of hit locations with the Lucene highlighter package.
- * Protected until the highlighter package matures, so that this can actually
- * be meaningfully integrated.
+ * highlighting of hit locations with the Lucene highlighter package. But
+ * it will not store payloads; use another constructor for that.
*
* @param storeOffsets
* whether or not to store the start and end character offset of
* each token term in the text
*/
public MemoryIndex(boolean storeOffsets) {
- this(storeOffsets, 0);
+ this(storeOffsets, false);
}
-
+
/**
+ * Constructs an empty instance with the option of storing offsets and payloads.
+ *
+ * @param storeOffsets store term offsets at each position
+ * @param storePayloads store term payloads at each position
+ */
+ public MemoryIndex(boolean storeOffsets, boolean storePayloads) {
+ this(storeOffsets, storePayloads, 0);
+ }
+
+ /**
* Expert: This constructor accepts an upper limit for the number of bytes that should be reused if this instance is {@link #reset()}.
* @param storeOffsets <code>true</code> if offsets should be stored
+ * @param storePayloads <code>true</code> if payloads should be stored
* @param maxReusedBytes the number of bytes that should remain in the internal memory pools after {@link #reset()} is called
*/
- MemoryIndex(boolean storeOffsets, long maxReusedBytes) {
+ MemoryIndex(boolean storeOffsets, boolean storePayloads, long maxReusedBytes) {
this.storeOffsets = storeOffsets;
+ this.storePayloads = storePayloads;
this.bytesUsed = Counter.newCounter();
final int maxBufferedByteBlocks = (int)((maxReusedBytes/2) / ByteBlockPool.BYTE_BLOCK_SIZE );
final int maxBufferedIntBlocks = (int) ((maxReusedBytes - (maxBufferedByteBlocks*ByteBlockPool.BYTE_BLOCK_SIZE))/(IntBlockPool.INT_BLOCK_SIZE * RamUsageEstimator.NUM_BYTES_INT));
@@ -241,6 +257,7 @@
byteBlockPool = new ByteBlockPool(new RecyclingByteBlockAllocator(ByteBlockPool.BYTE_BLOCK_SIZE, maxBufferedByteBlocks, bytesUsed));
intBlockPool = new IntBlockPool(new RecyclingIntBlockAllocator(IntBlockPool.INT_BLOCK_SIZE, maxBufferedIntBlocks, bytesUsed));
postingsWriter = new SliceWriter(intBlockPool);
+ payloadsBytesRefs = storePayloads ? new BytesRefArray(bytesUsed) : null;
}
/**
@@ -381,8 +398,8 @@
*
* @param fieldName
* a name to be associated with the text
- * @param stream
- * the token stream to retrieve tokens from.
+ * @param tokenStream
+ * the token stream to retrieve tokens from. It's guaranteed to be closed no matter what.
* @param boost
* the boost factor for hits for this field
* @param positionIncrementGap
@@ -391,16 +408,17 @@
* the offset gap if fields with the same name are added more than once
* @see org.apache.lucene.document.Field#setBoost(float)
*/
- public void addField(String fieldName, TokenStream stream, float boost, int positionIncrementGap, int offsetGap) {
- try {
+ public void addField(String fieldName, TokenStream tokenStream, float boost, int positionIncrementGap,
+ int offsetGap) {
+ try (TokenStream stream = tokenStream) {
if (frozen)
throw new IllegalArgumentException("Cannot call addField() when MemoryIndex is frozen");
if (fieldName == null)
throw new IllegalArgumentException("fieldName must not be null");
if (stream == null)
- throw new IllegalArgumentException("token stream must not be null");
+ throw new IllegalArgumentException("token stream must not be null");
if (boost <= 0.0f)
- throw new IllegalArgumentException("boost factor must be greater than 0.0");
+ throw new IllegalArgumentException("boost factor must be greater than 0.0");
int numTokens = 0;
int numOverlapTokens = 0;
int pos = -1;
@@ -421,8 +439,9 @@
sliceArray = info.sliceArray;
sumTotalTermFreq = info.sumTotalTermFreq;
} else {
- fieldInfo = new FieldInfo(fieldName, fields.size(), false, false, false,
- this.storeOffsets ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS,
+ fieldInfo = new FieldInfo(fieldName, fields.size(), false, false, this.storePayloads,
+ this.storeOffsets
+ ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS,
DocValuesType.NONE, -1, null);
sliceArray = new SliceByteStartArray(BytesRefHash.DEFAULT_CAPACITY);
terms = new BytesRefHash(byteBlockPool, BytesRefHash.DEFAULT_CAPACITY, sliceArray);
@@ -431,6 +450,7 @@
TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
PositionIncrementAttribute posIncrAttribute = stream.addAttribute(PositionIncrementAttribute.class);
OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
+ PayloadAttribute payloadAtt = storePayloads ? stream.addAttribute(PayloadAttribute.class) : null;
BytesRef ref = termAtt.getBytesRef();
stream.reset();
@@ -451,13 +471,16 @@
}
sliceArray.freq[ord]++;
sumTotalTermFreq++;
- if (!storeOffsets) {
- postingsWriter.writeInt(pos);
+ postingsWriter.writeInt(pos);
- } else {
- postingsWriter.writeInt(pos);
+ if (storeOffsets) {
postingsWriter.writeInt(offsetAtt.startOffset() + offset);
postingsWriter.writeInt(offsetAtt.endOffset() + offset);
}
+ if (storePayloads) {
+ final BytesRef payload = payloadAtt.getPayload();
+ int pIndex = payload == null ? -1 : payloadsBytesRefs.append(payload);
+ postingsWriter.writeInt(pIndex);
+ }
sliceArray.end[ord] = postingsWriter.getCurrentOffset();
}
stream.end();
@@ -466,18 +489,10 @@
if (numTokens > 0) {
fields.put(fieldName, new Info(fieldInfo, terms, sliceArray, numTokens, numOverlapTokens, boost, pos, offsetAtt.endOffset() + offset, sumTotalTermFreq));
}
- } catch (Exception e) { // can never happen
+ } catch (IOException e) {
throw new RuntimeException(e);
- } finally {
- try {
- if (stream != null) {
- stream.close();
- }
+ }
- } catch (IOException e2) {
- throw new RuntimeException(e2);
- }
+ }
- }
- }
/**
* Set the Similarity to be used for calculating field norms
@@ -861,7 +876,7 @@
@Override
public boolean hasPayloads() {
- return false;
+ return storePayloads;
}
};
}
@@ -1043,17 +1058,20 @@
}
private class MemoryDocsAndPositionsEnum extends DocsAndPositionsEnum {
+ private final SliceReader sliceReader;
private int posUpto; // for assert
private boolean hasNext;
private Bits liveDocs;
private int doc = -1;
- private SliceReader sliceReader;
private int freq;
private int startOffset;
private int endOffset;
+ private int payloadIndex;
+ private final BytesRefBuilder payloadBuilder;//only non-null when storePayloads
-
+
public MemoryDocsAndPositionsEnum() {
this.sliceReader = new SliceReader(intBlockPool);
+ this.payloadBuilder = storePayloads ? new BytesRefBuilder() : null;
}
public DocsAndPositionsEnum reset(Bits liveDocs, int start, int end, int freq) {
@@ -1096,15 +1114,16 @@
public int nextPosition() {
assert posUpto++ < freq;
assert !sliceReader.endOfSlice() : " stores offsets : " + startOffset;
- if (storeOffsets) {
- int pos = sliceReader.readInt();
+ int pos = sliceReader.readInt();
+ if (storeOffsets) {
startOffset = sliceReader.readInt();
endOffset = sliceReader.readInt();
- return pos;
- } else {
- return sliceReader.readInt();
}
+ if (storePayloads) {
+ payloadIndex = sliceReader.readInt();
- }
+ }
+ return pos;
+ }
@Override
public int startOffset() {
@@ -1118,8 +1137,11 @@
@Override
public BytesRef getPayload() {
+ if (payloadBuilder == null || payloadIndex == -1) {
- return null;
- }
+ return null;
+ }
+ return payloadsBytesRefs.get(payloadBuilder, payloadIndex);
+ }
@Override
public long cost() {
@@ -1178,6 +1200,9 @@
this.normSimilarity = IndexSearcher.getDefaultSimilarity();
byteBlockPool.reset(false, false); // no need to 0-fill the buffers
intBlockPool.reset(true, false); // here must 0-fill since we use slices
+ if (payloadsBytesRefs != null) {
+ payloadsBytesRefs.clear();
+ }
this.frozen = false;
}
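
Payloads stored by MemoryIndex this way come back through the normal postings API; a rough sketch, reusing the mi instance from the earlier sketch:

LeafReader reader = (LeafReader) mi.createSearcher().getIndexReader();
DocsAndPositionsEnum dpe = reader.termPositionsEnum(new Term("field", "fox"));
if (dpe != null && dpe.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
  for (int i = 0; i < dpe.freq(); i++) {
    dpe.nextPosition();
    BytesRef payload = dpe.getPayload(); // non-null only when storePayloads was true and a payload was indexed
  }
}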