Index: lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java (revision 1669491)
+++ lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java (revision )
@@ -254,7 +254,7 @@
new QueryScorer(phraseQuery));
final TokenStream tokenStream = TokenSources
.getTokenStream(
- indexReader.getTermVector(0, FIELD));
+ indexReader.getTermVector(0, FIELD), -1);
assertEquals("<B>the fox</B> did not jump",
highlighter.getBestFragment(tokenStream, TEXT));
} finally {
@@ -283,7 +283,7 @@
try {
assertEquals(1, indexReader.numDocs());
TokenSources.getTokenStream(
- indexReader.getTermVector(0, FIELD));
+ indexReader.getTermVector(0, FIELD), -1);
fail("TokenSources.getTokenStream should throw IllegalArgumentException if term vector has no offsets");
}
catch (IllegalArgumentException e) {
@@ -333,7 +333,7 @@
writer.close();
assertEquals(1, reader.numDocs());
- TokenStream ts = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"));
+ TokenStream ts = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"), -1);
CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
@@ -409,7 +409,7 @@
writer.close();
assertEquals(1, reader.numDocs());
- TokenStream vectorTokenStream = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"));
+ TokenStream vectorTokenStream = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"), -1);
//sometimes check payloads
PayloadAttribute payloadAttribute = null;
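(Illustration, not part of the patch: the tests above pass -1, which keeps the old unlimited behavior. With a positive limit, any term-vector token whose start offset exceeds the limit is dropped. FIELD and indexReader mirror the test fixtures above; the cutoff of 10 is arbitrary.)

    // hypothetical usage: only tokens starting at offset <= 10 are reconstructed
    TokenStream limited = TokenSources.getTokenStream(
        indexReader.getTermVector(0, FIELD), 10);
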
Index: lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (revision 1669491)
+++ lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (revision )
@@ -31,10 +31,19 @@
/**
* Hides implementation issues associated with obtaining a TokenStream for use
- * with the higlighter - can obtain from TermFreqVectors with offsets and
- * (optionally) positions or from Analyzer class reparsing the stored content.
+ * with the highlighter - can obtain from TermFreqVectors with offsets and
+ * (optionally) positions or from Analyzer class re-parsing the stored content.
*/
public class TokenSources {
+
+ /** See {@link #getAnyTokenStream(org.apache.lucene.index.IndexReader, int, String, org.apache.lucene.index.StoredDocument, org.apache.lucene.analysis.Analyzer, int)}. */
+ @Deprecated
+ public static TokenStream getAnyTokenStream(IndexReader reader, int docId,
+ String field, StoredDocument document, Analyzer analyzer)
+ throws IOException {
+ return getAnyTokenStream(reader, docId, field, document, analyzer, -1);
+ }
+
/**
* A convenience method that tries to first get a {@link TokenStreamFromTermVector} for the
* specified docId, then, falls back to using the passed in
@@ -54,16 +63,15 @@
* {@link org.apache.lucene.document.Document}
* @throws IOException if there was an error loading
*/
-
- public static TokenStream getAnyTokenStream(IndexReader reader, int docId,
- String field, StoredDocument document, Analyzer analyzer) throws IOException {
+ public static TokenStream getAnyTokenStream(IndexReader reader, int docId, String field, StoredDocument document,
+ Analyzer analyzer, int maxStartOffset) throws IOException {
TokenStream ts = null;
Fields vectors = reader.getTermVectors(docId);
if (vectors != null) {
Terms vector = vectors.terms(field);
if (vector != null) {
- ts = getTokenStream(vector);
+ ts = getTokenStream(vector, maxStartOffset);
}
}
@@ -74,24 +82,31 @@
return ts;
}
+ /** See {@link #getAnyTokenStream(org.apache.lucene.index.IndexReader, int, String, org.apache.lucene.analysis.Analyzer, int)}. */
+ @Deprecated
+ public static TokenStream getAnyTokenStream(IndexReader reader, int docId,
+ String field, Analyzer analyzer) throws IOException {
+ return getAnyTokenStream(reader, docId, field, analyzer, -1);
+ }
+
/**
* A convenience method that tries a number of approaches to getting a token
* stream. The cost of finding there are no termVectors in the index is
* minimal (1000 invocations still registers 0 ms). So this "lazy" (flexible?)
- * approach to coding is probably acceptable
+ * approach to coding is probably acceptable.
*
* @return null if field not stored correctly
* @throws IOException If there is a low-level I/O error
*/
public static TokenStream getAnyTokenStream(IndexReader reader, int docId,
- String field, Analyzer analyzer) throws IOException {
+ String field, Analyzer analyzer, int maxStartOffset) throws IOException {
TokenStream ts = null;
Fields vectors = reader.getTermVectors(docId);
if (vectors != null) {
Terms vector = vectors.terms(field);
if (vector != null) {
- ts = getTokenStream(vector);
+ ts = getTokenStream(vector, maxStartOffset);
}
}
@@ -102,13 +117,19 @@
return ts;
}
- /** Simply calls {@link #getTokenStream(org.apache.lucene.index.Terms)} now. */
+ /** Simply calls {@link #getTokenStream(org.apache.lucene.index.Terms, int)} now. */
@Deprecated
public static TokenStream getTokenStream(Terms vector,
boolean tokenPositionsGuaranteedContiguous) throws IOException {
- return getTokenStream(vector);
+ return getTokenStream(vector, -1);
}
+ /** @deprecated See {@link #getTokenStream(org.apache.lucene.index.Terms, int)}. */
+ @Deprecated
+ public static TokenStream getTokenStream(final Terms tpv) throws IOException {
+ return getTokenStream(tpv, -1);
+ }
+
/**
* Returns a token stream generated from a {@link Terms}. This
* can be used to feed the highlighter with a pre-parsed token
@@ -119,7 +140,7 @@
*
* @throws IllegalArgumentException if no offsets are available
*/
- public static TokenStream getTokenStream(final Terms tpv) throws IOException {
+ public static TokenStream getTokenStream(final Terms tpv, int maxStartOffset) throws IOException {
if (!tpv.hasOffsets()) {
throw new IllegalArgumentException("Highlighting requires offsets from the TokenStream.");
@@ -127,10 +148,19 @@
// highlighters require offsets, so we insist here.
}
- return new TokenStreamFromTermVector(tpv);
+ return new TokenStreamFromTermVector(tpv, maxStartOffset);
}
/**
+ * @deprecated See {@link #getTokenStreamWithOffsets(org.apache.lucene.index.IndexReader, int, String, int)}.
+ */
+ @Deprecated
+ public static TokenStream getTokenStreamWithOffsets(IndexReader reader, int docId,
+ String field) throws IOException {
+ return getTokenStreamWithOffsets(reader, docId, field, -1);
+ }
+
+ /**
* Returns a {@link TokenStream} with positions and offsets constructed from
* field termvectors. If the field has no termvectors or offsets
* are not included in the termvector, return null. See {@link #getTokenStream(org.apache.lucene.index.Terms)}
@@ -139,13 +169,14 @@
* @param reader the {@link IndexReader} to retrieve term vectors from
* @param docId the document to retrieve termvectors for
* @param field the field to retrieve termvectors for
+ * @param maxStartOffset max starting offset for tokens returned from term vectors; -1 disables the limit
* @return a {@link TokenStream}, or null if offsets are not available
* @throws IOException If there is a low-level I/O error
*
* @see #getTokenStream(org.apache.lucene.index.Terms)
*/
public static TokenStream getTokenStreamWithOffsets(IndexReader reader, int docId,
- String field) throws IOException {
+ String field, int maxStartOffset) throws IOException {
Fields vectors = reader.getTermVectors(docId);
if (vectors == null) {
@@ -161,7 +192,7 @@
return null;
}
- return getTokenStream(vector);
+ return getTokenStream(vector, maxStartOffset);
}
// convenience method
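(Illustration, assumed caller code rather than part of this patch: a highlighter that only analyzes the first N characters of a document can pass that budget as maxStartOffset, so tokens past the analysis window are never materialized. The "body" field name and the 10000-character budget are illustrative.)

    int maxDocCharsToAnalyze = 10000; // assumed analysis budget of the caller
    TokenStream ts = TokenSources.getTokenStreamWithOffsets(
        reader, docId, "body", maxDocCharsToAnalyze - 1);
    if (ts == null) {
      // no term vector with offsets for this field; fall back to re-analyzing stored content
    }
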
Index: lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java (revision 1669491)
+++ lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java (revision )
@@ -32,6 +32,7 @@
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefArray;
import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.UnicodeUtil;
@@ -52,8 +53,6 @@
*/
public final class TokenStreamFromTermVector extends TokenStream {
- //TODO add a maxStartOffset filter, which highlighters will find handy
-
//This attribute factory uses less memory when captureState() is called.
public static final AttributeFactory ATTRIBUTE_FACTORY =
AttributeFactory.getStaticImplementation(
@@ -65,9 +64,14 @@
private final PositionIncrementAttribute positionIncrementAttribute;
+ private final int maxStartOffset;
+
private OffsetAttribute offsetAttribute;//maybe null
private PayloadAttribute payloadAttribute;//maybe null
+
+ private CharsRefBuilder termCharsBuilder;//term data here
+
private BytesRefArray payloadsBytesRefArray;//only used when payloadAttribute is non-null
private BytesRefBuilder spareBytesRefBuilder;//only used when payloadAttribute is non-null
@@ -79,12 +83,14 @@
/**
* Constructor.
- *
+ *
* @param vector Terms that contains the data for
* creating the TokenStream. Must have positions and/or offsets.
+   * @param maxStartOffset if a token's start offset exceeds this, the token is not added; -1 disables the limit.
*/
- public TokenStreamFromTermVector(Terms vector) throws IOException {
+ public TokenStreamFromTermVector(Terms vector, int maxStartOffset) throws IOException {
super(ATTRIBUTE_FACTORY);
+ this.maxStartOffset = maxStartOffset < 0 ? Integer.MAX_VALUE : maxStartOffset;
assert !hasAttribute(PayloadAttribute.class) : "AttributeFactory shouldn't have payloads *yet*";
if (!vector.hasPositions() && !vector.hasOffsets()) {
throw new IllegalArgumentException("The term vector needs positions and/or offsets.");
@@ -106,15 +112,22 @@
//We delay initialization because we can see which attributes the consumer wants, particularly payloads
private void init() throws IOException {
assert !initialized;
+ short dpEnumFlags = PostingsEnum.POSITIONS;
if (vector.hasOffsets()) {
+ dpEnumFlags |= PostingsEnum.OFFSETS;
offsetAttribute = addAttribute(OffsetAttribute.class);
}
if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
+ dpEnumFlags |= (PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS);//must ask for offsets too
payloadAttribute = getAttribute(PayloadAttribute.class);
payloadsBytesRefArray = new BytesRefArray(Counter.newCounter());
spareBytesRefBuilder = new BytesRefBuilder();
}
+ // We put term data here
+ termCharsBuilder = new CharsRefBuilder();
+ termCharsBuilder.grow((int) (vector.size() * 7));//7 is over-estimate of average term len
+
// Step 1: iterate termsEnum and create a token, placing into an array of tokens by position
TokenLL[] positionedTokens = initTokensArray();
@@ -124,14 +137,17 @@
final TermsEnum termsEnum = vector.iterator(null);
BytesRef termBytesRef;
PostingsEnum dpEnum = null;
+ CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();//only for UTF8->UTF16 call
//int sumFreq = 0;
while ((termBytesRef = termsEnum.next()) != null) {
//Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
// note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand
- final char[] termChars = new char[termBytesRef.length];
- final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, termChars);
+ tempCharsRefBuilder.grow(termBytesRef.length);
+ final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
+ final int termCharsOff = termCharsBuilder.length();
+ termCharsBuilder.append(tempCharsRefBuilder.chars(), 0, termCharsLen);
- dpEnum = termsEnum.postings(null, dpEnum, PostingsEnum.POSITIONS);
+ dpEnum = termsEnum.postings(null, dpEnum, dpEnumFlags);
assert dpEnum != null; // presumably checked by TokenSources.hasPositions earlier
dpEnum.nextDoc();
final int freq = dpEnum.freq();
@@ -139,11 +155,14 @@
for (int j = 0; j < freq; j++) {
int pos = dpEnum.nextPosition();
TokenLL token = new TokenLL();
- token.termChars = termChars;
- token.termCharsLen = termCharsLen;
+ token.termCharsOff = termCharsOff;
+ token.termCharsLen = (short) Math.min(termCharsLen, Short.MAX_VALUE);
if (offsetAttribute != null) {
token.startOffset = dpEnum.startOffset();
- token.endOffset = dpEnum.endOffset();
+ if (token.startOffset > maxStartOffset) {
+ continue;//filter this token out; exceeds threshold
+ }
+ token.endOffsetInc = (short) Math.min(dpEnum.endOffset() - token.startOffset, Short.MAX_VALUE);
if (pos == -1) {
pos = token.startOffset >> 3;//divide by 8
}
@@ -216,8 +235,8 @@
}
private TokenLL[] initTokensArray() throws IOException {
- // Estimate the number of position slots we need. We use some estimation factors taken from Wikipedia
- // that reduce the likelihood of needing to expand the array.
+ // Estimate the number of position slots we need from term stats. We use some estimation factors taken from
+ // Wikipedia that reduce the likelihood of needing to expand the array.
int sumTotalTermFreq = (int) vector.getSumTotalTermFreq();
if (sumTotalTermFreq == -1) {//unfortunately term vectors seem to not have this stat
int size = (int) vector.size();
@@ -227,7 +246,12 @@
sumTotalTermFreq = (int)(size * 2.4);
}
final int originalPositionEstimate = (int) (sumTotalTermFreq * 1.5);//less than 1 in 10 docs exceed this
- return new TokenLL[originalPositionEstimate];
+
+ // This estimate is based on maxStartOffset. Err on the side of this being larger than needed.
+ final int offsetLimitPositionEstimate = (int) (maxStartOffset / 5.0);
+
+ // Take the smaller of the two estimates, but no smaller than 64
+ return new TokenLL[Math.max(64, Math.min(originalPositionEstimate, offsetLimitPositionEstimate))];
}
@Override
@@ -247,10 +271,10 @@
return false;
}
clearAttributes();
- termAttribute.copyBuffer(incrementToken.termChars, 0, incrementToken.termCharsLen);
+ termAttribute.copyBuffer(termCharsBuilder.chars(), incrementToken.termCharsOff, incrementToken.termCharsLen);
positionIncrementAttribute.setPositionIncrement(incrementToken.positionIncrement);
if (offsetAttribute != null) {
- offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.endOffset);
+ offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.startOffset + incrementToken.endOffsetInc);
}
if (payloadAttribute != null) {
if (incrementToken.payloadIndex == -1) {
@@ -263,11 +287,14 @@
}
private static class TokenLL {
- char[] termChars;
- int termCharsLen;
+ // This class should weigh 32 bytes, including object header
+
+ int termCharsOff; // see termCharsBuilder
+ short termCharsLen;
+
int positionIncrement;
int startOffset;
- int endOffset;
+ short endOffsetInc; // add to startOffset to get endOffset
int payloadIndex;
TokenLL next;
@@ -297,7 +324,7 @@
int compareOffsets(TokenLL tokenB) {
int cmp = Integer.compare(this.startOffset, tokenB.startOffset);
if (cmp == 0) {
- cmp = Integer.compare(this.endOffset, tokenB.endOffset);
+ cmp = Short.compare(this.endOffsetInc, tokenB.endOffsetInc);
}
return cmp;
}
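
(Illustration, not part of the patch: initTokensArray() now takes the smaller of two position estimates, floored at 64. The same arithmetic with made-up inputs sumTotalTermFreq = 200 and maxStartOffset = 150:)

    int originalPositionEstimate = (int) (200 * 1.5);      // 300 (Wikipedia-derived factor)
    int offsetLimitPositionEstimate = (int) (150 / 5.0);   // 30 (~1 position per 5 chars)
    int slots = Math.max(64, Math.min(300, 30));           // 64: the floor wins here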