Index: lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java (revision 1669491)
+++ lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java (revision )
@@ -254,7 +254,7 @@
new QueryScorer(phraseQuery));
final TokenStream tokenStream = TokenSources
.getTokenStream(
- indexReader.getTermVector(0, FIELD));
+ indexReader.getTermVector(0, FIELD), -1);
assertEquals("<B>the fox</B> did not jump",
highlighter.getBestFragment(tokenStream, TEXT));
} finally {
@@ -283,7 +283,7 @@
try {
assertEquals(1, indexReader.numDocs());
TokenSources.getTokenStream(
- indexReader.getTermVector(0, FIELD));
+ indexReader.getTermVector(0, FIELD), -1);
fail("TokenSources.getTokenStream should throw IllegalArgumentException if term vector has no offsets");
}
catch (IllegalArgumentException e) {
@@ -333,7 +333,7 @@
writer.close();
assertEquals(1, reader.numDocs());

- TokenStream ts = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"));
+ TokenStream ts = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"), -1);

CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
@@ -409,7 +409,7 @@
writer.close();
assertEquals(1, reader.numDocs());

- TokenStream vectorTokenStream = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"));
+ TokenStream vectorTokenStream = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"), -1);

//sometimes check payloads
PayloadAttribute payloadAttribute = null;
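Note on the test changes above: they exercise the new second argument only with -1, which disables the offset limit and preserves the old behavior. A minimal sketch of the capped case, assuming an open IndexReader over a field indexed with term vectors plus offsets; the "body" field name and the 10,000-character cap are illustrative, not taken from the patch:

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.search.highlight.TokenSources;

    class CappedTokenStreamSketch {
      /** Tokens starting past the first 10,000 characters are dropped. */
      static TokenStream firstTenKChars(IndexReader reader, int docId) throws IOException {
        // Passing -1 instead of 10_000 would disable the cap (old behavior).
        return TokenSources.getTokenStream(reader.getTermVector(docId, "body"), 10_000);
      }
    }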
Index: lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (revision 1669491)
+++ lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (revision )
@@ -31,10 +31,19 @@

/**
* Hides implementation issues associated with obtaining a TokenStream for use
- * with the higlighter - can obtain from TermFreqVectors with offsets and
- * (optionally) positions or from Analyzer class reparsing the stored content.
+ * with the highlighter - can obtain from TermFreqVectors with offsets and
+ * (optionally) positions or from Analyzer class re-parsing the stored content.
*/
public class TokenSources {
+
+ /** @deprecated See {@link #getAnyTokenStream(org.apache.lucene.index.IndexReader, int, String, org.apache.lucene.index.StoredDocument, org.apache.lucene.analysis.Analyzer, int)}. */
+ @Deprecated
+ public static TokenStream getAnyTokenStream(IndexReader reader, int docId,
+ String field, StoredDocument document, Analyzer analyzer)
+ throws IOException {
+ return getAnyTokenStream(reader, docId, field, document, analyzer, -1);
+ }
+
/**
* A convenience method that tries to first get a {@link TokenStreamFromTermVector} for the
* specified docId, then, falls back to using the passed in
@@ -54,16 +63,15 @@
* {@link org.apache.lucene.document.Document}
* @throws IOException if there was an error loading
*/
-
- public static TokenStream getAnyTokenStream(IndexReader reader, int docId,
- String field, StoredDocument document, Analyzer analyzer) throws IOException {
+ public static TokenStream getAnyTokenStream(IndexReader reader, int docId, String field, StoredDocument document,
+ Analyzer analyzer, int maxStartOffset) throws IOException {
TokenStream ts = null;

Fields vectors = reader.getTermVectors(docId);
if (vectors != null) {
Terms vector = vectors.terms(field);
if (vector != null) {
- ts = getTokenStream(vector);
+ ts = getTokenStream(vector, maxStartOffset);
}
}

@@ -74,24 +82,31 @@
return ts;
}

+ /** @deprecated See {@link #getAnyTokenStream(org.apache.lucene.index.IndexReader, int, String, org.apache.lucene.analysis.Analyzer, int)}. */
| + @Deprecated |
| + public static TokenStream getAnyTokenStream(IndexReader reader, int docId, |
| + String field, Analyzer analyzer) throws IOException { |
| + return getAnyTokenStream(reader, docId, field, analyzer, -1); |
| + } |
| + |
| /** |
| * A convenience method that tries a number of approaches to getting a token |
| * stream. The cost of finding there are no termVectors in the index is |
| * minimal (1000 invocations still registers 0 ms). So this "lazy" (flexible?) |
| - * approach to coding is probably acceptable |
| + * approach to coding is probably acceptable. |
| * |
| * @return null if field not stored correctly |
| * @throws IOException If there is a low-level I/O error |
| */ |
| public static TokenStream getAnyTokenStream(IndexReader reader, int docId, |
| - String field, Analyzer analyzer) throws IOException { |
| + String field, Analyzer analyzer, int maxStartOffset) throws IOException { |
| TokenStream ts = null; |
| |
| Fields vectors = reader.getTermVectors(docId); |
| if (vectors != null) { |
| Terms vector = vectors.terms(field); |
| if (vector != null) { |
| - ts = getTokenStream(vector); |
| + ts = getTokenStream(vector, maxStartOffset); |
| } |
| } |
| |
| @@ -102,13 +117,19 @@ |
| return ts; |
| } |
| |
| - /** Simply calls {@link #getTokenStream(org.apache.lucene.index.Terms)} now. */ |
| + /** Simply calls {@link #getTokenStream(org.apache.lucene.index.Terms, int)} now. */ |
| @Deprecated |
| public static TokenStream getTokenStream(Terms vector, |
| boolean tokenPositionsGuaranteedContiguous) throws IOException { |
| - return getTokenStream(vector); |
| + return getTokenStream(vector, -1); |
| } |
| |
| + /** @deprecated See {@link #getTokenStream(org.apache.lucene.index.Terms, int)}. */ |
| + @Deprecated |
| + public static TokenStream getTokenStream(final Terms tpv) throws IOException { |
| + return getTokenStream(tpv, -1); |
| + } |
| + |
| /** |
| * Returns a token stream generated from a {@link Terms}. This |
| * can be used to feed the highlighter with a pre-parsed token |
| @@ -119,7 +140,7 @@ |
| * |
| * @throws IllegalArgumentException if no offsets are available |
| */ |
| - public static TokenStream getTokenStream(final Terms tpv) throws IOException { |
| + public static TokenStream getTokenStream(final Terms tpv, int maxStartOffset) throws IOException { |
| |
| if (!tpv.hasOffsets()) { |
| throw new IllegalArgumentException("Highlighting requires offsets from the TokenStream."); |
| @@ -127,10 +148,19 @@ |
| // highlighters require offsets, so we insist here. |
| } |
| |
| - return new TokenStreamFromTermVector(tpv); |
| + return new TokenStreamFromTermVector(tpv, maxStartOffset); |
| } |
| |
| /** |
| + * @deprecated See {@link #getTokenStreamWithOffsets(org.apache.lucene.index.IndexReader, int, String, int)}. |
| + */ |
| + @Deprecated |
| + public static TokenStream getTokenStreamWithOffsets(IndexReader reader, int docId, |
| + String field) throws IOException { |
| + return getTokenStreamWithOffsets(reader, docId, field, -1); |
| + } |
| + |
| + /** |
| * Returns a {@link TokenStream} with positions and offsets constructed from |
| * field termvectors. If the field has no termvectors or offsets |
| * are not included in the termvector, return null. See {@link #getTokenStream(org.apache.lucene.index.Terms)} |
| @@ -139,13 +169,14 @@ |
| * @param reader the {@link IndexReader} to retrieve term vectors from |
| * @param docId the document to retrieve termvectors for |
| * @param field the field to retrieve termvectors for |
| + * @param maxStartOffset max starting offset for tokens returned from term vectors; -1 disables the limit |
| * @return a {@link TokenStream}, or null if offsets are not available |
| * @throws IOException If there is a low-level I/O error |
| * |
| * @see #getTokenStream(org.apache.lucene.index.Terms) |
| */ |
| public static TokenStream getTokenStreamWithOffsets(IndexReader reader, int docId, |
| - String field) throws IOException { |
| + String field, int maxStartOffset) throws IOException { |
| |
| Fields vectors = reader.getTermVectors(docId); |
| if (vectors == null) { |
| @@ -161,7 +192,7 @@ |
| return null; |
| } |
| |
| - return getTokenStream(vector); |
| + return getTokenStream(vector, maxStartOffset); |
| } |
| |
| // convenience method |
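A caller's-eye summary of the TokenSources changes above: every public entry point gains a maxStartOffset parameter, and the old signatures remain as deprecated shims that delegate with -1. A sketch under those assumptions, with a hypothetical field name and cap; note that the cap is applied only on the term-vector path, while the Analyzer fallback still re-parses the entire stored text:

    import java.io.IOException;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.search.highlight.TokenSources;

    class AnyTokenStreamSketch {
      static TokenStream tokens(IndexReader reader, int docId, Analyzer analyzer)
          throws IOException {
        // Old call sites keep compiling via the deprecated overload, which
        // now forwards with maxStartOffset = -1 (no limit):
        //   TokenSources.getAnyTokenStream(reader, docId, "body", analyzer);
        // New call sites pass the cap explicitly; tokens whose start offset
        // exceeds 50,000 are skipped when term vectors supply the stream.
        return TokenSources.getAnyTokenStream(reader, docId, "body", analyzer, 50_000);
      }
    }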
Index: lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java (revision 1669491)
+++ lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java (revision )
@@ -32,6 +32,7 @@
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefArray;
import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.UnicodeUtil;

@@ -52,8 +53,6 @@
*/
public final class TokenStreamFromTermVector extends TokenStream {

- //TODO add a maxStartOffset filter, which highlighters will find handy
-
//This attribute factory uses less memory when captureState() is called.
public static final AttributeFactory ATTRIBUTE_FACTORY =
AttributeFactory.getStaticImplementation(
@@ -65,9 +64,14 @@

private final PositionIncrementAttribute positionIncrementAttribute;

+ private final int maxStartOffset;
+
private OffsetAttribute offsetAttribute;//maybe null

private PayloadAttribute payloadAttribute;//maybe null
+
+ private CharsRefBuilder termCharsBuilder;//term data here
+
private BytesRefArray payloadsBytesRefArray;//only used when payloadAttribute is non-null
private BytesRefBuilder spareBytesRefBuilder;//only used when payloadAttribute is non-null

@@ -79,12 +83,14 @@

/**
* Constructor.
- *
+ *
* @param vector Terms that contains the data for
* creating the TokenStream. Must have positions and/or offsets.
+ * @param maxStartOffset if a token's start offset exceeds this then the token is not added. -1 disables the limit.
*/
- public TokenStreamFromTermVector(Terms vector) throws IOException {
+ public TokenStreamFromTermVector(Terms vector, int maxStartOffset) throws IOException {
super(ATTRIBUTE_FACTORY);
+ this.maxStartOffset = maxStartOffset < 0 ? Integer.MAX_VALUE : maxStartOffset;
assert !hasAttribute(PayloadAttribute.class) : "AttributeFactory shouldn't have payloads *yet*";
if (!vector.hasPositions() && !vector.hasOffsets()) {
throw new IllegalArgumentException("The term vector needs positions and/or offsets.");
@@ -106,15 +112,22 @@
//We delay initialization because we can see which attributes the consumer wants, particularly payloads
private void init() throws IOException {
assert !initialized;
+ short dpEnumFlags = PostingsEnum.POSITIONS;
if (vector.hasOffsets()) {
+ dpEnumFlags |= PostingsEnum.OFFSETS;
offsetAttribute = addAttribute(OffsetAttribute.class);
}
if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
+ dpEnumFlags |= (PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS);//must ask for offsets too
payloadAttribute = getAttribute(PayloadAttribute.class);
payloadsBytesRefArray = new BytesRefArray(Counter.newCounter());
spareBytesRefBuilder = new BytesRefBuilder();
}

+ // We put term data here
+ termCharsBuilder = new CharsRefBuilder();
+ termCharsBuilder.grow((int) (vector.size() * 7));//7 is over-estimate of average term len
+
// Step 1: iterate termsEnum and create a token, placing into an array of tokens by position

TokenLL[] positionedTokens = initTokensArray();
@@ -124,14 +137,17 @@
final TermsEnum termsEnum = vector.iterator(null);
BytesRef termBytesRef;
PostingsEnum dpEnum = null;
+ CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();//only for UTF8->UTF16 call
//int sumFreq = 0;
while ((termBytesRef = termsEnum.next()) != null) {
//Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
// note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand
- final char[] termChars = new char[termBytesRef.length];
- final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, termChars);
+ tempCharsRefBuilder.grow(termBytesRef.length);
+ final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
+ final int termCharsOff = termCharsBuilder.length();
+ termCharsBuilder.append(tempCharsRefBuilder.chars(), 0, termCharsLen);

- dpEnum = termsEnum.postings(null, dpEnum, PostingsEnum.POSITIONS);
+ dpEnum = termsEnum.postings(null, dpEnum, dpEnumFlags);
assert dpEnum != null; // presumably checked by TokenSources.hasPositions earlier
dpEnum.nextDoc();
final int freq = dpEnum.freq();
@@ -139,11 +155,14 @@
for (int j = 0; j < freq; j++) {
int pos = dpEnum.nextPosition();
TokenLL token = new TokenLL();
- token.termChars = termChars;
- token.termCharsLen = termCharsLen;
+ token.termCharsOff = termCharsOff;
+ token.termCharsLen = (short) Math.min(termCharsLen, Short.MAX_VALUE);
if (offsetAttribute != null) {
token.startOffset = dpEnum.startOffset();
- token.endOffset = dpEnum.endOffset();
+ if (token.startOffset > maxStartOffset) {
+ continue;//filter this token out; exceeds threshold
+ }
+ token.endOffsetInc = (short) Math.min(dpEnum.endOffset() - token.startOffset, Short.MAX_VALUE);
if (pos == -1) {
pos = token.startOffset >> 3;//divide by 8
}
@@ -216,8 +235,8 @@
}

private TokenLL[] initTokensArray() throws IOException {
- // Estimate the number of position slots we need. We use some estimation factors taken from Wikipedia
- // that reduce the likelihood of needing to expand the array.
+ // Estimate the number of position slots we need from term stats. We use some estimation factors taken from
+ // Wikipedia that reduce the likelihood of needing to expand the array.
int sumTotalTermFreq = (int) vector.getSumTotalTermFreq();
if (sumTotalTermFreq == -1) {//unfortunately term vectors seem to not have this stat
int size = (int) vector.size();
@@ -227,7 +246,12 @@
sumTotalTermFreq = (int)(size * 2.4);
}
final int originalPositionEstimate = (int) (sumTotalTermFreq * 1.5);//less than 1 in 10 docs exceed this
- return new TokenLL[originalPositionEstimate];
+
+ // This estimate is based on maxStartOffset. Err on the side of this being larger than needed.
+ final int offsetLimitPositionEstimate = (int) (maxStartOffset / 5.0);
+
+ // Take the smaller of the two estimates, but no smaller than 64
+ return new TokenLL[Math.max(64, Math.min(originalPositionEstimate, offsetLimitPositionEstimate))];
}

@Override
@@ -247,10 +271,10 @@
return false;
}
clearAttributes();
- termAttribute.copyBuffer(incrementToken.termChars, 0, incrementToken.termCharsLen);
+ termAttribute.copyBuffer(termCharsBuilder.chars(), incrementToken.termCharsOff, incrementToken.termCharsLen);
positionIncrementAttribute.setPositionIncrement(incrementToken.positionIncrement);
if (offsetAttribute != null) {
- offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.endOffset);
+ offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.startOffset + incrementToken.endOffsetInc);
}
if (payloadAttribute != null) {
if (incrementToken.payloadIndex == -1) {
@@ -263,11 +287,14 @@
}

private static class TokenLL {
- char[] termChars;
- int termCharsLen;
+ // This class should weigh 32 bytes, including object header
+
+ int termCharsOff; // see termCharsBuilder
+ short termCharsLen;
+
int positionIncrement;
int startOffset;
- int endOffset;
+ short endOffsetInc; // add to startOffset to get endOffset
int payloadIndex;

TokenLL next;
@@ -297,7 +324,7 @@
int compareOffsets(TokenLL tokenB) {
int cmp = Integer.compare(this.startOffset, tokenB.startOffset);
if (cmp == 0) {
- cmp = Integer.compare(this.endOffset, tokenB.endOffset);
+ cmp = Short.compare(this.endOffsetInc, tokenB.endOffsetInc);
}
return cmp;
}
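The least obvious part of the TokenStreamFromTermVector changes is the token packing: term text is pooled into a single shared CharsRefBuilder so that each TokenLL holds only an (offset, length) pair into the pool, and end offsets are stored as a short delta from the start offset. A toy illustration of the same packing trick, separate from the patch and with hypothetical names and values:

    import org.apache.lucene.util.CharsRefBuilder;

    class PackedTokenSketch {
      /** Mirrors TokenLL's layout: no per-token char[], short-sized deltas. */
      static class PackedToken {
        int termOff;        // start of this token's text in the shared pool
        short termLen;      // term length, clamped to Short.MAX_VALUE
        int startOffset;    // absolute start offset in the source text
        short endOffsetInc; // endOffset is reconstructed as startOffset + endOffsetInc
      }

      public static void main(String[] args) {
        CharsRefBuilder pool = new CharsRefBuilder(); // one buffer for all terms

        char[] term = "fox".toCharArray();            // token "fox" at offsets [4,7)
        PackedToken t = new PackedToken();
        t.termOff = pool.length();
        t.termLen = (short) Math.min(term.length, Short.MAX_VALUE);
        pool.append(term, 0, term.length);
        t.startOffset = 4;
        t.endOffsetInc = (short) Math.min(7 - t.startOffset, Short.MAX_VALUE);

        // Read side: rebuild the term text and the absolute end offset.
        String termText = new String(pool.chars(), t.termOff, t.termLen);
        int endOffset = t.startOffset + t.endOffsetInc;
        System.out.println(termText + " [" + t.startOffset + "," + endOffset + ")");
      }
    }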