Index: lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java (revision 1669491)
+++ lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java (revision )
@@ -254,7 +254,7 @@
new QueryScorer(phraseQuery));
final TokenStream tokenStream = TokenSources
.getTokenStream(
- indexReader.getTermVector(0, FIELD));
+ indexReader.getTermVector(0, FIELD), -1);
assertEquals("<B>the fox</B> did not jump",
highlighter.getBestFragment(tokenStream, TEXT));
} finally {
@@ -283,7 +283,7 @@
try {
assertEquals(1, indexReader.numDocs());
TokenSources.getTokenStream(
- indexReader.getTermVector(0, FIELD));
+ indexReader.getTermVector(0, FIELD), -1);
fail("TokenSources.getTokenStream should throw IllegalArgumentException if term vector has no offsets");
}
catch (IllegalArgumentException e) {
@@ -333,7 +333,7 @@
writer.close();
assertEquals(1, reader.numDocs());

- TokenStream ts = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"));
+ TokenStream ts = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"), -1);

CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
@@ -409,7 +409,7 @@
writer.close();
assertEquals(1, reader.numDocs());

- TokenStream vectorTokenStream = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"));
+ TokenStream vectorTokenStream = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"), -1);

//sometimes check payloads
PayloadAttribute payloadAttribute = null;
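Note on the test changes above: they exercise the new second argument only with -1, which disables the offset limit and preserves the old behavior. A minimal sketch of the capped case, assuming an open IndexReader over a field indexed with term vectors plus offsets; the "body" field name and the 10,000-character cap are illustrative, not taken from the patch:

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.search.highlight.TokenSources;

    class CappedTokenStreamSketch {
      /** Tokens starting past the first 10,000 characters are dropped. */
      static TokenStream firstTenKChars(IndexReader reader, int docId) throws IOException {
        // Passing -1 instead of 10_000 would disable the cap (old behavior).
        return TokenSources.getTokenStream(reader.getTermVector(docId, "body"), 10_000);
      }
    }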
Index: lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (revision 1669491)
+++ lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (revision )
@@ -31,10 +31,19 @@

/**
* Hides implementation issues associated with obtaining a TokenStream for use
- * with the higlighter - can obtain from TermFreqVectors with offsets and
- * (optionally) positions or from Analyzer class reparsing the stored content.
+ * with the highlighter - can obtain from TermFreqVectors with offsets and
+ * (optionally) positions or from Analyzer class re-parsing the stored content.
*/
public class TokenSources {
+
+ /** @deprecated See {@link #getAnyTokenStream(org.apache.lucene.index.IndexReader, int, String, org.apache.lucene.index.StoredDocument, org.apache.lucene.analysis.Analyzer, int)}. */
+ @Deprecated
+ public static TokenStream getAnyTokenStream(IndexReader reader, int docId,
+ String field, StoredDocument document, Analyzer analyzer)
+ throws IOException {
+ return getAnyTokenStream(reader, docId, field, document, analyzer, -1);
+ }
+
/**
* A convenience method that tries to first get a {@link TokenStreamFromTermVector} for the
* specified docId, then, falls back to using the passed in
@@ -54,16 +63,15 @@
* {@link org.apache.lucene.document.Document}
* @throws IOException if there was an error loading
*/
-
- public static TokenStream getAnyTokenStream(IndexReader reader, int docId,
- String field, StoredDocument document, Analyzer analyzer) throws IOException {
+ public static TokenStream getAnyTokenStream(IndexReader reader, int docId, String field, StoredDocument document,
+ Analyzer analyzer, int maxStartOffset) throws IOException {
TokenStream ts = null;

Fields vectors = reader.getTermVectors(docId);
if (vectors != null) {
Terms vector = vectors.terms(field);
if (vector != null) {
- ts = getTokenStream(vector);
+ ts = getTokenStream(vector, maxStartOffset);
}
}

@@ -74,24 +82,31 @@
return ts;
}

+ /** @deprecated See {@link #getAnyTokenStream(org.apache.lucene.index.IndexReader, int, String, org.apache.lucene.analysis.Analyzer, int)}. */
| + @Deprecated |
| + public static TokenStream getAnyTokenStream(IndexReader reader, int docId, |
| + String field, Analyzer analyzer) throws IOException { |
| + return getAnyTokenStream(reader, docId, field, analyzer, -1); |
| + } |
| + |
| /** |
| * A convenience method that tries a number of approaches to getting a token |
| * stream. The cost of finding there are no termVectors in the index is |
| * minimal (1000 invocations still registers 0 ms). So this "lazy" (flexible?) |
| - * approach to coding is probably acceptable |
| + * approach to coding is probably acceptable. |
| * |
| * @return null if field not stored correctly |
| * @throws IOException If there is a low-level I/O error |
| */ |
| public static TokenStream getAnyTokenStream(IndexReader reader, int docId, |
| - String field, Analyzer analyzer) throws IOException { |
| + String field, Analyzer analyzer, int maxStartOffset) throws IOException { |
| TokenStream ts = null; |
| |
| Fields vectors = reader.getTermVectors(docId); |
| if (vectors != null) { |
| Terms vector = vectors.terms(field); |
| if (vector != null) { |
| - ts = getTokenStream(vector); |
| + ts = getTokenStream(vector, maxStartOffset); |
| } |
| } |
| |
| @@ -102,13 +117,19 @@ |
| return ts; |
| } |
| |
| - /** Simply calls {@link #getTokenStream(org.apache.lucene.index.Terms)} now. */ |
| + /** Simply calls {@link #getTokenStream(org.apache.lucene.index.Terms, int)} now. */ |
| @Deprecated |
| public static TokenStream getTokenStream(Terms vector, |
| boolean tokenPositionsGuaranteedContiguous) throws IOException { |
| - return getTokenStream(vector); |
| + return getTokenStream(vector, -1); |
| } |
| |
| + /** @deprecated See {@link #getTokenStream(org.apache.lucene.index.Terms, int)}. */ |
| + @Deprecated |
| + public static TokenStream getTokenStream(final Terms tpv) throws IOException { |
| + return getTokenStream(tpv, -1); |
| + } |
| + |
| /** |
| * Returns a token stream generated from a {@link Terms}. This |
| * can be used to feed the highlighter with a pre-parsed token |
| @@ -119,7 +140,7 @@ |
| * |
| * @throws IllegalArgumentException if no offsets are available |
| */ |
| - public static TokenStream getTokenStream(final Terms tpv) throws IOException { |
| + public static TokenStream getTokenStream(final Terms tpv, int maxStartOffset) throws IOException { |
| |
| if (!tpv.hasOffsets()) { |
| throw new IllegalArgumentException("Highlighting requires offsets from the TokenStream."); |
| @@ -127,10 +148,19 @@ |
| // highlighters require offsets, so we insist here. |
| } |
| |
| - return new TokenStreamFromTermVector(tpv); |
| + return new TokenStreamFromTermVector(tpv, maxStartOffset); |
| } |
| |
| /** |
| + * @deprecated See {@link #getTokenStreamWithOffsets(org.apache.lucene.index.IndexReader, int, String, int)}. |
| + */ |
| + @Deprecated |
| + public static TokenStream getTokenStreamWithOffsets(IndexReader reader, int docId, |
| + String field) throws IOException { |
| + return getTokenStreamWithOffsets(reader, docId, field, -1); |
| + } |
| + |
| + /** |
| * Returns a {@link TokenStream} with positions and offsets constructed from |
| * field termvectors. If the field has no termvectors or offsets |
| * are not included in the termvector, return null. See {@link #getTokenStream(org.apache.lucene.index.Terms)} |
| @@ -139,13 +169,14 @@ |
| * @param reader the {@link IndexReader} to retrieve term vectors from |
| * @param docId the document to retrieve termvectors for |
| * @param field the field to retrieve termvectors for |
| + * @param maxStartOffset max starting offset for tokens returned from term vectors; -1 disables the limit |
| * @return a {@link TokenStream}, or null if offsets are not available |
| * @throws IOException If there is a low-level I/O error |
| * |
| * @see #getTokenStream(org.apache.lucene.index.Terms) |
| */ |
| public static TokenStream getTokenStreamWithOffsets(IndexReader reader, int docId, |
| - String field) throws IOException { |
| + String field, int maxStartOffset) throws IOException { |
| |
| Fields vectors = reader.getTermVectors(docId); |
| if (vectors == null) { |
| @@ -161,7 +192,7 @@ |
| return null; |
| } |
| |
| - return getTokenStream(vector); |
| + return getTokenStream(vector, maxStartOffset); |
| } |
| |
| // convenience method |
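A caller's-eye summary of the TokenSources changes above: every public entry point gains a maxStartOffset parameter, and the old signatures remain as deprecated shims that delegate with -1. A sketch under those assumptions, with a hypothetical field name and cap; note that the cap is applied only on the term-vector path, while the Analyzer fallback still re-parses the entire stored text:

    import java.io.IOException;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.search.highlight.TokenSources;

    class AnyTokenStreamSketch {
      static TokenStream tokens(IndexReader reader, int docId, Analyzer analyzer)
          throws IOException {
        // Old call sites keep compiling via the deprecated overload, which
        // now forwards with maxStartOffset = -1 (no limit):
        //   TokenSources.getAnyTokenStream(reader, docId, "body", analyzer);
        // New call sites pass the cap explicitly; tokens whose start offset
        // exceeds 50,000 are skipped when term vectors supply the stream.
        return TokenSources.getAnyTokenStream(reader, docId, "body", analyzer, 50_000);
      }
    }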
Index: lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java (revision 1669491)
+++ lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java (revision )
@@ -32,6 +32,7 @@
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefArray;
import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.UnicodeUtil;

@@ -52,8 +53,6 @@
*/
public final class TokenStreamFromTermVector extends TokenStream {

- //TODO add a maxStartOffset filter, which highlighters will find handy
-
//This attribute factory uses less memory when captureState() is called.
public static final AttributeFactory ATTRIBUTE_FACTORY =
AttributeFactory.getStaticImplementation(
@@ -65,9 +64,14 @@

private final PositionIncrementAttribute positionIncrementAttribute;

+ private final int maxStartOffset;
+
private OffsetAttribute offsetAttribute;//maybe null

private PayloadAttribute payloadAttribute;//maybe null
+
+ private CharsRefBuilder termCharsBuilder;//term data here
+
private BytesRefArray payloadsBytesRefArray;//only used when payloadAttribute is non-null
private BytesRefBuilder spareBytesRefBuilder;//only used when payloadAttribute is non-null

@@ -79,12 +83,14 @@

/**
* Constructor.
- *
+ *
* @param vector Terms that contains the data for
* creating the TokenStream. Must have positions and/or offsets.
+ * @param maxStartOffset if a token's start offset exceeds this then the token is not added. -1 disables the limit.
*/
- public TokenStreamFromTermVector(Terms vector) throws IOException {
+ public TokenStreamFromTermVector(Terms vector, int maxStartOffset) throws IOException {
super(ATTRIBUTE_FACTORY);
+ this.maxStartOffset = maxStartOffset < 0 ? Integer.MAX_VALUE : maxStartOffset;
assert !hasAttribute(PayloadAttribute.class) : "AttributeFactory shouldn't have payloads *yet*";
if (!vector.hasPositions() && !vector.hasOffsets()) {
throw new IllegalArgumentException("The term vector needs positions and/or offsets.");
@@ -106,15 +112,22 @@
//We delay initialization because we can see which attributes the consumer wants, particularly payloads
private void init() throws IOException {
assert !initialized;
+ short dpEnumFlags = PostingsEnum.POSITIONS;
if (vector.hasOffsets()) {
+ dpEnumFlags |= PostingsEnum.OFFSETS;
offsetAttribute = addAttribute(OffsetAttribute.class);
}
if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
+ dpEnumFlags |= (PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS);//must ask for offsets too
payloadAttribute = getAttribute(PayloadAttribute.class);
payloadsBytesRefArray = new BytesRefArray(Counter.newCounter());
spareBytesRefBuilder = new BytesRefBuilder();
}

+ // We put term data here
+ termCharsBuilder = new CharsRefBuilder();
+ termCharsBuilder.grow((int) (vector.size() * 7));//7 is over-estimate of average term len
+
// Step 1: iterate termsEnum and create a token, placing into an array of tokens by position

TokenLL[] positionedTokens = initTokensArray();
@@ -124,14 +137,17 @@
final TermsEnum termsEnum = vector.iterator(null);
BytesRef termBytesRef;
PostingsEnum dpEnum = null;
+ CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();//only for UTF8->UTF16 call
//int sumFreq = 0;
while ((termBytesRef = termsEnum.next()) != null) {
//Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
// note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand
- final char[] termChars = new char[termBytesRef.length];
- final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, termChars);
+ tempCharsRefBuilder.grow(termBytesRef.length);
+ final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
+ final int termCharsOff = termCharsBuilder.length();
+ termCharsBuilder.append(tempCharsRefBuilder.chars(), 0, termCharsLen);

- dpEnum = termsEnum.postings(null, dpEnum, PostingsEnum.POSITIONS);
+ dpEnum = termsEnum.postings(null, dpEnum, dpEnumFlags);
assert dpEnum != null; // presumably checked by TokenSources.hasPositions earlier
dpEnum.nextDoc();
final int freq = dpEnum.freq();
@@ -139,11 +155,14 @@
for (int j = 0; j < freq; j++) {
int pos = dpEnum.nextPosition();
TokenLL token = new TokenLL();
- token.termChars = termChars;
- token.termCharsLen = termCharsLen;
+ token.termCharsOff = termCharsOff;
+ token.termCharsLen = (short) Math.min(termCharsLen, Short.MAX_VALUE);
if (offsetAttribute != null) {
token.startOffset = dpEnum.startOffset();
- token.endOffset = dpEnum.endOffset();
+ if (token.startOffset > maxStartOffset) {
+ continue;//filter this token out; exceeds threshold
+ }
+ token.endOffsetInc = (short) Math.min(dpEnum.endOffset() - token.startOffset, Short.MAX_VALUE);
if (pos == -1) {
pos = token.startOffset >> 3;//divide by 8
}
@@ -216,8 +235,8 @@
}

private TokenLL[] initTokensArray() throws IOException {
- // Estimate the number of position slots we need. We use some estimation factors taken from Wikipedia
- // that reduce the likelihood of needing to expand the array.
+ // Estimate the number of position slots we need from term stats. We use some estimation factors taken from
+ // Wikipedia that reduce the likelihood of needing to expand the array.
int sumTotalTermFreq = (int) vector.getSumTotalTermFreq();
if (sumTotalTermFreq == -1) {//unfortunately term vectors seem to not have this stat
int size = (int) vector.size();
@@ -227,7 +246,12 @@
sumTotalTermFreq = (int)(size * 2.4);
}
final int originalPositionEstimate = (int) (sumTotalTermFreq * 1.5);//less than 1 in 10 docs exceed this
- return new TokenLL[originalPositionEstimate];
+
+ // This estimate is based on maxStartOffset. Err on the side of this being larger than needed.
+ final int offsetLimitPositionEstimate = (int) (maxStartOffset / 5.0);
+
+ // Take the smaller of the two estimates, but no smaller than 64
+ return new TokenLL[Math.max(64, Math.min(originalPositionEstimate, offsetLimitPositionEstimate))];
}

@Override
@@ -247,10 +271,10 @@
return false;
}
clearAttributes();
- termAttribute.copyBuffer(incrementToken.termChars, 0, incrementToken.termCharsLen);
+ termAttribute.copyBuffer(termCharsBuilder.chars(), incrementToken.termCharsOff, incrementToken.termCharsLen);
positionIncrementAttribute.setPositionIncrement(incrementToken.positionIncrement);
if (offsetAttribute != null) {
- offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.endOffset);
+ offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.startOffset + incrementToken.endOffsetInc);
}
if (payloadAttribute != null) {
if (incrementToken.payloadIndex == -1) {
@@ -263,11 +287,14 @@
}

private static class TokenLL {
- char[] termChars;
- int termCharsLen;
+ // This class should weigh 32 bytes, including object header
+
+ int termCharsOff; // see termCharsBuilder
+ short termCharsLen;
+
int positionIncrement;
int startOffset;
- int endOffset;
+ short endOffsetInc; // add to startOffset to get endOffset
int payloadIndex;

TokenLL next;
@@ -297,7 +324,7 @@
int compareOffsets(TokenLL tokenB) {
int cmp = Integer.compare(this.startOffset, tokenB.startOffset);
if (cmp == 0) {
- cmp = Integer.compare(this.endOffset, tokenB.endOffset);
+ cmp = Short.compare(this.endOffsetInc, tokenB.endOffsetInc);
}
return cmp;
}
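The least obvious part of the TokenStreamFromTermVector changes is the token packing: term text is pooled into a single shared CharsRefBuilder so that each TokenLL holds only an (offset, length) pair into the pool, and end offsets are stored as a short delta from the start offset. A toy illustration of the same packing trick, separate from the patch and with hypothetical names and values:

    import org.apache.lucene.util.CharsRefBuilder;

    class PackedTokenSketch {
      /** Mirrors TokenLL's layout: no per-token char[], short-sized deltas. */
      static class PackedToken {
        int termOff;        // start of this token's text in the shared pool
        short termLen;      // term length, clamped to Short.MAX_VALUE
        int startOffset;    // absolute start offset in the source text
        short endOffsetInc; // endOffset is reconstructed as startOffset + endOffsetInc
      }

      public static void main(String[] args) {
        CharsRefBuilder pool = new CharsRefBuilder(); // one buffer for all terms

        char[] term = "fox".toCharArray();            // token "fox" at offsets [4,7)
        PackedToken t = new PackedToken();
        t.termOff = pool.length();
        t.termLen = (short) Math.min(term.length, Short.MAX_VALUE);
        pool.append(term, 0, term.length);
        t.startOffset = 4;
        t.endOffsetInc = (short) Math.min(7 - t.startOffset, Short.MAX_VALUE);

        // Read side: rebuild the term text and the absolute end offset.
        String termText = new String(pool.chars(), t.termOff, t.termLen);
        int endOffset = t.startOffset + t.endOffsetInc;
        System.out.println(termText + " [" + t.startOffset + "," + endOffset + ")");
      }
    }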