docs/attachments/LUCENE-1333/LUCENE-1333a.txt - lucene-jira-archive - Git at Google

 Index: src/test/org/apache/lucene/analysis/TestToken.java
 ===================================================================
 --- src/test/org/apache/lucene/analysis/TestToken.java	(revision 675655)
 +++ src/test/org/apache/lucene/analysis/TestToken.java	(working copy)
 @@ -26,6 +26,119 @@
      super(name);
    }

 +  public void testCtor() throws Exception {
 +    Token t = new Token();
 +    char[] content = "hello".toCharArray();
 +    t.setTermBuffer(content, 0, content.length);
 +    char[] buf = t.termBuffer();
 +    assertNotSame(t.termBuffer(), content);
 +    assertEquals("hello", new String(t.termBuffer(), 0, t.termLength()));
 +    assertEquals("word", t.type());
 +    assertEquals(0, t.getFlags());
 +
 +    t = new Token(6, 22);
 +    t.setTermBuffer(content, 0, content.length);
 +    assertEquals("hello", new String(t.termBuffer(), 0, t.termLength()));
 +    assertEquals("(hello,6,22)", t.toString());
 +    assertEquals("word", t.type());
 +    assertEquals(0, t.getFlags());
 +
 +    t = new Token(6, 22, 7);
 +    t.setTermBuffer(content, 0, content.length);
 +    assertEquals("hello", new String(t.termBuffer(), 0, t.termLength()));
 +    assertEquals("(hello,6,22)", t.toString());
 +    assertEquals(7, t.getFlags());
 +
 +    t = new Token(6, 22, "junk");
 +    t.setTermBuffer(content, 0, content.length);
 +    assertEquals("hello", new String(t.termBuffer(), 0, t.termLength()));
 +    assertEquals("(hello,6,22,type=junk)", t.toString());
 +    assertEquals(0, t.getFlags());
 +  }
 +
 +  public void testResize() {
 +    Token t = new Token();
 +    char[] content = "hello".toCharArray();
 +    t.setTermBuffer(content, 0, content.length);
 +    for (int i = 0; i < 2000; i++)
 +    {
 +      t.resizeTermBuffer(i);
 +      assertTrue(i <= t.termBuffer().length);
 +      assertEquals("hello", new String(t.termBuffer(), 0, t.termLength()));
 +    }
 +  }
 +
 +  public void testGrow() {
 +    Token t = new Token();
 +    StringBuffer buf = new StringBuffer("ab");
 +    for (int i = 0; i < 20; i++)
 +    {
 +      char[] content = buf.toString().toCharArray();
 +      t.setTermBuffer(content, 0, content.length);
 +      assertEquals(buf.length(), t.termLength());
 +      assertEquals(buf.toString(), t.term());
 +      buf.append(buf.toString());
 +    }
 +    assertEquals(1048576, t.termLength());
 +    assertEquals(1179654, t.termBuffer().length);
 +
 +    // now as a string, first variant
 +    t = new Token();
 +    buf = new StringBuffer("ab");
 +    for (int i = 0; i < 20; i++)
 +    {
 +      String content = buf.toString();
 +      t.setTermBuffer(content, 0, content.length());
 +      assertEquals(content.length(), t.termLength());
 +      assertEquals(content, t.term());
 +      buf.append(content);
 +    }
 +    assertEquals(1048576, t.termLength());
 +    assertEquals(1179654, t.termBuffer().length);
 +
 +    // now as a string, second variant
 +    t = new Token();
 +    buf = new StringBuffer("ab");
 +    for (int i = 0; i < 20; i++)
 +    {
 +      String content = buf.toString();
 +      t.setTermBuffer(content);
 +      assertEquals(content.length(), t.termLength());
 +      assertEquals(content, t.term());
 +      buf.append(content);
 +    }
 +    assertEquals(1048576, t.termLength());
 +    assertEquals(1179654, t.termBuffer().length);
 +
 +    // Test for slow growth to a long term
 +    t = new Token();
 +    buf = new StringBuffer("a");
 +    for (int i = 0; i < 20000; i++)
 +    {
 +      String content = buf.toString();
 +      t.setTermBuffer(content);
 +      assertEquals(content.length(), t.termLength());
 +      assertEquals(content, t.term());
 +      buf.append("a");
 +    }
 +    assertEquals(20000, t.termLength());
 +    assertEquals(20331, t.termBuffer().length);
 +
 +    // Test for slow growth to a long term
 +    t = new Token();
 +    buf = new StringBuffer("a");
 +    for (int i = 0; i < 20000; i++)
 +    {
 +      String content = buf.toString();
 +      t.setTermBuffer(content);
 +      assertEquals(content.length(), t.termLength());
 +      assertEquals(content, t.term());
 +      buf.append("a");
 +    }
 +    assertEquals(20000, t.termLength());
 +    assertEquals(20331, t.termBuffer().length);
 +  }
 +
    public void testToString() throws Exception {
      char[] b = {'a', 'l', 'o', 'h', 'a'};
      Token t = new Token("", 0, 5);
 @@ -53,4 +166,13 @@
      buffer[1] = 'o';
      assertEquals(t.termText(), "hollo3");
    }
 +
 +  public void testClone() throws Exception {
 +    Token t = new Token(0, 5);
 +    char[] content = "hello".toCharArray();
 +    t.setTermBuffer(content, 0, 5);
 +    char[] buf = t.termBuffer();
 +    Token copy = (Token) t.clone();
 +    assertNotSame(buf, copy.termBuffer());
 +  }
  }
 Index: src/java/org/apache/lucene/analysis/Tokenizer.java
 ===================================================================
 --- src/java/org/apache/lucene/analysis/Tokenizer.java	(revision 675655)
 +++ src/java/org/apache/lucene/analysis/Tokenizer.java	(working copy)
 @@ -25,7 +25,7 @@
    This is an abstract class.
    <p>
    NOTE: subclasses must override at least one of {@link
 -  #next()} or {@link #next(Token)}.
 +  #next()} or {@link #next(Token)}. They should override {@link #next(Token)}.
    <p>
    NOTE: subclasses overriding {@link #next(Token)} must
    call {@link Token#clear()}.
 Index: src/java/org/apache/lucene/analysis/Token.java
 ===================================================================
 --- src/java/org/apache/lucene/analysis/Token.java	(revision 675655)
 +++ src/java/org/apache/lucene/analysis/Token.java	(working copy)
 @@ -20,7 +20,7 @@
  import org.apache.lucene.index.Payload;
  import org.apache.lucene.index.TermPositions;     // for javadoc

 -/** A Token is an occurence of a term from the text of a field.  It consists of
 +/** A Token is an occurrence of a term from the text of a field.  It consists of
    a term's text, the start and end offset of the term in the text of the field,
    and a type string.
    <p>
 @@ -49,7 +49,7 @@
    <p><b>NOTE:</b> As of 2.3, Token stores the term text
    internally as a malleable char[] termBuffer instead of
    String termText.  The indexing code and core tokenizers
 -  have been changed re-use a single Token instance, changing
 +  have been changed to re-use a single Token instance, changing
    its buffer and other fields in-place as the Token is
    processed.  This provides substantially better indexing
    performance as it saves the GC cost of new'ing a Token and
 @@ -62,14 +62,57 @@
    instance when possible for best performance, by
    implementing the {@link TokenStream#next(Token)} API.
    Failing that, to create a new Token you should first use
 -  one of the constructors that starts with null text.  Then
 -  you should call either {@link #termBuffer()} or {@link
 -  #resizeTermBuffer(int)} to retrieve the Token's
 -  termBuffer.  Fill in the characters of your term into this
 -  buffer, and finally call {@link #setTermLength(int)} to
 +  one of the constructors that starts with null text.  To load
 +  the token from a char[] use {@link #setTermBuffer(char[], int, int)}.
 +  To load from a String use {@link #setTermBuffer(String)}.
 +  Alternatively you can get the Token's termBuffer by calling either {@link #termBuffer()},
 +  if you know that your text is shorter than the capacity of the termBuffer
 +  or {@link #resizeTermBuffer(int)}, if there is any possibility
 +  that you may need to grow the buffer. Fill in the characters of your term into this
 +  buffer, with {@link String#getChars(int, int, char[], int)} if loading from a string,
 +  or with {@link System#arraycopy(Object, int, Object, int, int)}, and finally call {@link #setTermLength(int)} to
    set the length of the term text.  See <a target="_top"
    href="https://issues.apache.org/jira/browse/LUCENE-969">LUCENE-969</a>
    for details.</p>
 +  <p>Typical reuse patterns:
 +  <ul>
 +  <li> Copying text from a string:<br/>
 +  <pre>
 +    // prepare the token for re-use
 +    reusableToken.clear();
 +    reusableToken.setTermBuffer(string);
 +  </pre>
 +  </li>
 +  <li> Copying some text from a string:<br/>
 +  <pre>
 +    // prepare the token for re-use
 +    reusableToken.clear();
 +    reusableToken.setTermBuffer(string, 0, string.length() - 1);
 +  </pre>
 +  </li>
 +  <li> Copying text from char[] buffer:<br/>
 +  <pre>
 +    // prepare the token for re-use
 +    reusableToken.clear();
 +    reusableToken.setTermBuffer(buffer, 0, buffer.length);
 +  </pre>
 +  </li>
 +  <li> Copying some text from a char[] buffer:<br/>
 +  <pre>
 +    // prepare the token for re-use
 +    reusableToken.clear();
 +    reusableToken.setTermBuffer(buffer, start, end - start);
 +  </pre>
 +  </li>
 +  <li> Copying from one Token to another:<br/>
 +  <pre>
 +    // prepare the token for re-use
 +    reusableToken.clear();
 +    reusableToken.setTermBuffer(source.termBuffer(), 0, source.termLength());
 +  </pre>
 +  </li>
 +  </ul>
 +  </p>

    @see org.apache.lucene.index.Payload
  */
 @@ -138,7 +181,9 @@
     *  term text.
     *  @param text term text
     *  @param start start offset
 -   *  @param end end offset */
 +   *  @param end end offset
 +   *  @deprecated
 +   */
    public Token(String text, int start, int end) {
      termText = text;
      startOffset = start;
 @@ -152,7 +197,9 @@
     *  @param text term text
     *  @param start start offset
     *  @param end end offset
 -   *  @param typ token type */
 +   *  @param typ token type
 +   *  @deprecated
 +   */
    public Token(String text, int start, int end, String typ) {
      termText = text;
      startOffset = start;
 @@ -169,6 +216,7 @@
     * @param start
     * @param end
     * @param flags token type bits
 +   * @deprecated
     */
    public Token(String text, int start, int end, int flags) {
      termText = text;
 @@ -218,7 +266,11 @@

    /** Sets the Token's term text.  <b>NOTE:</b> for better
     *  indexing speed you should instead use the char[]
 -   *  termBuffer methods to set the term text. */
 +   *  termBuffer methods to set the term text.
 +   *  @deprecated use {@link #setTermBuffer(char[], int, int)} or
 +   *                  {@link #setTermBuffer(String)} or
 +   *                  {@link #setTermBuffer(String, int, int)}.
 +   */
    public void setTermText(String text) {
      termText = text;
      termBuffer = null;
 @@ -230,7 +282,7 @@
     * because the text is stored internally in a char[].  If
     * possible, use {@link #termBuffer()} and {@link
     * #termLength()} directly instead.  If you really need a
 -   * String, use <b>new String(token.termBuffer(), 0, token.termLength())</b>
 +   * String, use {@link #term()}
     */
    public final String termText() {
      if (termText == null && termBuffer != null)
 @@ -238,19 +290,68 @@
      return termText;
    }

 +  /** Returns the Token's term text.
 +   *
 +   * This method has a performance penalty
 +   * because the text is stored internally in a char[].  If
 +   * possible, use {@link #termBuffer()} and {@link
 +   * #termLength()} directly instead.  If you really need a
 +   * String, use this method, which is nothing more than
 +   * a convenience call to <b>new String(token.termBuffer(), 0, token.termLength())</b>
 +   */
 +  public final String term() {
 +    if (termText != null)
 +      return termText;
 +    initTermBuffer();
 +    return new String(termBuffer, 0, termLength);
 +  }
 +
    /** Copies the contents of buffer, starting at offset for
 -   *  length characters, into the termBuffer
 -   *  array. <b>NOTE:</b> for better indexing speed you
 -   *  should instead retrieve the termBuffer, using {@link
 -   *  #termBuffer()} or {@link #resizeTermBuffer(int)}, and
 -   *  fill it in directly to set the term text.  This saves
 -   *  an extra copy. */
 +   *  length characters, into the termBuffer array.
 +   *  @param buffer the buffer to copy
 +   *  @param offset the index in the buffer of the first character to copy
 +   *  @param length the number of characters to copy
 +   */
    public final void setTermBuffer(char[] buffer, int offset, int length) {
 -    resizeTermBuffer(length);
 +    termText = null;
 +    char[] newCharBuffer = growTermBuffer(length);
 +    if (newCharBuffer != null) {
 +      termBuffer = newCharBuffer;
 +    }
      System.arraycopy(buffer, offset, termBuffer, 0, length);
      termLength = length;
    }

 +  /** Copies the contents of buffer into the termBuffer array.
 +   *  @param buffer the buffer to copy
 +   */
 +  public final void setTermBuffer(String buffer) {
 +    termText = null;
 +    int length = buffer.length();
 +    char[] newCharBuffer = growTermBuffer(length);
 +    if (newCharBuffer != null) {
 +      termBuffer = newCharBuffer;
 +    }
 +    buffer.getChars(0, length, termBuffer, 0);
 +    termLength = length;
 +  }
 +
 +  /** Copies the contents of buffer, starting at offset and continuing
 +   *  for length characters, into the termBuffer array.
 +   *  @param buffer the buffer to copy
 +   *  @param offset the index in the buffer of the first character to copy
 +   *  @param length the number of characters to copy
 +   */
 +  public final void setTermBuffer(String buffer, int offset, int length) {
 +    termText = null;
 +    char[] newCharBuffer = growTermBuffer(length);
 +    if (newCharBuffer != null) {
 +      termBuffer = newCharBuffer;
 +    }
 +    buffer.getChars(offset, offset + length, termBuffer, 0);
 +    termLength = length;
 +  }
 +
    /** Returns the internal termBuffer character array which
     *  you can then directly alter.  If the array is too
     *  small for your token, use {@link
 @@ -263,23 +364,81 @@
      return termBuffer;
    }

 -  /** Grows the termBuffer to at least size newSize.
 +  /** Grows the termBuffer to at least size newSize, preserving the
 +   *  existing content. Note: If the next operation is to change
 +   *  the contents of the term buffer use
 +   *  {@link #setTermBuffer(char[], int, int)},
 +   *  {@link #setTermBuffer(String)}, or
 +   *  {@link #setTermBuffer(String, int, int)}
 +   *  to optimally combine the resize with the setting of the termBuffer.
     *  @param newSize minimum size of the new termBuffer
     *  @return newly created termBuffer with length >= newSize
     */
    public char[] resizeTermBuffer(int newSize) {
 -    initTermBuffer();
 -    if (newSize > termBuffer.length) {
 -      int size = termBuffer.length;
 -      while(size < newSize)
 -        size *= 2;
 -      char[] newBuffer = new char[size];
 -      System.arraycopy(termBuffer, 0, newBuffer, 0, termBuffer.length);
 -      termBuffer = newBuffer;
 +    char[] newCharBuffer = growTermBuffer(newSize);
 +    if (termBuffer == null) {
 +      // If there were termText, then preserve it.
 +      // note that if termBuffer is null then newCharBuffer cannot be null
 +      if (termText != null) {
 +        termText.getChars(0, termText.length(), newCharBuffer, 0);
 +      }
 +      termBuffer = newCharBuffer;
      }
 +    else if (newCharBuffer != null) {
 +      // Note: if newCharBuffer != null then termBuffer needs to grow.
 +      // If there were a termBuffer, then preserve it
 +      System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length);
 +      termBuffer = newCharBuffer;
 +    }
 +    termText = null;
      return termBuffer;
    }

 +  /** Allocates a buffer char[] of at least newSize
 +   *  @param newSize minimum size of the buffer
 +   *  @return newly created buffer with length >= newSize or null if the current termBuffer is big enough
 +   */
 +  private char[] growTermBuffer(int newSize)
 +  {
 +    // determine the best size
 +    // The buffer is always at least MIN_BUFFER_SIZE
 +    if (newSize < MIN_BUFFER_SIZE) {
 +      newSize = MIN_BUFFER_SIZE;
 +    }
 +
 +    // If there is already a termText, then the size has to be at least that big
 +    if (termText != null) {
 +      int ttLength = termText.length();
 +      if (newSize < ttLength) {
 +        newSize = ttLength;
 +      }
 +    }
 +
 +    // if the buffer exists and is too small, then determine a better size.
 +    // this is the current doubling algorithm. it could be better.
 +    int tbLength = termBuffer == null ? 0 : termBuffer.length;
 +
 +    if (tbLength > 0 && newSize > tbLength) {
 +      /* A simple allocation based on the size of the request
 +       * is O(n**2). Using over-allocation will typically be O(n).
 +       * Previously, this used a doubling algorithm, which
 +       * was too aggressive in growth. This O(n) algorithm makes
 +       * modest room for additional growth.
 +       * The growth pattern is:
 +       *   MIN_BUFFER_SIZE, 18, 27, 37, 48, 61, 75, 91, 109, 129, 152, 178, 207, ...
 +       */
 +      newSize = (newSize >> 3) + 6 + newSize;
 +    }
 +
 +    // Check to see if the buffer needs to be resized
 +    if (newSize > tbLength)
 +    {
 +      return new char[newSize];
 +    }
 +
 +    return null;
 +  }
 +
    // TODO: once we remove the deprecated termText() method
    // and switch entirely to char[] termBuffer we don't need
    // to use this method anymore
 @@ -308,10 +467,16 @@
    }

    /** Set number of valid characters (length of the term) in
 -   *  the termBuffer array. */
 +   *  the termBuffer array. Use this to truncate the termBuffer
 +   *  or to synchronize with external manipulation of the termBuffer.
 +   *  Note: to grow the size of the array,
 +   *  use {@link #resizeTermBuffer(int)} first.
 +   *  @param length the truncated length
 +   */
    public final void setTermLength(int length) {
      initTermBuffer();
 -    termLength = length;
 +    if (length <= termBuffer.length)
 +      termLength = length;
    }

    /** Returns this Token's starting offset, the position of the first character
 @@ -424,9 +589,9 @@
    public Object clone() {
      try {
        Token t = (Token)super.clone();
 +      // Do a deep clone
        if (termBuffer != null) {
 -        t.termBuffer = null;
 -        t.setTermBuffer(termBuffer, 0, termLength);
 +        t.termBuffer = (char[]) termBuffer.clone();
        }
        if (payload != null) {
          t.setPayload((Payload) payload.clone());
 Index: src/java/org/apache/lucene/analysis/TokenFilter.java
 ===================================================================
 --- src/java/org/apache/lucene/analysis/TokenFilter.java	(revision 675655)
 +++ src/java/org/apache/lucene/analysis/TokenFilter.java	(working copy)
 @@ -23,7 +23,7 @@
    <p>
    This is an abstract class.
    NOTE: subclasses must override at least one of {@link
 -  #next()} or {@link #next(Token)}.
 +  #next()} or {@link #next(Token)}. They should override {@link #next(Token)}.
    */
  public abstract class TokenFilter extends TokenStream {
    /** The source of tokens for this filter. */
 Index: src/java/org/apache/lucene/analysis/TokenStream.java
 ===================================================================
 --- src/java/org/apache/lucene/analysis/TokenStream.java	(revision 675655)
 +++ src/java/org/apache/lucene/analysis/TokenStream.java	(working copy)
 @@ -32,13 +32,13 @@
    whose input is another TokenStream.
    </ul>
    NOTE: subclasses must override at least one of {@link
 -  #next()} or {@link #next(Token)}.
 +  #next(Token)} or {@link #next()}. They should override {@link #next(Token)}.
    */

  public abstract class TokenStream {

    /** Returns the next token in the stream, or null at EOS.
 -   *  The returned Token is a "full private copy" (not
 +   *  @deprecated The returned Token is a "full private copy" (not
     *  re-used across calls to next()) but will be slower
     *  than calling {@link #next(Token)} instead.. */
    public Token next() throws IOException {
	Index: src/test/org/apache/lucene/analysis/TestToken.java
	===================================================================
	--- src/test/org/apache/lucene/analysis/TestToken.java (revision 675655)
	+++ src/test/org/apache/lucene/analysis/TestToken.java (working copy)
	@@ -26,6 +26,119 @@
	super(name);
	}

	+ public void testCtor() throws Exception {
	+ Token t = new Token();
	+ char[] content = "hello".toCharArray();
	+ t.setTermBuffer(content, 0, content.length);
	+ char[] buf = t.termBuffer();
	+ assertNotSame(t.termBuffer(), content);
	+ assertEquals("hello", new String(t.termBuffer(), 0, t.termLength()));
	+ assertEquals("word", t.type());
	+ assertEquals(0, t.getFlags());
	+
	+ t = new Token(6, 22);
	+ t.setTermBuffer(content, 0, content.length);
	+ assertEquals("hello", new String(t.termBuffer(), 0, t.termLength()));
	+ assertEquals("(hello,6,22)", t.toString());
	+ assertEquals("word", t.type());
	+ assertEquals(0, t.getFlags());
	+
	+ t = new Token(6, 22, 7);
	+ t.setTermBuffer(content, 0, content.length);
	+ assertEquals("hello", new String(t.termBuffer(), 0, t.termLength()));
	+ assertEquals("(hello,6,22)", t.toString());
	+ assertEquals(7, t.getFlags());
	+
	+ t = new Token(6, 22, "junk");
	+ t.setTermBuffer(content, 0, content.length);
	+ assertEquals("hello", new String(t.termBuffer(), 0, t.termLength()));
	+ assertEquals("(hello,6,22,type=junk)", t.toString());
	+ assertEquals(0, t.getFlags());
	+ }
	+
	+ public void testResize() {
	+ Token t = new Token();
	+ char[] content = "hello".toCharArray();
	+ t.setTermBuffer(content, 0, content.length);
	+ for (int i = 0; i < 2000; i++)
	+ {
	+ t.resizeTermBuffer(i);
	+ assertTrue(i <= t.termBuffer().length);
	+ assertEquals("hello", new String(t.termBuffer(), 0, t.termLength()));
	+ }
	+ }
	+
	+ public void testGrow() {
	+ Token t = new Token();
	+ StringBuffer buf = new StringBuffer("ab");
	+ for (int i = 0; i < 20; i++)
	+ {
	+ char[] content = buf.toString().toCharArray();
	+ t.setTermBuffer(content, 0, content.length);
	+ assertEquals(buf.length(), t.termLength());
	+ assertEquals(buf.toString(), t.term());
	+ buf.append(buf.toString());
	+ }
	+ assertEquals(1048576, t.termLength());
	+ assertEquals(1179654, t.termBuffer().length);
	+
	+ // now as a string, first variant
	+ t = new Token();
	+ buf = new StringBuffer("ab");
	+ for (int i = 0; i < 20; i++)
	+ {
	+ String content = buf.toString();
	+ t.setTermBuffer(content, 0, content.length());
	+ assertEquals(content.length(), t.termLength());
	+ assertEquals(content, t.term());
	+ buf.append(content);
	+ }
	+ assertEquals(1048576, t.termLength());
	+ assertEquals(1179654, t.termBuffer().length);
	+
	+ // now as a string, second variant
	+ t = new Token();
	+ buf = new StringBuffer("ab");
	+ for (int i = 0; i < 20; i++)
	+ {
	+ String content = buf.toString();
	+ t.setTermBuffer(content);
	+ assertEquals(content.length(), t.termLength());
	+ assertEquals(content, t.term());
	+ buf.append(content);
	+ }
	+ assertEquals(1048576, t.termLength());
	+ assertEquals(1179654, t.termBuffer().length);
	+
	+ // Test for slow growth to a long term
	+ t = new Token();
	+ buf = new StringBuffer("a");
	+ for (int i = 0; i < 20000; i++)
	+ {
	+ String content = buf.toString();
	+ t.setTermBuffer(content);
	+ assertEquals(content.length(), t.termLength());
	+ assertEquals(content, t.term());
	+ buf.append("a");
	+ }
	+ assertEquals(20000, t.termLength());
	+ assertEquals(20331, t.termBuffer().length);
	+
	+ // Test for slow growth to a long term
	+ t = new Token();
	+ buf = new StringBuffer("a");
	+ for (int i = 0; i < 20000; i++)
	+ {
	+ String content = buf.toString();
	+ t.setTermBuffer(content);
	+ assertEquals(content.length(), t.termLength());
	+ assertEquals(content, t.term());
	+ buf.append("a");
	+ }
	+ assertEquals(20000, t.termLength());
	+ assertEquals(20331, t.termBuffer().length);
	+ }
	+
	public void testToString() throws Exception {
	char[] b = {'a', 'l', 'o', 'h', 'a'};
	Token t = new Token("", 0, 5);
	@@ -53,4 +166,13 @@
	buffer[1] = 'o';
	assertEquals(t.termText(), "hollo3");
	}
	+
	+ public void testClone() throws Exception {
	+ Token t = new Token(0, 5);
	+ char[] content = "hello".toCharArray();
	+ t.setTermBuffer(content, 0, 5);
	+ char[] buf = t.termBuffer();
	+ Token copy = (Token) t.clone();
	+ assertNotSame(buf, copy.termBuffer());
	+ }
	}
	Index: src/java/org/apache/lucene/analysis/Tokenizer.java
	===================================================================
	--- src/java/org/apache/lucene/analysis/Tokenizer.java (revision 675655)
	+++ src/java/org/apache/lucene/analysis/Tokenizer.java (working copy)
	@@ -25,7 +25,7 @@
	This is an abstract class.
	<p>
	NOTE: subclasses must override at least one of {@link
	- #next()} or {@link #next(Token)}.
	+ #next()} or {@link #next(Token)}. They should override {@link #next(Token)}.
	<p>
	NOTE: subclasses overriding {@link #next(Token)} must
	call {@link Token#clear()}.
	Index: src/java/org/apache/lucene/analysis/Token.java
	===================================================================
	--- src/java/org/apache/lucene/analysis/Token.java (revision 675655)
	+++ src/java/org/apache/lucene/analysis/Token.java (working copy)
	@@ -20,7 +20,7 @@
	import org.apache.lucene.index.Payload;
	import org.apache.lucene.index.TermPositions; // for javadoc

	-/** A Token is an occurence of a term from the text of a field. It consists of
	+/** A Token is an occurrence of a term from the text of a field. It consists of
	a term's text, the start and end offset of the term in the text of the field,
	and a type string.
	<p>
	@@ -49,7 +49,7 @@
	<p><b>NOTE:</b> As of 2.3, Token stores the term text
	internally as a malleable char[] termBuffer instead of
	String termText. The indexing code and core tokenizers
	- have been changed re-use a single Token instance, changing
	+ have been changed to re-use a single Token instance, changing
	its buffer and other fields in-place as the Token is
	processed. This provides substantially better indexing
	performance as it saves the GC cost of new'ing a Token and
	@@ -62,14 +62,57 @@
	instance when possible for best performance, by
	implementing the {@link TokenStream#next(Token)} API.
	Failing that, to create a new Token you should first use
	- one of the constructors that starts with null text. Then
	- you should call either {@link #termBuffer()} or {@link
	- #resizeTermBuffer(int)} to retrieve the Token's
	- termBuffer. Fill in the characters of your term into this
	- buffer, and finally call {@link #setTermLength(int)} to
	+ one of the constructors that starts with null text. To load
	+ the token from a char[] use {@link #setTermBuffer(char[], int, int)}.
	+ To load from a String use {@link #setTermBuffer(String)}.
	+ Alternatively you can get the Token's termBuffer by calling either {@link #termBuffer()},
	+ if you know that your text is shorter than the capacity of the termBuffer
	+ or {@link #resizeTermBuffer(int)}, if there is any possibility
	+ that you may need to grow the buffer. Fill in the characters of your term into this
	+ buffer, with {@link String#getChars(int, int, char[], int)} if loading from a string,
	+ or with {@link System#arraycopy(Object, int, Object, int, int)}, and finally call {@link #setTermLength(int)} to
	set the length of the term text. See <a target="_top"
	href="https://issues.apache.org/jira/browse/LUCENE-969">LUCENE-969</a>
	for details.</p>
	+ <p>Typical reuse patterns:
	+ <ul>
	+ <li> Copying text from a string:<br/>
	+ <pre>
	+ // prepare the token for re-use
	+ reusableToken.clear();
	+ reusableToken.setTermBuffer(string);
	+ </pre>
	+ </li>
	+ <li> Copying some text from a string:<br/>
	+ <pre>
	+ // prepare the token for re-use
	+ reusableToken.clear();
	+ reusableToken.setTermBuffer(string, 0, string.length() - 1);
	+ </pre>
	+ </li>
	+ <li> Copying text from char[] buffer:<br/>
	+ <pre>
	+ // prepare the token for re-use
	+ reusableToken.clear();
	+ reusableToken.setTermBuffer(buffer, 0, buffer.length);
	+ </pre>
	+ </li>
	+ <li> Copying some text from a char[] buffer:<br/>
	+ <pre>
	+ // prepare the token for re-use
	+ reusableToken.clear();
	+ reusableToken.setTermBuffer(buffer, start, end - start);
	+ </pre>
	+ </li>
	+ <li> Copying from one Token to another:<br/>
	+ <pre>
	+ // prepare the token for re-use
	+ reusableToken.clear();
	+ reusableToken.setTermBuffer(source.termBuffer(), 0, source.termLength());
	+ </pre>
	+ </li>
	+ </ul>
	+ </p>

	@see org.apache.lucene.index.Payload
	*/
	@@ -138,7 +181,9 @@
	* term text.
	* @param text term text
	* @param start start offset
	- * @param end end offset */
	+ * @param end end offset
	+ * @deprecated
	+ */
	public Token(String text, int start, int end) {
	termText = text;
	startOffset = start;
	@@ -152,7 +197,9 @@
	* @param text term text
	* @param start start offset
	* @param end end offset
	- * @param typ token type */
	+ * @param typ token type
	+ * @deprecated
	+ */
	public Token(String text, int start, int end, String typ) {
	termText = text;
	startOffset = start;
	@@ -169,6 +216,7 @@
	* @param start
	* @param end
	* @param flags token type bits
	+ * @deprecated
	*/
	public Token(String text, int start, int end, int flags) {
	termText = text;
	@@ -218,7 +266,11 @@

	/** Sets the Token's term text. <b>NOTE:</b> for better
	* indexing speed you should instead use the char[]
	- * termBuffer methods to set the term text. */
	+ * termBuffer methods to set the term text.
	+ * @deprecated use {@link #setTermBuffer(char[], int, int)} or
	+ * {@link #setTermBuffer(String)} or
	+ * {@link #setTermBuffer(String, int, int)}.
	+ */
	public void setTermText(String text) {
	termText = text;
	termBuffer = null;
	@@ -230,7 +282,7 @@
	* because the text is stored internally in a char[]. If
	* possible, use {@link #termBuffer()} and {@link
	* #termLength()} directly instead. If you really need a
	- * String, use <b>new String(token.termBuffer(), 0, token.termLength())</b>
	+ * String, use {@link #term()}
	*/
	public final String termText() {
	if (termText == null && termBuffer != null)
	@@ -238,19 +290,68 @@
	return termText;
	}

	+ /** Returns the Token's term text.
	+ *
	+ * This method has a performance penalty
	+ * because the text is stored internally in a char[]. If
	+ * possible, use {@link #termBuffer()} and {@link
	+ * #termLength()} directly instead. If you really need a
	+ * String, use this method, which is nothing more than
	+ * a convenience call to <b>new String(token.termBuffer(), 0, token.termLength())</b>
	+ */
	+ public final String term() {
	+ if (termText != null)
	+ return termText;
	+ initTermBuffer();
	+ return new String(termBuffer, 0, termLength);
	+ }
	+
	/** Copies the contents of buffer, starting at offset for
	- * length characters, into the termBuffer
	- * array. <b>NOTE:</b> for better indexing speed you
	- * should instead retrieve the termBuffer, using {@link
	- * #termBuffer()} or {@link #resizeTermBuffer(int)}, and
	- * fill it in directly to set the term text. This saves
	- * an extra copy. */
	+ * length characters, into the termBuffer array.
	+ * @param buffer the buffer to copy
	+ * @param offset the index in the buffer of the first character to copy
	+ * @param length the number of characters to copy
	+ */
	public final void setTermBuffer(char[] buffer, int offset, int length) {
	- resizeTermBuffer(length);
	+ termText = null;
	+ char[] newCharBuffer = growTermBuffer(length);
	+ if (newCharBuffer != null) {
	+ termBuffer = newCharBuffer;
	+ }
	System.arraycopy(buffer, offset, termBuffer, 0, length);
	termLength = length;
	}

	+ /** Copies the contents of buffer into the termBuffer array.
	+ * @param buffer the buffer to copy
	+ */
	+ public final void setTermBuffer(String buffer) {
	+ termText = null;
	+ int length = buffer.length();
	+ char[] newCharBuffer = growTermBuffer(length);
	+ if (newCharBuffer != null) {
	+ termBuffer = newCharBuffer;
	+ }
	+ buffer.getChars(0, length, termBuffer, 0);
	+ termLength = length;
	+ }
	+
	+ /** Copies the contents of buffer, starting at offset and continuing
	+ * for length characters, into the termBuffer array.
	+ * @param buffer the buffer to copy
	+ * @param offset the index in the buffer of the first character to copy
	+ * @param length the number of characters to copy
	+ */
	+ public final void setTermBuffer(String buffer, int offset, int length) {
	+ termText = null;
	+ char[] newCharBuffer = growTermBuffer(length);
	+ if (newCharBuffer != null) {
	+ termBuffer = newCharBuffer;
	+ }
	+ buffer.getChars(offset, offset + length, termBuffer, 0);
	+ termLength = length;
	+ }
	+
	/** Returns the internal termBuffer character array which
	* you can then directly alter. If the array is too
	* small for your token, use {@link
	@@ -263,23 +364,81 @@
	return termBuffer;
	}

	- /** Grows the termBuffer to at least size newSize.
	+ /** Grows the termBuffer to at least size newSize, preserving the
	+ * existing content. Note: If the next operation is to change
	+ * the contents of the term buffer use
	+ * {@link #setTermBuffer(char[], int, int)},
	+ * {@link #setTermBuffer(String)}, or
	+ * {@link #setTermBuffer(String, int, int)}
	+ * to optimally combine the resize with the setting of the termBuffer.
	* @param newSize minimum size of the new termBuffer
	* @return newly created termBuffer with length >= newSize
	*/
	public char[] resizeTermBuffer(int newSize) {
	- initTermBuffer();
	- if (newSize > termBuffer.length) {
	- int size = termBuffer.length;
	- while(size < newSize)
	- size *= 2;
	- char[] newBuffer = new char[size];
	- System.arraycopy(termBuffer, 0, newBuffer, 0, termBuffer.length);
	- termBuffer = newBuffer;
	+ char[] newCharBuffer = growTermBuffer(newSize);
	+ if (termBuffer == null) {
	+ // If there were termText, then preserve it.
	+ // note that if termBuffer is null then newCharBuffer cannot be null
	+ if (termText != null) {
	+ termText.getChars(0, termText.length(), newCharBuffer, 0);
	+ }
	+ termBuffer = newCharBuffer;
	}
	+ else if (newCharBuffer != null) {
	+ // Note: if newCharBuffer != null then termBuffer needs to grow.
	+ // If there were a termBuffer, then preserve it
	+ System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length);
	+ termBuffer = newCharBuffer;
	+ }
	+ termText = null;
	return termBuffer;
	}

	+ /** Allocates a buffer char[] of at least newSize
	+ * @param newSize minimum size of the buffer
	+ * @return newly created buffer with length >= newSize or null if the current termBuffer is big enough
	+ */
	+ private char[] growTermBuffer(int newSize)
	+ {
	+ // determine the best size
	+ // The buffer is always at least MIN_BUFFER_SIZE
	+ if (newSize < MIN_BUFFER_SIZE) {
	+ newSize = MIN_BUFFER_SIZE;
	+ }
	+
	+ // If there is already a termText, then the size has to be at least that big
	+ if (termText != null) {
	+ int ttLength = termText.length();
	+ if (newSize < ttLength) {
	+ newSize = ttLength;
	+ }
	+ }
	+
	+ // if the buffer exists and is too small, then determine a better size.
	+ // this over-allocates by newSize/8 + 6 rather than doubling.
	+ int tbLength = termBuffer == null ? 0 : termBuffer.length;
	+
	+ if (tbLength > 0 && newSize > tbLength) {
	+ /* A simple allocation based on the size of the request
	+ * is O(n**2). Using over-allocation will typically be O(n).
	+ * Previously, this used a doubling algorithm, which
	+ * was too aggressive in growth. This O(n) algorithm makes
	+ * modest room for additional growth.
	+ * The growth pattern is:
	+ * MIN_BUFFER_SIZE, 18, 27, 37, 48, 61, 75, 91, 109, 129, 152, 178, 207, ...
	+ */
	+ newSize = (newSize >> 3) + 6 + newSize;
	+ }
	+
	+ // Check to see if the buffer needs to be resized
	+ if (newSize > tbLength)
	+ {
	+ return new char[newSize];
	+ }
	+
	+ return null;
	+ }
	+
	// TODO: once we remove the deprecated termText() method
	// and switch entirely to char[] termBuffer we don't need
	// to use this method anymore
	@@ -308,10 +467,16 @@
	}

	/** Set number of valid characters (length of the term) in
	- * the termBuffer array. */
	+ * the termBuffer array. Use this to truncate the termBuffer
	+ * or to synchronize with external manipulation of the termBuffer.
	+ * Note: to grow the size of the array,
	+ * use {@link #resizeTermBuffer(int)} first.
	+ * @param length the truncated length
	+ */
	public final void setTermLength(int length) {
	initTermBuffer();
	- termLength = length;
	+ if (length <= termBuffer.length)
	+ termLength = length;
	}

	/** Returns this Token's starting offset, the position of the first character
	@@ -424,9 +589,9 @@
	public Object clone() {
	try {
	Token t = (Token)super.clone();
	+ // Do a deep clone
	if (termBuffer != null) {
	- t.termBuffer = null;
	- t.setTermBuffer(termBuffer, 0, termLength);
	+ t.termBuffer = (char[]) termBuffer.clone();
	}
	if (payload != null) {
	t.setPayload((Payload) payload.clone());
	Index: src/java/org/apache/lucene/analysis/TokenFilter.java
	===================================================================
	--- src/java/org/apache/lucene/analysis/TokenFilter.java (revision 675655)
	+++ src/java/org/apache/lucene/analysis/TokenFilter.java (working copy)
	@@ -23,7 +23,7 @@
	<p>
	This is an abstract class.
	NOTE: subclasses must override at least one of {@link
	- #next()} or {@link #next(Token)}.
	+ #next()} or {@link #next(Token)}. They should override {@link #next(Token)}.
	*/
	public abstract class TokenFilter extends TokenStream {
	/** The source of tokens for this filter. */
	Index: src/java/org/apache/lucene/analysis/TokenStream.java
	===================================================================
	--- src/java/org/apache/lucene/analysis/TokenStream.java (revision 675655)
	+++ src/java/org/apache/lucene/analysis/TokenStream.java (working copy)
	@@ -32,13 +32,13 @@
	whose input is another TokenStream.
	</ul>
	NOTE: subclasses must override at least one of {@link
	- #next()} or {@link #next(Token)}.
	+ #next(Token)} or {@link #next()}. They should override {@link #next(Token)}.
	*/

	public abstract class TokenStream {

	/** Returns the next token in the stream, or null at EOS.
	- * The returned Token is a "full private copy" (not
	+ * @deprecated The returned Token is a "full private copy" (not
	* re-used across calls to next()) but will be slower
	* than calling {@link #next(Token)} instead.. */
	public Token next() throws IOException {