| Index: src/test/org/apache/lucene/analysis/TestToken.java |
| =================================================================== |
| --- src/test/org/apache/lucene/analysis/TestToken.java (revision 675655) |
| +++ src/test/org/apache/lucene/analysis/TestToken.java (working copy) |
| @@ -26,6 +26,119 @@ |
| super(name); |
| } |
| |
| + public void testCtor() throws Exception { |
| + Token t = new Token(); |
| + char[] content = "hello".toCharArray(); |
| + t.setTermBuffer(content, 0, content.length); |
| + char[] buf = t.termBuffer(); |
| + assertNotSame(t.termBuffer(), content); |
| + assertEquals("hello", new String(t.termBuffer(), 0, t.termLength())); |
| + assertEquals("word", t.type()); |
| + assertEquals(0, t.getFlags()); |
| + |
| + t = new Token(6, 22); |
| + t.setTermBuffer(content, 0, content.length); |
| + assertEquals("hello", new String(t.termBuffer(), 0, t.termLength())); |
| + assertEquals("(hello,6,22)", t.toString()); |
| + assertEquals("word", t.type()); |
| + assertEquals(0, t.getFlags()); |
| + |
| + t = new Token(6, 22, 7); |
| + t.setTermBuffer(content, 0, content.length); |
| + assertEquals("hello", new String(t.termBuffer(), 0, t.termLength())); |
| + assertEquals("(hello,6,22)", t.toString()); |
| + assertEquals(7, t.getFlags()); |
| + |
| + t = new Token(6, 22, "junk"); |
| + t.setTermBuffer(content, 0, content.length); |
| + assertEquals("hello", new String(t.termBuffer(), 0, t.termLength())); |
| + assertEquals("(hello,6,22,type=junk)", t.toString()); |
| + assertEquals(0, t.getFlags()); |
| + } |
| + |
| + public void testResize() { |
| + Token t = new Token(); |
| + char[] content = "hello".toCharArray(); |
| + t.setTermBuffer(content, 0, content.length); |
| + for (int i = 0; i < 2000; i++) |
| + { |
| + t.resizeTermBuffer(i); |
| + assertTrue(i <= t.termBuffer().length); |
| + assertEquals("hello", new String(t.termBuffer(), 0, t.termLength())); |
| + } |
| + } |
| + |
| + public void testGrow() { |
| + Token t = new Token(); |
| + StringBuffer buf = new StringBuffer("ab"); |
| + for (int i = 0; i < 20; i++) |
| + { |
| + char[] content = buf.toString().toCharArray(); |
| + t.setTermBuffer(content, 0, content.length); |
| + assertEquals(buf.length(), t.termLength()); |
| + assertEquals(buf.toString(), t.term()); |
| + buf.append(buf.toString()); |
| + } |
| + assertEquals(1048576, t.termLength()); |
| + assertEquals(1179654, t.termBuffer().length); |
| + |
| + // now as a string, first variant |
| + t = new Token(); |
| + buf = new StringBuffer("ab"); |
| + for (int i = 0; i < 20; i++) |
| + { |
| + String content = buf.toString(); |
| + t.setTermBuffer(content, 0, content.length()); |
| + assertEquals(content.length(), t.termLength()); |
| + assertEquals(content, t.term()); |
| + buf.append(content); |
| + } |
| + assertEquals(1048576, t.termLength()); |
| + assertEquals(1179654, t.termBuffer().length); |
| + |
| + // now as a string, second variant |
| + t = new Token(); |
| + buf = new StringBuffer("ab"); |
| + for (int i = 0; i < 20; i++) |
| + { |
| + String content = buf.toString(); |
| + t.setTermBuffer(content); |
| + assertEquals(content.length(), t.termLength()); |
| + assertEquals(content, t.term()); |
| + buf.append(content); |
| + } |
| + assertEquals(1048576, t.termLength()); |
| + assertEquals(1179654, t.termBuffer().length); |
| + |
| + // Test for slow growth to a long term |
| + t = new Token(); |
| + buf = new StringBuffer("a"); |
| + for (int i = 0; i < 20000; i++) |
| + { |
| + String content = buf.toString(); |
| + t.setTermBuffer(content); |
| + assertEquals(content.length(), t.termLength()); |
| + assertEquals(content, t.term()); |
| + buf.append("a"); |
| + } |
| + assertEquals(20000, t.termLength()); |
| + assertEquals(20331, t.termBuffer().length); |
| + |
| + // Test for slow growth to a long term |
| + t = new Token(); |
| + buf = new StringBuffer("a"); |
| + for (int i = 0; i < 20000; i++) |
| + { |
| + String content = buf.toString(); |
| + t.setTermBuffer(content); |
| + assertEquals(content.length(), t.termLength()); |
| + assertEquals(content, t.term()); |
| + buf.append("a"); |
| + } |
| + assertEquals(20000, t.termLength()); |
| + assertEquals(20331, t.termBuffer().length); |
| + } |
| + |
| public void testToString() throws Exception { |
| char[] b = {'a', 'l', 'o', 'h', 'a'}; |
| Token t = new Token("", 0, 5); |
| @@ -53,4 +166,13 @@ |
| buffer[1] = 'o'; |
| assertEquals(t.termText(), "hollo3"); |
| } |
| + |
| + public void testClone() throws Exception { |
| + Token t = new Token(0, 5); |
| + char[] content = "hello".toCharArray(); |
| + t.setTermBuffer(content, 0, 5); |
| + char[] buf = t.termBuffer(); |
| + Token copy = (Token) t.clone(); |
| + assertNotSame(buf, copy.termBuffer()); |
| + } |
| } |
| Index: src/java/org/apache/lucene/analysis/Tokenizer.java |
| =================================================================== |
| --- src/java/org/apache/lucene/analysis/Tokenizer.java (revision 675655) |
| +++ src/java/org/apache/lucene/analysis/Tokenizer.java (working copy) |
| @@ -25,7 +25,7 @@ |
| This is an abstract class. |
| <p> |
| NOTE: subclasses must override at least one of {@link |
| - #next()} or {@link #next(Token)}. |
| + #next()} or {@link #next(Token)}. They should override {@link #next(Token)}. |
| <p> |
| NOTE: subclasses overriding {@link #next(Token)} must |
| call {@link Token#clear()}. |
| Index: src/java/org/apache/lucene/analysis/Token.java |
| =================================================================== |
| --- src/java/org/apache/lucene/analysis/Token.java (revision 675655) |
| +++ src/java/org/apache/lucene/analysis/Token.java (working copy) |
| @@ -20,7 +20,7 @@ |
| import org.apache.lucene.index.Payload; |
| import org.apache.lucene.index.TermPositions; // for javadoc |
| |
| -/** A Token is an occurence of a term from the text of a field. It consists of |
| +/** A Token is an occurrence of a term from the text of a field. It consists of |
| a term's text, the start and end offset of the term in the text of the field, |
| and a type string. |
| <p> |
| @@ -49,7 +49,7 @@ |
| <p><b>NOTE:</b> As of 2.3, Token stores the term text |
| internally as a malleable char[] termBuffer instead of |
| String termText. The indexing code and core tokenizers |
| - have been changed re-use a single Token instance, changing |
| + have been changed to re-use a single Token instance, changing |
| its buffer and other fields in-place as the Token is |
| processed. This provides substantially better indexing |
| performance as it saves the GC cost of new'ing a Token and |
| @@ -62,14 +62,57 @@ |
| instance when possible for best performance, by |
| implementing the {@link TokenStream#next(Token)} API. |
| Failing that, to create a new Token you should first use |
| - one of the constructors that starts with null text. Then |
| - you should call either {@link #termBuffer()} or {@link |
| - #resizeTermBuffer(int)} to retrieve the Token's |
| - termBuffer. Fill in the characters of your term into this |
| - buffer, and finally call {@link #setTermLength(int)} to |
| + one of the constructors that starts with null text. To load |
| + the token from a char[] use {@link #setTermBuffer(char[], int, int)}. |
| + To load from a String use {@link #setTermBuffer(String)}. |
| + Alternatively you can get the Token's termBuffer by calling either {@link #termBuffer()}, |
| + if you know that your text is shorter than the capacity of the termBuffer |
| + or {@link #resizeTermBuffer(int)}, if there is any possibility |
| + that you may need to grow the buffer. Fill in the characters of your term into this |
| + buffer, with {@link String#getChars(int, int, char[], int)} if loading from a string, |
| + or with {@link System#arraycopy(Object, int, Object, int, int)}, and finally call {@link #setTermLength(int)} to |
| set the length of the term text. See <a target="_top" |
| href="https://issues.apache.org/jira/browse/LUCENE-969">LUCENE-969</a> |
| for details.</p> |
| + <p>Typical reuse patterns: |
| + <ul> |
| + <li> Copying text from a string:<br/> |
| + <pre> |
| + // prepare the token for re-use |
| + reusableToken.clear(); |
| + reusableToken.setTermBuffer(string); |
| + </pre> |
| + </li> |
| + <li> Copying some text from a string:<br/> |
| + <pre> |
| + // prepare the token for re-use |
| + reusableToken.clear(); |
| + reusableToken.setTermBuffer(string, 0, string.length() - 1); |
| + </pre> |
| + </li> |
| + <li> Copying text from char[] buffer:<br/> |
| + <pre> |
| + // prepare the token for re-use |
| + reusableToken.clear(); |
| + reusableToken.setTermBuffer(buffer, 0, buffer.length); |
| + </pre> |
| + </li> |
| + <li> Copying some text from a char[] buffer:<br/> |
| + <pre> |
| + // prepare the token for re-use |
| + reusableToken.clear(); |
| + reusableToken.setTermBuffer(buffer, start, end - start); |
| + </pre> |
| + </li> |
| + <li> Copying from one Token to another:<br/> |
| + <pre> |
| + // prepare the token for re-use |
| + reusableToken.clear(); |
| + reusableToken.setTermBuffer(source.termBuffer(), 0, source.termLength()); |
| + </pre> |
| + </li> |
| + </ul> |
| + </p> |
| |
| @see org.apache.lucene.index.Payload |
| */ |
| @@ -138,7 +181,9 @@ |
| * term text. |
| * @param text term text |
| * @param start start offset |
| - * @param end end offset */ |
| + * @param end end offset |
| + * @deprecated use {@link #Token(int, int)} and {@link #setTermBuffer(String)} instead |
| + */ |
| public Token(String text, int start, int end) { |
| termText = text; |
| startOffset = start; |
| @@ -152,7 +197,9 @@ |
| * @param text term text |
| * @param start start offset |
| * @param end end offset |
| - * @param typ token type */ |
| + * @param typ token type |
| + * @deprecated use {@link #Token(int, int, String)} and {@link #setTermBuffer(String)} instead |
| + */ |
| public Token(String text, int start, int end, String typ) { |
| termText = text; |
| startOffset = start; |
| @@ -169,6 +216,7 @@ |
| * @param start |
| * @param end |
| * @param flags token type bits |
| + * @deprecated use {@link #Token(int, int, int)} and {@link #setTermBuffer(String)} instead |
| */ |
| public Token(String text, int start, int end, int flags) { |
| termText = text; |
| @@ -218,7 +266,11 @@ |
| |
| /** Sets the Token's term text. <b>NOTE:</b> for better |
| * indexing speed you should instead use the char[] |
| - * termBuffer methods to set the term text. */ |
| + * termBuffer methods to set the term text. |
| + * @deprecated use {@link #setTermBuffer(char[], int, int)} or |
| + * {@link #setTermBuffer(String)} or |
| + * {@link #setTermBuffer(String, int, int)}. |
| + */ |
| public void setTermText(String text) { |
| termText = text; |
| termBuffer = null; |
| @@ -230,7 +282,7 @@ |
| * because the text is stored internally in a char[]. If |
| * possible, use {@link #termBuffer()} and {@link |
| * #termLength()} directly instead. If you really need a |
| - * String, use <b>new String(token.termBuffer(), 0, token.termLength())</b> |
| + * String, use {@link #term()}. |
| */ |
| public final String termText() { |
| if (termText == null && termBuffer != null) |
| @@ -238,19 +290,68 @@ |
| return termText; |
| } |
| |
| + /** Returns the Token's term text. |
| + * |
| + * This method has a performance penalty |
| + * because the text is stored internally in a char[]. If |
| + * possible, use {@link #termBuffer()} and {@link |
| + * #termLength()} directly instead. If you really need a |
| + * String, use this method, which is nothing more than |
| + * a convenience call to <b>new String(token.termBuffer(), 0, token.termLength())</b> |
| + */ |
| + public final String term() { |
| + if (termText != null) |
| + return termText; |
| + initTermBuffer(); |
| + return new String(termBuffer, 0, termLength); |
| + } |
| + |
| /** Copies the contents of buffer, starting at offset for |
| - * length characters, into the termBuffer |
| - * array. <b>NOTE:</b> for better indexing speed you |
| - * should instead retrieve the termBuffer, using {@link |
| - * #termBuffer()} or {@link #resizeTermBuffer(int)}, and |
| - * fill it in directly to set the term text. This saves |
| - * an extra copy. */ |
| + * length characters, into the termBuffer array. |
| + * @param buffer the buffer to copy |
| + * @param offset the index in the buffer of the first character to copy |
| + * @param length the number of characters to copy |
| + */ |
| public final void setTermBuffer(char[] buffer, int offset, int length) { |
| - resizeTermBuffer(length); |
| + termText = null; |
| + char[] newCharBuffer = growTermBuffer(length); |
| + if (newCharBuffer != null) { |
| + termBuffer = newCharBuffer; |
| + } |
| System.arraycopy(buffer, offset, termBuffer, 0, length); |
| termLength = length; |
| } |
| |
| + /** Copies the contents of buffer into the termBuffer array. |
| + * @param buffer the buffer to copy |
| + */ |
| + public final void setTermBuffer(String buffer) { |
| + termText = null; |
| + int length = buffer.length(); |
| + char[] newCharBuffer = growTermBuffer(length); |
| + if (newCharBuffer != null) { |
| + termBuffer = newCharBuffer; |
| + } |
| + buffer.getChars(0, length, termBuffer, 0); |
| + termLength = length; |
| + } |
| + |
| + /** Copies the contents of buffer, starting at offset and continuing |
| + * for length characters, into the termBuffer array. |
| + * @param buffer the buffer to copy |
| + * @param offset the index in the buffer of the first character to copy |
| + * @param length the number of characters to copy |
| + */ |
| + public final void setTermBuffer(String buffer, int offset, int length) { |
| + termText = null; |
| + char[] newCharBuffer = growTermBuffer(length); |
| + if (newCharBuffer != null) { |
| + termBuffer = newCharBuffer; |
| + } |
| + buffer.getChars(offset, offset + length, termBuffer, 0); |
| + termLength = length; |
| + } |
| + |
| /** Returns the internal termBuffer character array which |
| * you can then directly alter. If the array is too |
| * small for your token, use {@link |
| @@ -263,23 +364,81 @@ |
| return termBuffer; |
| } |
| |
| - /** Grows the termBuffer to at least size newSize. |
| + /** Grows the termBuffer to at least size newSize, preserving the |
| + * existing content. Note: If the next operation is to change |
| + * the contents of the term buffer use |
| + * {@link #setTermBuffer(char[], int, int)}, |
| + * {@link #setTermBuffer(String)}, or |
| + * {@link #setTermBuffer(String, int, int)} |
| + * to optimally combine the resize with the setting of the termBuffer. |
| * @param newSize minimum size of the new termBuffer |
| * @return newly created termBuffer with length >= newSize |
| */ |
| public char[] resizeTermBuffer(int newSize) { |
| - initTermBuffer(); |
| - if (newSize > termBuffer.length) { |
| - int size = termBuffer.length; |
| - while(size < newSize) |
| - size *= 2; |
| - char[] newBuffer = new char[size]; |
| - System.arraycopy(termBuffer, 0, newBuffer, 0, termBuffer.length); |
| - termBuffer = newBuffer; |
| + char[] newCharBuffer = growTermBuffer(newSize); |
| + if (termBuffer == null) { |
| + // If there were termText, then preserve it. |
| + // note that if termBuffer is null then newCharBuffer cannot be null |
| + if (termText != null) { |
| + termText.getChars(0, termText.length(), newCharBuffer, 0); |
| + } |
| + termBuffer = newCharBuffer; |
| } |
| + else if (newCharBuffer != null) { |
| + // Note: if newCharBuffer != null then termBuffer needs to grow. |
| + // If there were a termBuffer, then preserve it |
| + System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length); |
| + termBuffer = newCharBuffer; |
| + } |
| + termText = null; |
| return termBuffer; |
| } |
| |
| + /** Allocates a buffer char[] of at least newSize |
| + * @param newSize minimum size of the buffer |
| + * @return newly created buffer with length >= newSize or null if the current termBuffer is big enough |
| + */ |
| + private char[] growTermBuffer(int newSize) |
| + { |
| + // determine the best size |
| + // The buffer is always at least MIN_BUFFER_SIZE |
| + if (newSize < MIN_BUFFER_SIZE) { |
| + newSize = MIN_BUFFER_SIZE; |
| + } |
| + |
| + // If there is already a termText, then the size has to be at least that big |
| + if (termText != null) { |
| + int ttLength = termText.length(); |
| + if (newSize < ttLength) { |
| + newSize = ttLength; |
| + } |
| + } |
| + |
| + // if the buffer exists and is too small, then determine a better size. |
| + // this over-allocates modestly rather than doubling (see the note below). |
| + int tbLength = termBuffer == null ? 0 : termBuffer.length; |
| + |
| + if (tbLength > 0 && newSize > tbLength) { |
| + /* A simple allocation based on the size of the request |
| + * is O(n**2). Using over-allocation will typically be O(n). |
| + * Previously, this used a doubling algorithm, which |
| + * was too aggressive in growth. This O(n) algorithm makes |
| + * modest room for additional growth. |
| + * The growth pattern is: |
| + * MIN_BUFFER_SIZE, 18, 27, 37, 48, 61, 75, 91, 109, 129, 152, 178, 207, ... |
| + */ |
| + newSize = (newSize >> 3) + 6 + newSize; |
| + } |
| + |
| + // Check to see if the buffer needs to be resized |
| + if (newSize > tbLength) |
| + { |
| + return new char[newSize]; |
| + } |
| + |
| + return null; |
| + } |
| + |
| // TODO: once we remove the deprecated termText() method |
| // and switch entirely to char[] termBuffer we don't need |
| // to use this method anymore |
| @@ -308,10 +467,16 @@ |
| } |
| |
| /** Set number of valid characters (length of the term) in |
| - * the termBuffer array. */ |
| + * the termBuffer array. Use this to truncate the termBuffer |
| + * or to synchronize with external manipulation of the termBuffer. |
| + * Note: to grow the size of the array, |
| + * use {@link #resizeTermBuffer(int)} first. |
| + * @param length the new term length; ignored if it exceeds the termBuffer's current capacity |
| + */ |
| public final void setTermLength(int length) { |
| initTermBuffer(); |
| - termLength = length; |
| + if (length <= termBuffer.length) |
| + termLength = length; |
| } |
| |
| /** Returns this Token's starting offset, the position of the first character |
| @@ -424,9 +589,9 @@ |
| public Object clone() { |
| try { |
| Token t = (Token)super.clone(); |
| + // Do a deep clone |
| if (termBuffer != null) { |
| - t.termBuffer = null; |
| - t.setTermBuffer(termBuffer, 0, termLength); |
| + t.termBuffer = (char[]) termBuffer.clone(); |
| } |
| if (payload != null) { |
| t.setPayload((Payload) payload.clone()); |
| Index: src/java/org/apache/lucene/analysis/TokenFilter.java |
| =================================================================== |
| --- src/java/org/apache/lucene/analysis/TokenFilter.java (revision 675655) |
| +++ src/java/org/apache/lucene/analysis/TokenFilter.java (working copy) |
| @@ -23,7 +23,7 @@ |
| <p> |
| This is an abstract class. |
| NOTE: subclasses must override at least one of {@link |
| - #next()} or {@link #next(Token)}. |
| + #next()} or {@link #next(Token)}. They should override {@link #next(Token)}. |
| */ |
| public abstract class TokenFilter extends TokenStream { |
| /** The source of tokens for this filter. */ |
| Index: src/java/org/apache/lucene/analysis/TokenStream.java |
| =================================================================== |
| --- src/java/org/apache/lucene/analysis/TokenStream.java (revision 675655) |
| +++ src/java/org/apache/lucene/analysis/TokenStream.java (working copy) |
| @@ -32,13 +32,13 @@ |
| whose input is another TokenStream. |
| </ul> |
| NOTE: subclasses must override at least one of {@link |
| - #next()} or {@link #next(Token)}. |
| + #next(Token)} or {@link #next()}. They should override {@link #next(Token)}. |
| */ |
| |
| public abstract class TokenStream { |
| |
| /** Returns the next token in the stream, or null at EOS. |
| - * The returned Token is a "full private copy" (not |
| + * @deprecated The returned Token is a "full private copy" (not |
| * re-used across calls to next()) but will be slower |
| * than calling {@link #next(Token)} instead.. */ |
| public Token next() throws IOException { |