blob: f97e22207b4a7ab435763f9db85391469dbe9c2f [file] [log] [blame]
Index: src/test/org/apache/lucene/analysis/TestToken.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestToken.java (revision 675655)
+++ src/test/org/apache/lucene/analysis/TestToken.java (working copy)
@@ -26,6 +26,119 @@
super(name);
}
+ public void testCtor() throws Exception {
+ Token t = new Token();
+ char[] content = "hello".toCharArray();
+ t.setTermBuffer(content, 0, content.length);
+ char[] buf = t.termBuffer();
+ assertNotSame(t.termBuffer(), content);
+ assertEquals("hello", new String(t.termBuffer(), 0, t.termLength()));
+ assertEquals("word", t.type());
+ assertEquals(0, t.getFlags());
+
+ t = new Token(6, 22);
+ t.setTermBuffer(content, 0, content.length);
+ assertEquals("hello", new String(t.termBuffer(), 0, t.termLength()));
+ assertEquals("(hello,6,22)", t.toString());
+ assertEquals("word", t.type());
+ assertEquals(0, t.getFlags());
+
+ t = new Token(6, 22, 7);
+ t.setTermBuffer(content, 0, content.length);
+ assertEquals("hello", new String(t.termBuffer(), 0, t.termLength()));
+ assertEquals("(hello,6,22)", t.toString());
+ assertEquals(7, t.getFlags());
+
+ t = new Token(6, 22, "junk");
+ t.setTermBuffer(content, 0, content.length);
+ assertEquals("hello", new String(t.termBuffer(), 0, t.termLength()));
+ assertEquals("(hello,6,22,type=junk)", t.toString());
+ assertEquals(0, t.getFlags());
+ }
+
+ public void testResize() {
+ Token t = new Token();
+ char[] content = "hello".toCharArray();
+ t.setTermBuffer(content, 0, content.length);
+ for (int i = 0; i < 2000; i++)
+ {
+ t.resizeTermBuffer(i);
+ assertTrue(i <= t.termBuffer().length);
+ assertEquals("hello", new String(t.termBuffer(), 0, t.termLength()));
+ }
+ }
+
+ public void testGrow() {
+ Token t = new Token();
+ StringBuffer buf = new StringBuffer("ab");
+ for (int i = 0; i < 20; i++)
+ {
+ char[] content = buf.toString().toCharArray();
+ t.setTermBuffer(content, 0, content.length);
+ assertEquals(buf.length(), t.termLength());
+ assertEquals(buf.toString(), t.term());
+ buf.append(buf.toString());
+ }
+ assertEquals(1048576, t.termLength());
+ assertEquals(1179654, t.termBuffer().length);
+
+ // now as a string, first variant
+ t = new Token();
+ buf = new StringBuffer("ab");
+ for (int i = 0; i < 20; i++)
+ {
+ String content = buf.toString();
+ t.setTermBuffer(content, 0, content.length());
+ assertEquals(content.length(), t.termLength());
+ assertEquals(content, t.term());
+ buf.append(content);
+ }
+ assertEquals(1048576, t.termLength());
+ assertEquals(1179654, t.termBuffer().length);
+
+ // now as a string, second variant
+ t = new Token();
+ buf = new StringBuffer("ab");
+ for (int i = 0; i < 20; i++)
+ {
+ String content = buf.toString();
+ t.setTermBuffer(content);
+ assertEquals(content.length(), t.termLength());
+ assertEquals(content, t.term());
+ buf.append(content);
+ }
+ assertEquals(1048576, t.termLength());
+ assertEquals(1179654, t.termBuffer().length);
+
+ // Test for slow growth to a long term
+ t = new Token();
+ buf = new StringBuffer("a");
+ for (int i = 0; i < 20000; i++)
+ {
+ String content = buf.toString();
+ t.setTermBuffer(content);
+ assertEquals(content.length(), t.termLength());
+ assertEquals(content, t.term());
+ buf.append("a");
+ }
+ assertEquals(20000, t.termLength());
+ assertEquals(20331, t.termBuffer().length);
+
+ // Test for slow growth to a long term
+ t = new Token();
+ buf = new StringBuffer("a");
+ for (int i = 0; i < 20000; i++)
+ {
+ String content = buf.toString();
+ t.setTermBuffer(content);
+ assertEquals(content.length(), t.termLength());
+ assertEquals(content, t.term());
+ buf.append("a");
+ }
+ assertEquals(20000, t.termLength());
+ assertEquals(20331, t.termBuffer().length);
+ }
+
public void testToString() throws Exception {
char[] b = {'a', 'l', 'o', 'h', 'a'};
Token t = new Token("", 0, 5);
@@ -53,4 +166,13 @@
buffer[1] = 'o';
assertEquals(t.termText(), "hollo3");
}
+
+ public void testClone() throws Exception {
+ Token t = new Token(0, 5);
+ char[] content = "hello".toCharArray();
+ t.setTermBuffer(content, 0, 5);
+ char[] buf = t.termBuffer();
+ Token copy = (Token) t.clone();
+ assertNotSame(buf, copy.termBuffer());
+ }
}
Index: src/java/org/apache/lucene/analysis/Tokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/Tokenizer.java (revision 675655)
+++ src/java/org/apache/lucene/analysis/Tokenizer.java (working copy)
@@ -25,7 +25,7 @@
This is an abstract class.
<p>
NOTE: subclasses must override at least one of {@link
- #next()} or {@link #next(Token)}.
+ #next()} or {@link #next(Token)}. They should override {@link #next(Token)}.
<p>
NOTE: subclasses overriding {@link #next(Token)} must
call {@link Token#clear()}.
Index: src/java/org/apache/lucene/analysis/Token.java
===================================================================
--- src/java/org/apache/lucene/analysis/Token.java (revision 675655)
+++ src/java/org/apache/lucene/analysis/Token.java (working copy)
@@ -20,7 +20,7 @@
import org.apache.lucene.index.Payload;
import org.apache.lucene.index.TermPositions; // for javadoc
-/** A Token is an occurence of a term from the text of a field. It consists of
+/** A Token is an occurrence of a term from the text of a field. It consists of
a term's text, the start and end offset of the term in the text of the field,
and a type string.
<p>
@@ -49,7 +49,7 @@
<p><b>NOTE:</b> As of 2.3, Token stores the term text
internally as a malleable char[] termBuffer instead of
String termText. The indexing code and core tokenizers
- have been changed re-use a single Token instance, changing
+ have been changed to re-use a single Token instance, changing
its buffer and other fields in-place as the Token is
processed. This provides substantially better indexing
performance as it saves the GC cost of new'ing a Token and
@@ -62,14 +62,57 @@
instance when possible for best performance, by
implementing the {@link TokenStream#next(Token)} API.
Failing that, to create a new Token you should first use
- one of the constructors that starts with null text. Then
- you should call either {@link #termBuffer()} or {@link
- #resizeTermBuffer(int)} to retrieve the Token's
- termBuffer. Fill in the characters of your term into this
- buffer, and finally call {@link #setTermLength(int)} to
+ one of the constructors that starts with null text. To load
+ the token from a char[] use {@link #setTermBuffer(char[], int, int)}.
+ To load from a String use {@link #setTermBuffer(String)}.
+ Alternatively you can get the Token's termBuffer by calling either {@link #termBuffer()},
+ if you know that your text is shorter than the capacity of the termBuffer
+ or {@link #resizeTermBuffer(int)}, if there is any possibility
+ that you may need to grow the buffer. Fill in the characters of your term into this
+ buffer, with {@link String#getChars(int, int, char[], int)} if loading from a string,
+ or with {@link System#arraycopy(Object, int, Object, int, int)}, and finally call {@link #setTermLength(int)} to
set the length of the term text. See <a target="_top"
href="https://issues.apache.org/jira/browse/LUCENE-969">LUCENE-969</a>
for details.</p>
+ <p>Typical reuse patterns:
+ <ul>
+ <li> Copying text from a string:<br/>
+ <pre>
+ // prepare the token for re-use
+ reusableToken.clear();
+ reusableToken.setTermBuffer(string);
+ </pre>
+ </li>
+ <li> Copying some text from a string:<br/>
+ <pre>
+ // prepare the token for re-use
+ reusableToken.clear();
+ reusableToken.setTermBuffer(string, 0, string.length() - 1);
+ </pre>
+ </li>
+ <li> Copying text from char[] buffer:<br/>
+ <pre>
+ // prepare the token for re-use
+ reusableToken.clear();
+ reusableToken.setTermBuffer(buffer, 0, buffer.length);
+ </pre>
+ </li>
+ <li> Copying some text from a char[] buffer:<br/>
+ <pre>
+ // prepare the token for re-use
+ reusableToken.clear();
+ reusableToken.setTermBuffer(buffer, start, end - start);
+ </pre>
+ </li>
+ <li> Copying from one Token to another:<br/>
+ <pre>
+ // prepare the token for re-use
+ reusableToken.clear();
+ reusableToken.setTermBuffer(source.termBuffer(), 0, source.termLength());
+ </pre>
+ </li>
+ </ul>
+ </p>
@see org.apache.lucene.index.Payload
*/
@@ -138,7 +181,9 @@
* term text.
* @param text term text
* @param start start offset
- * @param end end offset */
+ * @param end end offset
+ * @deprecated
+ */
public Token(String text, int start, int end) {
termText = text;
startOffset = start;
@@ -152,7 +197,9 @@
* @param text term text
* @param start start offset
* @param end end offset
- * @param typ token type */
+ * @param typ token type
+ * @deprecated
+ */
public Token(String text, int start, int end, String typ) {
termText = text;
startOffset = start;
@@ -169,6 +216,7 @@
* @param start
* @param end
* @param flags token type bits
+ * @deprecated
*/
public Token(String text, int start, int end, int flags) {
termText = text;
@@ -218,7 +266,11 @@
/** Sets the Token's term text. <b>NOTE:</b> for better
* indexing speed you should instead use the char[]
- * termBuffer methods to set the term text. */
+ * termBuffer methods to set the term text.
+ * @deprecated use {@link #setTermBuffer(char[], int, int)} or
+ * {@link #setTermBuffer(String)} or
+ * {@link #setTermBuffer(String, int, int)}.
+ */
public void setTermText(String text) {
termText = text;
termBuffer = null;
@@ -230,7 +282,7 @@
* because the text is stored internally in a char[]. If
* possible, use {@link #termBuffer()} and {@link
* #termLength()} directly instead. If you really need a
- * String, use <b>new String(token.termBuffer(), 0, token.termLength())</b>
+ * String, use {@link #term()}
*/
public final String termText() {
if (termText == null && termBuffer != null)
@@ -238,19 +290,68 @@
return termText;
}
+ /** Returns the Token's term text.
+ *
+ * This method has a performance penalty
+ * because the text is stored internally in a char[]. If
+ * possible, use {@link #termBuffer()} and {@link
+ * #termLength()} directly instead. If you really need a
+ * String, use this method, which is nothing more than
+ * a convenience call to <b>new String(token.termBuffer(), 0, token.termLength())</b>
+ */
+ public final String term() {
+ if (termText != null)
+ return termText;
+ initTermBuffer();
+ return new String(termBuffer, 0, termLength);
+ }
+
/** Copies the contents of buffer, starting at offset for
- * length characters, into the termBuffer
- * array. <b>NOTE:</b> for better indexing speed you
- * should instead retrieve the termBuffer, using {@link
- * #termBuffer()} or {@link #resizeTermBuffer(int)}, and
- * fill it in directly to set the term text. This saves
- * an extra copy. */
+ * length characters, into the termBuffer array.
+ * @param buffer the buffer to copy
+ * @param offset the index in the buffer of the first character to copy
+ * @param length the number of characters to copy
+ */
public final void setTermBuffer(char[] buffer, int offset, int length) {
- resizeTermBuffer(length);
+ termText = null;
+ char[] newCharBuffer = growTermBuffer(length);
+ if (newCharBuffer != null) {
+ termBuffer = newCharBuffer;
+ }
System.arraycopy(buffer, offset, termBuffer, 0, length);
termLength = length;
}
+ /** Copies the contents of buffer into the termBuffer array.
+ * @param buffer the buffer to copy
+ */
+ public final void setTermBuffer(String buffer) {
+ termText = null;
+ int length = buffer.length();
+ char[] newCharBuffer = growTermBuffer(length);
+ if (newCharBuffer != null) {
+ termBuffer = newCharBuffer;
+ }
+ buffer.getChars(0, length, termBuffer, 0);
+ termLength = length;
+ }
+
+ /** Copies the contents of buffer, starting at offset and continuing
+ * for length characters, into the termBuffer array.
+ * @param buffer the buffer to copy
+ * @param offset the index in the buffer of the first character to copy
+ * @param length the number of characters to copy
+ */
+ public final void setTermBuffer(String buffer, int offset, int length) {
+ termText = null;
+ char[] newCharBuffer = growTermBuffer(length);
+ if (newCharBuffer != null) {
+ termBuffer = newCharBuffer;
+ }
+ buffer.getChars(offset, offset + length, termBuffer, 0);
+ termLength = length;
+ }
+
/** Returns the internal termBuffer character array which
* you can then directly alter. If the array is too
* small for your token, use {@link
@@ -263,23 +364,81 @@
return termBuffer;
}
- /** Grows the termBuffer to at least size newSize.
+ /** Grows the termBuffer to at least size newSize, preserving the
+ * existing content. Note: If the next operation is to change
+ * the contents of the term buffer use
+ * {@link #setTermBuffer(char[], int, int)},
+ * {@link #setTermBuffer(String)}, or
+ * {@link #setTermBuffer(String, int, int)}
+ * to optimally combine the resize with the setting of the termBuffer.
* @param newSize minimum size of the new termBuffer
* @return newly created termBuffer with length >= newSize
*/
public char[] resizeTermBuffer(int newSize) {
- initTermBuffer();
- if (newSize > termBuffer.length) {
- int size = termBuffer.length;
- while(size < newSize)
- size *= 2;
- char[] newBuffer = new char[size];
- System.arraycopy(termBuffer, 0, newBuffer, 0, termBuffer.length);
- termBuffer = newBuffer;
+ char[] newCharBuffer = growTermBuffer(newSize);
+ if (termBuffer == null) {
+ // If there were termText, then preserve it.
+ // note that if termBuffer is null then newCharBuffer cannot be null
+ if (termText != null) {
+ termText.getChars(0, termText.length(), newCharBuffer, 0);
+ }
+ termBuffer = newCharBuffer;
}
+ else if (newCharBuffer != null) {
+ // Note: if newCharBuffer != null then termBuffer needs to grow.
+ // If there were a termBuffer, then preserve it
+ System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length);
+ termBuffer = newCharBuffer;
+ }
+ termText = null;
return termBuffer;
}
+ /** Allocates a buffer char[] of at least newSize
+ * @param newSize minimum size of the buffer
+ * @return newly created buffer with length >= newSize or null if the current termBuffer is big enough
+ */
+ private char[] growTermBuffer(int newSize)
+ {
+ // determine the best size
+ // The buffer is always at least MIN_BUFFER_SIZE
+ if (newSize < MIN_BUFFER_SIZE) {
+ newSize = MIN_BUFFER_SIZE;
+ }
+
+ // If there is already a termText, then the size has to be at least that big
+ if (termText != null) {
+ int ttLength = termText.length();
+ if (newSize < ttLength) {
+ newSize = ttLength;
+ }
+ }
+
+ // if the buffer exists and is too small, then determine a better size.
+ // this is the current doubling algorithm. it could be better.
+ int tbLength = termBuffer == null ? 0 : termBuffer.length;
+
+ if (tbLength > 0 && newSize > tbLength) {
+ /* A simple allocation based on the size of the request
+ * is O(n**2). Using over-allocation will typically be O(n).
+ * Previously, this used a doubling algorithm, which
+ * was too aggressive in growth. This O(n) algorithm makes
+ * modest room for additional growth.
+ * The growth pattern is:
+ * MIN_BUFFER_SIZE, 18, 27, 37, 48, 61, 75, 91, 109, 129, 152, 178, 207, ...
+ */
+ newSize = (newSize >> 3) + 6 + newSize;
+ }
+
+ // Check to see if the buffer needs to be resized
+ if (newSize > tbLength)
+ {
+ return new char[newSize];
+ }
+
+ return null;
+ }
+
// TODO: once we remove the deprecated termText() method
// and switch entirely to char[] termBuffer we don't need
// to use this method anymore
@@ -308,10 +467,16 @@
}
/** Set number of valid characters (length of the term) in
- * the termBuffer array. */
+ * the termBuffer array. Use this to truncate the termBuffer
+ * or to synchronize with external manipulation of the termBuffer.
+ * Note: to grow the size of the array,
+ * use {@link #resizeTermBuffer(int)} first.
+ * @param length the truncated length
+ */
public final void setTermLength(int length) {
initTermBuffer();
- termLength = length;
+ if (length <= termBuffer.length)
+ termLength = length;
}
/** Returns this Token's starting offset, the position of the first character
@@ -424,9 +589,9 @@
public Object clone() {
try {
Token t = (Token)super.clone();
+ // Do a deep clone
if (termBuffer != null) {
- t.termBuffer = null;
- t.setTermBuffer(termBuffer, 0, termLength);
+ t.termBuffer = (char[]) termBuffer.clone();
}
if (payload != null) {
t.setPayload((Payload) payload.clone());
Index: src/java/org/apache/lucene/analysis/TokenFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/TokenFilter.java (revision 675655)
+++ src/java/org/apache/lucene/analysis/TokenFilter.java (working copy)
@@ -23,7 +23,7 @@
<p>
This is an abstract class.
NOTE: subclasses must override at least one of {@link
- #next()} or {@link #next(Token)}.
+ #next()} or {@link #next(Token)}. They should override {@link #next(Token)}.
*/
public abstract class TokenFilter extends TokenStream {
/** The source of tokens for this filter. */
Index: src/java/org/apache/lucene/analysis/TokenStream.java
===================================================================
--- src/java/org/apache/lucene/analysis/TokenStream.java (revision 675655)
+++ src/java/org/apache/lucene/analysis/TokenStream.java (working copy)
@@ -32,13 +32,13 @@
whose input is another TokenStream.
</ul>
NOTE: subclasses must override at least one of {@link
- #next()} or {@link #next(Token)}.
+ #next(Token)} or {@link #next()}. They should override {@link #next(Token)}.
*/
public abstract class TokenStream {
/** Returns the next token in the stream, or null at EOS.
- * The returned Token is a "full private copy" (not
+ * @deprecated The returned Token is a "full private copy" (not
* re-used across calls to next()) but will be slower
* than calling {@link #next(Token)} instead.. */
public Token next() throws IOException {