Index: src/java/org/apache/lucene/analysis/AttributeSource.java
===================================================================
--- src/java/org/apache/lucene/analysis/AttributeSource.java (revision 0)
+++ src/java/org/apache/lucene/analysis/AttributeSource.java (revision 0)
@@ -0,0 +1,160 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.tokenattributes.Attribute;
+
+/**
+ * An AttributeSource contains a list of different {@link Attribute}s,
+ * and methods to add and get them. There can only be a single instance
+ * of an attribute in the same AttributeSource instance. This is ensured
+ * by passing in the actual type of the Attribute (Class<Attribute>) to
+ * {@link #addAttribute(Class)}, which then checks whether an instance of
+ * that type is already present. If so, it returns that instance; otherwise
+ * it creates a new instance and returns it.
+ */
+public abstract class AttributeSource {
+
+ /**
+ * An AttributeAcceptor defines a single method, {@link #accept(Class)}.
+ * It can be used, e.g. for buffering purposes, to specify which
+ * attributes to buffer.
+ */
+ public static abstract class AttributeAcceptor {
+ /** Return true to accept this attribute; false otherwise. */
+ public abstract boolean accept(Class attClass);
+ }
+
+ /**
+ * Default AttributeAcceptor that accepts all attributes.
+ */
+ public static final AttributeAcceptor AllAcceptor = new AttributeAcceptor() {
+ public boolean accept(Class attClass) {return true;}
+ };
+
+ /**
+ * Holds the Class<Attribute> -> Attribute mapping
+ */
+ protected Map attributes = new LinkedHashMap();
+
+ /** Returns an iterator that iterates the attributes
+ * in the same order they were added in.
+ */
+ public Iterator getAttributesIterator() {
+ return attributes.values().iterator();
+ }
+
+ /**
+ * The caller must pass in a Class<? extends Attribute> value.
+ * This method first checks if an instance of that class is
+ * already in this AttributeSource and returns it. Otherwise a
+ * new instance is created, added to this AttributeSource and returned.
+ */
+ public Attribute addAttribute(Class attClass) {
+ Attribute att = (Attribute) attributes.get(attClass);
+ if (att == null) {
+ try {
+ att = (Attribute) attClass.newInstance();
+ } catch (InstantiationException e) {
+ throw new IllegalArgumentException("Could not instantiate class " + attClass);
+ } catch (IllegalAccessException e) {
+ throw new IllegalArgumentException("Could not instantiate class " + attClass);
+ }
+
+ attributes.put(attClass, att);
+ }
+ return att;
+ }
+
+ /** Returns true iff this AttributeSource has any attributes. */
+ public boolean hasAttributes() {
+ return !this.attributes.isEmpty();
+ }
+
+ /**
+ * The caller must pass in a Class<? extends Attribute> value.
+ * Returns true iff this AttributeSource contains an instance of the passed-in Attribute class.
+ */
+ public boolean hasAttribute(Class attClass) {
+ return this.attributes.containsKey(attClass);
+ }
+
+ /**
+ * The caller must pass in a Class<? extends Attribute> value.
+ * Returns the instance of the passed-in Attribute class contained in this AttributeSource.
+ *
+ * @throws IllegalArgumentException if this AttributeSource does not contain the
+ * Attribute
+ */
+ public Attribute getAttribute(Class attClass) {
+ Attribute att = (Attribute) this.attributes.get(attClass);
+ if (att == null) {
+ throw new IllegalArgumentException("This token does not have the attribute '" + attClass + "'.");
+ }
+
+ return att;
+ }
+
+ /**
+ * Resets all Attributes in this AttributeSource by calling
+ * {@link Attribute#clear()} on each Attribute.
+ */
+ public void clearAttributes() {
+ Iterator it = getAttributesIterator();
+ while (it.hasNext()) {
+ ((Attribute) it.next()).clear();
+ }
+ }
+
+// TODO: Java 1.5
+// private Map<Class<? extends Attribute>, Attribute> attributes;
+// public <T extends Attribute> T addAttribute(Class<T> attClass) {
+// T att = (T) attributes.get(attClass);
+// if (att == null) {
+// try {
+// att = attClass.newInstance();
+// } catch (InstantiationException e) {
+// throw new IllegalArgumentException("Could not instantiate class " + attClass);
+// } catch (IllegalAccessException e) {
+// throw new IllegalArgumentException("Could not instantiate class " + attClass);
+// }
+//
+// attributes.put(attClass, att);
+// }
+// return att;
+// }
+//
+// public boolean hasAttribute(Class<? extends Attribute> attClass) {
+// return this.attributes.containsKey(attClass);
+// }
+//
+// public <T extends Attribute> T getAttribute(Class<T> attClass) {
+// Attribute att = this.attributes.get(attClass);
+// if (att == null) {
+// throw new IllegalArgumentException("This token does not have the attribute '" + attClass + "'.");
+// }
+//
+// return (T) att;
+// }
+//
+
+}
Property changes on: src\java\org\apache\lucene\analysis\AttributeSource.java
___________________________________________________________________
Added: svn:eol-style
+ native
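
For illustration, a minimal consumer sketch of the API above (an assumption-based usage example, not part of this patch). It assumes a tokenizer that implements the new API, e.g. a CharTokenizer subclass such as WhitespaceTokenizer after this patch:

    // Sketch: pull tokens through the new attribute-based API.
    TokenStream stream = new WhitespaceTokenizer(new StringReader("some text"));
    TokenStream.setUseNewAPI(true);        // opt in to the new API
    stream.start();                        // calls initialize() on the chain
    // addAttribute returns the single shared instance per attribute class
    TermAttribute term = (TermAttribute) stream.addAttribute(TermAttribute.class);
    while (stream.incrementToken()) {      // the attributes now hold the current token
      System.out.println(term.term());
    }
    stream.close();
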
Index: src/java/org/apache/lucene/analysis/CachingTokenFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/CachingTokenFilter.java (revision 708658)
+++ src/java/org/apache/lucene/analysis/CachingTokenFilter.java (working copy)
@@ -34,12 +34,31 @@
*/
public class CachingTokenFilter extends TokenFilter {
private List cache;
- private Iterator iterator;
+ private Iterator iterator;
public CachingTokenFilter(TokenStream input) {
super(input);
}
+ public boolean incrementToken() throws IOException {
+ if (cache == null) {
+ // fill cache lazily
+ cache = new LinkedList();
+ fillCache();
+ iterator = cache.iterator();
+ }
+
+ if (!iterator.hasNext()) {
+ // the cache is exhausted, return false
+ return false;
+ }
+ // Since the TokenFilter can be reset, the tokens need to be preserved as immutable.
+ TokenStreamState state = (TokenStreamState) iterator.next();
+ state.restore(this);
+ return true;
+ }
+
+ /** @deprecated */
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
if (cache == null) {
@@ -60,10 +79,17 @@
public void reset() throws IOException {
if(cache != null) {
- iterator = cache.iterator();
+ iterator = cache.iterator();
}
}
+ private void fillCache() throws IOException {
+ while(input.incrementToken()) {
+ cache.add(TokenStreamState.capture(this));
+ }
+ }
+
+ /** @deprecated */
private void fillCache(final Token reusableToken) throws IOException {
for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
cache.add(nextToken.clone());
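
A hedged usage sketch of the caching behavior above (variable names are illustrative; TokenStreamState is introduced elsewhere in this patch). The first pass fills the cache from the input, reset() rewinds, and the second pass replays the captured states:

    CachingTokenFilter cached = new CachingTokenFilter(tokenizer);
    cached.start();
    TermAttribute term = (TermAttribute) cached.addAttribute(TermAttribute.class);
    while (cached.incrementToken()) {
      // first pass: each token is captured into the cache as a TokenStreamState
    }
    cached.reset();                  // rewind to the start of the cache
    while (cached.incrementToken()) {
      // second pass: each cached state is restored into the shared attributes
      System.out.println(term.term());
    }
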
Index: src/java/org/apache/lucene/analysis/CharTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/CharTokenizer.java (revision 708658)
+++ src/java/org/apache/lucene/analysis/CharTokenizer.java (working copy)
@@ -20,6 +20,9 @@
import java.io.IOException;
import java.io.Reader;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
/** An abstract base class for simple, character-oriented tokenizers.*/
public abstract class CharTokenizer extends Tokenizer {
public CharTokenizer(Reader input) {
@@ -30,6 +33,9 @@
private static final int MAX_WORD_LEN = 255;
private static final int IO_BUFFER_SIZE = 4096;
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
+
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
/** Returns true iff a character should be included in a token. This
* tokenizer generates as tokens adjacent sequences of characters which
@@ -43,7 +49,58 @@
protected char normalize(char c) {
return c;
}
+
+ public void initialize() throws IOException {
+ offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ }
+ public final boolean incrementToken() throws IOException {
+ assert termAtt != null && offsetAtt != null;
+
+ clearAttributes();
+ int length = 0;
+ int start = bufferIndex;
+ char[] buffer = termAtt.termBuffer();
+ while (true) {
+
+ if (bufferIndex >= dataLen) {
+ offset += dataLen;
+ dataLen = input.read(ioBuffer);
+ if (dataLen == -1) {
+ if (length > 0)
+ break;
+ else
+ return false;
+ }
+ bufferIndex = 0;
+ }
+
+ final char c = ioBuffer[bufferIndex++];
+
+ if (isTokenChar(c)) { // if it's a token char
+
+ if (length == 0) // start of token
+ start = offset + bufferIndex - 1;
+ else if (length == buffer.length)
+ buffer = termAtt.resizeTermBuffer(1+length);
+
+ buffer[length++] = normalize(c); // buffer it, normalized
+
+ if (length == MAX_WORD_LEN) // buffer overflow!
+ break;
+
+ } else if (length > 0) // at non-Letter w/ chars
+ break; // return 'em
+ }
+
+ termAtt.setTermLength(length);
+ offsetAtt.setStartOffset(start);
+ offsetAtt.setEndOffset(start+length);
+ return true;
+ }
+
+ /** @deprecated */
public final Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
reusableToken.clear();
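
Since CharTokenizer now implements incrementToken() generically, subclasses only supply the character predicate. A sketch of a hypothetical subclass (illustrative, not part of this patch):

    import java.io.Reader;

    /** Splits on whitespace; buffering and attribute handling are inherited. */
    public final class SimpleWhitespaceTokenizer extends CharTokenizer {
      public SimpleWhitespaceTokenizer(Reader input) {
        super(input);
      }
      protected boolean isTokenChar(char c) {
        return !Character.isWhitespace(c);
      }
    }
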
Index: src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java (revision 708658)
+++ src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java (working copy)
@@ -1,5 +1,9 @@
package org.apache.lucene.analysis;
+import java.io.IOException;
+
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -31,7 +35,34 @@
private char[] output = new char[256];
private int outputPos;
-
+ private TermAttribute termAtt;
+
+ public void initialize() throws IOException {
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ }
+
+ public final boolean incrementToken() throws java.io.IOException {
+ assert termAtt != null;
+
+ if (input.incrementToken()) {
+ final char[] buffer = termAtt.termBuffer();
+ final int length = termAtt.termLength();
+ // If no characters actually require rewriting then we
+ // just return token as-is:
+ for(int i=0;i<length;i++) {
+ final char c = buffer[i];
+ if (c >= '\u00c0' && c <= '\uFB06') {
+ removeAccents(buffer, length);
+ termAtt.setTermBuffer(output, 0, outputPos);
+ break;
+ }
+ }
+ return true;
+ } else
+ return false;
+ }
+
+ /** @deprecated */
public final Token next(final Token reusableToken) throws java.io.IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
@@ -241,7 +272,7 @@
case '\uFB06': // st
output[outputPos++] = 's';
output[outputPos++] = 't';
- break;
+ break;
default :
output[outputPos++] = c;
break;
Index: src/java/org/apache/lucene/analysis/KeywordTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/KeywordTokenizer.java (revision 708658)
+++ src/java/org/apache/lucene/analysis/KeywordTokenizer.java (working copy)
@@ -20,6 +20,8 @@
import java.io.IOException;
import java.io.Reader;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
/**
* Emits the entire input as a single token.
*/
@@ -28,7 +30,8 @@
private static final int DEFAULT_BUFFER_SIZE = 256;
private boolean done;
-
+ private TermAttribute termAtt;
+
public KeywordTokenizer(Reader input) {
this(input, DEFAULT_BUFFER_SIZE);
}
@@ -37,7 +40,32 @@
super(input);
this.done = false;
}
+
+ public void initialize() throws IOException {
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ }
+
+ public boolean incrementToken() throws IOException {
+ assert termAtt != null;
+ if (!done) {
+ done = true;
+ int upto = 0;
+ termAtt.clear();
+ char[] buffer = termAtt.termBuffer();
+ while (true) {
+ final int length = input.read(buffer, upto, buffer.length-upto);
+ if (length == -1) break;
+ upto += length;
+ if (upto == buffer.length)
+ buffer = termAtt.resizeTermBuffer(1+buffer.length);
+ }
+ termAtt.setTermLength(upto);
+ return true;
+ }
+ return false;
+ }
+ /** @deprecated */
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
if (!done) {
Index: src/java/org/apache/lucene/analysis/LengthFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/LengthFilter.java (revision 708658)
+++ src/java/org/apache/lucene/analysis/LengthFilter.java (working copy)
@@ -19,6 +19,8 @@
import java.io.IOException;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
/**
* Removes words that are too long and too short from the stream.
*
@@ -29,6 +31,8 @@
final int min;
final int max;
+
+ private TermAttribute termAtt;
/**
* Build a filter that removes words that are too long or too
@@ -41,9 +45,33 @@
this.max = max;
}
+ public void initialize() throws IOException {
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ }
+
+
/**
* Returns the next input Token whose term() is the right len
*/
+ public final boolean incrementToken() throws IOException
+ {
+ assert termAtt != null;
+ // return the first token whose term length is within the bounds
+ while (input.incrementToken()) {
+ int len = termAtt.termLength();
+ if (len >= min && len <= max) {
+ return true;
+ }
+ // note: else we ignore it but should we index each part of it?
+ }
+ // reached EOS -- return false
+ return false;
+ }
+
+ /**
+ * Returns the next input Token whose term() is the right len
+ * @deprecated
+ */
public final Token next(final Token reusableToken) throws IOException
{
assert reusableToken != null;
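
A one-line usage sketch (the bounds and the tokenizer variable are hypothetical): wrap any stream, and only terms whose length falls inside [min, max] survive:

    // keep only terms of 2 to 20 characters
    TokenStream filtered = new LengthFilter(tokenizer, 2, 20);
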
Index: src/java/org/apache/lucene/analysis/LowerCaseFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/LowerCaseFilter.java (revision 708658)
+++ src/java/org/apache/lucene/analysis/LowerCaseFilter.java (working copy)
@@ -19,6 +19,8 @@
import java.io.IOException;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
/**
* Normalizes token text to lower case.
*
@@ -29,6 +31,28 @@
super(in);
}
+ private TermAttribute termAtt;
+
+ public void initialize() throws IOException {
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ }
+
+ public final boolean incrementToken() throws IOException {
+ assert termAtt != null;
+
+ if (input.incrementToken()) {
+
+ final char[] buffer = termAtt.termBuffer();
+ final int length = termAtt.termLength();
+ for(int i=0;i<length;i++)
+ buffer[i] = Character.toLowerCase(buffer[i]);
+
+ return true;
+ } else
+ return false;
+ }
+
+ /** @deprecated */
public final Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
Index: src/java/org/apache/lucene/analysis/PorterStemFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/PorterStemFilter.java (revision 708658)
+++ src/java/org/apache/lucene/analysis/PorterStemFilter.java (working copy)
@@ -19,6 +19,8 @@
import java.io.IOException;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
/** Transforms the token stream as per the Porter stemming algorithm.
Note: the input to the stemming filter must already be in lower case,
so you will need to use LowerCaseFilter or LowerCaseTokenizer farther
@@ -39,12 +41,29 @@
*/
public final class PorterStemFilter extends TokenFilter {
private PorterStemmer stemmer;
+ private TermAttribute termAtt;
public PorterStemFilter(TokenStream in) {
super(in);
stemmer = new PorterStemmer();
}
+
+ public void initialize() throws IOException {
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ }
+ public final boolean incrementToken() throws IOException {
+ assert termAtt != null;
+
+ if (!input.incrementToken())
+ return false;
+
+ if (stemmer.stem(termAtt.termBuffer(), 0, termAtt.termLength()))
+ termAtt.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength());
+ return true;
+ }
+
+ /** @deprecated */
public final Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
Index: src/java/org/apache/lucene/analysis/SinkTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/SinkTokenizer.java (revision 708658)
+++ src/java/org/apache/lucene/analysis/SinkTokenizer.java (working copy)
@@ -32,7 +32,7 @@
public class SinkTokenizer extends Tokenizer {
protected List/*<Token>*/ lst = new ArrayList/*<Token>*/();
protected Iterator/*<Token>*/ iter;
-
+
public SinkTokenizer(List/*<Token>*/ input) {
this.lst = input;
if (this.lst == null) this.lst = new ArrayList/*<Token>*/();
@@ -66,6 +66,27 @@
* @return The next {@link org.apache.lucene.analysis.Token} in the Sink.
* @throws IOException
*/
+ public boolean incrementToken() throws IOException {
+ if (iter == null) iter = lst.iterator();
+ // Since this TokenStream can be reset we have to maintain the tokens as immutable
+ if (iter.hasNext()) {
+ TokenStreamState state = (TokenStreamState) iter.next();
+ state.restore(this);
+ return true;
+ }
+ return false;
+ }
+
+ public void add(TokenStreamState source) throws IOException {
+ lst.add(source);
+ }
+
+ /**
+ * Returns the next token out of the list of cached tokens
+ * @return The next {@link org.apache.lucene.analysis.Token} in the Sink.
+ * @throws IOException
+ * @deprecated
+ */
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
if (iter == null) iter = lst.iterator();
@@ -77,8 +98,6 @@
return null;
}
-
-
/**
* Override this method to cache only certain tokens, or new tokens based
* on the old tokens.
Index: src/java/org/apache/lucene/analysis/standard/StandardFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/standard/StandardFilter.java (revision 708658)
+++ src/java/org/apache/lucene/analysis/standard/StandardFilter.java (working copy)
@@ -17,9 +17,13 @@
* limitations under the License.
*/
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
/** Normalizes tokens extracted with {@link StandardTokenizer}. */
@@ -34,10 +38,53 @@
private static final String APOSTROPHE_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.APOSTROPHE];
private static final String ACRONYM_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM];
+ // this filter uses the type and term attributes
+ private TypeAttribute typeAtt;
+ private TermAttribute termAtt;
+
+ public final void initialize() throws IOException {
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+ }
+
/** Returns the next token in the stream, or null at EOS.
* <p>Removes <tt>'s</tt> from the end of words.
* <p>Removes dots from acronyms.
*/
+ public final boolean incrementToken() throws java.io.IOException {
+ assert termAtt != null;
+ if (!input.incrementToken()) {
+ return false;
+ }
+
+ char[] buffer = termAtt.termBuffer();
+ final int bufferLength = termAtt.termLength();
+ final String type = typeAtt.type();
+
+ if (type == APOSTROPHE_TYPE && // remove 's
+ bufferLength >= 2 &&
+ buffer[bufferLength-2] == '\'' &&
+ (buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) {
+ // Strip last 2 characters off
+ termAtt.setTermLength(bufferLength - 2);
+ } else if (type == ACRONYM_TYPE) { // remove dots
+ int upto = 0;
+ for(int i=0;i<bufferLength;i++) {
+ char c = buffer[i];
+ if (c != '.')
+ buffer[upto++] = c;
+ }
+ termAtt.setTermLength(upto);
+ }
+
+ return true;
+ }
+
+ /** Returns the next token in the stream, or null at EOS.
+ * <p>Removes <tt>'s</tt> from the end of words.
+ * <p>Removes dots from acronyms.
+ * @deprecated
+ */
public final Token next(final Token reusableToken) throws java.io.IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
Index: src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (revision 708658)
+++ src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (working copy)
@@ -22,6 +22,10 @@
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
/** A grammar-based tokenizer constructed with JFlex
*
@@ -127,11 +131,71 @@
this.scanner = new StandardTokenizerImpl(input);
}
+ // this tokenizer generates three attributes:
+ // offset, positionIncrement and type
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
+ private PositionIncrementAttribute posIncrAtt;
+ private TypeAttribute typeAtt;
+
+ public void initialize() throws IOException {
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+ typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+
+ }
+
/*
* (non-Javadoc)
*
* @see org.apache.lucene.analysis.TokenStream#next()
*/
+ public boolean incrementToken() throws IOException {
+ assert termAtt != null;
+ int posIncr = 1;
+
+ while(true) {
+ int tokenType = scanner.getNextToken();
+
+ if (tokenType == StandardTokenizerImpl.YYEOF) {
+ return false;
+ }
+
+ if (scanner.yylength() <= maxTokenLength) {
+ termAtt.clear();
+ posIncrAtt.setPositionIncrement(posIncr);
+ scanner.getText(termAtt);
+ final int start = scanner.yychar();
+ offsetAtt.setStartOffset(start);
+ offsetAtt.setEndOffset(start+termAtt.termLength());
+ // This 'if' should be removed in the next release. For now, it converts
+ // invalid acronyms to HOST. When removed, only the 'else' part should
+ // remain.
+ if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
+ if (replaceInvalidAcronym) {
+ typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
+ termAtt.setTermLength(termAtt.termLength() - 1); // remove extra '.'
+ } else {
+ typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
+ }
+ } else {
+ typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
+ }
+ return true;
+ } else
+ // When we skip a too-long term, we still increment the
+ // position increment
+ posIncr++;
+ }
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.apache.lucene.analysis.TokenStream#next()
+ */
+ /** @deprecated */
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
int posIncr = 1;
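
To show how a consumer reads several attributes at once, a hedged sketch against the tokenizer above (the reader variable is assumed; all four attributes are registered by the tokenizer's initialize(), so getAttribute() succeeds after start()):

    StandardTokenizer tokenizer = new StandardTokenizer(reader);
    tokenizer.start();  // initialize() has now registered the attributes
    TermAttribute term = (TermAttribute) tokenizer.getAttribute(TermAttribute.class);
    OffsetAttribute offset = (OffsetAttribute) tokenizer.getAttribute(OffsetAttribute.class);
    TypeAttribute type = (TypeAttribute) tokenizer.getAttribute(TypeAttribute.class);
    while (tokenizer.incrementToken()) {
      System.out.println(term.term() + " [" + offset.startOffset() + ","
          + offset.endOffset() + ") type=" + type.type());
    }
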
Index: src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
===================================================================
--- src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java (revision 708658)
+++ src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java (working copy)
@@ -30,6 +30,7 @@
*/
import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
@@ -368,7 +369,14 @@
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
+/**
+ * Fills TermAttribute with the current token text.
+ */
+final void getText(TermAttribute t) {
+ t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
+}
+
/**
* Creates a new scanner
* There is also a java.io.InputStream version of this constructor.
Index: src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
===================================================================
--- src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex (revision 708658)
+++ src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex (working copy)
@@ -29,6 +29,7 @@
*/
import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
%%
@@ -69,6 +70,14 @@
final void getText(Token t) {
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
+
+/**
+ * Fills TermAttribute with the current token text.
+ */
+final void getText(TermAttribute t) {
+ t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
+}
+
%}
THAI = [\u0E00-\u0E59]
Index: src/java/org/apache/lucene/analysis/StopFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/StopFilter.java (revision 708658)
+++ src/java/org/apache/lucene/analysis/StopFilter.java (working copy)
@@ -21,6 +21,9 @@
import java.util.Arrays;
import java.util.Set;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
/**
* Removes stop words from a token stream.
*/
@@ -32,6 +35,9 @@
private final CharArraySet stopWords;
private boolean enablePositionIncrements = ENABLE_POSITION_INCREMENTS_DEFAULT;
+ private TermAttribute termAtt;
+ private PositionIncrementAttribute posIncrAtt;
+
/**
* Construct a token stream filtering the given input.
*/
@@ -85,6 +91,11 @@
public StopFilter(TokenStream in, Set stopWords) {
this(in, stopWords, false);
}
+
+ public void initialize() throws IOException {
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+ }
/**
* Builds a Set from an array of stop words,
@@ -109,9 +120,30 @@
stopSet.addAll(Arrays.asList(stopWords));
return stopSet;
}
+
+ /**
+ * Advances to the next input token whose term() is not a stop word.
+ */
+ public final boolean incrementToken() throws IOException {
+ assert termAtt != null;
+ // return the first non-stop word found
+ int skippedPositions = 0;
+ while (input.incrementToken()) {
+ if (!stopWords.contains(termAtt.termBuffer(), 0, termAtt.termLength())) {
+ if (enablePositionIncrements) {
+ posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
+ }
+ return true;
+ }
+ skippedPositions += posIncrAtt.getPositionIncrement();
+ }
+ // reached EOS -- return false
+ return false;
+ }
/**
* Returns the next input Token whose term() is not a stop word.
+ * @deprecated
*/
public final Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
Index: src/java/org/apache/lucene/analysis/TeeTokenFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/TeeTokenFilter.java (revision 708658)
+++ src/java/org/apache/lucene/analysis/TeeTokenFilter.java (working copy)
@@ -62,6 +62,15 @@
this.sink = sink;
}
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ sink.add(TokenStreamState.capture(this));
+ return true;
+ }
+ return false;
+ }
+
+ /** @deprecated */
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
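
A hedged sketch of the tee/sink pattern with the new API (illustrative wiring, not part of this patch): the tee captures a TokenStreamState for every token it passes through, and the sink later replays those states, e.g. to analyze once and index into two fields:

    SinkTokenizer sink = new SinkTokenizer(null);        // null creates an empty list
    TokenStream tee = new TeeTokenFilter(tokenizer, sink);
    tee.start();
    while (tee.incrementToken()) {
      // consume for the first field; the sink buffers a state copy of each token
    }
    // later, the sink can be consumed like any TokenStream:
    sink.start();
    while (sink.incrementToken()) {
      // each captured state is restored into the sink's attributes
    }
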
Index: src/java/org/apache/lucene/analysis/Token.java
===================================================================
--- src/java/org/apache/lucene/analysis/Token.java (revision 708658)
+++ src/java/org/apache/lucene/analysis/Token.java (working copy)
@@ -21,7 +21,11 @@
import org.apache.lucene.index.TermPositions; // for javadoc
import org.apache.lucene.util.ArrayUtil;
-/** A Token is an occurrence of a term from the text of a field. It consists of
+/**
+ This class is now deprecated; a new TokenStream API was introduced with Lucene 2.9.
+ See Javadocs in {@link TokenStream} for further details.
+ <p>
+ A Token is an occurrence of a term from the text of a field. It consists of
a term's text, the start and end offset of the term in the text of the field,
and a type string.
<p>
@@ -114,6 +118,8 @@
</p>
@see org.apache.lucene.index.Payload
+ @deprecated A new TokenStream API was introduced with Lucene 2.9.
+ See javadocs in {@link TokenStream} for further details.
*/
public class Token implements Cloneable {
Index: src/java/org/apache/lucene/analysis/tokenattributes/Attribute.java
===================================================================
--- src/java/org/apache/lucene/analysis/tokenattributes/Attribute.java (revision 0)
+++ src/java/org/apache/lucene/analysis/tokenattributes/Attribute.java (revision 0)
@@ -0,0 +1,91 @@
+package org.apache.lucene.analysis.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Serializable;
+
+/**
+ * Base class for Attributes that can be added to a
+ * {@link org.apache.lucene.analysis.AttributeSource}.
+ *
+ * Attributes are used to deliver information about Tokens from
+ * the analyzer to the indexing chain.
+ */
+public abstract class Attribute implements Cloneable, Serializable {
+ /**
+ * Clears the values in this Attribute and resets it to its
+ * default value.
+ */
+ public abstract void clear();
+
+ /**
+ * Subclasses must implement this method and should follow a syntax
+ * similar to this one:
+ *
+ * <pre>
+ * public String toString() {
+ * return "start=" + startOffset + ",end=" + endOffset;
+ * }
+ * </pre>
+ */
+ public abstract String toString();
+
+ public Attribute() {
+ // empty ctor used to instantiate by reflection
+ }
+
+ /**
+ * Subclasses must implement this method and should compute
+ * a hashCode similar to this:
+ * <pre>
+ * public int hashCode() {
+ * int code = startOffset;
+ * code = code * 31 + endOffset;
+ * return code;
+ * }
+ * </pre>
+ *
+ * see also {@link #equals(Object)}
+ */
+ public abstract int hashCode();
+
+ /**
+ * All values used for computation of {@link #hashCode()}
+ * should be checked here for equality.
+ *
+ * see also {@link Object#equals(Object)}
+ */
+ public abstract boolean equals(Object other);
+
+ /**
+ * Copies the values from this Attribute into the passed-in
+ * target attribute. The type of the target must match the type
+ * of this attribute.
+ */
+ public abstract void copyTo(Attribute target);
+
+ public Object clone() {
+ Object clone = null;
+ try {
+ clone = super.clone();
+ } catch (CloneNotSupportedException e) {
+ throw new RuntimeException(e); // shouldn't happen
+ }
+ return clone;
+ }
+}
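
To make the contract above concrete, a sketch of a hypothetical custom attribute (illustrative, not part of this patch); note the public no-arg constructor that AttributeSource.addAttribute(Class) needs for reflective instantiation:

    public class LanguageAttribute extends Attribute {
      private String language = "en";      // "en" as a hypothetical default

      public LanguageAttribute() {}        // required for reflective creation

      public String getLanguage() { return language; }
      public void setLanguage(String language) { this.language = language; }

      public void clear() { language = "en"; }
      public String toString() { return "language=" + language; }
      public int hashCode() { return language.hashCode(); }
      public boolean equals(Object other) {
        return other instanceof LanguageAttribute
            && ((LanguageAttribute) other).language.equals(language);
      }
      public void copyTo(Attribute target) {
        ((LanguageAttribute) target).setLanguage(language);
      }
    }
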
Index: src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java
===================================================================
--- src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java (revision 0)
+++ src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java (revision 0)
@@ -0,0 +1,78 @@
+package org.apache.lucene.analysis.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Serializable;
+
+/**
+ * This attribute can be used to pass different flags down the tokenizer chain,
+ * e.g. from one TokenFilter to another.
+ */
+public class FlagsAttribute extends Attribute implements Cloneable, Serializable {
+ private int flags = 0;
+
+ /**
+ * EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long.
+ * <p/>
+ *
+ * Get the bitset for any bits that have been set. This is completely distinct from {@link #type()}, although they do share similar purposes.
+ * The flags can be used to encode information about the token for use by other {@link org.apache.lucene.analysis.TokenFilter}s.
+ *
+ *
+ * @return The bits
+ */
+ public int getFlags() {
+ return flags;
+ }
+
+ /**
+ * @see #getFlags()
+ */
+ public void setFlags(int flags) {
+ this.flags = flags;
+ }
+
+ public void clear() {
+ flags = 0;
+ }
+
+ public String toString() {
+ return "flags=" + flags;
+ }
+
+ public boolean equals(Object other) {
+ if (this == other) {
+ return true;
+ }
+
+ if (other instanceof FlagsAttribute) {
+ return ((FlagsAttribute) other).flags == flags;
+ }
+
+ return false;
+ }
+
+ public int hashCode() {
+ return flags;
+ }
+
+ public void copyTo(Attribute target) {
+ FlagsAttribute t = (FlagsAttribute) target;
+ t.setFlags(flags);
+ }
+}
Index: src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttribute.java
===================================================================
--- src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttribute.java (revision 0)
+++ src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttribute.java (revision 0)
@@ -0,0 +1,91 @@
+package org.apache.lucene.analysis.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Serializable;
+
+/**
+ * The start and end character offset of a Token.
+ */
+public class OffsetAttribute extends Attribute implements Cloneable, Serializable {
+ private int startOffset;
+ private int endOffset;
+
+ /** Returns this Token's starting offset, the position of the first character
+ corresponding to this token in the source text.
+
+ Note that the difference between endOffset() and startOffset() may not be
+ equal to termText.length(), as the term text may have been altered by a
+ stemmer or some other filter. */
+ public int startOffset() {
+ return startOffset;
+ }
+
+ /** Set the starting offset.
+ @see #startOffset() */
+ public void setStartOffset(int offset) {
+ this.startOffset = offset;
+ }
+
+ /** Returns this Token's ending offset, one greater than the position of the
+ last character corresponding to this token in the source text. The length
+ of the token in the source text is (endOffset - startOffset). */
+ public int endOffset() {
+ return endOffset;
+ }
+
+ /** Set the ending offset.
+ @see #endOffset() */
+ public void setEndOffset(int offset) {
+ this.endOffset = offset;
+ }
+
+ public void clear() {
+ startOffset = 0;
+ endOffset = 0;
+ }
+
+ public String toString() {
+ return "start=" + startOffset + ",end=" + endOffset;
+ }
+
+ public boolean equals(Object other) {
+ if (other == this) {
+ return true;
+ }
+
+ if (other instanceof OffsetAttribute) {
+ OffsetAttribute o = (OffsetAttribute) other;
+ return o.startOffset == startOffset && o.endOffset == endOffset;
+ }
+
+ return false;
+ }
+
+ public int hashCode() {
+ int code = startOffset;
+ code = code * 31 + endOffset;
+ return code;
+ }
+
+ public void copyTo(Attribute target) {
+ OffsetAttribute t = (OffsetAttribute) target;
+ t.setStartOffset(startOffset);
+ t.setEndOffset(endOffset);
+ }
+}
Index: src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttribute.java
===================================================================
--- src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttribute.java (revision 0)
+++ src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttribute.java (revision 0)
@@ -0,0 +1,103 @@
+package org.apache.lucene.analysis.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Serializable;
+
+import org.apache.lucene.index.Payload;
+
+/**
+ * The payload of a Token. See also {@link Payload}.
+ */
+public class PayloadAttribute extends Attribute implements Cloneable, Serializable {
+ private Payload payload;
+
+ /**
+ * Initialize this attribute with no payload.
+ */
+ public PayloadAttribute() {}
+
+ /**
+ * Initialize this attribute with the given payload.
+ */
+ public PayloadAttribute(Payload payload) {
+ this.payload = payload;
+ }
+
+ /**
+ * Returns this Token's payload.
+ */
+ public Payload getPayload() {
+ return this.payload;
+ }
+
+ /**
+ * Sets this Token's payload.
+ */
+ public void setPayload(Payload payload) {
+ this.payload = payload;
+ }
+
+ public void clear() {
+ payload = null;
+ }
+
+ public String toString() {
+ if (payload == null) {
+ return "payload=null";
+ }
+
+ return "payload=" + payload.toString();
+ }
+
+ public Object clone() {
+ PayloadAttribute clone = (PayloadAttribute) super.clone();
+ if (payload != null) {
+ clone.payload = (Payload) payload.clone();
+ }
+ return clone;
+ }
+
+ public boolean equals(Object other) {
+ if (other == this) {
+ return true;
+ }
+
+ if (other instanceof PayloadAttribute) {
+ PayloadAttribute o = (PayloadAttribute) other;
+ if (o.payload == null || payload == null) {
+ return o.payload == null && payload == null;
+ }
+
+ return o.payload.equals(payload);
+ }
+
+ return false;
+ }
+
+ public int hashCode() {
+ return (payload == null) ? 0 : payload.hashCode();
+ }
+
+ public void copyTo(Attribute target) {
+ PayloadAttribute t = (PayloadAttribute) target;
+ t.setPayload((payload == null) ? null : (Payload) payload.clone());
+ }
+
+
+}
Index: src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java
===================================================================
--- src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java (revision 0)
+++ src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java (revision 0)
@@ -0,0 +1,99 @@
+package org.apache.lucene.analysis.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Serializable;
+
+import org.apache.lucene.analysis.TokenStream;
+
+/** The positionIncrement determines the position of this token
+ * relative to the previous Token in a {@link TokenStream}, used in phrase
+ * searching.
+ *
+ * <p>The default value is one.
+ *
+ * <p>Some common uses for this are:<ul>
+ *
+ * <li>Set it to zero to put multiple terms in the same position. This is
+ * useful if, e.g., a word has multiple stems. Searches for phrases
+ * including either stem will match. In this case, all but the first stem's
+ * increment should be set to zero: the increment of the first instance
+ * should be one. Repeating a token with an increment of zero can also be
+ * used to boost the scores of matches on that token.
+ *
+ * <li>Set it to values greater than one to inhibit exact phrase matches.
+ * If, for example, one does not want phrases to match across removed stop
+ * words, then one could build a stop word filter that removes stop words and
+ * also sets the increment to the number of stop words removed before each
+ * non-stop word. Then exact phrase queries will only match when the terms
+ * occur with no intervening stop words.
+ *
+ * </ul>
+ * @see org.apache.lucene.index.TermPositions
+ */
+public class PositionIncrementAttribute extends Attribute implements Cloneable, Serializable {
+ private int positionIncrement = 1;
+
+ /** Set the position increment. The default value is one.
+ *
+ * @param positionIncrement the distance from the prior term
+ */
+ public void setPositionIncrement(int positionIncrement) {
+ if (positionIncrement < 0)
+ throw new IllegalArgumentException
+ ("Increment must be zero or greater: " + positionIncrement);
+ this.positionIncrement = positionIncrement;
+ }
+
+ /** Returns the position increment of this Token.
+ * @see #setPositionIncrement
+ */
+ public int getPositionIncrement() {
+ return positionIncrement;
+ }
+
+ public void clear() {
+ this.positionIncrement = 1;
+ }
+
+ public String toString() {
+ return "positionIncrement=" + positionIncrement;
+ }
+
+ public boolean equals(Object other) {
+ if (other == this) {
+ return true;
+ }
+
+ if (other instanceof PositionIncrementAttribute) {
+ return positionIncrement == ((PositionIncrementAttribute) other).positionIncrement;
+ }
+
+ return false;
+ }
+
+ public int hashCode() {
+ return positionIncrement;
+ }
+
+ public void copyTo(Attribute target) {
+ PositionIncrementAttribute t = (PositionIncrementAttribute) target;
+ t.setPositionIncrement(positionIncrement);
+ }
+
+}
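
The zero-increment case above is the classic synonym use. A hedged fragment from a hypothetical synonym filter's incrementToken(), emitting an alternative term at the same position as the token just returned (termAtt and posIncrAtt are the filter's cached attribute references):

    // inject "fast" at the same position as the previous token "quick"
    termAtt.setTermBuffer("fast");
    posIncrAtt.setPositionIncrement(0);  // zero: stacked on the prior position
    return true;
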
Index: src/java/org/apache/lucene/analysis/tokenattributes/TermAttribute.java
===================================================================
--- src/java/org/apache/lucene/analysis/tokenattributes/TermAttribute.java (revision 0)
+++ src/java/org/apache/lucene/analysis/tokenattributes/TermAttribute.java (revision 0)
@@ -0,0 +1,245 @@
+package org.apache.lucene.analysis.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Serializable;
+
+import org.apache.lucene.util.ArrayUtil;
+
+/**
+ * The term text of a Token.
+ */
+public class TermAttribute extends Attribute implements Cloneable, Serializable {
+ private static int MIN_BUFFER_SIZE = 10;
+
+ private char[] termBuffer;
+ private int termLength;
+
+ /** Returns the Token's term text.
+ *
+ * This method has a performance penalty
+ * because the text is stored internally in a char[]. If
+ * possible, use {@link #termBuffer()} and {@link
+ * #termLength()} directly instead. If you really need a
+ * String, use this method, which is nothing more than
+ * a convenience call to <b>new String(token.termBuffer(), 0, token.termLength())</b>
+ */
+ public String term() {
+ initTermBuffer();
+ return new String(termBuffer, 0, termLength);
+ }
+
+ /** Copies the contents of buffer, starting at offset for
+ * length characters, into the termBuffer array.
+ * @param buffer the buffer to copy
+ * @param offset the index in the buffer of the first character to copy
+ * @param length the number of characters to copy
+ */
+ public void setTermBuffer(char[] buffer, int offset, int length) {
+ char[] newCharBuffer = growTermBuffer(length);
+ if (newCharBuffer != null) {
+ termBuffer = newCharBuffer;
+ }
+ System.arraycopy(buffer, offset, termBuffer, 0, length);
+ termLength = length;
+ }
+
+ /** Copies the contents of buffer into the termBuffer array.
+ * @param buffer the buffer to copy
+ */
+ public void setTermBuffer(String buffer) {
+ int length = buffer.length();
+ char[] newCharBuffer = growTermBuffer(length);
+ if (newCharBuffer != null) {
+ termBuffer = newCharBuffer;
+ }
+ buffer.getChars(0, length, termBuffer, 0);
+ termLength = length;
+ }
+
+ /** Copies the contents of buffer, starting at offset and continuing
+ * for length characters, into the termBuffer array.
+ * @param buffer the buffer to copy
+ * @param offset the index in the buffer of the first character to copy
+ * @param length the number of characters to copy
+ */
+ public void setTermBuffer(String buffer, int offset, int length) {
+ assert offset <= buffer.length();
+ assert offset + length <= buffer.length();
+ char[] newCharBuffer = growTermBuffer(length);
+ if (newCharBuffer != null) {
+ termBuffer = newCharBuffer;
+ }
+ buffer.getChars(offset, offset + length, termBuffer, 0);
+ termLength = length;
+ }
+
+ /** Returns the internal termBuffer character array which
+ * you can then directly alter. If the array is too
+ * small for your token, use {@link
+ * #resizeTermBuffer(int)} to increase it. After
+ * altering the buffer be sure to call {@link
+ * #setTermLength} to record the number of valid
+ * characters that were placed into the termBuffer. */
+ public char[] termBuffer() {
+ initTermBuffer();
+ return termBuffer;
+ }
+
+ /** Grows the termBuffer to at least size newSize, preserving the
+ * existing content. Note: If the next operation is to change
+ * the contents of the term buffer use
+ * {@link #setTermBuffer(char[], int, int)},
+ * {@link #setTermBuffer(String)}, or
+ * {@link #setTermBuffer(String, int, int)}
+ * to optimally combine the resize with the setting of the termBuffer.
+ * @param newSize minimum size of the new termBuffer
+ * @return newly created termBuffer with length >= newSize
+ */
+ public char[] resizeTermBuffer(int newSize) {
+ char[] newCharBuffer = growTermBuffer(newSize);
+ if (termBuffer == null) {
+ // there was no existing buffer, so there is nothing to preserve
+ // note that if termBuffer is null then newCharBuffer cannot be null
+ assert newCharBuffer != null;
+ termBuffer = newCharBuffer;
+ } else if (newCharBuffer != null) {
+ // Note: if newCharBuffer != null then termBuffer needs to grow.
+ // If there were a termBuffer, then preserve it
+ System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length);
+ termBuffer = newCharBuffer;
+ }
+ return termBuffer;
+ }
+
+ /** Allocates a buffer char[] of at least newSize
+ * @param newSize minimum size of the buffer
+ * @return newly created buffer with length >= newSize or null if the current termBuffer is big enough
+ */
+ private char[] growTermBuffer(int newSize) {
+ if (termBuffer != null) {
+ if (termBuffer.length >= newSize)
+ // Already big enough
+ return null;
+ else
+ // Not big enough; create a new array with slight
+ // over allocation:
+ return new char[ArrayUtil.getNextSize(newSize)];
+ } else {
+
+ // determine the best size
+ // The buffer is always at least MIN_BUFFER_SIZE
+ if (newSize < MIN_BUFFER_SIZE) {
+ newSize = MIN_BUFFER_SIZE;
+ }
+
+ return new char[newSize];
+ }
+ }
+
+ // TODO: once we remove the deprecated termText() method
+ // and switch entirely to char[] termBuffer we don't need
+ // to use this method anymore
+ private void initTermBuffer() {
+ if (termBuffer == null) {
+ termBuffer = new char[MIN_BUFFER_SIZE];
+ termLength = 0;
+ }
+ }
+
+ /** Return number of valid characters (length of the term)
+ * in the termBuffer array. */
+ public int termLength() {
+ initTermBuffer();
+ return termLength;
+ }
+
+ /** Set number of valid characters (length of the term) in
+ * the termBuffer array. Use this to truncate the termBuffer
+ * or to synchronize with external manipulation of the termBuffer.
+ * Note: to grow the size of the array,
+ * use {@link #resizeTermBuffer(int)} first.
+ * @param length the truncated length
+ */
+ public void setTermLength(int length) {
+ initTermBuffer();
+ if (length > termBuffer.length)
+ throw new IllegalArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.length + ")");
+ termLength = length;
+ }
+
+ public int hashCode() {
+ initTermBuffer();
+ int code = termLength;
+ code = code * 31 + ArrayUtil.hashCode(termBuffer, 0, termLength);
+ return code;
+ }
+
+ public void clear() {
+ termLength = 0;
+ }
+
+ public Object clone() {
+ TermAttribute t = (TermAttribute)super.clone();
+ // Do a deep clone
+ if (termBuffer != null) {
+ t.termBuffer = (char[]) termBuffer.clone();
+ }
+ return t;
+ }
+
+ public boolean equals(Object other) {
+ if (other == this) {
+ return true;
+ }
+
+ if (other instanceof TermAttribute) {
+ initTermBuffer();
+ TermAttribute o = ((TermAttribute) other);
+ o.initTermBuffer();
+
+ if (termLength != o.termLength) {
+ return false;
+ }
+
+ for(int i=0;i<termLength;i++) {
+ if (termBuffer[i] != o.termBuffer[i]) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ return false;
+ }
+
+ public String toString() {
+ initTermBuffer();
+ return "term=" + new String(termBuffer, 0, termLength);
+ }
+
+ public void copyTo(Attribute target) {
+ TermAttribute t = (TermAttribute) target;
+ t.setTermBuffer(termBuffer, 0, termLength);
+ }
+}
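
A hedged fragment showing the intended char[] workflow (from inside a hypothetical filter that already holds a termAtt reference): alter the buffer in place, resize only when needed, and record the valid length afterwards:

    char[] buffer = termAtt.termBuffer();
    int length = termAtt.termLength();
    if (length + 1 > buffer.length) {
      buffer = termAtt.resizeTermBuffer(length + 1);  // grow, preserving content
    }
    buffer[length] = '!';                // hypothetical edit: append a character
    termAtt.setTermLength(length + 1);   // record the new number of valid chars
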
Index: src/java/org/apache/lucene/analysis/tokenattributes/TypeAttribute.java
===================================================================
--- src/java/org/apache/lucene/analysis/tokenattributes/TypeAttribute.java (revision 0)
+++ src/java/org/apache/lucene/analysis/tokenattributes/TypeAttribute.java (revision 0)
@@ -0,0 +1,86 @@
+package org.apache.lucene.analysis.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Serializable;
+
+/**
+ * A Token's lexical type. The Default value is "word".
+ */
+public class TypeAttribute extends Attribute implements Cloneable, Serializable {
+ private String type;
+ public static final String DEFAULT_TYPE = "word";
+
+ public TypeAttribute() {
+ this(DEFAULT_TYPE);
+ }
+
+ public TypeAttribute(String type) {
+ this.type = type;
+ }
+
+ /** Returns this Token's lexical type. Defaults to "word". */
+ public String type() {
+ return type;
+ }
+
+ /** Set the lexical type.
+ @see #type() */
+ public void setType(String type) {
+ this.type = type;
+ }
+
+ public void clear() {
+ type = DEFAULT_TYPE;
+ }
+
+ public String toString() {
+ return "type=" + type;
+ }
+
+ public boolean equals(Object other) {
+ if (other == this) {
+ return true;
+ }
+
+ if (other instanceof TypeAttribute) {
+ return type.equals(((TypeAttribute) other).type);
+ }
+
+ return false;
+ }
+
+ public int hashCode() {
+ return type.hashCode();
+ }
+
+ public void copyTo(Attribute target) {
+ TypeAttribute t = (TypeAttribute) target;
+ t.setType(type); // String is immutable, no defensive copy needed
+ }
+}
Index: src/java/org/apache/lucene/analysis/TokenFilter.java
===================================================================
--- src/java/org/apache/lucene/analysis/TokenFilter.java (revision 708658)
+++ src/java/org/apache/lucene/analysis/TokenFilter.java (working copy)
@@ -22,9 +22,12 @@
/** A TokenFilter is a TokenStream whose input is another token stream.
<p>
This is an abstract class.
- NOTE: subclasses must override {@link #next(Token)}. It's
- also OK to instead override {@link #next()} but that
- method is now deprecated in favor of {@link #next(Token)}.
+ NOTE: subclasses must override {@link #initialize()} and
+ {@link #incrementToken()} if the new TokenStream API is used,
+ or {@link #next(Token)} / {@link #next()} if the old
+ TokenStream API is used.
+ <p>
+ See {@link TokenStream}
*/
public abstract class TokenFilter extends TokenStream {
/** The source of tokens for this filter. */
@@ -33,8 +36,14 @@
/** Construct a token stream filtering the given input. */
protected TokenFilter(TokenStream input) {
this.input = input;
+ this.attributes = input.attributes;
}
-
+
+ public final void start() throws IOException {
+ input.start();
+ initialize();
+ }
+
/** Close the input TokenStream. */
public void close() throws IOException {
input.close();
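
A sketch of a minimal new-API filter (hypothetical example, not part of this patch): the attribute reference is cached once in initialize(), which the final start() above invokes after starting the input, and incrementToken() then mutates the shared attribute in place:

    import java.io.IOException;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public final class TrailingSFilter extends TokenFilter {
      private TermAttribute termAtt;

      public TrailingSFilter(TokenStream input) {
        super(input);
      }

      public void initialize() throws IOException {
        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
      }

      public boolean incrementToken() throws IOException {
        if (!input.incrementToken())
          return false;
        int len = termAtt.termLength();
        // hypothetical transformation: strip a trailing 's'
        if (len > 1 && termAtt.termBuffer()[len - 1] == 's')
          termAtt.setTermLength(len - 1);
        return true;
      }
    }
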
Index: src/java/org/apache/lucene/analysis/TokenStream.java
===================================================================
--- src/java/org/apache/lucene/analysis/TokenStream.java (revision 708658)
+++ src/java/org/apache/lucene/analysis/TokenStream.java (working copy)
@@ -17,10 +17,12 @@
* limitations under the License.
*/
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.lucene.analysis.tokenattributes.Attribute;
import org.apache.lucene.index.Payload;
-import java.io.IOException;
-
/** A TokenStream enumerates the sequence of tokens, either from
fields of a document or from query text.
<p>
@@ -31,13 +33,99 @@
<li>{@link TokenFilter}, a TokenStream
whose input is another TokenStream.
</ul>
- NOTE: subclasses must override {@link #next(Token)}. It's
- also OK to instead override {@link #next()} but that
- method is now deprecated in favor of {@link #next(Token)}.
+ A new TokenStream API was introduced with Lucene 2.9. Since
+ 2.9, Token is deprecated and the preferred way to store
+ the information of a token is to use {@link Attribute}s.
+ <p>
+ For that reason, TokenStream now extends {@link AttributeSource}.
+ Note that only one instance per {@link Attribute} is
+ created and reused for every token. This approach reduces
+ object creation and allows local caching of references to
+ the {@link Attribute}s. See {@link #initialize()} and
+ {@link #incrementToken()} for further details.
+ <p>
+ Sometimes it is desirable to capture the current state of a
+ TokenStream, e.g. for buffering purposes (see {@link CachingTokenFilter},
+ {@link TeeTokenFilter}/{@link SinkTokenizer}). For this use case
+ the class {@link TokenStreamState} can be used.
+ <p>
+ <b>NOTE:</b> In order to enable the new API the method
+ {@link #setUseNewAPI(boolean)} has to be called with useNewAPI=true.
+ Otherwise the deprecated method {@link #next(Token)} will
+ be used by Lucene consumers (indexer and queryparser) to
+ consume the tokens. {@link #next(Token)} will be removed
+ in Lucene 3.0.
+ <p>
+ NOTE: To use the old API subclasses must override {@link #next(Token)}.
+ It's also OK to instead override {@link #next()} but that
+ method is slower compared to {@link #next(Token)}.
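+ <p>
+ A minimal sketch of a stream implementing the new API (the class name
+ and the emitted token text are illustrative only):
+ <pre>
+   public class SingleTokenStream extends TokenStream {
+     private TermAttribute termAtt;
+     private boolean done = false;
+
+     public void initialize() throws IOException {
+       // cache the attribute reference once per stream
+       termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+     }
+
+     public boolean incrementToken() throws IOException {
+       if (done) return false;
+       done = true;
+       termAtt.setTermBuffer("example"); // emit a single token
+       return true;
+     }
+   }
+ </pre>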
*/
-public abstract class TokenStream {
+public abstract class TokenStream extends AttributeSource {
+ private static boolean useNewAPI = false;
+
+ /**
+ * Returns whether or not the new TokenStream APIs are used
+ * (see {@link #incrementToken()}, {@link AttributeSource}).
+ */
+ public static boolean useNewAPI() {
+ return useNewAPI;
+ }
+ /**
+ * Use this method to enable or disable the new TokenStream API
+ * (see {@link #incrementToken()}, {@link AttributeSource}).
+ * <p>
+ * If set to true, the indexer will call {@link #start()}
+ * and {@link #incrementToken()} to consume Tokens from this
+ * stream.
+ * <p>
+ * If set to false, the indexer will call {@link #next(Token)}
+ * instead.
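+ * <p>
+ * For example (illustrative; the flag is global, as it is static):
+ * <pre>
+ *   TokenStream.setUseNewAPI(true); // opt in to the new API before indexing
+ * </pre>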
+ */
+ public static void setUseNewAPI(boolean use) {
+ useNewAPI = use;
+ }
+
+ /**
+ * Consumers of the stream must call this method before calling
+ * {@link #incrementToken()} for the first time to initialize
+ * this stream.
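+ * <p>
+ * A typical consumption loop (sketch; assumes the stream provides a
+ * TermAttribute):
+ * <pre>
+ *   stream.start();
+ *   TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
+ *   while (stream.incrementToken()) {
+ *     // the attribute instance now holds the values of the current token
+ *     System.out.println(termAtt.term());
+ *   }
+ *   stream.close();
+ * </pre>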
+ */
+ public void start() throws IOException {
+ initialize();
+ }
+
+ /**
+ * This method does nothing by default. Subclasses should
+ * implement this method and call {@link #addAttribute(Class)} or
+ * {@link #getAttribute(Class)} to store local references to
+ * attributes. See {@link #incrementToken()} for more information.
+ */
+ public void initialize() throws IOException {}
+
+ /**
+ * Consumers (e.g. the indexer) use this method to advance the stream
+ * to the next token. Subclasses must implement this method and
+ * update the appropriate {@link Attribute}s with the content of the
+ * next token.
+ * <p>
+ * This method is called for every token of a document, so an efficient
+ * implementation is crucial for good performance. To avoid calls to
+ * {@link #addAttribute(Class)} and {@link #getAttribute(Class)} and
+ * downcasts, references to all {@link Attribute}s that this stream uses
+ * should be cached in {@link #initialize()}.
+ *
+ * <p>
+ * <b>Note that this method will be declared abstract in Lucene 3.0.</b>
+ *
+ * @return false for end of stream; true otherwise
+ */
+ public boolean incrementToken() throws IOException {
+ // subclasses must implement this method; will be made abstract in Lucene 3.0
+ return false;
+ }
+
/** Returns the next token in the stream, or null at EOS.
* @deprecated The returned Token is a "full private copy" (not
* re-used across calls to next()) but will be slower
@@ -84,6 +172,8 @@
* is not required to check for null before using it, but it is a
* good idea to assert that it is not null.)
* @return next token in the stream or null if end-of-stream was hit
+ * @deprecated The new {@link #incrementToken()} and {@link AttributeSource}
+ * APIs should be used instead. See also {@link #useNewAPI()}.
*/
public Token next(final Token reusableToken) throws IOException {
// We don't actually use inputToken, but still add this assert
@@ -107,4 +197,25 @@
/** Releases resources associated with this stream. */
public void close() throws IOException {}
+
+ public String toString() {
+ StringBuffer sb = new StringBuffer();
+ sb.append('(');
+
+ if (hasAttributes()) {
+ // TODO Java 1.5
+ //Iterator<Attribute> it = attributes.values().iterator();
+ Iterator it = getAttributesIterator();
+ if (it.hasNext()) {
+ sb.append(it.next().toString());
+ }
+ while (it.hasNext()) {
+ sb.append(',');
+ sb.append(it.next().toString());
+ }
+ }
+ sb.append(')');
+ return sb.toString();
+ }
+
}
Index: src/java/org/apache/lucene/analysis/TokenStreamState.java
===================================================================
--- src/java/org/apache/lucene/analysis/TokenStreamState.java (revision 0)
+++ src/java/org/apache/lucene/analysis/TokenStreamState.java (revision 0)
@@ -0,0 +1,86 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Iterator;
+
+import org.apache.lucene.analysis.tokenattributes.Attribute;
+
+/**
+ * This class can be used to capture a certain state of a {@link TokenStream}.
+ * This is useful for buffering use cases (see {@link CachingTokenFilter},
+ * {@link TeeTokenFilter}/{@link SinkTokenizer}).
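+ * <p>
+ * A short usage sketch:
+ * <pre>
+ *   // remember the attribute values of the current token ...
+ *   TokenStreamState state = TokenStreamState.capture(stream);
+ *   // ... advance the stream, then write the captured values back
+ *   state.restore(stream);
+ * </pre>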
+ */
+public class TokenStreamState extends AttributeSource {
+ private TokenStreamState(TokenStream input, AttributeAcceptor acceptor) {
+ Iterator it = input.getAttributesIterator();
+ while(it.hasNext()) {
+ Attribute att = (Attribute) it.next();
+ if (acceptor.accept(att.getClass())) {
+ Attribute clone = (Attribute) att.clone();
+ this.attributes.put(att.getClass(), clone);
+ }
+ }
+ }
+
+ public TokenStreamState() {}
+
+ /**
+ * Captures the current state of the passed in TokenStream.
+ * <p>
+ * This state will contain all of the passed in TokenStream's
+ * {@link Attribute}s. If only a subset of the attributes is needed,
+ * please use {@link #capture(TokenStream, AttributeAcceptor)}.
+ */
+ public static TokenStreamState capture(TokenStream from) {
+ return new TokenStreamState(from, AllAcceptor);
+ }
+
+ /**
+ * Captures the current state of the passed in TokenStream.
+ * <p>
+ * This state will contain all of the passed in TokenStream's
+ * {@link Attribute}s which the {@link AttributeAcceptor} accepts.
+ */
+ public static TokenStreamState capture(TokenStream from, AttributeAcceptor acceptor) {
+ return new TokenStreamState(from, acceptor);
+ }
+
+ /**
+ * Restores this state by copying the values of all attributes
+ * that this state contains into the attributes of the targetStream.
+ * The targetStream must contain a corresponding instance for each attribute
+ * contained in this state.
+ * <p>
+ * Note that this method does not affect attributes of the targetStream
+ * that are not contained in this state. In other words, if for example
+ * the targetStream contains an OffsetAttribute, but this state doesn't, then
+ * the value of the OffsetAttribute remains unchanged. It might be desirable to
+ * reset its value to the default, in which case the caller should first
+ * call {@link TokenStream#clearAttributes()} on the targetStream.
+ */
+ public void restore(TokenStream targetStream) {
+ Iterator it = getAttributesIterator();
+ while (it.hasNext()) {
+ Attribute att = (Attribute) it.next();
+ Attribute targetAtt = targetStream.getAttribute(att.getClass());
+ att.copyTo(targetAtt);
+ }
+ }
+
+}
Property changes on: src\java\org\apache\lucene\analysis\TokenStreamState.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: src/java/org/apache/lucene/index/DocInverter.java
===================================================================
--- src/java/org/apache/lucene/index/DocInverter.java (revision 708658)
+++ src/java/org/apache/lucene/index/DocInverter.java (working copy)
@@ -17,13 +17,15 @@
* limitations under the License.
*/
-import java.util.Map;
+import java.io.IOException;
+import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
-import java.util.Collection;
import java.util.Iterator;
-import java.io.IOException;
+import java.util.Map;
+import org.apache.lucene.analysis.AttributeSource;
+
/** This is a DocFieldConsumer that inverts each field,
* separately, from a Document, and accepts a
* InvertedTermsConsumer to process those terms. */
@@ -98,12 +100,14 @@
int length;
int offset;
float boost;
+ AttributeSource attributeSource;
void reset(float docBoost) {
position = 0;
length = 0;
offset = 0;
boost = docBoost;
+ attributeSource = null;
}
}
}
Index: src/java/org/apache/lucene/index/DocInverterPerField.java
===================================================================
--- src/java/org/apache/lucene/index/DocInverterPerField.java (revision 708658)
+++ src/java/org/apache/lucene/index/DocInverterPerField.java (working copy)
@@ -22,6 +22,8 @@
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/**
* Holds state for inverting all occurrences of a single
@@ -79,10 +81,14 @@
if (!field.isTokenized()) { // un-tokenized field
String stringValue = field.stringValue();
final int valueLength = stringValue.length();
- Token token = perThread.localToken.reinit(stringValue, fieldState.offset, fieldState.offset + valueLength);
+ perThread.localToken.reinit(stringValue, fieldState.offset, fieldState.offset + valueLength);
+ fieldState.attributeSource = perThread.localTokenStream;
+ perThread.localTokenStream.set(perThread.localToken);
+ perThread.localTokenStream.start();
+ consumer.start(field);
boolean success = false;
try {
- consumer.add(token);
+ consumer.add();
success = true;
} finally {
if (!success)
@@ -122,7 +128,30 @@
try {
int offsetEnd = fieldState.offset-1;
- final Token localToken = perThread.localToken;
+
+ boolean useNewTokenStreamAPI = TokenStream.useNewAPI();
+ Token localToken = null;
+ OffsetAttribute offsetAttribute = null;
+ PositionIncrementAttribute posIncrAttribute = null;
+
+ if (useNewTokenStreamAPI) {
+ fieldState.attributeSource = stream;
+ stream.start();
+ } else {
+ fieldState.attributeSource = perThread.localTokenStream;
+ localToken = perThread.localToken;
+ perThread.localTokenStream.start();
+ }
+
+ consumer.start(field);
+
+ if (fieldState.attributeSource.hasAttribute(OffsetAttribute.class)) {
+ offsetAttribute = (OffsetAttribute) fieldState.attributeSource.getAttribute(OffsetAttribute.class);
+ }
+ if (fieldState.attributeSource.hasAttribute(PositionIncrementAttribute.class)) {
+ posIncrAttribute = (PositionIncrementAttribute) fieldState.attributeSource.getAttribute(PositionIncrementAttribute.class);
+ }
+
for(;;) {
// If we hit an exception in stream.next below
@@ -131,10 +160,26 @@
// non-aborting and (above) this one document
// will be marked as deleted, but still
// consume a docID
- Token token = stream.next(localToken);
+ Token token = null;
+ if (useNewTokenStreamAPI) {
+ if (!stream.incrementToken()) break;
+ } else {
+ token = stream.next(localToken);
+ if (token == null) break;
+ perThread.localTokenStream.set(token);
+ }
+
+ int positionIncrement = 1;
+ int endOffset = 0;
- if (token == null) break;
- fieldState.position += (token.getPositionIncrement() - 1);
+ if (posIncrAttribute != null) {
+ positionIncrement = posIncrAttribute.getPositionIncrement();
+ }
+ if (offsetAttribute != null) {
+ endOffset = offsetAttribute.endOffset();
+ }
+
+ fieldState.position += (positionIncrement - 1);
boolean success = false;
try {
// If we hit an exception in here, we abort
@@ -143,14 +188,14 @@
// internal state of the consumer is now
// corrupt and should not be flushed to a
// new segment:
- consumer.add(token);
+ consumer.add();
success = true;
} finally {
if (!success)
docState.docWriter.setAborting();
}
fieldState.position++;
- offsetEnd = fieldState.offset + token.endOffset();
+ offsetEnd = fieldState.offset + endOffset;
if (++fieldState.length >= maxFieldLength) {
if (docState.infoStream != null)
docState.infoStream.println("maxFieldLength " +maxFieldLength+ " reached for field " + fieldInfo.name + ", ignoring following tokens");
Index: src/java/org/apache/lucene/index/DocInverterPerThread.java
===================================================================
--- src/java/org/apache/lucene/index/DocInverterPerThread.java (revision 708658)
+++ src/java/org/apache/lucene/index/DocInverterPerThread.java (working copy)
@@ -18,8 +18,19 @@
*/
import java.io.IOException;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import org.apache.lucene.analysis.AttributeSource;
import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.Attribute;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
/** This is a DocFieldConsumer that inverts each field,
* separately, from a Document, and accepts a
@@ -30,6 +41,103 @@
final InvertedDocConsumerPerThread consumer;
final InvertedDocEndConsumerPerThread endConsumer;
final Token localToken = new Token();
+ final BackwardsCompatibilityStream localTokenStream = new BackwardsCompatibilityStream();
+
+ /** This stream wrapper is only used to maintain backwards compatibility with the
+ * old TokenStream API and can be removed in Lucene 3.0
+ * @deprecated
+ */
+ static class BackwardsCompatibilityStream extends TokenStream {
+ private Token token;
+
+ TermAttribute termAttribute = new TermAttribute() {
+ public String term() {
+ return token.term();
+ }
+
+ public char[] termBuffer() {
+ return token.termBuffer();
+ }
+
+ public int termLength() {
+ return token.termLength();
+ }
+ };
+ OffsetAttribute offsetAttribute = new OffsetAttribute() {
+ public int startOffset() {
+ return token.startOffset();
+ }
+
+ public int endOffset() {
+ return token.endOffset();
+ }
+ };
+
+ PositionIncrementAttribute positionIncrementAttribute = new PositionIncrementAttribute() {
+ public int getPositionIncrement() {
+ return token.getPositionIncrement();
+ }
+ };
+
+ FlagsAttribute flagsAttribute = new FlagsAttribute() {
+ public int getFlags() {
+ return token.getFlags();
+ }
+ };
+
+ PayloadAttribute payloadAttribute = new PayloadAttribute() {
+ public Payload getPayload() {
+ return token.getPayload();
+ }
+ };
+
+ TypeAttribute typeAttribute = new TypeAttribute() {
+ public String type() {
+ return token.type();
+ }
+ };
+
+ BackwardsCompatibilityStream() {
+ attributes.put(TermAttribute.class, termAttribute);
+ attributes.put(OffsetAttribute.class, offsetAttribute);
+ attributes.put(PositionIncrementAttribute.class, positionIncrementAttribute);
+ attributes.put(FlagsAttribute.class, flagsAttribute);
+ attributes.put(PayloadAttribute.class, payloadAttribute);
+ attributes.put(TypeAttribute.class, typeAttribute);
+ }
+
+ public Attribute addAttribute(Class attClass) {
+ Attribute att = (Attribute) attributes.get(attClass);
+ if (att == null) {
+ return super.addAttribute(attClass);
+ }
+ return att;
+ }
+
+ public boolean hasAttribute(Class attClass) {
+ return this.attributes.containsKey(attClass) || super.hasAttribute(attClass);
+ }
+
+ public Attribute getAttribute(Class attClass) {
+ Attribute att = (Attribute) this.attributes.get(attClass);
+ if (att == null) {
+ return super.getAttribute(attClass);
+ }
+
+ return att;
+ }
+
+ public AttributeSource reinit(String stringValue, int startOffset, int endOffset) {
+ termAttribute.setTermBuffer(stringValue);
+ offsetAttribute.setStartOffset(startOffset);
+ offsetAttribute.setEndOffset(endOffset);
+ return this;
+ }
+
+ public void set(Token token) {
+ this.token = token;
+ }
+ }
final DocumentsWriter.DocState docState;
final DocInverter.FieldInvertState fieldState = new DocInverter.FieldInvertState();
Index: src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
===================================================================
--- src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java (revision 708658)
+++ src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java (working copy)
@@ -19,7 +19,7 @@
import java.io.IOException;
import org.apache.lucene.document.Fieldable;
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
// TODO: break into separate freq and prox writers as
// codecs; make separate container (tii/tis/skip/*) that can
@@ -32,6 +32,8 @@
final DocumentsWriter.DocState docState;
final DocInverter.FieldInvertState fieldState;
boolean omitTf;
+
+ PayloadAttribute payloadAttribute;
public FreqProxTermsWriterPerField(TermsHashPerField termsHashPerField, FreqProxTermsWriterPerThread perThread, FieldInfo fieldInfo) {
this.termsHashPerField = termsHashPerField;
@@ -53,7 +55,7 @@
boolean hasPayloads;
- void skippingLongTerm(Token t) throws IOException {}
+ void skippingLongTerm() throws IOException {}
public int compareTo(Object other0) {
FreqProxTermsWriterPerField other = (FreqProxTermsWriterPerField) other0;
@@ -64,6 +66,7 @@
// Record, up front, whether our in-RAM format will be
// with or without term freqs:
omitTf = fieldInfo.omitTf;
+ payloadAttribute = null;
}
boolean start(Fieldable[] fields, int count) {
@@ -72,9 +75,23 @@
return true;
return false;
}
+
+ void start(Fieldable f) {
+ if (fieldState.attributeSource.hasAttribute(PayloadAttribute.class)) {
+ payloadAttribute = (PayloadAttribute) fieldState.attributeSource.getAttribute(PayloadAttribute.class);
+ } else {
+ payloadAttribute = null;
+ }
+ }
- final void writeProx(Token t, FreqProxTermsWriter.PostingList p, int proxCode) {
- final Payload payload = t.getPayload();
+ final void writeProx(FreqProxTermsWriter.PostingList p, int proxCode) {
+ final Payload payload;
+ if (payloadAttribute == null) {
+ payload = null;
+ } else {
+ payload = payloadAttribute.getPayload();
+ }
+
if (payload != null && payload.length > 0) {
termsHashPerField.writeVInt(1, (proxCode<<1)|1);
termsHashPerField.writeVInt(1, payload.length);
@@ -85,7 +102,7 @@
p.lastPosition = fieldState.position;
}
- final void newTerm(Token t, RawPostingList p0) {
+ final void newTerm(RawPostingList p0) {
// First time we're seeing this term since the last
// flush
assert docState.testPoint("FreqProxTermsWriterPerField.newTerm start");
@@ -96,11 +113,11 @@
} else {
p.lastDocCode = docState.docID << 1;
p.docFreq = 1;
- writeProx(t, p, fieldState.position);
+ writeProx(p, fieldState.position);
}
}
- final void addTerm(Token t, RawPostingList p0) {
+ final void addTerm(RawPostingList p0) {
assert docState.testPoint("FreqProxTermsWriterPerField.addTerm start");
@@ -132,10 +149,10 @@
p.docFreq = 1;
p.lastDocCode = (docState.docID - p.lastDocID) << 1;
p.lastDocID = docState.docID;
- writeProx(t, p, fieldState.position);
+ writeProx(p, fieldState.position);
} else {
p.docFreq++;
- writeProx(t, p, fieldState.position-p.lastPosition);
+ writeProx(p, fieldState.position-p.lastPosition);
}
}
}
Index: src/java/org/apache/lucene/index/InvertedDocConsumerPerField.java
===================================================================
--- src/java/org/apache/lucene/index/InvertedDocConsumerPerField.java (revision 708658)
+++ src/java/org/apache/lucene/index/InvertedDocConsumerPerField.java (working copy)
@@ -17,10 +17,10 @@
* limitations under the License.
*/
-import org.apache.lucene.document.Fieldable;
-import org.apache.lucene.analysis.Token;
import java.io.IOException;
+import org.apache.lucene.document.Fieldable;
+
abstract class InvertedDocConsumerPerField {
// Called once per field, and is given all Fieldable
@@ -29,8 +29,11 @@
// fields:
abstract boolean start(Fieldable[] fields, int count) throws IOException;
+ // Called before a field instance is being processed
+ abstract void start(Fieldable field);
+
// Called once per inverted token
- abstract void add(Token token) throws IOException;
+ abstract void add() throws IOException;
// Called once per field per document, after all Fieldable
// occurrences are inverted
Index: src/java/org/apache/lucene/index/Payload.java
===================================================================
--- src/java/org/apache/lucene/index/Payload.java (revision 708658)
+++ src/java/org/apache/lucene/index/Payload.java (working copy)
@@ -19,7 +19,6 @@
import java.io.Serializable;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.ArrayUtil;
@@ -29,7 +28,7 @@
* specific term.
* <p>
* To store payloads in the index a {@link TokenStream} has to be used that
- * produces {@link Token}s containing payload data.
+ * produces payload data.
* <p>
* Use {@link TermPositions#getPayloadLength()} and {@link TermPositions#getPayload(byte[], int)}
* to retrieve the payloads from the index.<br>
Index: src/java/org/apache/lucene/index/TermsHashConsumerPerField.java
===================================================================
--- src/java/org/apache/lucene/index/TermsHashConsumerPerField.java (revision 708658)
+++ src/java/org/apache/lucene/index/TermsHashConsumerPerField.java (working copy)
@@ -23,14 +23,15 @@
* multiple streams for each unique Token. */
import java.io.IOException;
+
import org.apache.lucene.document.Fieldable;
-import org.apache.lucene.analysis.Token;
abstract class TermsHashConsumerPerField {
abstract boolean start(Fieldable[] fields, int count) throws IOException;
abstract void finish() throws IOException;
- abstract void skippingLongTerm(Token t) throws IOException;
- abstract void newTerm(Token t, RawPostingList p) throws IOException;
- abstract void addTerm(Token t, RawPostingList p) throws IOException;
+ abstract void skippingLongTerm() throws IOException;
+ abstract void start(Fieldable field);
+ abstract void newTerm(RawPostingList p) throws IOException;
+ abstract void addTerm(RawPostingList p) throws IOException;
abstract int getStreamCount();
}
Index: src/java/org/apache/lucene/index/TermsHashPerField.java
===================================================================
--- src/java/org/apache/lucene/index/TermsHashPerField.java (revision 708658)
+++ src/java/org/apache/lucene/index/TermsHashPerField.java (working copy)
@@ -20,8 +20,8 @@
import java.io.IOException;
import java.util.Arrays;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Fieldable;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.util.UnicodeUtil;
final class TermsHashPerField extends InvertedDocConsumerPerField {
@@ -31,6 +31,8 @@
final TermsHashPerThread perThread;
final DocumentsWriter.DocState docState;
final DocInverter.FieldInvertState fieldState;
+
+ TermAttribute termAtt;
// Copied from our perThread
final CharBlockPool charPool;
@@ -49,7 +51,7 @@
private int postingsHashMask = postingsHashSize-1;
private RawPostingList[] postingsHash = new RawPostingList[postingsHashSize];
private RawPostingList p;
-
+
public TermsHashPerField(DocInverterPerField docInverterPerField, final TermsHashPerThread perThread, final TermsHashPerThread nextPerThread, final FieldInfo fieldInfo) {
this.perThread = perThread;
intPool = perThread.intPool;
@@ -247,6 +249,14 @@
private boolean doCall;
private boolean doNextCall;
+ void start(Fieldable f) {
+ termAtt = (TermAttribute) fieldState.attributeSource.addAttribute(TermAttribute.class);
+ consumer.start(f);
+ if (nextPerField != null) {
+ nextPerField.start(f);
+ }
+ }
+
boolean start(Fieldable[] fields, int count) throws IOException {
doCall = consumer.start(fields, count);
if (nextPerField != null)
@@ -257,7 +267,7 @@
// Secondary entry point (for 2nd & subsequent TermsHash),
// because token text has already been "interned" into
// textStart, so we hash by textStart
- public void add(Token token, int textStart) throws IOException {
+ public void add(int textStart) throws IOException {
int code = textStart;
@@ -320,17 +330,17 @@
}
p.byteStart = intUptos[intUptoStart];
- consumer.newTerm(token, p);
+ consumer.newTerm(p);
} else {
intUptos = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
- consumer.addTerm(token, p);
+ consumer.addTerm(p);
}
}
// Primary entry point (for first TermsHash)
- void add(Token token) throws IOException {
+ void add() throws IOException {
assert !postingsCompacted;
@@ -338,8 +348,8 @@
// term text into textStart address
// Get the text of this term.
- final char[] tokenText = token.termBuffer();
- final int tokenTextLen = token.termLength();
+ final char[] tokenText = termAtt.termBuffer();
+ final int tokenTextLen = termAtt.termLength();
// Compute hashcode & replace any invalid UTF16 sequences
int downto = tokenTextLen;
@@ -403,7 +413,7 @@
if (docState.maxTermPrefix == null)
docState.maxTermPrefix = new String(tokenText, 0, 30);
- consumer.skippingLongTerm(token);
+ consumer.skippingLongTerm();
return;
}
charPool.nextBuffer();
@@ -450,16 +460,16 @@
}
p.byteStart = intUptos[intUptoStart];
- consumer.newTerm(token, p);
+ consumer.newTerm(p);
} else {
intUptos = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
- consumer.addTerm(token, p);
+ consumer.addTerm(p);
}
if (doNextCall)
- nextPerField.add(token, p.textStart);
+ nextPerField.add(p.textStart);
}
int[] intUptos;
Index: src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
===================================================================
--- src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (revision 708658)
+++ src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (working copy)
@@ -18,10 +18,11 @@
*/
import java.io.IOException;
-import org.apache.lucene.util.UnicodeUtil;
-import org.apache.lucene.analysis.Token;
+
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.UnicodeUtil;
final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
@@ -37,7 +38,8 @@
boolean doVectorOffsets;
int maxNumPostings;
-
+ OffsetAttribute offsetAttribute = null;
+
public TermVectorsTermsWriterPerField(TermsHashPerField termsHashPerField, TermVectorsTermsWriterPerThread perThread, FieldInfo fieldInfo) {
this.termsHashPerField = termsHashPerField;
this.perThread = perThread;
@@ -191,8 +193,16 @@
termsHashPerField.shrinkHash(maxNumPostings);
maxNumPostings = 0;
}
+
+ void start(Fieldable f) {
+ if (doVectorOffsets && fieldState.attributeSource.hasAttribute(OffsetAttribute.class)) {
+ offsetAttribute = (OffsetAttribute) fieldState.attributeSource.getAttribute(OffsetAttribute.class);
+ } else {
+ offsetAttribute = null;
+ }
+ }
- void newTerm(Token t, RawPostingList p0) {
+ void newTerm(RawPostingList p0) {
assert docState.testPoint("TermVectorsTermsWriterPerField.newTerm start");
@@ -201,8 +211,9 @@
p.freq = 1;
if (doVectorOffsets) {
- final int startOffset = fieldState.offset + t.startOffset();
- final int endOffset = fieldState.offset + t.endOffset();
+ int startOffset = fieldState.offset + offsetAttribute.startOffset();
+ int endOffset = fieldState.offset + offsetAttribute.endOffset();
+
termsHashPerField.writeVInt(1, startOffset);
termsHashPerField.writeVInt(1, endOffset - startOffset);
p.lastOffset = endOffset;
@@ -214,7 +225,7 @@
}
}
- void addTerm(Token t, RawPostingList p0) {
+ void addTerm(RawPostingList p0) {
assert docState.testPoint("TermVectorsTermsWriterPerField.addTerm start");
@@ -222,8 +233,9 @@
p.freq++;
if (doVectorOffsets) {
- final int startOffset = fieldState.offset + t.startOffset();
- final int endOffset = fieldState.offset + t.endOffset();
+ int startOffset = fieldState.offset + offsetAttribute.startOffset();
+ int endOffset = fieldState.offset + offsetAttribute.endOffset();
+
termsHashPerField.writeVInt(1, startOffset - p.lastOffset);
termsHashPerField.writeVInt(1, endOffset - startOffset);
p.lastOffset = endOffset;
@@ -235,5 +247,5 @@
}
}
- void skippingLongTerm(Token t) {}
+ void skippingLongTerm() {}
}
Index: src/java/org/apache/lucene/queryParser/QueryParser.java
===================================================================
--- src/java/org/apache/lucene/queryParser/QueryParser.java (revision 708658)
+++ src/java/org/apache/lucene/queryParser/QueryParser.java (working copy)
@@ -3,8 +3,8 @@
import java.io.IOException;
import java.io.StringReader;
+import java.text.Collator;
import java.text.DateFormat;
-import java.text.Collator;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
@@ -15,7 +15,10 @@
import java.util.Vector;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.DateField;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.index.Term;
@@ -508,48 +511,126 @@
// PhraseQuery, or nothing based on the term count
TokenStream source = analyzer.tokenStream(field, new StringReader(queryText));
- List list = new ArrayList();
- final org.apache.lucene.analysis.Token reusableToken = new org.apache.lucene.analysis.Token();
- org.apache.lucene.analysis.Token nextToken;
- int positionCount = 0;
- boolean severalTokensAtSamePosition = false;
+ CachingTokenFilter buffer = new CachingTokenFilter(source);
+ TermAttribute termAtt = null;
+ PositionIncrementAttribute posIncrAtt = null;
+ int numTokens = 0;
- while (true) {
+ org.apache.lucene.analysis.Token reusableToken = null;
+ org.apache.lucene.analysis.Token nextToken = null;
+
+
+ boolean useNewAPI = TokenStream.useNewAPI();
+
+ if (useNewAPI) {
+ boolean success = false;
try {
- nextToken = source.next(reusableToken);
+ buffer.start();
+ success = true;
+ } catch (IOException e) {
+ // success==false if we hit an exception
}
- catch (IOException e) {
- nextToken = null;
+ if (success) {
+ if (buffer.hasAttribute(TermAttribute.class)) {
+ termAtt = (TermAttribute) buffer.getAttribute(TermAttribute.class);
+ }
+ if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
+ posIncrAtt = (PositionIncrementAttribute) buffer.getAttribute(PositionIncrementAttribute.class);
+ }
}
- if (nextToken == null)
- break;
- list.add(nextToken.clone());
- if (nextToken.getPositionIncrement() != 0)
- positionCount += nextToken.getPositionIncrement();
- else
- severalTokensAtSamePosition = true;
+ } else {
+ reusableToken = new org.apache.lucene.analysis.Token();
}
+
+ int positionCount = 0;
+ boolean severalTokensAtSamePosition = false;
+
+ if (useNewAPI) {
+ if (termAtt != null) {
+ try {
+ while (buffer.incrementToken()) {
+ numTokens++;
+ int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
+ if (positionIncrement != 0) {
+ positionCount += positionIncrement;
+ } else {
+ severalTokensAtSamePosition = true;
+ }
+ }
+ } catch (IOException e) {
+ // ignore
+ }
+ }
+ } else {
+ while (true) {
+ try {
+ nextToken = buffer.next(reusableToken);
+ }
+ catch (IOException e) {
+ nextToken = null;
+ }
+ if (nextToken == null)
+ break;
+ numTokens++;
+ if (nextToken.getPositionIncrement() != 0)
+ positionCount += nextToken.getPositionIncrement();
+ else
+ severalTokensAtSamePosition = true;
+ }
+ }
try {
+ // rewind the buffer stream
+ buffer.reset();
+
+ // close original stream - all tokens buffered
source.close();
}
catch (IOException e) {
// ignore
}
+
+ if (numTokens == 0)
+ return null;
+ else if (numTokens == 1) {
+ String term = null;
+ try {
- if (list.size() == 0)
- return null;
- else if (list.size() == 1) {
- nextToken = (org.apache.lucene.analysis.Token) list.get(0);
- return newTermQuery(new Term(field, nextToken.term()));
+ if (useNewAPI) {
+ boolean hasNext = buffer.incrementToken();
+ assert hasNext == true;
+ term = termAtt.term();
+ } else {
+ nextToken = buffer.next(reusableToken);
+ assert nextToken != null;
+ term = nextToken.term();
+ }
+ } catch (IOException e) {
+ // safe to ignore, because we know the number of tokens
+ }
+ return newTermQuery(new Term(field, term));
} else {
if (severalTokensAtSamePosition) {
if (positionCount == 1) {
// no phrase query:
BooleanQuery q = newBooleanQuery(true);
- for (int i = 0; i < list.size(); i++) {
- nextToken = (org.apache.lucene.analysis.Token) list.get(i);
+ for (int i = 0; i < numTokens; i++) {
+ String term = null;
+ try {
+ if (useNewAPI) {
+ boolean hasNext = buffer.incrementToken();
+ assert hasNext == true;
+ term = termAtt.term();
+ } else {
+ nextToken = buffer.next(reusableToken);
+ assert nextToken != null;
+ term = nextToken.term();
+ }
+ } catch (IOException e) {
+ // safe to ignore, because we know the number of tokens
+ }
+
Query currentQuery = newTermQuery(
- new Term(field, nextToken.term()));
+ new Term(field, term));
q.add(currentQuery, BooleanClause.Occur.SHOULD);
}
return q;
@@ -560,9 +641,28 @@
mpq.setSlop(phraseSlop);
List multiTerms = new ArrayList();
int position = -1;
- for (int i = 0; i < list.size(); i++) {
- nextToken = (org.apache.lucene.analysis.Token) list.get(i);
- if (nextToken.getPositionIncrement() > 0 && multiTerms.size() > 0) {
+ for (int i = 0; i < numTokens; i++) {
+ String term = null;
+ int positionIncrement = 1;
+ try {
+ if (useNewAPI) {
+ boolean hasNext = buffer.incrementToken();
+ assert hasNext == true;
+ term = termAtt.term();
+ if (posIncrAtt != null) {
+ positionIncrement = posIncrAtt.getPositionIncrement();
+ }
+ } else {
+ nextToken = buffer.next(reusableToken);
+ assert nextToken != null;
+ term = nextToken.term();
+ positionIncrement = nextToken.getPositionIncrement();
+ }
+ } catch (IOException e) {
+ // safe to ignore, because we know the number of tokens
+ }
+
+ if (positionIncrement > 0 && multiTerms.size() > 0) {
if (enablePositionIncrements) {
mpq.add((Term[])multiTerms.toArray(new Term[0]),position);
} else {
@@ -570,8 +670,8 @@
}
multiTerms.clear();
}
- position += nextToken.getPositionIncrement();
- multiTerms.add(new Term(field, nextToken.term()));
+ position += positionIncrement;
+ multiTerms.add(new Term(field, term));
}
if (enablePositionIncrements) {
mpq.add((Term[])multiTerms.toArray(new Term[0]),position);
@@ -585,13 +685,36 @@
PhraseQuery pq = newPhraseQuery();
pq.setSlop(phraseSlop);
int position = -1;
- for (int i = 0; i < list.size(); i++) {
- nextToken = (org.apache.lucene.analysis.Token) list.get(i);
+
+
+ for (int i = 0; i < numTokens; i++) {
+ String term = null;
+ int positionIncrement = 1;
+
+ try {
+ if (useNewAPI) {
+
+ boolean hasNext = buffer.incrementToken();
+ assert hasNext == true;
+ term = termAtt.term();
+ if (posIncrAtt != null) {
+ positionIncrement = posIncrAtt.getPositionIncrement();
+ }
+ } else {
+ nextToken = buffer.next(reusableToken);
+ assert nextToken != null;
+ term = nextToken.term();
+ positionIncrement = nextToken.getPositionIncrement();
+ }
+ } catch (IOException e) {
+ // safe to ignore, because we know the number of tokens
+ }
+
if (enablePositionIncrements) {
- position += nextToken.getPositionIncrement();
- pq.add(new Term(field, nextToken.term()),position);
+ position += positionIncrement;
+ pq.add(new Term(field, term),position);
} else {
- pq.add(new Term(field, nextToken.term()));
+ pq.add(new Term(field, term));
}
}
return pq;
Index: src/java/org/apache/lucene/queryParser/QueryParser.jj
===================================================================
--- src/java/org/apache/lucene/queryParser/QueryParser.jj (revision 708658)
+++ src/java/org/apache/lucene/queryParser/QueryParser.jj (working copy)
@@ -27,8 +27,8 @@
import java.io.IOException;
import java.io.StringReader;
+import java.text.Collator;
import java.text.DateFormat;
-import java.text.Collator;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
@@ -39,7 +39,10 @@
import java.util.Vector;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.DateField;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.index.Term;
@@ -535,48 +538,126 @@
// PhraseQuery, or nothing based on the term count
TokenStream source = analyzer.tokenStream(field, new StringReader(queryText));
- List list = new ArrayList();
- final org.apache.lucene.analysis.Token reusableToken = new org.apache.lucene.analysis.Token();
- org.apache.lucene.analysis.Token nextToken;
- int positionCount = 0;
- boolean severalTokensAtSamePosition = false;
+ CachingTokenFilter buffer = new CachingTokenFilter(source);
+ TermAttribute termAtt = null;
+ PositionIncrementAttribute posIncrAtt = null;
+ int numTokens = 0;
- while (true) {
+ org.apache.lucene.analysis.Token reusableToken = null;
+ org.apache.lucene.analysis.Token nextToken = null;
+
+
+ boolean useNewAPI = TokenStream.useNewAPI();
+
+ if (useNewAPI) {
+ boolean success = false;
try {
- nextToken = source.next(reusableToken);
+ buffer.start();
+ success = true;
+ } catch (IOException e) {
+ // success==false if we hit an exception
}
- catch (IOException e) {
- nextToken = null;
+ if (success) {
+ if (buffer.hasAttribute(TermAttribute.class)) {
+ termAtt = (TermAttribute) buffer.getAttribute(TermAttribute.class);
+ }
+ if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
+ posIncrAtt = (PositionIncrementAttribute) buffer.getAttribute(PositionIncrementAttribute.class);
+ }
}
- if (nextToken == null)
- break;
- list.add(nextToken.clone());
- if (nextToken.getPositionIncrement() != 0)
- positionCount += nextToken.getPositionIncrement();
- else
- severalTokensAtSamePosition = true;
+ } else {
+ reusableToken = new org.apache.lucene.analysis.Token();
}
+
+ int positionCount = 0;
+ boolean severalTokensAtSamePosition = false;
+
+ if (useNewAPI) {
+ if (termAtt != null) {
+ try {
+ while (buffer.incrementToken()) {
+ numTokens++;
+ int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
+ if (positionIncrement != 0) {
+ positionCount += positionIncrement;
+ } else {
+ severalTokensAtSamePosition = true;
+ }
+ }
+ } catch (IOException e) {
+ // ignore
+ }
+ }
+ } else {
+ while (true) {
+ try {
+ nextToken = buffer.next(reusableToken);
+ }
+ catch (IOException e) {
+ nextToken = null;
+ }
+ if (nextToken == null)
+ break;
+ numTokens++;
+ if (nextToken.getPositionIncrement() != 0)
+ positionCount += nextToken.getPositionIncrement();
+ else
+ severalTokensAtSamePosition = true;
+ }
+ }
try {
+ // rewind the buffer stream
+ buffer.reset();
+
+ // close original stream - all tokens buffered
source.close();
}
catch (IOException e) {
// ignore
}
+
+ if (numTokens == 0)
+ return null;
+ else if (numTokens == 1) {
+ String term = null;
+ try {
- if (list.size() == 0)
- return null;
- else if (list.size() == 1) {
- nextToken = (org.apache.lucene.analysis.Token) list.get(0);
- return newTermQuery(new Term(field, nextToken.term()));
+ if (useNewAPI) {
+ boolean hasNext = buffer.incrementToken();
+ assert hasNext == true;
+ term = termAtt.term();
+ } else {
+ nextToken = buffer.next(reusableToken);
+ assert nextToken != null;
+ term = nextToken.term();
+ }
+ } catch (IOException e) {
+ // safe to ignore, because we know the number of tokens
+ }
+ return newTermQuery(new Term(field, term));
} else {
if (severalTokensAtSamePosition) {
if (positionCount == 1) {
// no phrase query:
BooleanQuery q = newBooleanQuery(true);
- for (int i = 0; i < list.size(); i++) {
- nextToken = (org.apache.lucene.analysis.Token) list.get(i);
+ for (int i = 0; i < numTokens; i++) {
+ String term = null;
+ try {
+ if (useNewAPI) {
+ boolean hasNext = buffer.incrementToken();
+ assert hasNext == true;
+ term = termAtt.term();
+ } else {
+ nextToken = buffer.next(reusableToken);
+ assert nextToken != null;
+ term = nextToken.term();
+ }
+ } catch (IOException e) {
+ // safe to ignore, because we know the number of tokens
+ }
+
Query currentQuery = newTermQuery(
- new Term(field, nextToken.term()));
+ new Term(field, term));
q.add(currentQuery, BooleanClause.Occur.SHOULD);
}
return q;
@@ -587,9 +668,28 @@
mpq.setSlop(phraseSlop);
List multiTerms = new ArrayList();
int position = -1;
- for (int i = 0; i < list.size(); i++) {
- nextToken = (org.apache.lucene.analysis.Token) list.get(i);
- if (nextToken.getPositionIncrement() > 0 && multiTerms.size() > 0) {
+ for (int i = 0; i < numTokens; i++) {
+ String term = null;
+ int positionIncrement = 1;
+ try {
+ if (useNewAPI) {
+ boolean hasNext = buffer.incrementToken();
+ assert hasNext == true;
+ term = termAtt.term();
+ if (posIncrAtt != null) {
+ positionIncrement = posIncrAtt.getPositionIncrement();
+ }
+ } else {
+ nextToken = buffer.next(reusableToken);
+ assert nextToken != null;
+ term = nextToken.term();
+ positionIncrement = nextToken.getPositionIncrement();
+ }
+ } catch (IOException e) {
+ // safe to ignore, because we know the number of tokens
+ }
+
+ if (positionIncrement > 0 && multiTerms.size() > 0) {
if (enablePositionIncrements) {
mpq.add((Term[])multiTerms.toArray(new Term[0]),position);
} else {
@@ -597,8 +697,8 @@
}
multiTerms.clear();
}
- position += nextToken.getPositionIncrement();
- multiTerms.add(new Term(field, nextToken.term()));
+ position += positionIncrement;
+ multiTerms.add(new Term(field, term));
}
if (enablePositionIncrements) {
mpq.add((Term[])multiTerms.toArray(new Term[0]),position);
@@ -612,13 +712,36 @@
PhraseQuery pq = newPhraseQuery();
pq.setSlop(phraseSlop);
int position = -1;
- for (int i = 0; i < list.size(); i++) {
- nextToken = (org.apache.lucene.analysis.Token) list.get(i);
+
+
+ for (int i = 0; i < numTokens; i++) {
+ String term = null;
+ int positionIncrement = 1;
+
+ try {
+ if (useNewAPI) {
+
+ boolean hasNext = buffer.incrementToken();
+ assert hasNext == true;
+ term = termAtt.term();
+ if (posIncrAtt != null) {
+ positionIncrement = posIncrAtt.getPositionIncrement();
+ }
+ } else {
+ nextToken = buffer.next(reusableToken);
+ assert nextToken != null;
+ term = nextToken.term();
+ positionIncrement = nextToken.getPositionIncrement();
+ }
+ } catch (IOException e) {
+ // safe to ignore, because we know the number of tokens
+ }
+
if (enablePositionIncrements) {
- position += nextToken.getPositionIncrement();
- pq.add(new Term(field, nextToken.term()),position);
+ position += positionIncrement;
+ pq.add(new Term(field, term),position);
} else {
- pq.add(new Term(field, nextToken.term()));
+ pq.add(new Term(field, term));
}
}
return pq;
@@ -627,6 +750,7 @@
}
+
/**
* Base implementation delegates to {@link #getFieldQuery(String,String)}.
* This method may be overridden, for example, to return
Index: src/java/org/apache/lucene/search/QueryTermVector.java
===================================================================
--- src/java/org/apache/lucene/search/QueryTermVector.java (revision 708658)
+++ src/java/org/apache/lucene/search/QueryTermVector.java (working copy)
@@ -29,6 +29,7 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.index.TermFreqVector;
/**
@@ -58,9 +59,17 @@
{
List terms = new ArrayList();
try {
- final Token reusableToken = new Token();
- for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
- terms.add(nextToken.term());
+ if (TokenStream.useNewAPI()) {
+ stream.start();
+ TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
+ while (stream.incrementToken()) {
+ terms.add(termAtt.term());
+ }
+ } else {
+ final Token reusableToken = new Token();
+ for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
+ terms.add(nextToken.term());
+ }
}
processTerms((String[])terms.toArray(new String[terms.size()]));
} catch (IOException e) {
Index: src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java (revision 708658)
+++ src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java (working copy)
@@ -22,6 +22,8 @@
import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.TermVector;
@@ -41,6 +43,8 @@
Document doc = new Document();
TokenStream stream = new TokenStream() {
private int index = 0;
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
@@ -51,6 +55,22 @@
}
}
+ public void initialize() {
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ }
+
+ public boolean incrementToken() throws IOException {
+ if (index == tokens.length) {
+ return false;
+ } else {
+ termAtt.setTermBuffer(tokens[index++]);
+ offsetAtt.setStartOffset(0);
+ offsetAtt.setEndOffset(0);
+ return true;
+ }
+ }
+
};
stream = new CachingTokenFilter(stream);
@@ -91,7 +111,30 @@
}
private void checkTokens(TokenStream stream) throws IOException {
+ if (TokenStream.useNewAPI()) {
+ checkTokensNewAPI(stream);
+ } else {
+ checkTokensOldAPI(stream);
+ }
+ }
+
+ private void checkTokensNewAPI(TokenStream stream) throws IOException {
int count = 0;
+ stream.start();
+
+ TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
+ assertNotNull(termAtt);
+ while (stream.incrementToken()) {
+ assertTrue(count < tokens.length);
+ assertEquals(tokens[count], termAtt.term());
+ count++;
+ }
+
+ assertEquals(tokens.length, count);
+ }
+
+ private void checkTokensOldAPI(TokenStream stream) throws IOException {
+ int count = 0;
final Token reusableToken = new Token();
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
assertTrue(count < tokens.length);
Index: src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java (revision 708658)
+++ src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java (working copy)
@@ -1,6 +1,10 @@
package org.apache.lucene.analysis;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.LuceneTestCase;
import java.io.StringReader;
@@ -35,19 +39,26 @@
public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes, int[] expectedPosIncrs) throws Exception {
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
- final Token reusableToken = new Token();
+ ts.start();
+ // TODO Java 1.5
+ //final TypeAttribute typeAtt = ts.getAttribute(TypeAttribute.class);
+ //final PositionIncrementAttribute posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
+
+ final TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
+ final TypeAttribute typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class);
+ final PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) ts.getAttribute(PositionIncrementAttribute.class);
+
for (int i = 0; i < expectedImages.length; i++) {
- Token nextToken = ts.next(reusableToken);
- assertNotNull(nextToken);
- assertEquals(expectedImages[i], nextToken.term());
+ assertTrue(ts.incrementToken());
+ assertEquals(expectedImages[i], new String(termAtt.termBuffer(), 0, termAtt.termLength()));
if (expectedTypes != null) {
- assertEquals(expectedTypes[i], nextToken.type());
+ assertEquals(expectedTypes[i], typeAtt.type());
}
if (expectedPosIncrs != null) {
- assertEquals(expectedPosIncrs[i], nextToken.getPositionIncrement());
+ assertEquals(expectedPosIncrs[i], posIncrAtt.getPositionIncrement());
}
}
- assertNull(ts.next(reusableToken));
+ assertFalse(ts.incrementToken());
ts.close();
}
Index: src/test/org/apache/lucene/analysis/TokenStreamTestUtils.java
===================================================================
--- src/test/org/apache/lucene/analysis/TokenStreamTestUtils.java (revision 0)
+++ src/test/org/apache/lucene/analysis/TokenStreamTestUtils.java (revision 0)
@@ -0,0 +1,112 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.index.Payload;
+
+public class TokenStreamTestUtils {
+ public static abstract class BackwardsCompatibleFilter extends TokenFilter {
+ boolean first = true;
+
+ public BackwardsCompatibleFilter(TokenStream input) {
+ super(input);
+ }
+
+ public Token next(Token reusableToken) throws IOException {
+ if (first) {
+ start();
+ first = false;
+ }
+
+ boolean hasNext = incrementToken();
+ if (!hasNext) {
+ return null;
+ }
+
+ return getNextToken(this, reusableToken);
+ }
+ }
+
+ public static abstract class BackwardsCompatibleStream extends TokenStream {
+ boolean first = true;
+
+ public BackwardsCompatibleStream() {
+ super();
+ }
+
+ public Token next(Token reusableToken) throws IOException {
+ if (first) {
+ start();
+ first = false;
+ }
+
+ boolean hasNext = incrementToken();
+ if (!hasNext) {
+ return null;
+ }
+
+ return getNextToken(this, reusableToken);
+ }
+ }
+
+ private static Token getNextToken(TokenStream stream, Token nextToken) throws IOException {
+ if (stream.hasAttribute(PayloadAttribute.class)) {
+ PayloadAttribute att = (PayloadAttribute) stream.getAttribute(PayloadAttribute.class);
+ Payload p = att.getPayload();
+ if (p != null) {
+ p = (Payload) p.clone();
+ }
+ nextToken.setPayload(p);
+ }
+
+ if (stream.hasAttribute(TermAttribute.class)) {
+ TermAttribute att = (TermAttribute) stream.getAttribute(TermAttribute.class);
+ nextToken.setTermBuffer(att.termBuffer(), 0, att.termLength());
+ }
+ if (stream.hasAttribute(OffsetAttribute.class)) {
+ OffsetAttribute att = (OffsetAttribute) stream.getAttribute(OffsetAttribute.class);
+ nextToken.setStartOffset(att.startOffset());
+ nextToken.setEndOffset(att.endOffset());
+ }
+ if (stream.hasAttribute(TypeAttribute.class)) {
+ TypeAttribute att = (TypeAttribute) stream.getAttribute(TypeAttribute.class);
+ nextToken.setType(att.type());
+ }
+ if (stream.hasAttribute(FlagsAttribute.class)) {
+ FlagsAttribute att = (FlagsAttribute) stream.getAttribute(FlagsAttribute.class);
+ nextToken.setFlags(att.getFlags());
+ }
+ if (stream.hasAttribute(PositionIncrementAttribute.class)) {
+ PositionIncrementAttribute att = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class);
+ nextToken.setPositionIncrement(att.getPositionIncrement());
+ }
+
+ return nextToken;
+ }
+
+
+}
Property changes on: src\test\org\apache\lucene\analysis\TokenStreamTestUtils.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: src/test/org/apache/lucene/index/TestDocumentWriter.java
===================================================================
--- src/test/org/apache/lucene/index/TestDocumentWriter.java (revision 708658)
+++ src/test/org/apache/lucene/index/TestDocumentWriter.java (working copy)
@@ -22,12 +22,18 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.analysis.TestToken;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.TokenStreamState;
+import org.apache.lucene.analysis.TokenStreamTestUtils;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Fieldable;
@@ -136,35 +142,49 @@
public void testTokenReuse() throws IOException {
Analyzer analyzer = new Analyzer() {
public TokenStream tokenStream(String fieldName, Reader reader) {
- return new TokenFilter(new WhitespaceTokenizer(reader)) {
+ return new TokenStreamTestUtils.BackwardsCompatibleFilter(new WhitespaceTokenizer(reader)) {
boolean first=true;
+ TokenStreamState state;
Token buffered;
- public Token next(final Token reusableToken) throws IOException {
- if (buffered != null) {
- Token nextToken = buffered;
- buffered=null;
- return nextToken;
+
+ public boolean incrementToken() throws IOException {
+ if (state != null) {
+ state.restore(this);
+ payloadAtt.setPayload(null);
+ posIncrAtt.setPositionIncrement(0);
+ termAtt.setTermBuffer(new char[]{'b'}, 0, 1);
+ state = null;
+ return true;
}
- Token nextToken = input.next(reusableToken);
- if (nextToken==null) return null;
- if (Character.isDigit(nextToken.termBuffer()[0])) {
- nextToken.setPositionIncrement(nextToken.termBuffer()[0] - '0');
+
+ boolean hasNext = input.incrementToken();
+ if (!hasNext) return false;
+ if (Character.isDigit(termAtt.termBuffer()[0])) {
+ posIncrAtt.setPositionIncrement(termAtt.termBuffer()[0] - '0');
}
if (first) {
// set payload on first position only
- nextToken.setPayload(new Payload(new byte[]{100}));
+ payloadAtt.setPayload(new Payload(new byte[]{100}));
first = false;
}
// index a "synonym" for every token
- buffered = (Token)nextToken.clone();
- buffered.setPayload(null);
- buffered.setPositionIncrement(0);
- buffered.setTermBuffer(new char[]{'b'}, 0, 1);
+ state = TokenStreamState.capture(this);
+ return true;
- return nextToken;
}
+
+ TermAttribute termAtt = null;
+ PayloadAttribute payloadAtt = null;
+ PositionIncrementAttribute posIncrAtt = null;
+
+ public void initialize() throws IOException {
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
+ posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+ }
+
};
}
};
@@ -197,16 +217,22 @@
IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
Document doc = new Document();
- doc.add(new Field("preanalyzed", new TokenStream() {
+ doc.add(new Field("preanalyzed", new TokenStreamTestUtils.BackwardsCompatibleStream() {
private String[] tokens = new String[] {"term1", "term2", "term3", "term2"};
private int index = 0;
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
+ private TermAttribute termAtt;
+
+ public void initialize() {
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ }
+
+ public boolean incrementToken() throws IOException {
if (index == tokens.length) {
- return null;
+ return false;
} else {
- return reusableToken.reinit(tokens[index++], 0, 0);
+ termAtt.setTermBuffer(tokens[index++]);
+ return true;
}
}
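[Editor's note] The testTokenReuse rewrite above shows the new idiom for injecting a synonym token: instead of cloning a Token into a buffer, the filter captures the complete attribute state and restores it on the following call, overwriting only what differs. Distilled into a standalone filter, the shape is roughly this (a sketch; SynonymOnceFilter is an invented name, and it assumes the patch's TokenFilter shares its attributes with its input, as the test filters above rely on):

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.TokenStreamState;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    // Emits "syn" at the same position after every input token.
    public class SynonymOnceFilter extends TokenFilter {
      private final TermAttribute termAtt;
      private final PositionIncrementAttribute posIncrAtt;
      private TokenStreamState pending;   // captured state awaiting replay

      public SynonymOnceFilter(TokenStream input) {
        super(input);
        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
        posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
      }

      public boolean incrementToken() throws IOException {
        if (pending != null) {
          pending.restore(this);                  // bring back the captured token...
          termAtt.setTermBuffer("syn");           // ...then overwrite what differs
          posIncrAtt.setPositionIncrement(0);     // same position as the original
          pending = null;
          return true;
        }
        if (!input.incrementToken()) return false;
        pending = TokenStreamState.capture(this); // remember everything for next call
        return true;
      }
    }
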
Index: src/test/org/apache/lucene/index/TestIndexWriter.java
===================================================================
--- src/test/org/apache/lucene/index/TestIndexWriter.java (revision 708658)
+++ src/test/org/apache/lucene/index/TestIndexWriter.java (working copy)
@@ -30,6 +30,9 @@
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.analysis.TestToken;
+import org.apache.lucene.analysis.TokenStreamState;
+import org.apache.lucene.analysis.TokenStreamTestUtils;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.Analyzer;
@@ -38,6 +41,8 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@@ -1790,14 +1795,14 @@
IndexWriter writer = new IndexWriter(dir, new Analyzer() {
public TokenStream tokenStream(String fieldName, Reader reader) {
- return new TokenFilter(new StandardTokenizer(reader)) {
+ return new TokenStreamTestUtils.BackwardsCompatibleFilter(new StandardTokenizer(reader)) {
private int count = 0;
- public Token next(final Token reusableToken) throws IOException {
+ public boolean incrementToken() throws IOException {
if (count++ == 5) {
throw new IOException();
}
- return input.next(reusableToken);
+ return input.incrementToken();
}
};
}
@@ -1907,7 +1912,7 @@
reader.close();
}
- private class CrashingFilter extends TokenFilter {
+ private class CrashingFilter extends TokenStreamTestUtils.BackwardsCompatibleFilter {
String fieldName;
int count;
@@ -1916,10 +1921,10 @@
this.fieldName = fieldName;
}
- public Token next(final Token reusableToken) throws IOException {
+ public boolean incrementToken() throws IOException {
if (this.fieldName.equals("crash") && count++ >= 4)
throw new IOException("I'm experiencing problems");
- return input.next(reusableToken);
+ return input.incrementToken();
}
public void reset() throws IOException {
@@ -3577,23 +3582,58 @@
}
}
+ private static class MyAnalyzer extends Analyzer {
+
+ public TokenStream tokenStream(String fieldName, Reader reader) {
+ return new TokenStreamTestUtils.BackwardsCompatibleFilter(new WhitespaceTokenizer(reader)) {
+ public void initialize() throws IOException {
+ addAttribute(PositionIncrementAttribute.class);
+ }
+ };
+ }
+
+ }
+
// LUCENE-1255
public void testNegativePositions() throws Throwable {
- SinkTokenizer tokens = new SinkTokenizer();
- Token t = new Token();
- t.setTermBuffer("a");
- t.setPositionIncrement(0);
- tokens.add(t);
- t.setTermBuffer("b");
- t.setPositionIncrement(1);
- tokens.add(t);
- t.setTermBuffer("c");
- tokens.add(t);
+ SinkTokenizer tokens = new SinkTokenizer() {
+ public void initialize() throws IOException {
+ addAttribute(TermAttribute.class);
+ addAttribute(PositionIncrementAttribute.class);
+ }
+ };
+ TokenStreamState state = new TokenStreamState();
+ TermAttribute termAtt = (TermAttribute) state.addAttribute(TermAttribute.class);
+ PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) state.addAttribute(PositionIncrementAttribute.class);
+ termAtt.setTermBuffer("a");
+ posIncrAtt.setPositionIncrement(0);
+ tokens.add(state);
+
+ state = new TokenStreamState();
+ termAtt = (TermAttribute) state.addAttribute(TermAttribute.class);
+ posIncrAtt = (PositionIncrementAttribute) state.addAttribute(PositionIncrementAttribute.class);
+
+ termAtt.setTermBuffer("b");
+ posIncrAtt.setPositionIncrement(1);
+ tokens.add(state);
+
+ state = new TokenStreamState();
+ termAtt = (TermAttribute) state.addAttribute(TermAttribute.class);
+ posIncrAtt = (PositionIncrementAttribute) state.addAttribute(PositionIncrementAttribute.class);
+
+ termAtt.setTermBuffer("c");
+ posIncrAtt.setPositionIncrement(1);
+ tokens.add(state);
+
MockRAMDirectory dir = new MockRAMDirectory();
- IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED);
+ IndexWriter w = new IndexWriter(dir, new MyAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED);
Document doc = new Document();
- doc.add(new Field("field", tokens));
+ doc.add(new Field("field", new TokenStreamTestUtils.BackwardsCompatibleFilter(tokens) {
+ public boolean incrementToken() throws IOException {
+ return input.incrementToken();
+ }
+ }));
w.addDocument(doc);
w.commit();
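[Editor's note] In the rewritten testNegativePositions, each buffered token becomes a free-standing TokenStreamState instead of a mutation of one shared Token. The three near-identical blocks above could be collapsed into a small factory; a sketch, assuming (as the test does) that SinkTokenizer.add accepts a TokenStreamState; PreAnalyzedTokens/makeState are invented names:

    import org.apache.lucene.analysis.TokenStreamState;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class PreAnalyzedTokens {
      // Builds one pre-analyzed token as a self-contained attribute state.
      public static TokenStreamState makeState(String term, int posIncr) {
        TokenStreamState state = new TokenStreamState();
        TermAttribute termAtt = (TermAttribute) state.addAttribute(TermAttribute.class);
        PositionIncrementAttribute posIncrAtt =
            (PositionIncrementAttribute) state.addAttribute(PositionIncrementAttribute.class);
        termAtt.setTermBuffer(term);
        posIncrAtt.setPositionIncrement(posIncr);
        return state;
      }
    }

With it, the setup above reduces to tokens.add(makeState("a", 0)), tokens.add(makeState("b", 1)), tokens.add(makeState("c", 1)).
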
Index: src/test/org/apache/lucene/index/TestMultiLevelSkipList.java
===================================================================
--- src/test/org/apache/lucene/index/TestMultiLevelSkipList.java (revision 708658)
+++ src/test/org/apache/lucene/index/TestMultiLevelSkipList.java (working copy)
@@ -24,9 +24,12 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseTokenizer;
+import org.apache.lucene.analysis.TestToken;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.TokenStreamTestUtils;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
@@ -96,20 +99,25 @@
}
- private static class PayloadFilter extends TokenFilter {
+ private static class PayloadFilter extends TokenStreamTestUtils.BackwardsCompatibleFilter {
static int count = 0;
+ PayloadAttribute payloadAtt;
+
+ public void initialize() throws IOException {
+ payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
+ }
+
protected PayloadFilter(TokenStream input) {
super(input);
}
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
- Token nextToken = input.next(reusableToken);
- if (nextToken != null) {
- nextToken.setPayload(new Payload(new byte[] { (byte) count++ }));
- }
- return nextToken;
+ public boolean incrementToken() throws IOException {
+ boolean hasNext = input.incrementToken();
+ if (hasNext) {
+ payloadAtt.setPayload(new Payload(new byte[] { (byte) count++ }));
+ }
+ return hasNext;
}
}
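[Editor's note] PayloadFilter above extends the test-only BackwardsCompatibleFilter so the same class can run under either API. A filter written against the new API alone needs no bridge; the whole pattern is: register PayloadAttribute once, then set it after every successful incrementToken(). A sketch (CountingPayloadFilter is an invented name; same attribute-sharing assumption as above):

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
    import org.apache.lucene.index.Payload;

    // Tags every token with a one-byte running-counter payload.
    public class CountingPayloadFilter extends TokenFilter {
      private final PayloadAttribute payloadAtt;
      private byte count = 0;

      public CountingPayloadFilter(TokenStream input) {
        super(input);
        payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
      }

      public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) return false;
        payloadAtt.setPayload(new Payload(new byte[] { count++ }));
        return true;
      }
    }
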
Index: src/test/org/apache/lucene/index/TestPayloads.java
===================================================================
--- src/test/org/apache/lucene/index/TestPayloads.java (revision 708658)
+++ src/test/org/apache/lucene/index/TestPayloads.java (working copy)
@@ -27,20 +27,20 @@
import java.util.Map;
import java.util.Random;
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.UnicodeUtil;
-
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.TokenStreamTestUtils;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.UnicodeUtil;
public class TestPayloads extends LuceneTestCase {
@@ -437,12 +437,17 @@
/**
* This Filter adds payloads to the tokens.
*/
- private static class PayloadFilter extends TokenFilter {
+ private static class PayloadFilter extends TokenStreamTestUtils.BackwardsCompatibleFilter {
private byte[] data;
private int length;
private int offset;
Payload payload = new Payload();
+ PayloadAttribute payloadAtt;
+ public void initialize() throws IOException {
+ payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
+ }
+
public PayloadFilter(TokenStream in, byte[] data, int offset, int length) {
super(in);
this.data = data;
@@ -450,24 +455,23 @@
this.offset = offset;
}
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
- Token nextToken = input.next(reusableToken);
- if (nextToken != null) {
+ public boolean incrementToken() throws IOException {
+ boolean hasNext = input.incrementToken();
+ if (hasNext) {
if (offset + length <= data.length) {
Payload p = null;
if (p == null) {
p = new Payload();
- nextToken.setPayload(p);
+ payloadAtt.setPayload(p);
}
p.setData(data, offset, length);
offset += length;
} else {
- nextToken.setPayload(null);
+ payloadAtt.setPayload(null);
}
}
- return nextToken;
+ return hasNext;
}
}
@@ -524,11 +528,20 @@
assertEquals(pool.size(), numThreads);
}
- private static class PoolingPayloadTokenStream extends TokenStream {
+ private static class PoolingPayloadTokenStream extends TokenStreamTestUtils.BackwardsCompatibleStream {
private byte[] payload;
private boolean first;
private ByteArrayPool pool;
private String term;
+
+ TermAttribute termAtt;
+ PayloadAttribute payloadAtt;
+
+ public void initialize() {
+ payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ }
+
PoolingPayloadTokenStream(ByteArrayPool pool) {
this.pool = pool;
payload = pool.get();
@@ -537,11 +550,11 @@
first = true;
}
- public Token next(final Token reusableToken) throws IOException {
- if (!first) return null;
- reusableToken.reinit(term, 0, 0);
- reusableToken.setPayload(new Payload(payload));
- return reusableToken;
+ public boolean incrementToken() throws IOException {
+ if (!first) return false;
+ termAtt.setTermBuffer(term);
+ payloadAtt.setPayload(new Payload(payload));
+ return true;
}
public void close() throws IOException {
Index: src/test/org/apache/lucene/index/TestTermVectorsReader.java
===================================================================
--- src/test/org/apache/lucene/index/TestTermVectorsReader.java (revision 708658)
+++ src/test/org/apache/lucene/index/TestTermVectorsReader.java (working copy)
@@ -20,6 +20,10 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.TokenStreamTestUtils;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.MockRAMDirectory;
@@ -116,19 +120,33 @@
fieldInfos = new FieldInfos(dir, seg + "." + IndexFileNames.FIELD_INFOS_EXTENSION);
}
- private class MyTokenStream extends TokenStream {
+ private class MyTokenStream extends TokenStreamTestUtils.BackwardsCompatibleStream {
int tokenUpto;
- public Token next(final Token reusableToken) {
+
+ TermAttribute termAtt;
+ PositionIncrementAttribute posIncrAtt;
+ OffsetAttribute offsetAtt;
+
+ public void initialize() {
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+ offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ }
+
+ public boolean incrementToken() {
if (tokenUpto >= tokens.length)
- return null;
+ return false;
else {
final TestToken testToken = tokens[tokenUpto++];
- reusableToken.reinit(testToken.text, testToken.startOffset, testToken.endOffset);
- if (tokenUpto > 1)
- reusableToken.setPositionIncrement(testToken.pos - tokens[tokenUpto-2].pos);
- else
- reusableToken.setPositionIncrement(testToken.pos+1);
- return reusableToken;
+ termAtt.setTermBuffer(testToken.text);
+ offsetAtt.setStartOffset(testToken.startOffset);
+ offsetAtt.setEndOffset(testToken.endOffset);
+ if (tokenUpto > 1) {
+ posIncrAtt.setPositionIncrement(testToken.pos - tokens[tokenUpto-2].pos);
+ } else {
+ posIncrAtt.setPositionIncrement(testToken.pos+1);
+ }
+ return true;
}
}
}
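[Editor's note] MyTokenStream above is the general recipe for replaying precomputed tokens under the new API: register the attributes once, then copy one array element into them per call and report true until the array is exhausted. Stripped to its skeleton (ArrayTokenStream is an invented name; assumes consumers drive the stream via incrementToken()):

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    // Replays a fixed term array, one term per position.
    public class ArrayTokenStream extends TokenStream {
      private final String[] terms;
      private int upto = 0;
      private final TermAttribute termAtt;
      private final PositionIncrementAttribute posIncrAtt;

      public ArrayTokenStream(String[] terms) {
        this.terms = terms;
        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
        posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
      }

      public boolean incrementToken() {
        if (upto == terms.length) return false;   // exhausted
        termAtt.setTermBuffer(terms[upto++]);
        posIncrAtt.setPositionIncrement(1);
        return true;
      }
    }
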
Index: src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java
===================================================================
--- src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java (revision 708658)
+++ src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java (working copy)
@@ -17,6 +17,7 @@
* limitations under the License.
*/
+import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.util.LuceneTestCase;
@@ -27,7 +28,12 @@
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.TokenStreamTestUtils;
import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
/**
* Test QueryParser's ability to deal with Analyzers that return more
@@ -138,36 +144,54 @@
}
}
- private final class TestFilter extends TokenFilter {
+ private final class TestFilter extends TokenStreamTestUtils.BackwardsCompatibleFilter {
- private Token prevToken;
+ private String prevType;
+ private int prevStartOffset;
+ private int prevEndOffset;
+ TermAttribute termAtt;
+ PositionIncrementAttribute posIncrAtt;
+ OffsetAttribute offsetAtt;
+ TypeAttribute typeAtt;
+
+ public void initialize() throws IOException {
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+ offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+ }
+
public TestFilter(TokenStream in) {
super(in);
}
- public final Token next(final Token reusableToken) throws java.io.IOException {
+ public final boolean incrementToken() throws java.io.IOException {
if (multiToken > 0) {
- reusableToken.reinit("multi"+(multiToken+1), prevToken.startOffset(), prevToken.endOffset(), prevToken.type());
- reusableToken.setPositionIncrement(0);
+ termAtt.setTermBuffer("multi"+(multiToken+1));
+ offsetAtt.setStartOffset(prevStartOffset);
+ offsetAtt.setEndOffset(prevEndOffset);
+ typeAtt.setType(prevType);
+ posIncrAtt.setPositionIncrement(0);
multiToken--;
- return reusableToken;
+ return true;
} else {
- Token nextToken = input.next(reusableToken);
- if (nextToken == null) {
- prevToken = null;
- return null;
+ boolean next = input.incrementToken();
+ if (!next) {
+ return false;
}
- prevToken = (Token) nextToken.clone();
- String text = nextToken.term();
+ prevType = typeAtt.type();
+ prevStartOffset = offsetAtt.startOffset();
+ prevEndOffset = offsetAtt.endOffset();
+ String text = termAtt.term();
if (text.equals("triplemulti")) {
multiToken = 2;
- return nextToken;
+ return true;
} else if (text.equals("multi")) {
multiToken = 1;
- return nextToken;
+ return true;
} else {
- return nextToken;
+ return true;
}
}
}
@@ -190,25 +214,34 @@
}
}
- private final class TestPosIncrementFilter extends TokenFilter {
+ private final class TestPosIncrementFilter extends TokenStreamTestUtils.BackwardsCompatibleFilter {
+ TermAttribute termAtt;
+ PositionIncrementAttribute posIncrAtt;
+
+ public void initialize() throws IOException {
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+ }
+
+
public TestPosIncrementFilter(TokenStream in) {
super(in);
}
- public final Token next(final Token reusableToken) throws java.io.IOException {
- for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
- if (nextToken.term().equals("the")) {
+ public final boolean incrementToken() throws java.io.IOException {
+ while(input.incrementToken()) {
+ if (termAtt.term().equals("the")) {
// stopword, do nothing
- } else if (nextToken.term().equals("quick")) {
- nextToken.setPositionIncrement(2);
- return nextToken;
+ } else if (termAtt.term().equals("quick")) {
+ posIncrAtt.setPositionIncrement(2);
+ return true;
} else {
- nextToken.setPositionIncrement(1);
- return nextToken;
+ posIncrAtt.setPositionIncrement(1);
+ return true;
}
}
- return null;
+ return false;
}
}
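[Editor's note] TestPosIncrementFilter hard-codes an increment of 2 after its single known stopword. A general-purpose stop filter under the new API would instead accumulate the increments of every token it drops and add them to the next token it keeps. A sketch of that technique (AccumulatingStopFilter is an invented name; this is not part of the patch):

    import java.io.IOException;
    import java.util.Set;

    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    // Drops stop words, folding their position increments into the next kept token.
    public class AccumulatingStopFilter extends TokenFilter {
      private final Set stopWords;
      private final TermAttribute termAtt;
      private final PositionIncrementAttribute posIncrAtt;

      public AccumulatingStopFilter(TokenStream input, Set stopWords) {
        super(input);
        this.stopWords = stopWords;
        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
        posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
      }

      public boolean incrementToken() throws IOException {
        int skipped = 0;
        while (input.incrementToken()) {
          if (!stopWords.contains(termAtt.term())) {
            posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skipped);
            return true;
          }
          skipped += posIncrAtt.getPositionIncrement(); // carry dropped positions forward
        }
        return false;
      }
    }
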
Index: src/test/org/apache/lucene/queryParser/TestQueryParser.java
===================================================================
--- src/test/org/apache/lucene/queryParser/TestQueryParser.java (revision 708658)
+++ src/test/org/apache/lucene/queryParser/TestQueryParser.java (working copy)
@@ -34,8 +34,13 @@
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.TokenStreamTestUtils;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.document.DateField;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
@@ -64,7 +69,17 @@
public static Analyzer qpAnalyzer = new QPTestAnalyzer();
- public static class QPTestFilter extends TokenFilter {
+ public static class QPTestFilter extends TokenStreamTestUtils.BackwardsCompatibleFilter {
+ TermAttribute termAtt;
+ OffsetAttribute offsetAtt;
+
+
+ public void initialize() throws IOException {
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ }
+
+
/**
* Filter which discards the token 'stop' and which expands the
* token 'phrase' into 'phrase1 phrase2'
@@ -76,25 +91,31 @@
boolean inPhrase = false;
int savedStart = 0, savedEnd = 0;
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
+ public boolean incrementToken() throws IOException {
if (inPhrase) {
inPhrase = false;
- return reusableToken.reinit("phrase2", savedStart, savedEnd);
+ termAtt.setTermBuffer("phrase2");
+ offsetAtt.setStartOffset(savedStart);
+ offsetAtt.setEndOffset(savedEnd);
+ return true;
} else
- for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
- if (nextToken.term().equals("phrase")) {
+ while (input.incrementToken()) {
+ if (termAtt.term().equals("phrase")) {
inPhrase = true;
- savedStart = nextToken.startOffset();
- savedEnd = nextToken.endOffset();
- return nextToken.reinit("phrase1", savedStart, savedEnd);
- } else if (!nextToken.term().equals("stop"))
- return nextToken;
+ savedStart = offsetAtt.startOffset();
+ savedEnd = offsetAtt.endOffset();
+ termAtt.setTermBuffer("phrase1");
+ offsetAtt.setStartOffset(savedStart);
+ offsetAtt.setEndOffset(savedEnd);
+ return true;
+ } else if (!termAtt.term().equals("stop"))
+ return true;
}
- return null;
+ return false;
}
}
+
public static class QPTestAnalyzer extends Analyzer {
/** Filters LowerCaseTokenizer with StopFilter. */
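[Editor's note] QPTestFilter sits at the lightweight end of the buffering spectrum: where testTokenReuse captures full attribute state, here only a flag and two saved offsets survive between calls, because the injected token differs from its source in term text alone. The same one-token-into-two expansion in isolation (a sketch; SplitFilter is an invented name, with the usual attribute-sharing assumption):

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    // Expands every token "ab" into "a" then "b", both with the original offsets.
    public class SplitFilter extends TokenFilter {
      private final TermAttribute termAtt;
      private final OffsetAttribute offsetAtt;
      private boolean pendingSecond = false;
      private int savedStart, savedEnd;

      public SplitFilter(TokenStream input) {
        super(input);
        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
        offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
      }

      public boolean incrementToken() throws IOException {
        if (pendingSecond) {
          pendingSecond = false;
          termAtt.setTermBuffer("b");
          offsetAtt.setStartOffset(savedStart);
          offsetAtt.setEndOffset(savedEnd);
          return true;
        }
        if (!input.incrementToken()) return false;
        if (termAtt.term().equals("ab")) {
          savedStart = offsetAtt.startOffset();   // copy before the attributes mutate
          savedEnd = offsetAtt.endOffset();
          termAtt.setTermBuffer("a");
          pendingSecond = true;                   // emit "b" on the next call
        }
        return true;
      }
    }
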
Index: src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java
===================================================================
--- src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java (revision 708658)
+++ src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java (working copy)
@@ -21,9 +21,14 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseTokenizer;
+import org.apache.lucene.analysis.TestToken;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.TokenStreamTestUtils;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
@@ -63,32 +68,38 @@
}
}
- private class PayloadFilter extends TokenFilter {
+ private class PayloadFilter extends TokenStreamTestUtils.BackwardsCompatibleFilter {
String fieldName;
int numSeen = 0;
+
+ PayloadAttribute payloadAtt;
+
+ public void initialize() throws IOException {
+ payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
+ }
public PayloadFilter(TokenStream input, String fieldName) {
super(input);
this.fieldName = fieldName;
}
-
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
- Token nextToken = input.next(reusableToken);
- if (nextToken != null) {
+
+ public boolean incrementToken() throws IOException {
+ boolean hasNext = input.incrementToken();
+ if (hasNext) {
if (fieldName.equals("field")) {
- nextToken.setPayload(new Payload(payloadField));
+ payloadAtt.setPayload(new Payload(payloadField));
} else if (fieldName.equals("multiField")) {
if (numSeen % 2 == 0) {
- nextToken.setPayload(new Payload(payloadMultiField1));
+ payloadAtt.setPayload(new Payload(payloadMultiField1));
} else {
- nextToken.setPayload(new Payload(payloadMultiField2));
+ payloadAtt.setPayload(new Payload(payloadMultiField2));
}
numSeen++;
}
-
+ return true;
+ } else {
+ return false;
}
- return nextToken;
}
}
Index: src/test/org/apache/lucene/search/TestPositionIncrement.java
===================================================================
--- src/test/org/apache/lucene/search/TestPositionIncrement.java (revision 708658)
+++ src/test/org/apache/lucene/search/TestPositionIncrement.java (working copy)
@@ -17,14 +17,20 @@
* limitations under the License.
*/
+import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.TestToken;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.TokenStreamTestUtils;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
@@ -44,19 +50,30 @@
public void testSetPosition() throws Exception {
Analyzer analyzer = new Analyzer() {
public TokenStream tokenStream(String fieldName, Reader reader) {
- return new TokenStream() {
+ return new TokenStreamTestUtils.BackwardsCompatibleStream() {
private final String[] TOKENS = {"1", "2", "3", "4", "5"};
private final int[] INCREMENTS = {1, 2, 1, 0, 1};
private int i = 0;
- public Token next(final Token reusableToken) {
- assert reusableToken != null;
+ PositionIncrementAttribute posIncrAtt;
+ TermAttribute termAtt;
+ OffsetAttribute offsetAtt;
+
+ public void initialize() throws IOException {
+ posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ }
+
+ public boolean incrementToken() {
if (i == TOKENS.length)
- return null;
- reusableToken.reinit(TOKENS[i], i, i);
- reusableToken.setPositionIncrement(INCREMENTS[i]);
+ return false;
+ termAtt.setTermBuffer(TOKENS[i]);
+ offsetAtt.setStartOffset(i);
+ offsetAtt.setEndOffset(i);
+ posIncrAtt.setPositionIncrement(INCREMENTS[i]);
i++;
- return reusableToken;
+ return true;
}
};
}
Index: src/test/org/apache/lucene/util/LuceneTestCase.java
===================================================================
--- src/test/org/apache/lucene/util/LuceneTestCase.java (revision 708658)
+++ src/test/org/apache/lucene/util/LuceneTestCase.java (working copy)
@@ -17,6 +17,7 @@
* limitations under the License.
*/
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import junit.framework.TestCase;
@@ -42,6 +43,7 @@
protected void setUp() throws Exception {
ConcurrentMergeScheduler.setTestMode();
+ TokenStream.setUseNewAPI(true);
}
protected void tearDown() throws Exception {
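[Editor's note] This LuceneTestCase change is what flips the whole test suite onto the new code path: with setUseNewAPI(true), indexing consumes token streams through incrementToken() and attributes rather than next(Token). An application experimenting with the patch would opt in the same way, before building any analyzers or writers; a sketch, assuming the patch's static switch:

    import org.apache.lucene.analysis.TokenStream;

    public class NewApiBootstrap {
      public static void main(String[] args) {
        // Flip the global switch first, so all subsequent indexing drives
        // streams via incrementToken() instead of next(Token).
        TokenStream.setUseNewAPI(true);
      }
    }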