| Index: src/java/org/apache/lucene/analysis/AttributeSource.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/analysis/AttributeSource.java (revision 0)
|
| +++ src/java/org/apache/lucene/analysis/AttributeSource.java (revision 0)
|
| @@ -0,0 +1,160 @@
|
| +package org.apache.lucene.analysis; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.util.Iterator; |
| +import java.util.LinkedHashMap; |
| +import java.util.Map; |
| + |
| +import org.apache.lucene.analysis.tokenattributes.Attribute; |
| + |
| +/** |
| + * An AttributeSource contains a list of different {@link Attribute}s, |
| + * and methods to add and get them. There can only be a single instance |
| + * of an attribute in the same AttributeSource instance. This is ensured |
| + * by passing in the actual type of the Attribute (Class<? extends Attribute>) |
| + * to {@link #addAttribute(Class)}, which then checks if an instance of |
| + * that type is already present. If yes, it returns the instance, otherwise |
| + * it creates a new instance and returns it. |
| + */ |
| +public abstract class AttributeSource { |
| + |
| + /** |
| + * An AttributeAcceptor defines only a single method {@link #accept(Class)}. |
| + * It can be used, e.g., for buffering purposes to specify which attributes |
| + * to buffer. |
| + */ |
| + public static abstract class AttributeAcceptor { |
| + /** Returns true to accept the given attribute; false otherwise. */ |
| + public abstract boolean accept(Class attClass); |
| + } |
| + |
| + /** |
| + * Default AttributeAcceptor that accepts all attributes. |
| + */ |
| + public static final AttributeAcceptor AllAcceptor = new AttributeAcceptor() { |
| + public boolean accept(Class attClass) {return true;} |
| + }; |
| + |
| + /** |
| + * Holds the Class<Attribute> -> Attribute mapping |
| + */ |
| + protected Map attributes = new LinkedHashMap(); |
| + |
| + /** Returns an iterator that iterates the attributes |
| + * in the same order in which they were added. |
| + */ |
| + public Iterator getAttributesIterator() { |
| + return attributes.values().iterator(); |
| + } |
| + |
| + /** |
| + * The caller must pass in a Class<? extends Attribute> value. |
| + * This method first checks if an instance of that class is |
| + * already in this AttributeSource and returns it. Otherwise a |
| + * new instance is created, added to this AttributeSource and returned. |
| + */ |
| + public Attribute addAttribute(Class attClass) { |
| + Attribute att = (Attribute) attributes.get(attClass); |
| + if (att == null) { |
| + try { |
| + att = (Attribute) attClass.newInstance(); |
| + } catch (InstantiationException e) { |
| + throw new IllegalArgumentException("Could not instantiate class " + attClass); |
| + } catch (IllegalAccessException e) { |
| + throw new IllegalArgumentException("Could not instantiate class " + attClass); |
| + } |
| + |
| + attributes.put(attClass, att); |
| + } |
| + return att; |
| + } |
| + |
| + /** Returns true iff this AttributeSource has any attributes */ |
| + public boolean hasAttributes() { |
| + return !this.attributes.isEmpty(); |
| + } |
| + |
| + /** |
| + * The caller must pass in a Class<? extends Attribute> value. |
| + * Returns true iff this AttributeSource contains the passed-in Attribute. |
| + */ |
| + public boolean hasAttribute(Class attClass) { |
| + return this.attributes.containsKey(attClass); |
| + } |
| + |
| + /** |
| + * The caller must pass in a Class<? extends Attribute> value. |
| + * Returns the instance of the passed-in Attribute contained in this AttributeSource. |
| + * |
| + * @throws IllegalArgumentException if this AttributeSource does not contain the |
| + * Attribute |
| + */ |
| + public Attribute getAttribute(Class attClass) { |
| + Attribute att = (Attribute) this.attributes.get(attClass); |
| + if (att == null) { |
| + throw new IllegalArgumentException("This token does not have the attribute '" + attClass + "'."); |
| + } |
| + |
| + return att; |
| + } |
| + |
| + /** |
| + * Resets all Attributes in this AttributeSource by calling |
| + * {@link Attribute#clear()} on each Attribute. |
| + */ |
| + public void clearAttributes() { |
| + Iterator it = getAttributesIterator(); |
| + while (it.hasNext()) { |
| + ((Attribute) it.next()).clear(); |
| + } |
| + } |
| + |
| +// TODO: Java 1.5 |
| +// private Map<Class<? extends Attribute>, Attribute> attributes; |
| +// public <T extends Attribute> T addAttribute(Class<T> attClass) { |
| +// T att = (T) attributes.get(attClass); |
| +// if (att == null) { |
| +// try { |
| +// att = attClass.newInstance(); |
| +// } catch (InstantiationException e) { |
| +// throw new IllegalArgumentException("Could not instantiate class " + attClass); |
| +// } catch (IllegalAccessException e) { |
| +// throw new IllegalArgumentException("Could not instantiate class " + attClass); |
| +// } |
| +// |
| +// attributes.put(attClass, att); |
| +// } |
| +// return att; |
| +// } |
| +// |
| +// public boolean hasAttribute(Class<? extends Attribute> attClass) { |
| +// return this.attributes.containsKey(attClass); |
| +// } |
| +// |
| +// public <T extends Attribute> T getAttribute(Class<T> attClass) { |
| +// Attribute att = this.attributes.get(attClass); |
| +// if (att == null) { |
| +// throw new IllegalArgumentException("This token does not have the attribute '" + attClass + "'."); |
| +// } |
| +// |
| +// return (T) att; |
| +// } |
| +// |
| + |
| +} |
|
|
| Property changes on: src\java\org\apache\lucene\analysis\AttributeSource.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
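| A minimal usage sketch (not part of the patch itself): a consumer requests the
| attributes it is interested in and then iterates the stream. It assumes that
| TokenStream extends AttributeSource, as the converted classes below imply, and
| that initialize() is invoked before consumption; AttributeUsageSketch is a
| hypothetical class name.
|
|   import java.io.StringReader;
|   import org.apache.lucene.analysis.WhitespaceTokenizer;
|   import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|   import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
|   public class AttributeUsageSketch {
|     public static void main(String[] args) throws Exception {
|       WhitespaceTokenizer stream = new WhitespaceTokenizer(new StringReader("hello world"));
|       stream.initialize(); // registers TermAttribute/OffsetAttribute (see CharTokenizer below)
|       // addAttribute returns the single per-source instance, creating it on first request:
|       TermAttribute term = (TermAttribute) stream.addAttribute(TermAttribute.class);
|       OffsetAttribute offset = (OffsetAttribute) stream.addAttribute(OffsetAttribute.class);
|       while (stream.incrementToken()) {
|         System.out.println(term.term() + " [" + offset.startOffset() + "-" + offset.endOffset() + "]");
|       }
|     }
|   }
|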
|
| Index: src/java/org/apache/lucene/analysis/CachingTokenFilter.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/analysis/CachingTokenFilter.java (revision 708658)
|
| +++ src/java/org/apache/lucene/analysis/CachingTokenFilter.java (working copy)
|
| @@ -34,12 +34,31 @@
|
| */ |
| public class CachingTokenFilter extends TokenFilter { |
| private List cache; |
| - private Iterator iterator; |
| + private Iterator iterator; |
| |
| public CachingTokenFilter(TokenStream input) { |
| super(input); |
| } |
| |
| + public boolean incrementToken() throws IOException { |
| + if (cache == null) { |
| + // fill cache lazily |
| + cache = new LinkedList(); |
| + fillCache(); |
| + iterator = cache.iterator(); |
| + } |
| + |
| + if (!iterator.hasNext()) { |
| + // the cache is exhausted, return false |
| + return false; |
| + } |
| + // Since the TokenFilter can be reset, the tokens need to be preserved as immutable. |
| + TokenStreamState state = (TokenStreamState) iterator.next(); |
| + state.restore(this); |
| + return true; |
| + } |
| + |
| + /** @deprecated */ |
| public Token next(final Token reusableToken) throws IOException { |
| assert reusableToken != null; |
| if (cache == null) { |
| @@ -60,10 +79,17 @@
|
| |
| public void reset() throws IOException { |
| if(cache != null) { |
| - iterator = cache.iterator(); |
| + iterator = cache.iterator(); |
| } |
| } |
| |
| + private void fillCache() throws IOException { |
| + while(input.incrementToken()) { |
| + cache.add(TokenStreamState.capture(this)); |
| + } |
| + } |
| + |
| + /** @deprecated */ |
| private void fillCache(final Token reusableToken) throws IOException { |
| for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) { |
| cache.add(nextToken.clone()); |
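|
| A sketch of the intended replay behaviour (CachingSketch is a hypothetical
| helper; it assumes TokenStreamState.restore() copies the captured values back
| into this filter's attribute instances, as the code above suggests):
|
|   import java.io.IOException;
|   import org.apache.lucene.analysis.CachingTokenFilter;
|   import org.apache.lucene.analysis.TokenStream;
|   import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
|   public class CachingSketch {
|     // Consumes the input twice; the second pass replays the cached states.
|     static void consumeTwice(TokenStream input) throws IOException {
|       CachingTokenFilter cached = new CachingTokenFilter(input);
|       TermAttribute term = (TermAttribute) cached.addAttribute(TermAttribute.class);
|       while (cached.incrementToken()) { /* first pass fills the cache lazily */ }
|       cached.reset(); // rewinds the iterator over the captured states
|       while (cached.incrementToken()) {
|         System.out.println(term.term());
|       }
|     }
|   }
|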
| Index: src/java/org/apache/lucene/analysis/CharTokenizer.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/analysis/CharTokenizer.java (revision 708658)
|
| +++ src/java/org/apache/lucene/analysis/CharTokenizer.java (working copy)
|
| @@ -20,6 +20,9 @@
|
| import java.io.IOException; |
| import java.io.Reader; |
| |
| +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| + |
| /** An abstract base class for simple, character-oriented tokenizers.*/ |
| public abstract class CharTokenizer extends Tokenizer { |
| public CharTokenizer(Reader input) { |
| @@ -30,6 +33,9 @@
|
| private static final int MAX_WORD_LEN = 255; |
| private static final int IO_BUFFER_SIZE = 4096; |
| private final char[] ioBuffer = new char[IO_BUFFER_SIZE]; |
| + |
| + private TermAttribute termAtt; |
| + private OffsetAttribute offsetAtt; |
| |
| /** Returns true iff a character should be included in a token. This |
| * tokenizer generates as tokens adjacent sequences of characters which |
| @@ -43,7 +49,58 @@
|
| protected char normalize(char c) { |
| return c; |
| } |
| + |
| + public void initialize() throws IOException { |
| + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); |
| + termAtt = (TermAttribute) addAttribute(TermAttribute.class); |
| + } |
| |
| + public final boolean incrementToken() throws IOException { |
| + assert termAtt != null && offsetAtt != null; |
| + |
| + clearAttributes(); |
| + int length = 0; |
| + int start = bufferIndex; |
| + char[] buffer = termAtt.termBuffer(); |
| + while (true) { |
| + |
| + if (bufferIndex >= dataLen) { |
| + offset += dataLen; |
| + dataLen = input.read(ioBuffer); |
| + if (dataLen == -1) { |
| + if (length > 0) |
| + break; |
| + else |
| + return false; |
| + } |
| + bufferIndex = 0; |
| + } |
| + |
| + final char c = ioBuffer[bufferIndex++]; |
| + |
| + if (isTokenChar(c)) { // if it's a token char |
| + |
| + if (length == 0) // start of token |
| + start = offset + bufferIndex - 1; |
| + else if (length == buffer.length) |
| + buffer = termAtt.resizeTermBuffer(1+length); |
| + |
| + buffer[length++] = normalize(c); // buffer it, normalized |
| + |
| + if (length == MAX_WORD_LEN) // buffer overflow! |
| + break; |
| + |
| + } else if (length > 0) // at non-Letter w/ chars |
| + break; // return 'em |
| + } |
| + |
| + termAtt.setTermLength(length); |
| + offsetAtt.setStartOffset(start); |
| + offsetAtt.setEndOffset(start+length); |
| + return true; |
| + } |
| + |
| + /** @deprecated */ |
| public final Token next(final Token reusableToken) throws IOException { |
| assert reusableToken != null; |
| reusableToken.clear(); |
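|
| Since isTokenChar()/normalize() are all a subclass has to provide, the new
| incrementToken() above is inherited unchanged. A hypothetical example
| (DigitTokenizer is an invented name; LetterTokenizer and friends follow the
| same shape):
|
|   import java.io.Reader;
|   import org.apache.lucene.analysis.CharTokenizer;
|
|   public class DigitTokenizer extends CharTokenizer {
|     public DigitTokenizer(Reader input) {
|       super(input);
|     }
|     // Only digits are token characters; anything else separates tokens.
|     protected boolean isTokenChar(char c) {
|       return Character.isDigit(c);
|     }
|   }
|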
| Index: src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java (revision 708658)
|
| +++ src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java (working copy)
|
| @@ -1,5 +1,9 @@
|
| package org.apache.lucene.analysis; |
| |
| +import java.io.IOException; |
| + |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| + |
| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| @@ -31,7 +35,34 @@
|
| |
| private char[] output = new char[256]; |
| private int outputPos; |
| - |
| + private TermAttribute termAtt; |
| + |
| + public void initialize() throws IOException { |
| + termAtt = (TermAttribute) addAttribute(TermAttribute.class); |
| + } |
| + |
| + public final boolean incrementToken() throws java.io.IOException { |
| + assert termAtt != null; |
| + |
| + if (input.incrementToken()) { |
| + final char[] buffer = termAtt.termBuffer(); |
| + final int length = termAtt.termLength(); |
| + // If no characters actually require rewriting then we |
| + // just return token as-is: |
| + for(int i=0;i<length;i++) { |
| + final char c = buffer[i]; |
| + if (c >= '\u00c0' && c <= '\uFB06') { |
| + removeAccents(buffer, length); |
| + termAtt.setTermBuffer(output, 0, outputPos); |
| + break; |
| + } |
| + } |
| + return true; |
| + } else |
| + return false; |
| + } |
| + |
| + /** @deprecated */ |
| public final Token next(final Token reusableToken) throws java.io.IOException { |
| assert reusableToken != null; |
| Token nextToken = input.next(reusableToken); |
| @@ -241,7 +272,7 @@
|
| case '\uFB06': // st |
| output[outputPos++] = 's'; |
| output[outputPos++] = 't'; |
| - break; |
| + break; |
| default : |
| output[outputPos++] = c; |
| break; |
| Index: src/java/org/apache/lucene/analysis/KeywordTokenizer.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/analysis/KeywordTokenizer.java (revision 708658)
|
| +++ src/java/org/apache/lucene/analysis/KeywordTokenizer.java (working copy)
|
| @@ -20,6 +20,8 @@
|
| import java.io.IOException; |
| import java.io.Reader; |
| |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| + |
| /** |
| * Emits the entire input as a single token. |
| */ |
| @@ -28,7 +30,8 @@
|
| private static final int DEFAULT_BUFFER_SIZE = 256; |
| |
| private boolean done; |
| - |
| + private TermAttribute termAtt; |
| + |
| public KeywordTokenizer(Reader input) { |
| this(input, DEFAULT_BUFFER_SIZE); |
| } |
| @@ -37,7 +40,32 @@
|
| super(input); |
| this.done = false; |
| } |
| + |
| + public void initialize() throws IOException { |
| + termAtt = (TermAttribute) addAttribute(TermAttribute.class); |
| + } |
| + |
| + public boolean incrementToken() throws IOException { |
| + assert termAtt != null; |
| + if (!done) { |
| + done = true; |
| + int upto = 0; |
| + termAtt.clear(); |
| + char[] buffer = termAtt.termBuffer(); |
| + while (true) { |
| + final int length = input.read(buffer, upto, buffer.length-upto); |
| + if (length == -1) break; |
| + upto += length; |
| + if (upto == buffer.length) |
| + buffer = termAtt.resizeTermBuffer(1+buffer.length); |
| + } |
| + termAtt.setTermLength(upto); |
| + return true; |
| + } |
| + return false; |
| + } |
| |
| + /** @deprecated */ |
| public Token next(final Token reusableToken) throws IOException { |
| assert reusableToken != null; |
| if (!done) { |
| Index: src/java/org/apache/lucene/analysis/LengthFilter.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/analysis/LengthFilter.java (revision 708658)
|
| +++ src/java/org/apache/lucene/analysis/LengthFilter.java (working copy)
|
| @@ -19,6 +19,8 @@
|
| |
| import java.io.IOException; |
| |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| + |
| /** |
| * Removes words that are too long and too short from the stream. |
| * |
| @@ -29,6 +31,8 @@
|
| |
| final int min; |
| final int max; |
| + |
| + private TermAttribute termAtt; |
| |
| /** |
| * Build a filter that removes words that are too long or too |
| @@ -41,9 +45,33 @@
|
| this.max = max; |
| } |
| |
| + public void initialize() throws IOException { |
| + termAtt = (TermAttribute) addAttribute(TermAttribute.class); |
| + } |
| + |
| + |
| /** |
| * Returns the next input Token whose term() is the right len |
| */ |
| + public final boolean incrementToken() throws IOException |
| + { |
| + assert termAtt != null; |
| + // return the first token whose length is within the bounds |
| + while (input.incrementToken()) { |
| + int len = termAtt.termLength(); |
| + if (len >= min && len <= max) { |
| + return true; |
| + } |
| + // note: else we ignore it but should we index each part of it? |
| + } |
| + // reached EOS -- return false |
| + return false; |
| + } |
| + |
| + /** |
| + * Returns the next input Token whose term() is the right length |
| + * @deprecated |
| + */ |
| public final Token next(final Token reusableToken) throws IOException |
| { |
| assert reusableToken != null; |
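|
| The skip-loop in incrementToken() above is the general pattern for filters
| that drop tokens: keep pulling from the input until a token passes the
| predicate. A hypothetical filter built on the same pattern (NoDigitsFilter is
| an invented name):
|
|   import java.io.IOException;
|   import org.apache.lucene.analysis.TokenFilter;
|   import org.apache.lucene.analysis.TokenStream;
|   import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
|   public class NoDigitsFilter extends TokenFilter {
|     private TermAttribute termAtt;
|
|     public NoDigitsFilter(TokenStream in) {
|       super(in);
|     }
|
|     public void initialize() throws IOException {
|       termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|     }
|
|     public final boolean incrementToken() throws IOException {
|       while (input.incrementToken()) {
|         final char[] buffer = termAtt.termBuffer();
|         final int length = termAtt.termLength();
|         boolean hasDigit = false;
|         for (int i = 0; i < length; i++) {
|           if (Character.isDigit(buffer[i])) { hasDigit = true; break; }
|         }
|         if (!hasDigit) return true; // the attributes already hold this token
|       }
|       return false; // input exhausted
|     }
|   }
|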
| Index: src/java/org/apache/lucene/analysis/LowerCaseFilter.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/analysis/LowerCaseFilter.java (revision 708658)
|
| +++ src/java/org/apache/lucene/analysis/LowerCaseFilter.java (working copy)
|
| @@ -19,6 +19,8 @@
|
| |
| import java.io.IOException; |
| |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| + |
| /** |
| * Normalizes token text to lower case. |
| * |
| @@ -29,6 +31,28 @@
|
| super(in); |
| } |
| |
| + private TermAttribute termAtt; |
| + |
| + public void initialize() throws IOException { |
| + termAtt = (TermAttribute) addAttribute(TermAttribute.class); |
| + } |
| + |
| + public final boolean incrementToken() throws IOException { |
| + assert termAtt != null; |
| + |
| + if (input.incrementToken()) { |
| + |
| + final char[] buffer = termAtt.termBuffer(); |
| + final int length = termAtt.termLength(); |
| + for(int i=0;i<length;i++) |
| + buffer[i] = Character.toLowerCase(buffer[i]); |
| + |
| + return true; |
| + } else |
| + return false; |
| + } |
| + |
| + /** @deprecated */ |
| public final Token next(final Token reusableToken) throws IOException { |
| assert reusableToken != null; |
| Token nextToken = input.next(reusableToken); |
| Index: src/java/org/apache/lucene/analysis/PorterStemFilter.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/analysis/PorterStemFilter.java (revision 708658)
|
| +++ src/java/org/apache/lucene/analysis/PorterStemFilter.java (working copy)
|
| @@ -19,6 +19,8 @@
|
| |
| import java.io.IOException; |
| |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| + |
| /** Transforms the token stream as per the Porter stemming algorithm. |
| Note: the input to the stemming filter must already be in lower case, |
| so you will need to use LowerCaseFilter or LowerCaseTokenizer farther |
| @@ -39,12 +41,29 @@
|
| */ |
| public final class PorterStemFilter extends TokenFilter { |
| private PorterStemmer stemmer; |
| + private TermAttribute termAtt; |
| |
| public PorterStemFilter(TokenStream in) { |
| super(in); |
| stemmer = new PorterStemmer(); |
| } |
| + |
| + public void initialize() throws IOException { |
| + termAtt = (TermAttribute) addAttribute(TermAttribute.class); |
| + } |
| |
| + public final boolean incrementToken() throws IOException { |
| + assert termAtt != null; |
| + |
| + if (!input.incrementToken()) |
| + return false; |
| + |
| + if (stemmer.stem(termAtt.termBuffer(), 0, termAtt.termLength())) |
| + termAtt.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength()); |
| + return true; |
| + } |
| + |
| + /** @deprecated */ |
| public final Token next(final Token reusableToken) throws IOException { |
| assert reusableToken != null; |
| Token nextToken = input.next(reusableToken); |
| Index: src/java/org/apache/lucene/analysis/SinkTokenizer.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/analysis/SinkTokenizer.java (revision 708658)
|
| +++ src/java/org/apache/lucene/analysis/SinkTokenizer.java (working copy)
|
| @@ -32,7 +32,7 @@
|
| public class SinkTokenizer extends Tokenizer { |
| protected List/*<Token>*/ lst = new ArrayList/*<Token>*/(); |
| protected Iterator/*<Token>*/ iter; |
| - |
| + |
| public SinkTokenizer(List/*<Token>*/ input) { |
| this.lst = input; |
| if (this.lst == null) this.lst = new ArrayList/*<Token>*/(); |
| @@ -66,6 +66,27 @@
|
| * @return The next {@link org.apache.lucene.analysis.Token} in the Sink. |
| * @throws IOException |
| */ |
| + public boolean incrementToken() throws IOException { |
| + if (iter == null) iter = lst.iterator(); |
| + // Since this TokenStream can be reset we have to maintain the tokens as immutable |
| + if (iter.hasNext()) { |
| + TokenStreamState state = (TokenStreamState) iter.next(); |
| + state.restore(this); |
| + return true; |
| + } |
| + return false; |
| + } |
| + |
| + public void add(TokenStreamState source) throws IOException { |
| + lst.add(source); |
| + } |
| + |
| + /** |
| + * Returns the next token out of the list of cached tokens |
| + * @return The next {@link org.apache.lucene.analysis.Token} in the Sink. |
| + * @throws IOException |
| + * @deprecated |
| + */ |
| public Token next(final Token reusableToken) throws IOException { |
| assert reusableToken != null; |
| if (iter == null) iter = lst.iterator(); |
| @@ -77,8 +98,6 @@
|
| return null; |
| } |
| |
| - |
| - |
| /** |
| * Override this method to cache only certain tokens, or new tokens based |
| * on the old tokens. |
| Index: src/java/org/apache/lucene/analysis/standard/StandardFilter.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/analysis/standard/StandardFilter.java (revision 708658)
|
| +++ src/java/org/apache/lucene/analysis/standard/StandardFilter.java (working copy)
|
| @@ -17,9 +17,13 @@
|
| * limitations under the License. |
| */ |
| |
| +import java.io.IOException; |
| + |
| +import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenFilter; |
| -import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; |
| |
| /** Normalizes tokens extracted with {@link StandardTokenizer}. */ |
| |
| @@ -34,10 +38,53 @@
|
| private static final String APOSTROPHE_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.APOSTROPHE]; |
| private static final String ACRONYM_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]; |
| |
| + // this filter uses the term and type attributes |
| + private TypeAttribute typeAtt; |
| + private TermAttribute termAtt; |
| + |
| + public final void initialize() throws IOException { |
| + termAtt = (TermAttribute) addAttribute(TermAttribute.class); |
| + typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); |
| + } |
| + |
| /** Returns the next token in the stream, or null at EOS. |
| * <p>Removes <tt>'s</tt> from the end of words. |
| * <p>Removes dots from acronyms. |
| */ |
| + public final boolean incrementToken() throws java.io.IOException { |
| + assert termAtt != null; |
| + if (!input.incrementToken()) { |
| + return false; |
| + } |
| + |
| + char[] buffer = termAtt.termBuffer(); |
| + final int bufferLength = termAtt.termLength(); |
| + final String type = typeAtt.type(); |
| + |
| + if (type == APOSTROPHE_TYPE && // remove 's |
| + bufferLength >= 2 && |
| + buffer[bufferLength-2] == '\'' && |
| + (buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) { |
| + // Strip last 2 characters off |
| + termAtt.setTermLength(bufferLength - 2); |
| + } else if (type == ACRONYM_TYPE) { // remove dots |
| + int upto = 0; |
| + for(int i=0;i<bufferLength;i++) { |
| + char c = buffer[i]; |
| + if (c != '.') |
| + buffer[upto++] = c; |
| + } |
| + termAtt.setTermLength(upto); |
| + } |
| + |
| + return true; |
| + } |
| + |
| + /** Returns the next token in the stream, or null at EOS. |
| + * <p>Removes <tt>'s</tt> from the end of words. |
| + * <p>Removes dots from acronyms. |
| + * @deprecated |
| + */ |
| public final Token next(final Token reusableToken) throws java.io.IOException { |
| assert reusableToken != null; |
| Token nextToken = input.next(reusableToken); |
| Index: src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (revision 708658)
|
| +++ src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (working copy)
|
| @@ -22,6 +22,10 @@
|
| |
| import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.Tokenizer; |
| +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; |
| |
| /** A grammar-based tokenizer constructed with JFlex |
| * |
| @@ -127,11 +131,71 @@
|
| this.scanner = new StandardTokenizerImpl(input); |
| } |
| |
| + // this tokenizer generates four attributes: |
| + // term, offset, positionIncrement and type |
| + private TermAttribute termAtt; |
| + private OffsetAttribute offsetAtt; |
| + private PositionIncrementAttribute posIncrAtt; |
| + private TypeAttribute typeAtt; |
| + |
| + public void initialize() throws IOException { |
| + termAtt = (TermAttribute) addAttribute(TermAttribute.class); |
| + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); |
| + posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); |
| + typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); |
| + |
| + } |
| + |
| /* |
| * (non-Javadoc) |
| * |
| * @see org.apache.lucene.analysis.TokenStream#next() |
| */ |
| + public boolean incrementToken() throws IOException { |
| + assert termAtt != null; |
| + int posIncr = 1; |
| + |
| + while(true) { |
| + int tokenType = scanner.getNextToken(); |
| + |
| + if (tokenType == StandardTokenizerImpl.YYEOF) { |
| + return false; |
| + } |
| + |
| + if (scanner.yylength() <= maxTokenLength) { |
| + termAtt.clear(); |
| + posIncrAtt.setPositionIncrement(posIncr); |
| + scanner.getText(termAtt); |
| + final int start = scanner.yychar(); |
| + offsetAtt.setStartOffset(start); |
| + offsetAtt.setEndOffset(start+termAtt.termLength()); |
| + // This 'if' should be removed in the next release. For now, it converts |
| + // invalid acronyms to HOST. When removed, only the 'else' part should |
| + // remain. |
| + if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) { |
| + if (replaceInvalidAcronym) { |
| + typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]); |
| + termAtt.setTermLength(termAtt.termLength() - 1); // remove extra '.' |
| + } else { |
| + typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]); |
| + } |
| + } else { |
| + typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]); |
| + } |
| + return true; |
| + } else |
| + // When we skip a too-long term, we still increment the |
| + // position increment |
| + posIncr++; |
| + } |
| + } |
| + |
| + /* |
| + * (non-Javadoc) |
| + * |
| + * @see org.apache.lucene.analysis.TokenStream#next() |
| + */ |
| + /** @deprecated */ |
| public Token next(final Token reusableToken) throws IOException { |
| assert reusableToken != null; |
| int posIncr = 1; |
| Index: src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java (revision 708658)
|
| +++ src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java (working copy)
|
| @@ -30,6 +30,7 @@
|
| */ |
| |
| import org.apache.lucene.analysis.Token; |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| |
| |
| /** |
| @@ -368,7 +369,14 @@
|
| t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); |
| } |
| |
| +/** |
| + * Fills TermAttribute with the current token text. |
| + */ |
| +final void getText(TermAttribute t) { |
| + t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); |
| +} |
| |
| + |
| /** |
| * Creates a new scanner |
| * There is also a java.io.InputStream version of this constructor. |
| Index: src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
|
| ===================================================================
|
| --- src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex (revision 708658)
|
| +++ src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex (working copy)
|
| @@ -29,6 +29,7 @@
|
| */ |
| |
| import org.apache.lucene.analysis.Token; |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| |
| %% |
| |
| @@ -69,6 +70,14 @@
|
| final void getText(Token t) { |
| t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); |
| } |
| + |
| +/** |
| + * Fills TermAttribute with the current token text. |
| + */ |
| +final void getText(TermAttribute t) { |
| + t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); |
| +} |
| + |
| %} |
| |
| THAI = [\u0E00-\u0E59] |
| Index: src/java/org/apache/lucene/analysis/StopFilter.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/analysis/StopFilter.java (revision 708658)
|
| +++ src/java/org/apache/lucene/analysis/StopFilter.java (working copy)
|
| @@ -21,6 +21,9 @@
|
| import java.util.Arrays; |
| import java.util.Set; |
| |
| +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| + |
| /** |
| * Removes stop words from a token stream. |
| */ |
| @@ -32,6 +35,9 @@
|
| private final CharArraySet stopWords; |
| private boolean enablePositionIncrements = ENABLE_POSITION_INCREMENTS_DEFAULT; |
| |
| + private TermAttribute termAtt; |
| + private PositionIncrementAttribute posIncrAtt; |
| + |
| /** |
| * Construct a token stream filtering the given input. |
| */ |
| @@ -85,6 +91,11 @@
|
| public StopFilter(TokenStream in, Set stopWords) { |
| this(in, stopWords, false); |
| } |
| + |
| + public void initialize() throws IOException { |
| + termAtt = (TermAttribute) addAttribute(TermAttribute.class); |
| + posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); |
| + } |
| |
| /** |
| * Builds a Set from an array of stop words, |
| @@ -109,9 +120,30 @@
|
| stopSet.addAll(Arrays.asList(stopWords)); |
| return stopSet; |
| } |
| + |
| + /** |
| + * Returns the next input Token whose term() is not a stop word. |
| + */ |
| + public final boolean incrementToken() throws IOException { |
| + assert termAtt != null; |
| + // return the first non-stop word found |
| + int skippedPositions = 0; |
| + while (input.incrementToken()) { |
| + if (!stopWords.contains(termAtt.termBuffer(), 0, termAtt.termLength())) { |
| + if (enablePositionIncrements) { |
| + posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions); |
| + } |
| + return true; |
| + } |
| + skippedPositions += posIncrAtt.getPositionIncrement(); |
| + } |
| + // reached EOS -- return false |
| + return false; |
| + } |
| |
| /** |
| * Returns the next input Token whose term() is not a stop word. |
| + * @deprecated |
| */ |
| public final Token next(final Token reusableToken) throws IOException { |
| assert reusableToken != null; |
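|
| Putting the converted classes together, a sketch of a classic analysis chain
| consumed through the new API (ChainSketch is a hypothetical helper; it assumes
| the attributes are shared along the chain and that initialize() has been wired
| up by the not-shown TokenStream plumbing):
|
|   import java.io.StringReader;
|   import org.apache.lucene.analysis.LowerCaseFilter;
|   import org.apache.lucene.analysis.StopAnalyzer;
|   import org.apache.lucene.analysis.StopFilter;
|   import org.apache.lucene.analysis.TokenStream;
|   import org.apache.lucene.analysis.standard.StandardFilter;
|   import org.apache.lucene.analysis.standard.StandardTokenizer;
|   import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|   import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
|   public class ChainSketch {
|     static void dump(String text) throws Exception {
|       TokenStream ts = new StandardTokenizer(new StringReader(text));
|       ts = new StandardFilter(ts);
|       ts = new LowerCaseFilter(ts);
|       ts = new StopFilter(ts, StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS));
|       TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
|       PositionIncrementAttribute posIncr =
|           (PositionIncrementAttribute) ts.addAttribute(PositionIncrementAttribute.class);
|       while (ts.incrementToken()) {
|         System.out.println(term.term() + " (+" + posIncr.getPositionIncrement() + ")");
|       }
|     }
|   }
|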
| Index: src/java/org/apache/lucene/analysis/TeeTokenFilter.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/analysis/TeeTokenFilter.java (revision 708658)
|
| +++ src/java/org/apache/lucene/analysis/TeeTokenFilter.java (working copy)
|
| @@ -62,6 +62,15 @@
|
| this.sink = sink; |
| } |
| |
| + public boolean incrementToken() throws IOException { |
| + if (input.incrementToken()) { |
| + sink.add(TokenStreamState.capture(this)); |
| + return true; |
| + } |
| + return false; |
| + } |
| + |
| + /** @deprecated */ |
| public Token next(final Token reusableToken) throws IOException { |
| assert reusableToken != null; |
| Token nextToken = input.next(reusableToken); |
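|
| A sketch of the tee/sink pattern with the new state capture (TeeSinkSketch is
| a hypothetical helper; every token seen by the tee is captured as a
| TokenStreamState and later replayed by the sink, e.g. for a second field):
|
|   import java.io.IOException;
|   import java.io.Reader;
|   import org.apache.lucene.analysis.SinkTokenizer;
|   import org.apache.lucene.analysis.TeeTokenFilter;
|   import org.apache.lucene.analysis.TokenStream;
|   import org.apache.lucene.analysis.WhitespaceTokenizer;
|
|   public class TeeSinkSketch {
|     static void tee(Reader reader) throws IOException {
|       SinkTokenizer sink = new SinkTokenizer(null);
|       TokenStream source = new TeeTokenFilter(new WhitespaceTokenizer(reader), sink);
|       while (source.incrementToken()) {
|         // every token seen here is also captured into 'sink'
|       }
|       while (sink.incrementToken()) {
|         // ... and replayed here, restored from the captured states
|       }
|     }
|   }
|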
| Index: src/java/org/apache/lucene/analysis/Token.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/analysis/Token.java (revision 708658)
|
| +++ src/java/org/apache/lucene/analysis/Token.java (working copy)
|
| @@ -21,7 +21,11 @@
|
| import org.apache.lucene.index.TermPositions; // for javadoc |
| import org.apache.lucene.util.ArrayUtil; |
| |
| -/** A Token is an occurrence of a term from the text of a field. It consists of |
| +/** |
| + This class is deprecated; a new TokenStream API was introduced with Lucene 2.9. |
| + See Javadocs in {@link TokenStream} for further details. |
| + <p> |
| + A Token is an occurrence of a term from the text of a field. It consists of |
| a term's text, the start and end offset of the term in the text of the field, |
| and a type string. |
| <p> |
| @@ -114,6 +118,8 @@
|
| </p> |
| |
| @see org.apache.lucene.index.Payload |
| + @deprecated A new TokenStream API was introduced with Lucene 2.9. |
| + See javadocs in {@link TokenStream} for further details. |
| */ |
| public class Token implements Cloneable { |
| |
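| For comparison, the two consumption styles side by side (ConsumptionSketch is
| a hypothetical helper; ts is any TokenStream):
|
|   import java.io.IOException;
|   import org.apache.lucene.analysis.Token;
|   import org.apache.lucene.analysis.TokenStream;
|   import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
|   public class ConsumptionSketch {
|     // Old, now deprecated: one reusable Token carries all properties.
|     static void oldStyle(TokenStream ts) throws IOException {
|       final Token reusableToken = new Token();
|       for (Token t = ts.next(reusableToken); t != null; t = ts.next(reusableToken)) {
|         // read t.termBuffer(), t.startOffset(), ...
|       }
|     }
|
|     // New: per-property attribute instances, fetched once, refilled per token.
|     static void newStyle(TokenStream ts) throws IOException {
|       TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
|       while (ts.incrementToken()) {
|         // read term.termBuffer(), term.termLength(), ...
|       }
|     }
|   }
|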
| Index: src/java/org/apache/lucene/analysis/tokenattributes/Attribute.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/analysis/tokenattributes/Attribute.java (revision 0)
|
| +++ src/java/org/apache/lucene/analysis/tokenattributes/Attribute.java (revision 0)
|
| @@ -0,0 +1,91 @@
|
| +package org.apache.lucene.analysis.tokenattributes;
|
| +
|
| +/**
|
| + * Licensed to the Apache Software Foundation (ASF) under one or more
|
| + * contributor license agreements. See the NOTICE file distributed with
|
| + * this work for additional information regarding copyright ownership.
|
| + * The ASF licenses this file to You under the Apache License, Version 2.0
|
| + * (the "License"); you may not use this file except in compliance with
|
| + * the License. You may obtain a copy of the License at
|
| + *
|
| + * http://www.apache.org/licenses/LICENSE-2.0
|
| + *
|
| + * Unless required by applicable law or agreed to in writing, software
|
| + * distributed under the License is distributed on an "AS IS" BASIS,
|
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| + * See the License for the specific language governing permissions and
|
| + * limitations under the License.
|
| + */
|
| +
|
| +import java.io.Serializable;
|
| +
|
| +/**
|
| + * Base class for Attributes that can be added to a
|
| + * {@link org.apache.lucene.analysis.AttributeSource}.
|
| + *
|
| + * Attributes are used to deliver information about Tokens from
|
| + * the analyzer to the indexing chain.
|
| + */
|
| +public abstract class Attribute implements Cloneable, Serializable {
|
| + /**
|
| + * Clears the values in this Attribute and resets it to its
|
| + * default value.
|
| + */
|
| + public abstract void clear();
|
| +
|
| + /**
|
| + * Subclasses must implement this method and should follow a syntax
|
| + * similar to this one:
|
| + *
|
| + * <pre>
|
| + * public String toString() {
|
| + * return "start=" + startOffset + ",end=" + endOffset;
|
| + * }
|
| + * </pre>
|
| + */
|
| + public abstract String toString();
|
| +
|
| + public Attribute() {
|
| + // empty ctor used to instantiate by reflection
|
| + }
|
| +
|
| + /**
|
| + * Subclasses must implement this method and should compute
|
| + * a hashCode similar to this:
|
| + * <pre>
|
| + * public int hashCode() {
|
| + * int code = startOffset;
|
| + * code = code * 31 + endOffset;
|
| + * return code;
|
| + * }
|
| + * </pre>
|
| + *
|
| + * @see #equals(Object)
|
| + */
|
| + public abstract int hashCode();
|
| +
|
| + /**
|
| + * All values used for computation of {@link #hashCode()}
|
| + * should be checked here for equality.
|
| + *
|
| + * @see java.lang.Object#equals(java.lang.Object)
|
| + */
|
| + public abstract boolean equals(Object other);
|
| +
|
| + /**
|
| + * Copies the values from this Attribute into the passed-in
|
| + * target attribute. The type of the target must match the type
|
| + * of this attribute.
|
| + */
|
| + public abstract void copyTo(Attribute target);
|
| +
|
| + public Object clone() {
|
| + Object clone = null;
|
| + try {
|
| + clone = super.clone();
|
| + } catch (CloneNotSupportedException e) {
|
| + throw new RuntimeException(e); // shouldn't happen
|
| + }
|
| + return clone;
|
| + }
|
| +}
|
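| A hypothetical custom attribute implementing the contract above: a public
| no-arg constructor for the reflective AttributeSource.addAttribute(), plus
| clear/toString/hashCode/equals/copyTo (LanguageAttribute is an invented name):
|
|   import java.io.Serializable;
|   import org.apache.lucene.analysis.tokenattributes.Attribute;
|
|   public class LanguageAttribute extends Attribute implements Cloneable, Serializable {
|     private String language = "en";
|
|     public String getLanguage() { return language; }
|     public void setLanguage(String language) { this.language = language; }
|
|     public void clear() { language = "en"; }
|     public String toString() { return "language=" + language; }
|     public int hashCode() { return language.hashCode(); }
|     public boolean equals(Object other) {
|       return other instanceof LanguageAttribute
|           && ((LanguageAttribute) other).language.equals(language);
|     }
|     public void copyTo(Attribute target) {
|       ((LanguageAttribute) target).setLanguage(language);
|     }
|   }
|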
| Index: src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java (revision 0)
|
| +++ src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java (revision 0)
|
| @@ -0,0 +1,78 @@
|
| +package org.apache.lucene.analysis.tokenattributes;
|
| +
|
| +/**
|
| + * Licensed to the Apache Software Foundation (ASF) under one or more
|
| + * contributor license agreements. See the NOTICE file distributed with
|
| + * this work for additional information regarding copyright ownership.
|
| + * The ASF licenses this file to You under the Apache License, Version 2.0
|
| + * (the "License"); you may not use this file except in compliance with
|
| + * the License. You may obtain a copy of the License at
|
| + *
|
| + * http://www.apache.org/licenses/LICENSE-2.0
|
| + *
|
| + * Unless required by applicable law or agreed to in writing, software
|
| + * distributed under the License is distributed on an "AS IS" BASIS,
|
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| + * See the License for the specific language governing permissions and
|
| + * limitations under the License.
|
| + */
|
| +
|
| +import java.io.Serializable;
|
| +
|
| +/**
|
| + * This attribute can be used to pass different flags down the tokenizer chain,
|
| + * e.g. from one TokenFilter to another.
|
| + */
|
| +public class FlagsAttribute extends Attribute implements Cloneable, Serializable {
|
| + private int flags = 0;
|
| +
|
| + /**
|
| + * EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long.
|
| + * <p/>
|
| + *
|
| + * Get the bitset for any bits that have been set. This is completely distinct from {@link TypeAttribute#type()}, although they do share similar purposes.
|
| + * The flags can be used to encode information about the token for use by other {@link org.apache.lucene.analysis.TokenFilter}s.
|
| + *
|
| + *
|
| + * @return The bits
|
| + */
|
| + public int getFlags() {
|
| + return flags;
|
| + }
|
| +
|
| + /**
|
| + * @see #getFlags()
|
| + */
|
| + public void setFlags(int flags) {
|
| + this.flags = flags;
|
| + }
|
| +
|
| + public void clear() {
|
| + flags = 0;
|
| + }
|
| +
|
| + public String toString() {
|
| + return "flags=" + flags;
|
| + }
|
| +
|
| + public boolean equals(Object other) {
|
| + if (this == other) {
|
| + return true;
|
| + }
|
| +
|
| + if (other instanceof FlagsAttribute) {
|
| + return ((FlagsAttribute) other).flags == flags;
|
| + }
|
| +
|
| + return false;
|
| + }
|
| +
|
| + public int hashCode() {
|
| + return flags;
|
| + }
|
| +
|
| + public void copyTo(Attribute target) {
|
| + FlagsAttribute t = (FlagsAttribute) target;
|
| + t.setFlags(flags);
|
| + }
|
| +}
|
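| A hypothetical upstream filter using the flags bitset (MarkLongTokensFilter
| and the bit value are invented for illustration); a downstream filter would
| test the same bit via getFlags():
|
|   import java.io.IOException;
|   import org.apache.lucene.analysis.TokenFilter;
|   import org.apache.lucene.analysis.TokenStream;
|   import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|   import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
|   public class MarkLongTokensFilter extends TokenFilter {
|     public static final int LONG_TOKEN_FLAG = 1;
|
|     private TermAttribute termAtt;
|     private FlagsAttribute flagsAtt;
|
|     public MarkLongTokensFilter(TokenStream in) {
|       super(in);
|     }
|
|     public void initialize() throws IOException {
|       termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|       flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
|     }
|
|     public boolean incrementToken() throws IOException {
|       if (!input.incrementToken()) return false;
|       if (termAtt.termLength() > 10) {
|         flagsAtt.setFlags(flagsAtt.getFlags() | LONG_TOKEN_FLAG);
|       }
|       return true;
|     }
|   }
|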
| Index: src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttribute.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttribute.java (revision 0)
|
| +++ src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttribute.java (revision 0)
|
| @@ -0,0 +1,91 @@
|
| +package org.apache.lucene.analysis.tokenattributes;
|
| +
|
| +/**
|
| + * Licensed to the Apache Software Foundation (ASF) under one or more
|
| + * contributor license agreements. See the NOTICE file distributed with
|
| + * this work for additional information regarding copyright ownership.
|
| + * The ASF licenses this file to You under the Apache License, Version 2.0
|
| + * (the "License"); you may not use this file except in compliance with
|
| + * the License. You may obtain a copy of the License at
|
| + *
|
| + * http://www.apache.org/licenses/LICENSE-2.0
|
| + *
|
| + * Unless required by applicable law or agreed to in writing, software
|
| + * distributed under the License is distributed on an "AS IS" BASIS,
|
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| + * See the License for the specific language governing permissions and
|
| + * limitations under the License.
|
| + */
|
| +
|
| +import java.io.Serializable;
|
| +
|
| +/**
|
| + * The start and end character offset of a Token.
|
| + */
|
| +public class OffsetAttribute extends Attribute implements Cloneable, Serializable {
|
| + private int startOffset;
|
| + private int endOffset;
|
| +
|
| + /** Returns this Token's starting offset, the position of the first character
|
| + corresponding to this token in the source text.
|
| +
|
| + Note that the difference between endOffset() and startOffset() may not be
|
| + equal to termText.length(), as the term text may have been altered by a
|
| + stemmer or some other filter. */
|
| + public int startOffset() {
|
| + return startOffset;
|
| + }
|
| +
|
| + /** Set the starting offset.
|
| + @see #startOffset() */
|
| + public void setStartOffset(int offset) {
|
| + this.startOffset = offset;
|
| + }
|
| +
|
| + /** Returns this Token's ending offset, one greater than the position of the
|
| + last character corresponding to this token in the source text. The length
|
| + of the token in the source text is (endOffset - startOffset). */
|
| + public int endOffset() {
|
| + return endOffset;
|
| + }
|
| +
|
| + /** Set the ending offset.
|
| + @see #endOffset() */
|
| + public void setEndOffset(int offset) {
|
| + this.endOffset = offset;
|
| + }
|
| +
|
| + public void clear() {
|
| + startOffset = 0;
|
| + endOffset = 0;
|
| + }
|
| +
|
| + public String toString() {
|
| + return "start=" + startOffset + ",end=" + endOffset;
|
| + }
|
| +
|
| + public boolean equals(Object other) {
|
| + if (other == this) {
|
| + return true;
|
| + }
|
| +
|
| + if (other instanceof OffsetAttribute) {
|
| + OffsetAttribute o = (OffsetAttribute) other;
|
| + return o.startOffset == startOffset && o.endOffset == endOffset;
|
| + }
|
| +
|
| + return false;
|
| + }
|
| +
|
| + public int hashCode() {
|
| + int code = startOffset;
|
| + code = code * 31 + endOffset;
|
| + return code;
|
| + }
|
| +
|
| + public void copyTo(Attribute target) {
|
| + OffsetAttribute t = (OffsetAttribute) target;
|
| + t.setStartOffset(startOffset);
|
| + t.setEndOffset(endOffset);
|
| + }
|
| +}
|
| Index: src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttribute.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttribute.java (revision 0)
|
| +++ src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttribute.java (revision 0)
|
| @@ -0,0 +1,103 @@
|
| +package org.apache.lucene.analysis.tokenattributes;
|
| +
|
| +/**
|
| + * Licensed to the Apache Software Foundation (ASF) under one or more
|
| + * contributor license agreements. See the NOTICE file distributed with
|
| + * this work for additional information regarding copyright ownership.
|
| + * The ASF licenses this file to You under the Apache License, Version 2.0
|
| + * (the "License"); you may not use this file except in compliance with
|
| + * the License. You may obtain a copy of the License at
|
| + *
|
| + * http://www.apache.org/licenses/LICENSE-2.0
|
| + *
|
| + * Unless required by applicable law or agreed to in writing, software
|
| + * distributed under the License is distributed on an "AS IS" BASIS,
|
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| + * See the License for the specific language governing permissions and
|
| + * limitations under the License.
|
| + */
|
| +
|
| +import java.io.Serializable;
|
| +
|
| +import org.apache.lucene.index.Payload;
|
| +
|
| +/**
|
| + * The payload of a Token. See also {@link Payload}.
|
| + */
|
| +public class PayloadAttribute extends Attribute implements Cloneable, Serializable {
|
| + private Payload payload;
|
| +
|
| + /**
|
| + * Initialize this attribute with no payload.
|
| + */
|
| + public PayloadAttribute() {}
|
| +
|
| + /**
|
| + * Initialize this attribute with the given payload.
|
| + */
|
| + public PayloadAttribute(Payload payload) {
|
| + this.payload = payload;
|
| + }
|
| +
|
| + /**
|
| + * Returns this Token's payload.
|
| + */
|
| + public Payload getPayload() {
|
| + return this.payload;
|
| + }
|
| +
|
| + /**
|
| + * Sets this Token's payload.
|
| + */
|
| + public void setPayload(Payload payload) {
|
| + this.payload = payload;
|
| + }
|
| +
|
| + public void clear() {
|
| + payload = null;
|
| + }
|
| +
|
| + public String toString() {
|
| + if (payload == null) {
|
| + return "payload=null";
|
| + }
|
| +
|
| + return "payload=" + payload.toString();
|
| + }
|
| +
|
| + public Object clone() {
|
| + PayloadAttribute clone = (PayloadAttribute) super.clone();
|
| + if (payload != null) {
|
| + clone.payload = (Payload) payload.clone();
|
| + }
|
| + return clone;
|
| + }
|
| +
|
| + public boolean equals(Object other) {
|
| + if (other == this) {
|
| + return true;
|
| + }
|
| +
|
| + if (other instanceof PayloadAttribute) {
|
| + PayloadAttribute o = (PayloadAttribute) other;
|
| + if (o.payload == null || payload == null) {
|
| + return o.payload == null && payload == null;
|
| + }
|
| +
|
| + return o.payload.equals(payload);
|
| + }
|
| +
|
| + return false;
|
| + }
|
| +
|
| + public int hashCode() {
|
| + return (payload == null) ? 0 : payload.hashCode();
|
| + }
|
| +
|
| + public void copyTo(Attribute target) {
|
| + PayloadAttribute t = (PayloadAttribute) target;
|
| + t.setPayload((payload == null) ? null : (Payload) payload.clone());
|
| + }
|
| +
|
| +
|
| +}
|
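| A hypothetical filter attaching a payload to every token (ConstantPayloadFilter
| is an invented name; Payload(byte[]) is the existing constructor of
| org.apache.lucene.index.Payload):
|
|   import java.io.IOException;
|   import org.apache.lucene.analysis.TokenFilter;
|   import org.apache.lucene.analysis.TokenStream;
|   import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|   import org.apache.lucene.index.Payload;
|
|   public class ConstantPayloadFilter extends TokenFilter {
|     private PayloadAttribute payloadAtt;
|     private final byte value;
|
|     public ConstantPayloadFilter(TokenStream in, byte value) {
|       super(in);
|       this.value = value;
|     }
|
|     public void initialize() throws IOException {
|       payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
|     }
|
|     public boolean incrementToken() throws IOException {
|       if (!input.incrementToken()) return false;
|       payloadAtt.setPayload(new Payload(new byte[] { value }));
|       return true;
|     }
|   }
|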
| Index: src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java (revision 0)
|
| +++ src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java (revision 0)
|
| @@ -0,0 +1,99 @@
|
| +package org.apache.lucene.analysis.tokenattributes;
|
| +
|
| +/**
|
| + * Licensed to the Apache Software Foundation (ASF) under one or more
|
| + * contributor license agreements. See the NOTICE file distributed with
|
| + * this work for additional information regarding copyright ownership.
|
| + * The ASF licenses this file to You under the Apache License, Version 2.0
|
| + * (the "License"); you may not use this file except in compliance with
|
| + * the License. You may obtain a copy of the License at
|
| + *
|
| + * http://www.apache.org/licenses/LICENSE-2.0
|
| + *
|
| + * Unless required by applicable law or agreed to in writing, software
|
| + * distributed under the License is distributed on an "AS IS" BASIS,
|
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| + * See the License for the specific language governing permissions and
|
| + * limitations under the License.
|
| + */
|
| +
|
| +import java.io.Serializable;
|
| +
|
| +import org.apache.lucene.analysis.TokenStream;
|
| +
|
| +/** The positionIncrement determines the position of this token
|
| + * relative to the previous Token in a {@link TokenStream}, used in phrase
|
| + * searching.
|
| + *
|
| + * <p>The default value is one.
|
| + *
|
| + * <p>Some common uses for this are:<ul>
|
| + *
|
| + * <li>Set it to zero to put multiple terms in the same position. This is
|
| + * useful if, e.g., a word has multiple stems. Searches for phrases
|
| + * including either stem will match. In this case, all but the first stem's
|
| + * increment should be set to zero: the increment of the first instance
|
| + * should be one. Repeating a token with an increment of zero can also be
|
| + * used to boost the scores of matches on that token.
|
| + *
|
| + * <li>Set it to values greater than one to inhibit exact phrase matches.
|
| + * If, for example, one does not want phrases to match across removed stop
|
| + * words, then one could build a stop word filter that removes stop words and
|
| + * also sets the increment to the number of stop words removed before each
|
| + * non-stop word. Then exact phrase queries will only match when the terms
|
| + * occur with no intervening stop words.
|
| + *
|
| + * </ul>
|
| + * @see org.apache.lucene.index.TermPositions
|
| + */
|
| +public class PositionIncrementAttribute extends Attribute implements Cloneable, Serializable {
|
| + private int positionIncrement = 1;
|
| +
|
| + /** Set the position increment. The default value is one.
|
| + *
|
| + * @param positionIncrement the distance from the prior term
|
| + */
|
| + public void setPositionIncrement(int positionIncrement) {
|
| + if (positionIncrement < 0)
|
| + throw new IllegalArgumentException
|
| + ("Increment must be zero or greater: " + positionIncrement);
|
| + this.positionIncrement = positionIncrement;
|
| + }
|
| +
|
| + /** Returns the position increment of this Token.
|
| + * @see #setPositionIncrement
|
| + */
|
| + public int getPositionIncrement() {
|
| + return positionIncrement;
|
| + }
|
| +
|
| + public void clear() {
|
| + this.positionIncrement = 1;
|
| + }
|
| +
|
| + public String toString() {
|
| + return "positionIncrement=" + positionIncrement;
|
| + }
|
| +
|
| + public boolean equals(Object other) {
|
| + if (other == this) {
|
| + return true;
|
| + }
|
| +
|
| + if (other instanceof PositionIncrementAttribute) {
|
| + return positionIncrement == ((PositionIncrementAttribute) other).positionIncrement;
|
| + }
|
| +
|
| + return false;
|
| + }
|
| +
|
| + public int hashCode() {
|
| + return positionIncrement;
|
| + }
|
| +
|
| + public void copyTo(Attribute target) {
|
| + PositionIncrementAttribute t = (PositionIncrementAttribute) target;
|
| + t.setPositionIncrement(positionIncrement);
|
| + }
|
| +
|
| +}
|
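| A toy illustration of the "multiple terms in the same position" case from the
| javadoc above: after every token, emit one fixed alternative term with a
| position increment of zero (FixedSynonymFilter is an invented name; a real
| synonym filter would consult a dictionary and manage the other attributes too):
|
|   import java.io.IOException;
|   import org.apache.lucene.analysis.TokenFilter;
|   import org.apache.lucene.analysis.TokenStream;
|   import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|   import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
|   public class FixedSynonymFilter extends TokenFilter {
|     private TermAttribute termAtt;
|     private PositionIncrementAttribute posIncrAtt;
|     private boolean injectPending = false;
|     private final String synonym;
|
|     public FixedSynonymFilter(TokenStream in, String synonym) {
|       super(in);
|       this.synonym = synonym;
|     }
|
|     public void initialize() throws IOException {
|       termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|       posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
|     }
|
|     public boolean incrementToken() throws IOException {
|       if (injectPending) {
|         injectPending = false;
|         termAtt.setTermBuffer(synonym);
|         posIncrAtt.setPositionIncrement(0); // stacked on the previous token
|         return true;
|       }
|       if (!input.incrementToken()) return false;
|       injectPending = true;
|       return true;
|     }
|   }
|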
| Index: src/java/org/apache/lucene/analysis/tokenattributes/TermAttribute.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/analysis/tokenattributes/TermAttribute.java (revision 0)
|
| +++ src/java/org/apache/lucene/analysis/tokenattributes/TermAttribute.java (revision 0)
|
| @@ -0,0 +1,245 @@
|
| +package org.apache.lucene.analysis.tokenattributes;
|
| +
|
| +/**
|
| + * Licensed to the Apache Software Foundation (ASF) under one or more
|
| + * contributor license agreements. See the NOTICE file distributed with
|
| + * this work for additional information regarding copyright ownership.
|
| + * The ASF licenses this file to You under the Apache License, Version 2.0
|
| + * (the "License"); you may not use this file except in compliance with
|
| + * the License. You may obtain a copy of the License at
|
| + *
|
| + * http://www.apache.org/licenses/LICENSE-2.0
|
| + *
|
| + * Unless required by applicable law or agreed to in writing, software
|
| + * distributed under the License is distributed on an "AS IS" BASIS,
|
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| + * See the License for the specific language governing permissions and
|
| + * limitations under the License.
|
| + */
|
| +
|
| +import java.io.Serializable;
|
| +
|
| +import org.apache.lucene.util.ArrayUtil;
|
| +
|
| +/**
|
| + * The term text of a Token.
|
| + */
|
| +public class TermAttribute extends Attribute implements Cloneable, Serializable {
|
| + private static final int MIN_BUFFER_SIZE = 10;
|
| +
|
| + private char[] termBuffer;
|
| + private int termLength;
|
| +
|
| + /** Returns the Token's term text.
|
| + *
|
| + * This method has a performance penalty
|
| + * because the text is stored internally in a char[]. If
|
| + * possible, use {@link #termBuffer()} and {@link
|
| + * #termLength()} directly instead. If you really need a
|
| + * String, use this method, which is nothing more than
|
| + * a convenience call to <b>new String(token.termBuffer(), 0, token.termLength())</b>
|
| + */
|
| + public String term() {
|
| + initTermBuffer();
|
| + return new String(termBuffer, 0, termLength);
|
| + }
|
| +
|
| + /** Copies the contents of buffer, starting at offset for
|
| + * length characters, into the termBuffer array.
|
| + * @param buffer the buffer to copy
|
| + * @param offset the index in the buffer of the first character to copy
|
| + * @param length the number of characters to copy
|
| + */
|
| + public void setTermBuffer(char[] buffer, int offset, int length) {
|
| + char[] newCharBuffer = growTermBuffer(length);
|
| + if (newCharBuffer != null) {
|
| + termBuffer = newCharBuffer;
|
| + }
|
| + System.arraycopy(buffer, offset, termBuffer, 0, length);
|
| + termLength = length;
|
| + }
|
| +
|
| + /** Copies the contents of buffer into the termBuffer array.
|
| + * @param buffer the buffer to copy
|
| + */
|
| + public void setTermBuffer(String buffer) {
|
| + int length = buffer.length();
|
| + char[] newCharBuffer = growTermBuffer(length);
|
| + if (newCharBuffer != null) {
|
| + termBuffer = newCharBuffer;
|
| + }
|
| + buffer.getChars(0, length, termBuffer, 0);
|
| + termLength = length;
|
| + }
|
| +
|
| + /** Copies the contents of buffer, starting at offset and continuing
|
| + * for length characters, into the termBuffer array.
|
| + * @param buffer the buffer to copy
|
| + * @param offset the index in the buffer of the first character to copy
|
| + * @param length the number of characters to copy
|
| + */
|
| + public void setTermBuffer(String buffer, int offset, int length) {
|
| + assert offset <= buffer.length();
|
| + assert offset + length <= buffer.length();
|
| + char[] newCharBuffer = growTermBuffer(length);
|
| + if (newCharBuffer != null) {
|
| + termBuffer = newCharBuffer;
|
| + }
|
| + buffer.getChars(offset, offset + length, termBuffer, 0);
|
| + termLength = length;
|
| + }
|
| +
|
| + /** Returns the internal termBuffer character array which
|
| + * you can then directly alter. If the array is too
|
| + * small for your token, use {@link
|
| + * #resizeTermBuffer(int)} to increase it. After
|
| + * altering the buffer be sure to call {@link
|
| + * #setTermLength} to record the number of valid
|
| + * characters that were placed into the termBuffer. */
|
| + public char[] termBuffer() {
|
| + initTermBuffer();
|
| + return termBuffer;
|
| + }
|
| +
|
| + /** Grows the termBuffer to at least size newSize, preserving the
|
| + * existing content. Note: If the next operation is to change
|
| + * the contents of the term buffer use
|
| + * {@link #setTermBuffer(char[], int, int)},
|
| + * {@link #setTermBuffer(String)}, or
|
| + * {@link #setTermBuffer(String, int, int)}
|
| + * to optimally combine the resize with the setting of the termBuffer.
|
| + * @param newSize minimum size of the new termBuffer
|
| + * @return newly created termBuffer with length >= newSize
|
| + */
|
| + public char[] resizeTermBuffer(int newSize) {
|
| + char[] newCharBuffer = growTermBuffer(newSize);
|
| + if (termBuffer == null) {
|
| + // If there were termText, then preserve it.
|
| + // note that if termBuffer is null then newCharBuffer cannot be null
|
| + assert newCharBuffer != null;
|
| + termBuffer = newCharBuffer;
|
| + } else if (newCharBuffer != null) {
|
| + // Note: if newCharBuffer != null then termBuffer needs to grow.
|
| + // If there were a termBuffer, then preserve it
|
| + System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length);
|
| + termBuffer = newCharBuffer;
|
| + }
|
| + return termBuffer;
|
| + }
|
| +
|
| + /** Allocates a buffer char[] of at least newSize
|
| + * @param newSize minimum size of the buffer
|
| + * @return newly created buffer with length >= newSize or null if the current termBuffer is big enough
|
| + */
|
| + private char[] growTermBuffer(int newSize) {
|
| + if (termBuffer != null) {
|
| + if (termBuffer.length >= newSize)
|
| + // Already big enough
|
| + return null;
|
| + else
|
| + // Not big enough; create a new array with slight
|
| + // over allocation:
|
| + return new char[ArrayUtil.getNextSize(newSize)];
|
| + } else {
|
| +
|
| + // determine the best size
|
| + // The buffer is always at least MIN_BUFFER_SIZE
|
| + if (newSize < MIN_BUFFER_SIZE) {
|
| + newSize = MIN_BUFFER_SIZE;
|
| + }
|
| +
|
| + return new char[newSize];
|
| + }
|
| + }
|
| +
|
| + // TODO: once we remove the deprecated termText() method
|
| + // and switch entirely to char[] termBuffer we don't need
|
| + // to use this method anymore
|
| + private void initTermBuffer() {
|
| + if (termBuffer == null) {
|
| + termBuffer = new char[MIN_BUFFER_SIZE];
|
| + termLength = 0;
|
| + }
|
| + }
|
| +
|
| + /** Return number of valid characters (length of the term)
|
| + * in the termBuffer array. */
|
| + public int termLength() {
|
| + initTermBuffer();
|
| + return termLength;
|
| + }
|
| +
|
| + /** Set number of valid characters (length of the term) in
|
| + * the termBuffer array. Use this to truncate the termBuffer
|
| + * or to synchronize with external manipulation of the termBuffer.
|
| + * Note: to grow the size of the array,
|
| + * use {@link #resizeTermBuffer(int)} first.
|
| + * @param length the truncated length
|
| + */
|
| + public void setTermLength(int length) {
|
| + initTermBuffer();
|
| + if (length > termBuffer.length)
|
| + throw new IllegalArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.length + ")");
|
| + termLength = length;
|
| + }
|
| +
|
| + public int hashCode() {
|
| + initTermBuffer();
|
| + int code = termLength;
|
| + code = code * 31 + ArrayUtil.hashCode(termBuffer, 0, termLength);
|
| + return code;
|
| + }
|
| +
|
| + public void clear() {
|
| + termLength = 0;
|
| + }
|
| +
|
| + public Object clone() {
|
| + TermAttribute t = (TermAttribute)super.clone();
|
| + // Do a deep clone
|
| + if (termBuffer != null) {
|
| + t.termBuffer = (char[]) termBuffer.clone();
|
| + }
|
| + return t;
|
| + }
|
| +
|
| + public boolean equals(Object other) {
|
| + if (other == this) {
|
| + return true;
|
| + }
|
| +
|
| +    if (other instanceof TermAttribute) {
|
| +      initTermBuffer();
|
| +      TermAttribute o = ((TermAttribute) other);
|
| +      o.initTermBuffer();
|
| +
|
| +      if (termLength != o.termLength) {
|
| +        return false;
|
| +      }
|
| +      for(int i=0;i<termLength;i++) {
|
| +        if (termBuffer[i] != o.termBuffer[i]) {
|
| +          return false;
|
| +        }
|
| +      }
|
| +      return true;
|
| +    }
|
| +
|
| + return false;
|
| + }
|
| +
|
| + public String toString() {
|
| + initTermBuffer();
|
| + return "term=" + new String(termBuffer, 0, termLength);
|
| + }
|
| +
|
| +  public void copyTo(Attribute target) {
|
| +    // ensure the buffer exists; copyTo may be called before any term was set
|
| +    initTermBuffer();
|
| +    TermAttribute t = (TermAttribute) target;
|
| +    t.setTermBuffer(termBuffer, 0, termLength);
|
| + }
|
| +}
|
| Index: src/java/org/apache/lucene/analysis/tokenattributes/TypeAttribute.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/analysis/tokenattributes/TypeAttribute.java (revision 0)
|
| +++ src/java/org/apache/lucene/analysis/tokenattributes/TypeAttribute.java (revision 0)
|
| @@ -0,0 +1,62 @@
|
| +package org.apache.lucene.analysis.tokenattributes;
|
| +
|
| +import java.io.Serializable;
|
| +
|
| +/**
|
| + * A Token's lexical type. The default value is "word".
|
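| + * <p>
|
| + * For example (a sketch, not part of this patch): a synonym filter
|
| + * might call <code>setType("synonym")</code> on tokens it injects.
|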
| + */
|
| +public class TypeAttribute extends Attribute implements Cloneable, Serializable {
|
| + private String type;
|
| + public static final String DEFAULT_TYPE = "word";
|
| +
|
| + public TypeAttribute() {
|
| + this(DEFAULT_TYPE);
|
| + }
|
| +
|
| + public TypeAttribute(String type) {
|
| + this.type = type;
|
| + }
|
| +
|
| + /** Returns this Token's lexical type. Defaults to "word". */
|
| + public String type() {
|
| + return type;
|
| + }
|
| +
|
| + /** Set the lexical type.
|
| + @see #type() */
|
| + public void setType(String type) {
|
| + this.type = type;
|
| + }
|
| +
|
| + public void clear() {
|
| + type = DEFAULT_TYPE;
|
| + }
|
| +
|
| + public String toString() {
|
| + return "type=" + type;
|
| + }
|
| +
|
| + public boolean equals(Object other) {
|
| + if (other == this) {
|
| + return true;
|
| + }
|
| +
|
| + if (other instanceof TypeAttribute) {
|
| + return type.equals(((TypeAttribute) other).type);
|
| + }
|
| +
|
| + return false;
|
| + }
|
| +
|
| + public int hashCode() {
|
| + return type.hashCode();
|
| + }
|
| +
|
| + public void copyTo(Attribute target) {
|
| + TypeAttribute t = (TypeAttribute) target;
|
| +    t.setType(type); // String is immutable, no defensive copy needed
|
| + }
|
| +}
|
| Index: src/java/org/apache/lucene/analysis/TokenFilter.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/analysis/TokenFilter.java (revision 708658)
|
| +++ src/java/org/apache/lucene/analysis/TokenFilter.java (working copy)
|
| @@ -22,9 +22,31 @@
|
| /** A TokenFilter is a TokenStream whose input is another token stream. |
| <p> |
| This is an abstract class. |
| - NOTE: subclasses must override {@link #next(Token)}. It's |
| - also OK to instead override {@link #next()} but that |
| - method is now deprecated in favor of {@link #next(Token)}. |
| +  NOTE: subclasses must override {@link #initialize()} and |
| +  {@link #incrementToken()} if the new TokenStream API is used, |
| +  or {@link #next(Token)} (alternatively {@link #next()}) if the |
| +  old TokenStream API is used. |
| + <p> |
| + See {@link TokenStream} |
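| +  <p> |
| +  A sketch (illustrative only, not part of this patch) of a filter |
| +  using the new API, lowercasing each term in place: |
| +  <pre> |
| +    private TermAttribute termAtt; |
| + |
| +    public void initialize() throws IOException { |
| +      termAtt = (TermAttribute) addAttribute(TermAttribute.class); |
| +    } |
| + |
| +    public boolean incrementToken() throws IOException { |
| +      if (!input.incrementToken()) return false; |
| +      char[] buf = termAtt.termBuffer(); |
| +      for (int i = 0; i < termAtt.termLength(); i++) { |
| +        buf[i] = Character.toLowerCase(buf[i]); |
| +      } |
| +      return true; |
| +    } |
| +  </pre> |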
| */ |
| public abstract class TokenFilter extends TokenStream { |
| /** The source of tokens for this filter. */ |
| @@ -33,8 +36,14 @@
|
| /** Construct a token stream filtering the given input. */ |
| protected TokenFilter(TokenStream input) { |
| this.input = input; |
| + this.attributes = input.attributes; |
| } |
| - |
| + |
| + public final void start() throws IOException { |
| + input.start(); |
| + initialize(); |
| + } |
| + |
| /** Close the input TokenStream. */ |
| public void close() throws IOException { |
| input.close(); |
| Index: src/java/org/apache/lucene/analysis/TokenStream.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/analysis/TokenStream.java (revision 708658)
|
| +++ src/java/org/apache/lucene/analysis/TokenStream.java (working copy)
|
| @@ -17,10 +17,12 @@
|
| * limitations under the License. |
| */ |
| |
| +import java.io.IOException; |
| +import java.util.Iterator; |
| + |
| +import org.apache.lucene.analysis.tokenattributes.Attribute; |
| import org.apache.lucene.index.Payload; |
| |
| -import java.io.IOException; |
| - |
| /** A TokenStream enumerates the sequence of tokens, either from |
| fields of a document or from query text. |
| <p> |
| @@ -31,13 +33,121 @@
|
| <li>{@link TokenFilter}, a TokenStream |
| whose input is another TokenStream. |
| </ul> |
| - NOTE: subclasses must override {@link #next(Token)}. It's |
| - also OK to instead override {@link #next()} but that |
| - method is now deprecated in favor of {@link #next(Token)}. |
| + A new TokenStream API is introduced with Lucene 2.9. Since |
| +   2.9, Token is deprecated and the preferred way to store |
| + the information of a token is to use {@link Attribute}s. |
| + <p> |
| + For that reason TokenStream extends {@link AttributeSource} |
| + now. Note that only one instance per {@link Attribute} is |
| + created and reused for every token. This approach reduces |
| + object creations and allows local caching of references to |
| + the {@link Attribute}s. See {@link #initialize()} and |
| + {@link #incrementToken()} for further details. |
| + <p> |
| + Sometimes it is desirable to capture a current state of a |
| + TokenStream, e. g. for buffering purposes (see {@link CachingTokenFilter}, |
| +   {@link TeeTokenFilter}/{@link SinkTokenizer}). For this use case |
| + the class {@link TokenStreamState} can be used. |
| + <p> |
| + <b>NOTE:</b> In order to enable the new API the method |
| +   {@link #setUseNewAPI(boolean)} has to be called with useNewAPI=true. |
| + Otherwise the deprecated method {@link #next(Token)} will |
| + be used by Lucene consumers (indexer and queryparser) to |
| + consume the tokens. {@link #next(Token)} will be removed |
| + in Lucene 3.0. |
| + <p> |
| + NOTE: To use the old API subclasses must override {@link #next(Token)}. |
| + It's also OK to instead override {@link #next()} but that |
| + method is slower compared to {@link #next(Token)}. |
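| +   <p> |
| +   Example (a sketch, not part of this patch; <code>analyzer</code> and |
| +   <code>reader</code> are given) of consuming a stream with the new API: |
| +   <pre> |
| +   TokenStream.setUseNewAPI(true); |
| +   TokenStream stream = analyzer.tokenStream("field", reader); |
| +   TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class); |
| +   stream.start(); |
| +   while (stream.incrementToken()) { |
| +     System.out.println(termAtt.term()); |
| +   } |
| +   stream.close(); |
| +   </pre> |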
| */ |
| |
| -public abstract class TokenStream { |
| +public abstract class TokenStream extends AttributeSource { |
| + private static boolean useNewAPI = false; |
| + |
| + /** |
| + * Returns whether or not the new TokenStream APIs are used |
| + * (see {@link #incrementToken()}, {@link AttributeSource}). |
| + */ |
| + public static boolean useNewAPI() { |
| + return useNewAPI; |
| + } |
| |
| + /** |
| + * Use this API to enable or disable the new TokenStream API. |
| + * (see {@link #incrementToken()}, {@link AttributeSource}). |
| + * <p> |
| + * If set to true, the indexer will call {@link #start()} |
| + * and {@link #incrementToken()} to consume Tokens from this |
| + * stream. |
| + * <p> |
| + * If set to false, the indexer will call {@link #next(Token)} |
| + * instead. |
| + */ |
| + public static void setUseNewAPI(boolean use) { |
| + useNewAPI = use; |
| + } |
| + |
| + /** |
| + * Consumers of the stream must call this method before calling |
| + * {@link #incrementToken()} for the first time to initialize |
| + * this stream. |
| + */ |
| + public void start() throws IOException { |
| + initialize(); |
| + } |
| + |
| + /** |
| +   * This method does nothing by default. Subclasses should |
| +   * implement this and call {@link #addAttribute(Class)} or |
| +   * {@link #getAttribute(Class)} to store local references to |
| +   * attributes. See {@link #incrementToken()} for more information. |
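| +   * <p> |
| +   * For example (a sketch): |
| +   * <pre> |
| +   *   private TermAttribute termAtt; |
| +   * |
| +   *   public void initialize() throws IOException { |
| +   *     termAtt = (TermAttribute) addAttribute(TermAttribute.class); |
| +   *   } |
| +   * </pre> |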
| + */ |
| + public void initialize() throws IOException {} |
| + |
| + /** |
| + * Consumers (e. g. the indexer) use this method to advance the stream |
| + * to the next token. Implementing classes must implement this method |
| + * and update the appropriate {@link Attribute}s with content of the |
| + * next token. |
| + * <p> |
| + * This method is called for every token of a document, so an efficient |
| + * implementation is crucial for good performance. To avoid calls to |
| + * {@link #addAttribute(Class)} and {@link #getAttribute(Class)} and |
| + * downcasts, references to all {@link Attribute}s that this stream uses |
| +   * should be cached in {@link #initialize()}. |
| + * |
| + * @return false for end of stream; true otherwise |
| + * |
| + * <p> |
| +   * <b>Note that this method will be defined abstract in Lucene 3.0.</b> |
| + */ |
| + public boolean incrementToken() throws IOException { |
| + // subclasses must implement this method; will be made abstract in Lucene 3.0 |
| + return false; |
| + } |
| + |
| /** Returns the next token in the stream, or null at EOS. |
| * @deprecated The returned Token is a "full private copy" (not |
| * re-used across calls to next()) but will be slower |
| @@ -84,6 +172,8 @@
|
| * is not required to check for null before using it, but it is a |
| * good idea to assert that it is not null.) |
| * @return next token in the stream or null if end-of-stream was hit |
| + * @deprecated The new {@link #incrementToken()} and {@link AttributeSource} |
| + * APIs should be used instead. See also {@link #useNewAPI()}. |
| */ |
| public Token next(final Token reusableToken) throws IOException { |
| // We don't actually use inputToken, but still add this assert |
| @@ -107,4 +197,25 @@
|
| |
| /** Releases resources associated with this stream. */ |
| public void close() throws IOException {} |
| + |
| + public String toString() { |
| + StringBuffer sb = new StringBuffer(); |
| + sb.append('('); |
| + |
| + if (hasAttributes()) { |
| + // TODO Java 1.5 |
| + //Iterator<Attribute> it = attributes.values().iterator(); |
| + Iterator it = getAttributesIterator(); |
| + if (it.hasNext()) { |
| + sb.append(it.next().toString()); |
| + } |
| + while (it.hasNext()) { |
| + sb.append(','); |
| + sb.append(it.next().toString()); |
| + } |
| + } |
| + sb.append(')'); |
| + return sb.toString(); |
| + } |
| + |
| } |
| Index: src/java/org/apache/lucene/analysis/TokenStreamState.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/analysis/TokenStreamState.java (revision 0)
|
| +++ src/java/org/apache/lucene/analysis/TokenStreamState.java (revision 0)
|
| @@ -0,0 +1,109 @@
|
| +package org.apache.lucene.analysis; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.util.Iterator; |
| + |
| +import org.apache.lucene.analysis.tokenattributes.Attribute; |
| + |
| +/** |
| + * This class can be used to capture a certain state of a {@link TokenStream}. |
| + * This is useful for buffering use cases (see {@link CachingTokenFilter}, |
| + * {@link TeeTokenFilter}/{@link SinkTokenizer}). |
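| + * <p> |
| + * Typical usage (a sketch): |
| + * <pre> |
| + *   TokenStreamState state = TokenStreamState.capture(stream); |
| + *   // ... the stream advances to later tokens ... |
| + *   state.restore(stream); |
| + * </pre> |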
| + */ |
| +public class TokenStreamState extends AttributeSource { |
| + private TokenStreamState(TokenStream input, AttributeAcceptor acceptor) { |
| + Iterator it = input.getAttributesIterator(); |
| + while(it.hasNext()) { |
| + Attribute att = (Attribute) it.next(); |
| + if (acceptor.accept(att.getClass())) { |
| + Attribute clone = (Attribute) att.clone(); |
| + this.attributes.put(att.getClass(), clone); |
| + } |
| + } |
| + } |
| + |
| + public TokenStreamState() {} |
| + |
| + /** |
| + * Captures the current state of the passed in TokenStream. |
| + * <p> |
| + * This state will contain all of the passed in TokenStream's |
| + * {@link Attribute}s. If only a subset of the attributes is needed |
| + * please use {@link #capture(TokenStream, AttributeAcceptor)} |
| + */ |
| + public static TokenStreamState capture(TokenStream from) { |
| + return new TokenStreamState(from, AllAcceptor); |
| + } |
| + |
| + /** |
| + * Captures the current state of the passed in TokenStream. |
| + * <p> |
| + * This state will contain all of the passed in TokenStream's |
| + * {@link Attribute}s which the {@link AttributeAcceptor} accepts. |
| + */ |
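| +   * <p> |
| +   * For example (a sketch), to capture only the TermAttribute: |
| +   * <pre> |
| +   *   TokenStreamState state = TokenStreamState.capture(stream, |
| +   *     new AttributeSource.AttributeAcceptor() { |
| +   *       public boolean accept(Class attClass) { |
| +   *         return attClass == TermAttribute.class; |
| +   *       } |
| +   *     }); |
| +   * </pre> |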
| + public static TokenStreamState capture(TokenStream from, AttributeAcceptor acceptor) { |
| + return new TokenStreamState(from, acceptor); |
| + } |
| + |
| + /** |
| + * Restores this state by copying the values of all attributes |
| + * that this state contains into the attributes of the targetStream. |
| + * The targetStream must contain a corresponding instance for each argument |
| + * contained in this state. |
| + * <p> |
| + * Note that this method does not affect attributes of the targetStream |
| + * that are not contained in this state. In other words, if for example |
| + * the targetStream contains an OffsetAttribute, but this state doesn't, then |
| + * the value of the OffsetAttribute remains unchanged. It might be desirable to |
| + * reset its value to the default, in which case the caller should first |
| + * call {@link TokenStream#clearAttributes()} on the targetStream. |
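| +   * <p> |
| +   * For example (a sketch): |
| +   * <pre> |
| +   *   targetStream.clearAttributes();  // optional: reset values to defaults first |
| +   *   state.restore(targetStream); |
| +   * </pre> |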
| + */ |
| + public void restore(TokenStream targetStream) { |
| + Iterator it = getAttributesIterator(); |
| + while (it.hasNext()) { |
| + Attribute att = (Attribute) it.next(); |
| + Attribute targetAtt = targetStream.getAttribute(att.getClass()); |
| + att.copyTo(targetAtt); |
| + } |
| + } |
| + |
| +} |
|
|
| Property changes on: src\java\org\apache\lucene\analysis\TokenStreamState.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: src/java/org/apache/lucene/index/DocInverter.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/index/DocInverter.java (revision 708658)
|
| +++ src/java/org/apache/lucene/index/DocInverter.java (working copy)
|
| @@ -17,13 +17,15 @@
|
| * limitations under the License. |
| */ |
| |
| -import java.util.Map; |
| +import java.io.IOException; |
| +import java.util.Collection; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| -import java.util.Collection; |
| import java.util.Iterator; |
| -import java.io.IOException; |
| +import java.util.Map; |
| |
| +import org.apache.lucene.analysis.AttributeSource; |
| + |
| /** This is a DocFieldConsumer that inverts each field, |
| * separately, from a Document, and accepts a |
| * InvertedTermsConsumer to process those terms. */ |
| @@ -98,12 +100,14 @@
|
| int length; |
| int offset; |
| float boost; |
| + AttributeSource attributeSource; |
| |
| void reset(float docBoost) { |
| position = 0; |
| length = 0; |
| offset = 0; |
| boost = docBoost; |
| + attributeSource = null; |
| } |
| } |
| } |
| Index: src/java/org/apache/lucene/index/DocInverterPerField.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/index/DocInverterPerField.java (revision 708658)
|
| +++ src/java/org/apache/lucene/index/DocInverterPerField.java (working copy)
|
| @@ -22,6 +22,8 @@
|
| import org.apache.lucene.document.Fieldable; |
| import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| |
| /** |
| * Holds state for inverting all occurrences of a single |
| @@ -79,10 +81,14 @@
|
| if (!field.isTokenized()) { // un-tokenized field |
| String stringValue = field.stringValue(); |
| final int valueLength = stringValue.length(); |
| - Token token = perThread.localToken.reinit(stringValue, fieldState.offset, fieldState.offset + valueLength); |
| + perThread.localToken.reinit(stringValue, fieldState.offset, fieldState.offset + valueLength); |
| + fieldState.attributeSource = perThread.localTokenStream; |
| + perThread.localTokenStream.set(perThread.localToken); |
| + perThread.localTokenStream.start(); |
| + consumer.start(field); |
| boolean success = false; |
| try { |
| - consumer.add(token); |
| + consumer.add(); |
| success = true; |
| } finally { |
| if (!success) |
| @@ -122,7 +128,30 @@
|
| |
| try { |
| int offsetEnd = fieldState.offset-1; |
| - final Token localToken = perThread.localToken; |
| + |
| + boolean useNewTokenStreamAPI = TokenStream.useNewAPI(); |
| + Token localToken = null; |
| + OffsetAttribute offsetAttribute = null; |
| + PositionIncrementAttribute posIncrAttribute = null; |
| + |
| + if (useNewTokenStreamAPI) { |
| + fieldState.attributeSource = stream; |
| + stream.start(); |
| + } else { |
| + fieldState.attributeSource = perThread.localTokenStream; |
| + localToken = perThread.localToken; |
| + perThread.localTokenStream.start(); |
| + } |
| + |
| + consumer.start(field); |
| + |
| + if (fieldState.attributeSource.hasAttribute(OffsetAttribute.class)) { |
| + offsetAttribute = (OffsetAttribute) fieldState.attributeSource.getAttribute(OffsetAttribute.class); |
| + } |
| + if (fieldState.attributeSource.hasAttribute(PositionIncrementAttribute.class)) { |
| + posIncrAttribute = (PositionIncrementAttribute) fieldState.attributeSource.getAttribute(PositionIncrementAttribute.class); |
| + } |
| + |
| for(;;) { |
| |
| // If we hit an exception in stream.next below |
| @@ -131,10 +160,26 @@
|
| // non-aborting and (above) this one document |
| // will be marked as deleted, but still |
| // consume a docID |
| - Token token = stream.next(localToken); |
| + Token token = null; |
| + if (useNewTokenStreamAPI) { |
| + if (!stream.incrementToken()) break; |
| + } else { |
| + token = stream.next(localToken); |
| + if (token == null) break; |
| + perThread.localTokenStream.set(token); |
| + } |
| + |
| + int positionIncrement = 1; |
| + int endOffset = 0; |
| |
| - if (token == null) break; |
| - fieldState.position += (token.getPositionIncrement() - 1); |
| + if (posIncrAttribute != null) { |
| + positionIncrement = posIncrAttribute.getPositionIncrement(); |
| + } |
| + if (offsetAttribute != null) { |
| + endOffset = offsetAttribute.endOffset(); |
| + } |
| + |
| + fieldState.position += (positionIncrement - 1); |
| boolean success = false; |
| try { |
| // If we hit an exception in here, we abort |
| @@ -143,14 +188,14 @@
|
| // internal state of the consumer is now |
| // corrupt and should not be flushed to a |
| // new segment: |
| - consumer.add(token); |
| + consumer.add(); |
| success = true; |
| } finally { |
| if (!success) |
| docState.docWriter.setAborting(); |
| } |
| fieldState.position++; |
| - offsetEnd = fieldState.offset + token.endOffset(); |
| + offsetEnd = fieldState.offset + endOffset; |
| if (++fieldState.length >= maxFieldLength) { |
| if (docState.infoStream != null) |
| docState.infoStream.println("maxFieldLength " +maxFieldLength+ " reached for field " + fieldInfo.name + ", ignoring following tokens"); |
| Index: src/java/org/apache/lucene/index/DocInverterPerThread.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/index/DocInverterPerThread.java (revision 708658)
|
| +++ src/java/org/apache/lucene/index/DocInverterPerThread.java (working copy)
|
| @@ -18,8 +18,19 @@
|
| */ |
| |
| import java.io.IOException; |
| +import java.util.LinkedHashMap; |
| +import java.util.Map; |
| |
| +import org.apache.lucene.analysis.AttributeSource; |
| import org.apache.lucene.analysis.Token; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.tokenattributes.Attribute; |
| +import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; |
| |
| /** This is a DocFieldConsumer that inverts each field, |
| * separately, from a Document, and accepts a |
| @@ -30,6 +41,103 @@
|
| final InvertedDocConsumerPerThread consumer; |
| final InvertedDocEndConsumerPerThread endConsumer; |
| final Token localToken = new Token(); |
| + final BackwardsCompatibilityStream localTokenStream = new BackwardsCompatibilityStream(); |
| + |
| + /** This stream wrapper is only used to maintain backwards compatibility with the |
| + * old TokenStream API and can be removed in Lucene 3.0 |
| + * @deprecated |
| + */ |
| + static class BackwardsCompatibilityStream extends TokenStream { |
| + private Token token; |
| + |
| + TermAttribute termAttribute = new TermAttribute() { |
| + public String term() { |
| + return token.term(); |
| + } |
| + |
| + public char[] termBuffer() { |
| + return token.termBuffer(); |
| + } |
| + |
| + public int termLength() { |
| + return token.termLength(); |
| + } |
| + }; |
| + OffsetAttribute offsetAttribute = new OffsetAttribute() { |
| + public int startOffset() { |
| + return token.startOffset(); |
| + } |
| + |
| + public int endOffset() { |
| + return token.endOffset(); |
| + } |
| + }; |
| + |
| + PositionIncrementAttribute positionIncrementAttribute = new PositionIncrementAttribute() { |
| + public int getPositionIncrement() { |
| + return token.getPositionIncrement(); |
| + } |
| + }; |
| + |
| + FlagsAttribute flagsAttribute = new FlagsAttribute() { |
| + public int getFlags() { |
| + return token.getFlags(); |
| + } |
| + }; |
| + |
| + PayloadAttribute payloadAttribute = new PayloadAttribute() { |
| + public Payload getPayload() { |
| + return token.getPayload(); |
| + } |
| + }; |
| + |
| + TypeAttribute typeAttribute = new TypeAttribute() { |
| + public String type() { |
| + return token.type(); |
| + } |
| + }; |
| + |
| + BackwardsCompatibilityStream() { |
| + attributes.put(TermAttribute.class, termAttribute); |
| + attributes.put(OffsetAttribute.class, offsetAttribute); |
| + attributes.put(PositionIncrementAttribute.class, positionIncrementAttribute); |
| + attributes.put(FlagsAttribute.class, flagsAttribute); |
| + attributes.put(PayloadAttribute.class, payloadAttribute); |
| + attributes.put(TypeAttribute.class, typeAttribute); |
| + } |
| + |
| + public Attribute addAttribute(Class attClass) { |
| + Attribute att = (Attribute) attributes.get(attClass); |
| + if (att == null) { |
| + return super.addAttribute(attClass); |
| + } |
| + return att; |
| + } |
| + |
| + public boolean hasAttribute(Class attClass) { |
| + return this.attributes.containsKey(attClass) || super.hasAttribute(attClass); |
| + } |
| + |
| + public Attribute getAttribute(Class attClass) { |
| + Attribute att = (Attribute) this.attributes.get(attClass); |
| + if (att == null) { |
| + return super.getAttribute(attClass); |
| + } |
| + |
| + return att; |
| + } |
| + |
| + public AttributeSource reinit(String stringValue, int startOffset, int endOffset) { |
| + termAttribute.setTermBuffer(stringValue); |
| + offsetAttribute.setStartOffset(startOffset); |
| + offsetAttribute.setEndOffset(endOffset); |
| + return this; |
| + } |
| + |
| + public void set(Token token) { |
| + this.token = token; |
| + } |
| +  } |
| final DocumentsWriter.DocState docState; |
| |
| final DocInverter.FieldInvertState fieldState = new DocInverter.FieldInvertState(); |
| Index: src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java (revision 708658)
|
| +++ src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java (working copy)
|
| @@ -19,7 +19,7 @@
|
| |
| import java.io.IOException; |
| import org.apache.lucene.document.Fieldable; |
| -import org.apache.lucene.analysis.Token; |
| +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; |
| |
| // TODO: break into separate freq and prox writers as |
| // codecs; make separate container (tii/tis/skip/*) that can |
| @@ -32,6 +32,8 @@
|
| final DocumentsWriter.DocState docState; |
| final DocInverter.FieldInvertState fieldState; |
| boolean omitTf; |
| + |
| + PayloadAttribute payloadAttribute; |
| |
| public FreqProxTermsWriterPerField(TermsHashPerField termsHashPerField, FreqProxTermsWriterPerThread perThread, FieldInfo fieldInfo) { |
| this.termsHashPerField = termsHashPerField; |
| @@ -53,7 +55,7 @@
|
| |
| boolean hasPayloads; |
| |
| - void skippingLongTerm(Token t) throws IOException {} |
| + void skippingLongTerm() throws IOException {} |
| |
| public int compareTo(Object other0) { |
| FreqProxTermsWriterPerField other = (FreqProxTermsWriterPerField) other0; |
| @@ -64,6 +66,7 @@
|
| // Record, up front, whether our in-RAM format will be |
| // with or without term freqs: |
| omitTf = fieldInfo.omitTf; |
| + payloadAttribute = null; |
| } |
| |
| boolean start(Fieldable[] fields, int count) { |
| @@ -72,9 +75,23 @@
|
| return true; |
| return false; |
| } |
| + |
| + void start(Fieldable f) { |
| + if (fieldState.attributeSource.hasAttribute(PayloadAttribute.class)) { |
| + payloadAttribute = (PayloadAttribute) fieldState.attributeSource.getAttribute(PayloadAttribute.class); |
| + } else { |
| + payloadAttribute = null; |
| + } |
| + } |
| |
| - final void writeProx(Token t, FreqProxTermsWriter.PostingList p, int proxCode) { |
| - final Payload payload = t.getPayload(); |
| + final void writeProx(FreqProxTermsWriter.PostingList p, int proxCode) { |
| + final Payload payload; |
| + if (payloadAttribute == null) { |
| + payload = null; |
| + } else { |
| + payload = payloadAttribute.getPayload(); |
| + } |
| + |
| if (payload != null && payload.length > 0) { |
| termsHashPerField.writeVInt(1, (proxCode<<1)|1); |
| termsHashPerField.writeVInt(1, payload.length); |
| @@ -85,7 +102,7 @@
|
| p.lastPosition = fieldState.position; |
| } |
| |
| - final void newTerm(Token t, RawPostingList p0) { |
| + final void newTerm(RawPostingList p0) { |
| // First time we're seeing this term since the last |
| // flush |
| assert docState.testPoint("FreqProxTermsWriterPerField.newTerm start"); |
| @@ -96,11 +113,11 @@
|
| } else { |
| p.lastDocCode = docState.docID << 1; |
| p.docFreq = 1; |
| - writeProx(t, p, fieldState.position); |
| + writeProx(p, fieldState.position); |
| } |
| } |
| |
| - final void addTerm(Token t, RawPostingList p0) { |
| + final void addTerm(RawPostingList p0) { |
| |
| assert docState.testPoint("FreqProxTermsWriterPerField.addTerm start"); |
| |
| @@ -132,10 +149,10 @@
|
| p.docFreq = 1; |
| p.lastDocCode = (docState.docID - p.lastDocID) << 1; |
| p.lastDocID = docState.docID; |
| - writeProx(t, p, fieldState.position); |
| + writeProx(p, fieldState.position); |
| } else { |
| p.docFreq++; |
| - writeProx(t, p, fieldState.position-p.lastPosition); |
| + writeProx(p, fieldState.position-p.lastPosition); |
| } |
| } |
| } |
| Index: src/java/org/apache/lucene/index/InvertedDocConsumerPerField.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/index/InvertedDocConsumerPerField.java (revision 708658)
|
| +++ src/java/org/apache/lucene/index/InvertedDocConsumerPerField.java (working copy)
|
| @@ -17,10 +17,10 @@
|
| * limitations under the License. |
| */ |
| |
| -import org.apache.lucene.document.Fieldable; |
| -import org.apache.lucene.analysis.Token; |
| import java.io.IOException; |
| |
| +import org.apache.lucene.document.Fieldable; |
| + |
| abstract class InvertedDocConsumerPerField { |
| |
| // Called once per field, and is given all Fieldable |
| @@ -29,8 +29,11 @@
|
| // fields: |
| abstract boolean start(Fieldable[] fields, int count) throws IOException; |
| |
| + // Called before a field instance is being processed |
| + abstract void start(Fieldable field); |
| + |
| // Called once per inverted token |
| - abstract void add(Token token) throws IOException; |
| + abstract void add() throws IOException; |
| |
| // Called once per field per document, after all Fieldable |
| // occurrences are inverted |
| Index: src/java/org/apache/lucene/index/Payload.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/index/Payload.java (revision 708658)
|
| +++ src/java/org/apache/lucene/index/Payload.java (working copy)
|
| @@ -19,7 +19,6 @@
|
| |
| import java.io.Serializable; |
| |
| -import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.util.ArrayUtil; |
| |
| @@ -29,7 +28,7 @@
|
| * specific term. |
| * <p> |
| * To store payloads in the index a {@link TokenStream} has to be used that |
| - * produces {@link Token}s containing payload data. |
| + * produces payload data. |
| * <p> |
| * Use {@link TermPositions#getPayloadLength()} and {@link TermPositions#getPayload(byte[], int)} |
| * to retrieve the payloads from the index.<br> |
| Index: src/java/org/apache/lucene/index/TermsHashConsumerPerField.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/index/TermsHashConsumerPerField.java (revision 708658)
|
| +++ src/java/org/apache/lucene/index/TermsHashConsumerPerField.java (working copy)
|
| @@ -23,14 +23,15 @@
|
| * multiple streams for each unique Token. */ |
| |
| import java.io.IOException; |
| + |
| import org.apache.lucene.document.Fieldable; |
| -import org.apache.lucene.analysis.Token; |
| |
| abstract class TermsHashConsumerPerField { |
| abstract boolean start(Fieldable[] fields, int count) throws IOException; |
| abstract void finish() throws IOException; |
| - abstract void skippingLongTerm(Token t) throws IOException; |
| - abstract void newTerm(Token t, RawPostingList p) throws IOException; |
| - abstract void addTerm(Token t, RawPostingList p) throws IOException; |
| + abstract void skippingLongTerm() throws IOException; |
| + abstract void start(Fieldable field); |
| + abstract void newTerm(RawPostingList p) throws IOException; |
| + abstract void addTerm(RawPostingList p) throws IOException; |
| abstract int getStreamCount(); |
| } |
| Index: src/java/org/apache/lucene/index/TermsHashPerField.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/index/TermsHashPerField.java (revision 708658)
|
| +++ src/java/org/apache/lucene/index/TermsHashPerField.java (working copy)
|
| @@ -20,8 +20,8 @@
|
| import java.io.IOException; |
| import java.util.Arrays; |
| |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| import org.apache.lucene.document.Fieldable; |
| -import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.util.UnicodeUtil; |
| |
| final class TermsHashPerField extends InvertedDocConsumerPerField { |
| @@ -31,6 +31,8 @@
|
| final TermsHashPerThread perThread; |
| final DocumentsWriter.DocState docState; |
| final DocInverter.FieldInvertState fieldState; |
| + |
| + TermAttribute termAtt; |
| |
| // Copied from our perThread |
| final CharBlockPool charPool; |
| @@ -49,7 +51,7 @@
|
| private int postingsHashMask = postingsHashSize-1; |
| private RawPostingList[] postingsHash = new RawPostingList[postingsHashSize]; |
| private RawPostingList p; |
| - |
| + |
| public TermsHashPerField(DocInverterPerField docInverterPerField, final TermsHashPerThread perThread, final TermsHashPerThread nextPerThread, final FieldInfo fieldInfo) { |
| this.perThread = perThread; |
| intPool = perThread.intPool; |
| @@ -247,6 +249,14 @@
|
| private boolean doCall; |
| private boolean doNextCall; |
| |
| + void start(Fieldable f) { |
| + termAtt = (TermAttribute) fieldState.attributeSource.addAttribute(TermAttribute.class); |
| + consumer.start(f); |
| + if (nextPerField != null) { |
| + nextPerField.start(f); |
| + } |
| + } |
| + |
| boolean start(Fieldable[] fields, int count) throws IOException { |
| doCall = consumer.start(fields, count); |
| if (nextPerField != null) |
| @@ -257,7 +267,7 @@
|
| // Secondary entry point (for 2nd & subsequent TermsHash), |
| // because token text has already been "interned" into |
| // textStart, so we hash by textStart |
| - public void add(Token token, int textStart) throws IOException { |
| + public void add(int textStart) throws IOException { |
| |
| int code = textStart; |
| |
| @@ -320,17 +330,17 @@
|
| } |
| p.byteStart = intUptos[intUptoStart]; |
| |
| - consumer.newTerm(token, p); |
| + consumer.newTerm(p); |
| |
| } else { |
| intUptos = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT]; |
| intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK; |
| - consumer.addTerm(token, p); |
| + consumer.addTerm(p); |
| } |
| } |
| |
| // Primary entry point (for first TermsHash) |
| - void add(Token token) throws IOException { |
| + void add() throws IOException { |
| |
| assert !postingsCompacted; |
| |
| @@ -338,8 +348,8 @@
|
| // term text into textStart address |
| |
| // Get the text of this term. |
| - final char[] tokenText = token.termBuffer(); |
| - final int tokenTextLen = token.termLength(); |
| +      final char[] tokenText = termAtt.termBuffer(); |
| + final int tokenTextLen = termAtt.termLength(); |
| |
| // Compute hashcode & replace any invalid UTF16 sequences |
| int downto = tokenTextLen; |
| @@ -403,7 +413,7 @@
|
| if (docState.maxTermPrefix == null) |
| docState.maxTermPrefix = new String(tokenText, 0, 30); |
| |
| - consumer.skippingLongTerm(token); |
| + consumer.skippingLongTerm(); |
| return; |
| } |
| charPool.nextBuffer(); |
| @@ -450,16 +460,16 @@
|
| } |
| p.byteStart = intUptos[intUptoStart]; |
| |
| - consumer.newTerm(token, p); |
| + consumer.newTerm(p); |
| |
| } else { |
| intUptos = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT]; |
| intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK; |
| - consumer.addTerm(token, p); |
| + consumer.addTerm(p); |
| } |
| |
| if (doNextCall) |
| - nextPerField.add(token, p.textStart); |
| + nextPerField.add(p.textStart); |
| } |
| |
| int[] intUptos; |
| Index: src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (revision 708658)
|
| +++ src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (working copy)
|
| @@ -18,10 +18,11 @@
|
| */ |
| |
| import java.io.IOException; |
| -import org.apache.lucene.util.UnicodeUtil; |
| -import org.apache.lucene.analysis.Token; |
| + |
| +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.document.Fieldable; |
| import org.apache.lucene.store.IndexOutput; |
| +import org.apache.lucene.util.UnicodeUtil; |
| |
| final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField { |
| |
| @@ -37,7 +38,8 @@
|
| boolean doVectorOffsets; |
| |
| int maxNumPostings; |
| - |
| + OffsetAttribute offsetAttribute = null; |
| + |
| public TermVectorsTermsWriterPerField(TermsHashPerField termsHashPerField, TermVectorsTermsWriterPerThread perThread, FieldInfo fieldInfo) { |
| this.termsHashPerField = termsHashPerField; |
| this.perThread = perThread; |
| @@ -191,8 +193,16 @@
|
| termsHashPerField.shrinkHash(maxNumPostings); |
| maxNumPostings = 0; |
| } |
| + |
| + void start(Fieldable f) { |
| + if (doVectorOffsets && fieldState.attributeSource.hasAttribute(OffsetAttribute.class)) { |
| + offsetAttribute = (OffsetAttribute) fieldState.attributeSource.getAttribute(OffsetAttribute.class); |
| + } else { |
| + offsetAttribute = null; |
| + } |
| + } |
| |
| - void newTerm(Token t, RawPostingList p0) { |
| + void newTerm(RawPostingList p0) { |
| |
| assert docState.testPoint("TermVectorsTermsWriterPerField.newTerm start"); |
| |
| @@ -201,8 +211,9 @@
|
| p.freq = 1; |
| |
| if (doVectorOffsets) { |
| - final int startOffset = fieldState.offset + t.startOffset(); |
| - final int endOffset = fieldState.offset + t.endOffset(); |
| +      int startOffset = fieldState.offset + offsetAttribute.startOffset(); |
| + int endOffset = fieldState.offset + offsetAttribute.endOffset(); |
| + |
| termsHashPerField.writeVInt(1, startOffset); |
| termsHashPerField.writeVInt(1, endOffset - startOffset); |
| p.lastOffset = endOffset; |
| @@ -214,7 +225,7 @@
|
| } |
| } |
| |
| - void addTerm(Token t, RawPostingList p0) { |
| + void addTerm(RawPostingList p0) { |
| |
| assert docState.testPoint("TermVectorsTermsWriterPerField.addTerm start"); |
| |
| @@ -222,8 +233,9 @@
|
| p.freq++; |
| |
| if (doVectorOffsets) { |
| - final int startOffset = fieldState.offset + t.startOffset(); |
| - final int endOffset = fieldState.offset + t.endOffset(); |
| +      int startOffset = fieldState.offset + offsetAttribute.startOffset(); |
| + int endOffset = fieldState.offset + offsetAttribute.endOffset(); |
| + |
| termsHashPerField.writeVInt(1, startOffset - p.lastOffset); |
| termsHashPerField.writeVInt(1, endOffset - startOffset); |
| p.lastOffset = endOffset; |
| @@ -235,5 +247,5 @@
|
| } |
| } |
| |
| - void skippingLongTerm(Token t) {} |
| + void skippingLongTerm() {} |
| } |
| Index: src/java/org/apache/lucene/queryParser/QueryParser.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/queryParser/QueryParser.java (revision 708658)
|
| +++ src/java/org/apache/lucene/queryParser/QueryParser.java (working copy)
|
| @@ -3,8 +3,8 @@
|
| |
| import java.io.IOException; |
| import java.io.StringReader; |
| +import java.text.Collator; |
| import java.text.DateFormat; |
| -import java.text.Collator; |
| import java.util.ArrayList; |
| import java.util.Calendar; |
| import java.util.Date; |
| @@ -15,7 +15,10 @@
|
| import java.util.Vector; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.CachingTokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| import org.apache.lucene.document.DateField; |
| import org.apache.lucene.document.DateTools; |
| import org.apache.lucene.index.Term; |
| @@ -508,48 +511,126 @@
|
| // PhraseQuery, or nothing based on the term count |
| |
| TokenStream source = analyzer.tokenStream(field, new StringReader(queryText)); |
| - List list = new ArrayList(); |
| - final org.apache.lucene.analysis.Token reusableToken = new org.apache.lucene.analysis.Token(); |
| - org.apache.lucene.analysis.Token nextToken; |
| - int positionCount = 0; |
| - boolean severalTokensAtSamePosition = false; |
| + CachingTokenFilter buffer = new CachingTokenFilter(source); |
| + TermAttribute termAtt = null; |
| + PositionIncrementAttribute posIncrAtt = null; |
| + int numTokens = 0; |
| |
| - while (true) { |
| + org.apache.lucene.analysis.Token reusableToken = null; |
| + org.apache.lucene.analysis.Token nextToken = null; |
| + |
| + |
| + boolean useNewAPI = TokenStream.useNewAPI(); |
| + |
| + if (useNewAPI) { |
| + boolean success = false; |
| try { |
| - nextToken = source.next(reusableToken); |
| + buffer.start(); |
| + success = true; |
| + } catch (IOException e) { |
| + // success==false if we hit an exception |
| } |
| - catch (IOException e) { |
| - nextToken = null; |
| + if (success) { |
| + if (buffer.hasAttribute(TermAttribute.class)) { |
| + termAtt = (TermAttribute) buffer.getAttribute(TermAttribute.class); |
| + } |
| + if (buffer.hasAttribute(PositionIncrementAttribute.class)) { |
| + posIncrAtt = (PositionIncrementAttribute) buffer.getAttribute(PositionIncrementAttribute.class); |
| + } |
| } |
| - if (nextToken == null) |
| - break; |
| - list.add(nextToken.clone()); |
| - if (nextToken.getPositionIncrement() != 0) |
| - positionCount += nextToken.getPositionIncrement(); |
| - else |
| - severalTokensAtSamePosition = true; |
| + } else { |
| + reusableToken = new org.apache.lucene.analysis.Token(); |
| } |
| + |
| + int positionCount = 0; |
| + boolean severalTokensAtSamePosition = false; |
| + |
| + if (useNewAPI) { |
| + if (termAtt != null) { |
| + try { |
| + while (buffer.incrementToken()) { |
| + numTokens++; |
| + int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1; |
| + if (positionIncrement != 0) { |
| + positionCount += positionIncrement; |
| + } else { |
| + severalTokensAtSamePosition = true; |
| + } |
| + } |
| + } catch (IOException e) { |
| + // ignore |
| + } |
| + } |
| + } else { |
| + while (true) { |
| + try { |
| + nextToken = buffer.next(reusableToken); |
| + } |
| + catch (IOException e) { |
| + nextToken = null; |
| + } |
| + if (nextToken == null) |
| + break; |
| + numTokens++; |
| + if (nextToken.getPositionIncrement() != 0) |
| + positionCount += nextToken.getPositionIncrement(); |
| + else |
| + severalTokensAtSamePosition = true; |
| + } |
| + } |
| try { |
| + // rewind the buffer stream |
| + buffer.reset(); |
| + |
| + // close original stream - all tokens buffered |
| source.close(); |
| } |
| catch (IOException e) { |
| // ignore |
| } |
| + |
| + if (numTokens == 0) |
| + return null; |
| + else if (numTokens == 1) { |
| + String term = null; |
| + try { |
| |
| - if (list.size() == 0) |
| - return null; |
| - else if (list.size() == 1) { |
| - nextToken = (org.apache.lucene.analysis.Token) list.get(0); |
| - return newTermQuery(new Term(field, nextToken.term())); |
| + if (useNewAPI) { |
| + boolean hasNext = buffer.incrementToken(); |
| + assert hasNext == true; |
| + term = termAtt.term(); |
| + } else { |
| + nextToken = buffer.next(reusableToken); |
| + assert nextToken != null; |
| + term = nextToken.term(); |
| + } |
| + } catch (IOException e) { |
| + // safe to ignore, because we know the number of tokens |
| + } |
| + return newTermQuery(new Term(field, term)); |
| } else { |
| if (severalTokensAtSamePosition) { |
| if (positionCount == 1) { |
| // no phrase query: |
| BooleanQuery q = newBooleanQuery(true); |
| - for (int i = 0; i < list.size(); i++) { |
| - nextToken = (org.apache.lucene.analysis.Token) list.get(i); |
| + for (int i = 0; i < numTokens; i++) { |
| + String term = null; |
| + try { |
| + if (useNewAPI) { |
| + boolean hasNext = buffer.incrementToken(); |
| + assert hasNext == true; |
| + term = termAtt.term(); |
| + } else { |
| + nextToken = buffer.next(reusableToken); |
| + assert nextToken != null; |
| + term = nextToken.term(); |
| + } |
| + } catch (IOException e) { |
| + // safe to ignore, because we know the number of tokens |
| + } |
| + |
| Query currentQuery = newTermQuery( |
| - new Term(field, nextToken.term())); |
| + new Term(field, term)); |
| q.add(currentQuery, BooleanClause.Occur.SHOULD); |
| } |
| return q; |
| @@ -560,9 +641,28 @@
|
| mpq.setSlop(phraseSlop); |
| List multiTerms = new ArrayList(); |
| int position = -1; |
| - for (int i = 0; i < list.size(); i++) { |
| - nextToken = (org.apache.lucene.analysis.Token) list.get(i); |
| - if (nextToken.getPositionIncrement() > 0 && multiTerms.size() > 0) { |
| + for (int i = 0; i < numTokens; i++) { |
| + String term = null; |
| + int positionIncrement = 1; |
| + try { |
| + if (useNewAPI) { |
| + boolean hasNext = buffer.incrementToken(); |
| + assert hasNext == true; |
| + term = termAtt.term(); |
| + if (posIncrAtt != null) { |
| + positionIncrement = posIncrAtt.getPositionIncrement(); |
| + } |
| + } else { |
| + nextToken = buffer.next(reusableToken); |
| + assert nextToken != null; |
| + term = nextToken.term(); |
| + positionIncrement = nextToken.getPositionIncrement(); |
| + } |
| + } catch (IOException e) { |
| + // safe to ignore, because we know the number of tokens |
| + } |
| + |
| + if (positionIncrement > 0 && multiTerms.size() > 0) { |
| if (enablePositionIncrements) { |
| mpq.add((Term[])multiTerms.toArray(new Term[0]),position); |
| } else { |
| @@ -570,8 +670,8 @@
|
| } |
| multiTerms.clear(); |
| } |
| - position += nextToken.getPositionIncrement(); |
| - multiTerms.add(new Term(field, nextToken.term())); |
| + position += positionIncrement; |
| + multiTerms.add(new Term(field, term)); |
| } |
| if (enablePositionIncrements) { |
| mpq.add((Term[])multiTerms.toArray(new Term[0]),position); |
| @@ -585,13 +685,36 @@
|
| PhraseQuery pq = newPhraseQuery(); |
| pq.setSlop(phraseSlop); |
| int position = -1; |
| - for (int i = 0; i < list.size(); i++) { |
| - nextToken = (org.apache.lucene.analysis.Token) list.get(i); |
| + |
| + |
| + for (int i = 0; i < numTokens; i++) { |
| + String term = null; |
| + int positionIncrement = 1; |
| + |
| + try { |
| + if (useNewAPI) { |
| + |
| + boolean hasNext = buffer.incrementToken(); |
| + assert hasNext == true; |
| + term = termAtt.term(); |
| + if (posIncrAtt != null) { |
| + positionIncrement = posIncrAtt.getPositionIncrement(); |
| + } |
| + } else { |
| + nextToken = buffer.next(reusableToken); |
| + assert nextToken != null; |
| + term = nextToken.term(); |
| + positionIncrement = nextToken.getPositionIncrement(); |
| + } |
| + } catch (IOException e) { |
| + // safe to ignore, because we know the number of tokens |
| + } |
| + |
| if (enablePositionIncrements) { |
| - position += nextToken.getPositionIncrement(); |
| - pq.add(new Term(field, nextToken.term()),position); |
| + position += positionIncrement; |
| + pq.add(new Term(field, term),position); |
| } else { |
| - pq.add(new Term(field, nextToken.term())); |
| + pq.add(new Term(field, term)); |
| } |
| } |
| return pq; |
| Index: src/java/org/apache/lucene/queryParser/QueryParser.jj
|
| ===================================================================
|
| --- src/java/org/apache/lucene/queryParser/QueryParser.jj (revision 708658)
|
| +++ src/java/org/apache/lucene/queryParser/QueryParser.jj (working copy)
|
| @@ -27,8 +27,8 @@
|
| |
| import java.io.IOException; |
| import java.io.StringReader; |
| +import java.text.Collator; |
| import java.text.DateFormat; |
| -import java.text.Collator; |
| import java.util.ArrayList; |
| import java.util.Calendar; |
| import java.util.Date; |
| @@ -39,7 +39,10 @@
|
| import java.util.Vector; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.CachingTokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| import org.apache.lucene.document.DateField; |
| import org.apache.lucene.document.DateTools; |
| import org.apache.lucene.index.Term; |
| @@ -535,48 +538,126 @@
|
| // PhraseQuery, or nothing based on the term count |
| |
| TokenStream source = analyzer.tokenStream(field, new StringReader(queryText)); |
| - List list = new ArrayList(); |
| - final org.apache.lucene.analysis.Token reusableToken = new org.apache.lucene.analysis.Token(); |
| - org.apache.lucene.analysis.Token nextToken; |
| - int positionCount = 0; |
| - boolean severalTokensAtSamePosition = false; |
| + CachingTokenFilter buffer = new CachingTokenFilter(source); |
| + TermAttribute termAtt = null; |
| + PositionIncrementAttribute posIncrAtt = null; |
| + int numTokens = 0; |
| |
| - while (true) { |
| + org.apache.lucene.analysis.Token reusableToken = null; |
| + org.apache.lucene.analysis.Token nextToken = null; |
| + |
| + |
| + boolean useNewAPI = TokenStream.useNewAPI(); |
| + |
| + if (useNewAPI) { |
| + boolean success = false; |
| try { |
| - nextToken = source.next(reusableToken); |
| + buffer.start(); |
| + success = true; |
| + } catch (IOException e) { |
| + // success==false if we hit an exception |
| } |
| - catch (IOException e) { |
| - nextToken = null; |
| + if (success) { |
| + if (buffer.hasAttribute(TermAttribute.class)) { |
| + termAtt = (TermAttribute) buffer.getAttribute(TermAttribute.class); |
| + } |
| + if (buffer.hasAttribute(PositionIncrementAttribute.class)) { |
| + posIncrAtt = (PositionIncrementAttribute) buffer.getAttribute(PositionIncrementAttribute.class); |
| + } |
| } |
| - if (nextToken == null) |
| - break; |
| - list.add(nextToken.clone()); |
| - if (nextToken.getPositionIncrement() != 0) |
| - positionCount += nextToken.getPositionIncrement(); |
| - else |
| - severalTokensAtSamePosition = true; |
| + } else { |
| + reusableToken = new org.apache.lucene.analysis.Token(); |
| } |
| + |
| + int positionCount = 0; |
| + boolean severalTokensAtSamePosition = false; |
| + |
| + if (useNewAPI) { |
| + if (termAtt != null) { |
| + try { |
| + while (buffer.incrementToken()) { |
| + numTokens++; |
| + int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1; |
| + if (positionIncrement != 0) { |
| + positionCount += positionIncrement; |
| + } else { |
| + severalTokensAtSamePosition = true; |
| + } |
| + } |
| + } catch (IOException e) { |
| + // ignore |
| + } |
| + } |
| + } else { |
| + while (true) { |
| + try { |
| + nextToken = buffer.next(reusableToken); |
| + } |
| + catch (IOException e) { |
| + nextToken = null; |
| + } |
| + if (nextToken == null) |
| + break; |
| + numTokens++; |
| + if (nextToken.getPositionIncrement() != 0) |
| + positionCount += nextToken.getPositionIncrement(); |
| + else |
| + severalTokensAtSamePosition = true; |
| + } |
| + } |
| try { |
| + // rewind the buffer stream |
| + buffer.reset(); |
| + |
| + // close original stream - all tokens buffered |
| source.close(); |
| } |
| catch (IOException e) { |
| // ignore |
| } |
| + |
| + if (numTokens == 0) |
| + return null; |
| + else if (numTokens == 1) { |
| + String term = null; |
| + try { |
| |
| - if (list.size() == 0) |
| - return null; |
| - else if (list.size() == 1) { |
| - nextToken = (org.apache.lucene.analysis.Token) list.get(0); |
| - return newTermQuery(new Term(field, nextToken.term())); |
| + if (useNewAPI) { |
| + boolean hasNext = buffer.incrementToken(); |
| + assert hasNext == true; |
| + term = termAtt.term(); |
| + } else { |
| + nextToken = buffer.next(reusableToken); |
| + assert nextToken != null; |
| + term = nextToken.term(); |
| + } |
| + } catch (IOException e) { |
| + // safe to ignore, because we know the number of tokens |
| + } |
| + return newTermQuery(new Term(field, term)); |
| } else { |
| if (severalTokensAtSamePosition) { |
| if (positionCount == 1) { |
| // no phrase query: |
| BooleanQuery q = newBooleanQuery(true); |
| - for (int i = 0; i < list.size(); i++) { |
| - nextToken = (org.apache.lucene.analysis.Token) list.get(i); |
| + for (int i = 0; i < numTokens; i++) { |
| + String term = null; |
| + try { |
| + if (useNewAPI) { |
| + boolean hasNext = buffer.incrementToken(); |
| + assert hasNext == true; |
| + term = termAtt.term(); |
| + } else { |
| + nextToken = buffer.next(reusableToken); |
| + assert nextToken != null; |
| + term = nextToken.term(); |
| + } |
| + } catch (IOException e) { |
| + // safe to ignore, because we know the number of tokens |
| + } |
| + |
| Query currentQuery = newTermQuery( |
| - new Term(field, nextToken.term())); |
| + new Term(field, term)); |
| q.add(currentQuery, BooleanClause.Occur.SHOULD); |
| } |
| return q; |
| @@ -587,9 +668,28 @@
|
| mpq.setSlop(phraseSlop); |
| List multiTerms = new ArrayList(); |
| int position = -1; |
| - for (int i = 0; i < list.size(); i++) { |
| - nextToken = (org.apache.lucene.analysis.Token) list.get(i); |
| - if (nextToken.getPositionIncrement() > 0 && multiTerms.size() > 0) { |
| + for (int i = 0; i < numTokens; i++) { |
| + String term = null; |
| + int positionIncrement = 1; |
| + try { |
| + if (useNewAPI) { |
| + boolean hasNext = buffer.incrementToken(); |
| + assert hasNext == true; |
| + term = termAtt.term(); |
| + if (posIncrAtt != null) { |
| + positionIncrement = posIncrAtt.getPositionIncrement(); |
| + } |
| + } else { |
| + nextToken = buffer.next(reusableToken); |
| + assert nextToken != null; |
| + term = nextToken.term(); |
| + positionIncrement = nextToken.getPositionIncrement(); |
| + } |
| + } catch (IOException e) { |
| + // safe to ignore, because we know the number of tokens |
| + } |
| + |
| + if (positionIncrement > 0 && multiTerms.size() > 0) { |
| if (enablePositionIncrements) { |
| mpq.add((Term[])multiTerms.toArray(new Term[0]),position); |
| } else { |
| @@ -597,8 +697,8 @@
|
| } |
| multiTerms.clear(); |
| } |
| - position += nextToken.getPositionIncrement(); |
| - multiTerms.add(new Term(field, nextToken.term())); |
| + position += positionIncrement; |
| + multiTerms.add(new Term(field, term)); |
| } |
| if (enablePositionIncrements) { |
| mpq.add((Term[])multiTerms.toArray(new Term[0]),position); |
| @@ -612,13 +712,36 @@
|
| PhraseQuery pq = newPhraseQuery(); |
| pq.setSlop(phraseSlop); |
| int position = -1; |
| - for (int i = 0; i < list.size(); i++) { |
| - nextToken = (org.apache.lucene.analysis.Token) list.get(i); |
| + |
| + |
| + for (int i = 0; i < numTokens; i++) { |
| + String term = null; |
| + int positionIncrement = 1; |
| + |
| + try { |
| + if (useNewAPI) { |
| + boolean hasNext = buffer.incrementToken(); |
| + assert hasNext; |
| + term = termAtt.term(); |
| + if (posIncrAtt != null) { |
| + positionIncrement = posIncrAtt.getPositionIncrement(); |
| + } |
| + } else { |
| + nextToken = buffer.next(reusableToken); |
| + assert nextToken != null; |
| + term = nextToken.term(); |
| + positionIncrement = nextToken.getPositionIncrement(); |
| + } |
| + } catch (IOException e) { |
| + // safe to ignore, because we know the number of tokens |
| + } |
| + |
| if (enablePositionIncrements) { |
| - position += nextToken.getPositionIncrement(); |
| - pq.add(new Term(field, nextToken.term()),position); |
| + position += positionIncrement; |
| + pq.add(new Term(field, term),position); |
| } else { |
| - pq.add(new Term(field, nextToken.term())); |
| + pq.add(new Term(field, term)); |
| } |
| } |
| return pq; |
| @@ -627,6 +750,7 @@
|
| } |
| |
| |
| + |
| /** |
| * Base implementation delegates to {@link #getFieldQuery(String,String)}. |
| * This method may be overridden, for example, to return |
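
The phrase branches above recover absolute token positions by accumulating each token's position increment, starting from -1, so that tokens with increment 0 stack on the same position (when position increments are enabled). A minimal standalone sketch of that accumulation, assuming the attribute API introduced by this patch; the helper class itself is hypothetical:

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    // Hypothetical helper: pairs each term with its absolute position.
    public class PositionedTerms {
      public static void collect(TokenStream stream, List terms, List positions)
          throws IOException {
        stream.start();
        TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
        PositionIncrementAttribute posIncrAtt =
            (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class);
        int position = -1; // same starting point as getFieldQuery above
        while (stream.incrementToken()) {
          position += posIncrAtt.getPositionIncrement(); // increment 0 reuses the position
          terms.add(termAtt.term());
          positions.add(new Integer(position));
        }
      }
    }

For example, increments {1, 0, 2} produce positions {0, 0, 2}, which is exactly what the MultiPhraseQuery branch relies on to group terms at the same position.
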
| Index: src/java/org/apache/lucene/search/QueryTermVector.java
|
| ===================================================================
|
| --- src/java/org/apache/lucene/search/QueryTermVector.java (revision 708658)
|
| +++ src/java/org/apache/lucene/search/QueryTermVector.java (working copy)
|
| @@ -29,6 +29,7 @@
|
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| import org.apache.lucene.index.TermFreqVector; |
| |
| /** |
| @@ -58,9 +59,17 @@
|
| { |
| List terms = new ArrayList(); |
| try { |
| - final Token reusableToken = new Token(); |
| - for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) { |
| - terms.add(nextToken.term()); |
| + if (TokenStream.useNewAPI()) { |
| + stream.start(); |
| + TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class); |
| + while (stream.incrementToken()) { |
| + terms.add(termAtt.term()); |
| + } |
| + } else { |
| + final Token reusableToken = new Token(); |
| + for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) { |
| + terms.add(nextToken.term()); |
| + } |
| } |
| processTerms((String[])terms.toArray(new String[terms.size()])); |
| } catch (IOException e) { |
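
The QueryTermVector change above is the canonical consumer-side pattern for this patch: branch on the global TokenStream.useNewAPI() switch, and under the new API call start() once, fetch the TermAttribute, and loop on incrementToken(). The same pattern as a self-contained helper (class name hypothetical):

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;

    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    // Hypothetical helper mirroring the QueryTermVector hunk above.
    public class TermCollector {
      public static List collectTerms(TokenStream stream) throws IOException {
        List terms = new ArrayList();
        if (TokenStream.useNewAPI()) {
          stream.start();
          TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
          while (stream.incrementToken()) {
            terms.add(termAtt.term());
          }
        } else {
          final Token reusableToken = new Token();
          for (Token t = stream.next(reusableToken); t != null; t = stream.next(reusableToken)) {
            terms.add(t.term());
          }
        }
        return terms;
      }
    }
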
| Index: src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java
|
| ===================================================================
|
| --- src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java (revision 708658)
|
| +++ src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java (working copy)
|
| @@ -22,6 +22,8 @@
|
| |
| import org.apache.lucene.util.LuceneTestCase; |
| |
| +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.document.Field.TermVector; |
| @@ -41,6 +43,8 @@
|
| Document doc = new Document(); |
| TokenStream stream = new TokenStream() { |
| private int index = 0; |
| + private TermAttribute termAtt; |
| + private OffsetAttribute offsetAtt; |
| |
| public Token next(final Token reusableToken) throws IOException { |
| assert reusableToken != null; |
| @@ -51,6 +55,22 @@
|
| } |
| } |
| |
| + public void initialize() { |
| + termAtt = (TermAttribute) addAttribute(TermAttribute.class); |
| + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); |
| + } |
| + |
| + public boolean incrementToken() throws IOException { |
| + if (index == tokens.length) { |
| + return false; |
| + } else { |
| + termAtt.setTermBuffer(tokens[index++]); |
| + offsetAtt.setStartOffset(0); |
| + offsetAtt.setEndOffset(0); |
| + return true; |
| + } |
| + } |
| + |
| }; |
| |
| stream = new CachingTokenFilter(stream); |
| @@ -91,7 +111,30 @@
|
| } |
| |
| private void checkTokens(TokenStream stream) throws IOException { |
| + if (TokenStream.useNewAPI()) { |
| + checkTokensNewAPI(stream); |
| + } else { |
| + checkTokensOldAPI(stream); |
| + } |
| + } |
| + |
| + private void checkTokensNewAPI(TokenStream stream) throws IOException { |
| int count = 0; |
| + stream.start(); |
| + |
| + TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class); |
| + assertNotNull(termAtt); |
| + while (stream.incrementToken()) { |
| + assertTrue(count < tokens.length); |
| + assertEquals(tokens[count], termAtt.term()); |
| + count++; |
| + } |
| + |
| + assertEquals(tokens.length, count); |
| + } |
| + |
| + private void checkTokensOldAPI(TokenStream stream) throws IOException { |
| + int count = 0; |
| final Token reusableToken = new Token(); |
| for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) { |
| assertTrue(count < tokens.length); |
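
The anonymous stream in this test shows the producer side of the new API: attributes are registered once in initialize(), and incrementToken() overwrites their values in place instead of filling a caller-supplied Token. A condensed sketch of such a producer, assuming (as the test does) that the framework calls initialize() before the first incrementToken(); the class is hypothetical and only usable through the new API, since it does not implement next():

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    // Hypothetical new-API-only stream emitting a fixed list of terms.
    public class StringArrayTokenStream extends TokenStream {
      private final String[] tokens;
      private int index = 0;
      private TermAttribute termAtt;

      public StringArrayTokenStream(String[] tokens) {
        this.tokens = tokens;
      }

      public void initialize() {
        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
      }

      public boolean incrementToken() throws IOException {
        if (index == tokens.length) {
          return false;
        }
        termAtt.setTermBuffer(tokens[index++]);
        return true;
      }
    }
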
| Index: src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
|
| ===================================================================
|
| --- src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java (revision 708658)
|
| +++ src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java (working copy)
|
| @@ -1,6 +1,10 @@
|
| package org.apache.lucene.analysis; |
| |
| import org.apache.lucene.analysis.standard.StandardAnalyzer; |
| +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; |
| import org.apache.lucene.util.LuceneTestCase; |
| |
| import java.io.StringReader; |
| @@ -35,19 +39,26 @@
|
| |
| public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes, int[] expectedPosIncrs) throws Exception { |
| TokenStream ts = a.tokenStream("dummy", new StringReader(input)); |
| - final Token reusableToken = new Token(); |
| + ts.start(); |
| + // TODO Java 1.5: with generics, getAttribute can be called without casts: |
| + //final TypeAttribute typeAtt = ts.getAttribute(TypeAttribute.class); |
| + //final PositionIncrementAttribute posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class); |
| + |
| + final TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class); |
| + final TypeAttribute typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class); |
| + final PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) ts.getAttribute(PositionIncrementAttribute.class); |
| + |
| for (int i = 0; i < expectedImages.length; i++) { |
| - Token nextToken = ts.next(reusableToken); |
| - assertNotNull(nextToken); |
| - assertEquals(expectedImages[i], nextToken.term()); |
| + assertTrue(ts.incrementToken()); |
| + assertEquals(expectedImages[i], new String(termAtt.termBuffer(), 0, termAtt.termLength())); |
| if (expectedTypes != null) { |
| - assertEquals(expectedTypes[i], nextToken.type()); |
| + assertEquals(expectedTypes[i], typeAtt.type()); |
| } |
| if (expectedPosIncrs != null) { |
| - assertEquals(expectedPosIncrs[i], nextToken.getPositionIncrement()); |
| + assertEquals(expectedPosIncrs[i], posIncrAtt.getPositionIncrement()); |
| } |
| } |
| - assertNull(ts.next(reusableToken)); |
| + assertFalse(ts.incrementToken()); |
| ts.close(); |
| } |
| |
| Index: src/test/org/apache/lucene/analysis/TokenStreamTestUtils.java
|
| ===================================================================
|
| --- src/test/org/apache/lucene/analysis/TokenStreamTestUtils.java (revision 0)
|
| +++ src/test/org/apache/lucene/analysis/TokenStreamTestUtils.java (revision 0)
|
| @@ -0,0 +1,112 @@
|
| +package org.apache.lucene.analysis; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| + |
| +import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; |
| +import org.apache.lucene.index.Payload; |
| + |
| +public class TokenStreamTestUtils { |
| + public static abstract class BackwardsCompatibleFilter extends TokenFilter { |
| + boolean first = true; |
| + |
| + public BackwardsCompatibleFilter(TokenStream input) { |
| + super(input); |
| + } |
| + |
| + public Token next(Token reusableToken) throws IOException { |
| + if (first) { |
| + start(); |
| + first = false; |
| + } |
| + |
| + boolean hasNext = incrementToken(); |
| + if (!hasNext) { |
| + return null; |
| + } |
| + |
| + return getNextToken(this, reusableToken); |
| + } |
| + } |
| + |
| + public static abstract class BackwardsCompatibleStream extends TokenStream { |
| + boolean first = true; |
| + |
| + public BackwardsCompatibleStream() { |
| + super(); |
| + } |
| + |
| + public Token next(Token reusableToken) throws IOException { |
| + if (first) { |
| + start(); |
| + first = false; |
| + } |
| + |
| + boolean hasNext = incrementToken(); |
| + if (!hasNext) { |
| + return null; |
| + } |
| + |
| + reusableToken = getNextToken(this, reusableToken); |
| + return reusableToken; |
| + } |
| + } |
| + |
| + private static Token getNextToken(TokenStream stream, Token nextToken) throws IOException { |
| + if (stream.hasAttribute(PayloadAttribute.class)) { |
| + PayloadAttribute att = (PayloadAttribute) stream.getAttribute(PayloadAttribute.class); |
| + Payload p = att.getPayload(); |
| + if (p != null) { |
| + p = (Payload) p.clone(); |
| + } |
| + nextToken.setPayload(p); |
| + } |
| + |
| + if (stream.hasAttribute(TermAttribute.class)) { |
| + TermAttribute att = (TermAttribute) stream.getAttribute(TermAttribute.class); |
| + nextToken.setTermBuffer(att.termBuffer(), 0, att.termLength()); |
| + } |
| + if (stream.hasAttribute(OffsetAttribute.class)) { |
| + OffsetAttribute att = (OffsetAttribute) stream.getAttribute(OffsetAttribute.class); |
| + nextToken.setStartOffset(att.startOffset()); |
| + nextToken.setEndOffset(att.endOffset()); |
| + } |
| + if (stream.hasAttribute(TypeAttribute.class)) { |
| + TypeAttribute att = (TypeAttribute) stream.getAttribute(TypeAttribute.class); |
| + nextToken.setType(att.type()); |
| + } |
| + if (stream.hasAttribute(FlagsAttribute.class)) { |
| + FlagsAttribute att = (FlagsAttribute) stream.getAttribute(FlagsAttribute.class); |
| + nextToken.setFlags(att.getFlags()); |
| + } |
| + if (stream.hasAttribute(PositionIncrementAttribute.class)) { |
| + PositionIncrementAttribute att = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class); |
| + nextToken.setPositionIncrement(att.getPositionIncrement()); |
| + } |
| + |
| + return nextToken; |
| + } |
| + |
| + |
| +} |
|
|
| Property changes on: src\test\org\apache\lucene\analysis\TokenStreamTestUtils.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
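
TokenStreamTestUtils bridges the two APIs in one direction: next(Token) is implemented on top of incrementToken(), and getNextToken() copies back only the attributes the stream actually declares (each copy is guarded by hasAttribute()). A short usage sketch showing an old-API consumer reading a new-API producer; the stream body is hypothetical, and it assumes initialize() runs before the first token, which the wrapper's start() call is relied on to arrange:

    import java.io.IOException;

    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.TokenStreamTestUtils;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class BridgeExample {
      public static void main(String[] args) throws IOException {
        TokenStream stream = new TokenStreamTestUtils.BackwardsCompatibleStream() {
          private boolean done = false;
          private TermAttribute termAtt;

          public void initialize() {
            termAtt = (TermAttribute) addAttribute(TermAttribute.class);
          }

          public boolean incrementToken() {
            if (done) return false;
            done = true;
            termAtt.setTermBuffer("hello");
            return true;
          }
        };

        // Old-style consumption: next() delegates to incrementToken() and
        // copies the attribute values into the reusable Token.
        final Token reusableToken = new Token();
        for (Token t = stream.next(reusableToken); t != null; t = stream.next(reusableToken)) {
          System.out.println(t.term());
        }
      }
    }
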
|
|
|
| Index: src/test/org/apache/lucene/index/TestDocumentWriter.java
|
| ===================================================================
|
| --- src/test/org/apache/lucene/index/TestDocumentWriter.java (revision 708658)
|
| +++ src/test/org/apache/lucene/index/TestDocumentWriter.java (working copy)
|
| @@ -22,12 +22,18 @@
|
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.SimpleAnalyzer; |
| +import org.apache.lucene.analysis.TestToken; |
| import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.TokenStreamState; |
| +import org.apache.lucene.analysis.TokenStreamTestUtils; |
| import org.apache.lucene.analysis.WhitespaceAnalyzer; |
| import org.apache.lucene.analysis.WhitespaceTokenizer; |
| import org.apache.lucene.analysis.standard.StandardAnalyzer; |
| +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.document.Fieldable; |
| @@ -136,35 +142,49 @@
|
| public void testTokenReuse() throws IOException { |
| Analyzer analyzer = new Analyzer() { |
| public TokenStream tokenStream(String fieldName, Reader reader) { |
| - return new TokenFilter(new WhitespaceTokenizer(reader)) { |
| + return new TokenStreamTestUtils.BackwardsCompatibleFilter(new WhitespaceTokenizer(reader)) { |
| boolean first=true; |
| + TokenStreamState state; |
| Token buffered; |
| |
| - public Token next(final Token reusableToken) throws IOException { |
| - if (buffered != null) { |
| - Token nextToken = buffered; |
| - buffered=null; |
| - return nextToken; |
| + |
| + public boolean incrementToken() throws IOException { |
| + if (state != null) { |
| + state.restore(this); |
| + payloadAtt.setPayload(null); |
| + posIncrAtt.setPositionIncrement(0); |
| + termAtt.setTermBuffer(new char[]{'b'}, 0, 1); |
| + state = null; |
| + return true; |
| } |
| - Token nextToken = input.next(reusableToken); |
| - if (nextToken==null) return null; |
| - if (Character.isDigit(nextToken.termBuffer()[0])) { |
| - nextToken.setPositionIncrement(nextToken.termBuffer()[0] - '0'); |
| + |
| + boolean hasNext = input.incrementToken(); |
| + if (!hasNext) return false; |
| + if (Character.isDigit(termAtt.termBuffer()[0])) { |
| + posIncrAtt.setPositionIncrement(termAtt.termBuffer()[0] - '0'); |
| } |
| if (first) { |
| // set payload on first position only |
| - nextToken.setPayload(new Payload(new byte[]{100})); |
| + payloadAtt.setPayload(new Payload(new byte[]{100})); |
| first = false; |
| } |
| |
| // index a "synonym" for every token |
| - buffered = (Token)nextToken.clone(); |
| - buffered.setPayload(null); |
| - buffered.setPositionIncrement(0); |
| - buffered.setTermBuffer(new char[]{'b'}, 0, 1); |
| + state = TokenStreamState.capture(this); |
| + return true; |
| |
| - return nextToken; |
| } |
| + |
| + TermAttribute termAtt = null; |
| + PayloadAttribute payloadAtt = null; |
| + PositionIncrementAttribute posIncrAtt = null; |
| + |
| + public void initialize() throws IOException { |
| + termAtt = (TermAttribute) addAttribute(TermAttribute.class); |
| + payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); |
| + posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); |
| + } |
| + |
| }; |
| } |
| }; |
| @@ -197,16 +217,22 @@
|
| IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED); |
| Document doc = new Document(); |
| |
| - doc.add(new Field("preanalyzed", new TokenStream() { |
| + doc.add(new Field("preanalyzed", new TokenStreamTestUtils.BackwardsCompatibleStream() { |
| private String[] tokens = new String[] {"term1", "term2", "term3", "term2"}; |
| private int index = 0; |
| |
| - public Token next(final Token reusableToken) throws IOException { |
| - assert reusableToken != null; |
| + private TermAttribute termAtt; |
| + |
| + public void initialize() { |
| + termAtt = (TermAttribute) addAttribute(TermAttribute.class); |
| + } |
| + |
| + public boolean incrementToken() throws IOException { |
| if (index == tokens.length) { |
| - return null; |
| + return false; |
| } else { |
| - return reusableToken.reinit(tokens[index++], 0, 0); |
| + termAtt.setTermBuffer(tokens[index++]); |
| + return true; |
| } |
| } |
| |
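
The rewritten synonym filter above replaces Token.clone() buffering with TokenStreamState: after passing a real token through, it captures the complete attribute state, and on the next call restores that state and overwrites only the fields that differ for the synonym. The same pattern as a named filter; the class is hypothetical and stacks the literal term "syn" on every input token:

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.TokenStreamState;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    // Hypothetical filter: emits "syn" at the position of every input token.
    public class FixedSynonymFilter extends TokenFilter {
      private TokenStreamState state;
      private TermAttribute termAtt;
      private PositionIncrementAttribute posIncrAtt;

      public FixedSynonymFilter(TokenStream input) {
        super(input);
      }

      public void initialize() throws IOException {
        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
        posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
      }

      public boolean incrementToken() throws IOException {
        if (state != null) {
          state.restore(this);                  // bring back the buffered token
          posIncrAtt.setPositionIncrement(0);   // stack on the previous position
          termAtt.setTermBuffer("syn");
          state = null;
          return true;
        }
        if (!input.incrementToken()) {
          return false;
        }
        state = TokenStreamState.capture(this); // remember this token for the synonym
        return true;
      }
    }
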
| Index: src/test/org/apache/lucene/index/TestIndexWriter.java
|
| ===================================================================
|
| --- src/test/org/apache/lucene/index/TestIndexWriter.java (revision 708658)
|
| +++ src/test/org/apache/lucene/index/TestIndexWriter.java (working copy)
|
| @@ -30,6 +30,9 @@
|
| import org.apache.lucene.util.LuceneTestCase; |
| import org.apache.lucene.util.UnicodeUtil; |
| |
| +import org.apache.lucene.analysis.TestToken; |
| +import org.apache.lucene.analysis.TokenStreamState; |
| +import org.apache.lucene.analysis.TokenStreamTestUtils; |
| import org.apache.lucene.analysis.WhitespaceAnalyzer; |
| import org.apache.lucene.analysis.WhitespaceTokenizer; |
| import org.apache.lucene.analysis.Analyzer; |
| @@ -38,6 +41,8 @@
|
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.standard.StandardAnalyzer; |
| import org.apache.lucene.analysis.standard.StandardTokenizer; |
| +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| @@ -1790,14 +1795,14 @@
|
| IndexWriter writer = new IndexWriter(dir, new Analyzer() { |
| |
| public TokenStream tokenStream(String fieldName, Reader reader) { |
| - return new TokenFilter(new StandardTokenizer(reader)) { |
| + return new TokenStreamTestUtils.BackwardsCompatibleFilter(new StandardTokenizer(reader)) { |
| private int count = 0; |
| |
| - public Token next(final Token reusableToken) throws IOException { |
| + public boolean incrementToken() throws IOException { |
| if (count++ == 5) { |
| throw new IOException(); |
| } |
| - return input.next(reusableToken); |
| + return input.incrementToken(); |
| } |
| }; |
| } |
| @@ -1907,7 +1912,7 @@
|
| reader.close(); |
| } |
| |
| - private class CrashingFilter extends TokenFilter { |
| + private class CrashingFilter extends TokenStreamTestUtils.BackwardsCompatibleFilter { |
| String fieldName; |
| int count; |
| |
| @@ -1916,10 +1921,10 @@
|
| this.fieldName = fieldName; |
| } |
| |
| - public Token next(final Token reusableToken) throws IOException { |
| + public boolean incrementToken() throws IOException { |
| if (this.fieldName.equals("crash") && count++ >= 4) |
| throw new IOException("I'm experiencing problems"); |
| - return input.next(reusableToken); |
| + return input.incrementToken(); |
| } |
| |
| public void reset() throws IOException { |
| @@ -3577,23 +3582,58 @@
|
| } |
| } |
| |
| + private static class MyAnalyzer extends Analyzer { |
| + |
| + public TokenStream tokenStream(String fieldName, Reader reader) { |
| + return new TokenStreamTestUtils.BackwardsCompatibleFilter(new WhitespaceTokenizer(reader)) { |
| + public void initialize() throws IOException { |
| + addAttribute(PositionIncrementAttribute.class); |
| + } |
| + }; |
| + } |
| + |
| + } |
| + |
| // LUCENE-1255 |
| public void testNegativePositions() throws Throwable { |
| - SinkTokenizer tokens = new SinkTokenizer(); |
| - Token t = new Token(); |
| - t.setTermBuffer("a"); |
| - t.setPositionIncrement(0); |
| - tokens.add(t); |
| - t.setTermBuffer("b"); |
| - t.setPositionIncrement(1); |
| - tokens.add(t); |
| - t.setTermBuffer("c"); |
| - tokens.add(t); |
| + SinkTokenizer tokens = new SinkTokenizer() { |
| + public void initialize() throws IOException { |
| + addAttribute(TermAttribute.class); |
| + addAttribute(PositionIncrementAttribute.class); |
| + } |
| + }; |
| |
| + TokenStreamState state = new TokenStreamState(); |
| + TermAttribute termAtt = (TermAttribute) state.addAttribute(TermAttribute.class); |
| + PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) state.addAttribute(PositionIncrementAttribute.class); |
| + termAtt.setTermBuffer("a"); |
| + posIncrAtt.setPositionIncrement(0); |
| + tokens.add(state); |
| + |
| + state = new TokenStreamState(); |
| + termAtt = (TermAttribute) state.addAttribute(TermAttribute.class); |
| + posIncrAtt = (PositionIncrementAttribute) state.addAttribute(PositionIncrementAttribute.class); |
| + |
| + termAtt.setTermBuffer("b"); |
| + posIncrAtt.setPositionIncrement(1); |
| + tokens.add(state); |
| + |
| + state = new TokenStreamState(); |
| + termAtt = (TermAttribute) state.addAttribute(TermAttribute.class); |
| + posIncrAtt = (PositionIncrementAttribute) state.addAttribute(PositionIncrementAttribute.class); |
| + |
| + termAtt.setTermBuffer("c"); |
| + posIncrAtt.setPositionIncrement(1); |
| + tokens.add(state); |
| + |
| MockRAMDirectory dir = new MockRAMDirectory(); |
| - IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED); |
| + IndexWriter w = new IndexWriter(dir, new MyAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED); |
| Document doc = new Document(); |
| - doc.add(new Field("field", tokens)); |
| + doc.add(new Field("field", new TokenStreamTestUtils.BackwardsCompatibleFilter(tokens) { |
| + public boolean incrementToken() throws IOException { |
| + return input.incrementToken(); |
| + } |
| + })); |
| w.addDocument(doc); |
| w.commit(); |
| |
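
testNegativePositions now hands the sink independent TokenStreamState snapshots instead of re-adding one mutable Token, so later mutations can no longer leak into earlier entries. A compact sketch of building such snapshots, assuming SinkTokenizer.add(TokenStreamState) and TokenStreamState.addAttribute(Class) as the rewritten test uses them; the helper is hypothetical:

    import org.apache.lucene.analysis.SinkTokenizer;
    import org.apache.lucene.analysis.TokenStreamState;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    // Hypothetical helper: snapshots one (term, increment) pair into a sink.
    public class SinkFiller {
      public static void addToken(SinkTokenizer sink, String term, int posIncr) {
        TokenStreamState state = new TokenStreamState();
        TermAttribute termAtt = (TermAttribute) state.addAttribute(TermAttribute.class);
        PositionIncrementAttribute posIncrAtt =
            (PositionIncrementAttribute) state.addAttribute(PositionIncrementAttribute.class);
        termAtt.setTermBuffer(term);
        posIncrAtt.setPositionIncrement(posIncr);
        sink.add(state);
      }
    }

With such a helper the test body would reduce to three calls: addToken(tokens, "a", 0); addToken(tokens, "b", 1); addToken(tokens, "c", 1);
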
| Index: src/test/org/apache/lucene/index/TestMultiLevelSkipList.java
|
| ===================================================================
|
| --- src/test/org/apache/lucene/index/TestMultiLevelSkipList.java (revision 708658)
|
| +++ src/test/org/apache/lucene/index/TestMultiLevelSkipList.java (working copy)
|
| @@ -24,9 +24,12 @@
|
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.LowerCaseTokenizer; |
| +import org.apache.lucene.analysis.TestToken; |
| import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.TokenStreamTestUtils; |
| +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.document.Field.Index; |
| @@ -96,20 +99,25 @@
|
| |
| } |
| |
| - private static class PayloadFilter extends TokenFilter { |
| + private static class PayloadFilter extends TokenStreamTestUtils.BackwardsCompatibleFilter { |
| static int count = 0; |
| |
| + PayloadAttribute payloadAtt; |
| + |
| + public void initialize() throws IOException { |
| + payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); |
| + } |
| + |
| protected PayloadFilter(TokenStream input) { |
| super(input); |
| } |
| |
| - public Token next(final Token reusableToken) throws IOException { |
| - assert reusableToken != null; |
| - Token nextToken = input.next(reusableToken); |
| - if (nextToken != null) { |
| - nextToken.setPayload(new Payload(new byte[] { (byte) count++ })); |
| - } |
| - return nextToken; |
| + public boolean incrementToken() throws IOException { |
| + boolean hasNext = input.incrementToken(); |
| + if (hasNext) { |
| + payloadAtt.setPayload(new Payload(new byte[] { (byte) count++ })); |
| + } |
| + return hasNext; |
| } |
| |
| } |
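
This PayloadFilter conversion is the minimal filter shape under the new API: no Token is passed at all. The filter and its wrapped stream share attribute instances (the converted tests rely on this), so once input.incrementToken() succeeds the filter simply mutates the PayloadAttribute in place. As a standalone sketch, with the class name hypothetical and the logic taken from the hunk above:

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
    import org.apache.lucene.index.Payload;

    // Hypothetical filter: stamps a running counter byte as each token's payload.
    public class CountingPayloadFilter extends TokenFilter {
      private int count = 0;
      private PayloadAttribute payloadAtt;

      public CountingPayloadFilter(TokenStream input) {
        super(input);
      }

      public void initialize() throws IOException {
        payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
      }

      public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
          return false;
        }
        payloadAtt.setPayload(new Payload(new byte[] { (byte) count++ }));
        return true;
      }
    }
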
| Index: src/test/org/apache/lucene/index/TestPayloads.java
|
| ===================================================================
|
| --- src/test/org/apache/lucene/index/TestPayloads.java (revision 708658)
|
| +++ src/test/org/apache/lucene/index/TestPayloads.java (working copy)
|
| @@ -27,20 +27,20 @@
|
| import java.util.Map; |
| import java.util.Random; |
| |
| -import org.apache.lucene.util.LuceneTestCase; |
| -import org.apache.lucene.util.UnicodeUtil; |
| - |
| import org.apache.lucene.analysis.Analyzer; |
| -import org.apache.lucene.analysis.Token; |
| -import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.TokenStreamTestUtils; |
| import org.apache.lucene.analysis.WhitespaceAnalyzer; |
| import org.apache.lucene.analysis.WhitespaceTokenizer; |
| +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.store.FSDirectory; |
| import org.apache.lucene.store.RAMDirectory; |
| +import org.apache.lucene.util.LuceneTestCase; |
| +import org.apache.lucene.util.UnicodeUtil; |
| |
| |
| public class TestPayloads extends LuceneTestCase { |
| @@ -437,12 +437,17 @@
|
| /** |
| * This Filter adds payloads to the tokens. |
| */ |
| - private static class PayloadFilter extends TokenFilter { |
| + private static class PayloadFilter extends TokenStreamTestUtils.BackwardsCompatibleFilter { |
| private byte[] data; |
| private int length; |
| private int offset; |
| Payload payload = new Payload(); |
| + PayloadAttribute payloadAtt; |
| |
| + public void initialize() throws IOException { |
| + payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); |
| + } |
| + |
| public PayloadFilter(TokenStream in, byte[] data, int offset, int length) { |
| super(in); |
| this.data = data; |
| @@ -450,24 +455,23 @@
|
| this.offset = offset; |
| } |
| |
| - public Token next(final Token reusableToken) throws IOException { |
| - assert reusableToken != null; |
| - Token nextToken = input.next(reusableToken); |
| - if (nextToken != null) { |
| + public boolean incrementToken() throws IOException { |
| + boolean hasNext = input.incrementToken(); |
| + if (hasNext) { |
| if (offset + length <= data.length) { |
| Payload p = null; |
| if (p == null) { |
| p = new Payload(); |
| - nextToken.setPayload(p); |
| + payloadAtt.setPayload(p); |
| } |
| p.setData(data, offset, length); |
| offset += length; |
| } else { |
| - nextToken.setPayload(null); |
| + payloadAtt.setPayload(null); |
| } |
| } |
| |
| - return nextToken; |
| + return hasNext; |
| } |
| } |
| |
| @@ -524,11 +528,20 @@
|
| assertEquals(pool.size(), numThreads); |
| } |
| |
| - private static class PoolingPayloadTokenStream extends TokenStream { |
| + private static class PoolingPayloadTokenStream extends TokenStreamTestUtils.BackwardsCompatibleStream { |
| private byte[] payload; |
| private boolean first; |
| private ByteArrayPool pool; |
| private String term; |
| + |
| + TermAttribute termAtt; |
| + PayloadAttribute payloadAtt; |
| + |
| + public void initialize() { |
| + payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); |
| + termAtt = (TermAttribute) addAttribute(TermAttribute.class); |
| + } |
| + |
| PoolingPayloadTokenStream(ByteArrayPool pool) { |
| this.pool = pool; |
| payload = pool.get(); |
| @@ -537,11 +550,11 @@
|
| first = true; |
| } |
| |
| - public Token next(final Token reusableToken) throws IOException { |
| - if (!first) return null; |
| - reusableToken.reinit(term, 0, 0); |
| - reusableToken.setPayload(new Payload(payload)); |
| - return reusableToken; |
| + public boolean incrementToken() throws IOException { |
| + if (!first) return false; |
| + termAtt.setTermBuffer(term); |
| + payloadAtt.setPayload(new Payload(payload)); |
| + return true; |
| } |
| |
| public void close() throws IOException { |
| Index: src/test/org/apache/lucene/index/TestTermVectorsReader.java
|
| ===================================================================
|
| --- src/test/org/apache/lucene/index/TestTermVectorsReader.java (revision 708658)
|
| +++ src/test/org/apache/lucene/index/TestTermVectorsReader.java (working copy)
|
| @@ -20,6 +20,10 @@
|
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.TokenStreamTestUtils; |
| +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.store.MockRAMDirectory; |
| @@ -116,19 +120,33 @@
|
| fieldInfos = new FieldInfos(dir, seg + "." + IndexFileNames.FIELD_INFOS_EXTENSION); |
| } |
| |
| - private class MyTokenStream extends TokenStream { |
| + private class MyTokenStream extends TokenStreamTestUtils.BackwardsCompatibleStream { |
| int tokenUpto; |
| - public Token next(final Token reusableToken) { |
| + |
| + TermAttribute termAtt; |
| + PositionIncrementAttribute posIncrAtt; |
| + OffsetAttribute offsetAtt; |
| + |
| + public void initialize() { |
| + termAtt = (TermAttribute) addAttribute(TermAttribute.class); |
| + posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); |
| + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); |
| + } |
| + |
| + public boolean incrementToken() { |
| if (tokenUpto >= tokens.length) |
| - return null; |
| + return false; |
| else { |
| final TestToken testToken = tokens[tokenUpto++]; |
| - reusableToken.reinit(testToken.text, testToken.startOffset, testToken.endOffset); |
| - if (tokenUpto > 1) |
| - reusableToken.setPositionIncrement(testToken.pos - tokens[tokenUpto-2].pos); |
| - else |
| - reusableToken.setPositionIncrement(testToken.pos+1); |
| - return reusableToken; |
| + termAtt.setTermBuffer(testToken.text); |
| + offsetAtt.setStartOffset(testToken.startOffset); |
| + offsetAtt.setEndOffset(testToken.endOffset); |
| + if (tokenUpto > 1) { |
| + posIncrAtt.setPositionIncrement(testToken.pos - tokens[tokenUpto-2].pos); |
| + } else { |
| + posIncrAtt.setPositionIncrement(testToken.pos+1); |
| + } |
| + return true; |
| } |
| } |
| } |
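
MyTokenStream converts the test's absolute positions back into increments: the first token's increment is pos + 1 (consumers accumulate from -1), and each later token gets the delta to its predecessor. The arithmetic in isolation (helper hypothetical):

    // Hypothetical: absolute positions -> position increments, as in MyTokenStream above.
    public class PositionDeltas {
      public static int[] toIncrements(int[] positions) {
        int[] incr = new int[positions.length];
        for (int i = 0; i < positions.length; i++) {
          // first increment is pos + 1 because consumers start counting at -1
          incr[i] = (i == 0) ? positions[0] + 1 : positions[i] - positions[i - 1];
        }
        return incr;
      }
    }

For example, positions {0, 0, 2} become increments {1, 0, 2}, the inverse of the accumulation the query parser performs.
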
| Index: src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java
|
| ===================================================================
|
| --- src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java (revision 708658)
|
| +++ src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java (working copy)
|
| @@ -17,6 +17,7 @@
|
| * limitations under the License. |
| */ |
| |
| +import java.io.IOException; |
| import java.io.Reader; |
| |
| import org.apache.lucene.util.LuceneTestCase; |
| @@ -27,7 +28,12 @@
|
| import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.TokenStreamTestUtils; |
| import org.apache.lucene.analysis.standard.StandardTokenizer; |
| +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; |
| |
| /** |
| * Test QueryParser's ability to deal with Analyzers that return more |
| @@ -138,36 +144,54 @@
|
| } |
| } |
| |
| - private final class TestFilter extends TokenFilter { |
| + private final class TestFilter extends TokenStreamTestUtils.BackwardsCompatibleFilter { |
| |
| - private Token prevToken; |
| + private String prevType; |
| + private int prevStartOffset; |
| + private int prevEndOffset; |
| |
| + TermAttribute termAtt; |
| + PositionIncrementAttribute posIncrAtt; |
| + OffsetAttribute offsetAtt; |
| + TypeAttribute typeAtt; |
| + |
| + public void initialize() throws IOException { |
| + termAtt = (TermAttribute) addAttribute(TermAttribute.class); |
| + posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); |
| + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); |
| + typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); |
| + } |
| + |
| public TestFilter(TokenStream in) { |
| super(in); |
| } |
| |
| - public final Token next(final Token reusableToken) throws java.io.IOException { |
| + public final boolean incrementToken() throws java.io.IOException { |
| if (multiToken > 0) { |
| - reusableToken.reinit("multi"+(multiToken+1), prevToken.startOffset(), prevToken.endOffset(), prevToken.type()); |
| - reusableToken.setPositionIncrement(0); |
| + termAtt.setTermBuffer("multi"+(multiToken+1)); |
| + offsetAtt.setStartOffset(prevStartOffset); |
| + offsetAtt.setEndOffset(prevEndOffset); |
| + typeAtt.setType(prevType); |
| + posIncrAtt.setPositionIncrement(0); |
| multiToken--; |
| - return reusableToken; |
| + return true; |
| } else { |
| - Token nextToken = input.next(reusableToken); |
| - if (nextToken == null) { |
| - prevToken = null; |
| - return null; |
| + boolean next = input.incrementToken(); |
| + if (!next) { |
| + return false; |
| } |
| - prevToken = (Token) nextToken.clone(); |
| - String text = nextToken.term(); |
| + prevType = typeAtt.type(); |
| + prevStartOffset = offsetAtt.startOffset(); |
| + prevEndOffset = offsetAtt.endOffset(); |
| + String text = termAtt.term(); |
| if (text.equals("triplemulti")) { |
| multiToken = 2; |
| - return nextToken; |
| + return true; |
| } else if (text.equals("multi")) { |
| multiToken = 1; |
| - return nextToken; |
| + return true; |
| } else { |
| - return nextToken; |
| + return true; |
| } |
| } |
| } |
| @@ -190,25 +214,34 @@
|
| } |
| } |
| |
| - private final class TestPosIncrementFilter extends TokenFilter { |
| + private final class TestPosIncrementFilter extends TokenStreamTestUtils.BackwardsCompatibleFilter { |
| |
| + TermAttribute termAtt; |
| + PositionIncrementAttribute posIncrAtt; |
| + |
| + public void initialize() throws IOException { |
| + termAtt = (TermAttribute) addAttribute(TermAttribute.class); |
| + posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); |
| + } |
| + |
| public TestPosIncrementFilter(TokenStream in) { |
| super(in); |
| } |
| |
| - public final Token next(final Token reusableToken) throws java.io.IOException { |
| - for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) { |
| - if (nextToken.term().equals("the")) { |
| + public final boolean incrementToken() throws java.io.IOException { |
| + while(input.incrementToken()) { |
| + if (termAtt.term().equals("the")) { |
| // stopword, do nothing |
| - } else if (nextToken.term().equals("quick")) { |
| - nextToken.setPositionIncrement(2); |
| - return nextToken; |
| + } else if (termAtt.term().equals("quick")) { |
| + posIncrAtt.setPositionIncrement(2); |
| + return true; |
| } else { |
| - nextToken.setPositionIncrement(1); |
| - return nextToken; |
| + posIncrAtt.setPositionIncrement(1); |
| + return true; |
| } |
| } |
| - return null; |
| + return false; |
| } |
| } |
| |
| Index: src/test/org/apache/lucene/queryParser/TestQueryParser.java
|
| ===================================================================
|
| --- src/test/org/apache/lucene/queryParser/TestQueryParser.java (revision 708658)
|
| +++ src/test/org/apache/lucene/queryParser/TestQueryParser.java (working copy)
|
| @@ -34,8 +34,13 @@
|
| import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.TokenStreamTestUtils; |
| import org.apache.lucene.analysis.WhitespaceAnalyzer; |
| import org.apache.lucene.analysis.standard.StandardAnalyzer; |
| +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; |
| import org.apache.lucene.document.DateField; |
| import org.apache.lucene.document.DateTools; |
| import org.apache.lucene.document.Document; |
| @@ -64,7 +69,17 @@
|
| |
| public static Analyzer qpAnalyzer = new QPTestAnalyzer(); |
| |
| - public static class QPTestFilter extends TokenFilter { |
| + public static class QPTestFilter extends TokenStreamTestUtils.BackwardsCompatibleFilter { |
| + TermAttribute termAtt; |
| + OffsetAttribute offsetAtt; |
| + |
| + public void initialize() throws IOException { |
| + termAtt = (TermAttribute) addAttribute(TermAttribute.class); |
| + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); |
| + } |
| + |
| /** |
| * Filter which discards the token 'stop' and which expands the |
| * token 'phrase' into 'phrase1 phrase2' |
| @@ -76,25 +91,31 @@
|
| boolean inPhrase = false; |
| int savedStart = 0, savedEnd = 0; |
| |
| - public Token next(final Token reusableToken) throws IOException { |
| - assert reusableToken != null; |
| + public boolean incrementToken() throws IOException { |
| if (inPhrase) { |
| inPhrase = false; |
| - return reusableToken.reinit("phrase2", savedStart, savedEnd); |
| + termAtt.setTermBuffer("phrase2"); |
| + offsetAtt.setStartOffset(savedStart); |
| + offsetAtt.setEndOffset(savedEnd); |
| + return true; |
| } else |
| - for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) { |
| - if (nextToken.term().equals("phrase")) { |
| + while (input.incrementToken()) { |
| + if (termAtt.term().equals("phrase")) { |
| inPhrase = true; |
| - savedStart = nextToken.startOffset(); |
| - savedEnd = nextToken.endOffset(); |
| - return nextToken.reinit("phrase1", savedStart, savedEnd); |
| - } else if (!nextToken.term().equals("stop")) |
| - return nextToken; |
| + savedStart = offsetAtt.startOffset(); |
| + savedEnd = offsetAtt.endOffset(); |
| + termAtt.setTermBuffer("phrase1"); |
| + offsetAtt.setStartOffset(savedStart); |
| + offsetAtt.setEndOffset(savedEnd); |
| + return true; |
| + } else if (!termAtt.term().equals("stop")) |
| + return true; |
| } |
| - return null; |
| + return false; |
| } |
| } |
| |
| + |
| public static class QPTestAnalyzer extends Analyzer { |
| |
| /** Filters LowerCaseTokenizer with StopFilter. */ |
| Index: src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java
|
| ===================================================================
|
| --- src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java (revision 708658)
|
| +++ src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java (working copy)
|
| @@ -21,9 +21,14 @@
|
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.LowerCaseTokenizer; |
| +import org.apache.lucene.analysis.TestToken; |
| import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.TokenStreamTestUtils; |
| +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.index.IndexWriter; |
| @@ -63,32 +68,38 @@
|
| } |
| } |
| |
| - private class PayloadFilter extends TokenFilter { |
| + private class PayloadFilter extends TokenStreamTestUtils.BackwardsCompatibleFilter { |
| String fieldName; |
| int numSeen = 0; |
| + |
| + PayloadAttribute payloadAtt; |
| + |
| + public void initialize() throws IOException { |
| + payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); |
| + } |
| |
| public PayloadFilter(TokenStream input, String fieldName) { |
| super(input); |
| this.fieldName = fieldName; |
| } |
| - |
| - public Token next(final Token reusableToken) throws IOException { |
| - assert reusableToken != null; |
| - Token nextToken = input.next(reusableToken); |
| - if (nextToken != null) { |
| + |
| + public boolean incrementToken() throws IOException { |
| + boolean hasNext = input.incrementToken(); |
| + if (hasNext) { |
| if (fieldName.equals("field")) { |
| - nextToken.setPayload(new Payload(payloadField)); |
| + payloadAtt.setPayload(new Payload(payloadField)); |
| } else if (fieldName.equals("multiField")) { |
| if (numSeen % 2 == 0) { |
| - nextToken.setPayload(new Payload(payloadMultiField1)); |
| + payloadAtt.setPayload(new Payload(payloadMultiField1)); |
| } else { |
| - nextToken.setPayload(new Payload(payloadMultiField2)); |
| + payloadAtt.setPayload(new Payload(payloadMultiField2)); |
| } |
| numSeen++; |
| } |
| - |
| + return true; |
| + } else { |
| + return false; |
| } |
| - return nextToken; |
| } |
| } |
| |
| Index: src/test/org/apache/lucene/search/TestPositionIncrement.java
|
| ===================================================================
|
| --- src/test/org/apache/lucene/search/TestPositionIncrement.java (revision 708658)
|
| +++ src/test/org/apache/lucene/search/TestPositionIncrement.java (working copy)
|
| @@ -17,14 +17,20 @@
|
| * limitations under the License. |
| */ |
| |
| +import java.io.IOException; |
| import java.io.Reader; |
| import java.io.StringReader; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.StopFilter; |
| +import org.apache.lucene.analysis.TestToken; |
| import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.TokenStreamTestUtils; |
| import org.apache.lucene.analysis.WhitespaceAnalyzer; |
| +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.index.IndexWriter; |
| @@ -44,19 +50,30 @@
|
| public void testSetPosition() throws Exception { |
| Analyzer analyzer = new Analyzer() { |
| public TokenStream tokenStream(String fieldName, Reader reader) { |
| - return new TokenStream() { |
| + return new TokenStreamTestUtils.BackwardsCompatibleStream() { |
| private final String[] TOKENS = {"1", "2", "3", "4", "5"}; |
| private final int[] INCREMENTS = {1, 2, 1, 0, 1}; |
| private int i = 0; |
| |
| - public Token next(final Token reusableToken) { |
| - assert reusableToken != null; |
| + PositionIncrementAttribute posIncrAtt; |
| + TermAttribute termAtt; |
| + OffsetAttribute offsetAtt; |
| + |
| + public void initialize() throws IOException { |
| + posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); |
| + termAtt = (TermAttribute) addAttribute(TermAttribute.class); |
| + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); |
| + } |
| + |
| + public boolean incrementToken() { |
| if (i == TOKENS.length) |
| - return null; |
| - reusableToken.reinit(TOKENS[i], i, i); |
| - reusableToken.setPositionIncrement(INCREMENTS[i]); |
| + return false; |
| + termAtt.setTermBuffer(TOKENS[i]); |
| + offsetAtt.setStartOffset(i); |
| + offsetAtt.setEndOffset(i); |
| + posIncrAtt.setPositionIncrement(INCREMENTS[i]); |
| i++; |
| - return reusableToken; |
| + return true; |
| } |
| }; |
| } |
| Index: src/test/org/apache/lucene/util/LuceneTestCase.java
|
| ===================================================================
|
| --- src/test/org/apache/lucene/util/LuceneTestCase.java (revision 708658)
|
| +++ src/test/org/apache/lucene/util/LuceneTestCase.java (working copy)
|
| @@ -17,6 +17,7 @@
|
| * limitations under the License. |
| */ |
| |
| +import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.index.ConcurrentMergeScheduler; |
| import junit.framework.TestCase; |
| |
| @@ -42,6 +43,7 @@
|
| |
| protected void setUp() throws Exception { |
| ConcurrentMergeScheduler.setTestMode(); |
| + TokenStream.setUseNewAPI(true); |
| } |
| |
| protected void tearDown() throws Exception { |
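
Flipping TokenStream.setUseNewAPI(true) in LuceneTestCase.setUp() runs the entire test suite against the new API. A test that still needs to exercise the old Token-based path could toggle the static switch locally; a hypothetical sketch:

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.util.LuceneTestCase;

    // Hypothetical test: temporarily restores the old API for one method.
    public class TestOldApiPath extends LuceneTestCase {
      public void testWithOldAPI() throws Exception {
        TokenStream.setUseNewAPI(false);
        try {
          // ... exercise Token-based code here ...
        } finally {
          TokenStream.setUseNewAPI(true); // restore what setUp() configured
        }
      }
    }
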