package org.apache.lucene.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.util.CloseableThreadLocal;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;
/**
* An Analyzer builds TokenStreams, which analyze text. It thus represents a
* policy for extracting index terms from text.
* <p>
* In order to define what analysis is done, subclasses must define their
* {@link TokenStreamComponents} in {@link #createComponents(String, Reader)}.
* The components are then reused in each call to {@link #tokenStream(String, Reader)}.
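* <p>
* A minimal sketch, using hypothetical FooTokenizer and FooFilter classes
* (any concrete Tokenizer/TokenFilter pair is wired up the same way):
* <pre>
* Analyzer analyzer = new Analyzer() {
*   protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
*     Tokenizer source = new FooTokenizer(reader);
*     TokenStream filter = new FooFilter(source);
*     return new TokenStreamComponents(source, filter);
*   }
* };
* </pre>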
*/
public abstract class Analyzer {
private final ReuseStrategy reuseStrategy;
/**
* Create a new Analyzer, reusing the same set of components per-thread
* across calls to {@link #tokenStream(String, Reader)} for all fields.
*/
public Analyzer() {
this(new GlobalReuseStrategy());
}
/**
* Expert: create a new Analyzer with a custom {@link ReuseStrategy}.
*/
public Analyzer(ReuseStrategy reuseStrategy) {
this.reuseStrategy = reuseStrategy;
}
/**
* Creates a new {@link TokenStreamComponents} instance for this analyzer.
*
* @param fieldName
* the name of the field the created {@link TokenStreamComponents}
* are used for
* @param aReader
* the reader passed to the {@link Tokenizer} constructor
* @return the {@link TokenStreamComponents} for this analyzer.
*/
protected abstract TokenStreamComponents createComponents(String fieldName,
Reader aReader);
/**
* Creates a TokenStream that re-uses the components created the previous
* time the same thread called this method. Callers that do not need to use
* more than one TokenStream at the same time from this analyzer should use
* this method for better performance.
* <p>
* This method uses {@link #createComponents(String, Reader)} to obtain an
* instance of {@link TokenStreamComponents}. It returns the sink of the
* components and stores the components internally. Subsequent calls to this
* method will reuse the previously stored components after resetting them
* through {@link TokenStreamComponents#reset(Reader)}.
* </p>
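* <p>
* A sketch of the usual consumer workflow (see {@link TokenStream} for the
* authoritative contract; CharTermAttribute lives in
* org.apache.lucene.analysis.tokenattributes):
* <pre>
* TokenStream stream = analyzer.tokenStream("body", new StringReader(text));
* CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
* stream.reset();
* while (stream.incrementToken()) {
*   System.out.println(termAtt.toString());
* }
* stream.end();
* stream.close();
* </pre>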
*
* @param fieldName the name of the field the created TokenStream is used for
* @param reader the reader the stream's source reads from
*/
public final TokenStream tokenStream(final String fieldName,
final Reader reader) throws IOException {
TokenStreamComponents components = reuseStrategy.getReusableComponents(fieldName);
final Reader r = initReader(reader);
if (components == null) {
components = createComponents(fieldName, r);
reuseStrategy.setReusableComponents(fieldName, components);
} else {
components.reset(r);
}
return components.getTokenStream();
}
/**
* Override this if you want to add a CharFilter chain.
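* <p>
* For example (a sketch using a hypothetical FooCharFilter; concrete
* CharFilters such as those in the analyzers-common module plug in the
* same way):
* <pre>
* protected Reader initReader(Reader reader) {
*   return new FooCharFilter(reader);
* }
* </pre>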
*/
protected Reader initReader(Reader reader) {
return reader;
}
/**
* Invoked before indexing an IndexableField instance if
* terms have already been added to that field. This allows custom
* analyzers to place an automatic position increment gap between
* IndexableField instances using the same field name. The default
* position increment gap is 0. With a 0 position increment gap and
* the typical default token position increment of 1, all terms in a field,
* including those across IndexableField instances, are in successive positions, allowing
* exact PhraseQuery matches, for instance, across IndexableField instance boundaries.
*
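* <p>
* For example, an analyzer can keep phrase queries from silently matching
* across instance boundaries by overriding this with a large gap (a sketch;
* the value 100 is arbitrary):
* <pre>
* public int getPositionIncrementGap(String fieldName) {
*   return 100; // consecutive instances are now 100 positions apart
* }
* </pre>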
* @param fieldName IndexableField name being indexed.
* @return position increment gap, added to the next token emitted from {@link #tokenStream(String,Reader)}
*/
public int getPositionIncrementGap(String fieldName) {
return 0;
}
/**
* Just like {@link #getPositionIncrementGap}, except for
* Token offsets instead. By default this returns 1 for
* tokenized fields, as if the field values were joined
* with an extra space character, and 0 for un-tokenized
* fields. For example, with the tokenized values "ab" and "cd" in one field,
* the default gap of 1 places the first token of "cd" at start offset 3
* rather than 2. This method is only called if the field
* produced at least one token for indexing.
*
* @param field the field just indexed
* @return offset gap, added to the next token emitted from {@link #tokenStream(String,Reader)}
*/
public int getOffsetGap(IndexableField field) {
if (field.fieldType().tokenized()) {
return 1;
} else {
return 0;
}
}
/** Frees persistent resources used by this Analyzer */
public void close() {
reuseStrategy.close();
}
/**
* This class encapsulates the outer components of a token stream. It provides
* access to the source ({@link Tokenizer}) and the outer end (sink), a
* {@link TokenStream} (typically a {@link TokenFilter}) that is also the
* stream returned by {@link Analyzer#tokenStream(String, Reader)}.
*/
*/
public static class TokenStreamComponents {
protected final Tokenizer source;
protected final TokenStream sink;
/**
* Creates a new {@link TokenStreamComponents} instance.
*
* @param source
* the analyzer's tokenizer
* @param result
* the analyzer's resulting token stream
*/
public TokenStreamComponents(final Tokenizer source,
final TokenStream result) {
this.source = source;
this.sink = result;
}
/**
* Creates a new {@link TokenStreamComponents} instance.
*
* @param source
* the analyzer's tokenizer
*/
public TokenStreamComponents(final Tokenizer source) {
this.source = source;
this.sink = source;
}
/**
* Resets the encapsulated components with the given reader. If the components
* cannot be reset, an Exception should be thrown.
*
* @param reader
* a reader to reset the source component
* @throws IOException
* if the component's reset method throws an {@link IOException}
*/
protected void reset(final Reader reader) throws IOException {
source.reset(reader);
}
/**
* Returns the sink {@link TokenStream}
*
* @return the sink {@link TokenStream}
*/
public TokenStream getTokenStream() {
return sink;
}
/**
* Returns the component's {@link Tokenizer}
*
* @return Component's {@link Tokenizer}
*/
public Tokenizer getTokenizer() {
return source;
}
}
/**
* Strategy defining how TokenStreamComponents are reused per call to
* {@link Analyzer#tokenStream(String, java.io.Reader)}.
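* <p>
* A sketch of opting into per-field reuse via the Analyzer constructor
* (FooTokenizer is hypothetical):
* <pre>
* Analyzer analyzer = new Analyzer(new PerFieldReuseStrategy()) {
*   protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
*     return new TokenStreamComponents(new FooTokenizer(reader));
*   }
* };
* </pre>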
*/
public abstract static class ReuseStrategy {
private CloseableThreadLocal<Object> storedValue = new CloseableThreadLocal<Object>();
/**
* Gets the reusable TokenStreamComponents for the field with the given name
*
* @param fieldName Name of the field whose reusable TokenStreamComponents
* are to be retrieved
* @return Reusable TokenStreamComponents for the field, or {@code null}
* if no components were previously stored for the field
*/
public abstract TokenStreamComponents getReusableComponents(String fieldName);
/**
* Stores the given TokenStreamComponents as the reusable components for the
* field with the given name
*
* @param fieldName Name of the field whose TokenStreamComponents are being set
* @param components TokenStreamComponents which are to be reused for the field
*/
public abstract void setReusableComponents(String fieldName, TokenStreamComponents components);
/**
* Returns the currently stored value
*
* @return Currently stored value or {@code null} if no value is stored
*/
protected final Object getStoredValue() {
try {
return storedValue.get();
} catch (NullPointerException npe) {
if (storedValue == null) {
throw new AlreadyClosedException("this Analyzer is closed");
} else {
throw npe;
}
}
}
/**
* Sets the stored value
*
* @param storedValue Value to store
*/
protected final void setStoredValue(Object storedValue) {
try {
this.storedValue.set(storedValue);
} catch (NullPointerException npe) {
if (this.storedValue == null) { // check the field, not the parameter that shadows it
throw new AlreadyClosedException("this Analyzer is closed");
} else {
throw npe;
}
}
}
/**
* Closes the ReuseStrategy, freeing any resources
*/
public void close() {
storedValue.close();
storedValue = null;
}
}
/**
* Implementation of {@link ReuseStrategy} that reuses the same components for
* every field.
*/
public static final class GlobalReuseStrategy extends ReuseStrategy {
/**
* {@inheritDoc}
*/
@Override
public TokenStreamComponents getReusableComponents(String fieldName) {
return (TokenStreamComponents) getStoredValue();
}
/**
* {@inheritDoc}
*/
@Override
public void setReusableComponents(String fieldName, TokenStreamComponents components) {
setStoredValue(components);
}
}
/**
* Implementation of {@link ReuseStrategy} that reuses components per-field by
* maintaining a Map of {@link TokenStreamComponents} per field name.
*/
public static class PerFieldReuseStrategy extends ReuseStrategy {
/**
* {@inheritDoc}
*/
@SuppressWarnings("unchecked")
@Override
public TokenStreamComponents getReusableComponents(String fieldName) {
Map<String, TokenStreamComponents> componentsPerField = (Map<String, TokenStreamComponents>) getStoredValue();
return componentsPerField != null ? componentsPerField.get(fieldName) : null;
}
/**
* {@inheritDoc}
*/
@SuppressWarnings("unchecked")
@Override
public void setReusableComponents(String fieldName, TokenStreamComponents components) {
Map<String, TokenStreamComponents> componentsPerField = (Map<String, TokenStreamComponents>) getStoredValue();
if (componentsPerField == null) {
componentsPerField = new HashMap<String, TokenStreamComponents>();
setStoredValue(componentsPerField);
}
componentsPerField.put(fieldName, components);
}
}
}