| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis; |
| |
| import java.io.Closeable; |
| import java.io.IOException; |
| import java.io.Reader; |
| import java.io.StringReader; |
| import java.util.HashMap; |
| import java.util.Map; |
| import java.util.function.Consumer; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; |
| import org.apache.lucene.store.AlreadyClosedException; |
| import org.apache.lucene.util.AttributeFactory; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.CloseableThreadLocal; |
| import org.apache.lucene.util.Version; |
| |
| /** |
| * An Analyzer builds TokenStreams, which analyze text. It thus represents a policy for extracting |
| * index terms from text. |
| * |
| * <p>In order to define what analysis is done, subclasses must define their {@link |
| * TokenStreamComponents TokenStreamComponents} in {@link #createComponents(String)}. The components |
| * are then reused in each call to {@link #tokenStream(String, Reader)}. |
| * |
| * <p>Simple example: |
| * |
| * <pre class="prettyprint"> |
| * Analyzer analyzer = new Analyzer() { |
| * {@literal @Override} |
| * protected TokenStreamComponents createComponents(String fieldName) { |
| * Tokenizer source = new FooTokenizer(); |
| * TokenStream filter = new FooFilter(source); |
| * filter = new BarFilter(filter); |
| * return new TokenStreamComponents(source, filter); |
| * } |
| * {@literal @Override} |
| * protected TokenStream normalize(String fieldName, TokenStream in) { |
| * // Assuming FooFilter is about normalization and BarFilter is about |
| * // stemming, only FooFilter should be applied |
| * return new FooFilter(in); |
| * } |
| * }; |
| * </pre> |
| * |
| * For more examples, see the {@link org.apache.lucene.analysis Analysis package documentation}. |
| * |
| * <p>For some concrete implementations bundled with Lucene, look in the analysis modules: |
| * |
| * <ul> |
| * <li><a href="{@docRoot}/../analysis/common/overview-summary.html">Common</a>: Analyzers for |
| * indexing content in different languages and domains. |
| * <li><a href="{@docRoot}/../analysis/icu/overview-summary.html">ICU</a>: Exposes functionality |
| * from ICU to Apache Lucene. |
| * <li><a href="{@docRoot}/../analysis/kuromoji/overview-summary.html">Kuromoji</a>: Morphological |
| * analyzer for Japanese text. |
| * <li><a href="{@docRoot}/../analysis/morfologik/overview-summary.html">Morfologik</a>: |
| * Dictionary-driven lemmatization for the Polish language. |
| * <li><a href="{@docRoot}/../analysis/phonetic/overview-summary.html">Phonetic</a>: Analysis for |
| * indexing phonetic signatures (for sounds-alike search). |
| * <li><a href="{@docRoot}/../analysis/smartcn/overview-summary.html">Smart Chinese</a>: Analyzer |
| * for Simplified Chinese, which indexes words. |
| * <li><a href="{@docRoot}/../analysis/stempel/overview-summary.html">Stempel</a>: Algorithmic |
| * Stemmer for the Polish Language. |
| * </ul> |
| * |
| * @since 3.1 |
| */ |
| public abstract class Analyzer implements Closeable { |
| |
| private final ReuseStrategy reuseStrategy; |
| private Version version = Version.LATEST; |
| |
| // non final as it gets nulled if closed; pkg private for access by ReuseStrategy's final helper |
| // methods: |
| CloseableThreadLocal<Object> storedValue = new CloseableThreadLocal<>(); |
| |
| /** |
| * Create a new Analyzer, reusing the same set of components per-thread across calls to {@link |
| * #tokenStream(String, Reader)}. |
| */ |
| protected Analyzer() { |
| this(GLOBAL_REUSE_STRATEGY); |
| } |
| |
| /** |
| * Expert: create a new Analyzer with a custom {@link ReuseStrategy}. |
| * |
| * <p>NOTE: if you just want to reuse on a per-field basis, it's easier to use a subclass of |
| * {@link AnalyzerWrapper} such as <a |
| * href="{@docRoot}/../analysis/common/org/apache/lucene/analysis/miscellaneous/PerFieldAnalyzerWrapper.html"> |
| * PerFieldAnalyzerWrapper</a> instead. |
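| * |
| * <p>A minimal sketch that caches components per field instead of globally, reusing the |
| * hypothetical filter chain from the class example: |
| * |
| * <pre class="prettyprint"> |
| * Analyzer analyzer = new Analyzer(Analyzer.PER_FIELD_REUSE_STRATEGY) { |
| * {@literal @Override} |
| * protected TokenStreamComponents createComponents(String fieldName) { |
| * Tokenizer source = new FooTokenizer(); |
| * return new TokenStreamComponents(source, new FooFilter(source)); |
| * } |
| * }; |
| * </pre> |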
| */ |
| protected Analyzer(ReuseStrategy reuseStrategy) { |
| this.reuseStrategy = reuseStrategy; |
| } |
| |
| /** |
| * Creates a new {@link TokenStreamComponents} instance for this analyzer. |
| * |
| * @param fieldName the name of the field whose content is passed to the {@link |
| * TokenStreamComponents} source as a reader |
| * @return the {@link TokenStreamComponents} for this analyzer. |
| */ |
| protected abstract TokenStreamComponents createComponents(String fieldName); |
| |
| /** |
| * Wrap the given {@link TokenStream} in order to apply normalization filters. The default |
| * implementation returns the {@link TokenStream} as-is. This is used by {@link #normalize(String, |
| * String)}. |
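| * |
| * <p>A minimal sketch for an analyzer whose only normalizing step is lower-casing ({@code |
| * LowerCaseFilter} stands in for whichever normalizing filters the analyzer actually applies): |
| * |
| * <pre class="prettyprint"> |
| * {@literal @Override} |
| * protected TokenStream normalize(String fieldName, TokenStream in) { |
| * return new LowerCaseFilter(in); |
| * } |
| * </pre> |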
| */ |
| protected TokenStream normalize(String fieldName, TokenStream in) { |
| return in; |
| } |
| |
| /** |
| * Returns a TokenStream suitable for <code>fieldName</code>, tokenizing the contents of <code> |
| * reader</code>. |
| * |
| * <p>This method uses {@link #createComponents(String)} to obtain an instance of {@link |
| * TokenStreamComponents}. It returns the sink of the components and stores the components |
| * internally. Subsequent calls to this method will reuse the previously stored components after |
| * resetting them through {@link TokenStreamComponents#setReader(Reader)}. |
| * |
| * <p><b>NOTE:</b> After calling this method, the consumer must follow the workflow described in |
| * {@link TokenStream} to properly consume its contents. See the {@link org.apache.lucene.analysis |
| * Analysis package documentation} for some examples demonstrating this. |
| * |
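| * <p>A minimal sketch of that workflow (the field name and reader are hypothetical): |
| * |
| * <pre class="prettyprint"> |
| * try (TokenStream ts = analyzer.tokenStream("myField", reader)) { |
| * CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); |
| * ts.reset(); |
| * while (ts.incrementToken()) { |
| * System.out.println(termAtt.toString()); |
| * } |
| * ts.end(); |
| * } |
| * </pre> |
| * |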
| * <p><b>NOTE:</b> If your data is available as a {@code String}, use {@link #tokenStream(String, |
| * String)} which reuses a {@code StringReader}-like instance internally. |
| * |
| * @param fieldName the name of the field the created TokenStream is used for |
| * @param reader the reader the stream's source reads from |
| * @return TokenStream for iterating the analyzed content of <code>reader</code> |
| * @throws AlreadyClosedException if the Analyzer is closed. |
| * @see #tokenStream(String, String) |
| */ |
| public final TokenStream tokenStream(final String fieldName, final Reader reader) { |
| TokenStreamComponents components = reuseStrategy.getReusableComponents(this, fieldName); |
| final Reader r = initReader(fieldName, reader); |
| if (components == null) { |
| components = createComponents(fieldName); |
| reuseStrategy.setReusableComponents(this, fieldName, components); |
| } |
| components.setReader(r); |
| return components.getTokenStream(); |
| } |
| |
| /** |
| * Returns a TokenStream suitable for <code>fieldName</code>, tokenizing the contents of <code> |
| * text</code>. |
| * |
| * <p>This method uses {@link #createComponents(String)} to obtain an instance of {@link |
| * TokenStreamComponents}. It returns the sink of the components and stores the components |
| * internally. Subsequent calls to this method will reuse the previously stored components after |
| * resetting them through {@link TokenStreamComponents#setReader(Reader)}. |
| * |
| * <p><b>NOTE:</b> After calling this method, the consumer must follow the workflow described in |
| * {@link TokenStream} to properly consume its contents. See the {@link org.apache.lucene.analysis |
| * Analysis package documentation} for some examples demonstrating this. |
| * |
| * @param fieldName the name of the field the created TokenStream is used for |
| * @param text the String the stream's source reads from |
| * @return TokenStream for iterating the analyzed content of <code>text</code> |
| * @throws AlreadyClosedException if the Analyzer is closed. |
| * @see #tokenStream(String, Reader) |
| */ |
| public final TokenStream tokenStream(final String fieldName, final String text) { |
| TokenStreamComponents components = reuseStrategy.getReusableComponents(this, fieldName); |
| @SuppressWarnings("resource") |
| final ReusableStringReader strReader = |
| (components == null || components.reusableStringReader == null) |
| ? new ReusableStringReader() |
| : components.reusableStringReader; |
| strReader.setValue(text); |
| final Reader r = initReader(fieldName, strReader); |
| if (components == null) { |
| components = createComponents(fieldName); |
| reuseStrategy.setReusableComponents(this, fieldName, components); |
| } |
| |
| components.setReader(r); |
| components.reusableStringReader = strReader; |
| return components.getTokenStream(); |
| } |
| |
| /** |
| * Normalize a string down to the representation that it would have in the index. |
| * |
| * <p>This is typically used by query parsers in order to generate a query on a given term, |
| * without tokenizing or stemming, which are undesirable if the string to analyze is a partial |
| * word (eg. in case of a wildcard or fuzzy query). |
| * |
| * <p>This method uses {@link #initReaderForNormalization(String, Reader)} in order to apply |
| * necessary character-level normalization and then {@link #normalize(String, TokenStream)} in |
| * order to apply the normalizing token filters. |
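| * |
| * <p>A minimal sketch of typical query-parser usage (the field name and term are hypothetical): |
| * |
| * <pre class="prettyprint"> |
| * BytesRef prefix = analyzer.normalize("myField", "Fo"); |
| * // build e.g. a prefix query from the normalized bytes ("fo" after lower-casing) |
| * </pre> |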
| */ |
| public final BytesRef normalize(final String fieldName, final String text) { |
| try { |
| // apply char filters |
| final String filteredText; |
| try (Reader reader = new StringReader(text)) { |
| Reader filterReader = initReaderForNormalization(fieldName, reader); |
| char[] buffer = new char[64]; |
| StringBuilder builder = new StringBuilder(); |
| for (; ; ) { |
| final int read = filterReader.read(buffer, 0, buffer.length); |
| if (read == -1) { |
| break; |
| } |
| builder.append(buffer, 0, read); |
| } |
| filteredText = builder.toString(); |
| } catch (IOException e) { |
| throw new IllegalStateException("Normalization threw an unexpected exception", e); |
| } |
| |
| final AttributeFactory attributeFactory = attributeFactory(fieldName); |
| try (TokenStream ts = |
| normalize( |
| fieldName, new StringTokenStream(attributeFactory, filteredText, text.length()))) { |
| final TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class); |
| ts.reset(); |
| if (ts.incrementToken() == false) { |
| throw new IllegalStateException( |
| "The normalization token stream is " |
| + "expected to produce exactly 1 token, but got 0 for analyzer " |
| + this |
| + " and input \"" |
| + text |
| + "\""); |
| } |
| final BytesRef term = BytesRef.deepCopyOf(termAtt.getBytesRef()); |
| if (ts.incrementToken()) { |
| throw new IllegalStateException( |
| "The normalization token stream is " |
| + "expected to produce exactly 1 token, but got 2+ for analyzer " |
| + this |
| + " and input \"" |
| + text |
| + "\""); |
| } |
| ts.end(); |
| return term; |
| } |
| } catch (IOException e) { |
| throw new IllegalStateException("Normalization threw an unexpected exception", e); |
| } |
| } |
| |
| /** |
| * Override this if you want to add a CharFilter chain. |
| * |
| * <p>The default implementation returns <code>reader</code> unchanged. |
| * |
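| * <p>A minimal sketch that strips HTML markup before tokenization, assuming {@code |
| * HTMLStripCharFilter} from the analysis-common module is available: |
| * |
| * <pre class="prettyprint"> |
| * {@literal @Override} |
| * protected Reader initReader(String fieldName, Reader reader) { |
| * return new HTMLStripCharFilter(reader); |
| * } |
| * </pre> |
| * |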
| * @param fieldName IndexableField name being indexed |
| * @param reader original Reader |
| * @return reader, optionally decorated with CharFilter(s) |
| */ |
| protected Reader initReader(String fieldName, Reader reader) { |
| return reader; |
| } |
| |
| /** |
| * Wrap the given {@link Reader} with {@link CharFilter}s that make sense for normalization. This |
| * is typically a subset of the {@link CharFilter}s that are applied in {@link #initReader(String, |
| * Reader)}. This is used by {@link #normalize(String, String)}. |
| */ |
| protected Reader initReaderForNormalization(String fieldName, Reader reader) { |
| return reader; |
| } |
| |
| /** |
| * Return the {@link AttributeFactory} to be used for {@link #tokenStream analysis} and {@link |
| * #normalize(String, String) normalization} on the given {@code fieldName}. The default |
| * implementation returns {@link TokenStream#DEFAULT_TOKEN_ATTRIBUTE_FACTORY}. |
| */ |
| protected AttributeFactory attributeFactory(String fieldName) { |
| return TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY; |
| } |
| |
| /** |
| * Invoked before indexing an IndexableField instance if terms have already been added to that |
| * field. This allows custom analyzers to place an automatic position increment gap between |
| * IndexableField instances using the same field name. The default position increment gap is 0. |
| * With a 0 position increment gap and the typical default token position increment of 1, all |
| * terms in a field, including across IndexableField instances, are in successive positions, |
| * allowing exact PhraseQuery matches, for instance, across IndexableField instance boundaries. |
| * |
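| * <p>A minimal sketch that keeps phrase queries from matching across values of the same field |
| * (the gap of 100 is an arbitrary, hypothetical choice): |
| * |
| * <pre class="prettyprint"> |
| * {@literal @Override} |
| * public int getPositionIncrementGap(String fieldName) { |
| * return 100; |
| * } |
| * </pre> |
| * |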
| * @param fieldName IndexableField name being indexed. |
| * @return position increment gap, added to the next token emitted from {@link |
| * #tokenStream(String,Reader)}. This value must be {@code >= 0}. |
| */ |
| public int getPositionIncrementGap(String fieldName) { |
| return 0; |
| } |
| |
| /** |
| * Just like {@link #getPositionIncrementGap}, except for token offsets. By default this |
| * returns 1. This method is only called if the field produced at least one token for indexing. |
| * |
| * @param fieldName the field just indexed |
| * @return offset gap, added to the next token emitted from {@link #tokenStream(String,Reader)}. |
| * This value must be {@code >= 0}. |
| */ |
| public int getOffsetGap(String fieldName) { |
| return 1; |
| } |
| |
| /** Returns the used {@link ReuseStrategy}. */ |
| public final ReuseStrategy getReuseStrategy() { |
| return reuseStrategy; |
| } |
| |
| /** Set the version of Lucene whose behavior this analyzer should mimic for analysis. */ |
| public void setVersion(Version v) { |
| version = v; // TODO: make write once? |
| } |
| |
| /** Return the version of Lucene whose behavior this analyzer mimics for analysis. */ |
| public Version getVersion() { |
| return version; |
| } |
| |
| /** Frees persistent resources used by this Analyzer */ |
| @Override |
| public void close() { |
| if (storedValue != null) { |
| storedValue.close(); |
| storedValue = null; |
| } |
| } |
| |
| /** |
| * This class encapsulates the outer components of a token stream. It provides access to the |
| * source (a {@link Reader} {@link Consumer}) and the outer end (sink), the {@link TokenStream} |
| * returned by {@link Analyzer#tokenStream(String, Reader)}. |
| */ |
| public static final class TokenStreamComponents { |
| /** Original source of the tokens. */ |
| protected final Consumer<Reader> source; |
| /** |
| * Sink token stream, typically the outermost {@link TokenFilter} in the chain. This can be the |
| * source itself if there are no filters. |
| */ |
| protected final TokenStream sink; |
| |
| /** Internal cache only used by {@link Analyzer#tokenStream(String, String)}. */ |
| transient ReusableStringReader reusableStringReader; |
| |
| /** |
| * Creates a new {@link TokenStreamComponents} instance. |
| * |
| * @param source the source to set the reader on |
| * @param result the analyzer's resulting token stream |
| */ |
| public TokenStreamComponents(final Consumer<Reader> source, final TokenStream result) { |
| this.source = source; |
| this.sink = result; |
| } |
| |
| /** |
| * Creates a new {@link TokenStreamComponents} instance. |
| * |
| * @param tokenizer the analyzer's Tokenizer |
| * @param result the analyzer's resulting token stream |
| */ |
| public TokenStreamComponents(final Tokenizer tokenizer, final TokenStream result) { |
| this(tokenizer::setReader, result); |
| } |
| |
| /** Creates a new {@link TokenStreamComponents} from a Tokenizer */ |
| public TokenStreamComponents(final Tokenizer tokenizer) { |
| this(tokenizer::setReader, tokenizer); |
| } |
| |
| /** |
| * Resets the encapsulated components with the given reader. If the components cannot be reset, |
| * an Exception should be thrown. |
| * |
| * @param reader a reader to reset the source component |
| */ |
| private void setReader(final Reader reader) { |
| source.accept(reader); |
| } |
| |
| /** |
| * Returns the sink {@link TokenStream} |
| * |
| * @return the sink {@link TokenStream} |
| */ |
| public TokenStream getTokenStream() { |
| return sink; |
| } |
| |
| /** Returns the component's source */ |
| public Consumer<Reader> getSource() { |
| return source; |
| } |
| } |
| |
| /** |
| * Strategy defining how TokenStreamComponents are reused per call to {@link |
| * Analyzer#tokenStream(String, java.io.Reader)}. |
| */ |
| public abstract static class ReuseStrategy { |
| /** Sole constructor. (For invocation by subclass constructors, typically implicit.) */ |
| // Explicitly declared so that we have non-empty javadoc |
| protected ReuseStrategy() {} |
| |
| /** |
| * Gets the reusable TokenStreamComponents for the field with the given name. |
| * |
| * @param analyzer Analyzer from which to get the reused components. Use {@link |
| * #getStoredValue(Analyzer)} and {@link #setStoredValue(Analyzer, Object)} to access the |
| * data on the Analyzer. |
| * @param fieldName Name of the field whose reusable TokenStreamComponents are to be retrieved |
| * @return Reusable TokenStreamComponents for the field, or {@code null} if no components were |
| * previously stored for the field |
| */ |
| public abstract TokenStreamComponents getReusableComponents( |
| Analyzer analyzer, String fieldName); |
| |
| /** |
| * Stores the given TokenStreamComponents as the reusable components for the field with the |
| * given name. |
| * |
| * @param analyzer Analyzer on which to store the reused components. Use {@link |
| * #getStoredValue(Analyzer)} and {@link #setStoredValue(Analyzer, Object)} to access the |
| * data on the Analyzer. |
| * @param fieldName Name of the field whose TokenStreamComponents are being set |
| * @param components TokenStreamComponents which are to be reused for the field |
| */ |
| public abstract void setReusableComponents( |
| Analyzer analyzer, String fieldName, TokenStreamComponents components); |
| |
| /** |
| * Returns the currently stored value. |
| * |
| * @return Currently stored value or {@code null} if no value is stored |
| * @throws AlreadyClosedException if the Analyzer is closed. |
| */ |
| protected final Object getStoredValue(Analyzer analyzer) { |
| if (analyzer.storedValue == null) { |
| throw new AlreadyClosedException("this Analyzer is closed"); |
| } |
| return analyzer.storedValue.get(); |
| } |
| |
| /** |
| * Sets the stored value. |
| * |
| * @param storedValue Value to store |
| * @throws AlreadyClosedException if the Analyzer is closed. |
| */ |
| protected final void setStoredValue(Analyzer analyzer, Object storedValue) { |
| if (analyzer.storedValue == null) { |
| throw new AlreadyClosedException("this Analyzer is closed"); |
| } |
| analyzer.storedValue.set(storedValue); |
| } |
| } |
| |
| /** A predefined {@link ReuseStrategy} that reuses the same components for every field. */ |
| public static final ReuseStrategy GLOBAL_REUSE_STRATEGY = |
| new ReuseStrategy() { |
| |
| @Override |
| public TokenStreamComponents getReusableComponents(Analyzer analyzer, String fieldName) { |
| return (TokenStreamComponents) getStoredValue(analyzer); |
| } |
| |
| @Override |
| public void setReusableComponents( |
| Analyzer analyzer, String fieldName, TokenStreamComponents components) { |
| setStoredValue(analyzer, components); |
| } |
| }; |
| |
| /** |
| * A predefined {@link ReuseStrategy} that reuses components per field by maintaining a Map of |
| * TokenStreamComponents per field name. |
| */ |
| public static final ReuseStrategy PER_FIELD_REUSE_STRATEGY = |
| new ReuseStrategy() { |
| |
| @SuppressWarnings("unchecked") |
| @Override |
| public TokenStreamComponents getReusableComponents(Analyzer analyzer, String fieldName) { |
| Map<String, TokenStreamComponents> componentsPerField = |
| (Map<String, TokenStreamComponents>) getStoredValue(analyzer); |
| return componentsPerField != null ? componentsPerField.get(fieldName) : null; |
| } |
| |
| @SuppressWarnings("unchecked") |
| @Override |
| public void setReusableComponents( |
| Analyzer analyzer, String fieldName, TokenStreamComponents components) { |
| Map<String, TokenStreamComponents> componentsPerField = |
| (Map<String, TokenStreamComponents>) getStoredValue(analyzer); |
| if (componentsPerField == null) { |
| componentsPerField = new HashMap<>(); |
| setStoredValue(analyzer, componentsPerField); |
| } |
| componentsPerField.put(fieldName, components); |
| } |
| }; |
| |
| private static final class StringTokenStream extends TokenStream { |
| |
| private final String value; |
| private final int length; |
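| // starts as true so that incrementToken() emits nothing until reset() has been called |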
| private boolean used = true; |
| private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); |
| private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class); |
| |
| StringTokenStream(AttributeFactory attributeFactory, String value, int length) { |
| super(attributeFactory); |
| this.value = value; |
| this.length = length; |
| } |
| |
| @Override |
| public void reset() { |
| used = false; |
| } |
| |
| @Override |
| public boolean incrementToken() { |
| if (used) { |
| return false; |
| } |
| clearAttributes(); |
| termAttribute.append(value); |
| offsetAttribute.setOffset(0, length); |
| used = true; |
| return true; |
| } |
| |
| @Override |
| public void end() throws IOException { |
| super.end(); |
| offsetAttribute.setOffset(length, length); |
| } |
| } |
| } |