/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| package org.apache.lucene.analysis.miscellaneous; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; |
| import org.apache.lucene.util.ArrayUtil; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.BytesRefBuilder; |
| import org.apache.lucene.util.BytesRefHash; |
| import org.apache.lucene.util.CharsRefBuilder; |
| import org.apache.lucene.util.IntsRefBuilder; |
| import org.apache.lucene.util.UnicodeUtil; |
| import org.apache.lucene.util.fst.ByteSequenceOutputs; |
| import org.apache.lucene.util.fst.FST; |
| import org.apache.lucene.util.fst.FST.Arc; |
| import org.apache.lucene.util.fst.FST.BytesReader; |
| import org.apache.lucene.util.fst.FSTCompiler; |
| |
| /** |
| * Provides the ability to override any {@link KeywordAttribute} aware stemmer with custom |
| * dictionary-based stemming. |
| */ |
| public final class StemmerOverrideFilter extends TokenFilter { |
| private final StemmerOverrideMap stemmerOverrideMap; |
| |
| private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
| private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class); |
| private final BytesReader fstReader; |
| private final Arc<BytesRef> scratchArc = new FST.Arc<>(); |
| private char[] spare = new char[0]; |
| |
| /** |
| * Create a new StemmerOverrideFilter, performing dictionary-based stemming with the provided |
| * <code>dictionary</code>. |
| * |
| * <p>Any dictionary-stemmed terms will be marked with {@link KeywordAttribute} so that they will |
| * not be stemmed with stemmers down the chain. |
| */ |
| public StemmerOverrideFilter( |
| final TokenStream input, final StemmerOverrideMap stemmerOverrideMap) { |
| super(input); |
| this.stemmerOverrideMap = stemmerOverrideMap; |
| fstReader = stemmerOverrideMap.getBytesReader(); |
| } |
| |
| @Override |
| public boolean incrementToken() throws IOException { |
| if (input.incrementToken()) { |
| if (fstReader == null) { |
| // No overrides |
| return true; |
| } |
| if (!keywordAtt.isKeyword()) { // don't muck with already-keyworded terms |
| final BytesRef stem = |
| stemmerOverrideMap.get(termAtt.buffer(), termAtt.length(), scratchArc, fstReader); |
| if (stem != null) { |
| spare = ArrayUtil.grow(termAtt.buffer(), stem.length); |
| final int length = UnicodeUtil.UTF8toUTF16(stem, spare); |
| if (spare != termAtt.buffer()) { |
| termAtt.copyBuffer(spare, 0, length); |
| } else { |
| termAtt.setLength(length); |
| } |
| keywordAtt.setKeyword(true); |
| } |
| } |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * A read-only 4-byte FST backed map that allows fast case-insensitive key value lookups for |
| * {@link StemmerOverrideFilter} |
| */ |
| // TODO maybe we can generalize this and reuse this map somehow? |
| public static final class StemmerOverrideMap { |
| private final FST<BytesRef> fst; |
| private final boolean ignoreCase; |
| |
| /** |
| * Creates a new {@link StemmerOverrideMap} |
| * |
| * @param fst the fst to lookup the overrides |
| * @param ignoreCase if the keys case should be ingored |
| */ |
| public StemmerOverrideMap(FST<BytesRef> fst, boolean ignoreCase) { |
| this.fst = fst; |
| this.ignoreCase = ignoreCase; |
| } |
| |
| /** |
| * Returns a {@link BytesReader} to pass to the {@link #get(char[], int, FST.Arc, |
| * FST.BytesReader)} method. |
| */ |
| public BytesReader getBytesReader() { |
| if (fst == null) { |
| return null; |
| } else { |
| return fst.getBytesReader(); |
| } |
| } |
| |
| /** |
| * Returns the value mapped to the given key or <code>null</code> if the key is not in the FST |
| * dictionary. |
| */ |
| public BytesRef get( |
| char[] buffer, int bufferLen, Arc<BytesRef> scratchArc, BytesReader fstReader) |
| throws IOException { |
| BytesRef pendingOutput = fst.outputs.getNoOutput(); |
| BytesRef matchOutput = null; |
| int bufUpto = 0; |
| fst.getFirstArc(scratchArc); |
| while (bufUpto < bufferLen) { |
| final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen); |
| if (fst.findTargetArc( |
| ignoreCase ? Character.toLowerCase(codePoint) : codePoint, |
| scratchArc, |
| scratchArc, |
| fstReader) |
| == null) { |
| return null; |
| } |
| pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output()); |
| bufUpto += Character.charCount(codePoint); |
| } |
| if (scratchArc.isFinal()) { |
| matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput()); |
| } |
| return matchOutput; |
| } |
| } |
| /** This builder builds an {@link FST} for the {@link StemmerOverrideFilter} */ |
| public static class Builder { |
| private final BytesRefHash hash = new BytesRefHash(); |
| private final BytesRefBuilder spare = new BytesRefBuilder(); |
| private final ArrayList<CharSequence> outputValues = new ArrayList<>(); |
| private final boolean ignoreCase; |
| private final CharsRefBuilder charsSpare = new CharsRefBuilder(); |
| |
| /** Creates a new {@link Builder} with ignoreCase set to <code>false</code> */ |
| public Builder() { |
| this(false); |
| } |
| |
| /** |
| * Creates a new {@link Builder} |
| * |
| * @param ignoreCase if the input case should be ignored. |
| */ |
| public Builder(boolean ignoreCase) { |
| this.ignoreCase = ignoreCase; |
| } |
| |
| /** |
| * Adds an input string and its stemmer override output to this builder. |
| * |
| * @param input the input char sequence |
| * @param output the stemmer override output char sequence |
| * @return <code>false</code> iff the input has already been added to this builder otherwise |
| * <code>true</code>. |
| */ |
| public boolean add(CharSequence input, CharSequence output) { |
| final int length = input.length(); |
| if (ignoreCase) { |
| // convert on the fly to lowercase |
| charsSpare.grow(length); |
| final char[] buffer = charsSpare.chars(); |
| for (int i = 0; i < length; ) { |
| i += Character.toChars(Character.toLowerCase(Character.codePointAt(input, i)), buffer, i); |
| } |
| spare.copyChars(buffer, 0, length); |
| } else { |
| spare.copyChars(input, 0, length); |
| } |
| if (hash.add(spare.get()) >= 0) { |
| outputValues.add(output); |
| return true; |
| } |
| return false; |
| } |
| |
| /** |
| * Returns an {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter} |
| * |
| * @return an {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter} |
| * @throws IOException if an {@link IOException} occurs; |
| */ |
| public StemmerOverrideMap build() throws IOException { |
| ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); |
| FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs); |
| final int[] sort = hash.sort(); |
| IntsRefBuilder intsSpare = new IntsRefBuilder(); |
| final int size = hash.size(); |
| BytesRef spare = new BytesRef(); |
| for (int i = 0; i < size; i++) { |
| int id = sort[i]; |
| BytesRef bytesRef = hash.get(id, spare); |
| intsSpare.copyUTF8Bytes(bytesRef); |
| fstCompiler.add(intsSpare.get(), new BytesRef(outputValues.get(id))); |
| } |
| return new StemmerOverrideMap(fstCompiler.compile(), ignoreCase); |
| } |
| } |
| } |