| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.shingle; |
| |
| import java.io.IOException; |
| import java.util.Iterator; |
| import java.util.LinkedList; |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; |
| import org.apache.lucene.analysis.tokenattributes.TypeAttribute; |
| import org.apache.lucene.util.AttributeSource; |
| |
| /** |
| * A ShingleFilter constructs shingles (token n-grams) from a token stream. In other words, it |
| * creates combinations of tokens as a single token. |
| * |
| * <p>For example, the sentence "please divide this sentence into shingles" might be tokenized into |
| * shingles "please divide", "divide this", "this sentence", "sentence into", and "into shingles". |
| * |
| * <p>This filter handles position increments > 1 by inserting filler tokens (tokens with |
| * termtext "_"). It does not handle a position increment of 0. |
| */ |
| public final class ShingleFilter extends TokenFilter { |
| |
| /** filler token for when positionIncrement is more than 1 */ |
| public static final String DEFAULT_FILLER_TOKEN = "_"; |
| |
| /** default maximum shingle size is 2. */ |
| public static final int DEFAULT_MAX_SHINGLE_SIZE = 2; |
| |
| /** default minimum shingle size is 2. */ |
| public static final int DEFAULT_MIN_SHINGLE_SIZE = 2; |
| |
| /** default token type attribute value is "shingle" */ |
| public static final String DEFAULT_TOKEN_TYPE = "shingle"; |
| |
| /** The default string to use when joining adjacent tokens to form a shingle */ |
| public static final String DEFAULT_TOKEN_SEPARATOR = " "; |
| |
| /** |
| * The sequence of input stream tokens (or filler tokens, if necessary) that will be composed to |
| * form output shingles. |
| */ |
| private LinkedList<InputWindowToken> inputWindow = new LinkedList<>(); |
| |
| /** The number of input tokens in the next output token. This is the "n" in "token n-grams". */ |
| private CircularSequence gramSize; |
| |
| /** Shingle and unigram text is composed here. */ |
| private StringBuilder gramBuilder = new StringBuilder(); |
| |
| /** The token type attribute value to use - default is "shingle" */ |
| private String tokenType = DEFAULT_TOKEN_TYPE; |
| |
| /** The string to use when joining adjacent tokens to form a shingle */ |
| private String tokenSeparator = DEFAULT_TOKEN_SEPARATOR; |
| |
| /** |
| * The string to insert for each position at which there is no token (i.e., when position |
| * increment is greater than one). |
| */ |
| private char[] fillerToken = DEFAULT_FILLER_TOKEN.toCharArray(); |
| |
| /** By default, we output unigrams (individual tokens) as well as shingles (token n-grams). */ |
| private boolean outputUnigrams = true; |
| |
| /** By default, we don't override behavior of outputUnigrams. */ |
| private boolean outputUnigramsIfNoShingles = false; |
| |
| /** maximum shingle size (number of tokens) */ |
| private int maxShingleSize; |
| |
| /** minimum shingle size (number of tokens) */ |
| private int minShingleSize; |
| |
| /** |
| * The remaining number of filler tokens to be inserted into the input stream from which shingles |
| * are composed, to handle position increments greater than one. |
| */ |
| private int numFillerTokensToInsert; |
| |
| /** |
| * When the next input stream token has a position increment greater than one, it is stored in |
| * this field until sufficient filler tokens have been inserted to account for the position |
| * increment. |
| */ |
| private AttributeSource nextInputStreamToken; |
| |
| /** Whether or not there is a next input stream token. */ |
| private boolean isNextInputStreamToken = false; |
| |
| /** Whether at least one unigram or shingle has been output at the current position. */ |
| private boolean isOutputHere = false; |
| |
| /** true if no shingles have been output yet (for outputUnigramsIfNoShingles). */ |
| boolean noShingleOutput = true; |
| |
| /** Holds the State after input.end() was called, so we can restore it in our end() impl. */ |
| private State endState; |
| |
| private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
| private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); |
| private final PositionIncrementAttribute posIncrAtt = |
| addAttribute(PositionIncrementAttribute.class); |
| private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class); |
| private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); |
| |
| /** |
| * Constructs a ShingleFilter with the specified shingle size from the {@link TokenStream} <code> |
| * input</code> |
| * |
| * @param input input stream |
| * @param minShingleSize minimum shingle size produced by the filter. |
| * @param maxShingleSize maximum shingle size produced by the filter. |
| */ |
| public ShingleFilter(TokenStream input, int minShingleSize, int maxShingleSize) { |
| super(input); |
| setMaxShingleSize(maxShingleSize); |
| setMinShingleSize(minShingleSize); |
| } |
| |
| /** |
| * Constructs a ShingleFilter with the specified shingle size from the {@link TokenStream} <code> |
| * input</code> |
| * |
| * @param input input stream |
| * @param maxShingleSize maximum shingle size produced by the filter. |
| */ |
| public ShingleFilter(TokenStream input, int maxShingleSize) { |
| this(input, DEFAULT_MIN_SHINGLE_SIZE, maxShingleSize); |
| } |
| |
| /** |
| * Construct a ShingleFilter with default shingle size: 2. |
| * |
| * @param input input stream |
| */ |
| public ShingleFilter(TokenStream input) { |
| this(input, DEFAULT_MIN_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE); |
| } |
| |
| /** |
| * Construct a ShingleFilter with the specified token type for shingle tokens and the default |
| * shingle size: 2 |
| * |
| * @param input input stream |
| * @param tokenType token type for shingle tokens |
| */ |
| public ShingleFilter(TokenStream input, String tokenType) { |
| this(input, DEFAULT_MIN_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE); |
| setTokenType(tokenType); |
| } |
| |
| /** |
| * Set the type of the shingle tokens produced by this filter. (default: "shingle") |
| * |
| * @param tokenType token tokenType |
| */ |
| public void setTokenType(String tokenType) { |
| this.tokenType = tokenType; |
| } |
| |
| /** |
| * Shall the output stream contain the input tokens (unigrams) as well as shingles? (default: |
| * true.) |
| * |
| * @param outputUnigrams Whether or not the output stream shall contain the input tokens |
| * (unigrams) |
| */ |
| public void setOutputUnigrams(boolean outputUnigrams) { |
| this.outputUnigrams = outputUnigrams; |
| gramSize = new CircularSequence(); |
| } |
| |
| /** |
| * Shall we override the behavior of outputUnigrams==false for those times when no shingles are |
| * available (because there are fewer than minShingleSize tokens in the input stream)? (default: |
| * false.) |
| * |
| * <p>Note that if outputUnigrams==true, then unigrams are always output, regardless of whether |
| * any shingles are available. |
| * |
| * @param outputUnigramsIfNoShingles Whether or not to output a single unigram when no shingles |
| * are available. |
| */ |
| public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) { |
| this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles; |
| } |
| |
| /** |
| * Set the max shingle size (default: 2) |
| * |
| * @param maxShingleSize max size of output shingles |
| */ |
| public void setMaxShingleSize(int maxShingleSize) { |
| if (maxShingleSize < 2) { |
| throw new IllegalArgumentException("Max shingle size must be >= 2"); |
| } |
| this.maxShingleSize = maxShingleSize; |
| } |
| |
| /** |
| * Set the min shingle size (default: 2). |
| * |
| * <p>This method requires that the passed in minShingleSize is not greater than maxShingleSize, |
| * so make sure that maxShingleSize is set before calling this method. |
| * |
| * <p>The unigram output option is independent of the min shingle size. |
| * |
| * @param minShingleSize min size of output shingles |
| */ |
| public void setMinShingleSize(int minShingleSize) { |
| if (minShingleSize < 2) { |
| throw new IllegalArgumentException("Min shingle size must be >= 2"); |
| } |
| if (minShingleSize > maxShingleSize) { |
| throw new IllegalArgumentException("Min shingle size must be <= max shingle size"); |
| } |
| this.minShingleSize = minShingleSize; |
| gramSize = new CircularSequence(); |
| } |
| |
| /** |
| * Sets the string to use when joining adjacent tokens to form a shingle |
| * |
| * @param tokenSeparator used to separate input stream tokens in output shingles |
| */ |
| public void setTokenSeparator(String tokenSeparator) { |
| this.tokenSeparator = null == tokenSeparator ? "" : tokenSeparator; |
| } |
| |
| /** |
| * Sets the string to insert for each position at which there is no token (i.e., when position |
| * increment is greater than one). |
| * |
| * @param fillerToken string to insert at each position where there is no token |
| */ |
| public void setFillerToken(String fillerToken) { |
| this.fillerToken = null == fillerToken ? new char[0] : fillerToken.toCharArray(); |
| } |
| |
| @Override |
| public boolean incrementToken() throws IOException { |
| boolean tokenAvailable = false; |
| int builtGramSize = 0; |
| if (gramSize.atMinValue() || inputWindow.size() < gramSize.getValue()) { |
| shiftInputWindow(); |
| gramBuilder.setLength(0); |
| } else { |
| builtGramSize = gramSize.getPreviousValue(); |
| } |
| if (inputWindow.size() >= gramSize.getValue()) { |
| boolean isAllFiller = true; |
| InputWindowToken nextToken = null; |
| Iterator<InputWindowToken> iter = inputWindow.iterator(); |
| for (int gramNum = 1; iter.hasNext() && builtGramSize < gramSize.getValue(); ++gramNum) { |
| nextToken = iter.next(); |
| if (builtGramSize < gramNum) { |
| if (builtGramSize > 0) { |
| gramBuilder.append(tokenSeparator); |
| } |
| gramBuilder.append(nextToken.termAtt.buffer(), 0, nextToken.termAtt.length()); |
| ++builtGramSize; |
| } |
| if (isAllFiller && nextToken.isFiller) { |
| if (gramNum == gramSize.getValue()) { |
| gramSize.advance(); |
| } |
| } else { |
| isAllFiller = false; |
| } |
| } |
| if (!isAllFiller && builtGramSize == gramSize.getValue()) { |
| inputWindow.getFirst().attSource.copyTo(this); |
| posIncrAtt.setPositionIncrement(isOutputHere ? 0 : 1); |
| termAtt.setEmpty().append(gramBuilder); |
| if (gramSize.getValue() > 1) { |
| typeAtt.setType(tokenType); |
| noShingleOutput = false; |
| } |
| offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset()); |
| if (outputUnigrams) { |
| posLenAtt.setPositionLength(builtGramSize); |
| } else { |
| // position length for this token is the number of position created by shingles of smaller |
| // size. |
| posLenAtt.setPositionLength(Math.max(1, (builtGramSize - minShingleSize) + 1)); |
| } |
| isOutputHere = true; |
| gramSize.advance(); |
| tokenAvailable = true; |
| } |
| } |
| return tokenAvailable; |
| } |
| |
| private boolean exhausted; |
| |
| /** |
| * Get the next token from the input stream. |
| * |
| * <p>If the next token has <code>positionIncrement > 1</code>, <code>positionIncrement - 1 |
| * </code> {@link #fillerToken}s are inserted first. |
| * |
| * @param target Where to put the new token; if null, a new instance is created. |
| * @return On success, the populated token; null otherwise |
| * @throws IOException if the input stream has a problem |
| */ |
| private InputWindowToken getNextToken(InputWindowToken target) throws IOException { |
| InputWindowToken newTarget = target; |
| if (numFillerTokensToInsert > 0) { |
| if (null == target) { |
| newTarget = new InputWindowToken(nextInputStreamToken.cloneAttributes()); |
| } else { |
| nextInputStreamToken.copyTo(target.attSource); |
| } |
| // A filler token occupies no space |
| newTarget.offsetAtt.setOffset( |
| newTarget.offsetAtt.startOffset(), newTarget.offsetAtt.startOffset()); |
| newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length); |
| newTarget.isFiller = true; |
| --numFillerTokensToInsert; |
| } else if (isNextInputStreamToken) { |
| if (null == target) { |
| newTarget = new InputWindowToken(nextInputStreamToken.cloneAttributes()); |
| } else { |
| nextInputStreamToken.copyTo(target.attSource); |
| } |
| isNextInputStreamToken = false; |
| newTarget.isFiller = false; |
| } else if (!exhausted) { |
| if (input.incrementToken()) { |
| if (null == target) { |
| newTarget = new InputWindowToken(cloneAttributes()); |
| } else { |
| this.copyTo(target.attSource); |
| } |
| if (posIncrAtt.getPositionIncrement() > 1) { |
| // Each output shingle must contain at least one input token, |
| // so no more than (maxShingleSize - 1) filler tokens will be inserted. |
| numFillerTokensToInsert = |
| Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1); |
| // Save the current token as the next input stream token |
| if (null == nextInputStreamToken) { |
| nextInputStreamToken = cloneAttributes(); |
| } else { |
| this.copyTo(nextInputStreamToken); |
| } |
| isNextInputStreamToken = true; |
| // A filler token occupies no space |
| newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset()); |
| newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length); |
| newTarget.isFiller = true; |
| --numFillerTokensToInsert; |
| } else { |
| newTarget.isFiller = false; |
| } |
| } else { |
| exhausted = true; |
| input.end(); |
| endState = captureState(); |
| numFillerTokensToInsert = Math.min(posIncrAtt.getPositionIncrement(), maxShingleSize - 1); |
| if (numFillerTokensToInsert > 0) { |
| nextInputStreamToken = new AttributeSource(getAttributeFactory()); |
| nextInputStreamToken.addAttribute(CharTermAttribute.class); |
| OffsetAttribute newOffsetAtt = nextInputStreamToken.addAttribute(OffsetAttribute.class); |
| newOffsetAtt.setOffset(offsetAtt.endOffset(), offsetAtt.endOffset()); |
| // Recurse/loop just once: |
| return getNextToken(target); |
| } else { |
| newTarget = null; |
| } |
| } |
| } else { |
| newTarget = null; |
| } |
| return newTarget; |
| } |
| |
| @Override |
| public void end() throws IOException { |
| if (!exhausted) { |
| super.end(); |
| } else { |
| restoreState(endState); |
| } |
| } |
| |
| /** |
| * Fills {@link #inputWindow} with input stream tokens, if available, shifting to the right if the |
| * window was previously full. |
| * |
| * <p>Resets {@link #gramSize} to its minimum value. |
| * |
| * @throws IOException if there's a problem getting the next token |
| */ |
| private void shiftInputWindow() throws IOException { |
| InputWindowToken firstToken = null; |
| if (inputWindow.size() > 0) { |
| firstToken = inputWindow.removeFirst(); |
| } |
| while (inputWindow.size() < maxShingleSize) { |
| if (null != firstToken) { // recycle the firstToken, if available |
| if (null != getNextToken(firstToken)) { |
| inputWindow.add(firstToken); // the firstToken becomes the last |
| firstToken = null; |
| } else { |
| break; // end of input stream |
| } |
| } else { |
| InputWindowToken nextToken = getNextToken(null); |
| if (null != nextToken) { |
| inputWindow.add(nextToken); |
| } else { |
| break; // end of input stream |
| } |
| } |
| } |
| if (outputUnigramsIfNoShingles |
| && noShingleOutput |
| && gramSize.minValue > 1 |
| && inputWindow.size() < minShingleSize) { |
| gramSize.minValue = 1; |
| } |
| gramSize.reset(); |
| isOutputHere = false; |
| } |
| |
| @Override |
| public void reset() throws IOException { |
| super.reset(); |
| gramSize.reset(); |
| inputWindow.clear(); |
| nextInputStreamToken = null; |
| isNextInputStreamToken = false; |
| numFillerTokensToInsert = 0; |
| isOutputHere = false; |
| noShingleOutput = true; |
| exhausted = false; |
| endState = null; |
| if (outputUnigramsIfNoShingles && !outputUnigrams) { |
| // Fix up gramSize if minValue was reset for outputUnigramsIfNoShingles |
| gramSize.minValue = minShingleSize; |
| } |
| } |
| |
| /** |
| * An instance of this class is used to maintain the number of input stream tokens that will be |
| * used to compose the next unigram or shingle: {@link #gramSize}. |
| * |
| * <p><code>gramSize</code> will take on values from the circular sequence <b>{ [ 1, ] {@link |
| * #minShingleSize} [ , ... , {@link #maxShingleSize} ] }</b>. |
| * |
| * <p>1 is included in the circular sequence only if {@link #outputUnigrams} = true. |
| */ |
| private class CircularSequence { |
| private int value; |
| private int previousValue; |
| private int minValue; |
| |
| public CircularSequence() { |
| minValue = outputUnigrams ? 1 : minShingleSize; |
| reset(); |
| } |
| |
| /** |
| * @return the current value. |
| * @see #advance() |
| */ |
| public int getValue() { |
| return value; |
| } |
| |
| /** |
| * Increments this circular number's value to the next member in the circular sequence <code> |
| * gramSize</code> will take on values from the circular sequence <b>{ [ 1, ] {@link |
| * #minShingleSize} [ , ... , {@link #maxShingleSize} ] }</b>. |
| * |
| * <p>1 is included in the circular sequence only if {@link #outputUnigrams} = true. |
| */ |
| public void advance() { |
| previousValue = value; |
| if (value == 1) { |
| value = minShingleSize; |
| } else if (value == maxShingleSize) { |
| reset(); |
| } else { |
| ++value; |
| } |
| } |
| |
| /** |
| * Sets this circular number's value to the first member of the circular sequence |
| * |
| * <p><code>gramSize</code> will take on values from the circular sequence <b>{ [ 1, ] {@link |
| * #minShingleSize} [ , ... , {@link #maxShingleSize} ] }</b>. |
| * |
| * <p>1 is included in the circular sequence only if {@link #outputUnigrams} = true. |
| */ |
| public void reset() { |
| previousValue = value = minValue; |
| } |
| |
| /** |
| * Returns true if the current value is the first member of the circular sequence. |
| * |
| * <p>If {@link #outputUnigrams} = true, the first member of the circular sequence will be 1; |
| * otherwise, it will be {@link #minShingleSize}. |
| * |
| * @return true if the current value is the first member of the circular sequence; false |
| * otherwise |
| */ |
| public boolean atMinValue() { |
| return value == minValue; |
| } |
| |
| /** @return the value this instance had before the last advance() call */ |
| public int getPreviousValue() { |
| return previousValue; |
| } |
| } |
| |
| private static class InputWindowToken { |
| final AttributeSource attSource; |
| final CharTermAttribute termAtt; |
| final OffsetAttribute offsetAtt; |
| boolean isFiller = false; |
| |
| public InputWindowToken(AttributeSource attSource) { |
| this.attSource = attSource; |
| this.termAtt = attSource.getAttribute(CharTermAttribute.class); |
| this.offsetAtt = attSource.getAttribute(OffsetAttribute.class); |
| } |
| } |
| } |