blob: fb19974663b4f4114109407122f3320f4464fe2d [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.shingle;
import java.io.IOException;
import java.util.Iterator;
import java.util.LinkedList;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
/**
* A ShingleFilter constructs shingles (token n-grams) from a token stream. In other words, it
* creates combinations of tokens as a single token.
*
* <p>For example, the sentence "please divide this sentence into shingles" might be tokenized into
* shingles "please divide", "divide this", "this sentence", "sentence into", and "into shingles".
*
* <p>This filter handles position increments &gt; 1 by inserting filler tokens (tokens with
* termtext "_"). It does not handle a position increment of 0.
*/
public final class ShingleFilter extends TokenFilter {
/** filler token for when positionIncrement is more than 1 */
public static final String DEFAULT_FILLER_TOKEN = "_";
/** default maximum shingle size is 2. */
public static final int DEFAULT_MAX_SHINGLE_SIZE = 2;
/** default minimum shingle size is 2. */
public static final int DEFAULT_MIN_SHINGLE_SIZE = 2;
/** default token type attribute value is "shingle" */
public static final String DEFAULT_TOKEN_TYPE = "shingle";
/** The default string to use when joining adjacent tokens to form a shingle */
public static final String DEFAULT_TOKEN_SEPARATOR = " ";
/**
* The sequence of input stream tokens (or filler tokens, if necessary) that will be composed to
* form output shingles.
*/
private LinkedList<InputWindowToken> inputWindow = new LinkedList<>();
/** The number of input tokens in the next output token. This is the "n" in "token n-grams". */
private CircularSequence gramSize;
/** Shingle and unigram text is composed here. */
private StringBuilder gramBuilder = new StringBuilder();
/** The token type attribute value to use - default is "shingle" */
private String tokenType = DEFAULT_TOKEN_TYPE;
/** The string to use when joining adjacent tokens to form a shingle */
private String tokenSeparator = DEFAULT_TOKEN_SEPARATOR;
/**
* The string to insert for each position at which there is no token (i.e., when position
* increment is greater than one).
*/
private char[] fillerToken = DEFAULT_FILLER_TOKEN.toCharArray();
/** By default, we output unigrams (individual tokens) as well as shingles (token n-grams). */
private boolean outputUnigrams = true;
/** By default, we don't override behavior of outputUnigrams. */
private boolean outputUnigramsIfNoShingles = false;
/** maximum shingle size (number of tokens) */
private int maxShingleSize;
/** minimum shingle size (number of tokens) */
private int minShingleSize;
/**
* The remaining number of filler tokens to be inserted into the input stream from which shingles
* are composed, to handle position increments greater than one.
*/
private int numFillerTokensToInsert;
/**
* When the next input stream token has a position increment greater than one, it is stored in
* this field until sufficient filler tokens have been inserted to account for the position
* increment.
*/
private AttributeSource nextInputStreamToken;
/** Whether or not there is a next input stream token. */
private boolean isNextInputStreamToken = false;
/** Whether at least one unigram or shingle has been output at the current position. */
private boolean isOutputHere = false;
/** true if no shingles have been output yet (for outputUnigramsIfNoShingles). */
boolean noShingleOutput = true;
/** Holds the State after input.end() was called, so we can restore it in our end() impl. */
private State endState;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt =
addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
/**
* Constructs a ShingleFilter with the specified shingle size from the {@link TokenStream} <code>
* input</code>
*
* @param input input stream
* @param minShingleSize minimum shingle size produced by the filter.
* @param maxShingleSize maximum shingle size produced by the filter.
*/
public ShingleFilter(TokenStream input, int minShingleSize, int maxShingleSize) {
super(input);
setMaxShingleSize(maxShingleSize);
setMinShingleSize(minShingleSize);
}
/**
* Constructs a ShingleFilter with the specified shingle size from the {@link TokenStream} <code>
* input</code>
*
* @param input input stream
* @param maxShingleSize maximum shingle size produced by the filter.
*/
public ShingleFilter(TokenStream input, int maxShingleSize) {
this(input, DEFAULT_MIN_SHINGLE_SIZE, maxShingleSize);
}
/**
* Construct a ShingleFilter with default shingle size: 2.
*
* @param input input stream
*/
public ShingleFilter(TokenStream input) {
this(input, DEFAULT_MIN_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE);
}
/**
* Construct a ShingleFilter with the specified token type for shingle tokens and the default
* shingle size: 2
*
* @param input input stream
* @param tokenType token type for shingle tokens
*/
public ShingleFilter(TokenStream input, String tokenType) {
this(input, DEFAULT_MIN_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE);
setTokenType(tokenType);
}
/**
* Set the type of the shingle tokens produced by this filter. (default: "shingle")
*
* @param tokenType token tokenType
*/
public void setTokenType(String tokenType) {
this.tokenType = tokenType;
}
/**
* Shall the output stream contain the input tokens (unigrams) as well as shingles? (default:
* true.)
*
* @param outputUnigrams Whether or not the output stream shall contain the input tokens
* (unigrams)
*/
public void setOutputUnigrams(boolean outputUnigrams) {
this.outputUnigrams = outputUnigrams;
gramSize = new CircularSequence();
}
/**
* Shall we override the behavior of outputUnigrams==false for those times when no shingles are
* available (because there are fewer than minShingleSize tokens in the input stream)? (default:
* false.)
*
* <p>Note that if outputUnigrams==true, then unigrams are always output, regardless of whether
* any shingles are available.
*
* @param outputUnigramsIfNoShingles Whether or not to output a single unigram when no shingles
* are available.
*/
public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) {
this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
}
/**
* Set the max shingle size (default: 2)
*
* @param maxShingleSize max size of output shingles
*/
public void setMaxShingleSize(int maxShingleSize) {
if (maxShingleSize < 2) {
throw new IllegalArgumentException("Max shingle size must be >= 2");
}
this.maxShingleSize = maxShingleSize;
}
/**
* Set the min shingle size (default: 2).
*
* <p>This method requires that the passed in minShingleSize is not greater than maxShingleSize,
* so make sure that maxShingleSize is set before calling this method.
*
* <p>The unigram output option is independent of the min shingle size.
*
* @param minShingleSize min size of output shingles
*/
public void setMinShingleSize(int minShingleSize) {
if (minShingleSize < 2) {
throw new IllegalArgumentException("Min shingle size must be >= 2");
}
if (minShingleSize > maxShingleSize) {
throw new IllegalArgumentException("Min shingle size must be <= max shingle size");
}
this.minShingleSize = minShingleSize;
gramSize = new CircularSequence();
}
/**
* Sets the string to use when joining adjacent tokens to form a shingle
*
* @param tokenSeparator used to separate input stream tokens in output shingles
*/
public void setTokenSeparator(String tokenSeparator) {
this.tokenSeparator = null == tokenSeparator ? "" : tokenSeparator;
}
/**
* Sets the string to insert for each position at which there is no token (i.e., when position
* increment is greater than one).
*
* @param fillerToken string to insert at each position where there is no token
*/
public void setFillerToken(String fillerToken) {
this.fillerToken = null == fillerToken ? new char[0] : fillerToken.toCharArray();
}
@Override
public boolean incrementToken() throws IOException {
boolean tokenAvailable = false;
int builtGramSize = 0;
if (gramSize.atMinValue() || inputWindow.size() < gramSize.getValue()) {
shiftInputWindow();
gramBuilder.setLength(0);
} else {
builtGramSize = gramSize.getPreviousValue();
}
if (inputWindow.size() >= gramSize.getValue()) {
boolean isAllFiller = true;
InputWindowToken nextToken = null;
Iterator<InputWindowToken> iter = inputWindow.iterator();
for (int gramNum = 1; iter.hasNext() && builtGramSize < gramSize.getValue(); ++gramNum) {
nextToken = iter.next();
if (builtGramSize < gramNum) {
if (builtGramSize > 0) {
gramBuilder.append(tokenSeparator);
}
gramBuilder.append(nextToken.termAtt.buffer(), 0, nextToken.termAtt.length());
++builtGramSize;
}
if (isAllFiller && nextToken.isFiller) {
if (gramNum == gramSize.getValue()) {
gramSize.advance();
}
} else {
isAllFiller = false;
}
}
if (!isAllFiller && builtGramSize == gramSize.getValue()) {
inputWindow.getFirst().attSource.copyTo(this);
posIncrAtt.setPositionIncrement(isOutputHere ? 0 : 1);
termAtt.setEmpty().append(gramBuilder);
if (gramSize.getValue() > 1) {
typeAtt.setType(tokenType);
noShingleOutput = false;
}
offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
if (outputUnigrams) {
posLenAtt.setPositionLength(builtGramSize);
} else {
// position length for this token is the number of position created by shingles of smaller
// size.
posLenAtt.setPositionLength(Math.max(1, (builtGramSize - minShingleSize) + 1));
}
isOutputHere = true;
gramSize.advance();
tokenAvailable = true;
}
}
return tokenAvailable;
}
private boolean exhausted;
/**
* Get the next token from the input stream.
*
* <p>If the next token has <code>positionIncrement &gt; 1</code>, <code>positionIncrement - 1
* </code> {@link #fillerToken}s are inserted first.
*
* @param target Where to put the new token; if null, a new instance is created.
* @return On success, the populated token; null otherwise
* @throws IOException if the input stream has a problem
*/
private InputWindowToken getNextToken(InputWindowToken target) throws IOException {
InputWindowToken newTarget = target;
if (numFillerTokensToInsert > 0) {
if (null == target) {
newTarget = new InputWindowToken(nextInputStreamToken.cloneAttributes());
} else {
nextInputStreamToken.copyTo(target.attSource);
}
// A filler token occupies no space
newTarget.offsetAtt.setOffset(
newTarget.offsetAtt.startOffset(), newTarget.offsetAtt.startOffset());
newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
newTarget.isFiller = true;
--numFillerTokensToInsert;
} else if (isNextInputStreamToken) {
if (null == target) {
newTarget = new InputWindowToken(nextInputStreamToken.cloneAttributes());
} else {
nextInputStreamToken.copyTo(target.attSource);
}
isNextInputStreamToken = false;
newTarget.isFiller = false;
} else if (!exhausted) {
if (input.incrementToken()) {
if (null == target) {
newTarget = new InputWindowToken(cloneAttributes());
} else {
this.copyTo(target.attSource);
}
if (posIncrAtt.getPositionIncrement() > 1) {
// Each output shingle must contain at least one input token,
// so no more than (maxShingleSize - 1) filler tokens will be inserted.
numFillerTokensToInsert =
Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1);
// Save the current token as the next input stream token
if (null == nextInputStreamToken) {
nextInputStreamToken = cloneAttributes();
} else {
this.copyTo(nextInputStreamToken);
}
isNextInputStreamToken = true;
// A filler token occupies no space
newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
newTarget.isFiller = true;
--numFillerTokensToInsert;
} else {
newTarget.isFiller = false;
}
} else {
exhausted = true;
input.end();
endState = captureState();
numFillerTokensToInsert = Math.min(posIncrAtt.getPositionIncrement(), maxShingleSize - 1);
if (numFillerTokensToInsert > 0) {
nextInputStreamToken = new AttributeSource(getAttributeFactory());
nextInputStreamToken.addAttribute(CharTermAttribute.class);
OffsetAttribute newOffsetAtt = nextInputStreamToken.addAttribute(OffsetAttribute.class);
newOffsetAtt.setOffset(offsetAtt.endOffset(), offsetAtt.endOffset());
// Recurse/loop just once:
return getNextToken(target);
} else {
newTarget = null;
}
}
} else {
newTarget = null;
}
return newTarget;
}
@Override
public void end() throws IOException {
if (!exhausted) {
super.end();
} else {
restoreState(endState);
}
}
/**
* Fills {@link #inputWindow} with input stream tokens, if available, shifting to the right if the
* window was previously full.
*
* <p>Resets {@link #gramSize} to its minimum value.
*
* @throws IOException if there's a problem getting the next token
*/
private void shiftInputWindow() throws IOException {
InputWindowToken firstToken = null;
if (inputWindow.size() > 0) {
firstToken = inputWindow.removeFirst();
}
while (inputWindow.size() < maxShingleSize) {
if (null != firstToken) { // recycle the firstToken, if available
if (null != getNextToken(firstToken)) {
inputWindow.add(firstToken); // the firstToken becomes the last
firstToken = null;
} else {
break; // end of input stream
}
} else {
InputWindowToken nextToken = getNextToken(null);
if (null != nextToken) {
inputWindow.add(nextToken);
} else {
break; // end of input stream
}
}
}
if (outputUnigramsIfNoShingles
&& noShingleOutput
&& gramSize.minValue > 1
&& inputWindow.size() < minShingleSize) {
gramSize.minValue = 1;
}
gramSize.reset();
isOutputHere = false;
}
@Override
public void reset() throws IOException {
super.reset();
gramSize.reset();
inputWindow.clear();
nextInputStreamToken = null;
isNextInputStreamToken = false;
numFillerTokensToInsert = 0;
isOutputHere = false;
noShingleOutput = true;
exhausted = false;
endState = null;
if (outputUnigramsIfNoShingles && !outputUnigrams) {
// Fix up gramSize if minValue was reset for outputUnigramsIfNoShingles
gramSize.minValue = minShingleSize;
}
}
/**
* An instance of this class is used to maintain the number of input stream tokens that will be
* used to compose the next unigram or shingle: {@link #gramSize}.
*
* <p><code>gramSize</code> will take on values from the circular sequence <b>{ [ 1, ] {@link
* #minShingleSize} [ , ... , {@link #maxShingleSize} ] }</b>.
*
* <p>1 is included in the circular sequence only if {@link #outputUnigrams} = true.
*/
private class CircularSequence {
private int value;
private int previousValue;
private int minValue;
public CircularSequence() {
minValue = outputUnigrams ? 1 : minShingleSize;
reset();
}
/**
* @return the current value.
* @see #advance()
*/
public int getValue() {
return value;
}
/**
* Increments this circular number's value to the next member in the circular sequence <code>
* gramSize</code> will take on values from the circular sequence <b>{ [ 1, ] {@link
* #minShingleSize} [ , ... , {@link #maxShingleSize} ] }</b>.
*
* <p>1 is included in the circular sequence only if {@link #outputUnigrams} = true.
*/
public void advance() {
previousValue = value;
if (value == 1) {
value = minShingleSize;
} else if (value == maxShingleSize) {
reset();
} else {
++value;
}
}
/**
* Sets this circular number's value to the first member of the circular sequence
*
* <p><code>gramSize</code> will take on values from the circular sequence <b>{ [ 1, ] {@link
* #minShingleSize} [ , ... , {@link #maxShingleSize} ] }</b>.
*
* <p>1 is included in the circular sequence only if {@link #outputUnigrams} = true.
*/
public void reset() {
previousValue = value = minValue;
}
/**
* Returns true if the current value is the first member of the circular sequence.
*
* <p>If {@link #outputUnigrams} = true, the first member of the circular sequence will be 1;
* otherwise, it will be {@link #minShingleSize}.
*
* @return true if the current value is the first member of the circular sequence; false
* otherwise
*/
public boolean atMinValue() {
return value == minValue;
}
/** @return the value this instance had before the last advance() call */
public int getPreviousValue() {
return previousValue;
}
}
private static class InputWindowToken {
final AttributeSource attSource;
final CharTermAttribute termAtt;
final OffsetAttribute offsetAtt;
boolean isFiller = false;
public InputWindowToken(AttributeSource attSource) {
this.attSource = attSource;
this.termAtt = attSource.getAttribute(CharTermAttribute.class);
this.offsetAtt = attSource.getAttribute(OffsetAttribute.class);
}
}
}