blob: 1d29a76779cdaf0847128c5101002bc920a97f97 [file] [log] [blame]
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Support;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
namespace Lucene.Net.Analysis.Shingle
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// <para>A <see cref="ShingleFilter"/> constructs shingles (token n-grams) from a token stream.
/// In other words, it creates combinations of tokens as a single token.
///
/// </para>
/// <para>For example, the sentence "please divide this sentence into shingles"
/// might be tokenized into shingles "please divide", "divide this",
/// "this sentence", "sentence into", and "into shingles".
///
/// </para>
/// <para>This filter handles position increments > 1 by inserting filler tokens
/// (tokens with termtext "_"). It does not handle a position increment of 0.
/// </para>
/// </summary>
public sealed class ShingleFilter : TokenFilter
{
/// <summary>
/// filler token for when positionIncrement is more than 1
/// </summary>
public const string DEFAULT_FILLER_TOKEN = "_";
/// <summary>
/// default maximum shingle size is 2.
/// </summary>
public const int DEFAULT_MAX_SHINGLE_SIZE = 2;
/// <summary>
/// default minimum shingle size is 2.
/// </summary>
public const int DEFAULT_MIN_SHINGLE_SIZE = 2;
/// <summary>
/// default token type attribute value is "shingle"
/// </summary>
public const string DEFAULT_TOKEN_TYPE = "shingle";
/// <summary>
/// The default string to use when joining adjacent tokens to form a shingle
/// </summary>
public const string DEFAULT_TOKEN_SEPARATOR = " ";
/// <summary>
/// The sequence of input stream tokens (or filler tokens, if necessary)
/// that will be composed to form output shingles.
/// </summary>
private readonly Queue<InputWindowToken> inputWindow = new Queue<InputWindowToken>();
/// <summary>
/// The number of input tokens in the next output token. This is the "n" in
/// "token n-grams".
/// </summary>
private CircularSequence gramSize;
/// <summary>
/// Shingle and unigram text is composed here.
/// </summary>
private readonly StringBuilder gramBuilder = new StringBuilder();
/// <summary>
/// The token type attribute value to use - default is "shingle"
/// </summary>
private string tokenType = DEFAULT_TOKEN_TYPE;
/// <summary>
/// The string to use when joining adjacent tokens to form a shingle
/// </summary>
private string tokenSeparator = DEFAULT_TOKEN_SEPARATOR;
/// <summary>
/// The string to insert for each position at which there is no token
/// (i.e., when position increment is greater than one).
/// </summary>
private char[] fillerToken = DEFAULT_FILLER_TOKEN.ToCharArray();
/// <summary>
/// By default, we output unigrams (individual tokens) as well as shingles
/// (token n-grams).
/// </summary>
private bool outputUnigrams = true;
/// <summary>
/// By default, we don't override behavior of outputUnigrams.
/// </summary>
private bool outputUnigramsIfNoShingles = false;
/// <summary>
/// maximum shingle size (number of tokens)
/// </summary>
private int maxShingleSize;
/// <summary>
/// minimum shingle size (number of tokens)
/// </summary>
private int minShingleSize;
/// <summary>
/// The remaining number of filler tokens to be inserted into the input stream
/// from which shingles are composed, to handle position increments greater
/// than one.
/// </summary>
private int numFillerTokensToInsert;
/// <summary>
/// When the next input stream token has a position increment greater than
/// one, it is stored in this field until sufficient filler tokens have been
/// inserted to account for the position increment.
/// </summary>
private AttributeSource nextInputStreamToken;
/// <summary>
/// Whether or not there is a next input stream token.
/// </summary>
private bool isNextInputStreamToken = false;
/// <summary>
/// Whether at least one unigram or shingle has been output at the current
/// position.
/// </summary>
private bool isOutputHere = false;
/// <summary>
/// true if no shingles have been output yet (for outputUnigramsIfNoShingles).
/// </summary>
private bool noShingleOutput = true;
/// <summary>
/// Holds the State after input.end() was called, so we can
/// restore it in our end() impl.
/// </summary>
private State endState;
private readonly ICharTermAttribute termAtt;
private readonly IOffsetAttribute offsetAtt;
private readonly IPositionIncrementAttribute posIncrAtt;
private readonly IPositionLengthAttribute posLenAtt;
private readonly ITypeAttribute typeAtt;
/// <summary>
/// Constructs a <see cref="ShingleFilter"/> with the specified shingle size from the
/// <see cref="TokenStream"/> <paramref name="input"/>
/// </summary>
/// <param name="input"> input stream </param>
/// <param name="minShingleSize"> minimum shingle size produced by the filter. </param>
/// <param name="maxShingleSize"> maximum shingle size produced by the filter. </param>
public ShingleFilter(TokenStream input, int minShingleSize, int maxShingleSize)
: base(input)
{
SetMaxShingleSize(maxShingleSize);
SetMinShingleSize(minShingleSize);
termAtt = AddAttribute<ICharTermAttribute>();
offsetAtt = AddAttribute<IOffsetAttribute>();
posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
posLenAtt = AddAttribute<IPositionLengthAttribute>();
typeAtt = AddAttribute<ITypeAttribute>();
}
/// <summary>
/// Constructs a <see cref="ShingleFilter"/> with the specified shingle size from the
/// <see cref="TokenStream"/> <paramref name="input"/>
/// </summary>
/// <param name="input"> input stream </param>
/// <param name="maxShingleSize"> maximum shingle size produced by the filter. </param>
public ShingleFilter(TokenStream input, int maxShingleSize)
: this(input, DEFAULT_MIN_SHINGLE_SIZE, maxShingleSize)
{
}
/// <summary>
/// Construct a <see cref="ShingleFilter"/> with default shingle size: 2.
/// </summary>
/// <param name="input"> input stream </param>
public ShingleFilter(TokenStream input)
: this(input, DEFAULT_MIN_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE)
{
}
/// <summary>
/// Construct a <see cref="ShingleFilter"/> with the specified token type for shingle tokens
/// and the default shingle size: 2
/// </summary>
/// <param name="input"> input stream </param>
/// <param name="tokenType"> token type for shingle tokens </param>
public ShingleFilter(TokenStream input, string tokenType)
: this(input, DEFAULT_MIN_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE)
{
SetTokenType(tokenType);
}
/// <summary>
/// Set the type of the shingle tokens produced by this filter.
/// (default: "shingle")
/// </summary>
/// <param name="tokenType"> token tokenType </param>
public void SetTokenType(string tokenType)
{
this.tokenType = tokenType;
}
/// <summary>
/// Shall the output stream contain the input tokens (unigrams) as well as
/// shingles? (default: true.)
/// </summary>
/// <param name="outputUnigrams"> Whether or not the output stream shall contain
/// the input tokens (unigrams) </param>
public void SetOutputUnigrams(bool outputUnigrams)
{
this.outputUnigrams = outputUnigrams;
gramSize = new CircularSequence(this);
}
/// <summary>
/// <para>Shall we override the behavior of outputUnigrams==false for those
/// times when no shingles are available (because there are fewer than
/// minShingleSize tokens in the input stream)? (default: false.)
/// </para>
/// <para>Note that if outputUnigrams==true, then unigrams are always output,
/// regardless of whether any shingles are available.
///
/// </para>
/// </summary>
/// <param name="outputUnigramsIfNoShingles"> Whether or not to output a single
/// unigram when no shingles are available. </param>
public void SetOutputUnigramsIfNoShingles(bool outputUnigramsIfNoShingles)
{
this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
}
/// <summary>
/// Set the max shingle size (default: 2)
/// </summary>
/// <param name="maxShingleSize"> max size of output shingles </param>
public void SetMaxShingleSize(int maxShingleSize)
{
if (maxShingleSize < 2)
{
throw new ArgumentException("Max shingle size must be >= 2");
}
this.maxShingleSize = maxShingleSize;
}
/// <summary>
/// <para>Set the min shingle size (default: 2).
/// </para>
/// <para>This method requires that the passed in minShingleSize is not greater
/// than maxShingleSize, so make sure that maxShingleSize is set before
/// calling this method.
/// </para>
/// <para>The unigram output option is independent of the min shingle size.
///
/// </para>
/// </summary>
/// <param name="minShingleSize"> min size of output shingles </param>
public void SetMinShingleSize(int minShingleSize)
{
if (minShingleSize < 2)
{
throw new ArgumentException("Min shingle size must be >= 2");
}
if (minShingleSize > maxShingleSize)
{
throw new ArgumentException("Min shingle size must be <= max shingle size");
}
this.minShingleSize = minShingleSize;
gramSize = new CircularSequence(this);
}
/// <summary>
/// Sets the string to use when joining adjacent tokens to form a shingle </summary>
/// <param name="tokenSeparator"> used to separate input stream tokens in output shingles </param>
public void SetTokenSeparator(string tokenSeparator)
{
this.tokenSeparator = tokenSeparator ?? string.Empty;
}
/// <summary>
/// Sets the string to insert for each position at which there is no token
/// (i.e., when position increment is greater than one).
/// </summary>
/// <param name="fillerToken"> string to insert at each position where there is no token </param>
public void SetFillerToken(string fillerToken)
{
this.fillerToken = null == fillerToken ? Arrays.Empty<char>() : fillerToken.ToCharArray();
}
public override bool IncrementToken()
{
bool tokenAvailable = false;
int builtGramSize = 0;
if (gramSize.AtMinValue() || inputWindow.Count < gramSize.Value)
{
ShiftInputWindow();
gramBuilder.Length = 0;
}
else
{
builtGramSize = gramSize.PreviousValue;
}
if (inputWindow.Count >= gramSize.Value)
{
bool isAllFiller = true;
InputWindowToken nextToken = null;
using (IEnumerator<InputWindowToken> iter = inputWindow.GetEnumerator())
{
for (int gramNum = 1; iter.MoveNext() && builtGramSize < gramSize.Value; ++gramNum)
{
nextToken = iter.Current;
if (builtGramSize < gramNum)
{
if (builtGramSize > 0)
{
gramBuilder.Append(tokenSeparator);
}
gramBuilder.Append(nextToken.termAtt.Buffer, 0, nextToken.termAtt.Length);
++builtGramSize;
}
if (isAllFiller && nextToken.isFiller)
{
if (gramNum == gramSize.Value)
{
gramSize.Advance();
}
}
else
{
isAllFiller = false;
}
}
}
if (!isAllFiller && builtGramSize == gramSize.Value)
{
inputWindow.Peek().attSource.CopyTo(this);
posIncrAtt.PositionIncrement = isOutputHere ? 0 : 1;
termAtt.SetEmpty().Append(gramBuilder);
if (gramSize.Value > 1)
{
typeAtt.Type = tokenType;
noShingleOutput = false;
}
offsetAtt.SetOffset(offsetAtt.StartOffset, nextToken.offsetAtt.EndOffset);
posLenAtt.PositionLength = builtGramSize;
isOutputHere = true;
gramSize.Advance();
tokenAvailable = true;
}
}
return tokenAvailable;
}
private bool exhausted;
/// <summary>
/// <para>Get the next token from the input stream.
/// </para>
/// <para>If the next token has <c>positionIncrement > 1</c>,
/// <c>positionIncrement - 1</c> <see cref="fillerToken"/>s are
/// inserted first.
/// </para>
/// </summary>
/// <param name="target"> Where to put the new token; if null, a new instance is created. </param>
/// <returns> On success, the populated token; null otherwise </returns>
/// <exception cref="IOException"> if the input stream has a problem </exception>
private InputWindowToken GetNextToken(InputWindowToken target)
{
InputWindowToken newTarget = target;
if (numFillerTokensToInsert > 0)
{
if (null == target)
{
newTarget = new InputWindowToken(nextInputStreamToken.CloneAttributes());
}
else
{
nextInputStreamToken.CopyTo(target.attSource);
}
// A filler token occupies no space
newTarget.offsetAtt.SetOffset(newTarget.offsetAtt.StartOffset, newTarget.offsetAtt.StartOffset);
newTarget.termAtt.CopyBuffer(fillerToken, 0, fillerToken.Length);
newTarget.isFiller = true;
--numFillerTokensToInsert;
}
else if (isNextInputStreamToken)
{
if (null == target)
{
newTarget = new InputWindowToken(nextInputStreamToken.CloneAttributes());
}
else
{
nextInputStreamToken.CopyTo(target.attSource);
}
isNextInputStreamToken = false;
newTarget.isFiller = false;
}
else if (!exhausted)
{
if (m_input.IncrementToken())
{
if (null == target)
{
newTarget = new InputWindowToken(CloneAttributes());
}
else
{
this.CopyTo(target.attSource);
}
if (posIncrAtt.PositionIncrement > 1)
{
// Each output shingle must contain at least one input token,
// so no more than (maxShingleSize - 1) filler tokens will be inserted.
numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement - 1, maxShingleSize - 1);
// Save the current token as the next input stream token
if (null == nextInputStreamToken)
{
nextInputStreamToken = CloneAttributes();
}
else
{
this.CopyTo(nextInputStreamToken);
}
isNextInputStreamToken = true;
// A filler token occupies no space
newTarget.offsetAtt.SetOffset(offsetAtt.StartOffset, offsetAtt.StartOffset);
newTarget.termAtt.CopyBuffer(fillerToken, 0, fillerToken.Length);
newTarget.isFiller = true;
--numFillerTokensToInsert;
}
else
{
newTarget.isFiller = false;
}
}
else
{
exhausted = true;
m_input.End();
endState = CaptureState();
numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement, maxShingleSize - 1);
if (numFillerTokensToInsert > 0)
{
nextInputStreamToken = new AttributeSource(this.GetAttributeFactory());
nextInputStreamToken.AddAttribute<ICharTermAttribute>();
IOffsetAttribute newOffsetAtt = nextInputStreamToken.AddAttribute<IOffsetAttribute>();
newOffsetAtt.SetOffset(offsetAtt.EndOffset, offsetAtt.EndOffset);
// Recurse/loop just once:
return GetNextToken(target);
}
else
{
newTarget = null;
}
}
}
else
{
newTarget = null;
}
return newTarget;
}
public override void End()
{
if (!exhausted)
{
base.End();
}
else
{
RestoreState(endState);
}
}
/// <summary>
/// <para>Fills <see cref="inputWindow"/> with input stream tokens, if available,
/// shifting to the right if the window was previously full.
/// </para>
/// <para>
/// Resets <see cref="gramSize"/> to its minimum value.
/// </para>
/// </summary>
/// <exception cref="IOException"> if there's a problem getting the next token </exception>
private void ShiftInputWindow()
{
InputWindowToken firstToken = null;
if (inputWindow.Count > 0)
{
firstToken = inputWindow.Dequeue();
}
while (inputWindow.Count < maxShingleSize)
{
if (null != firstToken) // recycle the firstToken, if available
{
if (null != GetNextToken(firstToken))
{
inputWindow.Enqueue(firstToken); // the firstToken becomes the last
firstToken = null;
}
else
{
break; // end of input stream
}
}
else
{
InputWindowToken nextToken = GetNextToken(null);
if (null != nextToken)
{
inputWindow.Enqueue(nextToken);
}
else
{
break; // end of input stream
}
}
}
if (outputUnigramsIfNoShingles && noShingleOutput && gramSize.MinValue > 1 && inputWindow.Count < minShingleSize)
{
gramSize.MinValue = 1;
}
gramSize.Reset();
isOutputHere = false;
}
public override void Reset()
{
base.Reset();
gramSize.Reset();
inputWindow.Clear();
nextInputStreamToken = null;
isNextInputStreamToken = false;
numFillerTokensToInsert = 0;
isOutputHere = false;
noShingleOutput = true;
exhausted = false;
endState = null;
if (outputUnigramsIfNoShingles && !outputUnigrams)
{
// Fix up gramSize if minValue was reset for outputUnigramsIfNoShingles
gramSize.MinValue = minShingleSize;
}
}
/// <summary>
/// <para>An instance of this class is used to maintain the number of input
/// stream tokens that will be used to compose the next unigram or shingle:
/// <see cref="gramSize"/>.
/// </para>
/// <para><code>gramSize</code> will take on values from the circular sequence
/// <b>{ [ 1, ] <see cref="minShingleSize"/> [ , ... , <see cref="maxShingleSize"/> ] }</b>.
/// </para>
/// <para>1 is included in the circular sequence only if
/// <see cref="outputUnigrams"/> = true.
/// </para>
/// </summary>
private class CircularSequence
{
private readonly ShingleFilter outerInstance;
private int value;
private int previousValue;
private int minValue;
public CircularSequence(ShingleFilter shingleFilter)
{
this.outerInstance = shingleFilter;
minValue = shingleFilter.outputUnigrams ? 1 : shingleFilter.minShingleSize;
Reset();
}
/// <returns> the current value. </returns>
/// <seealso cref="Advance()"/>
public virtual int Value => value;
/// <summary>
/// <para>Increments this circular number's value to the next member in the
/// circular sequence
/// <code>gramSize</code> will take on values from the circular sequence
/// <b>{ [ 1, ] <see cref="minShingleSize"/> [ , ... , <see cref="maxShingleSize"/> ] }</b>.
/// </para>
/// <para>1 is included in the circular sequence only if
/// <see cref="outputUnigrams"/> = true.
/// </para>
/// </summary>
public virtual void Advance()
{
previousValue = value;
if (value == 1)
{
value = outerInstance.minShingleSize;
}
else if (value == outerInstance.maxShingleSize)
{
Reset();
}
else
{
++value;
}
}
/// <summary>
/// <para>Sets this circular number's value to the first member of the
/// circular sequence
/// </para>
/// <para><code>gramSize</code> will take on values from the circular sequence
/// <b>{ [ 1, ] <see cref="minShingleSize"/> [ , ... , <see cref="maxShingleSize"/> ] }</b>.
/// </para>
/// <para>1 is included in the circular sequence only if
/// <see cref="outputUnigrams"/> = true.
/// </para>
/// </summary>
public virtual void Reset()
{
previousValue = value = minValue;
}
/// <summary>
/// <para>Returns true if the current value is the first member of the circular
/// sequence.
/// </para>
/// <para>If <see cref="outputUnigrams"/> = true, the first member of the circular
/// sequence will be 1; otherwise, it will be <see cref="minShingleSize"/>.
///
/// </para>
/// </summary>
/// <returns> true if the current value is the first member of the circular
/// sequence; false otherwise </returns>
public virtual bool AtMinValue()
{
return value == minValue;
}
/// <returns> the value this instance had before the last <see cref="Advance()"/> call </returns>
public virtual int PreviousValue => previousValue;
internal virtual int MinValue // LUCENENET specific - added to encapsulate minValue field
{
get => minValue;
set => minValue = value;
}
}
private class InputWindowToken
{
internal readonly AttributeSource attSource;
internal readonly ICharTermAttribute termAtt;
internal readonly IOffsetAttribute offsetAtt;
internal bool isFiller = false;
public InputWindowToken(AttributeSource attSource)
{
this.attSource = attSource;
this.termAtt = attSource.GetAttribute<ICharTermAttribute>();
this.offsetAtt = attSource.GetAttribute<IOffsetAttribute>();
}
}
}
}