using Lucene.Net.Analysis.Miscellaneous;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;
using System;
namespace Lucene.Net.Analysis.NGram
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Tokenizes the input into n-grams of the given size(s).
/// <para>You must specify the required <see cref="LuceneVersion"/> compatibility when
    /// creating a <see cref="NGramTokenFilter"/>. As of Lucene 4.4, this token filter:
/// <list type="bullet">
/// <item><description>handles supplementary characters correctly,</description></item>
/// <item><description>emits all n-grams for the same token at the same position,</description></item>
/// <item><description>does not modify offsets,</description></item>
/// <item><description>sorts n-grams by their offset in the original token first, then
/// increasing length (meaning that "abc" will give "a", "ab", "abc", "b", "bc",
/// "c").</description></item>
/// </list>
/// </para>
    /// <para>You can make this filter use the old behavior by providing a version &lt;
    /// <see cref="LuceneVersion.LUCENE_44"/> in the constructor, but this is not recommended, as
    /// it will lead to broken <see cref="TokenStream"/>s that cause highlighting
    /// bugs.
/// </para>
    /// <para>If you were using this <see cref="TokenFilter"/> to perform partial highlighting,
    /// that will no longer work, since this filter no longer updates offsets. You should
/// modify your analysis chain to use <see cref="NGramTokenizer"/>, and potentially
/// override <see cref="NGramTokenizer.IsTokenChar(int)"/> to perform pre-tokenization.
/// </para>
/// </summary>
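    /// <example>
    /// A minimal consumption sketch (the <c>LUCENE_48</c> version constant,
    /// <c>KeywordTokenizer</c>, and the sample input are illustrative assumptions,
    /// not requirements):
    /// <code>
    /// using (TokenStream ts = new NGramTokenFilter(LuceneVersion.LUCENE_48,
    ///     new KeywordTokenizer(new StringReader("abc")), 1, 2))
    /// {
    ///     ICharTermAttribute term = ts.GetAttribute&lt;ICharTermAttribute&gt;();
    ///     ts.Reset();
    ///     while (ts.IncrementToken())
    ///     {
    ///         // For "abc" with minGram=1, maxGram=2 this prints: a, ab, b, bc, c
    ///         Console.WriteLine(new string(term.Buffer, 0, term.Length));
    ///     }
    ///     ts.End();
    /// }
    /// </code>
    /// </example>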
public sealed class NGramTokenFilter : TokenFilter
{
        /// <summary>Default minimum n-gram size (1).</summary>
        public const int DEFAULT_MIN_NGRAM_SIZE = 1;
        /// <summary>Default maximum n-gram size (2).</summary>
        public const int DEFAULT_MAX_NGRAM_SIZE = 2;
        private readonly int minGram, maxGram;
        private char[] curTermBuffer;     // copy of the current input token's term buffer
        private int curTermLength;        // length of the current token, in chars
        private int curCodePointCount;    // length of the current token, in code points
        private int curGramSize;          // size of the gram currently being generated
        private int curPos;               // start of the current gram, in code points
        private int curPosInc, curPosLen; // position increment/length of the current input token
        private int tokStart;             // start offset of the current input token
        private int tokEnd;               // end offset of the current input token
        private bool hasIllegalOffsets;   // true if the offsets don't span the term's length (e.g., the length changed before this filter)
private readonly LuceneVersion version;
private readonly CharacterUtils charUtils;
private readonly ICharTermAttribute termAtt;
private readonly IPositionIncrementAttribute posIncAtt;
private readonly IPositionLengthAttribute posLenAtt;
private readonly IOffsetAttribute offsetAtt;
/// <summary>
/// Creates <see cref="NGramTokenFilter"/> with given min and max n-grams. </summary>
/// <param name="version"> Lucene version to enable correct position increments.
/// See <see cref="NGramTokenFilter"/> for details. </param>
/// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
public NGramTokenFilter(LuceneVersion version, TokenStream input, int minGram, int maxGram)
: base(new CodepointCountFilter(version, input, minGram, int.MaxValue))
{
this.version = version;
this.charUtils = version.OnOrAfter(
#pragma warning disable 612, 618
LuceneVersion.LUCENE_44) ?
#pragma warning restore 612, 618
CharacterUtils.GetInstance(version) : CharacterUtils.GetJava4Instance(version);
if (minGram < 1)
{
throw new ArgumentException("minGram must be greater than zero");
}
if (minGram > maxGram)
{
throw new ArgumentException("minGram must not be greater than maxGram");
}
this.minGram = minGram;
this.maxGram = maxGram;
#pragma warning disable 612, 618
if (version.OnOrAfter(LuceneVersion.LUCENE_44))
#pragma warning restore 612, 618
{
posIncAtt = AddAttribute<IPositionIncrementAttribute>();
posLenAtt = AddAttribute<IPositionLengthAttribute>();
}
else
{
posIncAtt = new PositionIncrementAttributeAnonymousClass();
posLenAtt = new PositionLengthAttributeAnonymousClass();
}
termAtt = AddAttribute<ICharTermAttribute>();
offsetAtt = AddAttribute<IOffsetAttribute>();
}
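        // No-op stand-ins used in pre-4.4 mode: writes are discarded, so the filter never
        // alters the stream's real position increment/length attributes.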
private class PositionIncrementAttributeAnonymousClass : PositionIncrementAttribute
{
public override int PositionIncrement
{
get => 0;
set { }
}
}
private class PositionLengthAttributeAnonymousClass : PositionLengthAttribute
{
public override int PositionLength
{
get => 0;
set { }
}
}
/// <summary>
/// Creates <see cref="NGramTokenFilter"/> with default min and max n-grams. </summary>
/// <param name="version"> Lucene version to enable correct position increments.
/// See <see cref="NGramTokenFilter"/> for details. </param>
/// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
public NGramTokenFilter(LuceneVersion version, TokenStream input)
: this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
{
}
/// <summary>
        /// Advances to the next n-gram in the stream; returns <c>false</c> at EOS.
/// </summary>
public override sealed bool IncrementToken()
{
while (true)
{
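                // A null curTermBuffer means the previous token has been fully consumed,
                // so pull the next token from the upstream stream.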
if (curTermBuffer == null)
{
if (!m_input.IncrementToken())
{
return false;
}
else
{
curTermBuffer = (char[])termAtt.Buffer.Clone();
curTermLength = termAtt.Length;
curCodePointCount = charUtils.CodePointCount(termAtt);
curGramSize = minGram;
curPos = 0;
curPosInc = posIncAtt.PositionIncrement;
curPosLen = posLenAtt.PositionLength;
tokStart = offsetAtt.StartOffset;
tokEnd = offsetAtt.EndOffset;
                        // if the length implied by the start and end offsets doesn't match
                        // the term text, assume this is a synonym and don't adjust the offsets.
hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
}
}
#pragma warning disable 612, 618
if (version.OnOrAfter(LuceneVersion.LUCENE_44))
#pragma warning restore 612, 618
{
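                    // 4.4+ behavior: emit all grams for the current start position in order of
                    // increasing length, then advance the start by one code point.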
if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount)
{
++curPos;
curGramSize = minGram;
}
if ((curPos + curGramSize) <= curCodePointCount)
{
ClearAttributes();
int start = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
int end = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
termAtt.CopyBuffer(curTermBuffer, start, end - start);
posIncAtt.PositionIncrement = curPosInc;
curPosInc = 0;
posLenAtt.PositionLength = curPosLen;
offsetAtt.SetOffset(tokStart, tokEnd);
curGramSize++;
return true;
}
}
else
{
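                    // Pre-4.4 behavior: emit all grams of the current length before growing
                    // the gram size, adjusting offsets per gram unless they were already illegal.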
while (curGramSize <= maxGram)
{
while (curPos + curGramSize <= curTermLength) // while there is input
{
ClearAttributes();
termAtt.CopyBuffer(curTermBuffer, curPos, curGramSize);
if (hasIllegalOffsets)
{
offsetAtt.SetOffset(tokStart, tokEnd);
}
else
{
offsetAtt.SetOffset(tokStart + curPos, tokStart + curPos + curGramSize);
}
curPos++;
return true;
}
curGramSize++; // increase n-gram size
curPos = 0;
}
}
curTermBuffer = null;
}
}
public override void Reset()
{
base.Reset();
curTermBuffer = null;
}
}
}