using Lucene.Net.Analysis.Miscellaneous;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;
using System;
namespace Lucene.Net.Analysis.NGram
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/// <summary>
/// Tokenizes the input into n-grams of the given size(s).
/// <para>You must specify the required <see cref="LuceneVersion"/> compatibility when
/// creating a <see cref="NGramTokenFilter"/>. As of Lucene 4.4, this token filter:
/// <list type="bullet">
/// <item><description>handles supplementary characters correctly,</description></item>
/// <item><description>emits all n-grams for the same token at the same position,</description></item>
/// <item><description>does not modify offsets,</description></item>
/// <item><description>sorts n-grams by their offset in the original token first, then
/// increasing length (meaning that "abc" will give "a", "ab", "abc", "b", "bc",
/// "c").</description></item>
/// </list>
/// </para>
/// <para>You can make this filter use the old behavior by providing a version &lt;
/// <see cref="LuceneVersion.LUCENE_44"/> in the constructor but this is not recommended as
/// it will lead to broken <see cref="TokenStream"/>s that will cause highlighting
/// bugs.
/// </para>
/// <para>If you were using this <see cref="TokenFilter"/> to perform partial highlighting,
/// this won't work anymore since this filter doesn't update offsets. You should
/// modify your analysis chain to use <see cref="NGramTokenizer"/>, and potentially
/// override <see cref="NGramTokenizer.IsTokenChar(int)"/> to perform pre-tokenization.
/// </para>
/// </summary>
public sealed class NGramTokenFilter : TokenFilter
/// <summary>Default smallest n-gram size.</summary>
public const int DEFAULT_MIN_NGRAM_SIZE = 1;
/// <summary>Default largest n-gram size.</summary>
public const int DEFAULT_MAX_NGRAM_SIZE = 2;
// Inclusive bounds on the size of the n-grams to emit; validated and set by the constructor.
private readonly int minGram, maxGram;
// Copy of the term buffer of the input token currently being split.
// null means the next input token has to be fetched first.
private char[] curTermBuffer;
// Length of the current term as reported by the term attribute.
private int curTermLength;
// Number of code points in the current term, as computed by charUtils.
private int curCodePointCount;
// Size of the n-gram to emit next; starts at minGram for every input token.
private int curGramSize;
// Start of the next n-gram inside the current term; starts at 0 for every input token.
// Used as a code point count in the 4.4+ branch and as a char index in the legacy branch.
private int curPos;
// Position increment and position length captured from the input token.
private int curPosInc, curPosLen;
// Start and end offsets captured from the input token.
private int tokStart;
private int tokEnd;
private bool hasIllegalOffsets; // only if the length changed before this filter
// Compatibility version passed to the constructor; selects between the 4.4+ and legacy behavior.
private readonly LuceneVersion version;
// Code point helper chosen in the constructor according to the version.
private readonly CharacterUtils charUtils;
// Attributes read from the input token and rewritten for every emitted n-gram.
private readonly ICharTermAttribute termAtt;
private readonly IPositionIncrementAttribute posIncAtt;
private readonly IPositionLengthAttribute posLenAtt;
private readonly IOffsetAttribute offsetAtt;
/// <summary>
/// Creates <see cref="NGramTokenFilter"/> with given min and max n-grams. </summary>
/// <param name="version"> Lucene version to enable correct position increments.
/// See <see cref="NGramTokenFilter"/> for details. </param>
/// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
/// <exception cref="ArgumentException"> if <paramref name="minGram"/> is less than 1
/// or greater than <paramref name="maxGram"/> </exception>
public NGramTokenFilter(LuceneVersion version, TokenStream input, int minGram, int maxGram)
    // The input is wrapped in a CodepointCountFilter bounded below by minGram
    // (presumably so tokens too short to produce any n-gram never reach this filter).
    : base(new CodepointCountFilter(version, input, minGram, int.MaxValue))
{
    // The version check drives two decisions below, so evaluate it once.
#pragma warning disable 612, 618
    bool is44OrLater = version.OnOrAfter(LuceneVersion.LUCENE_44);
#pragma warning restore 612, 618

    this.version = version;
    this.charUtils = is44OrLater
        ? CharacterUtils.GetInstance(version)
        : CharacterUtils.GetJava4Instance(version);

    if (minGram < 1)
    {
        throw new ArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram)
    {
        throw new ArgumentException("minGram must not be greater than maxGram");
    }
    this.minGram = minGram;
    this.maxGram = maxGram;

    if (is44OrLater)
    {
        // 4.4+ behavior: position data is read from and written to the real attributes.
        posIncAtt = AddAttribute<IPositionIncrementAttribute>();
        posLenAtt = AddAttribute<IPositionLengthAttribute>();
    }
    else
    {
        // Legacy behavior: use detached stand-ins that always read 0 and ignore
        // writes, so this filter never alters the stream's position data.
        posIncAtt = new PositionIncrementAttributeAnonymousInnerClassHelper();
        posLenAtt = new PositionLengthAttributeAnonymousInnerClassHelper();
    }

    termAtt = AddAttribute<ICharTermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
}
/// <summary>
/// Inert position increment attribute: reading it always yields 0 and
/// assigning to it has no effect.
/// </summary>
private class PositionIncrementAttributeAnonymousInnerClassHelper : PositionIncrementAttribute
{
    public override int PositionIncrement
    {
        get { return 0; }
        set
        {
            // Intentionally discarded.
        }
    }
}
/// <summary>
/// Inert position length attribute: reading it always yields 0 and
/// assigning to it has no effect.
/// </summary>
private class PositionLengthAttributeAnonymousInnerClassHelper : PositionLengthAttribute
{
    public override int PositionLength
    {
        get { return 0; }
        set
        {
            // Intentionally discarded.
        }
    }
}
/// <summary>
/// Creates <see cref="NGramTokenFilter"/> with default min and max n-grams
/// (<see cref="DEFAULT_MIN_NGRAM_SIZE"/> and <see cref="DEFAULT_MAX_NGRAM_SIZE"/>). </summary>
/// <param name="version"> Lucene version to enable correct position increments.
/// See <see cref="NGramTokenFilter"/> for details. </param>
/// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
public NGramTokenFilter(LuceneVersion version, TokenStream input)
    // Delegate to the full constructor so validation and attribute setup live in one place.
    : this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
{
}
/// <summary>
/// Advances the stream to the next n-gram, pulling a new token from the input
/// whenever the current one has been fully split.
/// </summary>
/// <returns><c>true</c> if an n-gram was written to the attributes;
/// <c>false</c> once the input has no more tokens.</returns>
public override sealed bool IncrementToken()
{
    while (true)
    {
        if (curTermBuffer is null)
        {
            if (!m_input.IncrementToken())
            {
                return false;
            }

            // Snapshot the input token: the attributes are overwritten by each emitted n-gram.
            curTermBuffer = (char[])termAtt.Buffer.Clone();
            curTermLength = termAtt.Length;
            curCodePointCount = charUtils.CodePointCount(termAtt);
            curGramSize = minGram;
            curPos = 0;
            curPosInc = posIncAtt.PositionIncrement;
            curPosLen = posLenAtt.PositionLength;
            tokStart = offsetAtt.StartOffset;
            tokEnd = offsetAtt.EndOffset;
            // if length by start + end offsets doesn't match the term text then assume
            // this is a synonym and don't adjust the offsets.
            hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
        }

#pragma warning disable 612, 618
        if (version.OnOrAfter(LuceneVersion.LUCENE_44))
#pragma warning restore 612, 618
        {
            // 4.4+ ordering: every size at one start position, then the next start position.
            if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount)
            {
                // All sizes that fit at this position are done: move one code point forward.
                ++curPos;
                curGramSize = minGram;
            }
            if ((curPos + curGramSize) <= curCodePointCount)
            {
                ClearAttributes();
                // Translate code point positions into char indices so surrogate pairs stay intact.
                int start = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
                int end = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
                termAtt.CopyBuffer(curTermBuffer, start, end - start);
                // Only the first n-gram carries the input's position increment;
                // the remaining ones are stacked on the same position.
                posIncAtt.PositionIncrement = curPosInc;
                curPosInc = 0;
                posLenAtt.PositionLength = curPosLen;
                // Offsets of the original token are kept unchanged.
                offsetAtt.SetOffset(tokStart, tokEnd);
                curGramSize++;
                return true;
            }
        }
        else
        {
            // Legacy ordering: every start position for one size, then the next size.
            while (curGramSize <= maxGram)
            {
                while (curPos + curGramSize <= curTermLength) // while there is input
                {
                    ClearAttributes();
                    termAtt.CopyBuffer(curTermBuffer, curPos, curGramSize);
                    if (hasIllegalOffsets)
                    {
                        offsetAtt.SetOffset(tokStart, tokEnd);
                    }
                    else
                    {
                        offsetAtt.SetOffset(tokStart + curPos, tokStart + curPos + curGramSize);
                    }
                    curPos++;
                    return true;
                }
                curGramSize++; // increase n-gram size
                curPos = 0;
            }
        }

        // The current token is exhausted; fetch the next one on the following iteration.
        curTermBuffer = null;
    }
}
/// <summary>
/// Resets the wrapped input through the base implementation and discards any
/// partially split token, so the next <see cref="IncrementToken"/> call starts
/// with a fresh input token.
/// </summary>
public override void Reset()
{
    base.Reset();
    curTermBuffer = null;
}