| using Lucene.Net.Analysis.TokenAttributes; |
| using Lucene.Net.Analysis.Util; |
| using Lucene.Net.Util; |
| using System; |
| |
| namespace Lucene.Net.Analysis.NGram |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
/// <summary>
/// Tokenizes the given token into n-grams of given size(s).
/// <para>
/// This <see cref="TokenFilter"/> creates n-grams from the beginning edge or ending edge of an input token.
/// </para>
/// <para>As of Lucene 4.4, this filter does not support
/// <see cref="Side.BACK"/> (you can use <see cref="Reverse.ReverseStringFilter"/> up-front and
/// afterward to get the same behavior), handles supplementary characters
/// correctly and does not update offsets anymore.
/// </para>
/// </summary>
public sealed class EdgeNGramTokenFilter : TokenFilter
{
    /// <summary>Default side: <see cref="Side.FRONT"/>.</summary>
    public const Side DEFAULT_SIDE = Side.FRONT;

    /// <summary>Default maximum n-gram size.</summary>
    public const int DEFAULT_MAX_GRAM_SIZE = 1;

    /// <summary>Default minimum n-gram size.</summary>
    public const int DEFAULT_MIN_GRAM_SIZE = 1;

    /// <summary>
    /// Specifies which side of the input the n-gram should be generated from
    /// </summary>
    public enum Side
    {
        /// <summary>
        /// Get the n-gram from the front of the input
        /// </summary>
        FRONT,

        /// <summary>
        /// Get the n-gram from the end of the input
        /// </summary>
        [Obsolete]
        BACK,
    }

    /// <summary>
    /// Gets the <see cref="Side"/> whose name matches <paramref name="sideName"/>, ignoring case.
    /// </summary>
    /// <param name="sideName"> the name of the side, for example "front" or "back" </param>
    /// <returns>
    /// The matching defined <see cref="Side"/> member, or <see cref="Side.FRONT"/> when
    /// <paramref name="sideName"/> cannot be parsed or does not map to a defined member.
    /// </returns>
    public static Side GetSide(string sideName)
    {
        Side result;

        // Enum.TryParse also succeeds for numeric strings such as "7" and then yields a value
        // that is not a member of the enum, so the parsed value is validated with IsDefined.
        // Numeric strings of defined members ("0", "1") are still accepted.
        if (!Enum.TryParse(sideName, true, out result) || !Enum.IsDefined(typeof(Side), result))
        {
            return Side.FRONT;
        }
        return result;
    }

    private readonly LuceneVersion version;
    private readonly CharacterUtils charUtils;
    private readonly int minGram;
    private readonly int maxGram;
    private readonly Side side;

    // State of the input token currently being expanded; curTermBuffer == null means
    // "fetch the next input token".
    private char[] curTermBuffer;
    private int curTermLength;
    private int curCodePointCount;
    private int curGramSize; // size of the next n-gram to emit for the current token
    private int tokStart;
    private int tokEnd; // only used if the length changed before this filter
    private bool updateOffsets; // never if the length changed before this filter
    private int savePosIncr; // position increment still owed to the next emitted n-gram
    private int savePosLen;

    private readonly ICharTermAttribute termAtt;
    private readonly IOffsetAttribute offsetAtt;
    private readonly IPositionIncrementAttribute posIncrAtt;
    private readonly IPositionLengthAttribute posLenAtt;

    /// <summary>
    /// Creates <see cref="EdgeNGramTokenFilter"/> that can generate n-grams in the sizes of the given range
    /// </summary>
    /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
    /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
    /// <param name="side"> the <see cref="Side"/> from which to chop off an n-gram </param>
    /// <param name="minGram"> the smallest n-gram to generate </param>
    /// <param name="maxGram"> the largest n-gram to generate </param>
    /// <exception cref="ArgumentException">
    /// <paramref name="side"/> is <see cref="Side.BACK"/> while <paramref name="version"/> is 4.4 or later,
    /// <paramref name="side"/> is not a defined <see cref="Side"/> member,
    /// <paramref name="minGram"/> is less than 1, or
    /// <paramref name="minGram"/> is greater than <paramref name="maxGram"/>.
    /// </exception>
    [Obsolete]
    public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, Side side, int minGram, int maxGram)
        : base(input)
    {
        // Evaluated once; it drives both the Side.BACK check and the CharacterUtils choice.
        bool is44OrLater = version.OnOrAfter(LuceneVersion.LUCENE_44);

        if (is44OrLater && side == Side.BACK)
        {
            throw new ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward", "side");
        }

        if (!Enum.IsDefined(typeof(Side), side))
        {
            throw new ArgumentException("sideLabel must be either front or back", "side");
        }

        if (minGram < 1)
        {
            throw new ArgumentException("minGram must be greater than zero", "minGram");
        }

        if (minGram > maxGram)
        {
            throw new ArgumentException("minGram must not be greater than maxGram", "minGram");
        }

        this.version = version;
        this.charUtils = is44OrLater ? CharacterUtils.GetInstance(version) : CharacterUtils.GetJava4Instance(version);
        this.minGram = minGram;
        this.maxGram = maxGram;
        this.side = side;

        this.termAtt = AddAttribute<ICharTermAttribute>();
        this.offsetAtt = AddAttribute<IOffsetAttribute>();
        this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
        this.posLenAtt = AddAttribute<IPositionLengthAttribute>();
    }

    /// <summary>
    /// Creates <see cref="EdgeNGramTokenFilter"/> that can generate n-grams in the sizes of the given range
    /// </summary>
    /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
    /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
    /// <param name="sideLabel"> the name of the <see cref="Side"/> from which to chop off an n-gram;
    /// resolved through <see cref="GetSide(string)"/> </param>
    /// <param name="minGram"> the smallest n-gram to generate </param>
    /// <param name="maxGram"> the largest n-gram to generate </param>
    [Obsolete]
    public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, string sideLabel, int minGram, int maxGram)
        : this(version, input, GetSide(sideLabel), minGram, maxGram)
    {
    }

    /// <summary>
    /// Creates <see cref="EdgeNGramTokenFilter"/> that can generate n-grams in the sizes of the given range,
    /// taken from the front of each input token
    /// </summary>
    /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
    /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
    /// <param name="minGram"> the smallest n-gram to generate </param>
    /// <param name="maxGram"> the largest n-gram to generate </param>
    public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, int minGram, int maxGram)
#pragma warning disable 612, 618
        : this(version, input, Side.FRONT, minGram, maxGram)
#pragma warning restore 612, 618
    {
    }

    /// <summary>
    /// Emits the next edge n-gram of the current input token, pulling a new token from the
    /// input whenever the current one has no more n-grams to give.
    /// <para>
    /// Input tokens with fewer code points than the minimum gram size produce no output; their
    /// position increment is accumulated and applied to the next n-gram that is emitted.
    /// </para>
    /// </summary>
    /// <returns> <c>true</c> if an n-gram was emitted; <c>false</c> once the input is exhausted </returns>
    public sealed override bool IncrementToken()
    {
        while (true)
        {
            if (curTermBuffer == null)
            {
                if (!m_input.IncrementToken())
                {
                    return false;
                }

                // Work on a private copy of the term: the term attribute is overwritten
                // for every n-gram emitted below.
                curTermBuffer = (char[])termAtt.Buffer.Clone();
                curTermLength = termAtt.Length;
                curCodePointCount = charUtils.CodePointCount(termAtt);
                curGramSize = minGram;
                tokStart = offsetAtt.StartOffset;
                tokEnd = offsetAtt.EndOffset;
#pragma warning disable 612, 618
                if (version.OnOrAfter(LuceneVersion.LUCENE_44))
#pragma warning restore 612, 618
                {
                    // Never update offsets
                    updateOffsets = false;
                }
                else
                {
                    // if length by start + end offsets doesn't match the term text then assume
                    // this is a synonym and don't adjust the offsets.
                    updateOffsets = (tokStart + curTermLength) == tokEnd;
                }
                savePosIncr += posIncrAtt.PositionIncrement;
                savePosLen = posLenAtt.PositionLength;
            }

            // Emit only while the gram size is within the configured range and the
            // token still has enough code points for it.
            if (curGramSize <= maxGram && curGramSize <= curCodePointCount)
            {
                // grab gramSize code points from front or back
                int start = side == Side.FRONT ? 0 : charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, curTermLength, -curGramSize);
                int end = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
                ClearAttributes();
                if (updateOffsets)
                {
                    offsetAtt.SetOffset(tokStart + start, tokStart + end);
                }
                else
                {
                    offsetAtt.SetOffset(tokStart, tokEnd);
                }
                // first ngram gets increment, others don't
                if (curGramSize == minGram)
                {
                    posIncrAtt.PositionIncrement = savePosIncr;
                    savePosIncr = 0;
                }
                else
                {
                    posIncrAtt.PositionIncrement = 0;
                }
                posLenAtt.PositionLength = savePosLen;
                termAtt.CopyBuffer(curTermBuffer, start, end - start);
                curGramSize++;
                return true;
            }

            // Current token is exhausted (or too short): move on to the next input token.
            curTermBuffer = null;
        }
    }

    /// <summary>
    /// Resets the base filter, then discards the token being expanded and any
    /// pending position increment.
    /// </summary>
    public override void Reset()
    {
        base.Reset();
        curTermBuffer = null;
        savePosIncr = 0;
    }
}
| } |