| using Lucene.Net.Analysis.TokenAttributes; |
| using System; |
| using System.IO; |
| |
| namespace Lucene.Net.Analysis.NGram |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
/// <summary>
/// Old broken version of <see cref="NGramTokenizer"/>.
/// <para/>
/// Only the first 1024 characters of the input are tokenized. That text is trimmed, and
/// then every n-gram of the smallest size is emitted from left to right, followed by every
/// n-gram of the next size, and so on up to the largest size.
/// </summary>
[Obsolete]
public sealed class Lucene43NGramTokenizer : Tokenizer
{
    public const int DEFAULT_MIN_NGRAM_SIZE = 1;
    public const int DEFAULT_MAX_NGRAM_SIZE = 2;

    // Size of the read buffers; only this many leading characters of the input are tokenized.
    private const int BUFFER_SIZE = 1024;

    private int minGram, maxGram;
    private int gramSize;  // size of the n-grams currently being emitted
    private int pos;       // start index (within inStr) of the next n-gram
    private int inLen;     // length of the input AFTER Trim()
    private int charsRead; // total number of characters consumed from the input
    private string inStr;  // trimmed text the n-grams are cut from
    private bool started;  // true once the input has been read since the last Reset()

    private ICharTermAttribute termAtt;
    private IOffsetAttribute offsetAtt;

    /// <summary>
    /// Creates <see cref="Lucene43NGramTokenizer"/> with given min and max n-grams. </summary>
    /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
    /// <param name="minGram"> the smallest n-gram to generate </param>
    /// <param name="maxGram"> the largest n-gram to generate </param>
    /// <exception cref="ArgumentException"> if <paramref name="minGram"/> is less than 1 or
    /// greater than <paramref name="maxGram"/> </exception>
    public Lucene43NGramTokenizer(TextReader input, int minGram, int maxGram)
        : base(input)
    {
        Init(minGram, maxGram);
    }

    /// <summary>
    /// Creates <see cref="Lucene43NGramTokenizer"/> with given min and max n-grams. </summary>
    /// <param name="factory"> <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory"/> to use </param>
    /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
    /// <param name="minGram"> the smallest n-gram to generate </param>
    /// <param name="maxGram"> the largest n-gram to generate </param>
    /// <exception cref="ArgumentException"> if <paramref name="minGram"/> is less than 1 or
    /// greater than <paramref name="maxGram"/> </exception>
    public Lucene43NGramTokenizer(AttributeFactory factory, TextReader input, int minGram, int maxGram)
        : base(factory, input)
    {
        Init(minGram, maxGram);
    }

    /// <summary>
    /// Creates <see cref="Lucene43NGramTokenizer"/> with default min and max n-grams
    /// (<see cref="DEFAULT_MIN_NGRAM_SIZE"/> and <see cref="DEFAULT_MAX_NGRAM_SIZE"/>). </summary>
    /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
    public Lucene43NGramTokenizer(TextReader input)
        : this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
    {
    }

    /// <summary>
    /// Validates the n-gram size bounds, stores them and registers the term and offset
    /// attributes. Shared by the constructors.
    /// </summary>
    private void Init(int minGram, int maxGram)
    {
        if (minGram < 1)
        {
            throw new System.ArgumentException("minGram must be greater than zero");
        }
        if (minGram > maxGram)
        {
            throw new System.ArgumentException("minGram must not be greater than maxGram");
        }
        this.minGram = minGram;
        this.maxGram = maxGram;
        termAtt = AddAttribute<ICharTermAttribute>();
        offsetAtt = AddAttribute<IOffsetAttribute>();
    }

    /// <summary>
    /// Advances to the next n-gram, filling in the term and offset attributes.
    /// On the first call after <see cref="Reset()"/> the input is read.
    /// </summary>
    /// <returns> <c>true</c> if a token was produced; <c>false</c> at end of stream </returns>
    public override bool IncrementToken()
    {
        ClearAttributes();
        if (!started)
        {
            started = true;
            gramSize = minGram;
            ReadInput();
            if (inLen == 0)
            {
                return false;
            }
        }

        if (pos + gramSize > inLen) // if we hit the end of the string
        {
            pos = 0; // reset to beginning of string
            gramSize++; // increase n-gram size
            if (gramSize > maxGram) // we are done
            {
                return false;
            }
            if (pos + gramSize > inLen) // the text is shorter than the new n-gram size
            {
                return false;
            }
        }

        int oldPos = pos;
        pos++;
        termAtt.SetEmpty().Append(inStr, oldPos, gramSize); // LUCENENET: Corrected 3rd parameter
        // Offsets are indices into the trimmed text, so they do not account for any
        // leading whitespace that Trim() removed.
        offsetAtt.SetOffset(CorrectOffset(oldPos), CorrectOffset(oldPos + gramSize));
        return true;
    }

    /// <summary>
    /// Reads up to <see cref="BUFFER_SIZE"/> characters from the input into
    /// <see cref="inStr"/> (trimmed), records its length in <see cref="inLen"/>, and
    /// consumes the rest of the input so that <see cref="charsRead"/> holds the total
    /// number of characters read.
    /// </summary>
    private void ReadInput()
    {
        char[] chars = new char[BUFFER_SIZE];
        charsRead = 0;
        // TODO: refactor to a shared readFully somewhere:
        while (charsRead < chars.Length)
        {
            int inc = m_input.Read(chars, charsRead, chars.Length - charsRead);
            // TextReader.Read returns 0 when no characters are left; -1 is also accepted
            // for readers that follow the Java convention. Testing only for -1 would spin
            // forever on a standard .NET reader.
            if (inc <= 0)
            {
                break;
            }
            charsRead += inc;
        }
        inStr = (new string(chars, 0, charsRead)).Trim(); // remove leading and trailing whitespace

        if (charsRead == chars.Length)
        {
            // Read extra throwaway chars so that on End() we
            // report the correct offset:
            var throwaway = new char[BUFFER_SIZE];
            while (true)
            {
                int inc = m_input.Read(throwaway, 0, throwaway.Length);
                if (inc <= 0) // end of input (see above)
                {
                    break;
                }
                charsRead += inc;
            }
        }

        inLen = inStr.Length;
    }

    /// <summary>
    /// Sets the final offset to the (corrected) total number of characters read from the input.
    /// </summary>
    public override void End()
    {
        base.End();
        // set final offset
        int finalOffset = CorrectOffset(charsRead);
        this.offsetAtt.SetOffset(finalOffset, finalOffset);
    }

    /// <summary>
    /// Resets the tokenizer so that the next <see cref="IncrementToken()"/> call reads the
    /// input again and starts over from the first n-gram.
    /// </summary>
    public override void Reset()
    {
        base.Reset();
        started = false;
        pos = 0;
    }
}
| } |