using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Util;
using System;
using System.IO;
namespace Lucene.Net.Analysis.Ngram
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Old version of <seealso cref="EdgeNGramTokenizer"/> which does not correctly handle
/// supplementary characters.
/// </summary>
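/// <remarks>
/// A minimal usage sketch (illustrative only; the LuceneVersion constant, sample text, and
/// gram sizes below are assumptions, not taken from this file):
/// <code>
/// TextReader reader = new StringReader("abcde");
/// var tokenizer = new Lucene43EdgeNGramTokenizer(LuceneVersion.LUCENE_43, reader,
///     Lucene43EdgeNGramTokenizer.Side.FRONT, 1, 3);
/// var termAtt = tokenizer.GetAttribute&lt;ICharTermAttribute&gt;();
/// tokenizer.Reset();
/// while (tokenizer.IncrementToken())
/// {
///     // with Side.FRONT, minGram=1, maxGram=3 this emits "a", "ab", "abc"
/// }
/// tokenizer.End();
/// tokenizer.Dispose();
/// </code>
/// </remarks>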
[Obsolete]
public sealed class Lucene43EdgeNGramTokenizer : Tokenizer
{
public const Side DEFAULT_SIDE = Side.FRONT;
public const int DEFAULT_MAX_GRAM_SIZE = 1;
public const int DEFAULT_MIN_GRAM_SIZE = 1;
private ICharTermAttribute termAtt;
private IOffsetAttribute offsetAtt;
private IPositionIncrementAttribute posIncrAtt;
/// <summary>
/// Specifies which side of the input the n-gram should be generated from </summary>
public enum Side
{
/// <summary>
/// Get the n-gram from the front of the input </summary>
FRONT,
/// <summary>
/// Get the n-gram from the end of the input </summary>
BACK,
}
// Get the appropriate Side from a string; matching is case-insensitive and unrecognized names default to FRONT
internal static Side GetSide(string sideName)
{
Side result;
if (!Enum.TryParse(sideName, true, out result))
{
result = Side.FRONT;
}
return result;
}
private int minGram;
private int maxGram;
private int gramSize;
private Side side;
private bool started;
private int inLen; // length of the input AFTER trim()
private int charsRead; // length of the input
private string inStr;
/// <summary>
/// Creates an EdgeNGramTokenizer that can generate n-grams with sizes in the given range
/// </summary>
/// <param name="version"> the <a href="#version">Lucene match version</a> </param>
/// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
/// <param name="side"> the <seealso cref="Side"/> from which to chop off an n-gram </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
[Obsolete]
public Lucene43EdgeNGramTokenizer(LuceneVersion version, TextReader input, Side side, int minGram, int maxGram)
: base(input)
{
Init(version, side, minGram, maxGram);
}
/// <summary>
/// Creates an EdgeNGramTokenizer that can generate n-grams with sizes in the given range
/// </summary>
/// <param name="version"> the <a href="#version">Lucene match version</a> </param>
/// <param name="factory"> <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/> to use </param>
/// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
/// <param name="side"> the <seealso cref="Side"/> from which to chop off an n-gram </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
[Obsolete]
public Lucene43EdgeNGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, Side side, int minGram, int maxGram)
: base(factory, input)
{
Init(version, side, minGram, maxGram);
}
/// <summary>
/// Creates an EdgeNGramTokenizer that can generate n-grams with sizes in the given range
/// </summary>
/// <param name="version"> the <a href="#version">Lucene match version</a> </param>
/// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
/// <param name="sideLabel"> the name of the <seealso cref="Side"/> from which to chop off an n-gram </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
[Obsolete]
public Lucene43EdgeNGramTokenizer(LuceneVersion version, TextReader input, string sideLabel, int minGram, int maxGram)
: this(version, input, GetSide(sideLabel), minGram, maxGram)
{
}
/// <summary>
/// Creates an EdgeNGramTokenizer that can generate n-grams with sizes in the given range
/// </summary>
/// <param name="version"> the <a href="#version">Lucene match version</a> </param>
/// <param name="factory"> <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/> to use </param>
/// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
/// <param name="sideLabel"> the name of the <seealso cref="Side"/> from which to chop off an n-gram </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
[Obsolete]
public Lucene43EdgeNGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, string sideLabel, int minGram, int maxGram)
: this(version, factory, input, GetSide(sideLabel), minGram, maxGram)
{
}
/// <summary>
/// Creates an EdgeNGramTokenizer that can generate n-grams with sizes in the given range
/// </summary>
/// <param name="version"> the <a href="#version">Lucene match version</a> </param>
/// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
public Lucene43EdgeNGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram)
: this(version, input, Side.FRONT, minGram, maxGram)
{
}
/// <summary>
/// Creates an EdgeNGramTokenizer that can generate n-grams with sizes in the given range
/// </summary>
/// <param name="version"> the <a href="#version">Lucene match version</a> </param>
/// <param name="factory"> <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/> to use </param>
/// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
public Lucene43EdgeNGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, int minGram, int maxGram)
: this(version, factory, input, Side.FRONT, minGram, maxGram)
{
}
private void Init(LuceneVersion version, Side side, int minGram, int maxGram)
{
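// NOTE: LuceneVersion and Side are value-type enums in .NET, so the two null checks
// below can never be true; they are carried over from the Java original.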
if (version == null)
{
throw new System.ArgumentException("version must not be null");
}
if (side == null)
{
throw new System.ArgumentException("sideLabel must be either front or back");
}
if (minGram < 1)
{
throw new System.ArgumentException("minGram must be greater than zero");
}
if (minGram > maxGram)
{
throw new System.ArgumentException("minGram must not be greater than maxGram");
}
if (version.OnOrAfter(LuceneVersion.LUCENE_44))
{
if (side == Side.BACK)
{
throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4");
}
}
else
{
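// Pre-4.4 behavior: silently cap maxGram at the 1024-char read buffer used in IncrementToken()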
maxGram = Math.Min(maxGram, 1024);
}
this.minGram = minGram;
this.maxGram = maxGram;
this.side = side;
this.termAtt = AddAttribute<ICharTermAttribute>();
this.offsetAtt = AddAttribute<IOffsetAttribute>();
this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
}
/// <summary>
/// Advances to the next token in the stream; returns <c>true</c> if a token was emitted, or <c>false</c> at end of stream. </summary>
public override bool IncrementToken()
{
ClearAttributes();
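// First call: buffer the input (up to the read limit) and emit the minGram-sized gram at
// position increment 1; each later call emits the next larger gram at the same position
// (position increment 0) until maxGram or the trimmed input length is exceeded.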
// if we are just starting, read the whole input
if (!started)
{
started = true;
gramSize = minGram;
int limit = side == Side.FRONT ? maxGram : 1024;
char[] chars = new char[Math.Min(1024, limit)];
charsRead = 0;
// TODO: refactor to a shared readFully somewhere:
bool exhausted = false;
while (charsRead < limit)
{
int inc = input.Read(chars, charsRead, chars.Length - charsRead);
if (inc <= 0)
{
exhausted = true;
break;
}
charsRead += inc;
if (charsRead == chars.Length && charsRead < limit)
{
chars = ArrayUtil.Grow(chars);
}
}
inStr = new string(chars, 0, charsRead);
inStr = inStr.Trim();
if (!exhausted)
{
// Read extra throwaway chars so that on end() we
// report the correct offset:
var throwaway = new char[1024];
while (true)
{
int inc = input.Read(throwaway, 0, throwaway.Length);
if (inc <= 0)
{
break;
}
charsRead += inc;
}
}
inLen = inStr.Length;
if (inLen == 0)
{
return false;
}
posIncrAtt.PositionIncrement = 1;
}
else
{
posIncrAtt.PositionIncrement = 0;
}
// if the remaining input is too short, we can't generate any n-grams
if (gramSize > inLen)
{
return false;
}
// if we have hit the end of our n-gram size range, quit
if (gramSize > maxGram || gramSize > inLen)
{
return false;
}
// grab gramSize chars from front or back
int start = side == Side.FRONT ? 0 : inLen - gramSize;
int end = start + gramSize;
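// e.g. with inStr = "abcde", Side.BACK and gramSize = 2: start = 3, end = 5, producing "de"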
termAtt.SetEmpty().Append(inStr, start, end);
offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(end));
gramSize++;
return true;
}
public override void End()
{
base.End();
// set final offset
int finalOffset = CorrectOffset(charsRead);
this.offsetAtt.SetOffset(finalOffset, finalOffset);
}
public override void Reset()
{
base.Reset();
started = false;
}
}
}