src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUTransformFilter.cs - lucenenet - Git at Google

 // Lucene version compatibility level 7.1.0
 using ICU4N.Text;
 using Lucene.Net.Analysis.TokenAttributes;
 using Lucene.Net.Support;

 namespace Lucene.Net.Analysis.Icu
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// A <see cref="TokenFilter"/> that transforms text with ICU.
     /// </summary>
     /// <remarks>
     /// ICU provides text-transformation functionality via its Transliteration API.
     /// Although script conversion is its most common use, a Transliterator can
     /// actually perform a more general class of tasks. In fact, Transliterator
     /// defines a very general API which specifies only that a segment of the input
     /// text is replaced by new text. The particulars of this conversion are
     /// determined entirely by subclasses of Transliterator.
     /// <para/>
     /// Some useful transformations for search are built-in:
     /// <list type="bullet">
     ///     <item><description>Conversion from Traditional to Simplified Chinese characters</description></item>
     ///     <item><description>Conversion from Hiragana to Katakana</description></item>
     ///     <item><description>Conversion from Fullwidth to Halfwidth forms.</description></item>
     ///     <item><description>Script conversions, for example Serbian Cyrillic to Latin</description></item>
     /// </list>
     /// <para/>
     /// Example usage:
     /// <code>
     ///     stream = new ICUTransformFilter(stream, Transliterator.GetInstance("Traditional-Simplified"));
     /// </code>
     /// <para/>
     /// For more details, see the <a href="http://userguide.icu-project.org/transforms/general">ICU User Guide</a>.
     /// </remarks>
     [ExceptionToClassNameConvention]
     public sealed class ICUTransformFilter : TokenFilter
     {
         // Transliterator to transform the text
         private readonly Transliterator transform;

         // Reusable position object
         private readonly TransliterationPosition position = new TransliterationPosition();

         // term attribute, will be updated with transformed text.
         private readonly ICharTermAttribute termAtt;

         // Wraps a termAttribute around the replaceable interface.
         private readonly ReplaceableTermAttribute replaceableAttribute = new ReplaceableTermAttribute();

         /// <summary>
         /// Create a new <see cref="ICUTransformFilter"/> that transforms text on the given stream.
         /// </summary>
         /// <remarks>
         /// When <paramref name="transform"/> is a <see cref="RuleBasedTransliterator"/> that has no
         /// filter yet, this constructor assigns the transliterator's non-empty source set as its
         /// filter, i.e. the supplied instance is modified.
         /// </remarks>
         /// <param name="input"><see cref="TokenStream"/> to filter.</param>
         /// <param name="transform">Transliterator to transform the text.</param>
         /// <exception cref="System.ArgumentNullException"><paramref name="transform"/> is <c>null</c>.</exception>
         public ICUTransformFilter(TokenStream input, Transliterator transform)
             : base(input)
         {
             // Fail fast with a descriptive exception; otherwise the transform.Filter
             // access below would surface as a bare NullReferenceException.
             if (transform == null)
             {
                 throw new System.ArgumentNullException(nameof(transform));
             }

             this.transform = transform;
             this.termAtt = AddAttribute<ICharTermAttribute>();

             /*
              * This is cheating, but speeds things up a lot.
              * If we wanted to use pkg-private APIs we could probably do better.
              */
 #pragma warning disable 612, 618
             if (transform.Filter == null && transform is RuleBasedTransliterator)
 #pragma warning restore 612, 618
             {
                 UnicodeSet sourceSet = transform.GetSourceSet();
                 if (sourceSet != null && sourceSet.Any())
                 {
                     transform.Filter = sourceSet;
                 }
             }
         }

         /// <summary>
         /// Advances the wrapped stream by one token and transliterates that token's term text in place.
         /// </summary>
         /// <returns>
         /// <c>true</c> if the input produced a token (its term text has then been run through the
         /// transliterator); <c>false</c> if the input had no further token.
         /// </returns>
         public override bool IncrementToken()
         {
             if (!m_input.IncrementToken())
             {
                 return false;
             }

             /*
              * Wrap around replaceable. clear the positions, and transliterate.
              */
             replaceableAttribute.SetText(termAtt);

             // Both the transliteration range and its context cover the whole term.
             int length = termAtt.Length;
             position.Start = 0;
             position.Limit = length;
             position.ContextStart = 0;
             position.ContextLimit = length;

             transform.FilteredTransliterate(replaceableAttribute, position, false);
             return true;
         }

         /// <summary>
         /// Wrap a <see cref="ICharTermAttribute"/> with the <see cref="IReplaceable"/> API.
         /// </summary>
         private sealed class ReplaceableTermAttribute : IReplaceable
         {
             // Cached view of the token's character buffer; refreshed whenever the token is resized.
             private char[] buffer;
             // Number of valid chars in buffer; kept in sync with token.Length by the Replace overloads.
             private int length;
             // The term attribute currently being edited.
             private ICharTermAttribute token;

             /// <summary>
             /// Points this wrapper at <paramref name="token"/>, capturing its current buffer and length.
             /// </summary>
             /// <param name="token">The term attribute whose text will be edited in place.</param>
             public void SetText(ICharTermAttribute token)
             {
                 this.token = token;
                 this.buffer = token.Buffer;
                 this.length = token.Length;
             }

             /// <summary>
             /// Reads the value at <paramref name="pos"/> via <c>UTF16.CharAt</c>, bounded to the valid
             /// range <c>[0, Length)</c> of the buffer.
             /// </summary>
             public int Char32At(int pos) => UTF16.CharAt(buffer, 0, length, pos);

             /// <summary>Gets the UTF-16 code unit stored at <paramref name="pos"/> in the buffer.</summary>
             public char this[int pos] => buffer[pos];

             /// <summary>
             /// Inserts a copy of the <paramref name="length"/> chars starting at
             /// <paramref name="startIndex"/> at <paramref name="destinationIndex"/>.
             /// </summary>
             /// <param name="startIndex">Index of the first char to copy.</param>
             /// <param name="length">Number of chars to copy.</param>
             /// <param name="destinationIndex">Index at which the copied chars are inserted.</param>
             public void Copy(int startIndex, int length, int destinationIndex) // LUCENENET: Changed 2nd parameter from limit to length
             {
                 // Snapshot the source range first: the insertion below may shift or reallocate the buffer.
                 char[] text = new char[length]; // LUCENENET: Corrected length
                 CopyTo(startIndex, text, 0, length); // LUCENENET: Corrected length
                 // Replaced length is 0, i.e. this is a pure insertion that removes nothing.
                 Replace(destinationIndex, 0, text, 0, length); // LUCENENET: Corrected length & charsLen
             }

             /// <summary>
             /// Copies <paramref name="count"/> chars from the buffer, starting at
             /// <paramref name="sourceIndex"/>, into <paramref name="destination"/> at
             /// <paramref name="destinationIndex"/>.
             /// </summary>
             public void CopyTo(int sourceIndex, char[] destination, int destinationIndex, int count)
             {
                 System.Array.Copy(buffer, sourceIndex, destination, destinationIndex, count);
             }

             /// <summary>Always <c>false</c> for this wrapper.</summary>
             public bool HasMetaData => false;

             /// <summary>Gets the current number of valid chars in the wrapped term.</summary>
             public int Length => length;

             /// <summary>
             /// Replaces <paramref name="length"/> chars starting at <paramref name="start"/> with
             /// <paramref name="text"/>, then updates both this wrapper's and the token's length.
             /// </summary>
             /// <param name="start">Index of the first char to replace.</param>
             /// <param name="length">Number of chars to replace.</param>
             /// <param name="text">Replacement text.</param>
             public void Replace(int start, int length, string text) // LUCENENET: Changed 2nd parameter from limit to length
             {
                 int charsLen = text.Length;
                 // shift text if necessary for the replacement
                 int newLength = ShiftForReplace(start, length + start, charsLen); // LUCENENET: Changed 2nd parameter to calculate limit
                 // insert the replacement text
                 text.CopyTo(0, buffer, start, charsLen);
                 token.Length = (this.length = newLength);
             }

             /// <summary>
             /// Replaces <paramref name="length"/> chars starting at <paramref name="start"/> with
             /// <paramref name="charsLen"/> chars taken from <paramref name="text"/> beginning at
             /// <paramref name="charsStart"/>, then updates both this wrapper's and the token's length.
             /// </summary>
             /// <param name="start">Index of the first char to replace.</param>
             /// <param name="length">Number of chars to replace.</param>
             /// <param name="text">Array holding the replacement chars.</param>
             /// <param name="charsStart">Index in <paramref name="text"/> of the first replacement char.</param>
             /// <param name="charsLen">Number of replacement chars.</param>
             public void Replace(int start, int length, char[] text, int charsStart,
                 int charsLen)
             {
                 // shift text if necessary for the replacement
                 int newLength = ShiftForReplace(start, length + start, charsLen); // LUCENENET: Changed 2nd parameter to calculate limit
                 // insert the replacement text
                 System.Array.Copy(text, charsStart, buffer, start, charsLen);
                 token.Length = (this.length = newLength);
             }

             /// <summary>shift text (if necessary) for a replacement operation</summary>
             /// <param name="start">Index of the first char being replaced.</param>
             /// <param name="limit">Index one past the last char being replaced.</param>
             /// <param name="charsLen">Number of chars that will be written at <paramref name="start"/>.</param>
             /// <returns>
             /// The text length after the replacement. The <c>length</c> field itself is not updated
             /// here; the callers assign it.
             /// </returns>
             private int ShiftForReplace(int start, int limit, int charsLen)
             {
                 int replacementLength = limit - start;
                 int newLength = length - replacementLength + charsLen;
                 // resize if necessary
                 if (newLength > length)
                 {
                     buffer = token.ResizeBuffer(newLength);
                 }
                 // if the substring being replaced is longer or shorter than the
                 // replacement, need to shift things around
                 if (replacementLength != charsLen && limit < length)
                 {
                     System.Array.Copy(buffer, limit, buffer, start + charsLen, length - limit);
                 }
                 return newLength;
             }
         }
     }
 }
	// Lucene version compatibility level 7.1.0
	using ICU4N.Text;
	using Lucene.Net.Analysis.TokenAttributes;
	using Lucene.Net.Support;

	namespace Lucene.Net.Analysis.Icu
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// A <see cref="TokenFilter"/> that transforms text with ICU.
	/// </summary>
	/// <remarks>
	/// ICU provides text-transformation functionality via its Transliteration API.
	/// Although script conversion is its most common use, a Transliterator can
	/// actually perform a more general class of tasks. In fact, Transliterator
	/// defines a very general API which specifies only that a segment of the input
	/// text is replaced by new text. The particulars of this conversion are
	/// determined entirely by subclasses of Transliterator.
	/// <para/>
	/// Some useful transformations for search are built-in:
	/// <list type="bullet">
	/// <item><description>Conversion from Traditional to Simplified Chinese characters</description></item>
	/// <item><description>Conversion from Hiragana to Katakana</description></item>
	/// <item><description>Conversion from Fullwidth to Halfwidth forms.</description></item>
	/// <item><description>Script conversions, for example Serbian Cyrillic to Latin</description></item>
	/// </list>
	/// <para/>
	/// Example usage:
	/// <code>
	/// stream = new ICUTransformFilter(stream, Transliterator.GetInstance("Traditional-Simplified"));
	/// </code>
	/// <para/>
	/// For more details, see the <a href="http://userguide.icu-project.org/transforms/general">ICU User Guide</a>.
	/// </remarks>
	[ExceptionToClassNameConvention]
	public sealed class ICUTransformFilter : TokenFilter
	{
	    // Transliterator to transform the text
	    private readonly Transliterator transform;

	    // Reusable position object
	    private readonly TransliterationPosition position = new TransliterationPosition();

	    // term attribute, will be updated with transformed text.
	    private readonly ICharTermAttribute termAtt;

	    // Wraps a termAttribute around the replaceable interface.
	    private readonly ReplaceableTermAttribute replaceableAttribute = new ReplaceableTermAttribute();

	    /// <summary>
	    /// Create a new <see cref="ICUTransformFilter"/> that transforms text on the given stream.
	    /// </summary>
	    /// <remarks>
	    /// When <paramref name="transform"/> is a <see cref="RuleBasedTransliterator"/> that has no
	    /// filter yet, this constructor assigns the transliterator's non-empty source set as its
	    /// filter, i.e. the supplied instance is modified.
	    /// </remarks>
	    /// <param name="input"><see cref="TokenStream"/> to filter.</param>
	    /// <param name="transform">Transliterator to transform the text.</param>
	    /// <exception cref="System.ArgumentNullException"><paramref name="transform"/> is <c>null</c>.</exception>
	    public ICUTransformFilter(TokenStream input, Transliterator transform)
	        : base(input)
	    {
	        // Fail fast with a descriptive exception; otherwise the transform.Filter
	        // access below would surface as a bare NullReferenceException.
	        if (transform == null)
	        {
	            throw new System.ArgumentNullException(nameof(transform));
	        }

	        this.transform = transform;
	        this.termAtt = AddAttribute<ICharTermAttribute>();

	        /*
	         * This is cheating, but speeds things up a lot.
	         * If we wanted to use pkg-private APIs we could probably do better.
	         */
	#pragma warning disable 612, 618
	        if (transform.Filter == null && transform is RuleBasedTransliterator)
	#pragma warning restore 612, 618
	        {
	            UnicodeSet sourceSet = transform.GetSourceSet();
	            if (sourceSet != null && sourceSet.Any())
	            {
	                transform.Filter = sourceSet;
	            }
	        }
	    }

	    /// <summary>
	    /// Advances the wrapped stream by one token and transliterates that token's term text in place.
	    /// </summary>
	    /// <returns>
	    /// <c>true</c> if the input produced a token (its term text has then been run through the
	    /// transliterator); <c>false</c> if the input had no further token.
	    /// </returns>
	    public override bool IncrementToken()
	    {
	        if (!m_input.IncrementToken())
	        {
	            return false;
	        }

	        /*
	         * Wrap around replaceable. clear the positions, and transliterate.
	         */
	        replaceableAttribute.SetText(termAtt);

	        // Both the transliteration range and its context cover the whole term.
	        int length = termAtt.Length;
	        position.Start = 0;
	        position.Limit = length;
	        position.ContextStart = 0;
	        position.ContextLimit = length;

	        transform.FilteredTransliterate(replaceableAttribute, position, false);
	        return true;
	    }

	    /// <summary>
	    /// Wrap a <see cref="ICharTermAttribute"/> with the <see cref="IReplaceable"/> API.
	    /// </summary>
	    private sealed class ReplaceableTermAttribute : IReplaceable
	    {
	        // Cached view of the token's character buffer; refreshed whenever the token is resized.
	        private char[] buffer;
	        // Number of valid chars in buffer; kept in sync with token.Length by the Replace overloads.
	        private int length;
	        // The term attribute currently being edited.
	        private ICharTermAttribute token;

	        /// <summary>
	        /// Points this wrapper at <paramref name="token"/>, capturing its current buffer and length.
	        /// </summary>
	        /// <param name="token">The term attribute whose text will be edited in place.</param>
	        public void SetText(ICharTermAttribute token)
	        {
	            this.token = token;
	            this.buffer = token.Buffer;
	            this.length = token.Length;
	        }

	        /// <summary>
	        /// Reads the value at <paramref name="pos"/> via <c>UTF16.CharAt</c>, bounded to the valid
	        /// range <c>[0, Length)</c> of the buffer.
	        /// </summary>
	        public int Char32At(int pos) => UTF16.CharAt(buffer, 0, length, pos);

	        /// <summary>Gets the UTF-16 code unit stored at <paramref name="pos"/> in the buffer.</summary>
	        public char this[int pos] => buffer[pos];

	        /// <summary>
	        /// Inserts a copy of the <paramref name="length"/> chars starting at
	        /// <paramref name="startIndex"/> at <paramref name="destinationIndex"/>.
	        /// </summary>
	        /// <param name="startIndex">Index of the first char to copy.</param>
	        /// <param name="length">Number of chars to copy.</param>
	        /// <param name="destinationIndex">Index at which the copied chars are inserted.</param>
	        public void Copy(int startIndex, int length, int destinationIndex) // LUCENENET: Changed 2nd parameter from limit to length
	        {
	            // Snapshot the source range first: the insertion below may shift or reallocate the buffer.
	            char[] text = new char[length]; // LUCENENET: Corrected length
	            CopyTo(startIndex, text, 0, length); // LUCENENET: Corrected length
	            // Replaced length is 0, i.e. this is a pure insertion that removes nothing.
	            Replace(destinationIndex, 0, text, 0, length); // LUCENENET: Corrected length & charsLen
	        }

	        /// <summary>
	        /// Copies <paramref name="count"/> chars from the buffer, starting at
	        /// <paramref name="sourceIndex"/>, into <paramref name="destination"/> at
	        /// <paramref name="destinationIndex"/>.
	        /// </summary>
	        public void CopyTo(int sourceIndex, char[] destination, int destinationIndex, int count)
	        {
	            System.Array.Copy(buffer, sourceIndex, destination, destinationIndex, count);
	        }

	        /// <summary>Always <c>false</c> for this wrapper.</summary>
	        public bool HasMetaData => false;

	        /// <summary>Gets the current number of valid chars in the wrapped term.</summary>
	        public int Length => length;

	        /// <summary>
	        /// Replaces <paramref name="length"/> chars starting at <paramref name="start"/> with
	        /// <paramref name="text"/>, then updates both this wrapper's and the token's length.
	        /// </summary>
	        /// <param name="start">Index of the first char to replace.</param>
	        /// <param name="length">Number of chars to replace.</param>
	        /// <param name="text">Replacement text.</param>
	        public void Replace(int start, int length, string text) // LUCENENET: Changed 2nd parameter from limit to length
	        {
	            int charsLen = text.Length;
	            // shift text if necessary for the replacement
	            int newLength = ShiftForReplace(start, length + start, charsLen); // LUCENENET: Changed 2nd parameter to calculate limit
	            // insert the replacement text
	            text.CopyTo(0, buffer, start, charsLen);
	            token.Length = (this.length = newLength);
	        }

	        /// <summary>
	        /// Replaces <paramref name="length"/> chars starting at <paramref name="start"/> with
	        /// <paramref name="charsLen"/> chars taken from <paramref name="text"/> beginning at
	        /// <paramref name="charsStart"/>, then updates both this wrapper's and the token's length.
	        /// </summary>
	        /// <param name="start">Index of the first char to replace.</param>
	        /// <param name="length">Number of chars to replace.</param>
	        /// <param name="text">Array holding the replacement chars.</param>
	        /// <param name="charsStart">Index in <paramref name="text"/> of the first replacement char.</param>
	        /// <param name="charsLen">Number of replacement chars.</param>
	        public void Replace(int start, int length, char[] text, int charsStart,
	            int charsLen)
	        {
	            // shift text if necessary for the replacement
	            int newLength = ShiftForReplace(start, length + start, charsLen); // LUCENENET: Changed 2nd parameter to calculate limit
	            // insert the replacement text
	            System.Array.Copy(text, charsStart, buffer, start, charsLen);
	            token.Length = (this.length = newLength);
	        }

	        /// <summary>shift text (if necessary) for a replacement operation</summary>
	        /// <param name="start">Index of the first char being replaced.</param>
	        /// <param name="limit">Index one past the last char being replaced.</param>
	        /// <param name="charsLen">Number of chars that will be written at <paramref name="start"/>.</param>
	        /// <returns>
	        /// The text length after the replacement. The <c>length</c> field itself is not updated
	        /// here; the callers assign it.
	        /// </returns>
	        private int ShiftForReplace(int start, int limit, int charsLen)
	        {
	            int replacementLength = limit - start;
	            int newLength = length - replacementLength + charsLen;
	            // resize if necessary
	            if (newLength > length)
	            {
	                buffer = token.ResizeBuffer(newLength);
	            }
	            // if the substring being replaced is longer or shorter than the
	            // replacement, need to shift things around
	            if (replacementLength != charsLen && limit < length)
	            {
	                System.Array.Copy(buffer, limit, buffer, start + charsLen, length - limit);
	            }
	            return newLength;
	        }
	    }
	}
	}