// src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUNormalizer2Filter.cs - lucenenet - Git at Google

 // Lucene version compatibility level 7.1.0
 using ICU4N.Text;
 using Lucene.Net.Analysis.TokenAttributes;
 using Lucene.Net.Support;
 using System.Text;

 namespace Lucene.Net.Analysis.Icu
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// A <see cref="TokenFilter"/> that rewrites the term text of each token into the
     /// form produced by an ICU <see cref="Normalizer2"/>.
     /// </summary>
     /// <remarks>
     /// The filter can normalize text in any of these ways:
     /// <list type="bullet">
     ///     <item><description>NFKC normalization combined with case folding and removal of ignorables (the default)</description></item>
     ///     <item><description>A standard normalization mode (NFC, NFD, NFKC, NFKD)</description></item>
     ///     <item><description>Rules taken from a custom normalization mapping</description></item>
     /// </list>
     /// <para/>
     /// Used with its defaults, the filter offers a simple, language-independent way to
     /// standardize Unicode text for search:
     /// <list type="bullet">
     ///     <item><description>
     ///         Its case folding can stand in for LowerCaseFilter. For example, it handles
     ///         cases such as the Greek sigma, so that "Μάϊος" and "ΜΆΪΟΣ" match correctly.
     ///     </description></item>
     ///     <item><description>
     ///         The normalization standardizes different forms of the same character in
     ///         Unicode. For example, CJK full-width numbers are standardized to their
     ///         ASCII forms.
     ///     </description></item>
     ///     <item><description>
     ///         Ignorables such as Zero-Width Joiner and Variation Selectors are removed.
     ///         These are typically modifier characters that affect display.
     ///     </description></item>
     /// </list>
     /// </remarks>
     /// <seealso cref="Normalizer2"/>
     /// <seealso cref="FilteredNormalizer2"/>
     [ExceptionToClassNameConvention]
     public class ICUNormalizer2Filter : TokenFilter
     {
         // Term attribute registered on this filter; its text is checked and rewritten per token.
         private readonly ICharTermAttribute termAtt;

         // Normalizer applied to every term.
         private readonly Normalizer2 normalizer;

         // Scratch space that receives the normalized text; emptied before each use.
         private readonly StringBuilder buffer = new StringBuilder();

         /// <summary>
         /// Creates an <see cref="ICUNormalizer2Filter"/> that uses the "nfkc_cf" normalizer
         /// in compose mode, i.e. NFKC normalization plus Case Folding plus removal of
         /// Default Ignorables (NFKC_Casefold).
         /// </summary>
         /// <param name="input">token stream whose terms are to be normalized</param>
         public ICUNormalizer2Filter(TokenStream input)
             : this(input, Normalizer2.GetInstance(null, "nfkc_cf", Normalizer2Mode.Compose))
         {
         }

         /// <summary>
         /// Creates an <see cref="ICUNormalizer2Filter"/> that applies the supplied
         /// <see cref="Normalizer2"/> to every term.
         /// </summary>
         /// <param name="input">token stream whose terms are to be normalized</param>
         /// <param name="normalizer">normalizer to apply</param>
         public ICUNormalizer2Filter(TokenStream input, Normalizer2 normalizer)
             : base(input)
         {
             this.normalizer = normalizer;
             this.termAtt = AddAttribute<ICharTermAttribute>();
         }

         /// <summary>
         /// Advances the wrapped stream by one token and, unless the normalizer's quick
         /// check answers <see cref="QuickCheckResult.Yes"/> for the term, replaces the
         /// term text with its normalized form.
         /// </summary>
         /// <returns>
         /// <c>true</c> when the wrapped stream produced a token; <c>false</c> when it did not.
         /// </returns>
         public sealed override bool IncrementToken()
         {
             // The wrapped stream has nothing more to offer.
             if (!m_input.IncrementToken())
             {
                 return false;
             }

             // Only a definite "Yes" lets the term through untouched; any other
             // quick-check answer triggers a full normalization pass.
             bool needsNormalization = normalizer.QuickCheck(termAtt) != QuickCheckResult.Yes;
             if (needsNormalization)
             {
                 buffer.Clear();
                 normalizer.Normalize(termAtt, buffer);
                 termAtt.SetEmpty().Append(buffer);
             }

             return true;
         }
     }
 }
	// Lucene version compatibility level 7.1.0
	using ICU4N.Text;
	using Lucene.Net.Analysis.TokenAttributes;
	using Lucene.Net.Support;
	using System.Text;

	namespace Lucene.Net.Analysis.Icu
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// A <see cref="TokenFilter"/> that rewrites the term text of each token into the
	/// form produced by an ICU <see cref="Normalizer2"/>.
	/// </summary>
	/// <remarks>
	/// The filter can normalize text in any of these ways:
	/// <list type="bullet">
	/// <item><description>NFKC normalization combined with case folding and removal of ignorables (the default)</description></item>
	/// <item><description>A standard normalization mode (NFC, NFD, NFKC, NFKD)</description></item>
	/// <item><description>Rules taken from a custom normalization mapping</description></item>
	/// </list>
	/// <para/>
	/// Used with its defaults, the filter offers a simple, language-independent way to
	/// standardize Unicode text for search:
	/// <list type="bullet">
	/// <item><description>
	/// Its case folding can stand in for LowerCaseFilter. For example, it handles
	/// cases such as the Greek sigma, so that "Μάϊος" and "ΜΆΪΟΣ" match correctly.
	/// </description></item>
	/// <item><description>
	/// The normalization standardizes different forms of the same character in
	/// Unicode. For example, CJK full-width numbers are standardized to their
	/// ASCII forms.
	/// </description></item>
	/// <item><description>
	/// Ignorables such as Zero-Width Joiner and Variation Selectors are removed.
	/// These are typically modifier characters that affect display.
	/// </description></item>
	/// </list>
	/// </remarks>
	/// <seealso cref="Normalizer2"/>
	/// <seealso cref="FilteredNormalizer2"/>
	[ExceptionToClassNameConvention]
	public class ICUNormalizer2Filter : TokenFilter
	{
	    // Term attribute registered on this filter; its text is checked and rewritten per token.
	    private readonly ICharTermAttribute termAtt;

	    // Normalizer applied to every term.
	    private readonly Normalizer2 normalizer;

	    // Scratch space that receives the normalized text; emptied before each use.
	    private readonly StringBuilder buffer = new StringBuilder();

	    /// <summary>
	    /// Creates an <see cref="ICUNormalizer2Filter"/> that uses the "nfkc_cf" normalizer
	    /// in compose mode, i.e. NFKC normalization plus Case Folding plus removal of
	    /// Default Ignorables (NFKC_Casefold).
	    /// </summary>
	    /// <param name="input">token stream whose terms are to be normalized</param>
	    public ICUNormalizer2Filter(TokenStream input)
	        : this(input, Normalizer2.GetInstance(null, "nfkc_cf", Normalizer2Mode.Compose))
	    {
	    }

	    /// <summary>
	    /// Creates an <see cref="ICUNormalizer2Filter"/> that applies the supplied
	    /// <see cref="Normalizer2"/> to every term.
	    /// </summary>
	    /// <param name="input">token stream whose terms are to be normalized</param>
	    /// <param name="normalizer">normalizer to apply</param>
	    public ICUNormalizer2Filter(TokenStream input, Normalizer2 normalizer)
	        : base(input)
	    {
	        this.normalizer = normalizer;
	        this.termAtt = AddAttribute<ICharTermAttribute>();
	    }

	    /// <summary>
	    /// Advances the wrapped stream by one token and, unless the normalizer's quick
	    /// check answers <see cref="QuickCheckResult.Yes"/> for the term, replaces the
	    /// term text with its normalized form.
	    /// </summary>
	    /// <returns>
	    /// <c>true</c> when the wrapped stream produced a token; <c>false</c> when it did not.
	    /// </returns>
	    public sealed override bool IncrementToken()
	    {
	        // The wrapped stream has nothing more to offer.
	        if (!m_input.IncrementToken())
	        {
	            return false;
	        }

	        // Only a definite "Yes" lets the term through untouched; any other
	        // quick-check answer triggers a full normalization pass.
	        bool needsNormalization = normalizer.QuickCheck(termAtt) != QuickCheckResult.Yes;
	        if (needsNormalization)
	        {
	            buffer.Clear();
	            normalizer.Normalize(termAtt, buffer);
	            termAtt.SetEmpty().Append(buffer);
	        }

	        return true;
	    }
	}
	}