src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUNormalizer2CharFilter.cs - lucenenet - Git at Google

 // Lucene version compatibility level < 7.1.0
 using J2N;
 using ICU4N.Text;
 using Lucene.Net.Analysis.CharFilters;
 using Lucene.Net.Analysis.Util;
 using Lucene.Net.Util;
 using System;
 using System.Diagnostics;
 using System.IO;
 using System.Text;
 using ExceptionToClassNameConventionAttribute = Lucene.Net.Support.ExceptionToClassNameConventionAttribute;
 using Lucene.Net.Diagnostics;

 namespace Lucene.Net.Analysis.Icu
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// Normalize token text with ICU's <see cref="Normalizer2"/>.
     /// </summary>
     [ExceptionToClassNameConvention]
     public sealed class ICUNormalizer2CharFilter : BaseCharFilter
     {
         private readonly Normalizer2 normalizer;
         private readonly StringBuilder inputBuffer = new StringBuilder();
         private readonly StringBuilder resultBuffer = new StringBuilder();

         private bool inputFinished;
         private bool afterQuickCheckYes;
         private int checkedInputBoundary;
         private int charCount;

         /// <summary>
         /// Create a new <see cref="ICUNormalizer2CharFilter"/> that combines NFKC normalization, Case
         /// Folding, and removes Default Ignorables (NFKC_Casefold).
         /// </summary>
         /// <param name="input"></param>
         public ICUNormalizer2CharFilter(TextReader input)
             : this(input, Normalizer2.GetInstance(null, "nfkc_cf", Normalizer2Mode.Compose))
         {
         }

         /// <summary>
         /// Create a new <see cref="ICUNormalizer2CharFilter"/> with the specified <see cref="Normalizer2"/>.
         /// </summary>
         /// <param name="input">Input text.</param>
         /// <param name="normalizer">Normalizer to use.</param>
         public ICUNormalizer2CharFilter(TextReader input, Normalizer2 normalizer)
             : this(input, normalizer, 128)
         {
             this.normalizer = normalizer ?? throw new ArgumentNullException(nameof(normalizer));
         }

         // for testing ONLY
         internal ICUNormalizer2CharFilter(TextReader input, Normalizer2 normalizer, int bufferSize)
             : base(input)
         {
             this.normalizer = normalizer ?? throw new ArgumentNullException(nameof(normalizer));
             this.tmpBuffer = CharacterUtils.NewCharacterBuffer(bufferSize);
         }

         public override int Read(char[] cbuf, int off, int len)
         {
             if (off < 0) throw new ArgumentException("off < 0");
             if (off >= cbuf.Length) throw new ArgumentException("off >= cbuf.length");
             if (len <= 0) throw new ArgumentException("len <= 0");

             while (!inputFinished || inputBuffer.Length > 0 || resultBuffer.Length > 0)
             {
                 int retLen;

                 if (resultBuffer.Length > 0)
                 {
                     retLen = OutputFromResultBuffer(cbuf, off, len);
                     if (retLen > 0)
                     {
                         return retLen;
                     }
                 }

                 int resLen = ReadAndNormalizeFromInput();
                 if (resLen > 0)
                 {
                     retLen = OutputFromResultBuffer(cbuf, off, len);
                     if (retLen > 0)
                     {
                         return retLen;
                     }
                 }

                 ReadInputToBuffer();
             }

             return 0; // .NET semantics - return 0, not -1
         }

         private readonly CharacterUtils.CharacterBuffer tmpBuffer;

         private void ReadInputToBuffer()
         {
             while (true)
             {
                 // CharacterUtils.fill is supplementary char aware
 #pragma warning disable 612, 618
                 bool hasRemainingChars = CharacterUtils.GetInstance(LuceneVersion.LUCENE_CURRENT).Fill(tmpBuffer, m_input);
 #pragma warning restore 612, 618

                 if (Debugging.AssertsEnabled) Debugging.Assert(tmpBuffer.Offset == 0);
                 inputBuffer.Append(tmpBuffer.Buffer, 0, tmpBuffer.Length);

                 if (hasRemainingChars == false)
                 {
                     inputFinished = true;
                     break;
                 }

                 int lastCodePoint = Character.CodePointBefore(tmpBuffer.Buffer, tmpBuffer.Length , 0);
                 if (normalizer.IsInert(lastCodePoint))
                 {
                     // we require an inert char so that we can normalize content before and
                     // after this character independently
                     break;
                 }
             }
         }

         private int ReadAndNormalizeFromInput()
         {
             if (inputBuffer.Length <= 0)
             {
                 afterQuickCheckYes = false;
                 return 0;
             }
             if (!afterQuickCheckYes)
             {
                 int resLen2 = ReadFromInputWhileSpanQuickCheckYes();
                 afterQuickCheckYes = true;
                 if (resLen2 > 0) return resLen2;
             }
             int resLen = ReadFromIoNormalizeUptoBoundary();
             if (resLen > 0)
             {
                 afterQuickCheckYes = false;
             }
             return resLen;
         }

         private int ReadFromInputWhileSpanQuickCheckYes()
         {
             int end = normalizer.SpanQuickCheckYes(inputBuffer);
             if (end > 0)
             {
                 resultBuffer.Append(inputBuffer.ToString(0, end));
                 inputBuffer.Remove(0, end);
                 checkedInputBoundary = Math.Max(checkedInputBoundary - end, 0);
                 charCount += end;
             }
             return end;
         }

         private int ReadFromIoNormalizeUptoBoundary()
         {
             // if there's no buffer to normalize, return 0
             if (inputBuffer.Length <= 0)
             {
                 return 0;
             }

             bool foundBoundary = false;
             int bufLen = inputBuffer.Length;

             while (checkedInputBoundary <= bufLen - 1)
             {
                 int charLen = Character.CharCount(inputBuffer.CodePointAt(checkedInputBoundary));
                 checkedInputBoundary += charLen;
                 if (checkedInputBoundary < bufLen && normalizer.HasBoundaryBefore(inputBuffer
                   .CodePointAt(checkedInputBoundary)))
                 {
                     foundBoundary = true;
                     break;
                 }
             }
             if (!foundBoundary && checkedInputBoundary >= bufLen && inputFinished)
             {
                 foundBoundary = true;
                 checkedInputBoundary = bufLen;
             }

             if (!foundBoundary)
             {
                 return 0;
             }

             return NormalizeInputUpto(checkedInputBoundary);
         }

         private int NormalizeInputUpto(int length)
         {
             int destOrigLen = resultBuffer.Length;
             normalizer.NormalizeSecondAndAppend(resultBuffer, inputBuffer.ToString(0, length));

             inputBuffer.Remove(0, length);
             checkedInputBoundary = Math.Max(checkedInputBoundary - length, 0);
             int resultLength = resultBuffer.Length - destOrigLen;
             RecordOffsetDiff(length, resultLength);
             return resultLength;
         }

         private void RecordOffsetDiff(int inputLength, int outputLength)
         {
             if (inputLength == outputLength)
             {
                 charCount += outputLength;
                 return;
             }
             int diff = inputLength - outputLength;
             int cumuDiff = LastCumulativeDiff;
             if (diff < 0)
             {
                 for (int i = 1; i <= -diff; ++i)
                 {
                     AddOffCorrectMap(charCount + i, cumuDiff - i);
                 }
             }
             else
             {
                 AddOffCorrectMap(charCount + outputLength, cumuDiff + diff);
             }
             charCount += outputLength;
         }

         private int OutputFromResultBuffer(char[] cbuf, int begin, int len)
         {
             len = Math.Min(resultBuffer.Length, len);
             resultBuffer.CopyTo(0, cbuf, begin, len);
             if (len > 0)
             {
                 resultBuffer.Remove(0, len);
             }
             return len;
         }
     }
 }
	// Lucene version compatibility level < 7.1.0
	using J2N;
	using ICU4N.Text;
	using Lucene.Net.Analysis.CharFilters;
	using Lucene.Net.Analysis.Util;
	using Lucene.Net.Util;
	using System;
	using System.Diagnostics;
	using System.IO;
	using System.Text;
	using ExceptionToClassNameConventionAttribute = Lucene.Net.Support.ExceptionToClassNameConventionAttribute;
	using Lucene.Net.Diagnostics;

	namespace Lucene.Net.Analysis.Icu
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// Normalize token text with ICU's <see cref="Normalizer2"/>.
	/// </summary>
	[ExceptionToClassNameConvention]
	public sealed class ICUNormalizer2CharFilter : BaseCharFilter
	{
	private readonly Normalizer2 normalizer;
	private readonly StringBuilder inputBuffer = new StringBuilder();
	private readonly StringBuilder resultBuffer = new StringBuilder();

	private bool inputFinished;
	private bool afterQuickCheckYes;
	private int checkedInputBoundary;
	private int charCount;

	/// <summary>
	/// Create a new <see cref="ICUNormalizer2CharFilter"/> that combines NFKC normalization, Case
	/// Folding, and removes Default Ignorables (NFKC_Casefold).
	/// </summary>
	/// <param name="input"></param>
	public ICUNormalizer2CharFilter(TextReader input)
	: this(input, Normalizer2.GetInstance(null, "nfkc_cf", Normalizer2Mode.Compose))
	{
	}

	/// <summary>
	/// Create a new <see cref="ICUNormalizer2CharFilter"/> with the specified <see cref="Normalizer2"/>.
	/// </summary>
	/// <param name="input">Input text.</param>
	/// <param name="normalizer">Normalizer to use.</param>
	public ICUNormalizer2CharFilter(TextReader input, Normalizer2 normalizer)
	: this(input, normalizer, 128)
	{
	this.normalizer = normalizer ?? throw new ArgumentNullException(nameof(normalizer));
	}

	// for testing ONLY
	internal ICUNormalizer2CharFilter(TextReader input, Normalizer2 normalizer, int bufferSize)
	: base(input)
	{
	this.normalizer = normalizer ?? throw new ArgumentNullException(nameof(normalizer));
	this.tmpBuffer = CharacterUtils.NewCharacterBuffer(bufferSize);
	}

	public override int Read(char[] cbuf, int off, int len)
	{
	if (off < 0) throw new ArgumentException("off < 0");
	if (off >= cbuf.Length) throw new ArgumentException("off >= cbuf.length");
	if (len <= 0) throw new ArgumentException("len <= 0");

	while (!inputFinished \|\| inputBuffer.Length > 0 \|\| resultBuffer.Length > 0)
	{
	int retLen;

	if (resultBuffer.Length > 0)
	{
	retLen = OutputFromResultBuffer(cbuf, off, len);
	if (retLen > 0)
	{
	return retLen;
	}
	}

	int resLen = ReadAndNormalizeFromInput();
	if (resLen > 0)
	{
	retLen = OutputFromResultBuffer(cbuf, off, len);
	if (retLen > 0)
	{
	return retLen;
	}
	}

	ReadInputToBuffer();
	}

	return 0; // .NET semantics - return 0, not -1
	}

	private readonly CharacterUtils.CharacterBuffer tmpBuffer;

	private void ReadInputToBuffer()
	{
	while (true)
	{
	// CharacterUtils.fill is supplementary char aware
	#pragma warning disable 612, 618
	bool hasRemainingChars = CharacterUtils.GetInstance(LuceneVersion.LUCENE_CURRENT).Fill(tmpBuffer, m_input);
	#pragma warning restore 612, 618

	if (Debugging.AssertsEnabled) Debugging.Assert(tmpBuffer.Offset == 0);
	inputBuffer.Append(tmpBuffer.Buffer, 0, tmpBuffer.Length);

	if (hasRemainingChars == false)
	{
	inputFinished = true;
	break;
	}

	int lastCodePoint = Character.CodePointBefore(tmpBuffer.Buffer, tmpBuffer.Length , 0);
	if (normalizer.IsInert(lastCodePoint))
	{
	// we require an inert char so that we can normalize content before and
	// after this character independently
	break;
	}
	}
	}

	private int ReadAndNormalizeFromInput()
	{
	if (inputBuffer.Length <= 0)
	{
	afterQuickCheckYes = false;
	return 0;
	}
	if (!afterQuickCheckYes)
	{
	int resLen2 = ReadFromInputWhileSpanQuickCheckYes();
	afterQuickCheckYes = true;
	if (resLen2 > 0) return resLen2;
	}
	int resLen = ReadFromIoNormalizeUptoBoundary();
	if (resLen > 0)
	{
	afterQuickCheckYes = false;
	}
	return resLen;
	}

	private int ReadFromInputWhileSpanQuickCheckYes()
	{
	int end = normalizer.SpanQuickCheckYes(inputBuffer);
	if (end > 0)
	{
	resultBuffer.Append(inputBuffer.ToString(0, end));
	inputBuffer.Remove(0, end);
	checkedInputBoundary = Math.Max(checkedInputBoundary - end, 0);
	charCount += end;
	}
	return end;
	}

	private int ReadFromIoNormalizeUptoBoundary()
	{
	// if there's no buffer to normalize, return 0
	if (inputBuffer.Length <= 0)
	{
	return 0;
	}

	bool foundBoundary = false;
	int bufLen = inputBuffer.Length;

	while (checkedInputBoundary <= bufLen - 1)
	{
	int charLen = Character.CharCount(inputBuffer.CodePointAt(checkedInputBoundary));
	checkedInputBoundary += charLen;
	if (checkedInputBoundary < bufLen && normalizer.HasBoundaryBefore(inputBuffer
	.CodePointAt(checkedInputBoundary)))
	{
	foundBoundary = true;
	break;
	}
	}
	if (!foundBoundary && checkedInputBoundary >= bufLen && inputFinished)
	{
	foundBoundary = true;
	checkedInputBoundary = bufLen;
	}

	if (!foundBoundary)
	{
	return 0;
	}

	return NormalizeInputUpto(checkedInputBoundary);
	}

	private int NormalizeInputUpto(int length)
	{
	int destOrigLen = resultBuffer.Length;
	normalizer.NormalizeSecondAndAppend(resultBuffer, inputBuffer.ToString(0, length));

	inputBuffer.Remove(0, length);
	checkedInputBoundary = Math.Max(checkedInputBoundary - length, 0);
	int resultLength = resultBuffer.Length - destOrigLen;
	RecordOffsetDiff(length, resultLength);
	return resultLength;
	}

	private void RecordOffsetDiff(int inputLength, int outputLength)
	{
	if (inputLength == outputLength)
	{
	charCount += outputLength;
	return;
	}
	int diff = inputLength - outputLength;
	int cumuDiff = LastCumulativeDiff;
	if (diff < 0)
	{
	for (int i = 1; i <= -diff; ++i)
	{
	AddOffCorrectMap(charCount + i, cumuDiff - i);
	}
	}
	else
	{
	AddOffCorrectMap(charCount + outputLength, cumuDiff + diff);
	}
	charCount += outputLength;
	}

	private int OutputFromResultBuffer(char[] cbuf, int begin, int len)
	{
	len = Math.Min(resultBuffer.Length, len);
	resultBuffer.CopyTo(0, cbuf, begin, len);
	if (len > 0)
	{
	resultBuffer.Remove(0, len);
	}
	return len;
	}
	}
	}