src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ScriptIterator.cs - lucenenet - Git at Google

 // Lucene version compatibility level 8.6.1
 using ICU4N;
 using ICU4N.Globalization;
 using ICU4N.Text;

 namespace Lucene.Net.Analysis.Icu.Segmentation
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// An iterator that locates ISO 15924 script boundaries in text.
     /// </summary>
     /// <remarks>
     /// This is not the same as simply looking at the Unicode block, or even the
     /// Script property. Some characters are 'common' across multiple scripts, and
     /// some 'inherit' the script value of text surrounding them.
     /// <para/>
     /// This is similar to ICU (internal-only) UScriptRun, with the following
     /// differences:
     /// <list type="bullet">
     ///     <item><description>
     ///         Doesn't attempt to match paired punctuation. For tokenization purposes, this
     ///         is not necessary. Its also quite expensive.
     ///     </description></item>
     ///     <item><description>
     ///         Non-spacing marks inherit the script of their base character, following
     ///         recommendations from UTR #24.
     ///     </description></item>
     /// </list>
     /// <para/>
     /// @lucene.experimental
     /// </remarks>
     internal sealed class ScriptIterator
     {
         private char[] text;
         private int start;
         private int limit;
         private int index;

         private int scriptStart;
         private int scriptLimit;
         private int scriptCode;

         private readonly bool combineCJ;

         /// <param name="combineCJ">if true: Han,Hiragana,Katakana will all return as <see cref="UScript.Japanese"/>.</param>
         internal ScriptIterator(bool combineCJ)
         {
             this.combineCJ = combineCJ;
         }

         /// <summary>
         /// Gets the start of this script run.
         /// </summary>
         public int ScriptStart => scriptStart;

         /// <summary>
         /// Get the index of the first character after the end of this script run.
         /// </summary>
         public int ScriptLimit => scriptLimit;

         /// <summary>
         /// Get the UScript script code for this script run.
         /// </summary>
         public int ScriptCode => scriptCode;

         /// <summary>
         /// Iterates to the next script run, returning true if one exists.
         /// </summary>
         /// <returns>true if there is another script run, false otherwise.</returns>
         public bool Next()
         {
             if (scriptLimit >= limit)
                 return false;

             scriptCode = UScript.Common;
             scriptStart = scriptLimit;

             while (index < limit)
             {
                 int ch = UTF16.CharAt(text, start, limit, index - start);
                 int sc = GetScript(ch);

                 /*
                  * From UTR #24: Implementations that determine the boundaries between
                  * characters of given scripts should never break between a non-spacing
                  * mark and its base character. Thus for boundary determinations and
                  * similar sorts of processing, a non-spacing mark — whatever its script
                  * value — should inherit the script value of its base character.
                  */
                 if (IsSameScript(scriptCode, sc)
                     || UChar.GetUnicodeCategory(ch) == UUnicodeCategory.NonSpacingMark)
                 {
                     index += UTF16.GetCharCount(ch);

                     /*
                      * Inherited or Common becomes the script code of the surrounding text.
                      */
                     if (scriptCode <= UScript.Inherited && sc > UScript.Inherited)
                     {
                         scriptCode = sc;
                     }

                 }
                 else
                 {
                     break;
                 }
             }

             scriptLimit = index;
             return true;
         }

         /// <summary>Determine if two scripts are compatible.</summary>
         private static bool IsSameScript(int scriptOne, int scriptTwo)
         {
             return scriptOne <= UScript.Inherited || scriptTwo <= UScript.Inherited
                 || scriptOne == scriptTwo;
         }

         /// <summary>
         /// Set a new region of text to be examined by this iterator.
         /// </summary>
         /// <param name="text">Text buffer to examine.</param>
         /// <param name="start">Offset into buffer.</param>
         /// <param name="length">Maximum length to examine.</param>
         public void SetText(char[] text, int start, int length)
         {
             this.text = text;
             this.start = start;
             this.index = start;
             this.limit = start + length;
             this.scriptStart = start;
             this.scriptLimit = start;
             this.scriptCode = UScript.InvalidCode;
         }

         /// <summary>Linear fast-path for basic latin case.</summary>
         private static readonly int[] basicLatin = LoadBasicLatin();

         private static int[] LoadBasicLatin() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
         {
             var basicLatin = new int[128];
             for (int i = 0; i < basicLatin.Length; i++)
                 basicLatin[i] = UScript.GetScript(i);
             return basicLatin;
         }

         /// <summary>Fast version of <see cref="UScript.GetScript(int)"/>. Basic Latin is an array lookup.</summary>
         private int GetScript(int codepoint)
         {
             if (0 <= codepoint && codepoint < basicLatin.Length)
             {
                 return basicLatin[codepoint];
             }
             else
             {
                 int script = UScript.GetScript(codepoint);
                 if (combineCJ)
                 {
                     if (script == UScript.Han || script == UScript.Hiragana || script == UScript.Katakana)
                     {
                         return UScript.Japanese;
                     }
                     else if (codepoint >= 0xFF10 && codepoint <= 0xFF19)
                     {
                         // when using CJK dictionary breaking, don't let full width numbers go to it, otherwise
                         // they are treated as punctuation. we currently have no cleaner way to fix this!
                         return UScript.Latin;
                     }
                     else
                     {
                         return script;
                     }
                 }
                 else
                 {
                     return script;
                 }
             }
         }
     }
 }
	// Lucene version compatibility level 8.6.1
	using ICU4N;
	using ICU4N.Globalization;
	using ICU4N.Text;

	namespace Lucene.Net.Analysis.Icu.Segmentation
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// An iterator that locates ISO 15924 script boundaries in text.
	/// </summary>
	/// <remarks>
	/// This is not the same as simply looking at the Unicode block, or even the
	/// Script property. Some characters are 'common' across multiple scripts, and
	/// some 'inherit' the script value of text surrounding them.
	/// <para/>
	/// This is similar to ICU (internal-only) UScriptRun, with the following
	/// differences:
	/// <list type="bullet">
	/// <item><description>
	/// Doesn't attempt to match paired punctuation. For tokenization purposes, this
	/// is not necessary. Its also quite expensive.
	/// </description></item>
	/// <item><description>
	/// Non-spacing marks inherit the script of their base character, following
	/// recommendations from UTR #24.
	/// </description></item>
	/// </list>
	/// <para/>
	/// @lucene.experimental
	/// </remarks>
	internal sealed class ScriptIterator
	{
	private char[] text;
	private int start;
	private int limit;
	private int index;

	private int scriptStart;
	private int scriptLimit;
	private int scriptCode;

	private readonly bool combineCJ;

	/// <param name="combineCJ">if true: Han,Hiragana,Katakana will all return as <see cref="UScript.Japanese"/>.</param>
	internal ScriptIterator(bool combineCJ)
	{
	this.combineCJ = combineCJ;
	}

	/// <summary>
	/// Gets the start of this script run.
	/// </summary>
	public int ScriptStart => scriptStart;

	/// <summary>
	/// Get the index of the first character after the end of this script run.
	/// </summary>
	public int ScriptLimit => scriptLimit;

	/// <summary>
	/// Get the UScript script code for this script run.
	/// </summary>
	public int ScriptCode => scriptCode;

	/// <summary>
	/// Iterates to the next script run, returning true if one exists.
	/// </summary>
	/// <returns>true if there is another script run, false otherwise.</returns>
	public bool Next()
	{
	if (scriptLimit >= limit)
	return false;

	scriptCode = UScript.Common;
	scriptStart = scriptLimit;

	while (index < limit)
	{
	int ch = UTF16.CharAt(text, start, limit, index - start);
	int sc = GetScript(ch);

	/*
	* From UTR #24: Implementations that determine the boundaries between
	* characters of given scripts should never break between a non-spacing
	* mark and its base character. Thus for boundary determinations and
	* similar sorts of processing, a non-spacing mark — whatever its script
	* value — should inherit the script value of its base character.
	*/
	if (IsSameScript(scriptCode, sc)
	\|\| UChar.GetUnicodeCategory(ch) == UUnicodeCategory.NonSpacingMark)
	{
	index += UTF16.GetCharCount(ch);

	/*
	* Inherited or Common becomes the script code of the surrounding text.
	*/
	if (scriptCode <= UScript.Inherited && sc > UScript.Inherited)
	{
	scriptCode = sc;
	}

	}
	else
	{
	break;
	}
	}

	scriptLimit = index;
	return true;
	}

	/// <summary>Determine if two scripts are compatible.</summary>
	private static bool IsSameScript(int scriptOne, int scriptTwo)
	{
	return scriptOne <= UScript.Inherited \|\| scriptTwo <= UScript.Inherited
	\|\| scriptOne == scriptTwo;
	}

	/// <summary>
	/// Set a new region of text to be examined by this iterator.
	/// </summary>
	/// <param name="text">Text buffer to examine.</param>
	/// <param name="start">Offset into buffer.</param>
	/// <param name="length">Maximum length to examine.</param>
	public void SetText(char[] text, int start, int length)
	{
	this.text = text;
	this.start = start;
	this.index = start;
	this.limit = start + length;
	this.scriptStart = start;
	this.scriptLimit = start;
	this.scriptCode = UScript.InvalidCode;
	}

	/// <summary>Linear fast-path for basic latin case.</summary>
	private static readonly int[] basicLatin = LoadBasicLatin();

	private static int[] LoadBasicLatin() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
	{
	var basicLatin = new int[128];
	for (int i = 0; i < basicLatin.Length; i++)
	basicLatin[i] = UScript.GetScript(i);
	return basicLatin;
	}

	/// <summary>Fast version of <see cref="UScript.GetScript(int)"/>. Basic Latin is an array lookup.</summary>
	private int GetScript(int codepoint)
	{
	if (0 <= codepoint && codepoint < basicLatin.Length)
	{
	return basicLatin[codepoint];
	}
	else
	{
	int script = UScript.GetScript(codepoint);
	if (combineCJ)
	{
	if (script == UScript.Han \|\| script == UScript.Hiragana \|\| script == UScript.Katakana)
	{
	return UScript.Japanese;
	}
	else if (codepoint >= 0xFF10 && codepoint <= 0xFF19)
	{
	// when using CJK dictionary breaking, don't let full width numbers go to it, otherwise
	// they are treated as punctuation. we currently have no cleaner way to fix this!
	return UScript.Latin;
	}
	else
	{
	return script;
	}
	}
	else
	{
	return script;
	}
	}
	}
	}
	}