src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/BreakIteratorWrapper.cs - lucenenet - Git at Google

 // Lucene version compatibility level 8.6.1
 using ICU4N.Text;
 using J2N;

 namespace Lucene.Net.Analysis.Icu.Segmentation
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// Reusable wrapper around a <see cref="RuleBasedBreakIterator"/> that additionally
     /// reports a dedicated rule status for segments recognized as emoji sequences.
     /// <para/>
     /// @lucene.experimental
     /// </summary>
     internal sealed class BreakIteratorWrapper
     {
         // Character iterator handed to the wrapped break iterator on every SetText call.
         private readonly CharArrayIterator textIterator = new CharArrayIterator();
         private readonly RuleBasedBreakIterator rbbi;
         // Buffer and offset most recently supplied through SetText.
         private char[] text;
         private int start;
         // Rule status computed by the most recent Next() call (or reset by SetText).
         private int status;

         /// <summary>Creates a wrapper that delegates segmentation to <paramref name="rbbi"/>.</summary>
         /// <param name="rbbi">The break iterator to wrap.</param>
         internal BreakIteratorWrapper(RuleBasedBreakIterator rbbi) => this.rbbi = rbbi;

         /// <summary>Gets the current position as reported by the wrapped break iterator.</summary>
         public int Current
         {
             get { return rbbi.Current; }
         }

         /// <summary>Gets the status stored by the last <see cref="Next()"/> or <see cref="SetText(char[], int, int)"/> call.</summary>
         public int RuleStatus
         {
             get { return status; }
         }

         /// <summary>
         /// Advances the wrapped break iterator and records the rule status of the
         /// segment that was just traversed.
         /// </summary>
         /// <returns>The value returned by the wrapped iterator's <c>Next()</c>.</returns>
         public int Next()
         {
             int previousBoundary = rbbi.Current;
             int nextBoundary = rbbi.Next();
             status = CalcStatus(previousBoundary, nextBoundary);
             return nextBoundary;
         }

         /// <summary>
         /// Computes the rule status (which determines the token type) for the text between two breaks.
         /// </summary>
         /// <param name="current">Position of the wrapped iterator before advancing.</param>
         /// <param name="next">Position returned by advancing, or <see cref="BreakIterator.Done"/>.</param>
         /// <returns>
         /// <see cref="ICUTokenizerConfig.EMOJI_SEQUENCE_STATUS"/> when the segment is classified as emoji;
         /// otherwise the wrapped iterator's own rule status.
         /// </returns>
         private int CalcStatus(int current, int next)
         {
             // Presentation selectors mean alphanum, num and none all have to be handled,
             // so narrowing the check by rule status is currently not worth optimizing.
             // https://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5B%3AEmoji%3A%5D-%5B%3AEmoji_Presentation%3A%5D&g=Word_Break&i=
             bool emojiSegment = next != BreakIterator.Done && IsEmoji(current, next);
             return emojiSegment ? ICUTokenizerConfig.EMOJI_SEQUENCE_STATUS : rbbi.RuleStatus;
         }

         // Rationale is in unicode doc L2/16-315.
         // These are the ambiguous cases (keycap/etc) as far as token types are concerned.
         internal static readonly UnicodeSet EMOJI_RK = new UnicodeSet("[\u002a\u00230-9©®™〰〽]").Freeze();
         // A frozen set is faster than hasBinaryProperty() checks, at the cost of 1KB ram.
         //internal static readonly UnicodeSet EMOJI = new UnicodeSet("[[:Emoji:][:Extended_Pictographic:]]").Freeze(); // LUCENENET: Extended_Pictographic wasn't added until ICU 62
         internal static readonly UnicodeSet EMOJI = new UnicodeSet("[[:Emoji:]]").Freeze();

         /// <summary>
         /// Decides whether the segment between two break positions is an emoji character or sequence.
         /// </summary>
         /// <param name="current">Segment start, as an iterator position (offset by the stored start index).</param>
         /// <param name="next">Segment end, as an iterator position (offset by the stored start index).</param>
         /// <returns><c>true</c> if the segment is treated as emoji; otherwise <c>false</c>.</returns>
         private bool IsEmoji(int current, int next)
         {
             int begin = start + current;
             int end = start + next;
             int codepoint = UTF16.CharAt(text, 0, end, begin);

             if (!EMOJI.Contains(codepoint))
             {
                 return false;
             }

             if (!EMOJI_RK.Contains(codepoint))
             {
                 return true;
             }

             // A member of EMOJI_RK only counts as emoji when there is evidence of an emoji sequence:
             // the code unit right after it (still inside the segment) must be an emoji
             // presentation selector (U+FE0F) or a keycap (U+20E3).
             int trailer = begin + Character.CharCount(codepoint);
             if (trailer >= end)
             {
                 return false;
             }

             char follower = text[trailer];
             return follower == 0xFE0F || follower == 0x20E3;
         }

         /// <summary>
         /// Points the wrapper at a new region of text and resets the stored rule status.
         /// </summary>
         /// <param name="text">Character buffer to segment.</param>
         /// <param name="start">Index in <paramref name="text"/> at which the region begins.</param>
         /// <param name="length">Length of the region, passed through to the character iterator.</param>
         public void SetText(char[] text, int start, int length)
         {
             // Remember buffer and offset: IsEmoji reads the buffer directly.
             this.text = text;
             this.start = start;

             textIterator.SetText(text, start, length);
             rbbi.SetText(textIterator);

             status = RuleBasedBreakIterator.WordNone;
         }
     }
 }
	// Lucene version compatibility level 8.6.1
	using ICU4N.Text;
	using J2N;

	namespace Lucene.Net.Analysis.Icu.Segmentation
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// Wraps <see cref="RuleBasedBreakIterator"/>, making object reuse convenient and
	/// emitting a rule status for emoji sequences.
	/// <para/>
	/// @lucene.experimental
	/// </summary>
	internal sealed class BreakIteratorWrapper
	{
		// Character iterator that is re-pointed at the caller's buffer on each SetText call.
		private readonly CharArrayIterator textIterator = new CharArrayIterator();
		private readonly RuleBasedBreakIterator rbbi;
		// Buffer and start offset most recently passed to SetText; read by IsEmoji.
		private char[] text;
		private int start;
		// Status computed by the last Next() call, or WordNone right after SetText.
		private int status;

		/// <summary>Creates a wrapper around the given break iterator.</summary>
		/// <param name="rbbi">The break iterator that performs the actual segmentation.</param>
		internal BreakIteratorWrapper(RuleBasedBreakIterator rbbi)
		{
			this.rbbi = rbbi;
		}

		/// <summary>Gets the current position of the wrapped break iterator.</summary>
		public int Current => rbbi.Current;

		/// <summary>Gets the rule status stored by the most recent <see cref="Next()"/> or <see cref="SetText(char[], int, int)"/> call.</summary>
		public int RuleStatus => status;

		/// <summary>
		/// Advances the wrapped break iterator and stores the rule status for the segment it just passed.
		/// </summary>
		/// <returns>The value returned by the wrapped iterator's <c>Next()</c>.</returns>
		public int Next()
		{
			int current = rbbi.Current;
			int next = rbbi.Next();
			status = CalcStatus(current, next);
			return next;
		}

		/// <summary>Returns current rule status for the text between breaks. (determines token type)</summary>
		/// <param name="current">Position of the wrapped iterator before it advanced.</param>
		/// <param name="next">Position after advancing, or <see cref="BreakIterator.Done"/>.</param>
		/// <returns>
		/// <see cref="ICUTokenizerConfig.EMOJI_SEQUENCE_STATUS"/> if the segment is classified as emoji;
		/// otherwise the rule status of the wrapped iterator.
		/// </returns>
		private int CalcStatus(int current, int next)
		{
			// to support presentation selectors, we need to handle alphanum, num, and none at least, so currently not worth optimizing.
			// https://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5B%3AEmoji%3A%5D-%5B%3AEmoji_Presentation%3A%5D&g=Word_Break&i=
			if (next != BreakIterator.Done && IsEmoji(current, next))
			{
				return ICUTokenizerConfig.EMOJI_SEQUENCE_STATUS;
			}
			else
			{
				return rbbi.RuleStatus;
			}
		}

		// See unicode doc L2/16-315 for rationale.
		// basically for us the ambiguous cases (keycap/etc) as far as types go.
		internal static readonly UnicodeSet EMOJI_RK = new UnicodeSet("[\u002a\u00230-9©®™〰〽]").Freeze();
		// faster than doing hasBinaryProperty() checks, at the cost of 1KB ram
		//internal static readonly UnicodeSet EMOJI = new UnicodeSet("[[:Emoji:][:Extended_Pictographic:]]").Freeze(); // LUCENENET: Extended_Pictographic wasn't added until ICU 62
		internal static readonly UnicodeSet EMOJI = new UnicodeSet("[[:Emoji:]]").Freeze();

		/// <summary>Returns <c>true</c> if the current text represents emoji character or sequence.</summary>
		/// <param name="current">Segment start as an iterator position; the stored start offset is added to index the buffer.</param>
		/// <param name="next">Segment end as an iterator position; the stored start offset is added to index the buffer.</param>
		private bool IsEmoji(int current, int next)
		{
			int begin = start + current;
			int end = start + next;
			int codepoint = UTF16.CharAt(text, 0, end, begin);
			if (EMOJI.Contains(codepoint))
			{
				if (EMOJI_RK.Contains(codepoint))
				{
					// if its in EmojiRK, we don't treat it as emoji unless there is evidence it forms emoji sequence,
					// an emoji presentation selector or keycap follows.
					int trailer = begin + Character.CharCount(codepoint);
					// The trailer must still lie inside the segment before the buffer is read.
					return trailer < end && (text[trailer] == 0xFE0F || text[trailer] == 0x20E3);
				}
				else
				{
					return true;
				}
			}
			return false;
		}

		/// <summary>
		/// Sets the text region to segment and resets the stored rule status to
		/// <see cref="RuleBasedBreakIterator.WordNone"/>.
		/// </summary>
		/// <param name="text">Character buffer to segment.</param>
		/// <param name="start">Index in <paramref name="text"/> where the region begins.</param>
		/// <param name="length">Length of the region, passed through to the character iterator.</param>
		public void SetText(char[] text, int start, int length)
		{
			this.text = text;
			this.start = start;
			textIterator.SetText(text, start, length);
			rbbi.SetText(textIterator);
			status = RuleBasedBreakIterator.WordNone;
		}
	}
	}