src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CompositeBreakIterator.cs - lucenenet - Git at Google

 // Lucene version compatibility level 8.6.1
 using ICU4N;
 using ICU4N.Globalization;
 using ICU4N.Text;

 namespace Lucene.Net.Analysis.Icu.Segmentation
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// An internal <see cref="BreakIterator"/> for multilingual text, following recommendations
     /// from: UAX #29: Unicode Text Segmentation. (http://unicode.org/reports/tr29/)
     /// <para/>
     /// See http://unicode.org/reports/tr29/#Tailoring for the motivation of this
     /// design.
     /// <para/>
     /// Text is first divided into script boundaries. The processing is then
     /// delegated to the appropriate break iterator for that specific script.
     /// <para/>
     /// This break iterator also allows you to retrieve the ISO 15924 script code
     /// associated with a piece of text.
     /// <para/>
     /// See also UAX #29, UTR #24
     /// <para/>
     /// @lucene.experimental
     /// </summary>
     internal sealed class CompositeBreakIterator
     {
         // Supplies the per-script break iterators and the CombineCJ setting.
         private readonly ICUTokenizerConfig config;

         // Cache of wrapped break iterators indexed by script code; a slot stays null until first requested.
         private readonly BreakIteratorWrapper[] wordBreakers = new BreakIteratorWrapper[1 + UChar.GetIntPropertyMaxValue(UProperty.Script)];

         // Break iterator bound to the script run currently being examined.
         private BreakIteratorWrapper rbbi;

         // Walks the text one script run at a time.
         private readonly ScriptIterator scriptIterator;

         // Buffer passed to the most recent SetText call.
         private char[] text;

         /// <summary>
         /// Creates a composite iterator whose per-script break iterators are obtained from
         /// <paramref name="config"/>.
         /// </summary>
         /// <param name="config">Tokenizer configuration; its <c>CombineCJ</c> value is handed to the script iterator.</param>
         public CompositeBreakIterator(ICUTokenizerConfig config)
         {
             this.config = config;
             this.scriptIterator = new ScriptIterator(config.CombineCJ);
         }

         /// <summary>
         /// Advances to the next break. When the break iterator of the current script run is
         /// exhausted, moves on to the following script run(s) until a break is found or no
         /// script run remains.
         /// </summary>
         /// <returns>
         /// The break position reported by the underlying iterator plus the start of the current
         /// script run, or <see cref="BreakIterator.Done"/> when nothing is left.
         /// </returns>
         public int Next()
         {
             int position = rbbi.Next();
             while (position == BreakIterator.Done)
             {
                 if (!scriptIterator.Next())
                 {
                     // No further script runs: the whole text has been consumed.
                     return BreakIterator.Done;
                 }

                 BindBreakerToCurrentRun();
                 position = rbbi.Next();
             }

             // The underlying iterator's position is shifted by the start of the script run.
             return position + scriptIterator.ScriptStart;
         }

         /// <summary>
         /// Gets the current break position: the underlying iterator's current position plus the
         /// start of the current script run, or <see cref="BreakIterator.Done"/>.
         /// </summary>
         public int Current
         {
             get
             {
                 int position = rbbi.Current;
                 if (position == BreakIterator.Done)
                 {
                     return BreakIterator.Done;
                 }

                 return position + scriptIterator.ScriptStart;
             }
         }

         /// <summary>
         /// Gets the rule status code (token type) reported by the underlying break iterator.
         /// See <see cref="RuleBasedBreakIterator"/> constants.
         /// </summary>
         public int RuleStatus
         {
             get { return rbbi.RuleStatus; }
         }

         /// <summary>
         /// Gets the <see cref="UScript"/> script code of the current script run. The code can be
         /// decoded with <see cref="UScript"/> into a name or ISO 15924 code.
         /// </summary>
         public int ScriptCode
         {
             get { return scriptIterator.ScriptCode; }
         }

         /// <summary>
         /// Points this iterator at a new region of text and binds a break iterator to the
         /// first script run, if there is one.
         /// </summary>
         /// <param name="text">Buffer of text.</param>
         /// <param name="start">Offset into buffer.</param>
         /// <param name="length">Maximum length to examine.</param>
         public void SetText(char[] text, int start, int length)
         {
             this.text = text;
             scriptIterator.SetText(text, start, length);

             if (!scriptIterator.Next())
             {
                 // No script run was found: give the Common-script iterator an empty range.
                 rbbi = GetBreakIterator(UScript.Common);
                 rbbi.SetText(text, 0, 0);
                 return;
             }

             BindBreakerToCurrentRun();
         }

         /// <summary>
         /// Selects the break iterator for the script of the current script run and restricts it
         /// to that run's portion of the buffer.
         /// </summary>
         private void BindBreakerToCurrentRun()
         {
             rbbi = GetBreakIterator(scriptIterator.ScriptCode);

             int runStart = scriptIterator.ScriptStart;
             int runLength = scriptIterator.ScriptLimit - runStart;
             rbbi.SetText(text, runStart, runLength);
         }

         /// <summary>
         /// Returns the cached break iterator for <paramref name="scriptCode"/>, creating and
         /// caching it from the configuration on first use.
         /// </summary>
         /// <param name="scriptCode">Script code used as the cache index.</param>
         private BreakIteratorWrapper GetBreakIterator(int scriptCode)
         {
             BreakIteratorWrapper breaker = wordBreakers[scriptCode];
             if (breaker is null)
             {
                 breaker = new BreakIteratorWrapper(config.GetBreakIterator(scriptCode));
                 wordBreakers[scriptCode] = breaker;
             }

             return breaker;
         }
     }
 }
	// Lucene version compatibility level 8.6.1
	using ICU4N;
	using ICU4N.Globalization;
	using ICU4N.Text;

	namespace Lucene.Net.Analysis.Icu.Segmentation
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// An internal <see cref="BreakIterator"/> for multilingual text, following recommendations
	/// from: UAX #29: Unicode Text Segmentation. (http://unicode.org/reports/tr29/)
	/// <para/>
	/// See http://unicode.org/reports/tr29/#Tailoring for the motivation of this
	/// design.
	/// <para/>
	/// Text is first divided into script boundaries. The processing is then
	/// delegated to the appropriate break iterator for that specific script.
	/// <para/>
	/// This break iterator also allows you to retrieve the ISO 15924 script code
	/// associated with a piece of text.
	/// <para/>
	/// See also UAX #29, UTR #24
	/// <para/>
	/// @lucene.experimental
	/// </summary>
	internal sealed class CompositeBreakIterator
	{
		// Configuration that hands out a break iterator for a given script code.
		private readonly ICUTokenizerConfig config;

		// Per-script cache of wrapped break iterators, indexed by script code and filled on demand.
		private readonly BreakIteratorWrapper[] wordBreakers = new BreakIteratorWrapper[1 + UChar.GetIntPropertyMaxValue(UProperty.Script)];

		// Break iterator currently in use (the one for the current script run).
		private BreakIteratorWrapper rbbi;

		// Divides the text into script runs.
		private readonly ScriptIterator scriptIterator;

		// Text buffer supplied through SetText.
		private char[] text;

		/// <summary>
		/// Initializes the iterator with the configuration used to obtain per-script break iterators.
		/// </summary>
		/// <param name="config">Tokenizer configuration; <c>CombineCJ</c> is forwarded to the script iterator.</param>
		public CompositeBreakIterator(ICUTokenizerConfig config)
		{
			this.config = config;
			this.scriptIterator = new ScriptIterator(config.CombineCJ);
		}

		/// <summary>
		/// Retrieves the next break position. Once the break iterator of the current script run
		/// reports <see cref="BreakIterator.Done"/>, the following script run is examined.
		/// </summary>
		/// <returns>The next break position or <see cref="BreakIterator.Done"/>.</returns>
		public int Next()
		{
			for (int offset = rbbi.Next(); ; offset = rbbi.Next())
			{
				if (offset != BreakIterator.Done)
				{
					// Shift the underlying iterator's position by the start of the script run.
					return offset + scriptIterator.ScriptStart;
				}

				if (!scriptIterator.Next())
				{
					// Neither a break nor another script run is available.
					return BreakIterator.Done;
				}

				// Switch to the break iterator of the new script run before retrying.
				rbbi = GetBreakIterator(scriptIterator.ScriptCode);
				int runStart = scriptIterator.ScriptStart;
				rbbi.SetText(text, runStart, scriptIterator.ScriptLimit - runStart);
			}
		}

		/// <summary>
		/// Gets the current break position, or <see cref="BreakIterator.Done"/> when the underlying
		/// break iterator reports it.
		/// </summary>
		public int Current
		{
			get
			{
				int offset = rbbi.Current;
				if (offset != BreakIterator.Done)
				{
					offset += scriptIterator.ScriptStart;
				}

				return offset;
			}
		}

		/// <summary>
		/// Gets the rule status code (token type) from the underlying break iterator.
		/// See <see cref="RuleBasedBreakIterator"/> constants.
		/// </summary>
		public int RuleStatus
		{
			get { return rbbi.RuleStatus; }
		}

		/// <summary>
		/// Gets the <see cref="UScript"/> script code reported by the script iterator for the
		/// current run. It can be decoded with <see cref="UScript"/> into a name or ISO 15924 code.
		/// </summary>
		public int ScriptCode
		{
			get { return scriptIterator.ScriptCode; }
		}

		/// <summary>
		/// Sets a new region of text to be examined by this iterator.
		/// </summary>
		/// <param name="text">Buffer of text.</param>
		/// <param name="start">Offset into buffer.</param>
		/// <param name="length">Maximum length to examine.</param>
		public void SetText(char[] text, int start, int length)
		{
			this.text = text;
			scriptIterator.SetText(text, start, length);

			bool hasScriptRun = scriptIterator.Next();
			if (hasScriptRun)
			{
				// Bind the break iterator of the first script run to that run's range.
				rbbi = GetBreakIterator(scriptIterator.ScriptCode);
				int runStart = scriptIterator.ScriptStart;
				rbbi.SetText(text, runStart, scriptIterator.ScriptLimit - runStart);
			}
			else
			{
				// No script run found: use the Common-script iterator over an empty range.
				rbbi = GetBreakIterator(UScript.Common);
				rbbi.SetText(text, 0, 0);
			}
		}

		/// <summary>
		/// Looks up the break iterator for <paramref name="scriptCode"/> in the cache, creating it
		/// from the configuration and storing it when the slot is still empty.
		/// </summary>
		/// <param name="scriptCode">Script code used as the cache index.</param>
		private BreakIteratorWrapper GetBreakIterator(int scriptCode)
		{
			BreakIteratorWrapper cached = wordBreakers[scriptCode];
			if (cached is null)
			{
				cached = new BreakIteratorWrapper(config.GetBreakIterator(scriptCode));
				wordBreakers[scriptCode] = cached;
			}

			return cached;
		}
	}
	}