blob: f628e81fa98d07927230b6ec27a02786df8343a9 [file] [log] [blame]
// Lucene version compatibility level 8.6.1
using ICU4N;
using ICU4N.Globalization;
using ICU4N.Text;
namespace Lucene.Net.Analysis.Icu.Segmentation
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// An internal <see cref="BreakIterator"/> for multilingual text that follows the
/// recommendations of UAX #29: Unicode Text Segmentation (http://unicode.org/reports/tr29/).
/// <para/>
/// The motivation for this design is described at http://unicode.org/reports/tr29/#Tailoring.
/// <para/>
/// The text is split at script boundaries first; each script run is then handed to
/// the break iterator that the configuration supplies for that script.
/// <para/>
/// The script code associated with the piece of text currently being examined can
/// also be retrieved from this iterator.
/// <para/>
/// See also UAX #29, UTR #24
/// <para/>
/// @lucene.experimental
/// </summary>
internal sealed class CompositeBreakIterator
{
    private readonly ICUTokenizerConfig config;

    // One slot per script code; a slot stays null until that script is first requested.
    private readonly BreakIteratorWrapper[] wordBreakers = new BreakIteratorWrapper[1 + UChar.GetIntPropertyMaxValue(UProperty.Script)];

    // Breaker bound to the script run currently being examined; assigned by SetText/Next.
    private BreakIteratorWrapper rbbi;
    private readonly ScriptIterator scriptIterator;
    private char[] text;

    /// <summary>
    /// Creates a composite iterator that obtains its per-script break iterators from
    /// <paramref name="config"/>.
    /// </summary>
    /// <param name="config">
    /// Tokenizer configuration; its <c>CombineCJ</c> value is passed to the script iterator
    /// and its <c>GetBreakIterator</c> method is used to create the per-script breakers.
    /// </param>
    public CompositeBreakIterator(ICUTokenizerConfig config)
    {
        this.config = config;
        this.scriptIterator = new ScriptIterator(config.CombineCJ);
    }

    /// <summary>
    /// Advances to the next break position. When the breaker for the current script run
    /// reports <see cref="BreakIterator.Done"/>, the following script run is examined.
    /// </summary>
    /// <returns>
    /// The next break position (the breaker's position plus the start of the script run),
    /// or <see cref="BreakIterator.Done"/> once no script runs remain.
    /// </returns>
    public int Next()
    {
        int boundary = rbbi.Next();
        while (boundary == BreakIterator.Done)
        {
            // The current run is exhausted: move on to the next run, if there is one.
            if (!scriptIterator.Next())
            {
                return BreakIterator.Done;
            }

            BindToCurrentScriptRun();
            boundary = rbbi.Next();
        }

        // The breaker reports positions relative to the run it was given.
        return boundary + scriptIterator.ScriptStart;
    }

    /// <summary>
    /// Gets the current break position, or <see cref="BreakIterator.Done"/> when the
    /// underlying breaker reports <see cref="BreakIterator.Done"/>.
    /// </summary>
    public int Current
    {
        get
        {
            int position = rbbi.Current;
            if (position == BreakIterator.Done)
            {
                return BreakIterator.Done;
            }

            return position + scriptIterator.ScriptStart;
        }
    }

    /// <summary>
    /// Gets the rule status code (token type) reported by the underlying break
    /// iterator. See <see cref="RuleBasedBreakIterator"/> constants.
    /// </summary>
    public int RuleStatus
    {
        get { return rbbi.RuleStatus; }
    }

    /// <summary>
    /// Gets the <see cref="UScript"/> script code for the current token. The code can be
    /// decoded with <see cref="UScript"/> into a name or ISO 15924 code.
    /// </summary>
    public int ScriptCode
    {
        get { return scriptIterator.ScriptCode; }
    }

    /// <summary>
    /// Sets a new region of text to be examined by this iterator.
    /// </summary>
    /// <param name="text">Buffer of text.</param>
    /// <param name="start">Offset into buffer.</param>
    /// <param name="length">Maximum length to examine.</param>
    public void SetText(char[] text, int start, int length)
    {
        this.text = text;
        scriptIterator.SetText(text, start, length);

        if (!scriptIterator.Next())
        {
            // No script run was found: bind the Common-script breaker to an empty range.
            rbbi = GetBreakIterator(UScript.Common);
            rbbi.SetText(text, 0, 0);
            return;
        }

        BindToCurrentScriptRun();
    }

    /// <summary>
    /// Selects the breaker for the script iterator's current script code and points it
    /// at the range <c>[ScriptStart, ScriptLimit)</c> of the stored text buffer.
    /// </summary>
    private void BindToCurrentScriptRun()
    {
        rbbi = GetBreakIterator(scriptIterator.ScriptCode);

        int runStart = scriptIterator.ScriptStart;
        int runLength = scriptIterator.ScriptLimit - runStart;
        rbbi.SetText(text, runStart, runLength);
    }

    /// <summary>
    /// Returns the cached breaker for <paramref name="scriptCode"/>, creating it from the
    /// configuration and storing it on first use.
    /// </summary>
    /// <param name="scriptCode">Script code used as the index into the breaker cache.</param>
    /// <returns>The wrapper around the configuration's break iterator for that script.</returns>
    private BreakIteratorWrapper GetBreakIterator(int scriptCode)
    {
        BreakIteratorWrapper breaker = wordBreakers[scriptCode];
        if (breaker is null)
        {
            breaker = new BreakIteratorWrapper(config.GetBreakIterator(scriptCode));
            wordBreakers[scriptCode] = breaker;
        }

        return breaker;
    }
}
}