|  | // Lucene version compatibility level 8.6.1 | 
|  | using ICU4N; | 
|  | using ICU4N.Globalization; | 
|  | using ICU4N.Text; | 
|  |  | 
|  | namespace Lucene.Net.Analysis.Icu.Segmentation | 
|  | { | 
|  | /* | 
|  | * Licensed to the Apache Software Foundation (ASF) under one or more | 
|  | * contributor license agreements.  See the NOTICE file distributed with | 
|  | * this work for additional information regarding copyright ownership. | 
|  | * The ASF licenses this file to You under the Apache License, Version 2.0 | 
|  | * (the "License"); you may not use this file except in compliance with | 
|  | * the License.  You may obtain a copy of the License at | 
|  | * | 
|  | *     http://www.apache.org/licenses/LICENSE-2.0 | 
|  | * | 
|  | * Unless required by applicable law or agreed to in writing, software | 
|  | * distributed under the License is distributed on an "AS IS" BASIS, | 
|  | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
|  | * See the License for the specific language governing permissions and | 
|  | * limitations under the License. | 
|  | */ | 
|  |  | 
|  | /// <summary> | 
|  | /// An internal <see cref="BreakIterator"/> for multilingual text, following recommendations | 
|  | /// from: UAX #29: Unicode Text Segmentation. (http://unicode.org/reports/tr29/) | 
|  | /// <para/> | 
|  | /// See http://unicode.org/reports/tr29/#Tailoring for the motivation of this | 
|  | /// design. | 
|  | /// <para/> | 
|  | /// Text is first divided into script boundaries. The processing is then | 
|  | /// delegated to the appropriate break iterator for that specific script. | 
|  | /// <para/> | 
|  | /// This break iterator also allows you to retrieve the ISO 15924 script code | 
|  | /// associated with a piece of text. | 
|  | /// <para/> | 
|  | /// See also UAX #29, UTR #24 | 
|  | /// <para/> | 
|  | /// @lucene.experimental | 
|  | /// </summary> | 
|  | internal sealed class CompositeBreakIterator | 
|  | { | 
|  | private readonly ICUTokenizerConfig config; | 
|  | private readonly BreakIteratorWrapper[] wordBreakers = new BreakIteratorWrapper[1 + UChar.GetIntPropertyMaxValue(UProperty.Script)]; | 
|  |  | 
|  | private BreakIteratorWrapper rbbi; | 
|  | private readonly ScriptIterator scriptIterator; | 
|  |  | 
|  | private char[] text; | 
|  |  | 
|  | public CompositeBreakIterator(ICUTokenizerConfig config) | 
|  | { | 
|  | this.config = config; | 
|  | this.scriptIterator = new ScriptIterator(config.CombineCJ); | 
|  | } | 
|  |  | 
|  | /// <summary> | 
|  | /// Retrieve the next break position. If the RBBI range is exhausted within the | 
|  | /// script boundary, examine the next script boundary. | 
|  | /// </summary> | 
|  | /// <returns>The next break position or <see cref="BreakIterator.Done"/>.</returns> | 
|  | public int Next() | 
|  | { | 
|  | int next = rbbi.Next(); | 
|  | while (next == BreakIterator.Done && scriptIterator.Next()) | 
|  | { | 
|  | rbbi = GetBreakIterator(scriptIterator.ScriptCode); | 
|  | rbbi.SetText(text, scriptIterator.ScriptStart, | 
|  | scriptIterator.ScriptLimit - scriptIterator.ScriptStart); | 
|  | next = rbbi.Next(); | 
|  | } | 
|  | return (next == BreakIterator.Done) ? BreakIterator.Done : next | 
|  | + scriptIterator.ScriptStart; | 
|  | } | 
|  |  | 
|  | /// <summary> | 
|  | /// Gets the current break position. Returns the current break position or <see cref="BreakIterator.Done"/>. | 
|  | /// </summary> | 
|  | public int Current | 
|  | { | 
|  | get | 
|  | { | 
|  | int current = rbbi.Current; | 
|  | return (current == BreakIterator.Done) ? BreakIterator.Done : current | 
|  | + scriptIterator.ScriptStart; | 
|  | } | 
|  | } | 
|  |  | 
|  | /// <summary> | 
|  | /// Gets the rule status code (token type) from the underlying break | 
|  | /// iterator. See <see cref="RuleBasedBreakIterator"/> constants. | 
|  | /// </summary> | 
|  | public int RuleStatus => rbbi.RuleStatus; | 
|  |  | 
|  | /// <summary> | 
|  | /// Gets the <see cref="UScript"/> script code for the current token. This code can be | 
|  | /// decoded with <see cref="UScript"/> into a name or ISO 15924 code. | 
|  | /// </summary> | 
|  | public int ScriptCode => scriptIterator.ScriptCode; | 
|  |  | 
|  | /// <summary> | 
|  | /// Set a new region of text to be examined by this iterator. | 
|  | /// </summary> | 
|  | /// <param name="text">Buffer of text.</param> | 
|  | /// <param name="start">Offset into buffer.</param> | 
|  | /// <param name="length">Maximum length to examine.</param> | 
|  | public void SetText(char[] text, int start, int length) | 
|  | { | 
|  | this.text = text; | 
|  | scriptIterator.SetText(text, start, length); | 
|  | if (scriptIterator.Next()) | 
|  | { | 
|  | rbbi = GetBreakIterator(scriptIterator.ScriptCode); | 
|  | rbbi.SetText(text, scriptIterator.ScriptStart, | 
|  | scriptIterator.ScriptLimit - scriptIterator.ScriptStart); | 
|  | } | 
|  | else | 
|  | { | 
|  | rbbi = GetBreakIterator(UScript.Common); | 
|  | rbbi.SetText(text, 0, 0); | 
|  | } | 
|  | } | 
|  |  | 
|  | private BreakIteratorWrapper GetBreakIterator(int scriptCode) | 
|  | { | 
|  | if (wordBreakers[scriptCode] is null) | 
|  | wordBreakers[scriptCode] = new BreakIteratorWrapper(config.GetBreakIterator(scriptCode)); | 
|  | return wordBreakers[scriptCode]; | 
|  | } | 
|  | } | 
|  | } |