src/Lucene.Net.Analysis.OpenNLP/OpenNLPSentenceBreakIterator.cs - lucenenet - Git at Google

 // Lucene version compatibility level 8.2.0
 using ICU4N.Support.Text;
 using ICU4N.Text;
 using Lucene.Net.Analysis.OpenNlp.Tools;
 using Lucene.Net.Analysis.Util;
 using opennlp.tools.util;
 using System;
 using System.Diagnostics;
 using System.Text;

 namespace Lucene.Net.Analysis.OpenNlp
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// A <see cref="BreakIterator"/> that splits sentences using an OpenNLP sentence chunking model.
     /// </summary>
     /// <remarks>
     /// Sentence start offsets are computed once per <see cref="SetText(CharacterIterator)"/> call;
     /// the navigation members only move the index of the wrapped <see cref="CharacterIterator"/>
     /// between those offsets, its begin index and its end index.
     /// </remarks>
     public sealed class OpenNLPSentenceBreakIterator : BreakIterator
     {
         /// <summary>Text being iterated; its index is this iterator's current position.</summary>
         private CharacterIterator text;

         /// <summary>Index into <see cref="sentenceStarts"/> of the currently selected sentence.</summary>
         private int currentSentence;

         /// <summary>Start offset of each detected sentence, shifted by the begin index of <see cref="text"/>.</summary>
         private int[] sentenceStarts;

         /// <summary>Sentence detector invoked by <see cref="SetText(CharacterIterator)"/>; assigned once.</summary>
         private readonly NLPSentenceDetectorOp sentenceOp;

         /// <summary>
         /// Creates an iterator that splits text with <paramref name="sentenceOp"/>.
         /// </summary>
         /// <param name="sentenceOp">Sentence detector used each time new text is set.</param>
         public OpenNLPSentenceBreakIterator(NLPSentenceDetectorOp sentenceOp)
         {
             // No validation here; the reference is first dereferenced in SetText.
             this.sentenceOp = sentenceOp;
         }

         /// <summary>Gets the index of the underlying text, i.e. the most recently selected position.</summary>
         public override int Current => text.Index;

         /// <summary>
         /// Selects the first sentence and moves to the begin index of the text.
         /// </summary>
         /// <returns>The begin index of the text.</returns>
         public override int First()
         {
             currentSentence = 0;
             text.SetIndex(text.BeginIndex);
             return Current;
         }

         /// <summary>
         /// Selects the last sentence and moves to the end index of the text; when no sentence
         /// was detected, moves to the begin index instead.
         /// </summary>
         /// <returns>The position moved to.</returns>
         public override int Last()
         {
             if (sentenceStarts.Length > 0)
             {
                 currentSentence = sentenceStarts.Length - 1;
                 text.SetIndex(text.EndIndex);
             }
             else
             { // there are no sentences; both the first and last positions are the begin index
                 currentSentence = 0;
                 text.SetIndex(text.BeginIndex);
             }
             return Current;
         }

         /// <summary>
         /// Advances to the start of the next sentence, or to the end index once the last
         /// sentence is selected.
         /// </summary>
         /// <returns>
         /// The new position, or <c>Done</c> when already at the end index or when there are no sentences.
         /// </returns>
         public override int Next()
         {
             if (text.Index == text.EndIndex || 0 == sentenceStarts.Length)
             {
                 return Done;
             }
             else if (currentSentence < sentenceStarts.Length - 1)
             {
                 text.SetIndex(sentenceStarts[++currentSentence]);
                 return Current;
             }
             else
             {
                 return Last();
             }
         }

         /// <summary>
         /// Moves to the start of the sentence after the one that contains <paramref name="pos"/>.
         /// </summary>
         /// <param name="pos">Offset between the begin index and the end index of the text, inclusive.</param>
         /// <returns>
         /// The start of the following sentence, or <c>Done</c> when there are no sentences or
         /// <paramref name="pos"/> belongs to the last sentence.
         /// </returns>
         /// <exception cref="ArgumentOutOfRangeException"><paramref name="pos"/> lies outside the text.</exception>
         public override int Following(int pos)
         {
             if (pos < text.BeginIndex || pos > text.EndIndex)
             {
                 throw new ArgumentOutOfRangeException(nameof(pos), "offset out of bounds");
             }
             else if (0 == sentenceStarts.Length)
             {
                 text.SetIndex(text.BeginIndex);
                 return Done;
             }
             else if (1 == sentenceStarts.Length || pos >= sentenceStarts[sentenceStarts.Length - 1])
             {
                 // A single sentence is always the last one, even when pos precedes its start offset;
                 // without that check the search below would be handed an empty range.
                 // this conflicts with the javadocs, but matches actual behavior (Oracle has a bug in something)
                 // https://bugs.openjdk.java.net/browse/JDK-8015110
                 text.SetIndex(text.EndIndex);
                 currentSentence = sentenceStarts.Length - 1;
                 return Done;
             }
             else
             { // there are at least two sentences
                 currentSentence = (sentenceStarts.Length - 1) / 2; // start search from the middle
                 // Offsets in front of the first sentence start are searched as part of the first sentence.
                 MoveToSentenceAt(Math.Max(pos, sentenceStarts[0]), 0, sentenceStarts.Length - 2);
                 text.SetIndex(sentenceStarts[++currentSentence]);
                 return Current;
             }
         }

         /// <summary>
         /// Binary search over sentences: leaves <see cref="currentSentence"/> on the sentence in
         /// [<paramref name="minSentence"/>, <paramref name="maxSentence"/>] that contains <paramref name="pos"/>.
         /// </summary>
         /// <remarks>
         /// The caller seeds <see cref="currentSentence"/> with the first probe and must pass a
         /// <paramref name="pos"/> that is not before the start of <paramref name="minSentence"/>.
         /// </remarks>
         /// <param name="pos">Offset to locate.</param>
         /// <param name="minSentence">Lowest candidate sentence index.</param>
         /// <param name="maxSentence">Highest candidate sentence index.</param>
         private void MoveToSentenceAt(int pos, int minSentence, int maxSentence)
         {
             while (minSentence < maxSentence)
             {
                 if (pos < sentenceStarts[currentSentence])
                 {
                     // pos is in an earlier sentence: keep the lower part and probe its middle
                     maxSentence = currentSentence - 1;
                     currentSentence = minSentence + (currentSentence - minSentence) / 2;
                 }
                 else if (currentSentence + 1 < sentenceStarts.Length
                     && pos >= sentenceStarts[currentSentence + 1])
                 {
                     // pos is in a later sentence: keep the upper part and probe its middle.
                     // The bounds check matters when the probe is the very last sentence.
                     int newMinSentence = currentSentence + 1;
                     currentSentence = maxSentence - (maxSentence - currentSentence) / 2;
                     minSentence = newMinSentence;
                 }
                 else
                 {
                     // we have arrived - nothing to do
                     return;
                 }
             }

             Debug.Assert(currentSentence == minSentence);
             Debug.Assert(pos >= sentenceStarts[currentSentence]);
             Debug.Assert((currentSentence == sentenceStarts.Length - 1 && pos <= text.EndIndex)
                 || pos < sentenceStarts[currentSentence + 1]);
         }

         /// <summary>
         /// Moves back to the start of the selected sentence when positioned at the end index,
         /// otherwise to the start of the sentence before the selected one.
         /// </summary>
         /// <returns>
         /// The new position, or <c>Done</c> when already at the begin index, when there are no
         /// sentences, or when the first sentence is selected and nothing precedes it.
         /// </returns>
         public override int Previous()
         {
             if (text.Index == text.BeginIndex)
             {
                 return Done;
             }
             if (0 == sentenceStarts.Length)
             {
                 text.SetIndex(text.BeginIndex);
                 return Done;
             }

             if (text.Index == text.EndIndex)
             {
                 text.SetIndex(sentenceStarts[currentSentence]);
             }
             else if (currentSentence > 0)
             {
                 text.SetIndex(sentenceStarts[--currentSentence]);
             }
             else
             {
                 // Positioned on the first sentence whose start is not the begin index:
                 // no earlier sentence exists, so rewind instead of indexing at -1.
                 text.SetIndex(text.BeginIndex);
                 return Done;
             }
             return Current;
         }

         /// <summary>
         /// Moves to the start of the sentence before the one that contains <paramref name="pos"/>.
         /// </summary>
         /// <param name="pos">Offset between the begin index and the end index of the text, inclusive.</param>
         /// <returns>
         /// The start of the preceding sentence, or <c>Done</c> (with the position reset to the
         /// begin index) when there are no sentences or <paramref name="pos"/> is in or before the
         /// first sentence.
         /// </returns>
         /// <exception cref="ArgumentOutOfRangeException"><paramref name="pos"/> lies outside the text.</exception>
         public override int Preceding(int pos)
         {
             if (pos < text.BeginIndex || pos > text.EndIndex)
             {
                 throw new ArgumentOutOfRangeException(nameof(pos), "offset out of bounds");
             }
             else if (0 == sentenceStarts.Length)
             {
                 text.SetIndex(text.BeginIndex);
                 currentSentence = 0;
                 return Done;
             }
             else if (pos < sentenceStarts[0])
             {
                 // this conflicts with the javadocs, but matches actual behavior (Oracle has a bug in something)
                 // https://bugs.openjdk.java.net/browse/JDK-8015110
                 text.SetIndex(text.BeginIndex);
                 currentSentence = 0;
                 return Done;
             }
             else
             {
                 currentSentence = sentenceStarts.Length / 2; // start search from the middle
                 MoveToSentenceAt(pos, 0, sentenceStarts.Length - 1);
                 if (0 == currentSentence)
                 {
                     text.SetIndex(text.BeginIndex);
                     return Done;
                 }
                 else
                 {
                     text.SetIndex(sentenceStarts[--currentSentence]);
                     return Current;
                 }
             }
         }

         /// <summary>
         /// Moves <paramref name="n"/> sentences forward (positive) or backward (negative) from the
         /// selected sentence; zero leaves the position unchanged.
         /// </summary>
         /// <param name="n">Number of sentences to move by.</param>
         /// <returns>
         /// The new position, or <c>Done</c> when the target lies before the first sentence (the
         /// position is reset to the begin index) or past the last one (the position is set to the
         /// end index).
         /// </returns>
         public override int Next(int n)
         {
             // Computed as long so that extreme values of n cannot wrap around.
             long target = (long)currentSentence + n;
             if (n < 0)
             {
                 if (text.Index == text.EndIndex)
                 {
                     // at the end index the selected sentence itself is the first step back
                     ++target;
                 }
                 if (target < 0 || 0 == sentenceStarts.Length)
                 {
                     currentSentence = 0;
                     text.SetIndex(text.BeginIndex);
                     return Done;
                 }
                 currentSentence = (int)target;
                 text.SetIndex(sentenceStarts[currentSentence]);
             }
             else if (n > 0)
             {
                 if (target >= sentenceStarts.Length)
                 {
                     // never leave a negative sentence index behind when there are no sentences
                     currentSentence = Math.Max(0, sentenceStarts.Length - 1);
                     text.SetIndex(text.EndIndex);
                     return Done;
                 }
                 currentSentence = (int)target;
                 text.SetIndex(sentenceStarts[currentSentence]);
             }
             return Current;
         }

         /// <summary>Gets the text most recently passed to <see cref="SetText(CharacterIterator)"/>.</summary>
         public override CharacterIterator Text => text;

         /// <summary>
         /// Replaces the text, runs sentence detection over all of it and positions the iterator
         /// at the begin index with the first sentence selected.
         /// </summary>
         /// <param name="newText">Text to split into sentences.</param>
         /// <exception cref="ArgumentNullException"><paramref name="newText"/> is <c>null</c>.</exception>
         public override void SetText(CharacterIterator newText)
         {
             if (newText == null)
             {
                 // fail before any state is overwritten
                 throw new ArgumentNullException(nameof(newText));
             }

             text = newText;
             text.SetIndex(text.BeginIndex);
             currentSentence = 0;
             Span[] spans = sentenceOp.SplitSentences(CharacterIteratorToString());
             int beginIndex = text.BeginIndex;
             int[] starts = new int[spans.Length];
             for (int i = 0; i < spans.Length; ++i)
             {
                 // Adjust start positions to match those of the passed-in CharacterIterator
                 starts[i] = spans[i].getStart() + beginIndex;
             }
             sentenceStarts = starts;
         }

         /// <summary>
         /// Copies the characters of <see cref="text"/> into a string.
         /// </summary>
         /// <returns>The characters exposed by the current text.</returns>
         private string CharacterIteratorToString()
         {
             CharArrayIterator charArrayIterator = text as CharArrayIterator;
             if (charArrayIterator != null)
             {
                 // fast path: copy straight out of the backing array
                 return new string(charArrayIterator.Text, charArrayIterator.Start, charArrayIterator.Length);
             }

             // TODO: is there a better way to extract full text from arbitrary CharacterIterators?
             StringBuilder builder = new StringBuilder();
             for (char ch = text.First(); ch != CharacterIterator.Done; ch = text.Next())
             {
                 builder.Append(ch);
             }
             // the walk above moved the index; put it back at the beginning
             text.SetIndex(text.BeginIndex);
             return builder.ToString();
         }
     }
 }
	// Lucene version compatibility level 8.2.0
	using ICU4N.Support.Text;
	using ICU4N.Text;
	using Lucene.Net.Analysis.OpenNlp.Tools;
	using Lucene.Net.Analysis.Util;
	using opennlp.tools.util;
	using System;
	using System.Diagnostics;
	using System.Text;

	namespace Lucene.Net.Analysis.OpenNlp
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// A <see cref="BreakIterator"/> that splits sentences using an OpenNLP sentence chunking model.
	/// </summary>
	/// <remarks>
	/// Sentence start offsets are computed once per <see cref="SetText(CharacterIterator)"/> call;
	/// the navigation members only move the index of the wrapped <see cref="CharacterIterator"/>
	/// between those offsets, its begin index and its end index.
	/// </remarks>
	public sealed class OpenNLPSentenceBreakIterator : BreakIterator
	{
	    /// <summary>Text being iterated; its index is this iterator's current position.</summary>
	    private CharacterIterator text;

	    /// <summary>Index into <see cref="sentenceStarts"/> of the currently selected sentence.</summary>
	    private int currentSentence;

	    /// <summary>Start offset of each detected sentence, shifted by the begin index of <see cref="text"/>.</summary>
	    private int[] sentenceStarts;

	    /// <summary>Sentence detector invoked by <see cref="SetText(CharacterIterator)"/>; assigned once.</summary>
	    private readonly NLPSentenceDetectorOp sentenceOp;

	    /// <summary>
	    /// Creates an iterator that splits text with <paramref name="sentenceOp"/>.
	    /// </summary>
	    /// <param name="sentenceOp">Sentence detector used each time new text is set.</param>
	    public OpenNLPSentenceBreakIterator(NLPSentenceDetectorOp sentenceOp)
	    {
	        // No validation here; the reference is first dereferenced in SetText.
	        this.sentenceOp = sentenceOp;
	    }

	    /// <summary>Gets the index of the underlying text, i.e. the most recently selected position.</summary>
	    public override int Current => text.Index;

	    /// <summary>
	    /// Selects the first sentence and moves to the begin index of the text.
	    /// </summary>
	    /// <returns>The begin index of the text.</returns>
	    public override int First()
	    {
	        currentSentence = 0;
	        text.SetIndex(text.BeginIndex);
	        return Current;
	    }

	    /// <summary>
	    /// Selects the last sentence and moves to the end index of the text; when no sentence
	    /// was detected, moves to the begin index instead.
	    /// </summary>
	    /// <returns>The position moved to.</returns>
	    public override int Last()
	    {
	        if (sentenceStarts.Length > 0)
	        {
	            currentSentence = sentenceStarts.Length - 1;
	            text.SetIndex(text.EndIndex);
	        }
	        else
	        { // there are no sentences; both the first and last positions are the begin index
	            currentSentence = 0;
	            text.SetIndex(text.BeginIndex);
	        }
	        return Current;
	    }

	    /// <summary>
	    /// Advances to the start of the next sentence, or to the end index once the last
	    /// sentence is selected.
	    /// </summary>
	    /// <returns>
	    /// The new position, or <c>Done</c> when already at the end index or when there are no sentences.
	    /// </returns>
	    public override int Next()
	    {
	        if (text.Index == text.EndIndex || 0 == sentenceStarts.Length)
	        {
	            return Done;
	        }
	        else if (currentSentence < sentenceStarts.Length - 1)
	        {
	            text.SetIndex(sentenceStarts[++currentSentence]);
	            return Current;
	        }
	        else
	        {
	            return Last();
	        }
	    }

	    /// <summary>
	    /// Moves to the start of the sentence after the one that contains <paramref name="pos"/>.
	    /// </summary>
	    /// <param name="pos">Offset between the begin index and the end index of the text, inclusive.</param>
	    /// <returns>
	    /// The start of the following sentence, or <c>Done</c> when there are no sentences or
	    /// <paramref name="pos"/> belongs to the last sentence.
	    /// </returns>
	    /// <exception cref="ArgumentOutOfRangeException"><paramref name="pos"/> lies outside the text.</exception>
	    public override int Following(int pos)
	    {
	        if (pos < text.BeginIndex || pos > text.EndIndex)
	        {
	            throw new ArgumentOutOfRangeException(nameof(pos), "offset out of bounds");
	        }
	        else if (0 == sentenceStarts.Length)
	        {
	            text.SetIndex(text.BeginIndex);
	            return Done;
	        }
	        else if (1 == sentenceStarts.Length || pos >= sentenceStarts[sentenceStarts.Length - 1])
	        {
	            // A single sentence is always the last one, even when pos precedes its start offset;
	            // without that check the search below would be handed an empty range.
	            // this conflicts with the javadocs, but matches actual behavior (Oracle has a bug in something)
	            // https://bugs.openjdk.java.net/browse/JDK-8015110
	            text.SetIndex(text.EndIndex);
	            currentSentence = sentenceStarts.Length - 1;
	            return Done;
	        }
	        else
	        { // there are at least two sentences
	            currentSentence = (sentenceStarts.Length - 1) / 2; // start search from the middle
	            // Offsets in front of the first sentence start are searched as part of the first sentence.
	            MoveToSentenceAt(Math.Max(pos, sentenceStarts[0]), 0, sentenceStarts.Length - 2);
	            text.SetIndex(sentenceStarts[++currentSentence]);
	            return Current;
	        }
	    }

	    /// <summary>
	    /// Binary search over sentences: leaves <see cref="currentSentence"/> on the sentence in
	    /// [<paramref name="minSentence"/>, <paramref name="maxSentence"/>] that contains <paramref name="pos"/>.
	    /// </summary>
	    /// <remarks>
	    /// The caller seeds <see cref="currentSentence"/> with the first probe and must pass a
	    /// <paramref name="pos"/> that is not before the start of <paramref name="minSentence"/>.
	    /// </remarks>
	    /// <param name="pos">Offset to locate.</param>
	    /// <param name="minSentence">Lowest candidate sentence index.</param>
	    /// <param name="maxSentence">Highest candidate sentence index.</param>
	    private void MoveToSentenceAt(int pos, int minSentence, int maxSentence)
	    {
	        while (minSentence < maxSentence)
	        {
	            if (pos < sentenceStarts[currentSentence])
	            {
	                // pos is in an earlier sentence: keep the lower part and probe its middle
	                maxSentence = currentSentence - 1;
	                currentSentence = minSentence + (currentSentence - minSentence) / 2;
	            }
	            else if (currentSentence + 1 < sentenceStarts.Length
	                && pos >= sentenceStarts[currentSentence + 1])
	            {
	                // pos is in a later sentence: keep the upper part and probe its middle.
	                // The bounds check matters when the probe is the very last sentence.
	                int newMinSentence = currentSentence + 1;
	                currentSentence = maxSentence - (maxSentence - currentSentence) / 2;
	                minSentence = newMinSentence;
	            }
	            else
	            {
	                // we have arrived - nothing to do
	                return;
	            }
	        }

	        Debug.Assert(currentSentence == minSentence);
	        Debug.Assert(pos >= sentenceStarts[currentSentence]);
	        Debug.Assert((currentSentence == sentenceStarts.Length - 1 && pos <= text.EndIndex)
	            || pos < sentenceStarts[currentSentence + 1]);
	    }

	    /// <summary>
	    /// Moves back to the start of the selected sentence when positioned at the end index,
	    /// otherwise to the start of the sentence before the selected one.
	    /// </summary>
	    /// <returns>
	    /// The new position, or <c>Done</c> when already at the begin index, when there are no
	    /// sentences, or when the first sentence is selected and nothing precedes it.
	    /// </returns>
	    public override int Previous()
	    {
	        if (text.Index == text.BeginIndex)
	        {
	            return Done;
	        }
	        if (0 == sentenceStarts.Length)
	        {
	            text.SetIndex(text.BeginIndex);
	            return Done;
	        }

	        if (text.Index == text.EndIndex)
	        {
	            text.SetIndex(sentenceStarts[currentSentence]);
	        }
	        else if (currentSentence > 0)
	        {
	            text.SetIndex(sentenceStarts[--currentSentence]);
	        }
	        else
	        {
	            // Positioned on the first sentence whose start is not the begin index:
	            // no earlier sentence exists, so rewind instead of indexing at -1.
	            text.SetIndex(text.BeginIndex);
	            return Done;
	        }
	        return Current;
	    }

	    /// <summary>
	    /// Moves to the start of the sentence before the one that contains <paramref name="pos"/>.
	    /// </summary>
	    /// <param name="pos">Offset between the begin index and the end index of the text, inclusive.</param>
	    /// <returns>
	    /// The start of the preceding sentence, or <c>Done</c> (with the position reset to the
	    /// begin index) when there are no sentences or <paramref name="pos"/> is in or before the
	    /// first sentence.
	    /// </returns>
	    /// <exception cref="ArgumentOutOfRangeException"><paramref name="pos"/> lies outside the text.</exception>
	    public override int Preceding(int pos)
	    {
	        if (pos < text.BeginIndex || pos > text.EndIndex)
	        {
	            throw new ArgumentOutOfRangeException(nameof(pos), "offset out of bounds");
	        }
	        else if (0 == sentenceStarts.Length)
	        {
	            text.SetIndex(text.BeginIndex);
	            currentSentence = 0;
	            return Done;
	        }
	        else if (pos < sentenceStarts[0])
	        {
	            // this conflicts with the javadocs, but matches actual behavior (Oracle has a bug in something)
	            // https://bugs.openjdk.java.net/browse/JDK-8015110
	            text.SetIndex(text.BeginIndex);
	            currentSentence = 0;
	            return Done;
	        }
	        else
	        {
	            currentSentence = sentenceStarts.Length / 2; // start search from the middle
	            MoveToSentenceAt(pos, 0, sentenceStarts.Length - 1);
	            if (0 == currentSentence)
	            {
	                text.SetIndex(text.BeginIndex);
	                return Done;
	            }
	            else
	            {
	                text.SetIndex(sentenceStarts[--currentSentence]);
	                return Current;
	            }
	        }
	    }

	    /// <summary>
	    /// Moves <paramref name="n"/> sentences forward (positive) or backward (negative) from the
	    /// selected sentence; zero leaves the position unchanged.
	    /// </summary>
	    /// <param name="n">Number of sentences to move by.</param>
	    /// <returns>
	    /// The new position, or <c>Done</c> when the target lies before the first sentence (the
	    /// position is reset to the begin index) or past the last one (the position is set to the
	    /// end index).
	    /// </returns>
	    public override int Next(int n)
	    {
	        // Computed as long so that extreme values of n cannot wrap around.
	        long target = (long)currentSentence + n;
	        if (n < 0)
	        {
	            if (text.Index == text.EndIndex)
	            {
	                // at the end index the selected sentence itself is the first step back
	                ++target;
	            }
	            if (target < 0 || 0 == sentenceStarts.Length)
	            {
	                currentSentence = 0;
	                text.SetIndex(text.BeginIndex);
	                return Done;
	            }
	            currentSentence = (int)target;
	            text.SetIndex(sentenceStarts[currentSentence]);
	        }
	        else if (n > 0)
	        {
	            if (target >= sentenceStarts.Length)
	            {
	                // never leave a negative sentence index behind when there are no sentences
	                currentSentence = Math.Max(0, sentenceStarts.Length - 1);
	                text.SetIndex(text.EndIndex);
	                return Done;
	            }
	            currentSentence = (int)target;
	            text.SetIndex(sentenceStarts[currentSentence]);
	        }
	        return Current;
	    }

	    /// <summary>Gets the text most recently passed to <see cref="SetText(CharacterIterator)"/>.</summary>
	    public override CharacterIterator Text => text;

	    /// <summary>
	    /// Replaces the text, runs sentence detection over all of it and positions the iterator
	    /// at the begin index with the first sentence selected.
	    /// </summary>
	    /// <param name="newText">Text to split into sentences.</param>
	    /// <exception cref="ArgumentNullException"><paramref name="newText"/> is <c>null</c>.</exception>
	    public override void SetText(CharacterIterator newText)
	    {
	        if (newText == null)
	        {
	            // fail before any state is overwritten
	            throw new ArgumentNullException(nameof(newText));
	        }

	        text = newText;
	        text.SetIndex(text.BeginIndex);
	        currentSentence = 0;
	        Span[] spans = sentenceOp.SplitSentences(CharacterIteratorToString());
	        int beginIndex = text.BeginIndex;
	        int[] starts = new int[spans.Length];
	        for (int i = 0; i < spans.Length; ++i)
	        {
	            // Adjust start positions to match those of the passed-in CharacterIterator
	            starts[i] = spans[i].getStart() + beginIndex;
	        }
	        sentenceStarts = starts;
	    }

	    /// <summary>
	    /// Copies the characters of <see cref="text"/> into a string.
	    /// </summary>
	    /// <returns>The characters exposed by the current text.</returns>
	    private string CharacterIteratorToString()
	    {
	        CharArrayIterator charArrayIterator = text as CharArrayIterator;
	        if (charArrayIterator != null)
	        {
	            // fast path: copy straight out of the backing array
	            return new string(charArrayIterator.Text, charArrayIterator.Start, charArrayIterator.Length);
	        }

	        // TODO: is there a better way to extract full text from arbitrary CharacterIterators?
	        StringBuilder builder = new StringBuilder();
	        for (char ch = text.First(); ch != CharacterIterator.Done; ch = text.Next())
	        {
	            builder.Append(ch);
	        }
	        // the walk above moved the index; put it back at the beginning
	        text.SetIndex(text.BeginIndex);
	        return builder.ToString();
	    }
	}
	}