blob: 99c9b07e47eb47d6f4aca9934231b6089af2b5da [file] [log] [blame]
// Lucene version compatibility level 8.2.0
using ICU4N.Support.Text;
using ICU4N.Text;
using Lucene.Net.Analysis.OpenNlp.Tools;
using Lucene.Net.Analysis.Util;
using opennlp.tools.util;
using System;
using System.Diagnostics;
using System.Text;
namespace Lucene.Net.Analysis.OpenNlp
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// A <see cref="BreakIterator"/> that splits sentences using an OpenNLP sentence chunking model.
/// </summary>
public sealed class OpenNLPSentenceBreakIterator : BreakIterator
{
/// <summary>
/// The text being iterated over. Its index is what <see cref="Current"/> reports.
/// Remains <c>null</c> until <see cref="SetText(CharacterIterator)"/> is called.
/// </summary>
private CharacterIterator text;

/// <summary>Index into <see cref="sentenceStarts"/> of the sentence the iterator is positioned in.</summary>
private int currentSentence;

/// <summary>
/// Start offset of every detected sentence, shifted by the begin index of <see cref="text"/>.
/// Populated by <see cref="SetText(CharacterIterator)"/>.
/// </summary>
private int[] sentenceStarts;

/// <summary>Sentence detector used to split the text; assigned once in the constructor.</summary>
private readonly NLPSentenceDetectorOp sentenceOp;

/// <summary>
/// Creates a break iterator that delegates sentence detection to <paramref name="sentenceOp"/>.
/// <see cref="SetText(CharacterIterator)"/> must be called before any navigation method,
/// because the text and the sentence start table are only initialized there.
/// </summary>
/// <param name="sentenceOp">The sentence detector used to split the text into sentences.</param>
public OpenNLPSentenceBreakIterator(NLPSentenceDetectorOp sentenceOp)
{
    this.sentenceOp = sentenceOp;
}
/// <summary>
/// Gets the current position of this iterator, which is the index of the underlying text iterator.
/// </summary>
public override int Current
{
    get { return text.Index; }
}
/// <summary>
/// Selects the first sentence and moves the text index to the begin index of the text.
/// </summary>
/// <returns>The resulting <see cref="Current"/> position.</returns>
public override int First()
{
    currentSentence = 0;

    int begin = text.BeginIndex;
    text.SetIndex(begin);

    return text.Index;
}
/// <summary>
/// Selects the last sentence and moves the text index to the end index of the text.
/// When no sentences were detected, the first and the last positions coincide, so the
/// text index is moved to the begin index instead.
/// </summary>
/// <returns>The resulting <see cref="Current"/> position.</returns>
public override int Last()
{
    bool hasSentences = sentenceStarts.Length > 0;

    currentSentence = hasSentences ? sentenceStarts.Length - 1 : 0;
    text.SetIndex(hasSentences ? text.EndIndex : text.BeginIndex);

    return Current;
}
/// <summary>
/// Advances to the start of the next sentence, or to the end of the text when the
/// iterator is already positioned in the last sentence.
/// </summary>
/// <returns>
/// The new position, or <c>Done</c> when the text index is already at the end of the
/// text or there are no sentences.
/// </returns>
public override int Next()
{
    bool atEnd = text.Index == text.EndIndex;
    if (atEnd || sentenceStarts.Length == 0)
    {
        return Done;
    }

    // From the last sentence the only remaining boundary is the end of the text.
    if (currentSentence >= sentenceStarts.Length - 1)
    {
        return Last();
    }

    currentSentence++;
    text.SetIndex(sentenceStarts[currentSentence]);
    return Current;
}
/// <summary>
/// Moves to the first sentence start that lies after <paramref name="pos"/>.
/// </summary>
/// <param name="pos">An offset between the begin index and the end index of the text, inclusive.</param>
/// <returns>
/// The start of the sentence following the one that contains <paramref name="pos"/>;
/// the start of the first sentence when <paramref name="pos"/> lies before it; or
/// <c>Done</c> when there are no sentences or <paramref name="pos"/> is at or beyond
/// the start of the last sentence.
/// </returns>
/// <exception cref="ArgumentException">
/// <paramref name="pos"/> is outside the range covered by the text.
/// </exception>
public override int Following(int pos)
{
    if (pos < text.BeginIndex || pos > text.EndIndex)
    {
        throw new ArgumentException("offset out of bounds");
    }
    else if (0 == sentenceStarts.Length)
    {
        text.SetIndex(text.BeginIndex);
        return Done;
    }
    else if (pos >= sentenceStarts[sentenceStarts.Length - 1])
    {
        // this conflicts with the javadocs, but matches actual behavior (Oracle has a bug in something)
        // https://bugs.openjdk.java.net/browse/JDK-8015110
        text.SetIndex(text.EndIndex);
        currentSentence = sentenceStarts.Length - 1;
        return Done;
    }
    else if (pos < sentenceStarts[0])
    {
        // pos lies before the first sentence start (Preceding has a branch for the same
        // situation). The first sentence start is then the boundary that follows pos.
        // Without this branch a single-sentence table would reach
        // MoveToSentenceAt(pos, 0, -1), which never terminates, and a longer table would
        // skip the first sentence start.
        currentSentence = 0;
        text.SetIndex(sentenceStarts[0]);
        return Current;
    }
    else
    {
        // first start <= pos < last start, so there are at least two sentences
        currentSentence = (sentenceStarts.Length - 1) / 2; // start search from the middle
        // The last sentence is excluded from the search range: pos is known to precede it.
        MoveToSentenceAt(pos, 0, sentenceStarts.Length - 2);
        text.SetIndex(sentenceStarts[++currentSentence]);
        return Current;
    }
}
/// <summary>
/// Binary search over sentences: moves <see cref="currentSentence"/> to the sentence
/// containing <paramref name="pos"/>, searching between <paramref name="minSentence"/>
/// and <paramref name="maxSentence"/> (both inclusive).
/// </summary>
/// <remarks>
/// The caller must set <see cref="currentSentence"/> to the initial probe index
/// before calling. The search is recursive; each step halves the distance to the bound
/// it moves towards.
/// </remarks>
/// <param name="pos">The text offset to locate.</param>
/// <param name="minSentence">Lowest sentence index still under consideration.</param>
/// <param name="maxSentence">Highest sentence index still under consideration.</param>
private void MoveToSentenceAt(int pos, int minSentence, int maxSentence)
{
    // "<" rather than "!=": an empty range (min > max) must terminate the search
    // instead of recursing with unchanged arguments.
    if (minSentence < maxSentence)
    {
        if (pos < sentenceStarts[currentSentence])
        {
            int newMaxSentence = currentSentence - 1;
            currentSentence = minSentence + (currentSentence - minSentence) / 2;
            MoveToSentenceAt(pos, minSentence, newMaxSentence);
        }
        else if (currentSentence + 1 < sentenceStarts.Length
            && pos >= sentenceStarts[currentSentence + 1])
        {
            // The bounds check matters when the probe is already the last sentence:
            // Preceding() starts at Length / 2 with maxSentence == Length - 1, which for
            // exactly two sentences is index 1, and index 2 does not exist.
            int newMinSentence = currentSentence + 1;
            currentSentence = maxSentence - (maxSentence - currentSentence) / 2;
            MoveToSentenceAt(pos, newMinSentence, maxSentence);
        }
    }
    else
    {
        Debug.Assert(currentSentence == minSentence);
        Debug.Assert(pos >= sentenceStarts[currentSentence]);
        Debug.Assert((currentSentence == sentenceStarts.Length - 1 && pos <= text.EndIndex)
            || pos < sentenceStarts[currentSentence + 1]);
    }
    // we have arrived - nothing to do
}
/// <summary>
/// Moves back to the previous boundary: from the end of the text to the start of the
/// current sentence, otherwise to the start of the sentence before the current one.
/// </summary>
/// <returns>
/// The new position, or <c>Done</c> when the text index is already at the begin index,
/// when there are no sentences, or when no earlier sentence start exists.
/// </returns>
public override int Previous()
{
    if (text.Index == text.BeginIndex)
    {
        return Done;
    }

    if (0 == sentenceStarts.Length)
    {
        text.SetIndex(text.BeginIndex);
        return Done;
    }

    if (text.Index == text.EndIndex)
    {
        // The end of the text is a boundary of its own, so the sentence index does not
        // change; only the position moves back to that sentence's start.
        text.SetIndex(sentenceStarts[currentSentence]);
    }
    else if (currentSentence > 0)
    {
        text.SetIndex(sentenceStarts[--currentSentence]);
    }
    else
    {
        // Positioned in the first sentence but not at the begin index, so there is no
        // earlier sentence start to move to. Without this guard the decrement above would
        // read index -1. Same outcome as Preceding() uses when nothing precedes.
        currentSentence = 0;
        text.SetIndex(text.BeginIndex);
        return Done;
    }

    return Current;
}
/// <summary>
/// Locates the sentence containing <paramref name="pos"/> and moves to the start of the
/// sentence before it.
/// </summary>
/// <param name="pos">An offset between the begin index and the end index of the text, inclusive.</param>
/// <returns>
/// The start of the sentence before the one containing <paramref name="pos"/>, or
/// <c>Done</c> (with the text index moved to the begin index) when there is no such sentence.
/// </returns>
/// <exception cref="ArgumentException">
/// <paramref name="pos"/> is outside the range covered by the text.
/// </exception>
public override int Preceding(int pos)
{
    bool outOfBounds = pos < text.BeginIndex || pos > text.EndIndex;
    if (outOfBounds)
    {
        throw new ArgumentException("offset out of bounds");
    }

    // Nothing can precede pos when there are no sentences at all, or when pos lies before
    // the first sentence start. The latter conflicts with the javadocs, but matches actual
    // behavior (Oracle has a bug in something):
    // https://bugs.openjdk.java.net/browse/JDK-8015110
    if (sentenceStarts.Length == 0 || pos < sentenceStarts[0])
    {
        text.SetIndex(text.BeginIndex);
        currentSentence = 0;
        return Done;
    }

    // Start the search from the middle, considering every sentence.
    currentSentence = sentenceStarts.Length / 2;
    MoveToSentenceAt(pos, 0, sentenceStarts.Length - 1);

    if (currentSentence != 0)
    {
        currentSentence--;
        text.SetIndex(sentenceStarts[currentSentence]);
        return Current;
    }

    // pos is inside the first sentence: there is no earlier sentence to report.
    text.SetIndex(text.BeginIndex);
    return Done;
}
/// <summary>
/// Moves <paramref name="n"/> sentences forward (positive) or backward (negative)
/// relative to the current sentence.
/// </summary>
/// <param name="n">
/// Number of sentences to move. Zero leaves the iterator where it is.
/// </param>
/// <returns>
/// The new position, or <c>Done</c> when the move would leave the range of detected
/// sentences. In that case the text index is moved to the begin index (backward) or to
/// the end index (forward).
/// </returns>
public override int Next(int n)
{
    if (n == 0)
    {
        return Current;
    }

    // Computed in 64 bits so that extreme values of n cannot wrap around into an index
    // that passes the range checks below.
    long target = (long)currentSentence + n;

    if (n < 0)
    {
        if (text.Index == text.EndIndex)
        {
            // The end of the text counts as one boundary past the start of the current
            // sentence, so one backward step only returns to that start.
            ++target;
        }

        // The empty-table check covers the case where target is 0 but there is no
        // sentence to move to, e.g. Next(-1) on empty text.
        if (target < 0 || sentenceStarts.Length == 0)
        {
            currentSentence = 0;
            text.SetIndex(text.BeginIndex);
            return Done;
        }
    }
    else if (target >= sentenceStarts.Length)
    {
        currentSentence = sentenceStarts.Length - 1;
        text.SetIndex(text.EndIndex);
        return Done;
    }

    currentSentence = (int)target;
    text.SetIndex(sentenceStarts[currentSentence]);
    return Current;
}
/// <summary>
/// Gets the character iterator most recently passed to <see cref="SetText(CharacterIterator)"/>.
/// </summary>
public override CharacterIterator Text
{
    get { return text; }
}
/// <summary>
/// Attaches a new text to this iterator, rewinds it to the begin index and runs the
/// sentence detector over the full text to rebuild the table of sentence starts.
/// </summary>
/// <param name="newText">The text to iterate over.</param>
public override void SetText(CharacterIterator newText)
{
    text = newText;
    text.SetIndex(text.BeginIndex);
    currentSentence = 0;

    string content = CharacterIteratorToString();
    Span[] detected = sentenceOp.SplitSentences(content);

    sentenceStarts = new int[detected.Length];

    // Span offsets are relative to the extracted string; shifting them by the begin index
    // expresses them in the coordinates of the passed-in CharacterIterator.
    int offset = text.BeginIndex;
    int slot = 0;
    foreach (Span sentence in detected)
    {
        sentenceStarts[slot] = sentence.getStart() + offset;
        slot++;
    }
}
/// <summary>
/// Extracts the complete content of the current text as a string.
/// </summary>
/// <remarks>
/// A <see cref="CharArrayIterator"/> is copied directly from its array, start and length.
/// Any other iterator is walked character by character and then rewound to its begin index.
/// </remarks>
/// <returns>The full text covered by the current character iterator.</returns>
private string CharacterIteratorToString()
{
    CharArrayIterator arrayBacked = text as CharArrayIterator;
    if (arrayBacked != null)
    {
        return new string(arrayBacked.Text, arrayBacked.Start, arrayBacked.Length);
    }

    // TODO: is there a better way to extract full text from arbitrary CharacterIterators?
    StringBuilder buffer = new StringBuilder();
    char c = text.First();
    while (c != CharacterIterator.Done)
    {
        buffer.Append(c);
        c = text.Next();
    }

    string fullText = buffer.ToString();
    // Walking the iterator moved its index; put it back at the beginning.
    text.SetIndex(text.BeginIndex);
    return fullText;
}
}
}