// blob: 68c1b84bf5550ced57f584071fcf416b9507bac6 [file] [log] [blame]
// Lucene version compatibility level 8.2.0
using Lucene.Net.Analysis.OpenNlp.Tools;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Analysis.Util;
using opennlp.tools.util;
using System;
using System.IO;
namespace Lucene.Net.Analysis.OpenNlp
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Run OpenNLP SentenceDetector and <see cref="Tokenizer"/>.
/// The last token in each sentence is marked by setting the <see cref="EOS_FLAG_BIT"/> in the <see cref="IFlagsAttribute"/>;
/// following filters can use this information to apply operations to tokens one sentence at a time.
/// </summary>
public sealed class OpenNLPTokenizer : SegmentingTokenizerBase
{
    /// <summary>
    /// Bit that is OR-ed into <see cref="IFlagsAttribute.Flags"/> on the last token of each sentence.
    /// Declared as a constant so that no caller can reassign it and change the end-of-sentence
    /// marker for every other user of this class.
    /// </summary>
    public const int EOS_FLAG_BIT = 1;

    private readonly ICharTermAttribute termAtt;
    private readonly IFlagsAttribute flagsAtt;
    private readonly IOffsetAttribute offsetAtt;

    // Token spans of the sentence currently being emitted; null until SetNextSentence has run.
    private Span[] termSpans;
    // Index into termSpans of the next token to emit.
    private int termNum;
    // Start of the current sentence within m_buffer; added to each span to index the buffer.
    private int sentenceStart;

    private readonly NLPTokenizerOp tokenizerOp;

    /// <summary>
    /// Creates a new <see cref="OpenNLPTokenizer"/> using <see cref="AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY"/>.
    /// </summary>
    /// <param name="reader">The input to tokenize.</param>
    /// <param name="sentenceOp">Sentence detector operation; must not be <c>null</c>.</param>
    /// <param name="tokenizerOp">Tokenizer operation; must not be <c>null</c>.</param>
    /// <exception cref="ArgumentException"><paramref name="sentenceOp"/> or <paramref name="tokenizerOp"/> is <c>null</c>.</exception>
    public OpenNLPTokenizer(TextReader reader, NLPSentenceDetectorOp sentenceOp, NLPTokenizerOp tokenizerOp) // LUCENENET 4.8.0 specific overload to default AttributeFactory
        : this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, reader, sentenceOp, tokenizerOp)
    {
    }

    /// <summary>
    /// Creates a new <see cref="OpenNLPTokenizer"/> with the supplied attribute factory.
    /// </summary>
    /// <param name="factory">Attribute factory passed through to the base class.</param>
    /// <param name="reader">The input to tokenize.</param>
    /// <param name="sentenceOp">Sentence detector operation; must not be <c>null</c>.</param>
    /// <param name="tokenizerOp">Tokenizer operation; must not be <c>null</c>.</param>
    /// <exception cref="ArgumentException"><paramref name="sentenceOp"/> or <paramref name="tokenizerOp"/> is <c>null</c>.</exception>
    public OpenNLPTokenizer(AttributeFactory factory, TextReader reader, NLPSentenceDetectorOp sentenceOp, NLPTokenizerOp tokenizerOp) // LUCENENET: Added reader param for compatibility with 4.8 - remove when upgrading
        : base(factory, reader, new OpenNLPSentenceBreakIterator(RequireOperations(sentenceOp, tokenizerOp)))
    {
        this.tokenizerOp = tokenizerOp;
        this.termAtt = AddAttribute<ICharTermAttribute>();
        this.flagsAtt = AddAttribute<IFlagsAttribute>();
        this.offsetAtt = AddAttribute<IOffsetAttribute>();
    }

    /// <summary>
    /// Validates both operations and returns <paramref name="sentenceOp"/>.
    /// Called from the constructor initializer so the check happens before the
    /// sentence break iterator is created and before the base constructor runs.
    /// </summary>
    private static NLPSentenceDetectorOp RequireOperations(NLPSentenceDetectorOp sentenceOp, NLPTokenizerOp tokenizerOp)
    {
        if (sentenceOp == null || tokenizerOp == null)
        {
            throw new ArgumentException("OpenNLPTokenizer: both a Sentence Detector and a Tokenizer are required");
        }
        return sentenceOp;
    }

    /// <summary>
    /// Disposes the base tokenizer and, when <paramref name="disposing"/> is <c>true</c>,
    /// drops the cached term spans and resets the per-sentence counters.
    /// </summary>
    protected override void Dispose(bool disposing)
    {
        base.Dispose(disposing);
        if (disposing)
        {
            termSpans = null;
            termNum = sentenceStart = 0;
        }
    }

    /// <summary>
    /// Receives the bounds of the next sentence, tokenizes the corresponding slice of
    /// <c>m_buffer</c> and rewinds the token cursor to the first term of that sentence.
    /// </summary>
    /// <param name="sentenceStart">Index in <c>m_buffer</c> where the sentence starts.</param>
    /// <param name="sentenceEnd">Index in <c>m_buffer</c> just past the end of the sentence.</param>
    protected override void SetNextSentence(int sentenceStart, int sentenceEnd)
    {
        this.sentenceStart = sentenceStart;
        string sentenceText = new string(m_buffer, sentenceStart, sentenceEnd - sentenceStart);
        termSpans = tokenizerOp.GetTerms(sentenceText);
        termNum = 0;
    }

    /// <summary>
    /// Emits the next token of the current sentence, filling in the term text, the offsets and,
    /// for the final token of the sentence, the <see cref="EOS_FLAG_BIT"/> flag.
    /// </summary>
    /// <returns><c>true</c> if a token was emitted; <c>false</c> when the current sentence has no more tokens.</returns>
    protected override bool IncrementWord()
    {
        if (termSpans == null || termNum == termSpans.Length)
        {
            return false;
        }

        ClearAttributes();
        Span term = termSpans[termNum];

        // Spans were computed on the sentence substring, so shift by sentenceStart to index m_buffer.
        int bufferStart = sentenceStart + term.getStart();
        int bufferEnd = sentenceStart + term.getEnd();

        termAtt.CopyBuffer(m_buffer, bufferStart, term.length());
        offsetAtt.SetOffset(CorrectOffset(m_offset + bufferStart),
                            CorrectOffset(m_offset + bufferEnd));

        if (termNum == termSpans.Length - 1)
        {
            flagsAtt.Flags |= EOS_FLAG_BIT; // mark the last token in the sentence with EOS_FLAG_BIT
        }

        ++termNum;
        return true;
    }

    /// <summary>
    /// Resets the base tokenizer and clears all per-sentence state so that no
    /// spans from a previous input are reused.
    /// </summary>
    public override void Reset()
    {
        base.Reset();
        termSpans = null;
        termNum = sentenceStart = 0;
    }
}
}