| // Lucene version compatibility level 8.2.0 |
| using Lucene.Net.Analysis.OpenNlp.Tools; |
| using Lucene.Net.Analysis.TokenAttributes; |
| using Lucene.Net.Analysis.Util; |
| using opennlp.tools.util; |
| using System; |
| using System.IO; |
| |
| namespace Lucene.Net.Analysis.OpenNlp |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
/// <summary>
/// Run OpenNLP SentenceDetector and <see cref="Tokenizer"/>.
/// The last token in each sentence is marked by setting the <see cref="EOS_FLAG_BIT"/> in the <see cref="IFlagsAttribute"/>;
/// following filters can use this information to apply operations to tokens one sentence at a time.
/// </summary>
public sealed class OpenNLPTokenizer : SegmentingTokenizerBase
{
    /// <summary>
    /// Flag bit OR-ed into <see cref="IFlagsAttribute.Flags"/> on the last token of each sentence.
    /// </summary>
    // Deliberately read-only: this value is a shared protocol between the tokenizer and any
    // code that inspects the flags, so it must not be reassignable at runtime.
    // Kept as a static field (rather than const) so existing readers stay binary-compatible.
    public static readonly int EOS_FLAG_BIT = 1;

    private readonly ICharTermAttribute termAtt;
    private readonly IFlagsAttribute flagsAtt;
    private readonly IOffsetAttribute offsetAtt;

    // Token spans of the sentence currently being emitted; null when no sentence is loaded.
    private Span[] termSpans;
    // Index into termSpans of the next term to emit.
    private int termNum;
    // Start of the current sentence, as an index into m_buffer.
    private int sentenceStart;

    // Stored at construction; within this class the detector is only used to build the
    // OpenNLPSentenceBreakIterator handed to the base constructor.
    private readonly NLPSentenceDetectorOp sentenceOp;
    private readonly NLPTokenizerOp tokenizerOp;

    /// <summary>
    /// Creates a new <see cref="OpenNLPTokenizer"/> using
    /// <see cref="AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY"/>.
    /// </summary>
    /// <param name="reader">The input text.</param>
    /// <param name="sentenceOp">The sentence detector; must not be <c>null</c>.</param>
    /// <param name="tokenizerOp">The tokenizer applied to each sentence; must not be <c>null</c>.</param>
    /// <exception cref="ArgumentException">
    /// <paramref name="sentenceOp"/> or <paramref name="tokenizerOp"/> is <c>null</c>.
    /// </exception>
    public OpenNLPTokenizer(TextReader reader, NLPSentenceDetectorOp sentenceOp, NLPTokenizerOp tokenizerOp) // LUCENENET 4.8.0 specific overload to default AttributeFactory
        : this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, reader, sentenceOp, tokenizerOp)
    {
    }

    /// <summary>
    /// Creates a new <see cref="OpenNLPTokenizer"/> with an explicit attribute factory.
    /// </summary>
    /// <param name="factory">The attribute factory passed to the base class.</param>
    /// <param name="reader">The input text.</param>
    /// <param name="sentenceOp">The sentence detector; must not be <c>null</c>.</param>
    /// <param name="tokenizerOp">The tokenizer applied to each sentence; must not be <c>null</c>.</param>
    /// <exception cref="ArgumentException">
    /// <paramref name="sentenceOp"/> or <paramref name="tokenizerOp"/> is <c>null</c>.
    /// </exception>
    public OpenNLPTokenizer(AttributeFactory factory, TextReader reader, NLPSentenceDetectorOp sentenceOp, NLPTokenizerOp tokenizerOp) // LUCENENET: Added reader param for compatibility with 4.8 - remove when upgrading
        : base(factory, reader, new OpenNLPSentenceBreakIterator(sentenceOp))
    {
        // NOTE: the base-constructor initializer above runs before this check, so a null
        // sentenceOp reaches OpenNLPSentenceBreakIterator first.
        // NOTE(review): confirm that constructor tolerates null; it is not visible from here.
        if (sentenceOp is null || tokenizerOp is null)
        {
            throw new ArgumentException("OpenNLPTokenizer: both a Sentence Detector and a Tokenizer are required");
        }

        this.sentenceOp = sentenceOp;
        this.tokenizerOp = tokenizerOp;
        termAtt = AddAttribute<ICharTermAttribute>();
        flagsAtt = AddAttribute<IFlagsAttribute>();
        offsetAtt = AddAttribute<IOffsetAttribute>();
    }

    /// <summary>
    /// Disposes the base tokenizer and, when <paramref name="disposing"/> is <c>true</c>,
    /// drops the cached term spans and resets the per-sentence counters.
    /// </summary>
    /// <param name="disposing"><c>true</c> when called from an explicit dispose.</param>
    protected override void Dispose(bool disposing)
    {
        base.Dispose(disposing);
        if (disposing)
        {
            ClearSentenceState();
        }
    }

    /// <summary>
    /// Loads the sentence occupying <c>m_buffer[sentenceStart .. sentenceEnd)</c>: remembers where
    /// it starts, tokenizes its text with the configured tokenizer op, and rewinds the term cursor.
    /// </summary>
    /// <param name="sentenceStart">Index in <c>m_buffer</c> of the first character of the sentence.</param>
    /// <param name="sentenceEnd">Index in <c>m_buffer</c> one past the last character of the sentence.</param>
    protected override void SetNextSentence(int sentenceStart, int sentenceEnd)
    {
        this.sentenceStart = sentenceStart;
        string sentenceText = new string(m_buffer, sentenceStart, sentenceEnd - sentenceStart);
        termSpans = tokenizerOp.GetTerms(sentenceText);
        termNum = 0;
    }

    /// <summary>
    /// Emits the next term of the current sentence into the term, offset and flags attributes.
    /// </summary>
    /// <returns>
    /// <c>true</c> if a term was emitted; <c>false</c> if no sentence is loaded or every term
    /// of the current sentence has already been emitted.
    /// </returns>
    protected override bool IncrementWord()
    {
        if (termSpans is null || termNum == termSpans.Length)
        {
            return false;
        }

        ClearAttributes();

        Span term = termSpans[termNum];
        // Span positions are relative to the sentence text, so shift them by the sentence start
        // to index into m_buffer.
        int termStart = sentenceStart + term.getStart();
        int termEnd = sentenceStart + term.getEnd();

        termAtt.CopyBuffer(m_buffer, termStart, term.length());
        offsetAtt.SetOffset(CorrectOffset(m_offset + termStart), CorrectOffset(m_offset + termEnd));

        if (termNum == termSpans.Length - 1)
        {
            flagsAtt.Flags |= EOS_FLAG_BIT; // mark the last token in the sentence with EOS_FLAG_BIT
        }

        ++termNum;
        return true;
    }

    /// <summary>
    /// Resets the base tokenizer, then discards any cached term spans and per-sentence counters.
    /// </summary>
    public override void Reset()
    {
        base.Reset();
        ClearSentenceState();
    }

    // Forgets the current sentence: no cached spans, cursor and sentence start back at zero.
    private void ClearSentenceState()
    {
        termSpans = null;
        termNum = 0;
        sentenceStart = 0;
    }
}
| } |