lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.lucene.analysis.opennlp;

 import java.io.IOException;
 import opennlp.tools.util.Span;
 import org.apache.lucene.analysis.opennlp.tools.NLPSentenceDetectorOp;
 import org.apache.lucene.analysis.opennlp.tools.NLPTokenizerOp;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.util.SegmentingTokenizerBase;
 import org.apache.lucene.util.AttributeFactory;

 /**
  * Tokenizer that runs an OpenNLP sentence detector followed by an OpenNLP tokenizer. The last
  * token in each sentence is marked by setting the {@link #EOS_FLAG_BIT} in the FlagsAttribute;
  * following filters can use this information to apply operations to tokens one sentence at a time.
  */
 public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
   /**
    * Bit OR-ed into the FlagsAttribute of the last token of each sentence.
    *
    * <p>NOTE(review): this field is public and non-final, so any caller can reassign it. It is
    * left that way here to keep the public interface unchanged; consider making it {@code final}.
    */
   public static int EOS_FLAG_BIT = 1;

   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

   /** Token spans of the current sentence, relative to its start; null when none is loaded. */
   private Span[] termSpans = null;

   /** Index into {@link #termSpans} of the next token to emit. */
   private int termNum = 0;

   /** Start position of the current sentence within the inherited {@code buffer}. */
   private int sentenceStart = 0;

   private final NLPTokenizerOp tokenizerOp;

   /**
    * Creates a tokenizer that splits text into sentences with {@code sentenceOp} and each sentence
    * into tokens with {@code tokenizerOp}.
    *
    * @param factory attribute factory handed to the superclass
    * @param sentenceOp sentence detector wrapped in an {@link OpenNLPSentenceBreakIterator}
    * @param tokenizerOp tokenizer applied to the text of each sentence
    * @throws IllegalArgumentException if {@code sentenceOp} or {@code tokenizerOp} is null
    * @throws IOException declared for compatibility with existing callers
    */
   public OpenNLPTokenizer(
       AttributeFactory factory, NLPSentenceDetectorOp sentenceOp, NLPTokenizerOp tokenizerOp)
       throws IOException {
     // Validate inside the super(...) argument so that a null operation is rejected before the
     // break iterator and the superclass are constructed with it.
     super(factory, new OpenNLPSentenceBreakIterator(requireBoth(sentenceOp, tokenizerOp)));
     this.tokenizerOp = tokenizerOp;
   }

   /** Returns {@code sentenceOp} after checking that neither operation is null. */
   private static NLPSentenceDetectorOp requireBoth(
       NLPSentenceDetectorOp sentenceOp, NLPTokenizerOp tokenizerOp) {
     if (sentenceOp == null || tokenizerOp == null) {
       throw new IllegalArgumentException(
           "OpenNLPTokenizer: both a Sentence Detector and a Tokenizer are required");
     }
     return sentenceOp;
   }

   /** Closes the superclass, then drops the per-sentence state. */
   @Override
   public void close() throws IOException {
     super.close();
     clearSentenceState();
   }

   /**
    * Tokenizes the sentence occupying {@code [sentenceStart, sentenceEnd)} of the inherited
    * {@code buffer} and rewinds the token cursor to its first token.
    */
   @Override
   protected void setNextSentence(int sentenceStart, int sentenceEnd) {
     this.sentenceStart = sentenceStart;
     String sentenceText = new String(buffer, sentenceStart, sentenceEnd - sentenceStart);
     termSpans = tokenizerOp.getTerms(sentenceText);
     termNum = 0;
   }

   /**
    * Emits the next token of the current sentence into the term, offset and flags attributes.
    *
    * @return {@code false} when no sentence is loaded or all of its tokens have been emitted,
    *     {@code true} otherwise
    */
   @Override
   protected boolean incrementWord() {
     if (termSpans == null || termNum == termSpans.length) {
       return false;
     }
     clearAttributes();
     Span term = termSpans[termNum];
     // Spans are relative to the sentence text, so shift them by the sentence start.
     int termStart = sentenceStart + term.getStart();
     int termEnd = sentenceStart + term.getEnd();
     termAtt.copyBuffer(buffer, termStart, term.length());
     offsetAtt.setOffset(correctOffset(offset + termStart), correctOffset(offset + termEnd));
     if (termNum == termSpans.length - 1) {
       // Mark the last token in the sentence with EOS_FLAG_BIT.
       flagsAtt.setFlags(flagsAtt.getFlags() | EOS_FLAG_BIT);
     }
     ++termNum;
     return true;
   }

   /** Resets the superclass, then drops the per-sentence state. */
   @Override
   public void reset() throws IOException {
     super.reset();
     clearSentenceState();
   }

   /** Forgets the current sentence: no spans, cursor and sentence start back at zero. */
   private void clearSentenceState() {
     termSpans = null;
     termNum = 0;
     sentenceStart = 0;
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.lucene.analysis.opennlp;

	import java.io.IOException;
	import opennlp.tools.util.Span;
	import org.apache.lucene.analysis.opennlp.tools.NLPSentenceDetectorOp;
	import org.apache.lucene.analysis.opennlp.tools.NLPTokenizerOp;
	import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
	import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
	import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
	import org.apache.lucene.analysis.util.SegmentingTokenizerBase;
	import org.apache.lucene.util.AttributeFactory;

	/**
	* Run OpenNLP SentenceDetector and Tokenizer. The last token in each sentence is marked by setting
	* the {@link #EOS_FLAG_BIT} in the FlagsAttribute; following filters can use this information to
	* apply operations to tokens one sentence at a time.
	*/
	public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
	public static int EOS_FLAG_BIT = 1;

	private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
	private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
	private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

	private Span[] termSpans = null;
	private int termNum = 0;
	private int sentenceStart = 0;

	private NLPSentenceDetectorOp sentenceOp = null;
	private NLPTokenizerOp tokenizerOp = null;

	public OpenNLPTokenizer(
	AttributeFactory factory, NLPSentenceDetectorOp sentenceOp, NLPTokenizerOp tokenizerOp)
	throws IOException {
	super(factory, new OpenNLPSentenceBreakIterator(sentenceOp));
	if (sentenceOp == null \|\| tokenizerOp == null) {
	throw new IllegalArgumentException(
	"OpenNLPTokenizer: both a Sentence Detector and a Tokenizer are required");
	}
	this.sentenceOp = sentenceOp;
	this.tokenizerOp = tokenizerOp;
	}

	@Override
	public void close() throws IOException {
	super.close();
	termSpans = null;
	termNum = sentenceStart = 0;
	}

	@Override
	protected void setNextSentence(int sentenceStart, int sentenceEnd) {
	this.sentenceStart = sentenceStart;
	String sentenceText = new String(buffer, sentenceStart, sentenceEnd - sentenceStart);
	termSpans = tokenizerOp.getTerms(sentenceText);
	termNum = 0;
	}

	@Override
	protected boolean incrementWord() {
	if (termSpans == null \|\| termNum == termSpans.length) {
	return false;
	}
	clearAttributes();
	Span term = termSpans[termNum];
	termAtt.copyBuffer(buffer, sentenceStart + term.getStart(), term.length());
	offsetAtt.setOffset(
	correctOffset(offset + sentenceStart + term.getStart()),
	correctOffset(offset + sentenceStart + term.getEnd()));
	if (termNum == termSpans.length - 1) {
	flagsAtt.setFlags(
	flagsAtt.getFlags()
	\| EOS_FLAG_BIT); // mark the last token in the sentence with EOS_FLAG_BIT
	}
	++termNum;
	return true;
	}

	@Override
	public void reset() throws IOException {
	super.reset();
	termSpans = null;
	termNum = sentenceStart = 0;
	}
	}