// enhancement-engines/opennlp/opennlp-sentence/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/sentence/impl/OpenNlpSentenceDetectionEngine.java - stanbol - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.stanbol.enhancer.engines.opennlp.sentence.impl;

 import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage;
 import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText;

 import java.util.Collections;
 import java.util.Dictionary;
 import java.util.HashMap;
 import java.util.Map;

 import opennlp.tools.sentdetect.SentenceDetector;
 import opennlp.tools.sentdetect.SentenceDetectorME;
 import opennlp.tools.sentdetect.SentenceModel;

 import org.apache.clerezza.commons.rdf.IRI;
 import org.apache.felix.scr.annotations.Activate;
 import org.apache.felix.scr.annotations.Component;
 import org.apache.felix.scr.annotations.ConfigurationPolicy;
 import org.apache.felix.scr.annotations.Deactivate;
 import org.apache.felix.scr.annotations.Properties;
 import org.apache.felix.scr.annotations.Property;
 import org.apache.felix.scr.annotations.Reference;
 import org.apache.felix.scr.annotations.Service;
 import org.apache.stanbol.commons.opennlp.OpenNLP;
 import org.apache.stanbol.enhancer.nlp.NlpProcessingRole;
 import org.apache.stanbol.enhancer.nlp.NlpServiceProperties;
 import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
 import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
 import org.apache.stanbol.enhancer.nlp.model.Sentence;
 import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
 import org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper;
 import org.apache.stanbol.enhancer.servicesapi.Blob;
 import org.apache.stanbol.enhancer.servicesapi.ContentItem;
 import org.apache.stanbol.enhancer.servicesapi.EngineException;
 import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
 import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
 import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
 import org.osgi.framework.Constants;
 import org.osgi.service.cm.ConfigurationException;
 import org.osgi.service.component.ComponentContext;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 /**
  * {@link EnhancementEngine} that uses the OpenNLP {@link SentenceDetector} to
  * add {@link Sentence} annotations to the {@link AnalysedText} content part
  * of the parsed {@link ContentItem}.
  * <p>
  * While the opennlp-pos engine also supports adding {@link Sentence}
  * annotations, this engine can be used in cases where no POS tagging is
  * needed. In addition, this engine allows configuring a custom
  * {@link SentenceModel} for a language by using the
  * {@link #MODEL_NAME_PARAM} parameter in the language configuration:
  * <pre>
  *     {lang};model={model-name}
  * </pre>
  */
 @Component(immediate = true, metatype = true,
     configurationFactory = true, //allow multiple instances
     policy = ConfigurationPolicy.OPTIONAL) //create a default instance with the default configuration
 @Service
 @Properties(value={
         @Property(name= EnhancementEngine.PROPERTY_NAME,value="opennlp-sentence"),
         @Property(name=OpenNlpSentenceDetectionEngine.CONFIG_LANGUAGES, value = {"*"},cardinality=Integer.MAX_VALUE),
         @Property(name=Constants.SERVICE_RANKING,intValue=-100) //give the default instance a ranking < 0
 })
 public class OpenNlpSentenceDetectionEngine extends AbstractEnhancementEngine<RuntimeException,RuntimeException> implements ServiceProperties {

     /**
      * Unmodifiable service properties returned by {@link #getServiceProperties()}:
      * the enhancement engine ordering and the NLP processing role of this engine.
      */
     private static final Map<String,Object> SERVICE_PROPERTIES;
     static {
         Map<String,Object> props = new HashMap<String,Object>();
         props.put(ServiceProperties.ENHANCEMENT_ENGINE_ORDERING,
             ServiceProperties.ORDERING_NLP_SENTENCE_DETECTION);
         props.put(NlpServiceProperties.ENHANCEMENT_ENGINE_NLP_ROLE,
             NlpProcessingRole.SentenceDetection);
         SERVICE_PROPERTIES = Collections.unmodifiableMap(props);
     }

     /**
      * Name of the property holding the language configuration. Takes a list
      * of ISO language codes of supported languages. The default value is
      * <code>*</code>.
      */
     public static final String CONFIG_LANGUAGES = "org.apache.stanbol.enhancer.sentence.languages";

     /**
      * The parameter name used to configure the name of the OpenNLP model
      * used for sentence detection.
      */
     private static final String MODEL_NAME_PARAM = "model";

     private static final Logger log = LoggerFactory.getLogger(OpenNlpSentenceDetectionEngine.class);

     /**
      * Language configuration of this engine. Initialised with the default
      * <code>*</code> and updated from the component properties in
      * {@link #activate(ComponentContext)}.
      */
     private final LanguageConfiguration languageConfig = new LanguageConfiguration(CONFIG_LANGUAGES,new String[]{"*"});

     @Reference
     private OpenNLP openNLP;

     @Reference
     private AnalysedTextFactory analysedTextFactory;

     /**
      * Indicate if this engine can enhance supplied ContentItem, and if it
      * suggests enhancing it synchronously or asynchronously. The
      * {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager} can force sync/async mode if desired, it is
      * just a suggestion from the engine.
      * <p/>
      * Returns ENHANCE_ASYNC in case there is a plain text content part, the
      * language identified for the content item is enabled by the language
      * configuration and a sentence detection model is available for that
      * language; CANNOT_ENHANCE otherwise.
      *
      * @param ci the content item to check
      * @return ENHANCE_ASYNC or CANNOT_ENHANCE
      * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
      *          if the introspecting process of the content item
      *          fails
      */
     @Override
     public int canEnhance(ContentItem ci) throws EngineException {
         // check if content is present
         Map.Entry<IRI,Blob> entry = NlpEngineHelper.getPlainText(this, ci, false);
         if(entry == null || entry.getValue() == null) {
             return CANNOT_ENHANCE;
         }

         String language = getLanguage(this,ci,false);
         if(language == null) {
             return CANNOT_ENHANCE;
         }
         if(!languageConfig.isLanguage(language)){
             log.trace(" > can NOT enhance ContentItem {} because language {} is "
                 + "not enabled by this engines configuration",ci,language);
             return CANNOT_ENHANCE;
         }
         if(getSentenceDetector(language) == null){
             log.trace(" > can NOT enhance ContentItem {} because no sentence "
                 + "deteciton model for language {} is available.",ci,language);
             return CANNOT_ENHANCE;
         }

         log.trace(" > can enhance ContentItem {} with language {}",ci,language);
         return ENHANCE_ASYNC;
     }

     /**
      * Compute enhancements for supplied ContentItem.
      * <p/>
      * This method obtains the {@link AnalysedText} of the content item,
      * runs the {@link SentenceDetector} for the language of the content
      * item over its text and adds a {@link Sentence} for every detected
      * sentence span. If no sentence detector is available for the language
      * a warning is logged and no sentences are added.
      *
      * @param ci the content item to enhance
      * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
      *          if the underlying process failed to work as
      *          expected
      */
     @Override
     public void computeEnhancements(ContentItem ci) throws EngineException {
         AnalysedText at = initAnalysedText(this,analysedTextFactory,ci);
         String language = getLanguage(this, ci, true);
         SentenceDetector sentenceDetector = getSentenceDetector(language);
         if(sentenceDetector != null){
             for(opennlp.tools.util.Span sentSpan : sentenceDetector.sentPosDetect(at.getSpan())) {
                 //detect sentences and add it to the AnalyzedText.
                 Sentence sentence = at.addSentence(sentSpan.getStart(), sentSpan.getEnd());
                 log.trace(" > add {}",sentence);
             }
         } else {
             //the language is parsed as argument so that the '{}' placeholder is filled
             log.warn("SentenceDetector model for language {} is no longer available. "
                 + "This might happen if the model becomes unavailable during enhancement. "
                 + "If this happens more often it might also indicate an bug in the used "
                 + "EnhancementJobManager implementation as the availability is also checked "
                 + "in the canEnhance(..) method of this Enhancement Engine.", language);
         }
     }

     /**
      * @return the unmodifiable service properties of this engine
      */
     @Override
     public Map<String,Object> getServiceProperties() {
         return SERVICE_PROPERTIES;
     }

     /**
      * Activates the engine and applies the component properties to the
      * language configuration ({@link #CONFIG_LANGUAGES}).
      *
      * @param ce the {@link org.osgi.service.component.ComponentContext}
      * @throws ConfigurationException declared by the super implementation
      *          and the parsing of the language configuration
      */
     @Activate
     protected void activate(ComponentContext ce) throws ConfigurationException {
         log.info("activating OpenNLP sentence detection engine");
         super.activate(ce);
         @SuppressWarnings("unchecked")
         Dictionary<String, Object> properties = ce.getProperties();

         languageConfig.setConfiguration(properties);
     }

     /**
      * Calls {@link LanguageConfiguration#setDefault()} on the language
      * configuration (presumably restoring the default configuration) and
      * deactivates the engine.
      *
      * @param context the {@link org.osgi.service.component.ComponentContext}
      */
     @Deactivate
     protected void deactivate(ComponentContext context) {
         languageConfig.setDefault();
         super.deactivate(context);
     }

     /**
      * Creates a {@link SentenceDetectorME} for the given language based on
      * a {@link SentenceModel} obtained from the {@link #openNLP} service.
      * If a custom model is configured for the parsed language
      * ({@link #MODEL_NAME_PARAM}) then it is loaded by using
      * {@link OpenNLP#getModel(Class, String, Map)}, otherwise the default
      * model is retrieved by {@link OpenNLP#getSentenceModel(String)}.
      *
      * @param language the language
      * @return the sentence detector or <code>null</code> if no model is
      * available or an exception was encountered while loading it
      */
     private SentenceDetector getSentenceDetector(String language) {
         SentenceModel model;
         String modelName = languageConfig.getParameter(language, MODEL_NAME_PARAM);
         if(modelName == null){
             //no custom model configured: use the default one for the language
             try {
                 model = openNLP.getSentenceModel(language);
             } catch (Exception e) {
                 log.warn("Unable to load default Sentence Detection model for language '"+language+"'!",e);
                 return null;
             }
         } else {
             try {
                 model = openNLP.getModel(SentenceModel.class, modelName, null);
             } catch (Exception e) {
                 log.warn("Unable to load Sentence Detection model for language '"
                         +language+"' from the configured model '"+modelName+"'!",e);
                 return null;
             }
         }
         if(model != null) {
             log.debug("Sentence Detection Model {} for lanugage '{}' version: {}",
                 new Object[]{model.getClass().getSimpleName(),
                              model.getLanguage(),
                              model.getVersion() != null ? model.getVersion() : "undefined"});
             return new SentenceDetectorME(model);
         }
         log.debug("Sentence Detection Model for Language '{}' not available.", language);
         return null;
     }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.stanbol.enhancer.engines.opennlp.sentence.impl;

	import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage;
	import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText;

	import java.util.Collections;
	import java.util.Dictionary;
	import java.util.HashMap;
	import java.util.Map;

	import opennlp.tools.sentdetect.SentenceDetector;
	import opennlp.tools.sentdetect.SentenceDetectorME;
	import opennlp.tools.sentdetect.SentenceModel;

	import org.apache.clerezza.commons.rdf.IRI;
	import org.apache.felix.scr.annotations.Activate;
	import org.apache.felix.scr.annotations.Component;
	import org.apache.felix.scr.annotations.ConfigurationPolicy;
	import org.apache.felix.scr.annotations.Deactivate;
	import org.apache.felix.scr.annotations.Properties;
	import org.apache.felix.scr.annotations.Property;
	import org.apache.felix.scr.annotations.Reference;
	import org.apache.felix.scr.annotations.Service;
	import org.apache.stanbol.commons.opennlp.OpenNLP;
	import org.apache.stanbol.enhancer.nlp.NlpProcessingRole;
	import org.apache.stanbol.enhancer.nlp.NlpServiceProperties;
	import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
	import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
	import org.apache.stanbol.enhancer.nlp.model.Sentence;
	import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
	import org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper;
	import org.apache.stanbol.enhancer.servicesapi.Blob;
	import org.apache.stanbol.enhancer.servicesapi.ContentItem;
	import org.apache.stanbol.enhancer.servicesapi.EngineException;
	import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
	import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
	import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
	import org.osgi.framework.Constants;
	import org.osgi.service.cm.ConfigurationException;
	import org.osgi.service.component.ComponentContext;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	/**
	* EnhancementEngine that uses the OpenNLP {@link SentenceDetector} to
	* add {@link Sentence} annotations to the {@link AnalysedText}
	* content part of the parsed {@link ContentItem}.<p>
	* While the opennlp-pos engine does also support adding of {@link Sentence}
	* annotations this engine can be used in cases where no POS tagging is
	* needed. In addition this engine also allows to configure custom
	* {@link SentenceModel}s with by using the {@link #MODEL_NAME_PARAM}
	* with the language configuration
	* <code><pre>
	* {lang};model={model-name}
	* </pre></code>
	*/
	@Component(immediate = true, metatype = true,
	configurationFactory = true, //allow multiple instances
	policy = ConfigurationPolicy.OPTIONAL) //create a default instance with the default configuration
	@Service
	@Properties(value={
	@Property(name= EnhancementEngine.PROPERTY_NAME,value="opennlp-sentence"),
	@Property(name=OpenNlpSentenceDetectionEngine.CONFIG_LANGUAGES, value = {"*"},cardinality=Integer.MAX_VALUE),
	@Property(name=Constants.SERVICE_RANKING,intValue=-100) //give the default instance a ranking < 0
	})
	public class OpenNlpSentenceDetectionEngine extends AbstractEnhancementEngine<RuntimeException,RuntimeException> implements ServiceProperties {


	private static final Map<String,Object> SERVICE_PROPERTIES;
	static {
	Map<String,Object> props = new HashMap<String,Object>();
	props.put(ServiceProperties.ENHANCEMENT_ENGINE_ORDERING,
	ServiceProperties.ORDERING_NLP_SENTENCE_DETECTION);
	props.put(NlpServiceProperties.ENHANCEMENT_ENGINE_NLP_ROLE,
	NlpProcessingRole.SentenceDetection);
	SERVICE_PROPERTIES = Collections.unmodifiableMap(props);
	}

	/**
	* Language configuration. Takes a list of ISO language codes of supported languages. Currently supported
	* are the languages given as default value.
	*/
	public static final String CONFIG_LANGUAGES = "org.apache.stanbol.enhancer.sentence.languages";

	/**
	* The parameter name used to configure the name of the OpenNLP model used for pos tagging
	*/
	private static final String MODEL_NAME_PARAM = "model";


	private static Logger log = LoggerFactory.getLogger(OpenNlpSentenceDetectionEngine.class);

	//Langauge configuration
	private LanguageConfiguration languageConfig = new LanguageConfiguration(CONFIG_LANGUAGES,new String[]{"*"});

	@Reference
	private OpenNLP openNLP;

	@Reference
	private AnalysedTextFactory analysedTextFactory;

	/**
	* Indicate if this engine can enhance supplied ContentItem, and if it
	* suggests enhancing it synchronously or asynchronously. The
	* {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager} can force sync/async mode if desired, it is
	* just a suggestion from the engine.
	* <p/>
	* Returns ENHANCE_ASYNC in case there is a text/plain content part and a tagger for the language identified for
	* the content item, CANNOT_ENHANCE otherwise.
	*
	* @throws org.apache.stanbol.enhancer.servicesapi.EngineException
	* if the introspecting process of the content item
	* fails
	*/
	@Override
	public int canEnhance(ContentItem ci) throws EngineException {
	// check if content is present
	Map.Entry<IRI,Blob> entry = NlpEngineHelper.getPlainText(this, ci, false);
	if(entry == null \|\| entry.getValue() == null) {
	return CANNOT_ENHANCE;
	}

	String language = getLanguage(this,ci,false);
	if(language == null) {
	return CANNOT_ENHANCE;
	}
	if(!languageConfig.isLanguage(language)){
	log.trace(" > can NOT enhance ContentItem {} because language {} is "
	+ "not enabled by this engines configuration",ci,language);
	return CANNOT_ENHANCE;
	}
	if(getSentenceDetector(language) == null){
	log.trace(" > can NOT enhance ContentItem {} because no sentence "
	+ "deteciton model for language {} is available.",ci,language);
	return CANNOT_ENHANCE;
	}

	log.trace(" > can enhance ContentItem {} with language {}",ci,language);
	return ENHANCE_ASYNC;
	}

	/**
	* Compute enhancements for supplied ContentItem. The results of the process
	* are expected to be stored in the metadata of the content item.
	* <p/>
	* The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
	* persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
	* <p/>
	* This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
	* stores it as a new part in the content item. The metadata is not changed.
	*
	* @throws org.apache.stanbol.enhancer.servicesapi.EngineException
	* if the underlying process failed to work as
	* expected
	*/
	@Override
	public void computeEnhancements(ContentItem ci) throws EngineException {
	AnalysedText at = initAnalysedText(this,analysedTextFactory,ci);
	String language = getLanguage(this, ci, true);
	SentenceDetector sentenceDetector = getSentenceDetector(language);
	if(sentenceDetector != null){
	for(opennlp.tools.util.Span sentSpan : sentenceDetector.sentPosDetect(at.getSpan())) {
	//detect sentences and add it to the AnalyzedText.
	Sentence sentence = at.addSentence(sentSpan.getStart(), sentSpan.getEnd());
	log.trace(" > add {}",sentence);
	}
	} else {
	log.warn("SentenceDetector model for language {} is no longer available. "
	+ "This might happen if the model becomes unavailable during enhancement. "
	+ "If this happens more often it might also indicate an bug in the used "
	+ "EnhancementJobManager implementation as the availability is also checked "
	+ "in the canEnhance(..) method of this Enhancement Engine.");
	}
	}

	@Override
	public Map<String,Object> getServiceProperties() {
	return SERVICE_PROPERTIES;
	}

	/**
	* Activate and read the properties. Configures and initialises a POSTagger for each language configured in
	* CONFIG_LANGUAGES.
	*
	* @param ce the {@link org.osgi.service.component.ComponentContext}
	*/
	@Activate
	protected void activate(ComponentContext ce) throws ConfigurationException {
	log.info("activating POS tagging engine");
	super.activate(ce);
	@SuppressWarnings("unchecked")
	Dictionary<String, Object> properties = ce.getProperties();

	languageConfig.setConfiguration(properties);
	}

	@Deactivate
	protected void deactivate(ComponentContext context) {
	languageConfig.setDefault();
	super.deactivate(context);
	}

	/**
	* Obtains the {@link SentenceDetectorME} model for the given
	* language form the {@link #openNLP} service. If a custom
	* model is configured for the parsed language than it is
	* loaded by using {@link OpenNLP#getModel(Class, String, Map)}
	* otherwise the default model {@link OpenNLP#getSentenceDetector(String)}
	* is retrieved
	* @param language the language
	* @return the model of <code>null</code> if non is available or
	* an exception was encountered while loading
	*/
	private SentenceDetector getSentenceDetector(String language) {
	SentenceModel model;
	String modelName = languageConfig.getParameter(language, MODEL_NAME_PARAM);
	if(modelName == null){
	try {
	model = openNLP.getSentenceModel(language);
	} catch (Exception e) {
	log.warn("Unable to load default Sentence Detection model for language '"+language+"'!",e);
	return null;
	}
	} else {
	try {
	model = openNLP.getModel(SentenceModel.class, modelName, null);
	} catch (Exception e) {
	log.warn("Unable to load Sentence Detection model for language '"
	+language+"' from the configured model '"+modelName+"'!",e);
	return null;
	}
	}
	if(model != null) {
	log.debug("Sentence Detection Model {} for lanugage '{}' version: {}",
	new Object[]{model.getClass().getSimpleName(),
	model.getLanguage(),
	model.getVersion() != null ? model.getVersion() : "undefined"});
	return new SentenceDetectorME(model);
	}
	log.debug("Sentence Detection Model for Language '{}' not available.", language);
	return null;
	}

	}