blob: a701bf5379663bf94c1b4a1df893ca3bac944acf [file] [log] [blame]
/*
* Copyright (c) 2012 Sebastian Schaffert
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.opennlp.token.impl;
import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage;
import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText;
import java.util.Collections;
import java.util.Dictionary;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.ConfigurationPolicy;
import org.apache.felix.scr.annotations.Deactivate;
import org.apache.felix.scr.annotations.Properties;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Reference;
import org.apache.felix.scr.annotations.Service;
import org.apache.stanbol.commons.opennlp.OpenNLP;
import org.apache.stanbol.enhancer.nlp.NlpProcessingRole;
import org.apache.stanbol.enhancer.nlp.NlpServiceProperties;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
import org.apache.stanbol.enhancer.nlp.model.Section;
import org.apache.stanbol.enhancer.nlp.model.Token;
import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
import org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
import org.osgi.framework.Constants;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * A multi-language tokenizer engine based on OpenNLP. Requires that the content item has a
 * text/plain part and an identified language that is enabled by this engine's language
 * configuration (all languages by default). Adds Token annotations to the AnalysedText
 * content part that can be used for further processing by other NLP engines.
 *
 * @author Sebastian Schaffert
 */
@Component(immediate = true, metatype = true,
    configurationFactory = true, //allow multiple instances
    policy = ConfigurationPolicy.OPTIONAL) //create a default instance with the default configuration
@Service
@Properties(value={
    @Property(name=EnhancementEngine.PROPERTY_NAME,value="opennlp-token"),
    @Property(name=OpenNlpTokenizerEngine.CONFIG_LANGUAGES, value = {"*"},cardinality=Integer.MAX_VALUE),
    @Property(name=Constants.SERVICE_RANKING,intValue=-100) //give the default instance a ranking < 0
})
public class OpenNlpTokenizerEngine extends AbstractEnhancementEngine<RuntimeException,RuntimeException> implements ServiceProperties {

    /**
     * Language configuration. Takes a list of ISO language codes of supported languages. Currently supported
     * are the languages given as default value.
     */
    public static final String CONFIG_LANGUAGES = "org.apache.stanbol.enhancer.token.languages";

    /**
     * The parameter name used to configure the name of the OpenNLP model used for tokenizing
     */
    private static final String MODEL_NAME_PARAM = "model";

    /**
     * Configuring {@value #SIMPLE_MODEL_NAME} as value for the {@link #MODEL_NAME_PARAM}
     * will cause the {@link SimpleTokenizer#INSTANCE} to be used for
     * Tokenizing the language.<p>
     * This might be useful to force the usage of this tokenizer even if a
     * language specific model is available via the {@link OpenNLP} service.
     */
    private static final String SIMPLE_MODEL_NAME = "SIMPLE";

    /**
     * Unmodifiable service properties advertising the engine ordering within the
     * NLP processing chain and the {@link NlpProcessingRole#Tokenizing} role.
     */
    private static final Map<String,Object> SERVICE_PROPERTIES;
    static {
        Map<String,Object> props = new HashMap<String,Object>();
        props.put(ServiceProperties.ENHANCEMENT_ENGINE_ORDERING,
            ServiceProperties.ORDERING_NLP_TOKENIZING);
        props.put(NlpServiceProperties.ENHANCEMENT_ENGINE_NLP_ROLE,
            NlpProcessingRole.Tokenizing);
        SERVICE_PROPERTIES = Collections.unmodifiableMap(props);
    }

    private static Logger log = LoggerFactory.getLogger(OpenNlpTokenizerEngine.class);

    //Language configuration
    private LanguageConfiguration languageConfig = new LanguageConfiguration(CONFIG_LANGUAGES,new String[]{"*"});

    @Reference
    private OpenNLP openNLP;

    @Reference
    private AnalysedTextFactory analysedTextFactory;

    /**
     * Indicate if this engine can enhance supplied ContentItem, and if it
     * suggests enhancing it synchronously or asynchronously. The
     * {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager} can force sync/async mode if desired, it is
     * just a suggestion from the engine.
     * <p/>
     * Returns ENHANCE_ASYNC in case there is a text/plain content part, the identified
     * language is enabled by the engine configuration and a tokenizer is available for
     * it; CANNOT_ENHANCE otherwise.
     *
     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
     *          if the introspecting process of the content item
     *          fails
     */
    @Override
    public int canEnhance(ContentItem ci) throws EngineException {
        // check if content is present
        Map.Entry<IRI,Blob> entry = NlpEngineHelper.getPlainText(this, ci, false);
        if(entry == null || entry.getValue() == null) {
            return CANNOT_ENHANCE;
        }
        String language = getLanguage(this,ci,false);
        if(language == null) {
            return CANNOT_ENHANCE;
        }
        if(!languageConfig.isLanguage(language)){
            log.trace(" > can NOT enhance ContentItem {} because language {} is "
                + "not enabled by this engines configuration",ci,language);
            return CANNOT_ENHANCE;
        }
        if(getTokenizer(language) == null){
            log.trace(" > can NOT tokenize plain text of {} because the tokenizer "
                + "for language {} is not available.",ci,language);
            return CANNOT_ENHANCE;
        }
        log.trace(" > can enhance ContentItem {} with language {}",ci,language);
        return ENHANCE_ASYNC;
    }

    /**
     * Compute enhancements for supplied ContentItem. The results of the process
     * are expected to be stored in the metadata of the content item.
     * <p/>
     * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
     * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
     * <p/>
     * This method tokenizes the text of the {@link AnalysedText} content part — per
     * sentence if sentence annotations are present, otherwise over the whole text —
     * and adds a {@link Token} for every span returned by the OpenNLP tokenizer.
     * The metadata of the content item is not changed.
     *
     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
     *          if the underlying process failed to work as
     *          expected
     */
    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        AnalysedText at = initAnalysedText(this,analysedTextFactory,ci);
        String language = getLanguage(this, ci, true);
        Tokenizer tokenizer = getTokenizer(language);
        if(tokenizer == null){
            //NOTE: fixed missing log argument - the "{}" placeholder was never filled
            log.warn("Tokenizer for language {} is no longer available. "
                + "This might happen if the model becomes unavailable during enhancement. "
                + "If this happens more often it might also indicate an bug in the used "
                + "EnhancementJobManager implementation as the availability is also checked "
                + "in the canEnhance(..) method of this Enhancement Engine.", language);
            return;
        }
        //Try to use sentences for tokenizing
        Iterator<? extends Section> sections = at.getSentences();
        if(!sections.hasNext()){
            //if no sentences are annotated use the whole text as a single section
            sections = Collections.singleton(at).iterator();
        }
        //for all sentences (or the whole Text - if no sentences available)
        while(sections.hasNext()){
            Section section = sections.next();
            //Tokenize section
            opennlp.tools.util.Span[] tokenSpans = tokenizer.tokenizePos(section.getSpan());
            for(int i=0;i<tokenSpans.length;i++){
                //span offsets are relative to the section; addToken resolves them
                Token token = section.addToken(tokenSpans[i].getStart(), tokenSpans[i].getEnd());
                log.trace(" > add {}",token);
            }
        }
    }

    @Override
    public Map<String,Object> getServiceProperties() {
        return SERVICE_PROPERTIES;
    }

    /**
     * Activate and read the properties. Configures the language configuration used
     * to decide which languages are tokenized and which (optional) custom models
     * are used for specific languages.
     *
     * @param ce the {@link org.osgi.service.component.ComponentContext}
     */
    @Activate
    protected void activate(ComponentContext ce) throws ConfigurationException {
        log.info("activating OpenNLP Tokenizer engine");
        super.activate(ce);
        @SuppressWarnings("unchecked")
        Dictionary<String, Object> properties = ce.getProperties();
        languageConfig.setConfiguration(properties);
    }

    @Deactivate
    protected void deactivate(ComponentContext context) {
        //reset the language configuration to its defaults before deactivating
        languageConfig.setDefault();
        super.deactivate(context);
    }

    /**
     * Getter for the Tokenizer. This uses the {@link #languageConfig} to
     * check if a specific configuration for the given language is present
     * by checking for the {@link #MODEL_NAME_PARAM}.
     * @param language the language
     * @return the {@link Tokenizer} or <code>null</code> if no model for the
     * parsed language is available and no custom model is configured. Callers
     * are expected to check for <code>null</code>.
     * @throws EngineException in case a custom configured model is not
     * available or an error occurred during loading.
     */
    private Tokenizer getTokenizer(String language) throws EngineException {
        String modelName = languageConfig.getParameter(language, MODEL_NAME_PARAM);
        if(modelName == null){ //no custom model; use the default one for the language
            return openNLP.getTokenizer(language);
        } else if(SIMPLE_MODEL_NAME.equals(modelName)){
            return SimpleTokenizer.INSTANCE;
        } else { //try to load the configured model
            TokenizerModel model;
            try {
                model = openNLP.getModel(TokenizerModel.class, modelName, null);
            } catch (Exception e) {
                throw new EngineException("Error while loading the configured OpenNLP "
                    + "TokenizerModel '"+modelName+"' ("+getClass().getSimpleName()+" | name="
                    + getName() + ")!",e);
            }
            if(model == null){
                //NOTE: fixed misplaced closing quote in the error message
                throw new EngineException("The configured OpenNLP TokenizerModel '"
                    + modelName +"' is not available ("+getClass().getSimpleName()
                    + " | name=" + getName() + ")!");
            }
            return new TokenizerME(model);
        }
    }
}