blob: a701bf5379663bf94c1b4a1df893ca3bac944acf [file] [log] [blame]
/*
* Copyright (c) 2012 Sebastian Schaffert
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.opennlp.token.impl;
import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage;
import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText;
import java.util.Collections;
import java.util.Dictionary;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.ConfigurationPolicy;
import org.apache.felix.scr.annotations.Deactivate;
import org.apache.felix.scr.annotations.Properties;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Reference;
import org.apache.felix.scr.annotations.Service;
import org.apache.stanbol.commons.opennlp.OpenNLP;
import org.apache.stanbol.enhancer.nlp.NlpProcessingRole;
import org.apache.stanbol.enhancer.nlp.NlpServiceProperties;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
import org.apache.stanbol.enhancer.nlp.model.Section;
import org.apache.stanbol.enhancer.nlp.model.Token;
import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
import org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
import org.osgi.framework.Constants;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * A multi-language tokenizer engine based on OpenNLP. Requires that the content item has a
 * text/plain part and an identified language that is enabled by this engine's language
 * configuration (all languages by default). Adds Token annotations to the AnalysedText
 * content part that can be used for further processing by other NLP engines.
 *
 * @author Sebastian Schaffert
 */
@Component(immediate = true, metatype = true,
    configurationFactory = true, //allow multiple instances
    policy = ConfigurationPolicy.OPTIONAL) //create a default instance with the default configuration
@Service
@Properties(value={
    @Property(name=EnhancementEngine.PROPERTY_NAME,value="opennlp-token"),
    @Property(name=OpenNlpTokenizerEngine.CONFIG_LANGUAGES, value = {"*"},cardinality=Integer.MAX_VALUE),
    @Property(name=Constants.SERVICE_RANKING,intValue=-100) //give the default instance a ranking < 0
})
public class OpenNlpTokenizerEngine extends AbstractEnhancementEngine<RuntimeException,RuntimeException> implements ServiceProperties {

    /**
     * Language configuration. Takes a list of ISO language codes of supported languages. Currently supported
     * are the languages given as default value.
     */
    public static final String CONFIG_LANGUAGES = "org.apache.stanbol.enhancer.token.languages";

    /**
     * The parameter name used to configure the name of the OpenNLP model used for tokenizing
     */
    private static final String MODEL_NAME_PARAM = "model";

    /**
     * Configuring {@value #SIMPLE_MODEL_NAME} as value for the {@link #MODEL_NAME_PARAM}
     * will cause the {@link SimpleTokenizer#INSTANCE} to be used for
     * Tokenizing the language.<p>
     * This might be useful to force the usage of this tokenizer even if a
     * language specific model is available via the {@link OpenNLP} service.
     */
    private static final String SIMPLE_MODEL_NAME = "SIMPLE";

    /**
     * Unmodifiable service properties advertising the engine ordering within the
     * NLP processing chain and the {@link NlpProcessingRole#Tokenizing} role.
     */
    private static final Map<String,Object> SERVICE_PROPERTIES;
    static {
        Map<String,Object> props = new HashMap<String,Object>();
        props.put(ServiceProperties.ENHANCEMENT_ENGINE_ORDERING,
            ServiceProperties.ORDERING_NLP_TOKENIZING);
        props.put(NlpServiceProperties.ENHANCEMENT_ENGINE_NLP_ROLE,
            NlpProcessingRole.Tokenizing);
        SERVICE_PROPERTIES = Collections.unmodifiableMap(props);
    }

    private static Logger log = LoggerFactory.getLogger(OpenNlpTokenizerEngine.class);

    //Language configuration
    private LanguageConfiguration languageConfig = new LanguageConfiguration(CONFIG_LANGUAGES,new String[]{"*"});

    @Reference
    private OpenNLP openNLP;

    @Reference
    private AnalysedTextFactory analysedTextFactory;

    /**
     * Indicate if this engine can enhance supplied ContentItem, and if it
     * suggests enhancing it synchronously or asynchronously. The
     * {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager} can force sync/async mode if desired, it is
     * just a suggestion from the engine.
     * <p/>
     * Returns ENHANCE_ASYNC in case there is a text/plain content part, the identified
     * language is enabled by the engine configuration and a tokenizer is available for
     * it; CANNOT_ENHANCE otherwise.
     *
     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
     *          if the introspecting process of the content item
     *          fails
     */
    @Override
    public int canEnhance(ContentItem ci) throws EngineException {
        // check if content is present
        Map.Entry<IRI,Blob> entry = NlpEngineHelper.getPlainText(this, ci, false);
        if(entry == null || entry.getValue() == null) {
            return CANNOT_ENHANCE;
        }
        String language = getLanguage(this,ci,false);
        if(language == null) {
            return CANNOT_ENHANCE;
        }
        if(!languageConfig.isLanguage(language)){
            log.trace(" > can NOT enhance ContentItem {} because language {} is "
                + "not enabled by this engines configuration",ci,language);
            return CANNOT_ENHANCE;
        }
        if(getTokenizer(language) == null){
            log.trace(" > can NOT tokenize plain text of {} because the tokenizer "
                + "for language {} is not available.",ci,language);
            return CANNOT_ENHANCE;
        }
        log.trace(" > can enhance ContentItem {} with language {}",ci,language);
        return ENHANCE_ASYNC;
    }

    /**
     * Compute enhancements for supplied ContentItem. The results of the process
     * are expected to be stored in the metadata of the content item.
     * <p/>
     * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
     * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
     * <p/>
     * This method tokenizes the text of the {@link AnalysedText} content part — per
     * sentence if sentence annotations are present, otherwise over the whole text —
     * and adds a {@link Token} for every span returned by the OpenNLP tokenizer.
     * The metadata of the content item is not changed.
     *
     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
     *          if the underlying process failed to work as
     *          expected
     */
    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        AnalysedText at = initAnalysedText(this,analysedTextFactory,ci);
        String language = getLanguage(this, ci, true);
        Tokenizer tokenizer = getTokenizer(language);
        if(tokenizer == null){
            //NOTE: fixed missing log argument - the "{}" placeholder was never filled
            log.warn("Tokenizer for language {} is no longer available. "
                + "This might happen if the model becomes unavailable during enhancement. "
                + "If this happens more often it might also indicate an bug in the used "
                + "EnhancementJobManager implementation as the availability is also checked "
                + "in the canEnhance(..) method of this Enhancement Engine.", language);
            return;
        }
        //Try to use sentences for tokenizing
        Iterator<? extends Section> sections = at.getSentences();
        if(!sections.hasNext()){
            //if no sentences are annotated use the whole text as a single section
            sections = Collections.singleton(at).iterator();
        }
        //for all sentences (or the whole Text - if no sentences available)
        while(sections.hasNext()){
            Section section = sections.next();
            //Tokenize section
            opennlp.tools.util.Span[] tokenSpans = tokenizer.tokenizePos(section.getSpan());
            for(int i=0;i<tokenSpans.length;i++){
                //span offsets are relative to the section; addToken resolves them
                Token token = section.addToken(tokenSpans[i].getStart(), tokenSpans[i].getEnd());
                log.trace(" > add {}",token);
            }
        }
    }

    @Override
    public Map<String,Object> getServiceProperties() {
        return SERVICE_PROPERTIES;
    }

    /**
     * Activate and read the properties. Configures the language configuration used
     * to decide which languages are tokenized and which (optional) custom models
     * are used for specific languages.
     *
     * @param ce the {@link org.osgi.service.component.ComponentContext}
     */
    @Activate
    protected void activate(ComponentContext ce) throws ConfigurationException {
        log.info("activating OpenNLP Tokenizer engine");
        super.activate(ce);
        @SuppressWarnings("unchecked")
        Dictionary<String, Object> properties = ce.getProperties();
        languageConfig.setConfiguration(properties);
    }

    @Deactivate
    protected void deactivate(ComponentContext context) {
        //reset the language configuration to its defaults before deactivating
        languageConfig.setDefault();
        super.deactivate(context);
    }

    /**
     * Getter for the Tokenizer. This uses the {@link #languageConfig} to
     * check if a specific configuration for the given language is present
     * by checking for the {@link #MODEL_NAME_PARAM}.
     * @param language the language
     * @return the {@link Tokenizer} or <code>null</code> if no model for the
     * parsed language is available and no custom model is configured. Callers
     * are expected to check for <code>null</code>.
     * @throws EngineException in case a custom configured model is not
     * available or an error occurred during loading.
     */
    private Tokenizer getTokenizer(String language) throws EngineException {
        String modelName = languageConfig.getParameter(language, MODEL_NAME_PARAM);
        if(modelName == null){ //no custom model; use the default one for the language
            return openNLP.getTokenizer(language);
        } else if(SIMPLE_MODEL_NAME.equals(modelName)){
            return SimpleTokenizer.INSTANCE;
        } else { //try to load the configured model
            TokenizerModel model;
            try {
                model = openNLP.getModel(TokenizerModel.class, modelName, null);
            } catch (Exception e) {
                throw new EngineException("Error while loading the configured OpenNLP "
                    + "TokenizerModel '"+modelName+"' ("+getClass().getSimpleName()+" | name="
                    + getName() + ")!",e);
            }
            if(model == null){
                //NOTE: fixed misplaced closing quote in the error message
                throw new EngineException("The configured OpenNLP TokenizerModel '"
                    + modelName +"' is not available ("+getClass().getSimpleName()
                    + " | name=" + getName() + ")!");
            }
            return new TokenizerME(model);
        }
    }
}