// blob: 027c8d7a46dffa8351941623870c497c48994586 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.opennlp.impl;
import static org.apache.stanbol.enhancer.nlp.NlpServiceProperties.ENHANCEMENT_ENGINE_NLP_ROLE;

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import opennlp.tools.namefind.TokenNameFinderModel;

import org.apache.clerezza.rdf.core.UriRef;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.ConfigurationPolicy;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Reference;
import org.apache.felix.scr.annotations.ReferenceCardinality;
import org.apache.felix.scr.annotations.ReferencePolicy;
import org.apache.felix.scr.annotations.Service;
import org.apache.stanbol.commons.opennlp.OpenNLP;
import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileListener;
import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileTracker;
import org.apache.stanbol.enhancer.nlp.NlpProcessingRole;
import org.apache.stanbol.enhancer.nlp.NlpServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.osgi.framework.Constants;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Apache Stanbol Enhancer Named Entity Recognition enhancement engine based on opennlp's Maximum Entropy
 * models. In contrast to the {@link NamedEntityExtractionEnhancementEngine} this
 * engine is intended to be used for custom build models.
 * <p>
 * The engine is configured with
 * <ul>
 * <li>{@link #NAME_FINDER_MODELS}: the file names of the custom name finder
 *     models. Each name is registered with the {@link DataFileTracker}; a model
 *     is added to the engine configuration when the tracker reports the file
 *     as available and removed again when it becomes unavailable.</li>
 * <li>{@link #NAMED_ENTITY_TYPE_MAPPINGS}: optional mappings in the form
 *     <code>{namedEntityType} &gt; {dc-type-uri}</code>.</li>
 * </ul>
 */
@Component(
    metatype = true,
    immediate = true,
    inherit = true,
    configurationFactory = true,
    policy = ConfigurationPolicy.REQUIRE,
    specVersion = "1.1",
    label = "%stanbol.CustomNERModelEnhancementEngine.name",
    description = "%stanbol.CustomNERModelEnhancementEngine.description")
@Service
@org.apache.felix.scr.annotations.Properties(value={
    @Property(name=EnhancementEngine.PROPERTY_NAME,value="changeme"),
    @Property(name=CustomNERModelEnhancementEngine.NAME_FINDER_MODELS, cardinality=Integer.MAX_VALUE,
        value={"openNlp-namefinder-model-name.bin"}),
    @Property(name=CustomNERModelEnhancementEngine.NAMED_ENTITY_TYPE_MAPPINGS, cardinality=Integer.MAX_VALUE,
        value={"person > http://dbpedia.org/ontology/Person",
            "organization > http://dbpedia.org/ontology/Organisation",
            "location > http://dbpedia.org/ontology/Place"}),
    //set the ranking of the default config to a negative value (ConfigurationPolicy.OPTIONAL)
    @Property(name=Constants.SERVICE_RANKING,intValue=-100)
})
@Reference(name="openNLP",referenceInterface=OpenNLP.class,
    cardinality=ReferenceCardinality.MANDATORY_UNARY,
    policy=ReferencePolicy.STATIC)
public class CustomNERModelEnhancementEngine
        extends NEREngineCore
        implements EnhancementEngine, ServiceProperties {

    protected final Logger log = LoggerFactory.getLogger(CustomNERModelEnhancementEngine.class);

    /**
     * Do hold the named entity type to dc:type value mappings as used for
     * created fise:TextAnnotations. If a named entity type is not mapped than
     * created fise:TextAnnotations will not have any dc:type values
     */
    public static final String NAMED_ENTITY_TYPE_MAPPINGS = "stanbol.engines.opennlp-ner.typeMappings";

    /**
     * Allows to define the list of custom NER models
     */
    public static final String NAME_FINDER_MODELS = "stanbol.engines.opennlp-ner.nameFinderModels";

    /**
     * The default value for the Execution of this Engine. Currently set to
     * {@link ServiceProperties#ORDERING_CONTENT_EXTRACTION}
     */
    public static final Integer defaultOrder = ORDERING_CONTENT_EXTRACTION;

    /**
     * Syntax hint appended to warnings about invalid type mapping configurations.
     */
    private static final String TYPE_MAPPING_USAGE = "useage: '{namedEntityType} > {dc-type-uri}'";

    /**
     * Unmodifiable service properties returned by {@link #getServiceProperties()}:
     * the engine ordering and the NLP processing role.
     */
    private static final Map<String,Object> SERVICE_PROPERTIES;
    static {
        Map<String,Object> sp = new HashMap<String,Object>();
        sp.put(ENHANCEMENT_ENGINE_ORDERING, defaultOrder);
        sp.put(ENHANCEMENT_ENGINE_NLP_ROLE, NlpProcessingRole.NamedEntityRecognition);
        SERVICE_PROPERTIES = Collections.unmodifiableMap(sp);
    }

    /**
     * Bind method of {@link NEREngineCore#openNLP}
     * @param openNlp the service to use for loading models
     */
    protected void bindOpenNLP(OpenNLP openNlp){
        this.openNLP = openNlp;
    }

    /**
     * Unbind method of {@link NEREngineCore#openNLP}
     * @param openNLP the service that is going away (not used; the field is simply cleared)
     */
    protected void unbindOpenNLP(OpenNLP openNLP){
        this.openNLP = null;
    }

    /** Tracker used to get notified when the configured model files (dis)appear. */
    @Reference
    private DataFileTracker dataFileTracker;

    /** Listener registered with {@link #dataFileTracker}; <code>null</code> while not activated. */
    private DataFileListener modelFileListener;

    /**
     * Activates the engine: creates a fresh {@link NEREngineConfig} without
     * default model types, applies the configured type mappings and registers
     * every configured model file name with the {@link DataFileTracker}.
     *
     * @param ctx the component context providing the configuration
     * @throws IOException declared for (and possibly thrown by) the super implementation
     * @throws ConfigurationException if no (non-empty) name finder model name
     *         is configured for {@link #NAME_FINDER_MODELS}
     */
    protected void activate(ComponentContext ctx) throws IOException, ConfigurationException {
        super.activate(ctx);
        config = new NEREngineConfig();
        config.getDefaultModelTypes().clear(); //this engine does not use default models

        initTypeMappings(ctx.getProperties().get(NAMED_ENTITY_TYPE_MAPPINGS));
        Set<String> nameFinderModelNames = parseModelNames(ctx.getProperties().get(NAME_FINDER_MODELS));

        //register the configured models with the DataFileTracker
        modelFileListener = new NamedModelFileListener();
        Map<String,String> modelProperties = new HashMap<String,String>();
        modelProperties.put("Description",
            String.format("Statistical NameFinder (NER) model for OpenNLP as configured "
                + "for the %s (name: %s)", getClass().getSimpleName(), getName()));
        modelProperties.put("Model Type", TokenNameFinderModel.class.getSimpleName());
        for (String modelName : nameFinderModelNames) {
            dataFileTracker.add(modelFileListener, modelName, modelProperties);
        }
    }

    /**
     * Parses the value of {@link #NAMED_ENTITY_TYPE_MAPPINGS} and adds every
     * valid mapping to the current configuration. Supported are a String array,
     * a single String and a Collection. Invalid entries are logged and skipped.
     *
     * @param value the raw configuration value (may be <code>null</code>)
     */
    private void initTypeMappings(Object value) {
        final Collection<?> mappings;
        if (value instanceof String[]) { //support array
            mappings = Arrays.asList((String[]) value);
        } else if (value instanceof String) { //single value
            mappings = Collections.singleton(value);
        } else if (value instanceof Collection<?>) { //and collection
            mappings = (Collection<?>) value;
        } else {
            log.debug("No Type mappings configured");
            return;
        }
        log.info("Init Named Entity Type Mappings");
        for (Object mapping : mappings) {
            if (mapping == null) {
                continue;
            }
            // NOTE: String#split drops trailing empty strings, so a value such
            // as ">" results in a zero-length array. Guard every index access.
            String[] parts = mapping.toString().split(">");
            String namedEntityType = parts.length > 0 ? parts[0].trim() : "";
            if (namedEntityType.isEmpty()) {
                log.warn("Invalid Type Mapping Config '{}': Missing namedEntityType ({}) -> ignore this config",
                    mapping, TYPE_MAPPING_USAGE);
                continue;
            }
            // trim BEFORE the empty check so that "type >   " is rejected too
            String dcTypeUri = parts.length > 1 ? parts[1].trim() : "";
            if (dcTypeUri.isEmpty()) {
                log.warn("Invalid Type Mapping Config '{}': Missing dc:type URI ({}) -> ignore this config",
                    mapping, TYPE_MAPPING_USAGE);
                continue;
            }
            if (parts.length > 2) {
                log.warn("Configuration after 2nd '>' gets ignored. Will use mapping '{} > {}' from config {}",
                    new Object[]{namedEntityType, dcTypeUri, mapping});
            }
            //TODO support short names (ns:localName)
            try { //validate
                new URI(dcTypeUri);
            } catch (URISyntaxException e) {
                log.warn("Invalid URI '{}' in Type Mapping Config '{}' -> ignore this config",
                    dcTypeUri, mapping);
                continue;
            }
            this.config.setMappedType(namedEntityType, new UriRef(dcTypeUri));
            log.info(" add mapping {} > {}", namedEntityType, dcTypeUri);
        }
    }

    /**
     * Parses the value of {@link #NAME_FINDER_MODELS}. Supported are a String
     * array, a Collection and a single value whose String representation is
     * split at ','. Names are trimmed; <code>null</code> and empty names are
     * ignored.
     *
     * @param value the raw configuration value (may be <code>null</code>)
     * @return the non-empty set of configured model file names
     * @throws ConfigurationException if no model name is configured
     */
    private Set<String> parseModelNames(Object value) throws ConfigurationException {
        Set<String> modelNames = new HashSet<String>();
        if (value instanceof String[]) {
            for (String name : (String[]) value) {
                addModelName(modelNames, name);
            }
        } else if (value instanceof Collection<?>) {
            for (Object name : (Collection<?>) value) {
                addModelName(modelNames, name);
            }
        } else if (value != null) {
            //if a single String is parsed we support ',' as seperator
            for (String name : value.toString().split(",")) {
                addModelName(modelNames, name);
            }
        }
        if (modelNames.isEmpty()) { //no (usable) configuration
            throw new ConfigurationException(NAME_FINDER_MODELS, "Configurations for the "
                + getClass().getSimpleName() +" MUST HAVE at least a single custom "
                + "OpenNLP NameFinder model configured! Supported are comma separated "
                + "Strings, Arrays and Collections. Values are the file names of the "
                + "Modles. Models are Loaded via the Apache Stanbol DataFileProvider "
                + "Infrastructure (usually user wants to copy modles in the 'datafile' "
                + "directory under the {stanbol.home} directory - {working.dir}/stanbol"
                + "/datafiles).");
        }
        return modelNames;
    }

    /**
     * Adds the trimmed String representation of the parsed name to the set,
     * unless the name is <code>null</code> or empty after trimming.
     */
    private static void addModelName(Set<String> modelNames, Object name) {
        if (name == null) {
            return;
        }
        String trimmed = name.toString().trim();
        if (!trimmed.isEmpty()) {
            modelNames.add(trimmed);
        }
    }

    /**
     * Deactivates the engine: stops tracking all model files, drops the
     * configuration and delegates to the super implementation.
     *
     * @param ctx the component context
     */
    protected void deactivate(ComponentContext ctx) {
        if (modelFileListener != null) {
            dataFileTracker.removeAll(modelFileListener); //remove all tracked files
            modelFileListener = null;
        }
        config = null;
        super.deactivate(ctx);
    }

    @Override
    public Map<String,Object> getServiceProperties() {
        return SERVICE_PROPERTIES;
    }

    /**
     * Listener that keeps the custom name finder models of the engine
     * configuration in sync with the files reported by the
     * {@link DataFileTracker}. Both callbacks return <code>false</code> so
     * that the resources keep being tracked.
     */
    private class NamedModelFileListener implements DataFileListener {

        /** resource name -> (lower case) language of the currently registered model */
        private final Map<String,String> registeredModels = Collections.synchronizedMap(
            new HashMap<String,String>());

        /**
         * Loads the model for the resource via the {@link OpenNLP} service and
         * registers it for the language of the model. The parsed stream is not
         * read by this method; the model is looked up by its resource name.
         */
        @Override
        public boolean available(String resourceName, InputStream is) {
            try {
                TokenNameFinderModel model =
                        openNLP.getModel(TokenNameFinderModel.class, resourceName, null);
                //register the new model to the configuration
                //Locale.ROOT: language codes must not depend on the platform locale
                String modelLang = model.getLanguage().toLowerCase(Locale.ROOT);
                log.info("register custom NameFinderModel from resource: {} for language: {} to {} (name:{})",
                    new Object[]{resourceName, model.getLanguage(), engineType(), getName()});
                String currentLang = registeredModels.remove(resourceName);
                if (currentLang != null && !modelLang.equals(currentLang)) {
                    //the language of the model has changed -> remove the old registration
                    config.removeCustomNameFinderModel(currentLang, resourceName);
                }
                config.addCustomNameFinderModel(modelLang, resourceName);
                registeredModels.put(resourceName, modelLang);
            } catch (IOException e) {
                logLoadFailure(resourceName, e);
            } catch (RuntimeException e) {
                logLoadFailure(resourceName, e);
            }
            return false; //keep tracking
        }

        /**
         * Removes the model registered for the resource (if any) from the
         * engine configuration.
         */
        @Override
        public boolean unavailable(String resource) {
            String language = registeredModels.remove(resource);
            if (language != null) {
                log.info("unregister custom NameFinderModel for resource: {} for language: {} to {} (name:{}) " +
                        "because the resource is no longer available via the DataFileProvider infrastructure.",
                    new Object[]{resource, language, engineType(), getName()});
                config.removeCustomNameFinderModel(language, resource);
            }
            return false; //keep tracking
        }

        /**
         * Simple class name of the enclosing engine (and not of this listener)
         * as used in log messages.
         */
        private String engineType() {
            return CustomNERModelEnhancementEngine.this.getClass().getSimpleName();
        }

        /** Logs that the model of the parsed resource could not be loaded. */
        private void logLoadFailure(String resourceName, Exception e) {
            log.warn("Error while loading custom TokenNameFinderModel model from resource '"
                + resourceName + "'. This model will NOT be available for the "
                + engineType() + " (name:" + getName() + ")", e);
        }
    }
}