blob: c11a7106ca1514223441260abf78b15c783f80cf [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.lucenefstlinking;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.CASE_SENSITIVE;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEFAULT_CASE_SENSITIVE_MATCHING_STATE;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEFAULT_INCLUDE_SIMILAR_SCORE;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEFAULT_MATCHING_LANGUAGE;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEFAULT_SUGGESTIONS;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.ENTITY_TYPES;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.INCLUDE_SIMILAR_SCORE;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.SUGGESTIONS;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.TYPE_MAPPINGS;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.DEFAULT_PROCESS_ONLY_PROPER_NOUNS_STATE;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.PROCESSED_LANGUAGES;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.PROCESS_ONLY_PROPER_NOUNS_STATE;
import static org.apache.stanbol.enhancer.servicesapi.EnhancementEngine.PROPERTY_NAME;
import static org.osgi.framework.Constants.SERVICE_RANKING;
import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Dictionary;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import org.apache.clerezza.rdf.core.Literal;
import org.apache.clerezza.rdf.core.Resource;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.text.StrLookup;
import org.apache.commons.lang.text.StrSubstitutor;
import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.ConfigurationPolicy;
import org.apache.felix.scr.annotations.Deactivate;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.PropertyOption;
import org.apache.felix.scr.annotations.Reference;
import org.apache.felix.scr.annotations.ReferenceCardinality;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.core.SolrCore;
import org.apache.stanbol.commons.namespaceprefix.NamespaceMappingUtils;
import org.apache.stanbol.commons.namespaceprefix.NamespacePrefixService;
import org.apache.stanbol.commons.solr.IndexReference;
import org.apache.stanbol.commons.solr.RegisteredSolrServerTracker;
import org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig;
import org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig;
import org.apache.stanbol.enhancer.engines.lucenefstlinking.cache.EntityCacheManager;
import org.apache.stanbol.enhancer.engines.lucenefstlinking.cache.FastLRUCacheManager;
import org.apache.stanbol.enhancer.nlp.ner.NerTag;
import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.opensextant.solrtexttagger.TaggerFstCorpus;
import org.osgi.framework.BundleContext;
import org.osgi.framework.Constants;
import org.osgi.framework.InvalidSyntaxException;
import org.osgi.framework.ServiceReference;
import org.osgi.framework.ServiceRegistration;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
import org.osgi.util.tracker.ServiceTracker;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
/**
* This is the OSGI component for the {@link FstLinkingEngine}. It is used to
* manage the service configuration, tracks dependencies and handles the
* OSGI life cycle.
* @author Rupert Westenthaler
*
*/
@Component(
configurationFactory = true,
policy = ConfigurationPolicy.REQUIRE, // the baseUri is required!
specVersion = "1.1",
metatype = true,
immediate = true)
@org.apache.felix.scr.annotations.Properties(value={
@Property(name=PROPERTY_NAME), //the name of the engine
@Property(name=FstLinkingEngineComponent.SOLR_CORE),
@Property(name=IndexConfiguration.FIELD_ENCODING, options={
@PropertyOption(
value='%'+IndexConfiguration.FIELD_ENCODING+".option.none",
name="None"),
@PropertyOption(
value='%'+IndexConfiguration.FIELD_ENCODING+".option.solrYard",
name="SolrYard"),
@PropertyOption(
value='%'+IndexConfiguration.FIELD_ENCODING+".option.minusPrefix",
name="MinusPrefix"),
@PropertyOption(
value='%'+IndexConfiguration.FIELD_ENCODING+".option.underscorePrefix",
name="UnderscorePrefix"),
@PropertyOption(
value='%'+IndexConfiguration.FIELD_ENCODING+".option.minusSuffix",
name="MinusSuffix"),
@PropertyOption(
value='%'+IndexConfiguration.FIELD_ENCODING+".option.underscoreSuffix",
name="UnderscoreSuffix"),
@PropertyOption(
value='%'+IndexConfiguration.FIELD_ENCODING+".option.atPrefix",
name="AtPrefix"),
@PropertyOption(
value='%'+IndexConfiguration.FIELD_ENCODING+".option.atSuffix",
name="AtSuffix")
},value="SolrYard"),
@Property(name=IndexConfiguration.FST_CONFIG, cardinality=Integer.MAX_VALUE),
@Property(name=IndexConfiguration.FST_FOLDER,
value=IndexConfiguration.DEFAULT_FST_FOLDER),
@Property(name=IndexConfiguration.SOLR_TYPE_FIELD, value="rdf:type"),
@Property(name=IndexConfiguration.SOLR_RANKING_FIELD, value="entityhub:entityRank"),
@Property(name=FstLinkingEngineComponent.FST_THREAD_POOL_SIZE,
intValue=FstLinkingEngineComponent.DEFAULT_FST_THREAD_POOL_SIZE),
@Property(name=FstLinkingEngineComponent.ENTITY_CACHE_SIZE,
intValue=FstLinkingEngineComponent.DEFAULT_ENTITY_CACHE_SIZE),
@Property(name=SUGGESTIONS, intValue=DEFAULT_SUGGESTIONS),
@Property(name=INCLUDE_SIMILAR_SCORE, boolValue=DEFAULT_INCLUDE_SIMILAR_SCORE),
@Property(name=CASE_SENSITIVE,boolValue=DEFAULT_CASE_SENSITIVE_MATCHING_STATE),
@Property(name=PROCESS_ONLY_PROPER_NOUNS_STATE, boolValue=DEFAULT_PROCESS_ONLY_PROPER_NOUNS_STATE),
@Property(name=PROCESSED_LANGUAGES, cardinality=Integer.MAX_VALUE,
value={"*;lmmtip;uc=LINK;prob=0.75;pprob=0.75", // link multiple matchable tokens in chunks; link upper case words
"de;uc=MATCH", //in German all Nouns are upper case
"es;lc=Noun", //the OpenNLP POS tagger for Spanish does not support ProperNouns
"nl;lc=Noun"}), //same for Dutch
@Property(name=DEFAULT_MATCHING_LANGUAGE,value=""),
@Property(name=ENTITY_TYPES,cardinality=Integer.MAX_VALUE),
@Property(name=TYPE_MAPPINGS,cardinality=Integer.MAX_VALUE, value={
"dbp-ont:Organisation; dbp-ont:Newspaper; schema:Organization > dbp-ont:Organisation",
"dbp-ont:Person; foaf:Person; schema:Person > dbp-ont:Person",
"dbp-ont:Place; schema:Place; geonames:Feature > dbp-ont:Place",
"dbp-ont:Work; schema:CreativeWork > dbp-ont:Work",
"dbp-ont:Event; schema:Event > dbp-ont:Event",
"schema:Product > schema:Product",
"skos:Concept > skos:Concept"}),
@Property(name=SERVICE_RANKING,intValue=0)
})
public class FstLinkingEngineComponent {
/**
* The {@link SolrCore} is required to access the document ids for the Entities
* as well as the analyzer chains of the fields used for the linking
*/
public static final String SOLR_CORE = "enhancer.engines.linking.lucenefst.solrcore";
/**
* The origin information for all Entities provided by the configured SolrCore and
* FST. Origin information are added to all <code>fise:EntityAnnotation</code>
* by using the <code>fise:origin</code> property. Configured values can be both
* {@link UriRef URI}s or {@link Literal}s. Configured Strings are checked if
* they are valid {@link URI}s and {@link URI#isAbsolute() absolute}. If not
* a {@link Literal} is parsed.
*/
public static final String ORIGIN = "enhancer.engines.linking.lucenefst.origin";
/**
* Property used to configure the {@link LinkingModeEnum}.
*/
public static final String LINKING_MODE = "enhancer.engines.linking.lucenefst.mode";
/**
* Allows to configure mappings of NamedEntity Types to types of Entities in the
* vocabulary. Configured keys are matched against the {@link NerTag#getTag()} AND
* {@link NerTag#getType()} values of NamedEntities. Configured Values are mapped
* against the values of the configured {@link IndexConfiguration#SOLR_TYPE_FIELD}.
*/
public static final String NAMED_ENTITY_TYPE_MAPPINGS = "enhancer.engines.linking.lucenefst.neTypeMapping";
/**
* The size of the thread pool used to create FST models (default=1). Creating
* such models does need a lot of memory. Expect values up to 10times of the
* build model. So while this task can easily performed concurrently users need
* to be aware that the process will occupy a lot of heap space (typically several
* GBytes). If heap space is not an issue it is best to configure the value
* based on the CPU cores available on the local host.<p>
* This configuration has only an effect if runtime generation of FST modles
* is enabled (either by default or for some FST by explicitly setting the
* '<code>{@link IndexConfiguration#PARAM_RUNTIME_GENERATION generate}=true</code>' parameter
* for some languages in the {@link IndexConfiguration#FST_CONFIG}.
*/
public static final String FST_THREAD_POOL_SIZE = "enhancer.engines.linking.lucenefst.fstThreadPoolSize";
/**
* The default number of threads used to create FST models (default=1)
*/
public static final int DEFAULT_FST_THREAD_POOL_SIZE = 1;
/**
* Parameter used to configure the size of the Cache used to for Entity information.
* While the FST linking is fully performed in memory this engine needs still to
* load tagging relevant fields (labels, types, redirectes and entity ranking)
* for matched entities from the disc. The EntityCache is a LRU cache for such
* information.
*/
public static final String ENTITY_CACHE_SIZE = "enhancer.engines.linking.lucenefst.entityCacheSize";
/**
* The default size of the Entity Cache is set to 65k entities.
*/
public static final int DEFAULT_ENTITY_CACHE_SIZE = 65536;
/**
* Changed default for the {@link EntityLinkerConfig#MIN_FOUND_TOKENS} property.
* This Engine uses <code>2</code> as default. While the {@link EntityLinkerConfig}
* currently sets the default to <code>1</code>
*/
private static final Integer FST_DEFAULT_MIN_FOUND_TOKENS = 2;
protected final Logger log = LoggerFactory.getLogger(FstLinkingEngineComponent.class);
/**
* the name for the EnhancementEngine registered by this component
*/
private String engineName;
/**
* The origin information of Entities.
*/
private Resource origin;
/**
* used to resolve '{prefix}:{local-name}' used within the engines configuration
*/
@Reference(cardinality=ReferenceCardinality.OPTIONAL_UNARY)
private NamespacePrefixService prefixService;
/**
* Holds the FST configuration parsed to the engine
*/
private LanguageConfiguration fstConfig;
/**
* The configured fstFolder. NOTE that the actual folder is determined in the
* {@link #updateEngineRegistration(ServiceReference, SolrServer)} based on
* the SolrCore.
*/
private String fstFolder;
/**
* The {@link LinkingModeEnum linking mode}
*/
private LinkingModeEnum linkingMode;
/**
* Holds the {@link TextProcessingConfig} parsed from the configuration of
* this engine. <p>
* NOTE: that by far not all configurations are supported. See documentation
* for details
*/
private TextProcessingConfig textProcessingConfig;
/**
* Holds the {@link EntityLinkerConfig} parsed from the configuration of
* this engine. <p>
* NOTE: that by far not all configurations are supported. See documentation
* for details
*/
private EntityLinkerConfig entityLinkerConfig;
//SolrCore related fields
/**
* The reference to the configured SolrIndex parsed from the {@link #SOLR_CORE}
* configuration
* @see #SOLR_CORE
*/
private IndexReference indexReference;
/**
* The OSGI {@link ServiceTracker} used to track the configured Solr core
*/
private RegisteredSolrServerTracker solrServerTracker;
/**
* The ServiceReference of the {@link SolrCore} registered as OSGI service
* and tracked by the {@link #solrServerTracker}. This is used to
* check if the SolrCore has changed on OSGI service events
*/
private ServiceReference solrServerReference;
/**
* The {@link SolrCore} used for FST linking. This is set based on OSGI
* events provided by the {@link #solrServerTracker}
*/
private SolrCore solrCore;
/**
* Holds the OSGI service registration for the {@link FstLinkingEngine}
*/
private ServiceRegistration engineRegistration;
/**
* Holds the metadata registered with the {@link #engineRegistration}
*/
Dictionary<String,Object> engineMetadata;
/**
* The bundle context for this component. Also used to track dependencies
* and register the {@link #engineRegistration}
*/
protected BundleContext bundleContext;
/**
* Thread pool used for the runtime creation of FST modles.
* @see #FST_THREAD_POOL_SIZE
* @see #DEFAULT_FST_THREAD_POOL_SIZE
*/
private ExecutorService fstCreatorService;
/**
* The field name in the configured Solr index holding type information for
* Entities.
*/
private String solrTypeField;
/**
* The field name in the configured Solr index holding ranking information for
* Entities. values are expected to be floating point numbers.
*/
private String solrRankingField;
/**
* The fieldEncoding used by the configured SolrIndex for field names.
* @see FieldEncodingEnum
*/
private FieldEncodingEnum fieldEncoding;
private IndexConfiguration indexConfig;
private Boolean skipAltTokensConfig;
/**
* The size of the EntityCache ( <code>0</code> ... means deactivated)
*/
private int entityCacheSize;
private Map<String,Set<String>> nerTypeMappings;
/**
* Default constructor as used by OSGI. This expects that
* {@link #activate(ComponentContext)} is called before usage
*/
public FstLinkingEngineComponent() {
}
@Activate
@SuppressWarnings("unchecked")
protected void activate(ComponentContext ctx) throws ConfigurationException {
log.info("activate {}", getClass().getSimpleName());
log.debug(" - instance: {}", this);
log.debug(" - config: {}", ctx.getProperties());
this.bundleContext = ctx.getBundleContext();
//(0) parse the linking mode
applyConfig(parseLinkingMode(ctx), ctx.getProperties(), prefixService);
}
/**
* Parses the LinkingMode from the {@link #LINKING_MODE} property. This
* allows to use this component to configure FST linking engines for any
* supported LinkingMode. If the {@link #LINKING_MODE} is not present the
* default {@link LinkingModeEnum#LINKABLE_TOKEN} is returned. <p>
* <b>NOTE:</b>Typically
* users will want to use the <ul>
* <li>{@link PlainFstLinkingComponnet} to configure FST engines for the
* {@link LinkingModeEnum#PLAIN}
* <li> {@link NamedEntityFstLinkingComponnet} to configure FST engines for
* the {@link LinkingModeEnum#NER}
* </ul>
* but is is also fine to explicitly specify a {@link #LINKING_MODE} linking
* mode when using this component to configure the FST linking engine.
* @param ctx the parsed component context
* @return the parsed {@link LinkingModeEnum}
* @throws ConfigurationException
*/
private LinkingModeEnum parseLinkingMode(ComponentContext ctx) throws ConfigurationException {
Object value = ctx.getProperties().get(LINKING_MODE);
LinkingModeEnum linkingMode;
if(value == null || StringUtils.isBlank(value.toString())){
linkingMode = LinkingModeEnum.LINKABLE_TOKEN;
} else {
try {
linkingMode = LinkingModeEnum.valueOf(value.toString());
} catch(IllegalArgumentException e){
throw new ConfigurationException(LINKING_MODE, "The parsed value '"
+value+"' (type: "+value.getClass().getName()+") is not a member "
+ "of the enum (members: "+ Arrays.toString(LinkingModeEnum.values())
+ ")!",e);
}
}
return linkingMode;
}
/**
* Called by {@link #activate(ComponentContext)},
* {@link PlainFstLinkingComponnet#activate(ComponentContext)} and
* {@link NamedEntityFstLinkingComponnet#activate(ComponentContext)} to
* apply the parsed {@link ComponentContext#getProperties()}. The
* {@link LinkingModeEnum linking mode} is parsed separately as OSGI does not
* allow to modify the parsed config and sup-classes do need to override
* the linking mode.
* @param linkingMode the linking mode
* @param properties
* @throws ConfigurationException
*/
protected void applyConfig(LinkingModeEnum linkingMode, Dictionary<String,Object> properties, NamespacePrefixService prefixService)
throws ConfigurationException {
//(0) The name for the Enhancement Engine and the basic metadata
Object value = properties.get(PROPERTY_NAME);
if(value == null || value.toString().isEmpty()){
throw new ConfigurationException(PROPERTY_NAME, "The EnhancementEngine name MUST BE configured!");
} else {
this.engineName = value.toString();
}
log.info(" - engine name: {}", engineName);
engineMetadata = new Hashtable<String,Object>();
engineMetadata.put(PROPERTY_NAME, this.engineName);
value = properties.get(Constants.SERVICE_RANKING);
engineMetadata.put(Constants.SERVICE_RANKING, value == null ? Integer.valueOf(0) : value);
//(0) set the linking mode
this.linkingMode = linkingMode;
log.info(" - linking mode: {}", linkingMode);
//(1) parse the TextProcessing configuration
//TODO: decide if we should use the TextProcessingConfig for this engine
textProcessingConfig = TextProcessingConfig.createInstance(properties);
//change default for EntityLinkerConfig.MIN_FOUND_TOKENS
value = properties.get(EntityLinkerConfig.MIN_FOUND_TOKENS);
entityLinkerConfig = EntityLinkerConfig.createInstance(properties, prefixService);
if(value == null){ //no MIN_FOUND_TOKENS config present
//manually set the default to the value used by this engine
entityLinkerConfig.setMinFoundTokens(FST_DEFAULT_MIN_FOUND_TOKENS);
}
//(2) parse the configured IndexReference
value = properties.get(SOLR_CORE);
if(value == null){
throw new ConfigurationException(SOLR_CORE, "Missing required configuration of the SolrCore");
} else {
indexReference = IndexReference.parse(value.toString());
}
value = properties.get(IndexConfiguration.FIELD_ENCODING);
if(value == null){
throw new ConfigurationException(IndexConfiguration.FIELD_ENCODING, "Missing required configuration of the Solr Field Encoding");
} else {
try {
fieldEncoding = FieldEncodingEnum.valueOf(value.toString().trim());
} catch (IllegalArgumentException e) {
throw new ConfigurationException(IndexConfiguration.FIELD_ENCODING, "The configured "
+ "FieldEncoding MUST BE a member of "
+ Arrays.toString(FieldEncodingEnum.values()), e);
}
}
value = properties.get(IndexConfiguration.SKIP_ALT_TOKENS);
if(value instanceof Boolean){
skipAltTokensConfig = ((Boolean)value);
} else if(value != null){
skipAltTokensConfig = Boolean.valueOf(value.toString());
} // else no config -> will use the default
//(4) parse Origin information
value = properties.get(ORIGIN);
if(value instanceof Resource){
origin = (Resource)origin;
} else if (value instanceof String){
try {
URI originUri = new URI((String)value);
if(originUri.isAbsolute()){
origin = new UriRef((String)value);
} else {
origin = new PlainLiteralImpl((String)value);
}
} catch(URISyntaxException e){
origin = new PlainLiteralImpl((String)value);
}
log.info(" - origin: {}", origin);
} else if(value != null){
log.warn("Values of the {} property MUST BE of type Resource or String "
+ "(parsed: {} (type:{}))", new Object[]{ORIGIN,value,value.getClass()});
} //else no ORIGIN information provided
//(5) init the FST configuration
//We can create the default configuration only here, as it depends on the
//name of the solrIndex
String defaultConfig = "*;"
+ IndexConfiguration.PARAM_FST + "=" + indexReference.getIndex() + ";"
+ IndexConfiguration.PARAM_FIELD + "=" + IndexConfiguration.DEFAULT_FIELD;
fstConfig = new LanguageConfiguration(IndexConfiguration.FST_CONFIG, new String[]{defaultConfig});
//now set the actual configuration parsed to the engine
value = properties.get(IndexConfiguration.FST_CONFIG);
if(value != null && !StringUtils.isBlank(value.toString())){
fstConfig.setConfiguration(properties);
} //else keep the default
value = properties.get(IndexConfiguration.FST_FOLDER);
if(value instanceof String){
this.fstFolder = ((String)value).trim();
if(this.fstFolder.isEmpty()){
this.fstFolder = null;
}
} else if(value == null){
this.fstFolder = null;
} else {
throw new ConfigurationException(IndexConfiguration.FST_FOLDER, "Values MUST BE of type String"
+ "(found: "+value.getClass().getName()+")!");
}
//(6) Create the ThreadPool used for the runtime creation of FST models
value = properties.get(FST_THREAD_POOL_SIZE);
int tpSize;
if(value instanceof Number){
tpSize = ((Number)value).intValue();
} else if(value != null){
try {
tpSize = Integer.parseInt(value.toString());
} catch (NumberFormatException e) {
throw new ConfigurationException(FST_THREAD_POOL_SIZE,
"Unable to parse the integer FST thread pool size from the "
+ "configured "+value.getClass().getSimpleName()+" '"
+ value+"'!",e);
}
} else {
tpSize = -1;
}
if(tpSize <= 0){ //if configured value <= 0 we use the default
tpSize = DEFAULT_FST_THREAD_POOL_SIZE;
}
//build a ThreadFactoryBuilder for low priority daemon threads that
//do use a meaningful name
ThreadFactoryBuilder tfBuilder = new ThreadFactoryBuilder();
tfBuilder.setDaemon(true);//should be stopped if the VM closes
tfBuilder.setPriority(Thread.MIN_PRIORITY); //low priority
tfBuilder.setNameFormat(engineName+"-FstRuntimeCreation-thread-%d");
if(fstCreatorService != null && !fstCreatorService.isTerminated()){
//NOTE: We can not call terminateNow, because to interrupt threads
// here would also close FileChannels used by the SolrCore
// and produce java.nio.channels.ClosedByInterruptException
// exceptions followed by java.nio.channels.ClosedChannelException
// on following calls to affected files of the SolrIndex.
//Because of that we just log a warning and let uncompleted tasks
//complete!
log.warn("some items in a previouse FST Runtime Creation Threadpool have "
+ "still not finished!");
}
fstCreatorService = Executors.newFixedThreadPool(tpSize,tfBuilder.build());
//(7) Parse the EntityCache config
int entityCacheSize;
value = properties.get(ENTITY_CACHE_SIZE);
if(value instanceof Number){
entityCacheSize = ((Number)value).intValue();
} else if (value != null){
try {
entityCacheSize = Integer.parseInt(value.toString());
} catch (NumberFormatException e) {
throw new ConfigurationException(ENTITY_CACHE_SIZE,
"Unable to parse the integer EntityCacheSize from the "
+ "configured "+value.getClass().getSimpleName()+" '"
+ value+"'!",e);
}
} else {
entityCacheSize = -1;
}
if(entityCacheSize == 0){
log.info(" ... EntityCache deactivated");
this.entityCacheSize = entityCacheSize;
} else {
this.entityCacheSize = entityCacheSize < 0 ? DEFAULT_ENTITY_CACHE_SIZE : entityCacheSize;
log.info(" ... EntityCache enabled (size: {})",this.entityCacheSize);
}
//(8) parse the Entity type field
value = properties.get(IndexConfiguration.SOLR_TYPE_FIELD);
if(value == null || StringUtils.isBlank(value.toString())){
solrTypeField = null;
} else {
solrTypeField = value.toString().trim();
}
//(9) parse the Entity Ranking field
value = properties.get(IndexConfiguration.SOLR_RANKING_FIELD);
if(value == null){
solrRankingField = null;
} else {
solrRankingField = value.toString().trim();
}
//(10) parse the NamedEntity type mappings (if linkingMode = NER)
if(linkingMode == LinkingModeEnum.NER){
nerTypeMappings = new HashMap<String,Set<String>>();
value = properties.get(NAMED_ENTITY_TYPE_MAPPINGS);
if(value instanceof String[]){ //support array
value = Arrays.asList((String[])value);
} else if(value instanceof String) { //single value
value = Collections.singleton(value);
}
if(value instanceof Collection<?>){ //and collection
log.info(" - process Named Entity Type Mappings (used by LinkingMode: {})",linkingMode);
configs : for(Object o : (Iterable<?>)value){
if(o != null){
StringBuilder usage = new StringBuilder("useage: ");
usage.append("'{namedEntity-tag-or-uri} > {entityType-1}[,{entityType-n}]'");
String[] config = o.toString().split(">");
String namedEntityType = config[0].trim();
if(namedEntityType.isEmpty()){
log.warn("Invalid Type Mapping Config '{}': Missing namedEntityType ({}) -> ignore this config",
o,usage);
continue configs;
}
if(NamespaceMappingUtils.getPrefix(namedEntityType) != null){
namedEntityType = NamespaceMappingUtils.getConfiguredUri(
prefixService, NAMED_ENTITY_TYPE_MAPPINGS,namedEntityType);
}
if(config.length < 2 || config[1].isEmpty()){
log.warn("Invalid Type Mapping Config '{}': Missing dc:type URI '{}' ({}) -> ignore this config",
o,usage);
continue configs;
}
String entityTypes = config[1].trim();
if(config.length > 2){
log.warn("Configuration after 2nd '>' gets ignored. Will use mapping '{} > {}' from config {}",
new Object[]{namedEntityType,entityTypes,o});
}
Set<String> types = nerTypeMappings.get(namedEntityType);
if(types == null){ //add new element to the mapping
types = new HashSet<String>();
nerTypeMappings.put(namedEntityType, types);
}
for(String entityType : entityTypes.split(";")){
entityType = entityType.trim();
if(!entityType.isEmpty()){
String typeUri;
if("*".equals(entityType)){
typeUri = null; //null is used as wildcard
} else {
typeUri = NamespaceMappingUtils.getConfiguredUri(
prefixService, NAMED_ENTITY_TYPE_MAPPINGS, entityType);
}
log.info(" - add {} > {}", namedEntityType, typeUri);
types.add(typeUri);
} //else ignore empty mapping
}
}
}
} else { //no mappings defined ... set wildcard mapping
log.info(" - No Named Entity type mappings configured. Will use wildcard mappings");
nerTypeMappings = Collections.singletonMap(null, Collections.<String>singleton(null));
}
}
//(11) start tracking the SolrCore
try {
solrServerTracker = new RegisteredSolrServerTracker(
bundleContext, indexReference, null){
@Override
public void removedService(ServiceReference reference, Object service) {
log.info(" ... SolrCore for {} was removed!", reference);
//try to get an other serviceReference from the tracker
if(reference.equals(FstLinkingEngineComponent.this.solrServerReference)){
updateEngineRegistration(solrServerTracker.getServiceReference(), null);
} else {
log.info(" - removed SolrCore was not used for FST linking");
}
super.removedService(reference, service);
}
@Override
public void modifiedService(ServiceReference reference, Object service) {
log.info(" ... SolrCore for {} was updated!", indexReference);
updateEngineRegistration(solrServerTracker.getServiceReference(), null);
super.modifiedService(reference, service);
}
@Override
public SolrServer addingService(ServiceReference reference) {
SolrServer server = super.addingService(reference);
if(solrCore != null){
log.info("Multiple SolrCores for name {}! Will update engine "
+ "with the newly added {}!", new Object[]{solrCore.getName(),
indexReference, reference});
}
updateEngineRegistration(reference, server);
return server;
}
};
} catch (InvalidSyntaxException e) {
throw new ConfigurationException(SOLR_CORE, "parsed SolrCore name '"
+ value.toString()+"' is invalid (expected: '[{server-name}:]{indexname}'");
}
try {
solrServerTracker.open();
} catch(RuntimeException e){
//FIX for STANBOL-1416 (see https://issues.apache.org/jira/browse/STANBOL-1416)
//If an available SolrCore can not be correctly initialized we will
//get the exception here. In this case we want this component to be
//activated and waiting for further service events. Because of that
//we catch here the exception.
log.debug("Error while processing existing SolrCore Service during "
+ "opening SolrServiceTracker ... waiting for further service"
+ "Events", e);
}
}
/**
* This will be called on each <ul>
* <li>update to the Component configuration (activate, deactivate)
* <li>updates on the SolrCore
* </ul>
* on any detected change it will update the registered EnhancementEngine.<p>
* This also initialises the FST configuration.
* @param reference the ServiceRefernece for the SolrServer or <code>null</code>
* in case the service is no longer available.
* @param server the SolrServer (or <code>null</code>
*/
protected void updateEngineRegistration(ServiceReference reference, SolrServer server) {
log.info(" ... updateEngineRegistration for {}: {}",getClass().getSimpleName(), engineName);
if(reference != null && server == null){
server = solrServerTracker.getService(reference);
}
if(reference == null && this.indexReference == null){
//unregisterEngine(); //unregister existing
return; //and return
}
BundleContext bundleContext = this.bundleContext;
//We need to keep the old configuration vars for unregistering the
//current engine (see #unregisterEngine(..) method)
final ServiceRegistration<?> oldEngineRegistration = this.engineRegistration;
final SolrCore oldSolrCore = this.solrCore;
final IndexConfiguration oldIndexConfig = this.indexConfig;
SolrCore core;
IndexConfiguration indexConfig; // the indexConfig build by this call
synchronized (this) { //init one after the other in case of multiple calls
try { //try to init - finally unregisterEngine
//reset the old field values
this.engineRegistration = null;
this.indexConfig = null;
this.solrCore = null;
//now we can update the engines configuration
if(bundleContext == null){ //already deactivated
return; //NOTE: unregistering is done in finally block
}
core = getSolrCore(server);
if(core == null){ //no SolrCore
log.info(" - SolrCore {} present", oldSolrCore == null ?
"not yet" : "no longer");
return; //NOTE: unregistering is done in finally block
} else { //- we do have a SolrCore
log.info(" - solrCore (name: {} | indexDir: {}",
core.getName(), core.getIndexDir());
}
//File fstDir = new File(dataDir,"fst");
//now collect the FST configuration
indexConfig = new IndexConfiguration(fstConfig, core, fieldEncoding, entityLinkerConfig.getDefaultLanguage());
indexConfig.setTypeField(solrTypeField);
indexConfig.setRankingField(solrRankingField);
//set fields parsed in the activate method
indexConfig.setExecutorService(fstCreatorService);
indexConfig.setRedirectField(null);//TODO add support
indexConfig.setOrigin(origin);
//NOTE: the FST cofnig is processed even if the SolrCore has not changed
// because their might be config changes and/or new FST files in the
// FST directory of the SolrCore.
indexConfig.setFstDirectory(getFstDirectory(core, fstFolder));
//set the DocumentCacheFactory
if(entityCacheSize > 0){
indexConfig.setEntityCacheManager(new FastLRUCacheManager(entityCacheSize));
} //else no entityCache is used
if(skipAltTokensConfig != null){
indexConfig.setSkipAltTokens(skipAltTokensConfig);
}
//activate the index configuration
try {
//this will init the FST directory if necessary so we might run
//into IOExceptions
indexConfig.activate();
} catch (IOException e) {
throw new RuntimeException("Unable to activate Index for FST Linking Engine '"
+ engineName +"' (solrCore: "+ core.getName() + ", instanceDir: "
+ core.getCoreDescriptor().getInstanceDir() +")!", e);
}
if(log.isInfoEnabled()){ //log the initialised languages
Set<String> langSet = new HashSet<String>(indexConfig.getCorpusLanguages());
if(langSet.remove(null)){ //replace the null for the default language
langSet.add(""); //with an empty string
}
String[] langArray = langSet.toArray(new String[langSet.size()]);
Arrays.sort(langArray,String.CASE_INSENSITIVE_ORDER);
log.info(" ... initialised FST corpora for languages {}",
Arrays.toString(langArray));
}
//check if we need to create some FST files
for(CorpusInfo fstInfo : indexConfig.getCorpora()){
//check if the fst does not exist and the fstInfo allows creation
if(!fstInfo.fst.exists() && fstInfo.allowCreation){
//create a task on the FST corpus creation service
fstInfo.corpusLock.writeLock().lock();
try {
Future<TaggerFstCorpus> enqueued = fstCreatorService.submit(new CorpusCreationTask(indexConfig, fstInfo));
fstInfo.enqueued(enqueued);
} finally {
fstInfo.corpusLock.writeLock().unlock();
}
}
}
//set the newly configured instances to the fields
this.indexConfig = indexConfig;
this.solrServerReference = reference;
this.solrCore = core;
//create the new FST linking engine instance
FstLinkingEngine engine = new FstLinkingEngine(engineName,
linkingMode, indexConfig,
textProcessingConfig, entityLinkerConfig, nerTypeMappings);
//register it as a service
String[] services = new String [] {
EnhancementEngine.class.getName(),
ServiceProperties.class.getName()};
log.info(" ... register {}: {}", engine.getClass().getSimpleName(),engineName);
this.engineRegistration = bundleContext.registerService(services,engine, engineMetadata);
} finally {
//in any case (even an Exception) ensure that the current
//engine registration is unregistered and the currently used
//SolrCore is unregistered!
unregisterEngine(oldEngineRegistration, oldIndexConfig, oldSolrCore);
}
}
}
/**
* Resolves the directory to store the FST models based on the configured
* {@link IndexConfiguration#FST_FOLDER}. Also considering the name of the SolrServer and
* SolrCore
* @param core
* @param fstFolderConfig
* @return
*/
private File getFstDirectory(SolrCore core, String fstFolderConfig) {
StrSubstitutor substitutor = new StrSubstitutor(new SolrCoreStrLookup(
indexReference, core, bundleContext));
substitutor.setEnableSubstitutionInVariables(true);
String folderStr = substitutor.replace(fstFolderConfig);
if(folderStr.indexOf("${") > 0){
folderStr = substitutor.replace(folderStr);
}
//convert separators to the current OS
folderStr = FilenameUtils.separatorsToSystem(folderStr);
File fstDir = new File(folderStr);
if(!fstDir.isDirectory()){ //create the FST directory
try {
FileUtils.forceMkdir(fstDir);
} catch (IOException e) {
throw new IllegalStateException("Unable to create Directory for"
+ "storing the FST files at location '"+fstDir+"'.");
}
}
return fstDir;
}
/**
* unregisters the Engines service registration, closes the SolrCore and
* rests the fields. If no engine is registered this does nothing!
*/
private void unregisterEngine(ServiceRegistration<?> engineRegistration,
IndexConfiguration indexConfig, SolrCore solrCore) {
log.debug("> clean up (old) FSTLinkingEngine instance ...");
//use local copies for method calls to avoid concurrency issues
if(engineRegistration != null){
log.info(" ... unregister Lucene FSTLinkingEngine {}",engineName);
try {
engineRegistration.unregister();
} catch(IllegalStateException e) {
//already unregistered ... can be ignored
log.warn("Unexpected State: Service for FSTLinkingEngine "
+ engineName+" was already deactivated.", e);
}
} else {
log.debug(" ... no (old) engine registration present");
}
if(solrCore != null){
log.debug(" ... unregister SolrCore {}", solrCore.getName());
solrCore.close(); //decrease the reference count!!
} else {
log.debug(" ... no (old) SolrCore present");
}
//deactivate the index configuration if present
if(indexConfig != null){
log.debug(" ... deactivate (old) IndexingConfiguration");
indexConfig.deactivate();
//close the EntityCacheManager (if present
EntityCacheManager cacheManager = indexConfig.getEntityCacheManager();
if(cacheManager != null){
log.debug(" ... deactivate {}", cacheManager.getClass().getSimpleName());
cacheManager.close();
}
} else {
log.debug(" ... no (old) index config present");
}
}
/**
* Internal helper to get th SolrCore from the tracked SolrServer. This
* assumes that tracked SolrServers are of type {@link EmbeddedSolrServer}.
* @param server the SolrServer
* @return the SolrCore or <code>null</code> if <code>null</code> is parsed
* as server.
* @throws IllegalStateException if the parsed {@link SolrServer} is not an
* {@link EmbeddedSolrServer} or it does not contain the configured SolrCore
*/
private SolrCore getSolrCore(SolrServer server) {
SolrCore core;
if(server != null){
if(server instanceof EmbeddedSolrServer){
core = ((EmbeddedSolrServer)server).getCoreContainer().getCore(
indexReference.getIndex());
if(core == null){
throw new IllegalStateException("Solr CoreContainer for IndexRef '"
+ indexReference + "'is missing the expected SolrCore '"
+ indexReference.getIndex() + "' (present: "
+ ((EmbeddedSolrServer)server).getCoreContainer().getCoreNames()
+ ")!");
}
} else {
core = null;
throw new IllegalStateException("Unable to use '"
+ server.getClass().getSimpleName() + "' (indexRef: "
+indexReference+") because it is not an EmbeddedSolrServer!");
}
} else {
core = null;
}
return core;
}
/**
* Deactivates this components.
*/
@Deactivate
protected void deactivate(ComponentContext ctx) {
log.info(" ... deactivate {}: {} (CompInst: {})",new Object[] {
getClass().getSimpleName(),
engineName, ctx.getComponentInstance()});
log.debug(" - instance: {}", this);
log.debug(" - config: {}", ctx.getProperties());
if(solrServerTracker != null){
//closing the tracker will also cause registered engines to be
//unregistered as service (see #updateEngineRegistration())
solrServerTracker.close();
solrServerTracker = null;
}
if(fstCreatorService != null){
//we MUST NOT call shutdownNow(), because this would close
//low level Solr FileChannels.
fstCreatorService.shutdown();
//do not set NULL, as we want to warn users an re-activation if old
//threads are still running.
}
indexReference = null;
engineMetadata = null;
textProcessingConfig = null;
entityLinkerConfig = null;
entityCacheSize = -1;
bundleContext = null;
skipAltTokensConfig = null;
//NOTE: just to be sure that all the engine is unregistered and to
// 100% make sure that there are no refs to unregistered SolrCores!
// this also include IndexConfiguration instances that does refer
// FST models for the closed SolrCore. Using old models will cause
// FST and SolrCore to be out of sync if a new SolrCore with updated
// data will become available
boolean unregisterFailier = false;
if(engineRegistration != null){
log.warn("Engine is still registered after deactivating Engine! Will "
+ "explicitly perform required clean-up, but please report "
+ "this as a Bug for the Lucene FST Linking Engine!");
unregisterFailier = true;
}
if(solrCore != null){
log.warn("SolrCore used for linking was not closed! Will "
+ "explicitly perform required clean-up, but please report "
+ "this as a Bug for the Lucene FST Linking Engine!");
unregisterFailier = true;
}
if(indexConfig != null){
log.warn("FST Configuration was not not reset! Will "
+ "explicitly perform required clean-up, but please report "
+ "this as a Bug for the Lucene FST Linking Engine!");
unregisterFailier = true;
}
if(unregisterFailier){
unregisterEngine(this.engineRegistration,this.indexConfig,this.solrCore);
}
this.engineRegistration = null;
this.indexConfig = null;
this.solrCore = null;
}
/**
* {@link StrSubstitutor} {@link StrLookup} implementation used for
* determining the directory for storing FST files based on the configured
* {@link IndexConfiguration#FST_FOLDER} configuration.
* @author Rupert Westenthaler
*
*/
private static class SolrCoreStrLookup extends StrLookup {
private final BundleContext bc;
private final SolrCore core;
private final IndexReference indexRef;
public SolrCoreStrLookup(IndexReference indexRef, SolrCore core, BundleContext bc) {
this.indexRef = indexRef;
this.core = core;
this.bc = bc;
}
@Override
public String lookup(String key) {
if("solr-data-dir".equals(key)){
return core.getDataDir();
} else if("solr-index-dir".equals(key)){
return core.getIndexDir();
} else if("solr-server-name".equals(key)){
return indexRef.getServer();
} else if("solr-core-name".equals(key)){
return core.getName();
} else {
return bc.getProperty(key);
}
}
}
}