/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.entitycoreference;
import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Dictionary;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.Deactivate;
import org.apache.felix.scr.annotations.Properties;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Reference;
import org.apache.felix.scr.annotations.Service;
import org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase;
import org.apache.stanbol.enhancer.engines.entitycoreference.impl.CoreferenceFinder;
import org.apache.stanbol.enhancer.engines.entitycoreference.impl.NounPhraseFilterer;
import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.Section;
import org.apache.stanbol.enhancer.nlp.model.Span;
import org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum;
import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
import org.apache.stanbol.enhancer.nlp.ner.NerTag;
import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
import org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
import org.apache.stanbol.entityhub.servicesapi.Entityhub;
import org.apache.stanbol.entityhub.servicesapi.site.SiteManager;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * This engine extracts coreferences of noun phrases in the given text which point to named entities
 * (NERs). The coreference is performed by matching several of the named entity's dbpedia/yago properties
 * against the tokens of the noun phrase.
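 * <p>
 * For example, given the text "Microsoft announced a new product. The software company also said ...",
 * the noun phrase "The software company" can be annotated as a coreference of the "Microsoft" named
 * entity. This is an illustrative example; the actual matches depend on the entity's properties in the
 * configured referenced site.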
*
 * TODO - detect possessive coreferences such as "Germany's prime minister".
 * TODO - detect products and their developer, such as "iPhone 7" and "Apple's new device".
 * TODO - provide a configuration option which also allows coreferencing of one-word noun phrases based
 * solely on a comparison with the entity class type.
*
* @author Cristian Petroaca
*
*/
@Component(immediate = true, metatype = true)
@Service(value = EnhancementEngine.class)
@Properties(value = {
@Property(name = EnhancementEngine.PROPERTY_NAME, value = "entity-coreference"),
@Property(name = EntityCoReferenceEngine.CONFIG_LANGUAGES, value = "en"),
@Property(name = EntityCoReferenceEngine.REFERENCED_SITE_ID, value = "dbpedia"),
@Property(name = EntityCoReferenceEngine.MAX_DISTANCE, intValue = EntityCoReferenceEngine.MAX_DISTANCE_DEFAULT_VALUE)})
public class EntityCoReferenceEngine extends AbstractEnhancementEngine<RuntimeException,RuntimeException>
implements EnhancementEngine, ServiceProperties {
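    /**
     * Runs in the post-processing phase so that the NER and phrase (chunk) annotations produced by
     * earlier engines are already available.
     */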
private static final Integer ENGINE_ORDERING = ServiceProperties.ORDERING_POST_PROCESSING + 91;
/**
     * Language configuration. Takes a list of ISO language codes of supported languages. Currently only
     * the languages given as the default value are supported.
*/
protected static final String CONFIG_LANGUAGES = "enhancer.engine.entitycoreference.languages";
/**
* Referenced site configuration. Defaults to dbpedia.
*/
protected static final String REFERENCED_SITE_ID = "enhancer.engine.entitycoreference.referencedSiteId";
/**
     * Maximum sentence distance between the NER and the noun phrase which mentions it. A value of -1
     * means no distance constraint.
*/
protected static final String MAX_DISTANCE = "enhancer.engine.entitycoreference.maxDistance";
protected static final int MAX_DISTANCE_DEFAULT_VALUE = 1;
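    /** Value signaling that no sentence distance constraint should be applied. */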
public static final int MAX_DISTANCE_NO_CONSTRAINT = -1;
private final Logger log = LoggerFactory.getLogger(EntityCoReferenceEngine.class);
/**
     * Service of the Entityhub that manages all the active referenced Sites. This service is used to look
     * up the configured referenced Site when we need to enhance a content item.
*/
@Reference
protected SiteManager siteManager;
/**
     * Used to look up Entities if the {@link #REFERENCED_SITE_ID} property is set to "entityhub" or
     * "local".
*/
@Reference
protected Entityhub entityhub;
/**
* Specialized class which filters out bad noun phrases based on the language.
*/
private NounPhraseFilterer nounPhraseFilterer;
/**
* Performs the logic needed to find corefs based on the NERs and noun phrases in the text.
*/
private CoreferenceFinder corefFinder;
@SuppressWarnings("unchecked")
@Activate
protected void activate(ComponentContext ctx) throws ConfigurationException {
super.activate(ctx);
Dictionary<String,Object> config = ctx.getProperties();
/* Step 1 - initialize the {@link NounPhraseFilterer} with the language config */
String languages = (String) config.get(CONFIG_LANGUAGES);
if (languages == null || languages.isEmpty()) {
throw new ConfigurationException(CONFIG_LANGUAGES,
"The Languages Config is a required Parameter and MUST NOT be NULL or an empty String!");
}
nounPhraseFilterer = new NounPhraseFilterer(languages.split(","));
/* Step 2 - initialize the {@link CoreferenceFinder} */
String referencedSiteID = null;
Object referencedSiteIDfromConfig = config.get(REFERENCED_SITE_ID);
if (referencedSiteIDfromConfig == null) {
throw new ConfigurationException(REFERENCED_SITE_ID,
"The ID of the Referenced Site is a required Parameter and MUST NOT be NULL!");
}
referencedSiteID = referencedSiteIDfromConfig.toString();
if (referencedSiteID.isEmpty()) {
throw new ConfigurationException(REFERENCED_SITE_ID,
"The ID of the Referenced Site is a required Parameter and MUST NOT be an empty String!");
}
if (Entityhub.ENTITYHUB_IDS.contains(referencedSiteID.toLowerCase())) {
log.debug("Init NamedEntityTaggingEngine instance for the Entityhub");
referencedSiteID = null;
}
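        /*
         * Step 3 - read the maximum sentence distance. Depending on how the OSGi configuration was
         * provided, the value may arrive as a Number or as a String, so both cases are handled.
         */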
int maxDistance;
Object maxDistanceFromConfig = config.get(MAX_DISTANCE);
if (maxDistanceFromConfig == null) {
maxDistance = MAX_DISTANCE_DEFAULT_VALUE;
} else if (maxDistanceFromConfig instanceof Number) {
maxDistance = ((Number) maxDistanceFromConfig).intValue();
} else {
try {
maxDistance = Integer.parseInt(maxDistanceFromConfig.toString());
} catch (NumberFormatException nfe) {
throw new ConfigurationException(MAX_DISTANCE, "The Max Distance parameter must be a number");
}
}
if (maxDistance < -1) {
throw new ConfigurationException(MAX_DISTANCE,
"The Max Distance parameter must not be smaller than -1");
}
corefFinder = new CoreferenceFinder(languages.split(","), siteManager, entityhub, referencedSiteID,
maxDistance);
log.info("activate {}[name:{}]", getClass().getSimpleName(), getName());
}
@Override
public Map<String,Object> getServiceProperties() {
return Collections.unmodifiableMap(Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING,
(Object) ENGINE_ORDERING));
}
@Override
public int canEnhance(ContentItem ci) throws EngineException {
String language = getLanguage(this, ci, false);
if (language == null) {
log.debug("Engine {} ignores ContentItem {} becuase language {} is not detected.",
new Object[] {getName(), ci.getUri(), language});
return CANNOT_ENHANCE;
}
if (!nounPhraseFilterer.supportsLanguage(language)) {
log.debug("Engine {} does not support language {}.", new Object[] {getName(), language});
return CANNOT_ENHANCE;
}
return ENHANCE_SYNCHRONOUS;
}
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
/*
* Step 1 - Build the NER list and the noun phrase list.
*
* TODO - the noun phrases need to be lemmatized.
*/
Map<Integer,List<Span>> ners = new HashMap<Integer,List<Span>>();
List<NounPhrase> nounPhrases = new ArrayList<NounPhrase>();
extractNersAndNounPhrases(ci, ners, nounPhrases);
/*
* If there are no NERs to reference there's nothing to do but exit.
*/
        if (ners.isEmpty()) {
log.info("Did not find any NERs for which to do the coreferencing");
return;
}
/*
* Step 2 - Filter out bad noun phrases.
*/
String language = getLanguage(this, ci, false);
if (language == null) {
log.info("Could not detect the language of the text");
return;
}
nounPhraseFilterer.filter(nounPhrases, language);
/*
* If there are no good noun phrases there's nothing to do but exit.
*/
        if (nounPhrases.isEmpty()) {
log.info("Did not find any noun phrases with which to do the coreferencing");
return;
}
        /*
         * Step 3 - Extract corefs and write them as {@link NlpAnnotations#COREF_ANNOTATION}s in the
         * {@link Span}s.
         */
corefFinder.extractCorefs(ners, nounPhrases, language);
}
@Deactivate
protected void deactivate(ComponentContext ctx) {
log.info("deactivate {}[name:{}]", getClass().getSimpleName(), getName());
nounPhraseFilterer = null;
corefFinder = null;
super.deactivate(ctx);
}
    /**
     * Extracts the NERs and the noun phrases from the given text and puts them into the given
     * collections.
     *
     * @param ci
     *            the {@link ContentItem} whose {@link AnalysedText} is processed
     * @param ners
     *            output map from sentence number to the NER {@link Span}s found in that sentence
     * @param nounPhrases
     *            output list collecting the detected {@link NounPhrase}s
     */
private void extractNersAndNounPhrases(ContentItem ci,
Map<Integer,List<Span>> ners,
List<NounPhrase> nounPhrases) {
AnalysedText at = NlpEngineHelper.getAnalysedText(this, ci, true);
Iterator<? extends Section> sections = at.getSentences();
if (!sections.hasNext()) { // process as single sentence
sections = Collections.singleton(at).iterator();
}
int sentenceCnt = 0;
while (sections.hasNext()) {
sentenceCnt++;
Section section = sections.next();
List<NounPhrase> sectionNounPhrases = new ArrayList<NounPhrase>();
List<Span> sectionNers = new ArrayList<Span>();
Iterator<Span> chunks = section.getEnclosed(EnumSet.of(SpanTypeEnum.Chunk));
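            /*
             * A chunk can carry both a NER annotation and a phrase annotation, so both are checked
             * independently.
             */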
while (chunks.hasNext()) {
Span chunk = chunks.next();
Value<NerTag> ner = chunk.getAnnotation(NlpAnnotations.NER_ANNOTATION);
if (ner != null) {
sectionNers.add(chunk);
}
Value<PhraseTag> phrase = chunk.getAnnotation(NlpAnnotations.PHRASE_ANNOTATION);
if (phrase != null && phrase.value().getCategory() == LexicalCategory.Noun) {
sectionNounPhrases.add(new NounPhrase(chunk, sentenceCnt));
}
}
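            /*
             * Attach to each noun phrase the tokens and NER chunks it encloses; the CoreferenceFinder
             * later matches these tokens against the entity's properties.
             */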
for (NounPhrase nounPhrase : sectionNounPhrases) {
Iterator<Span> tokens = section.getEnclosed(EnumSet.of(SpanTypeEnum.Token));
while (tokens.hasNext()) {
Span token = tokens.next();
if (nounPhrase.containsSpan(token)) {
nounPhrase.addToken(token);
}
}
for (Span sectionNer : sectionNers) {
if (nounPhrase.containsSpan(sectionNer)) {
nounPhrase.addNerChunk(sectionNer);
}
}
}
nounPhrases.addAll(sectionNounPhrases);
if (!sectionNers.isEmpty()) {
ners.put(sentenceCnt, sectionNers);
}
}
}
}