blob: e8c447eabf2d829c40d281a9e114bd7a195fb000 [file] [log] [blame]
/*
* Copyright 2012, FORMCEPT [http://www.formcept.com]
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engine.disambiguation.mlt;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDFS_LABEL;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Dictionary;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.NavigableMap;
import java.util.Set;
import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
import org.apache.commons.lang.StringUtils;
import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.Deactivate;
import org.apache.felix.scr.annotations.Properties;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Reference;
import org.apache.felix.scr.annotations.Service;
import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum;
import org.apache.stanbol.entityhub.servicesapi.defaults.SpecialFieldEnum;
import org.apache.stanbol.entityhub.servicesapi.model.Entity;
import org.apache.stanbol.entityhub.servicesapi.model.Representation;
import org.apache.stanbol.entityhub.servicesapi.model.Text;
import org.apache.stanbol.entityhub.servicesapi.model.rdf.RdfResourceEnum;
import org.apache.stanbol.entityhub.servicesapi.query.Constraint;
import org.apache.stanbol.entityhub.servicesapi.query.FieldQuery;
import org.apache.stanbol.entityhub.servicesapi.query.QueryResultList;
import org.apache.stanbol.entityhub.servicesapi.query.SimilarityConstraint;
import org.apache.stanbol.entityhub.servicesapi.query.TextConstraint;
import org.apache.stanbol.entityhub.servicesapi.site.Site;
import org.apache.stanbol.entityhub.servicesapi.site.SiteException;
import org.apache.stanbol.entityhub.servicesapi.site.SiteManager;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Disambiguation Engine using Entityhub {@link SimilarityConstraint}s to
* disambiguate between existing fise:EntityAnnotations for fise:TextAnnotations.
* <p>
* <b>TODOs</b>:<ul>
* <li> Configurations: currently all configurations are set to the defaults
* <li> Context: test and improve different ways to determine the context
* used for disambiguation.
* <li> URI based similarity: currently only full text similarity is used.
* However it would also be possible to use the
* {@link SpecialFieldEnum#references} field to disambiguate based on
* URIs of already suggested Entities.
* </ul>
* @author Kritarth Anand
* @author Rupert Westenthaler
*/
@Component(immediate = true, metatype = true)
@Service
@Properties(value = {
@Property(name = EnhancementEngine.PROPERTY_NAME, value = "disambiguation-mlt")
})
public class DisambiguatorEngine extends AbstractEnhancementEngine<IOException,RuntimeException> implements
EnhancementEngine, ServiceProperties {
/**
 * Logger of this engine. Declared <code>final</code> because it is assigned
 * once and shared by all instances.
 */
private static final Logger log = LoggerFactory.getLogger(DisambiguatorEngine.class);
/**
 * Service URL. Exposed via {@link #getServiceURL()}; the code that would
 * assign it from the component configuration is currently commented out.
 */
private String serviceURL;
/**
 * The default value for the execution order of this Engine. Currently set to
 * {@link ServiceProperties#ORDERING_POST_PROCESSING} - 90.
 * <p>
 * NOTE(review): earlier documentation of this constant stated "+ 90" and
 * claimed that this makes the engine run as one of the first engines of the
 * post-processing phase. The code subtracts 90 - confirm which offset is
 * actually intended.
 */
public static final Integer defaultOrder = ServiceProperties.ORDERING_POST_PROCESSING - 90;
/**
 * The plain text might be required for determining the extraction context
 */
public static final String PLAIN_TEXT_MIMETYPE = "text/plain";
/**
 * Contains the only supported mime type {@link #PLAIN_TEXT_MIMETYPE}
 */
public static final Set<String> SUPPORTED_MIMETYPES = Collections.singleton(PLAIN_TEXT_MIMETYPE);
/**
 * Used to lookup the Entityhub {@link Site} used to perform the disambiguation.
 */
@Reference
protected SiteManager siteManager;
/*
 * The following parameters describe the ratio of the
 * original fise:confidence values and the disambiguation scores contributing
 * to the final disambiguated fise:confidence
 *
 * TODO: make configurable
 */
/**
 * Default ratio for Disambiguation (2.0)
 */
public static final double DEFAULT_DISAMBIGUATION_RATIO = 2.0;
/**
 * Default ratio for the original fise:confidence of suggested entities (1.0).
 * The misspelled name is kept because the constant is public.
 */
public static final double DEFAULT_CONFIDNECE_RATIO = 1.0;
/**
 * The weight for disambiguation scores
 * <code>:= disRatio/(disRatio+confRatio)</code>
 */
private double disambiguationWeight = DEFAULT_DISAMBIGUATION_RATIO /
        (DEFAULT_DISAMBIGUATION_RATIO + DEFAULT_CONFIDNECE_RATIO);
/**
 * The weight for the original confidence scores
 * <code>:= confRatio/(disRatio+confRatio)</code>
 */
private double confidenceWeight = DEFAULT_CONFIDNECE_RATIO /
        (DEFAULT_DISAMBIGUATION_RATIO + DEFAULT_CONFIDNECE_RATIO);
/**
 * The {@link LiteralFactory} used to create typed RDF literals
 */
private final LiteralFactory literalFactory = LiteralFactory.getInstance();
/**
 * Provides the service properties of this engine: a read-only map with the
 * single entry {@link ServiceProperties#ENHANCEMENT_ENGINE_ORDERING} mapped
 * to {@link #defaultOrder}.
 *
 * @return the unmodifiable service properties
 */
@Override
public Map<String,Object> getServiceProperties() {
    Map<String,Object> ordering = Collections.singletonMap(
        ENHANCEMENT_ENGINE_ORDERING, (Object) defaultOrder);
    return Collections.unmodifiableMap(ordering);
}
/**
 * Checks whether the parsed {@link ContentItem} provides non-empty text.
 * <p>
 * The text of the main blob is read only once (the previous implementation
 * read and decoded the blob twice).
 *
 * @param ci the content item to check
 * @return {@link #CANNOT_ENHANCE} if the text is <code>null</code> or only
 * consists of whitespace; otherwise {@link #ENHANCE_SYNCHRONOUS}
 * @throws EngineException an {@link InvalidContentException} if reading the
 * text fails with an {@link IOException}
 */
@Override
public int canEnhance(ContentItem ci) throws EngineException {
    final String text;
    try {
        text = ContentItemHelper.getText(ci.getBlob());
    } catch (IOException e) {
        log.error("Failed to get the text for " + "enhancement of content: " + ci.getUri(), e);
        throw new InvalidContentException(this, ci, e);
    }
    if (text == null || text.trim().isEmpty()) {
        return CANNOT_ENHANCE;
    }
    // default enhancement is synchronous enhancement
    return ENHANCE_SYNCHRONOUS;
}
/**
 * Disambiguates the suggestions of all text annotations of the parsed
 * {@link ContentItem} that have more than one suggestion.
 * <p>
 * Processing is done in three steps:<ol>
 * <li> the disambiguation data are read from the content item while holding
 * the read lock
 * <li> for every {@link SavedEntity} with more than one suggestion a context
 * is built from the other selected texts found in a text window around the
 * mention. This context is sent as similarity query to the Entityhub
 * {@link Site} of the entity and the results are matched against the
 * existing suggestions
 * <li> the disambiguated confidence values are written back to the metadata
 * while holding the write lock
 * </ol>
 * If no plain text content is available the context stored with the
 * {@link SavedEntity} is used instead of the text window. Entities whose
 * Site is not available, or for which no context can be determined, are
 * skipped.
 *
 * @param ci the content item to process
 * @throws EngineException if the query on the Entityhub Site fails
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    String textContent = null;
    Entry<UriRef,Blob> textBlob = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
    if (textBlob != null) {
        try {
            textContent = ContentItemHelper.getText(textBlob.getValue());
        } catch (IOException e) {
            //continue without the text; the stored context is used instead
            log.warn("Unable to retieve plain text content for ContentItem "+ci.getUri(),e);
        }
    }
    MGraph graph = ci.getMetadata();
    // (1) read the data from the content item
    String contentLangauge;
    DisambiguationData disData;
    ci.getLock().readLock().lock();
    try {
        contentLangauge = EnhancementEngineHelper.getLanguage(ci);
        //NOTE (rwesten): the parsing of the information from the contentItem
        //is done by a static method of the class holding those information
        disData = DisambiguationData.createFromContentItem(ci);
    } finally {
        ci.getLock().readLock().unlock();
    }
    // (2) Disambiguate the SavedEntities
    for (SavedEntity savedEntity : disData.textAnnotations.values()) {
        if (savedEntity.getSuggestions().size() <= 1) {
            //we need not to disambiguate if only one suggestion is present
            continue;
        }
        //NOTE: the site is determined from the
        //  fise:TextAnnotation <-- dc:relation --
        //  fise:EntityAnnotation -- entityhub:ste --> "{siteName}"^^xsd:string
        //  data.
        //TODO: add configuration to include/exclude Sites by name
        Site site = siteManager.getSite(savedEntity.getSite());
        if (site == null) {
            //without the Site no query is possible: keep the original suggestions
            log.warn(" - unable to disambiguate '{}' because Entityhub Site '{}' is not available",
                savedEntity.getName(), savedEntity.getSite());
            continue;
        }
        boolean casesensitive = false; //TODO: make configurable
        //use a locale independent lower case conversion so that the query
        //does not depend on the default locale of the JVM
        String savedEntityLabel = casesensitive ?
                savedEntity.getName() :
                savedEntity.getName().toLowerCase(java.util.Locale.ROOT);
        //Determine the context used for disambiguation
        //TODO: make this configurable options
        //(0) Calculate a context based on a moving window. If the plain text
        //    is not available fall back to the context of the SavedEntity
        String window;
        if (textContent != null) {
            window = getDisambiguationContext(textContent, savedEntity.getName(),
                savedEntity.getStart(), 100);
        } else {
            window = savedEntity.getContext();
        }
        if (window == null) {
            log.debug(" - not disambiguated because no context is available!");
            continue;
        }
        log.info("Use Window: '{}' for '{}'",window,savedEntity.getName());
        //(1) The contextSelections:
        //    All other selected text within the selection context
        List<String> contextSelections = getSelectionsInContext(
            savedEntity.getName(),
            disData.allSelectedTexts,
            window);
        String disambiguationContext = unionString(false, contextSelections);
        //Possible alternatives that are currently not used:
        //(2) the EntitiesInRange(..) based context (see comment on that method)
        //(3) a combination of the selected text, the context and the other
        //    selections within the context
        //(4) TODO: disambiguate using URIs of Entities suggested for other
        //    TextAnnotations within the context.

        //make the similarity query on the Entityhub using the collected
        //information
        log.info(" - Query '{}' for {}@{} with context '{}'",
            new Object[]{site.getId(),savedEntityLabel,contentLangauge,disambiguationContext});
        if (StringUtils.isBlank(disambiguationContext)) {
            log.debug(" - not disambiguated because of empty context!");
            continue;
        }
        QueryResultList<Entity> results;
        try {
            results = query(site, savedEntityLabel, contentLangauge,
                disambiguationContext);
        } catch (SiteException e) {
            //TODO we could also try to catch those errors ...
            throw new EngineException("Unable to disambiguate Mention of '"
                + savedEntity.getName()+"' on Entityhub Site '"+
                site.getId()+"'!",e);
        }
        log.debug(" - {} results returned by query {}", results.size(), results.getQuery());
        //match the results with the suggestions
        disambiguateSuggestions(results, savedEntity);
    }
    //(3) Write back the Results of the Disambiguation process
    // NOTE (rwesten): In (2) the disambiguation results are only stored in
    //    the Suggestions. They are applied to the EnhancementStructure here.
    //    This reduces the coverage of the write lock that needs to be applied
    //    to the ContentItem.
    ci.getLock().writeLock().lock();
    try {
        applyDisambiguationResults(graph, disData);
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
/**
 * Builds and executes the disambiguation query on the parsed Site.
 * <p>
 * The label of the entity is used as text constraint on rdfs:label (if it is
 * neither <code>null</code> nor empty) and the extraction context is used as
 * similarity constraint on the full text field. At most 25 results are
 * requested.
 *
 * @param dbpediaSite the Site to query
 * @param savedEntityLabel the label used for the rdfs:label constraint
 * @param language the language of the label or <code>null</code> if unknown
 * @param extractionContext the text used for the similarity constraint
 * @return the results of the query
 * @throws SiteException on any error while querying the Site
 */
protected QueryResultList<Entity> query(Site dbpediaSite,
                                        String savedEntityLabel,
                                        String language,
                                        String extractionContext) throws SiteException {
    FieldQuery fieldQuery = dbpediaSite.getQueryFactory().createFieldQuery();
    boolean labelMissing = savedEntityLabel == null || savedEntityLabel.isEmpty();
    if (labelMissing) {
        log.warn("parsed label {} was empty or NULL. Will use Similarity constraint only!",savedEntityLabel);
    } else {
        Constraint labelConstraint = language == null
                ? new TextConstraint(savedEntityLabel, false)
                : new TextConstraint(savedEntityLabel, false, language, null);
        //TODO: what happens if a recommendation was not based on rdfs:label?
        fieldQuery.setConstraint(RDFS_LABEL.getUnicodeString(), labelConstraint);
    }
    SimilarityConstraint contextConstraint = new SimilarityConstraint(extractionContext);
    fieldQuery.setConstraint(SpecialFieldEnum.fullText.getUri(), contextConstraint);
    fieldQuery.setLimit(25);
    return dbpediaSite.findEntities(fieldQuery);
}
/*
* If for an entity the Dbpedia query results in suggestion none of which match the already present
* ambiguations, we go with the ambiguations found earlier that is the ones we have with.
*/
// NOTE (rwesten): The disambiguateSuggestions now reduces confidence
// values of Suggestions that are not within the disambiguation result
// by the #confidenceWeight. So if not a single suggestion do match with
// the disambiguation result the ambiguation is kept but the overall
// fise:confidence values are reduced by #confidenceWeight (ensured to be
// less than 1)
// protected List<Triple> unchangedConfidences(List<UriRef> subsumed,
// MGraph graph,
// List<Triple> loseConfidence) {
// for (int i = 0; i < subsumed.size(); i++) {
// UriRef uri = subsumed.get(i);
// Iterator<Triple> confidenceTriple = graph.filter(uri, ENHANCER_CONFIDENCE, null);
// while (confidenceTriple.hasNext()) {
// loseConfidence.remove(confidenceTriple.next());
// }
// }
// return loseConfidence;
// }
/**
 * Applies the disambiguation results to the suggestions of the
 * {@link SavedEntity}.<p>
 * This method modifies the state of the {@link SavedEntity#getSuggestions()}
 * <p>
 * Algorithm: combine the original confidence with the disambiguation results
 * <ul>
 * <li> ratio '{dr}:{cr}' where 'dr' is the ratio for the disambiguation
 * score and 'cr' the ratio for the original fise:confidence of a suggestion
 * (defaults {@link #DEFAULT_DISAMBIGUATION_RATIO}:{@link #DEFAULT_CONFIDNECE_RATIO})
 * <li> disambiguation weight (dw) := dr/(dr+cr) ... see #disambiguationWeight
 * <li> confidence weight (cw) := cr/(dr+cr) ... see #confidenceWeight
 * <li> confidence (c): the original confidence of a suggestion (0 if missing)
 * <li> score (s): the score of the disambiguation result
 * <li> maximum score (ms): the highest score of all results that match an
 * existing suggestion
 * <li> normalized score (ns) := s/ms (0 if ms is not positive)
 * <li> disambiguated confidence (dc) := c*cw + ns*dw
 * </ul>
 * If at least one result matches a suggestion, all suggestions that are not
 * part of the results get <code>c*cw</code> as disambiguated confidence. If
 * no result matches, the suggestions are not changed.
 * <p>
 * The maximum score is determined over all matching results before the
 * scores are normalized. Because of that the normalized score does not
 * exceed 1 regardless of the order of the results.
 *
 * @param results the results of the disambiguation request
 * @param savedEntity the saved entity to be disambiguated
 **/
protected void disambiguateSuggestions(QueryResultList<Entity> results,
                                       SavedEntity savedEntity) {
    //NOTE (rwesten) We should not score disambiguation results based on
    //     how well the labels match. Instead the confidence of the original
    //     suggestion is combined with the scores of the disambiguation
    log.info("disambiguate {}: ",savedEntity.getName());
    //(1) collect the results that match an existing suggestion
    List<Suggestion> matches = new ArrayList<Suggestion>(results.size());
    List<Float> scores = new ArrayList<Float>(results.size());
    float maxSuggestedScore = 0f;
    Iterator<Entity> guesses = results.iterator();
    while (guesses.hasNext()) {
        Entity guess = guesses.next();
        Float score = guess.getRepresentation().getFirst(
            RdfResourceEnum.resultScore.getUri(),Float.class);
        if (score == null) {
            log.warn("Missing Score for Entityhub Query Result {}!",
                guess.getId());
            continue;
        }
        Suggestion suggestion = savedEntity.getSuggestion(new UriRef(guess.getId()));
        if (suggestion == null) {
            log.info(" - not found {}",guess.getId());
            continue;
        }
        matches.add(suggestion);
        scores.add(score);
        maxSuggestedScore = Math.max(maxSuggestedScore, score);
    }
    if (matches.isEmpty()) { //keep the original results
        log.info(" - none found");
        return;
    }
    //(2) calculate the disambiguated confidence for the matches
    //TODO (rwesten) we need to find out if we should normalize based on the
    //     maximum score or the maximum score of an suggested one
    for (int i = 0; i < matches.size(); i++) {
        Suggestion suggestion = matches.get(i);
        float score = scores.get(i);
        double c = suggestion.getOriginalConfidnece() == null ? 0 :
            suggestion.getOriginalConfidnece();
        //guard against a division by zero (would result in NaN/Infinity)
        double ns = maxSuggestedScore > 0f ?
            Math.max(0f, score / maxSuggestedScore) : 0d;
        suggestion.setNormalizedDisambiguationScore(ns);
        double dc = c*confidenceWeight + ns*disambiguationWeight;
        suggestion.setDisambiguatedConfidence(dc);
        log.info(" - found {}, origConf:{}, disScore:{}, disConf:{}",
            new Object[]{suggestion.getEntityUri(),c,ns,dc});
    }
    //(3) adapt the confidence of suggestions that where not part of the
    //    disambiguation result
    for (Suggestion suggestion : savedEntity.getSuggestions()) {
        if (suggestion.getDisambiguatedConfidence() == null) {
            double c = suggestion.getOriginalConfidnece() == null ? 0 :
                suggestion.getOriginalConfidnece();
            suggestion.setDisambiguatedConfidence(c*confidenceWeight);
        }
    }
}
/*
* Checks if there is any common elements amongst the ambiguations amongst latest dbpedia query and intial
* ambiguations
*/
// NOTE (rwesten): now done as part of the disambiguateSuggestions(..)
// method.
// protected boolean intersectionCheck(List<Suggestion> matches,
// List<UriRef> subsumed,
// MGraph graph,
// String contentLangauge) {
// for (int i = 0; i < subsumed.size(); i++) {
// UriRef uri = subsumed.get(i);
//
// UriRef uri1 = EnhancementEngineHelper.getReference(graph, uri, new UriRef(NamespaceEnum.fise
// + "entity-reference"));
//
// String selectedText = EnhancementEngineHelper.getString(graph, uri, ENHANCER_ENTITY_LABEL);
//
// if (selectedText == null) {
// continue;
// }
//
// for (int j = 0; j < matches.size(); j++) {
// Suggestion suggestion = matches.get(j);
// String suggestName = suggestion.getURI();
// if (suggestName.compareToIgnoreCase(uri1.getUnicodeString()) == 0) return true;
// }
// }
// return false;
// }
// NOTE (rwesten): one MUST NOT store information of processed ContentItems
// as member variables, as one EnhancementEngine instance is
// concurrently used to process multiple ContentItems. Because
// of that member variables will have data of different
// ContentItems!
// All those data need to be hold in information that are local
// to the processing of a single ContentItem (similar to
// SavedEntity).
// NOTE moved the DisambiguationData#directoryTextAnotation
// public Map<Integer,String> directoryTextAnotation = new HashMap<Integer,String>();
//TODO: make configureable
/** Exclusive upper bound for the distance accepted by {@link #toInclude(int, int)} */
int radii = 23;
/**
 * Checks if the distance between the two parsed values is greater than
 * zero and lower than {@link #radii}.
 * @param k the first value
 * @param s the second value
 * @return <code>true</code> if <code>0 &lt; |k - s| &lt; radii</code>
 */
public boolean toInclude(int k, int s) {
    final int distance = Math.abs(k - s);
    return distance > 0 && distance < radii;
}
/**
 * Collects the contexts of all entries of the parsed map whose key is
 * accepted by {@link #toInclude(int, int)} for the parsed radius value.
 * <p>
 * TODO (rwesten): the intention of this method is unclear. Adding the
 * fise:selection-context of all entities within a range of #radii characters
 * seems not to be a great way to build a context.
 * @param map the entities by their key
 * @param radius the value the keys are compared with
 * @return the contexts of the accepted entries
 */
@Deprecated //for now until someone can answer the above question
public List<String> EntitiesInRange(NavigableMap<Integer,SavedEntity> map, int radius) {
    List<String> contexts = new ArrayList<String>();
    //TODO: reimplement using subMap of the parsed NavigableMap map
    Iterator<Entry<Integer,SavedEntity>> entries = map.entrySet().iterator();
    while (entries.hasNext()) {
        Entry<Integer,SavedEntity> current = entries.next();
        Integer position = current.getKey();
        String entityContext = current.getValue().getContext();
        if (!toInclude(position, radius)) {
            continue;
        }
        contexts.add(entityContext);
    }
    return contexts;
}
/**
 * Returns a list of all fise:selected-text values occurring in the
 * parsed context (excluding those equal - ignoring case - to the parsed
 * label, if the label is not <code>null</code>).
 * @param label The label of the current Entity. Parse <code>null</code> if
 * the current label should not be ignored (and included in the result)
 * @param allEntities The collection with all the fise:selected-text values
 * of all fise:TextAnnotations. <code>null</code> elements are ignored
 * @param context the context to search the selected texts in. If
 * <code>null</code> an empty list is returned
 * @return the selected texts contained in the context, in the iteration
 * order of the parsed collection. Never <code>null</code>
 */
protected List<String> getSelectionsInContext(String label, Collection<String> allEntities, String context) {
    List<String> allEntityString = new ArrayList<String>();
    if (context == null || allEntities == null) {
        return allEntityString; //nothing can be contained in a missing context
    }
    for (String selectedText : allEntities) {
        if (selectedText == null || !context.contains(selectedText)) {
            continue;
        }
        //a null label means that no selected text needs to be excluded
        if (label == null || selectedText.compareToIgnoreCase(label) != 0) {
            allEntityString.add(selectedText);
        }
    }
    return allEntityString;
}
/**
 * Concatenates the string representations of all entries of the parsed
 * collections. Every appended entry is followed by a single space (so a
 * non-empty result ends with a space).
 * <p>
 * <code>null</code> entries are represented as <code>"null"</code> in both
 * modes; <code>null</code> collections are ignored.
 * @param unique if <code>true</code> an entry is only appended the first
 * time its string representation is encountered
 * @param lists the collections with the entries to concatenate
 * @return the concatenated entries; an empty string if there are none
 */
public String unionString(boolean unique,Collection<?>...lists) {
    StringBuilder union = new StringBuilder();
    if (lists == null) {
        return union.toString();
    }
    Set<String> added = new HashSet<String>();
    for (Collection<?> list : lists) {
        if (list == null) {
            continue;
        }
        for (Object entry : list) {
            //String.valueOf(..) is null safe and matches what
            //StringBuilder#append(Object) writes for the entry
            String value = String.valueOf(entry);
            if (!unique || added.add(value)) {
                union.append(value).append(' ');
            }
        }
    }
    return union.toString();
}
/*
* Finds the values that lie in the intersection of both sets of disambiguations (the ones initially
* suggested and the ones from DBpedia). Updates the confidence values of those and sets the confidence
* values of all others to 0 in the gainConfidence list
*/
// NOTE (rwesten): intersection is calculated as part of the disambiguateSuggestions(..)
// method. Results are stored in the Suggestions (member of SavedEntiy) and
// than written back to the EnhancementStructure in a separate step
// protected List<Triple> intersection(List<Suggestion> matches,
// List<UriRef> subsumed,
// MGraph graph,
// List<Triple> gainConfidence,
// String contentLangauge) {
//
// for (int i = 0; i < subsumed.size(); i++) {
// boolean matchFound = false;
// UriRef uri = subsumed.get(i);
//
// UriRef uri1 = EnhancementEngineHelper.getReference(graph, uri, new UriRef(NamespaceEnum.fise
// + "entity-reference"));
//
// for (int j = 0; j < matches.size(); j++) {
// Suggestion suggestion = matches.get(j);
// String suggestName = suggestion.getURI();
//
// if (suggestName != null && uri1 != null
// && suggestName.compareToIgnoreCase(uri1.getUnicodeString()) == 0) {
// Triple confidenceTriple = new TripleImpl(uri, ENHANCER_CONFIDENCE, LiteralFactory
// .getInstance().createTypedLiteral(suggestion.getScore()));
// Triple contributorTriple = new TripleImpl((UriRef) confidenceTriple.getSubject(),
// new UriRef(NamespaceEnum.dc + "contributor"), LiteralFactory.getInstance()
// .createTypedLiteral(this.getClass().getName()));
// gainConfidence.add(confidenceTriple);
// gainConfidence.add(contributorTriple);
// matchFound = true;
// }
// }
//
// if (!matchFound) {
// Triple confidenceTriple = new TripleImpl(uri, ENHANCER_CONFIDENCE, LiteralFactory
// .getInstance().createTypedLiteral(0.0));
// Triple contributorTriple = new TripleImpl((UriRef) confidenceTriple.getSubject(), new UriRef(
// NamespaceEnum.dc + "contributor"), LiteralFactory.getInstance().createTypedLiteral(
// this.getClass().getName()));
// gainConfidence.add(confidenceTriple);
// gainConfidence.add(contributorTriple);
// }
// }
//
// return gainConfidence;
// }
/**
 * Removes every triple of the parsed list from the parsed graph.
 * @param graph the graph to remove the triples from
 * @param loseConfidence the triples to remove
 */
protected void removeOldConfidenceFromGraph(MGraph graph, List<Triple> loseConfidence) {
    for (Triple obsolete : loseConfidence) {
        graph.remove(obsolete);
    }
}
/**
 * Writes the disambiguated confidence values of all suggestions back to
 * the enhancement structure.
 * <p>
 * Suggestions without a disambiguated confidence are ignored. If more than
 * one entry is registered in the suggestion map for the entity annotation of
 * a suggestion, the entity annotation is cloned for the current text
 * annotation (see {@link #cloneTextAnnotation(MGraph, UriRef, UriRef)})
 * before the confidence is changed. This engine is added as contributing
 * engine to every changed entity annotation.
 * @param graph the metadata of the {@link ContentItem}
 * @param disData the disambiguation data
 */
protected void applyDisambiguationResults(MGraph graph, DisambiguationData disData) {
    for (SavedEntity savedEntity : disData.textAnnotations.values()) {
        for (Suggestion s : savedEntity.getSuggestions()) {
            if (s.getDisambiguatedConfidence() == null) {
                continue; //this suggestion was not disambiguated
            }
            boolean shared = disData.suggestionMap.get(s.getEntityAnnotation()).size() > 1;
            if (shared) {
                //already encountered AND disambiguated -> we need to clone!!
                log.info("clone {} suggesting {} for {}[{},{}]({})",
                    new Object[]{s.getEntityAnnotation(),s.getEntityUri(),
                                 savedEntity.getName(),savedEntity.getStart(),
                                 savedEntity.getEnd(),savedEntity.getUri()});
                s.setEntityAnnotation(
                    cloneTextAnnotation(graph, s.getEntityAnnotation(), savedEntity.getUri()));
                log.info(" - cloned {}",s.getEntityAnnotation());
            }
            //change the confidence (of the clone if one was created)
            EnhancementEngineHelper.set(graph,
                s.getEntityAnnotation(), ENHANCER_CONFIDENCE,
                s.getDisambiguatedConfidence(),literalFactory);
            EnhancementEngineHelper.addContributingEngine(graph,
                s.getEntityAnnotation(),this);
        }
    }
}
/**
 * Creates a 'clone' of a fise:EntityAnnotation that is exclusively linked
 * (via dc:relation) to the parsed fise:TextAnnotation.<p>
 * Other engines typically create only a single set of
 * fise:EntityAnnotations if several fise:TextAnnotations have the same
 * fise:selected-text. For a text like
 * <pre>
 * Paris is the capital of France and it is worth a visit for sure. But
 * one can also visit Paris without leaving the United States as there
 * is also a city with the same name in Texas.
 * </pre>
 * both mentions of "Paris" would share the same suggestions (e.g.
 * "Paris, France" and "Paris, Texas"). Disambiguation needs different
 * fise:confidence values per mention, which is only possible if the
 * mentions do NOT refer to the same fise:EntityAnnotations.<p>
 * This method therefore<ul>
 * <li> copies all triples of the entity annotation other than dc:relation
 * to a new subject
 * <li> removes the dc:relation to the parsed text annotation from the
 * original and adds it to the copy
 * <li> leaves the dc:relations to all other text annotations on the
 * original only
 * </ul>
 * @param graph the graph holding the annotations. It is modified by this method
 * @param entityAnnotation the fise:EntityAnnotation to clone
 * @param textAnnotation the fise:TextAnnotation the clone is created for
 * @return the URI of the created clone
 */
public static UriRef cloneTextAnnotation(MGraph graph, UriRef entityAnnotation, UriRef textAnnotation) {
    UriRef copy = new UriRef("urn:enhancement-"
            + EnhancementEngineHelper.randomUUID());
    //triples can not be added to the graph while iterating over it. So they
    //are collected first and added after the iteration
    List<Triple> added = new ArrayList<Triple>(32);
    for (Iterator<Triple> it = graph.filter(entityAnnotation,null,null); it.hasNext();) {
        Triple triple = it.next();
        boolean relation = DC_RELATION.equals(triple.getPredicate());
        if (relation && !triple.getObject().equals(textAnnotation)) {
            //dc:relation to an other TextAnnotation: keep it in the
            //original and do NOT add it to the copy
            continue;
        }
        if (relation) {
            //move the dc:relation to the currently processed textAnnotation
            //from the original to the copy
            it.remove();
        }
        //use the copy as subject; predicate and object are taken over 1:1
        added.add(new TripleImpl(copy, triple.getPredicate(), triple.getObject()));
    }
    graph.addAll(added);
    return copy;
}
/**
 * Concatenates all parsed entities that are contained in the context and
 * that are not equal (ignoring case) to the parsed label. Every appended
 * entity is preceded by a single space.
 * @param label the label to exclude
 * @param allEntities the entities to check
 * @param context the context; if <code>null</code> nothing is appended
 * @return the concatenated entities or an empty string if none matched
 */
protected String getEntitiesfromContext(String label, List<String> allEntities, String context) {
    StringBuilder joined = new StringBuilder();
    for (String candidate : allEntities) {
        if (label.compareToIgnoreCase(candidate) == 0) {
            continue; //the label itself is not part of the result
        }
        if (context == null || !context.contains(candidate)) {
            continue;
        }
        joined.append(' ').append(candidate);
    }
    return joined.toString();
}
/**
 * Extracts the part of the parsed text between the last period
 * (<code>'.'</code>) before position <code>a</code> and the first period
 * at or after position <code>b</code>. Both periods are excluded from the
 * result. If there is no period on one side, the result extends to the
 * start / end of the text.
 * <p>
 * NOTE: a previous version included the leading period in the result if no
 * period was found after <code>b</code>. This was inconsistent with the
 * case where both periods are present and has been corrected.
 * @param Context the text
 * @param a the start position of the span of interest
 * @param b the end position of the span of interest
 * @return the extracted part of the text
 */
protected String deriveSentence(String Context, int a, int b) {
    //last period before 'a'; -1 if there is none
    int s = Context.lastIndexOf('.', a - 1);
    //first period at or after 'b'; -1 if there is none
    int e = Context.indexOf('.', b);
    int from = s < 0 ? 0 : s + 1;
    int to = e < 0 ? Context.length() : e;
    return Context.substring(from, to);
}
/**
 * Extracts the selection context based on the content, selection and
 * the start char offset of the selection.
 * <p>
 * The context covers up to <code>contextSize</code> characters before and
 * after the selection. If the window does not reach the start / end of the
 * content, its borders are moved towards the selection so that the context
 * starts at the beginning of a word and ends at the end of a word. If no
 * whitespace is found between a border and the selection, the border is
 * kept as it is (within a word).
 * <p>
 * NOTE: in a previous version the loops searching for the word borders
 * always ran up to the selection, so the borders were never aligned with
 * words. This has been corrected.
 * @param content the content
 * @param selection the selected text
 * @param selectionStartPos the start char position of the selection
 * @param contextSize the size of the context in characters
 * @return the context
 */
public static String getDisambiguationContext(String content, String selection,int selectionStartPos, int contextSize){
    final int selectionEndPos = selectionStartPos + selection.length();
    //determine the begin of the context
    int beginPos;
    if (selectionStartPos <= contextSize) {
        beginPos = 0;
    } else {
        final int start = selectionStartPos - contextSize; // > 0
        int pos = start;
        //skip the remaining chars of a word the window starts within
        while (pos < selectionStartPos && !isContextSeparator(content.charAt(pos - 1))) {
            pos++;
        }
        //skip the whitespace in front of the first complete word
        while (pos < selectionStartPos && isContextSeparator(content.charAt(pos))) {
            pos++;
        }
        //no word start before the selection -> begin within a word
        beginPos = pos < selectionStartPos ? pos : start;
    }
    //determine the end of the context
    int endPos;
    if (selectionEndPos + contextSize >= content.length()) {
        endPos = content.length();
    } else {
        final int end = selectionEndPos + contextSize; // < content.length()
        int pos = end;
        //move back to the whitespace in front of a word the window ends within
        while (pos > selectionEndPos && !isContextSeparator(content.charAt(pos))) {
            pos--;
        }
        //exclude the whitespace after the last complete word
        while (pos > selectionEndPos && isContextSeparator(content.charAt(pos - 1))) {
            pos--;
        }
        //no word end after the selection -> end within a word
        endPos = pos > selectionEndPos ? pos : end;
    }
    return content.substring(beginPos, endPos);
}
/**
 * Checks if the parsed char separates words within the context.
 */
private static boolean isContextSeparator(char c) {
    return Character.isWhitespace(c) ||
            Character.getType(c) == Character.SPACE_SEPARATOR;
}
/**
 * Activates this engine by delegating to the super class.
 * <p>
 * An {@link IOException} thrown by the super class is logged and does not
 * prevent the activation. No engine specific properties are read for now
 * (the unused lookup of the component properties has been removed).
 *
 * @param ce
 *            the {@link ComponentContext}
 * @throws ConfigurationException if thrown by the activation of the super class
 */
@Activate
protected void activate(ComponentContext ce) throws ConfigurationException {
    try {
        super.activate(ce);
    } catch (IOException e) {
        // log
        log.error("Failed to update the configuration", e);
    }
    // TODO: update the service URL if it is defined in the properties
    // provided by ce.getProperties():
    // if (properties.get(FORMCEPT_SERVICE_URL) != null) {
    //     this.serviceURL = (String) properties.get(FORMCEPT_SERVICE_URL);
    // }
}
/**
 * Deactivates this engine. All work is delegated to the super class.
 *
 * @param ce
 *            the {@link ComponentContext} of this component
 */
@Deactivate
protected void deactivate(ComponentContext ce) {
    super.deactivate(ce);
}
/**
 * Getter for the Service URL of this engine.
 *
 * @return the current value of the service URL field
 */
public String getServiceURL() {
    return this.serviceURL;
}
// private static double levenshtein(String s1, String s2) {
// if (s1 == null || s2 == null) {
// throw new IllegalArgumentException("NONE of the parsed String MUST BE NULL!");
// }
// s1 = StringUtils.trim(s1);
// s2 = StringUtils.trim(s2);
// return s1.isEmpty() || s2.isEmpty() ? 0
// : 1.0 - (((double) getLevenshteinDistance(s1, s2)) / ((double) (Math.max(s1.length(),
// s2.length()))));
// }
}