blob: 3d8eade756a283103b3be64341e2d92d1bb3b81b [file] [log] [blame]
package org.apache.stanbol.enhancer.engine.disambiguation.freebase.graph;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Dictionary;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.Deactivate;
import org.apache.felix.scr.annotations.Properties;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Service;
import org.apache.stanbol.enhancer.engine.disambiguation.freebase.graph.constants.FreebaseDisambiguatorEngineConstants;
import org.apache.stanbol.enhancer.engine.disambiguation.freebase.graph.helper.FreebaseDisambiguatorEngineHelper;
import org.apache.stanbol.enhancer.engine.disambiguation.mlt.DisambiguationData;
import org.apache.stanbol.enhancer.engine.disambiguation.mlt.SavedEntity;
import org.apache.stanbol.enhancer.engine.disambiguation.mlt.Suggestion;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Multimap;
import com.tinkerpop.blueprints.Edge;
import com.tinkerpop.blueprints.Graph;
import com.tinkerpop.blueprints.Vertex;
import com.tinkerpop.blueprints.impls.neo4j.Neo4jGraph;
@Component(immediate = true, metatype = true)
@Service
@Properties(value = {@Property(name = EnhancementEngine.PROPERTY_NAME, value = "freebase-disambiguation"),
@Property(name = FreebaseDisambiguatorEngine.GRAPH_LOCATION)})
public class FreebaseDisambiguatorEngine extends AbstractEnhancementEngine<IOException,RuntimeException>
implements EnhancementEngine, ServiceProperties {
/**
 * Name of the component property that holds the location of the graph. Its value is handed to the
 * {@link Neo4jGraph} constructor when the component is activated.
 */
public static final String GRAPH_LOCATION = "graph.location";
/**
 * Class logger. Declared {@code final} because it is never reassigned.
 */
private static final Logger log = LoggerFactory.getLogger(FreebaseDisambiguatorEngine.class);
/**
 * The default value for the execution of this Engine. Currently set to
 * {@link ServiceProperties#ORDERING_POST_PROCESSING} - 600.
 * <p>
 * This should ensure that this engine runs as one of the first engines of the post-processing phase
 */
public static final Integer defaultOrder = ServiceProperties.ORDERING_POST_PROCESSING - 600;
/**
 * The plain text might be required for determining the extraction context
 */
public static final String PLAIN_TEXT_MIMETYPE = "text/plain";
/**
 * Contains the only supported mime type {@link #PLAIN_TEXT_MIMETYPE}
 */
public static final Set<String> SUPPORTED_MIMETYPES = Collections.singleton(PLAIN_TEXT_MIMETYPE);
/**
 * <p>
 * Graph (Neo4jGraph) containing the whole graph. It is assigned during activation and shut down during
 * deactivation, so it is {@code null} while the component is not active.
 * </p>
 */
private Graph graph;
/*
 * The following parameters describe the ratio of the original fise:confidence values and the
 * disambiguation scores contributing to the final disambiguated fise:confidence
 *
 * TODO: make configurable
 */
/**
 * Default ratio for Disambiguation (2.0)
 */
public static final double DEFAULT_DISAMBIGUATION_RATIO = 2.0;
/**
 * Default ratio for the original fise:confidence of suggested entities (1.0)
 */
public static final double DEFAULT_CONFIDENCE_RATIO = 1.0;
/**
 * The weight for disambiguation scores <code>:= disRatio/(disRatio+confRatio)</code>
 */
private double disambiguationWeight = DEFAULT_DISAMBIGUATION_RATIO
        / (DEFAULT_DISAMBIGUATION_RATIO + DEFAULT_CONFIDENCE_RATIO);
/**
 * The weight for the original confidence scores <code>:= confRatio/(disRatio+confRatio)</code>
 */
private double confidenceWeight = DEFAULT_CONFIDENCE_RATIO
        / (DEFAULT_DISAMBIGUATION_RATIO + DEFAULT_CONFIDENCE_RATIO);
/**
 * The {@link LiteralFactory} used to create typed RDF literals
 */
private final LiteralFactory literalFactory = LiteralFactory.getInstance();
/**
 * Exposes the service properties of this engine: a single-entry, read-only map that binds
 * {@link ServiceProperties#ENHANCEMENT_ENGINE_ORDERING} to {@link #defaultOrder}.
 *
 * @return an unmodifiable map holding the ordering property
 */
public Map<String,Object> getServiceProperties() {
    Map<String,Object> ordering = Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING,
        (Object) defaultOrder);
    return Collections.unmodifiableMap(ordering);
}
/**
 * <p>
 * Checks whether the current {@code ContentItem} can be enhanced or not
 * </p>
 * <p>
 * The item is rejected when the text obtained from its blob is {@code null} or contains only
 * whitespace; otherwise synchronous enhancement is reported.
 * </p>
 *
 * @param ci
 *            The ContentItem to be enhanced
 *
 * @return {@code CANNOT_ENHANCE} when no usable text is present, {@code ENHANCE_SYNCHRONOUS} otherwise
 * @throws EngineException
 *             an {@link InvalidContentException} when reading the text of the blob fails with an
 *             {@link IOException}
 */
public int canEnhance(ContentItem ci) throws EngineException {
    final String text;
    try {
        // Read the blob only once: every getText(..) call reads and decodes the whole content again
        text = ContentItemHelper.getText(ci.getBlob());
    } catch (IOException e) {
        log.error("Failed to get the text for " + "enhancement of content: " + ci.getUri(), e);
        throw new InvalidContentException(this, ci, e);
    }
    // check if content is present
    if (text == null || text.trim().isEmpty()) {
        return CANNOT_ENHANCE;
    }
    // default enhancement is synchronous enhancement
    return ENHANCE_SYNCHRONOUS;
}
/**
 * <p>
 * This function first evaluates all the possible ambiguations of each text annotation detected. The ids
 * of the referenced entities are used to check relations in the Freebase graph in order to obtain a new
 * disambiguation score based on the relations between entities in the graph
 * </p>
 *
 * <p>
 * The results obtained are used to calculate new confidence values which are updated in the metadata.
 * </p>
 *
 * <p>
 * The disambiguation data is read under the read lock of the content item and the resulting confidence
 * values are written under its write lock. When the subgraph built from the suggested entities has no
 * edges the method returns without touching the metadata.
 * </p>
 *
 * @param ci
 *            the {@code ContentItem} whose entity annotations are disambiguated
 * @throws EngineException
 *             declared by the signature; the code visible in this method does not throw it directly
 */
public void computeEnhancements(ContentItem ci) throws EngineException {
    MGraph metadata = ci.getMetadata();
    // Read the data from the content item
    DisambiguationData disData;
    ci.getLock().readLock().lock();
    try {
        disData = DisambiguationData.createFromContentItem(ci);
    } finally {
        ci.getLock().readLock().unlock();
    }
    /*
     * Disambiguation Steps
     *
     * 1. Generate the sets of entities for each text annotation
     *
     * 2. Construct the subgraph from the whole graph using only the extracted entities; stop if the
     * subgraph has no edges
     *
     * 3. Generate all the possible solutions for the text. That means, generate the combination of
     * entities of each set (cartesian product)
     *
     * 4. Filter the possible solutions, removing those entities from each possible solution which are not
     * valid, i.e isolated entities in the graph
     *
     * 5. For each possible solution, calculate the shortest path between every pair of entities in it and
     * store the disambiguation score for the possible solution
     *
     * 6. For each entity annotation, modify the confidence value using the disambiguation score for the
     * suggestion of the entity annotation
     */
    /*
     * (1) Generating the Map TextAnnotation->List<UriRef EntityUri> to be used to generate the possible
     * solutions using cartesian product (one element for each entity of every text annotation)
     *
     * Generating the allEntities list to be used to generate the subgraph
     */
    List<UriRef> allEntities = new ArrayList<UriRef>();
    Multimap<UriRef,Suggestion> taSuggestions = ArrayListMultimap.create();
    for (UriRef textAnnotation : disData.textAnnotations.keySet()) {
        SavedEntity savedEntity = disData.textAnnotations.get(textAnnotation);
        for (Suggestion suggestion : savedEntity.getSuggestions()) {
            taSuggestions.put(textAnnotation, suggestion);
            allEntities.add(suggestion.getEntityUri());
        }
    }
    /*
     * (2) Generating the subgraph
     */
    log.info("Generating Subgraph for current entities");
    Graph subgraph = FreebaseDisambiguatorEngineHelper.generateSubgraph(this.graph, allEntities);
    log.info("Subgraph generated");
    /*
     * Stops the disambiguation if the subgraph has no edges. This check runs before the possible
     * solutions are generated so that the cartesian product is not built when it would be discarded.
     */
    if (!subgraph.getEdges().iterator().hasNext()) {
        // Graph with all its vertices isolated, don't modify the confidence
        log.info("Graph with all its vertices isolated, don't modify the confidence of EntityAnnotation's");
        return;
    }
    this.dumpGraph(subgraph);
    /*
     * (3) Generate possible solutions
     */
    Set<List<Suggestion>> possibleSolutions = FreebaseDisambiguatorEngineHelper
            .generatePossibleSolutions(taSuggestions);
    /*
     * (4) Filter possible solutions;
     */
    possibleSolutions = FreebaseDisambiguatorEngineHelper.filterPossibleSolutions(possibleSolutions,
        subgraph);
    /*
     * (5) Calculate the shortest-path between entities in the possible solution and modify the
     * disambiguation score of each Suggestion
     */
    FreebaseDisambiguatorEngineHelper.calculateDisambiguationScores(possibleSolutions, subgraph, disData);
    /*
     * (6) Modify confidence values of the entity annotations
     */
    ci.getLock().writeLock().lock();
    try {
        this.disambiguateAndApplyResults(metadata, disData);
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
/**
 * <p>
 * Derives the disambiguated confidence of every Suggestion and writes it to the enhancement metadata.
 * </p>
 * <p>
 * A Suggestion without a normalized disambiguation score keeps its original confidence and no triple is
 * changed for it. For every other Suggestion the new value is
 * {@code original * confidenceWeight + normalizedScore * disambiguationWeight}, where a missing
 * original confidence counts as 0; the value is then set as the confidence of the entity annotation and
 * this engine is added as a contributor of that annotation.
 * </p>
 *
 * @param metadata
 *            the {@code MGraph} instance containing the enhancement triples
 * @param disData
 *            the {@code DisambiguationData} instance containing the Suggestion objects and their
 *            normalized disambiguation scores
 */
private void disambiguateAndApplyResults(MGraph metadata, DisambiguationData disData) {
    for (SavedEntity entity : disData.textAnnotations.values()) {
        for (Suggestion candidate : entity.getSuggestions()) {
            Double normalizedScore = candidate.getNormalizedDisambiguationScore();
            if (normalizedScore == null) {
                // Not part of any possible solution: the confidence value stays as it was
                candidate.setDisambiguatedConfidence(candidate.getOriginalConfidnece());
            } else {
                Double original = candidate.getOriginalConfidnece();
                double base = 0;
                if (original != null) {
                    base = original;
                }
                Double combined = base * confidenceWeight + normalizedScore * disambiguationWeight;
                candidate.setDisambiguatedConfidence(combined);
                if (candidate.getDisambiguatedConfidence() != null) {
                    // Report the change and store the new confidence in the metadata
                    String message = "Modifying confidence for "
                                     + candidate.getEntityAnnotation().getUnicodeString() + " (Entity "
                                     + candidate.getEntityUri().getUnicodeString()
                                     + ") -> Last confidence: " + candidate.getOriginalConfidnece()
                                     + " - New confidence: " + candidate.getDisambiguatedConfidence();
                    log.info(message);
                    EnhancementEngineHelper.set(metadata, candidate.getEntityAnnotation(),
                        ENHANCER_CONFIDENCE, candidate.getDisambiguatedConfidence(), literalFactory);
                    EnhancementEngineHelper.addContributingEngine(metadata,
                        candidate.getEntityAnnotation(), this);
                }
            }
        }
    }
}
/**
 * <p>
 * Dumps every vertex (with its entity URI property) and every edge of the graph to the debug log. Does
 * nothing when debug logging is disabled, so the graph is not traversed needlessly.
 * </p>
 *
 * @param graph
 *            the {@code Graph} to dump
 */
private void dumpGraph(Graph graph) {
    // Iterating all vertices and edges is only worth it when the output is actually logged
    if (!log.isDebugEnabled()) {
        return;
    }
    log.debug("Dumping graph");
    for (Vertex vertex : graph.getVertices()) {
        log.debug("Vertex "
                  + vertex
                  + " : "
                  + vertex.getProperty(FreebaseDisambiguatorEngineConstants.VERTEX_ENTITY_URI_PROPERTY));
    }
    for (Edge edge : graph.getEdges()) {
        log.debug("Edge: " + edge);
    }
}
/**
 * Activates the engine: delegates to the super class, reads the {@link #GRAPH_LOCATION} property and
 * opens the {@link Neo4jGraph} stored at that location.
 * <p>
 * Failures are reported to the caller instead of being swallowed, so the component is not left active
 * without a graph.
 * </p>
 *
 * @param ce
 *            the {@link ComponentContext}
 * @throws ConfigurationException
 *             if the super class rejects the configuration, if {@link #GRAPH_LOCATION} is missing or
 *             empty, or if the graph cannot be opened
 */
@Activate
protected void activate(ComponentContext ce) throws ConfigurationException {
    // Stays null until the property has been read; used to decide which property to blame on failure
    String location = null;
    try {
        super.activate(ce);
        @SuppressWarnings("unchecked")
        Dictionary<String,Object> properties = ce.getProperties();
        Object configured = properties.get(GRAPH_LOCATION);
        location = configured == null ? null : configured.toString();
        if (location == null || location.isEmpty()) {
            throw new ConfigurationException(GRAPH_LOCATION, GRAPH_LOCATION
                    + " property is null or empty. Failed to initialize the engine");
        }
        log.info("Loading graph. Location: " + location);
        this.graph = new Neo4jGraph(location);
        log.info("Graph loaded");
    } catch (ConfigurationException e) {
        log.error("Failed to update the configuration", e);
        throw e;
    } catch (Exception e) {
        // Anything else (e.g. a failure while opening the graph) is reported with its cause
        log.error("Failed to update the configuration", e);
        throw new ConfigurationException(location == null ? null : GRAPH_LOCATION,
                "Failed to initialize the engine (graph location: " + location + ")", e);
    }
}
/**
 * Deactivates the engine and shuts down the graph.
 * <p>
 * The graph is shut down even when the super class fails to deactivate, and the reference is cleared
 * so a closed graph is not reused. Failures are logged and not propagated.
 * </p>
 *
 * @param ce
 *            the {@link ComponentContext}
 */
@Deactivate
protected void deactivate(ComponentContext ce) {
    try {
        super.deactivate(ce);
    } catch (Exception e) {
        log.error("Failed to deactivate the engine", e);
    } finally {
        // Closing the graph
        Graph current = this.graph;
        this.graph = null;
        if (current != null) {
            try {
                current.shutdown();
            } catch (Exception e) {
                log.error("Failed to shut down the graph", e);
            }
        }
    }
}
}