blob: cbee2584926333b2beabde7c2261b2ec7f179ba4 [file] [log] [blame]
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
package org.apache.stanbol.enhancer.engine.disambiguation.mlt;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.Set;
import java.util.TreeMap;
import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum;
import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses;
* Collects all data needed for Disambiguation
* @author Rupert Westenthaler
* @author Kritarth
public class DisambiguationData {
* used by {@link #createFromContentItem(ContentItem)}
private DisambiguationData() {}
* Stores the URIs of fise:EntityAnnotations as key and the fise:TextAnnotations they link to as value.
* <p>
* This is needed when writing the disambiguation results to the EnhancementStructure, to know whether
* a fise:EntityAnnotation needs to be cloned or not.
public Map<UriRef,Set<UriRef>> suggestionMap = new HashMap<UriRef,Set<UriRef>>();
* Holds the center position of the fise:TextAnnotation fise:selected-text as key and the SavedEntity
* (representing the extracted data for the fise:TextAnnotation) as value.
* <p>
* Intended to do fast index based lookup for other TextAnnotations when building contexts for
* disambiguations.
public NavigableMap<Integer,SavedEntity> directoryTextAnotation = new TreeMap<Integer,SavedEntity>();
* Collection with the 'fise:selected-text' of all 'fise:TextAnnotations', including those that are NOT
* included in {@link #textAnnotations} (e.g. because they are missing some required data).
public Collection<String> allSelectedTexts = new HashSet<String>();
* Map of all fise:TextAnnotations that can be used for disambiguation. The key is the URI and the value
* is the {@link SavedEntity} with the extracted information.
public Map<UriRef,SavedEntity> textAnnotations = new HashMap<UriRef,SavedEntity>();
// List to contain old confidence values that are to be removed
// List<Triple> loseConfidence = new ArrayList<Triple>();
// List to contain new confidence values to be added to metadata
// List<Triple> gainConfidence = new ArrayList<Triple>();
* We create a data structure that stores the mapping of each text annotation to the list of URIs of all
* possible ambiguations of the text. It also fills the list loseConfidence with the confidence values of
* all the ambiguations for all entities (which will be removed eventually).
public static DisambiguationData createFromContentItem(ContentItem ci) {
MGraph graph = ci.getMetadata();
DisambiguationData data = new DisambiguationData();
Iterator<Triple> it = graph.filter(null, RDF_TYPE, TechnicalClasses.ENHANCER_TEXTANNOTATION);
while (it.hasNext()) {
UriRef uri = (UriRef);
// TODO: rwesten: do we really want to ignore fise:TextAnnotations that link
// to another one (typically two TextAnnotations that select the exact same text)
// if (graph.filter(uri, new UriRef(NamespaceEnum.dc + "relation"), null).hasNext()) {
// continue;
// }
SavedEntity savedEntity = SavedEntity.createFromTextAnnotation(graph, uri);
if (savedEntity != null) {
// data.allEntities.add(savedEntity.getContext());
Integer.valueOf((savedEntity.getStart() + savedEntity.getEnd()) / 2), savedEntity);
// add information to the #suggestionMap
for (Suggestion s : savedEntity.getSuggestions()) {
Set<UriRef> textAnnotations = data.suggestionMap.get(s.getEntityAnnotation());
if (textAnnotations == null) {
textAnnotations = new HashSet<UriRef>();
data.suggestionMap.put(s.getEntityAnnotation(), textAnnotations);
// NOTE (rwesten):
// changed the layout here. Now savedEntity contains the list
// of suggestions
data.textAnnotations.put(uri, savedEntity);
} else { // some information are also needed for other TextAnnotations
// like the selectedText of TextAnnotations (regardless of whether they
// have suggestions or not)
String selectedText = EnhancementEngineHelper.getString(graph, uri, ENHANCER_SELECTED_TEXT);
if (selectedText != null) {
return data;