blob: 5c9e34e7f098b3a9bbac9a73b027725a7c2d84c9 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engine.disambiguation.mlt;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTION_CONTEXT;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.clerezza.rdf.core.NonLiteral;
import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.TripleCollection;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum;
import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses;
import org.apache.stanbol.entityhub.servicesapi.model.rdf.RdfResourceEnum;
import org.apache.stanbol.entityhub.servicesapi.site.Site;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public final class SavedEntity {
private static final Logger log = LoggerFactory.getLogger(SavedEntity.class);
/**
* The {@link LiteralFactory} used to create typed RDF literals
*/
private final static LiteralFactory literalFactory = LiteralFactory.getInstance();
private String name;
private UriRef type;
private UriRef uri;
private String context;
private Integer start;
private Integer end;
/**
* Map with the suggestion. The key is the URI of the fise:EntityAnnotation
* and the value is the Triple with the confidence value
*/
private Map<UriRef,Suggestion> suggestions = new LinkedHashMap<UriRef,Suggestion>();
/**
* The name of the Entityhub {@link Site} managing the suggestions of
* this fise:TextAnnotation
*/
private String site;
/**
* private constructor only used by {@link #createFromTextAnnotation(TripleCollection, NonLiteral)}
*/
private SavedEntity() {}
/**
* creates a SavedEntity instance for the parsed fise:TextAnnotation
*
* @param graph
* the graph with the information
* @param textAnnotation
* the fise:TextAnnotation
* @return the {@link SavedEntity} or <code>null</code> if the parsed text
* annotation is missing required information.
*/
public static SavedEntity createFromTextAnnotation(TripleCollection graph, UriRef textAnnotation) {
SavedEntity entity = new SavedEntity();
entity.uri = textAnnotation;
entity.name = EnhancementEngineHelper.getString(graph, textAnnotation, ENHANCER_SELECTED_TEXT);
if (entity.name == null) {
log.debug("Unable to create SavedEntity for TextAnnotation {} "
+ "because property {} is not present", textAnnotation, ENHANCER_SELECTED_TEXT);
return null;
}
//NOTE rwesten: I think one should not change the selected text
// remove punctuation form the search string
//entity.name = cleanupKeywords(name);
if (entity.name.isEmpty()) {
log.debug("Unable to process TextAnnotation {} because its selects " + "an empty Stirng !",
textAnnotation);
return null;
}
entity.type = EnhancementEngineHelper.getReference(graph, textAnnotation, DC_TYPE);
// NOTE rwesten: TextAnnotations without dc:type should be still OK
// if (type == null) {
// log.warn("Unable to process TextAnnotation {} because property {}"
// + " is not present!",textAnnotation, DC_TYPE);
// return null;
// }
entity.context = EnhancementEngineHelper.getString(graph, textAnnotation, ENHANCER_SELECTION_CONTEXT);
Integer start = EnhancementEngineHelper.get(graph, textAnnotation, ENHANCER_START, Integer.class,
literalFactory);
Integer end = EnhancementEngineHelper.get(graph, textAnnotation, ENHANCER_END, Integer.class,
literalFactory);
if (start == null || end == null) {
log.debug("Unable to process TextAnnotation {} because the start and/or the end "
+ "position is not defined (selectedText: {}, start: {}, end: {})",
new Object[] {textAnnotation, entity.name, start, end});
}
entity.start = start;
entity.end = end;
//parse the suggestions
//all the entityhubSites that manage a suggested Entity
//(hopefully only a single one)
Set<String> entityhubSites = new HashSet<String>();
List<Suggestion> suggestionList = new ArrayList<Suggestion>();
Iterator<Triple> suggestions = graph.filter(null, Properties.DC_RELATION, textAnnotation);
//NOTE: this iterator will also include dc:relation between fise:TextAnnotation's
// but in those cases NULL will be returned as suggestion
while (suggestions.hasNext()) {
UriRef entityAnnotation = (UriRef) suggestions.next().getSubject();
Suggestion suggestion = Suggestion.createFromEntityAnnotation(graph, entityAnnotation);
if(suggestion != null){
suggestionList.add(suggestion);
if(suggestion.getSite() != null){
entityhubSites.add(suggestion.getSite());
}
}
}
if(suggestionList.isEmpty()){
log.warn("TextAnnotation {} (selectedText: {}, start: {}) has no"
+ "suggestions.",
new Object[]{entity.uri,entity.name,entity.start});
return null; //nothing to disambiguate
} else {
Collections.sort(suggestionList); //sort them based on confidence
//the LinkedHashMap will keep the order (based on the original
//confidence)
for(Suggestion suggestion : suggestionList){
entity.suggestions.put(suggestion.getEntityUri(), suggestion);
}
}
if(entityhubSites.isEmpty()) {
log.debug("TextAnnotation {} (selectedText: {}, start: {}) has "
+ "suggestions do not have 'entityhub:site' information. "
+ "Can not disambiguate because origin is unknown.",
new Object[]{entity.uri,entity.name,entity.start});
return null; //Ignore TextAnnotatiosn with suggestions of unknown origin.
} else if(entityhubSites.size() > 1){
log.warn("TextAnnotation {} (selectedText: {}, start: {}) has "
+ "suggestions originating from multiple Entityhub Sites {}",
new Object[]{entity.uri,entity.name,entity.start,entityhubSites});
return null; //TODO: Ignore those for now
} else {
entity.site = entityhubSites.iterator().next();
}
return entity;
}
/**
* Removes punctuation form a parsed string
*/
private static String cleanupKeywords(String keywords) {
return keywords.replaceAll("\\p{P}", " ").trim();
}
/**
* Getter for the name
*
* @return the name
*/
public final String getName() {
return name;
}
/**
* Getter for the type
*
* @return the type
*/
public final UriRef getType() {
return type;
}
@Override
public int hashCode() {
return uri.hashCode();
}
@Override
public boolean equals(Object o) {
return o instanceof SavedEntity && uri.equals(((SavedEntity) o).uri);
}
@Override
public String toString() {
return String.format("SavedEntity %s (name=%s | type=%s)", uri, name, type);
}
public UriRef getUri() {
return this.uri;
}
public String getContext() {
return this.context;
}
public int getStart() {
return this.start;
}
public int getEnd() {
return this.end;
}
public Collection<Suggestion> getSuggestions(){
return suggestions.values();
}
public Suggestion getSuggestion(UriRef uri){
return suggestions.get(uri);
}
/**
* The name of the Entityhub {@link Site} managing the suggestions
* @return
*/
public String getSite() {
return site;
}
}