| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.stanbol.enhancer.engine.disambiguation.mlt; |
| |
| import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE; |
| import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE; |
| import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END; |
| import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT; |
| import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTION_CONTEXT; |
| import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START; |
| |
| import java.util.ArrayList; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.Iterator; |
| import java.util.LinkedHashMap; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| |
| import org.apache.clerezza.rdf.core.LiteralFactory; |
| import org.apache.clerezza.rdf.core.NonLiteral; |
| import org.apache.clerezza.rdf.core.Triple; |
| import org.apache.clerezza.rdf.core.TripleCollection; |
| import org.apache.clerezza.rdf.core.UriRef; |
| import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper; |
| import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum; |
| import org.apache.stanbol.enhancer.servicesapi.rdf.Properties; |
| import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses; |
| import org.apache.stanbol.entityhub.servicesapi.model.rdf.RdfResourceEnum; |
| import org.apache.stanbol.entityhub.servicesapi.site.Site; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| public final class SavedEntity { |
| private static final Logger log = LoggerFactory.getLogger(SavedEntity.class); |
| |
| /** |
| * The {@link LiteralFactory} used to create typed RDF literals |
| */ |
| private final static LiteralFactory literalFactory = LiteralFactory.getInstance(); |
| private String name; |
| private UriRef type; |
| private UriRef uri; |
| private String context; |
| private Integer start; |
| private Integer end; |
| |
| /** |
| * Map with the suggestion. The key is the URI of the fise:EntityAnnotation and the value is the Triple |
| * with the confidence value |
| */ |
| private Map<UriRef,Suggestion> suggestions = new LinkedHashMap<UriRef,Suggestion>(); |
| |
| /** |
| * The name of the Entityhub {@link Site} managing the suggestions of this fise:TextAnnotation |
| */ |
| private String site; |
| |
| /** |
| * private constructor only used by {@link #createFromTextAnnotation(TripleCollection, NonLiteral)} |
| */ |
| private SavedEntity() {} |
| |
| /** |
| * creates a SavedEntity instance for the parsed fise:TextAnnotation |
| * |
| * @param graph |
| * the graph with the information |
| * @param textAnnotation |
| * the fise:TextAnnotation |
| * @return the {@link SavedEntity} or <code>null</code> if the parsed text annotation is missing required |
| * information. |
| */ |
| public static SavedEntity createFromTextAnnotation(TripleCollection graph, UriRef textAnnotation) { |
| SavedEntity entity = new SavedEntity(); |
| entity.uri = textAnnotation; |
| entity.name = EnhancementEngineHelper.getString(graph, textAnnotation, ENHANCER_SELECTED_TEXT); |
| if (entity.name == null) { |
| log.debug("Unable to create SavedEntity for TextAnnotation {} " |
| + "because property {} is not present", textAnnotation, ENHANCER_SELECTED_TEXT); |
| return null; |
| } |
| // NOTE rwesten: I think one should not change the selected text |
| // remove punctuation form the search string |
| // entity.name = cleanupKeywords(name); |
| if (entity.name.isEmpty()) { |
| log.debug("Unable to process TextAnnotation {} because its selects " + "an empty Stirng !", |
| textAnnotation); |
| return null; |
| } |
| entity.type = EnhancementEngineHelper.getReference(graph, textAnnotation, DC_TYPE); |
| // NOTE rwesten: TextAnnotations without dc:type should be still OK |
| // if (type == null) { |
| // log.warn("Unable to process TextAnnotation {} because property {}" |
| // + " is not present!",textAnnotation, DC_TYPE); |
| // return null; |
| // } |
| entity.context = EnhancementEngineHelper.getString(graph, textAnnotation, ENHANCER_SELECTION_CONTEXT); |
| Integer start = |
| EnhancementEngineHelper.get(graph, textAnnotation, ENHANCER_START, Integer.class, |
| literalFactory); |
| Integer end = |
| EnhancementEngineHelper.get(graph, textAnnotation, ENHANCER_END, Integer.class, |
| literalFactory); |
| if (start == null || end == null) { |
| log.debug("Unable to process TextAnnotation {} because the start and/or the end " |
| + "position is not defined (selectedText: {}, start: {}, end: {})", new Object[] { |
| textAnnotation, entity.name, start, end}); |
| |
| } |
| entity.start = start; |
| entity.end = end; |
| |
| // parse the suggestions |
| |
| // all the entityhubSites that manage a suggested Entity |
| // (hopefully only a single one) |
| Set<String> entityhubSites = new HashSet<String>(); |
| List<Suggestion> suggestionList = new ArrayList<Suggestion>(); |
| Iterator<Triple> suggestions = graph.filter(null, Properties.DC_RELATION, textAnnotation); |
| // NOTE: this iterator will also include dc:relation between fise:TextAnnotation's |
| // but in those cases NULL will be returned as suggestion |
| while (suggestions.hasNext()) { |
| UriRef entityAnnotation = (UriRef) suggestions.next().getSubject(); |
| Suggestion suggestion = Suggestion.createFromEntityAnnotation(graph, entityAnnotation); |
| if (suggestion != null) { |
| suggestionList.add(suggestion); |
| if (suggestion.getSite() != null) { |
| entityhubSites.add(suggestion.getSite()); |
| } |
| } |
| } |
| if (suggestionList.isEmpty()) { |
| log.warn("TextAnnotation {} (selectedText: {}, start: {}) has no" + "suggestions.", new Object[] { |
| entity.uri, entity.name, entity.start}); |
| return null; // nothing to disambiguate |
| } else { |
| Collections.sort(suggestionList); // sort them based on confidence |
| // the LinkedHashMap will keep the order (based on the original |
| // confidence) |
| for (Suggestion suggestion : suggestionList) { |
| entity.suggestions.put(suggestion.getEntityUri(), suggestion); |
| } |
| } |
| if (entityhubSites.isEmpty()) { |
| log.debug("TextAnnotation {} (selectedText: {}, start: {}) has " |
| + "suggestions do not have 'entityhub:site' information. " |
| + "Can not disambiguate because origin is unknown.", new Object[] {entity.uri, |
| entity.name, entity.start}); |
| return null; // Ignore TextAnnotatiosn with suggestions of unknown origin. |
| } else if (entityhubSites.size() > 1) { |
| log.warn("TextAnnotation {} (selectedText: {}, start: {}) has " |
| + "suggestions originating from multiple Entityhub Sites {}", new Object[] {entity.uri, |
| entity.name, entity.start, entityhubSites}); |
| return null; // TODO: Ignore those for now |
| } else { |
| entity.site = entityhubSites.iterator().next(); |
| } |
| return entity; |
| } |
| |
| /** |
| * Removes punctuation form a parsed string |
| */ |
| private static String cleanupKeywords(String keywords) { |
| return keywords.replaceAll("\\p{P}", " ").trim(); |
| } |
| |
| /** |
| * Getter for the name |
| * |
| * @return the name |
| */ |
| public final String getName() { |
| return name; |
| } |
| |
| /** |
| * Getter for the type |
| * |
| * @return the type |
| */ |
| public final UriRef getType() { |
| return type; |
| } |
| |
| @Override |
| public int hashCode() { |
| return uri.hashCode(); |
| } |
| |
| @Override |
| public boolean equals(Object o) { |
| return o instanceof SavedEntity && uri.equals(((SavedEntity) o).uri); |
| } |
| |
| @Override |
| public String toString() { |
| return String.format("SavedEntity %s (name=%s | type=%s)", uri, name, type); |
| } |
| |
| public UriRef getUri() { |
| return this.uri; |
| } |
| |
| public String getContext() { |
| return this.context; |
| } |
| |
| public int getStart() { |
| return this.start; |
| } |
| |
| public int getEnd() { |
| return this.end; |
| } |
| |
| public Collection<Suggestion> getSuggestions() { |
| return suggestions.values(); |
| } |
| |
| public Suggestion getSuggestion(UriRef uri) { |
| return suggestions.get(uri); |
| } |
| |
| /** |
| * The name of the Entityhub {@link Site} managing the suggestions |
| * |
| * @return |
| */ |
| public String getSite() { |
| return site; |
| } |
| } |