| /* |
| * Copyright 2012, FORMCEPT [http://www.formcept.com] |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.stanbol.enhancer.engine.disambiguation.mlt; |
| |
| import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION; |
| import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE; |
| import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDFS_LABEL; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.Dictionary; |
| import java.util.HashSet; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Map.Entry; |
| import java.util.NavigableMap; |
| import java.util.Set; |
| |
| import org.apache.clerezza.rdf.core.LiteralFactory; |
| import org.apache.clerezza.rdf.core.MGraph; |
| import org.apache.clerezza.rdf.core.Triple; |
| import org.apache.clerezza.rdf.core.UriRef; |
| import org.apache.clerezza.rdf.core.impl.TripleImpl; |
| import org.apache.commons.lang.StringUtils; |
| import org.apache.felix.scr.annotations.Activate; |
| import org.apache.felix.scr.annotations.Component; |
| import org.apache.felix.scr.annotations.Deactivate; |
| import org.apache.felix.scr.annotations.Properties; |
| import org.apache.felix.scr.annotations.Property; |
| import org.apache.felix.scr.annotations.Reference; |
| import org.apache.felix.scr.annotations.Service; |
| import org.apache.stanbol.enhancer.servicesapi.Blob; |
| import org.apache.stanbol.enhancer.servicesapi.ContentItem; |
| import org.apache.stanbol.enhancer.servicesapi.EngineException; |
| import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine; |
| import org.apache.stanbol.enhancer.servicesapi.InvalidContentException; |
| import org.apache.stanbol.enhancer.servicesapi.ServiceProperties; |
| import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper; |
| import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper; |
| import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine; |
| import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum; |
| import org.apache.stanbol.entityhub.servicesapi.defaults.SpecialFieldEnum; |
| import org.apache.stanbol.entityhub.servicesapi.model.Entity; |
| import org.apache.stanbol.entityhub.servicesapi.model.Representation; |
| import org.apache.stanbol.entityhub.servicesapi.model.Text; |
| import org.apache.stanbol.entityhub.servicesapi.model.rdf.RdfResourceEnum; |
| import org.apache.stanbol.entityhub.servicesapi.query.Constraint; |
| import org.apache.stanbol.entityhub.servicesapi.query.FieldQuery; |
| import org.apache.stanbol.entityhub.servicesapi.query.QueryResultList; |
| import org.apache.stanbol.entityhub.servicesapi.query.SimilarityConstraint; |
| import org.apache.stanbol.entityhub.servicesapi.query.TextConstraint; |
| import org.apache.stanbol.entityhub.servicesapi.site.Site; |
| import org.apache.stanbol.entityhub.servicesapi.site.SiteException; |
| import org.apache.stanbol.entityhub.servicesapi.site.SiteManager; |
| import org.osgi.service.cm.ConfigurationException; |
| import org.osgi.service.component.ComponentContext; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| /** |
| * Disambiguation Engine using Entityhub {@link SimilarityConstraint}s to disambiguate between existing |
| * fise:EntityAnnotations for fise:TextAnnotations. |
| * <p> |
| * <b>TODOs</b>: |
| * <ul> |
| * <li>Configurations: currently all configurations is set to the defaults |
| * <li>Context: test and improve different ways to determine the context used for disambiguation. |
| * <li>URI based similarity: currently only full text similarity is used. However it would also be possible to |
| * use the {@link SpecialFieldEnum#references} field to disambiguate based on URIs of already suggested |
| * Entities. |
| * </ul> |
| * |
| * @author Kritarth Anand |
| * @author Rupert Westenthaler |
| */ |
| @Component(immediate = true, metatype = true) |
| @Service |
| @Properties(value = {@Property(name = EnhancementEngine.PROPERTY_NAME, value = "disambiguation-mlt")}) |
| public class DisambiguatorEngine extends AbstractEnhancementEngine<IOException,RuntimeException> implements |
| EnhancementEngine, ServiceProperties { |
| |
    private static Logger log = LoggerFactory.getLogger(DisambiguatorEngine.class);

    /**
     * Service URL.
     * <p>
     * NOTE(review): currently never assigned - the assignment in {@code activate(..)} is commented
     * out - so {@link #getServiceURL()} always returns <code>null</code>. Confirm whether this
     * configuration is still needed.
     */
    private String serviceURL;

    /**
     * The default value for the execution of this Engine. Currently set to
     * {@link ServiceProperties#ORDERING_POST_PROCESSING} - 90.
     * <p>
     * NOTE(review): the intention (per the original comment) is that this engine runs early in the
     * post-processing phase - confirm that the "- 90" offset (and not "+ 90") achieves that with
     * the ordering semantics of the enhancement chain.
     */
    public static final Integer defaultOrder = ServiceProperties.ORDERING_POST_PROCESSING - 90;
    /**
     * The plain text might be required for determining the extraction context
     */
    public static final String PLAIN_TEXT_MIMETYPE = "text/plain";
    /**
     * Contains the only supported mime type {@link #PLAIN_TEXT_MIMETYPE}
     */
    public static final Set<String> SUPPORTED_MIMETYPES = Collections.singleton(PLAIN_TEXT_MIMETYPE);

    /**
     * Used to lookup the Entityhub {@link Site} used to perform the disambiguation.
     */
    @Reference
    protected SiteManager siteManager;

    /*
     * The following parameters describe the ratio of the original fise:confidence values and the
     * disambiguation scores contributing to the final disambiguated fise:confidence
     *
     * TODO: make configurable
     */
    /**
     * Default ratio for Disambiguation (2.0)
     */
    public static final double DEFAULT_DISAMBIGUATION_RATIO = 2.0;
    /**
     * Default ratio for the original fise:confidence of suggested entities (1.0).
     * <p>
     * NOTE: the name misspells "CONFIDENCE" but is kept as-is because it is part of the public API.
     */
    public static final double DEFAULT_CONFIDNECE_RATIO = 1.0;

    /**
     * The weight for disambiguation scores <code>:= disRatio/(disRatio+confRatio)</code>
     */
    private double disambiguationWeight = DEFAULT_DISAMBIGUATION_RATIO
            / (DEFAULT_DISAMBIGUATION_RATIO + DEFAULT_CONFIDNECE_RATIO);
    /**
     * The weight for the original confidence scores <code>:= confRatio/(disRatio+confRatio)</code>
     */
    private double confidenceWeight = DEFAULT_CONFIDNECE_RATIO
            / (DEFAULT_DISAMBIGUATION_RATIO + DEFAULT_CONFIDNECE_RATIO);

    /**
     * The {@link LiteralFactory} used to create typed RDF literals
     */
    private final LiteralFactory literalFactory = LiteralFactory.getInstance();
| |
| /** |
| * Returns the properties containing the {@link ServiceProperties#ENHANCEMENT_ENGINE_ORDERING} |
| */ |
| @Override |
| public Map<String,Object> getServiceProperties() { |
| return Collections.unmodifiableMap(Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING, |
| (Object) defaultOrder)); |
| } |
| |
| @Override |
| public int canEnhance(ContentItem ci) throws EngineException { |
| // check if content is present |
| try { |
| if ((ContentItemHelper.getText(ci.getBlob()) == null) |
| || (ContentItemHelper.getText(ci.getBlob()).trim().isEmpty())) { |
| return CANNOT_ENHANCE; |
| } |
| } catch (IOException e) { |
| log.error("Failed to get the text for " + "enhancement of content: " + ci.getUri(), e); |
| throw new InvalidContentException(this, ci, e); |
| } |
| // default enhancement is synchronous enhancement |
| return ENHANCE_SYNCHRONOUS; |
| } |
| |
| /* |
| * This function first evaluates all the possible ambiguations of each text annotation detected. the text |
| * of all entities detected is used for making a Dbpedia query with all string for MLT that contain all |
| * the other entities. The results obtained are used to calcualte new confidence values which are updated |
| * in the metadata. |
| */ |
| @Override |
| public void computeEnhancements(ContentItem ci) throws EngineException { |
| |
| String textContent; |
| Entry<UriRef,Blob> textBlob = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES); |
| if (textBlob != null) { |
| try { |
| textContent = ContentItemHelper.getText(textBlob.getValue()); |
| } catch (IOException e) { |
| log.warn("Unable to retieve plain text content for ContentItem " + ci.getUri(), e); |
| textContent = null; |
| } |
| } else { |
| textContent = null; |
| } |
| |
| MGraph graph = ci.getMetadata(); |
| |
| // (1) read the data from the content item |
| String contentLangauge; |
| DisambiguationData disData; |
| ci.getLock().readLock().lock(); |
| try { |
| contentLangauge = EnhancementEngineHelper.getLanguage(ci); |
| // NOTE (rwesten): moved the parsing of the information from the |
| // contentItem to static method of the Class holding those information |
| // (similar as it already was for SavedEntity) |
| // readEntities(loseConfidence, allEntities, textAnnotations, graph); |
| disData = DisambiguationData.createFromContentItem(ci); |
| } finally { |
| ci.getLock().readLock().unlock(); |
| } |
| |
| // (2) Disambiguate the SavedEntities |
| for (SavedEntity savedEntity : disData.textAnnotations.values()) { |
| if (savedEntity.getSuggestions().size() <= 1) { |
| // we need not to disambiguate if only one suggestion is present |
| continue; |
| } |
| // NOTE: the site is determined from the |
| // fise:TextAnnotation <-- dc:relation -- |
| // fise:EntityAnnotation -- entityhub:ste --> "{siteName}"^^xsd:string |
| // data. |
| // TODO: add configuration to include/exclude Sites by name |
| Site site = siteManager.getSite(savedEntity.getSite()); |
| Collection<String> types = null; // potential types of entities |
| boolean casesensitive = false; // TODO: make configurable |
| String savedEntityLabel = |
| casesensitive ? savedEntity.getName() : savedEntity.getName().toLowerCase(); |
| |
| // Determine the context used for disambiguation |
| // TODO: make this configurable options |
| |
| String disambiguationContext; |
| // (0.a) The easiest way is to just use the selection context |
| // disambiguationContext = savedEntity.getContext(); |
| // (0.b) Calculate a context based on a moving window |
| String window = |
| getDisambiguationContext(textContent, savedEntity.getName(), savedEntity.getStart(), 100); |
| log.info("Use Window: '{}' for '{}'", window, savedEntity.getName()); |
| |
| // (1) The contextSelections: |
| // All other selected text within the selection context |
| List<String> contextSelections = |
| getSelectionsInContext(savedEntity.getName(), disData.allSelectedTexts, window); |
| // savedEntity.getContext()); |
| disambiguationContext = unionString(false, contextSelections); |
| |
| // (2) I do not understand this variant (see comment for the |
| // EntitiesInRange(..) method |
| // List<String> L = EntitiesInRange(disData.directoryTextAnotation, |
| // (savedEntity.getStart() + savedEntity.getEnd()) / 2); |
| // disambiguationContext = unionString(false,contextSelections); |
| |
| // (3) one can build a combination of the above |
| // disambiguationContext = unionString(true, //unique adds |
| // Collections.singleton(savedEntity.getName()), //the selected text |
| // Collections.singleton(context), //the context |
| // contextSelections); //other selected parsed in the context |
| |
| // or just the name of the entity AND the context |
| // disambiguationContext = unionString(false, |
| // Collections.singleton(savedEntity.getName()), |
| // contextSelections); |
| |
| // (4) TODO: I would also like to have the possibility to disambiguate |
| // using URIs of Entities suggested for other TextAnnotations |
| // within the context. |
| |
| // make the similarity query on the Entityhub using the collected |
| // information |
| QueryResultList<Entity> results; |
| log.info(" - Query '{}' for {}@{} with context '{}'", new Object[] {site.getId(), |
| savedEntityLabel, contentLangauge, disambiguationContext}); |
| if (!StringUtils.isBlank(disambiguationContext)) { |
| try { |
| results = query(site, savedEntityLabel, contentLangauge, disambiguationContext); |
| } catch (SiteException e) { |
| // TODO we could also try to catch those errors ... |
| throw new EngineException("Unable to disambiguate Mention of '" + savedEntity.getName() |
| + "' on Entityhub Site '" + site.getId() + "!", e); |
| } |
| log.debug(" - {} results returned by query {}", results.size(), results.getQuery()); |
| // match the results with the suggestions |
| disambiguateSuggestions(results, savedEntity); |
| } else { |
| log.debug(" - not disambiguated because of empty context!"); |
| } |
| } |
| // (3) Write back the Results of the Disambiguation process |
| // NOTE (rwesten): In the original version of Kritarth this was done as |
| // part of (2) - disambiguation. This is now changed as in (2) the |
| // disambiguation results are stored in the Suggestions and only |
| // applied to the EnhancementStructure in (3). This allows to reduce the |
| // coverage of the wirte lock needed to be applied to the ContentItem. |
| ci.getLock().writeLock().lock(); |
| try { |
| applyDisambiguationResults(graph, disData); |
| } finally { |
| ci.getLock().writeLock().unlock(); |
| } |
| } |
| |
| /* |
| * Is used to query the Dbpedia with a entity as main constraint and then add string of all other entities |
| * detected as similarity constraints |
| */ |
| |
| protected QueryResultList<Entity> query(Site dbpediaSite, String savedEntityLabel, String language, |
| String extractionContext) throws SiteException { |
| FieldQuery query = dbpediaSite.getQueryFactory().createFieldQuery(); |
| if (savedEntityLabel != null && !savedEntityLabel.isEmpty()) { |
| Constraint labelConstraint; |
| if (language != null) { |
| labelConstraint = new TextConstraint(savedEntityLabel, false, language, null); |
| } else { |
| labelConstraint = new TextConstraint(savedEntityLabel, false); |
| } |
| // TODO: what happens if a recommendation was not based on rdfs:label? |
| query.setConstraint(RDFS_LABEL.getUnicodeString(), labelConstraint); |
| } else { |
| log.warn("parsed label {} was empty or NULL. Will use Similarity constraint only!", |
| savedEntityLabel); |
| } |
| query.setConstraint(SpecialFieldEnum.fullText.getUri(), new SimilarityConstraint(extractionContext)); |
| query.setLimit(25); |
| |
| return dbpediaSite.findEntities(query); |
| } |
| |
| /* |
| * If for an entity the Dbpedia query results in suggestion none of which match the already present |
| * ambiguations, we go with the ambiguations found earlier that is the ones we have with. |
| */ |
| // NOTE (rwesten): The disambiguateSuggestions now reduces confidence |
| // values of Suggestions that are not within the disambiguation result |
| // by the #confidenceWeight. So if not a single suggestion do match with |
| // the disambiguation result the ambiguation is kept but the overall |
| // fise:confidence values are reduced by #confidenceWeight (ensured to be |
| // less than 1) |
| // protected List<Triple> unchangedConfidences(List<UriRef> subsumed, |
| // MGraph graph, |
| // List<Triple> loseConfidence) { |
| // for (int i = 0; i < subsumed.size(); i++) { |
| // UriRef uri = subsumed.get(i); |
| // Iterator<Triple> confidenceTriple = graph.filter(uri, ENHANCER_CONFIDENCE, null); |
| // while (confidenceTriple.hasNext()) { |
| // loseConfidence.remove(confidenceTriple.next()); |
| // } |
| // } |
| // return loseConfidence; |
| // } |
| |
| /** |
| * Applies the disambiguation results to the suggestions of the {@link SavedEntity}. |
| * <p> |
| * This method modifies the state of the {@link SavedEntity#getSuggestions()} |
| * |
| * @param results |
| * the results of the disambiguation request |
| * @param savedEntity |
| * the saved entity to be disambiguated |
| **/ |
| protected void disambiguateSuggestions(QueryResultList<Entity> results, SavedEntity savedEntity) { |
| // NOTE (rwesten) We should not score disambiguation results based on |
| // how well the labels match. |
| // Either use directly the scores of the disambiguation results OR |
| // do combine the confidence of the original suggestion with the |
| // scores of the disambiguation |
| |
| /* |
| * Algorithm: Combine original confidence with Disambiguation results |
| * |
| * Parameter(s): |
| * |
| * * ratio configured as '{dr}:{cr}' where 'dr' stands for the ratio for the disambiguation score and |
| * 'cr' stand for the ratio for the original fise:confidence of a suggestion (default 1:1) * |
| * disambiguation weight (dw) := dr/(dr+cr) ... already calculated based on the configured ratio in |
| * #disambiguationWeight * confidence weight (cw) := cw/(dr+cr) ... already calculated based on the |
| * configured ratio in #confidenceWeight |
| * |
| * Input(s): |
| * |
| * * confidence (c): the original confidence of a suggestion (range [0..1]) * score (s): the score of |
| * the disambiguation * maximum score (ms): the maximum disambiguation score |
| * |
| * Output |
| * |
| * * disambiguated confidence (dc): the confidence after disambiguation |
| * |
| * Algorithm: |
| * |
| * * normalized score (ns) := s/ms ... ensures range [0..1] for disambiguation scores * disambiguated |
| * confidence = c*cw+ns*dw ... guaranteed to be [0..1] |
| */ |
| List<Suggestion> matches = new ArrayList<Suggestion>(results.size()); |
| Float maxScore = null; |
| Float maxSuggestedScore = null; |
| Iterator<Entity> guesses = results.iterator(); |
| log.info("disambiguate {}: ", savedEntity.getName()); |
| while (guesses.hasNext()) { |
| Entity guess = guesses.next(); |
| Float score = |
| guess.getRepresentation().getFirst(RdfResourceEnum.resultScore.getUri(), Float.class); |
| if (score == null) { |
| log.warn("Missing Score for Entityhub Query Result {}!", guess.getId()); |
| continue; |
| } |
| if (maxScore == null) { |
| maxScore = score; |
| } |
| UriRef uri = new UriRef(guess.getId()); |
| Suggestion suggestion = savedEntity.getSuggestion(uri); |
| if (suggestion == null) { |
| log.info(" - not found {}", guess.getId()); |
| continue; |
| } |
| if (maxSuggestedScore == null) { |
| maxSuggestedScore = score; |
| } |
| double c = suggestion.getOriginalConfidnece() == null ? 0 : suggestion.getOriginalConfidnece(); |
| // TODO (rwesten) we need to find out if we should normalize based on the |
| // maximum score or the maximum score of an suggested one |
| double ns = score / maxSuggestedScore; |
| suggestion.setNormalizedDisambiguationScore(ns); |
| double dc = c * confidenceWeight + ns * disambiguationWeight; |
| suggestion.setDisambiguatedConfidence(dc); |
| log.info(" - found {}, origConf:{}, disScore:{}, disConf:{}", |
| new Object[] {suggestion.getEntityUri(), c, ns, dc}); |
| } |
| // if at least one suggestion was also in the disambiguation result |
| if (maxSuggestedScore != null) { |
| // adapt the confidence of suggestions that where not part of the |
| // disambiguation result |
| for (Suggestion suggestion : savedEntity.getSuggestions()) { |
| if (suggestion.getDisambiguatedConfidence() == null) { |
| double c = |
| suggestion.getOriginalConfidnece() == null ? 0 : suggestion |
| .getOriginalConfidnece(); |
| suggestion.setDisambiguatedConfidence(c * confidenceWeight); |
| } |
| } |
| } else { // else keep the original results |
| log.info(" - none found"); |
| } |
| } |
| |
| /* |
| * Checks if there is any common elements amongst the ambiguations amongst latest dbpedia query and intial |
| * ambiguations |
| */ |
| // NOTE (rwesten): now done as part of the disambiguateSuggestions(..) |
| // method. |
| // protected boolean intersectionCheck(List<Suggestion> matches, |
| // List<UriRef> subsumed, |
| // MGraph graph, |
| // String contentLangauge) { |
| // for (int i = 0; i < subsumed.size(); i++) { |
| // UriRef uri = subsumed.get(i); |
| // |
| // UriRef uri1 = EnhancementEngineHelper.getReference(graph, uri, new UriRef(NamespaceEnum.fise |
| // + "entity-reference")); |
| // |
| // String selectedText = EnhancementEngineHelper.getString(graph, uri, ENHANCER_ENTITY_LABEL); |
| // |
| // if (selectedText == null) { |
| // continue; |
| // } |
| // |
| // for (int j = 0; j < matches.size(); j++) { |
| // Suggestion suggestion = matches.get(j); |
| // String suggestName = suggestion.getURI(); |
| // if (suggestName.compareToIgnoreCase(uri1.getUnicodeString()) == 0) return true; |
| // } |
| // } |
| // return false; |
| // } |
| |
| // NOTE (rwesten): one MUST NOT store information of processed ContentItems |
| // as member variables, as one EnhancementEngine instance is |
| // concurrently used to process multiple ContentItems. Because |
| // of that member variables will have data of different |
| // ContentItems! |
| // All those data need to be hold in information that are local |
| // to the processing of a single ContentItem (similar to |
| // SavedEntity). |
| // NOTE moved the DisambiguationData#directoryTextAnotation |
| // public Map<Integer,String> directoryTextAnotation = new HashMap<Integer,String>(); |
| |
| // TODO: make configureable |
| int radii = 23; |
| |
| // Value to be configured |
| |
| public boolean toInclude(int k, int s) { |
| if (Math.abs(k - s) < radii && Math.abs(k - s) > 0) { |
| return true; |
| } |
| return false; |
| } |
| |
| /* |
| * TODO: rwesten I do not understand what is the intension of this Adding the fise:selection-context of |
| * all entities within a range of #radii characters seams not to be a great way to build a context (or do |
| * i miss something? |
| */ |
| @Deprecated |
| // for now until someone can answer the anove question |
| public List<String> EntitiesInRange(NavigableMap<Integer,SavedEntity> map, int radius) { |
| List<String> temp = new ArrayList<String>(); |
| // TODO: reimplement using subMap of the parsed NavigableMap map |
| for (Entry<Integer,SavedEntity> entry : map.entrySet()) { |
| Integer s = entry.getKey(); |
| String subs = entry.getValue().getContext(); |
| if (toInclude(s, radius)) { |
| temp.add(subs); |
| } |
| } |
| |
| return temp; // if(Cal(f,k)) |
| } |
| |
| /** |
| * Returns a list of all fise:selected-text values occurring in the parsed context (excluding the parsed |
| * label if not null |
| * |
| * @param label |
| * The label of the current Entity. parse <code>null</code> if the current label should not be |
| * ignored (and included in the context) |
| * @param allEntities |
| * The collections with all the fise:selection-text values of all fise:TextAnnotations |
| * @param context |
| * @return |
| */ |
| protected List<String> getSelectionsInContext(String label, Collection<String> allEntities, String context) { |
| List<String> allEntityString = new ArrayList<String>(); |
| |
| for (String selectedText : allEntities) { |
| if (context.contains(selectedText) && selectedText.compareToIgnoreCase(label) != 0) { |
| allEntityString.add(selectedText); |
| } |
| |
| } |
| |
| return allEntityString; |
| } |
| |
| public String unionString(boolean unique, Collection<?>... lists) { |
| StringBuilder union = new StringBuilder(); |
| HashSet<String> added = new HashSet<String>(); |
| for (Collection<?> list : lists) { |
| for (Object entry : list) { |
| if (!unique || added.add(entry.toString())) { |
| union.append(entry); |
| union.append(' '); |
| } |
| } |
| } |
| return union.toString(); |
| } |
| |
| /* |
| * Finds values the lie in intersection of both the set of disambiguations( the one intially suggested and |
| * the one from dpedia). Update the confidence values of those and make the confidence values of others as |
| * 0 in gainconfidence list |
| */ |
| // NOTE (rwesten): intersection is calculated as part of the disambiguateSuggestions(..) |
| // method. Results are stored in the Suggestions (member of SavedEntiy) and |
| // than written back to the EnhancementStructure in a separate step |
| // protected List<Triple> intersection(List<Suggestion> matches, |
| // List<UriRef> subsumed, |
| // MGraph graph, |
| // List<Triple> gainConfidence, |
| // String contentLangauge) { |
| // |
| // for (int i = 0; i < subsumed.size(); i++) { |
| // boolean matchFound = false; |
| // UriRef uri = subsumed.get(i); |
| // |
| // UriRef uri1 = EnhancementEngineHelper.getReference(graph, uri, new UriRef(NamespaceEnum.fise |
| // + "entity-reference")); |
| // |
| // for (int j = 0; j < matches.size(); j++) { |
| // Suggestion suggestion = matches.get(j); |
| // String suggestName = suggestion.getURI(); |
| // |
| // if (suggestName != null && uri1 != null |
| // && suggestName.compareToIgnoreCase(uri1.getUnicodeString()) == 0) { |
| // Triple confidenceTriple = new TripleImpl(uri, ENHANCER_CONFIDENCE, LiteralFactory |
| // .getInstance().createTypedLiteral(suggestion.getScore())); |
| // Triple contributorTriple = new TripleImpl((UriRef) confidenceTriple.getSubject(), |
| // new UriRef(NamespaceEnum.dc + "contributor"), LiteralFactory.getInstance() |
| // .createTypedLiteral(this.getClass().getName())); |
| // gainConfidence.add(confidenceTriple); |
| // gainConfidence.add(contributorTriple); |
| // matchFound = true; |
| // } |
| // } |
| // |
| // if (!matchFound) { |
| // Triple confidenceTriple = new TripleImpl(uri, ENHANCER_CONFIDENCE, LiteralFactory |
| // .getInstance().createTypedLiteral(0.0)); |
| // Triple contributorTriple = new TripleImpl((UriRef) confidenceTriple.getSubject(), new UriRef( |
| // NamespaceEnum.dc + "contributor"), LiteralFactory.getInstance().createTypedLiteral( |
| // this.getClass().getName())); |
| // gainConfidence.add(confidenceTriple); |
| // gainConfidence.add(contributorTriple); |
| // } |
| // } |
| // |
| // return gainConfidence; |
| // } |
| |
| /* Removes the value in lose confidence from the graph */ |
| protected void removeOldConfidenceFromGraph(MGraph graph, List<Triple> loseConfidence) { |
| for (int i = 0; i < loseConfidence.size(); i++) { |
| Triple elementToRemove = loseConfidence.get(i); |
| graph.remove(elementToRemove); |
| } |
| } |
| |
| /** |
| * Adds the disambiguation results to the enhancement structure |
| * |
| * @param graph |
| * the metadata of the {@link ContentItem} |
| * @param disData |
| * the disambiguation data |
| */ |
| protected void applyDisambiguationResults(MGraph graph, DisambiguationData disData) { |
| for (SavedEntity savedEntity : disData.textAnnotations.values()) { |
| for (Suggestion s : savedEntity.getSuggestions()) { |
| if (s.getDisambiguatedConfidence() != null) { |
| if (disData.suggestionMap.get(s.getEntityAnnotation()).size() > 1) { |
| // already encountered AND disambiguated -> we need to clone!! |
| log.info("clone {} suggesting {} for {}[{},{}]({})", |
| new Object[] {s.getEntityAnnotation(), s.getEntityUri(), savedEntity.getName(), |
| savedEntity.getStart(), savedEntity.getEnd(), savedEntity.getUri()}); |
| s.setEntityAnnotation(cloneTextAnnotation(graph, s.getEntityAnnotation(), |
| savedEntity.getUri())); |
| log.info(" - cloned {}", s.getEntityAnnotation()); |
| } |
| // change the confidence |
| EnhancementEngineHelper.set(graph, s.getEntityAnnotation(), ENHANCER_CONFIDENCE, |
| s.getDisambiguatedConfidence(), literalFactory); |
| EnhancementEngineHelper.addContributingEngine(graph, s.getEntityAnnotation(), this); |
| } |
| } |
| } |
| } |
| |
| /** |
| * This creates a 'clone' of the fise:EntityAnnotation where the original does no longer have a |
| * dc:relation to the parsed fise:TextAnnotation and the created clone does only have a dc:relation to the |
| * parsed fise:TextAnnotation. |
| * <p> |
| * This is required by disambiguation because other engines typically only create a single |
| * fise:EntityAnnotation instance if several fise:TextAnnotation do have the same fise:selected-text |
| * values. So for a text that multiple times mentions the same Entity (e.g. "Paris") there will be |
| * multiple fise:TextAnnotations selecting the different mentions of that Entity, but there will be only a |
| * single set of suggestions - fise:EntityAnnotations (e.g. "Paris, France" and "Paris, Texas"). Now lets |
| * assume a text like |
| * |
| * <pre> |
| * Paris is the capital of France and it is worth a visit for sure. But |
| * one can also visit Paris without leaving the United States as there |
| * is also a city with the same name in Texas. |
| * </pre> |
| * |
| * Entity Disambiguation need to be able to have different fise:confidence values for the first and second |
| * mention of Paris and this is only possible of the fise:TextAnnotations of those mentions do NOT refer |
| * to the same set of fise:EntityAnnotations. |
| * <p> |
| * This methods accomplished exactly that as it |
| * <ul> |
| * <li>creates a clone of a fise:EntityAnnotation |
| * <li>removes the dc:relation link to the 2nd mention of Paris from the original |
| * <li>only adds the dc:relation of the end mention to the clone |
| * </ul> |
| * So in the end you will have two fise:EntityAnnotation |
| * <ul> |
| * <li>the original fise:EntityAnnotation with dc:relation to all fise:TextAnnotations other than the 2nd |
| * mention (the one this method was called for) |
| * <li>the cloned fise:EntityAnnnotation with a dc:relation to the 2nd mention. |
| * </ul> |
| * |
| * @param graph |
| * @param entityAnnotation |
| * @param textAnnotation |
| * @return |
| */ |
| public static UriRef cloneTextAnnotation(MGraph graph, UriRef entityAnnotation, UriRef textAnnotation) { |
| UriRef copy = new UriRef("urn:enhancement-" + EnhancementEngineHelper.randomUUID()); |
| Iterator<Triple> it = graph.filter(entityAnnotation, null, null); |
| // we can not add triples to the graph while iterating. So store them |
| // in a list and add later |
| List<Triple> added = new ArrayList<Triple>(32); |
| while (it.hasNext()) { |
| Triple triple = it.next(); |
| if (DC_RELATION.equals(triple.getPredicate())) { |
| if (triple.getObject().equals(textAnnotation)) { |
| // remove the dc relation to the currently processed |
| // textAnnotation from the original |
| it.remove(); |
| // and add it to the copy |
| added.add(new TripleImpl(copy, // use the copy as subject! |
| triple.getPredicate(), triple.getObject())); |
| } // else it is not the currently processed TextAnnotation |
| // so we need to keep in in the original and NOT add |
| // it to the copy |
| } else { // we can copy all other information 1:1 |
| added.add(new TripleImpl(copy, // use the copy as subject! |
| triple.getPredicate(), triple.getObject())); |
| } |
| } |
| graph.addAll(added); |
| return copy; |
| } |
| |
| /* Returns a string on appended text annotations seperated by spaces */ |
| protected String getEntitiesfromContext(String label, List<String> allEntities, String context) { |
| String allEntityString = ""; |
| |
| for (int i = 0; i < allEntities.size(); i++) { |
| |
| if (label.compareToIgnoreCase(allEntities.get(i)) != 0 && (context != null) |
| && (context.contains(allEntities.get(i)))) { |
| allEntityString = allEntityString + " " + allEntities.get(i); |
| } |
| |
| } |
| |
| return allEntityString; |
| } |
| |
    /*
     * Derives the sentence containing Context[a..b): extends the span to the last '.' before 'a'
     * and the first '.' after 'b'.
     *
     * NOTE(review) - several oddities to confirm before relying on this method:
     * - 'allEntityString' is unused.
     * - 'e' is an index into the substring starting at 'b'; it is only valid because it is always
     *   added back to 'b'.
     * - the branches are inconsistent: when a preceding '.' exists but no following one, the result
     *   starts AT the '.' (index s), while the last branch starts AFTER it (s + 1). Looks like a
     *   bug, but the method appears unused so the intended behavior can not be verified from here.
     */
    protected String deriveSentence(String Context, int a, int b) {
        String allEntityString = "";
        String start = Context.substring(0, a); // text before the span
        String end = Context.substring(b);      // text after the span
        int s = start.lastIndexOf('.'); // last sentence boundary before 'a' (-1 if none)
        int e = end.indexOf('.');       // first sentence boundary after 'b', relative to 'b' (-1 if none)
        if (s < 0) {
            if (e < 0) return Context;
            else return Context.substring(0, b + e);
        } else {
            if (e < 0) return Context.substring(s);
            else return Context.substring(s + 1, b + e);
        }

    }
| |
| /** |
| * Extracts the selection context based on the content, selection and the start char offset of the |
| * selection |
| * |
| * @param content |
| * the content |
| * @param selection |
| * the selected text |
| * @param selectionStartPos |
| * the start char position of the selection |
| * @param contextSize |
| * the size of the context in characters |
| * @return the context |
| */ |
| public static String getDisambiguationContext(String content, String selection, int selectionStartPos, |
| int contextSize) { |
| // extract the selection context |
| int beginPos; |
| if (selectionStartPos <= contextSize) { |
| beginPos = 0; |
| } else { |
| int start = selectionStartPos - contextSize; |
| beginPos = start; |
| int c; |
| do { |
| c = content.codePointAt(beginPos); |
| beginPos++; |
| } while (beginPos <= selectionStartPos || Character.isWhitespace(c) |
| || Character.getType(c) == Character.SPACE_SEPARATOR); |
| if (beginPos < 0 || beginPos >= selectionStartPos) { // no words |
| beginPos = start; // begin within a word |
| } |
| } |
| int endPos; |
| if (selectionStartPos + selection.length() + contextSize >= content.length()) { |
| endPos = content.length(); |
| } else { |
| int selectionEndPos = selectionStartPos + selection.length(); |
| int end = selectionEndPos + contextSize; |
| endPos = end; |
| int c; |
| do { |
| c = content.codePointAt(endPos); |
| endPos--; |
| } while (endPos > selectionEndPos || Character.isWhitespace(c) |
| || Character.getType(c) == Character.SPACE_SEPARATOR); |
| if (endPos <= selectionStartPos + selection.length()) { |
| endPos = end; // end within a word; |
| } |
| } |
| return content.substring(beginPos, endPos); |
| } |
| |
| /** |
| * Activate and read the properties |
| * |
| * @param ce |
| * the {@link ComponentContext} |
| */ |
| @Activate |
| protected void activate(ComponentContext ce) throws ConfigurationException { |
| try { |
| super.activate(ce); |
| } catch (IOException e) { |
| // log |
| log.error("Failed to update the configuration", e); |
| } |
| @SuppressWarnings("unchecked") |
| Dictionary<String,Object> properties = ce.getProperties(); |
| // update the service URL if it is defined |
| // if (properties.get(FORMCEPT_SERVICE_URL) != null) { |
| // this.serviceURL = (String) properties.get(FORMCEPT_SERVICE_URL); |
| // } |
| } |
| |
| /** |
| * Deactivate |
| * |
| * @param ce |
| * the {@link ComponentContext} |
| */ |
| @Deactivate |
| protected void deactivate(ComponentContext ce) { |
| super.deactivate(ce); |
| } |
| |
| /** |
| * Gets the Service URL |
| * |
| * @return |
| */ |
| public String getServiceURL() { |
| return serviceURL; |
| } |
| |
| // private static double levenshtein(String s1, String s2) { |
| // if (s1 == null || s2 == null) { |
| // throw new IllegalArgumentException("NONE of the parsed String MUST BE NULL!"); |
| // } |
| // s1 = StringUtils.trim(s1); |
| // s2 = StringUtils.trim(s2); |
| // return s1.isEmpty() || s2.isEmpty() ? 0 |
| // : 1.0 - (((double) getLevenshteinDistance(s1, s2)) / ((double) (Math.max(s1.length(), |
| // s2.length())))); |
| // } |
| |
| } |