| /* |
| * Copyright 2012, FORMCEPT [http://www.formcept.com] |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.stanbol.enhancer.engine.disambiguation.mlt; |
| |
| import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION; |
| import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE; |
| import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDFS_LABEL; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.Dictionary; |
| import java.util.HashSet; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Map.Entry; |
| import java.util.NavigableMap; |
| import java.util.Set; |
| |
| import org.apache.clerezza.rdf.core.LiteralFactory; |
| import org.apache.clerezza.rdf.core.MGraph; |
| import org.apache.clerezza.rdf.core.Triple; |
| import org.apache.clerezza.rdf.core.UriRef; |
| import org.apache.clerezza.rdf.core.impl.TripleImpl; |
| import org.apache.commons.lang.StringUtils; |
| import org.apache.felix.scr.annotations.Activate; |
| import org.apache.felix.scr.annotations.Component; |
| import org.apache.felix.scr.annotations.Deactivate; |
| import org.apache.felix.scr.annotations.Properties; |
| import org.apache.felix.scr.annotations.Property; |
| import org.apache.felix.scr.annotations.Reference; |
| import org.apache.felix.scr.annotations.Service; |
| import org.apache.stanbol.enhancer.servicesapi.Blob; |
| import org.apache.stanbol.enhancer.servicesapi.ContentItem; |
| import org.apache.stanbol.enhancer.servicesapi.EngineException; |
| import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine; |
| import org.apache.stanbol.enhancer.servicesapi.InvalidContentException; |
| import org.apache.stanbol.enhancer.servicesapi.ServiceProperties; |
| import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper; |
| import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper; |
| import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine; |
| import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum; |
| import org.apache.stanbol.entityhub.servicesapi.defaults.SpecialFieldEnum; |
| import org.apache.stanbol.entityhub.servicesapi.model.Entity; |
| import org.apache.stanbol.entityhub.servicesapi.model.Representation; |
| import org.apache.stanbol.entityhub.servicesapi.model.Text; |
| import org.apache.stanbol.entityhub.servicesapi.model.rdf.RdfResourceEnum; |
| import org.apache.stanbol.entityhub.servicesapi.query.Constraint; |
| import org.apache.stanbol.entityhub.servicesapi.query.FieldQuery; |
| import org.apache.stanbol.entityhub.servicesapi.query.QueryResultList; |
| import org.apache.stanbol.entityhub.servicesapi.query.SimilarityConstraint; |
| import org.apache.stanbol.entityhub.servicesapi.query.TextConstraint; |
| import org.apache.stanbol.entityhub.servicesapi.site.Site; |
| import org.apache.stanbol.entityhub.servicesapi.site.SiteException; |
| import org.apache.stanbol.entityhub.servicesapi.site.SiteManager; |
| import org.osgi.service.cm.ConfigurationException; |
| import org.osgi.service.component.ComponentContext; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| /** |
| * Disambiguation Engine using Entityhub {@link SimilarityConstraint}s to disambiguate between existing |
| * fise:EntityAnnotations for fise:TextAnnotations. |
| * <p> |
| * <b>TODOs</b>: |
| * <ul> |
| * <li>Configurations: currently all configurations is set to the defaults |
| * <li>Context: test and improve different ways to determine the context used for disambiguation. |
| * <li>URI based similarity: currently only full text similarity is used. However it would also be possible to |
| * use the {@link SpecialFieldEnum#references} field to disambiguate based on URIs of already suggested |
| * Entities. |
| * </ul> |
| * |
| * @author Kritarth Anand |
| * @author Rupert Westenthaler |
| */ |
| @Component(immediate = true, metatype = true) |
| @Service |
| @Properties(value = {@Property(name = EnhancementEngine.PROPERTY_NAME, value = "disambiguation-mlt")}) |
| public class DisambiguatorEngine extends AbstractEnhancementEngine<IOException,RuntimeException> implements |
| EnhancementEngine, ServiceProperties { |
| |
    private static Logger log = LoggerFactory.getLogger(DisambiguatorEngine.class);

    /**
     * Service URL.
     * <p>
     * NOTE(review): currently never assigned - the assignment in {@code activate(..)} is commented
     * out - so {@link #getServiceURL()} always returns <code>null</code>. Confirm whether this
     * configuration is still needed.
     */
    private String serviceURL;

    /**
     * The default value for the execution of this Engine. Currently set to
     * {@link ServiceProperties#ORDERING_POST_PROCESSING} - 90.
     * <p>
     * NOTE(review): the intention (per the original comment) is that this engine runs early in the
     * post-processing phase - confirm that the "- 90" offset (and not "+ 90") achieves that with
     * the ordering semantics of the enhancement chain.
     */
    public static final Integer defaultOrder = ServiceProperties.ORDERING_POST_PROCESSING - 90;
    /**
     * The plain text might be required for determining the extraction context
     */
    public static final String PLAIN_TEXT_MIMETYPE = "text/plain";
    /**
     * Contains the only supported mime type {@link #PLAIN_TEXT_MIMETYPE}
     */
    public static final Set<String> SUPPORTED_MIMETYPES = Collections.singleton(PLAIN_TEXT_MIMETYPE);

    /**
     * Used to lookup the Entityhub {@link Site} used to perform the disambiguation.
     */
    @Reference
    protected SiteManager siteManager;

    /*
     * The following parameters describe the ratio of the original fise:confidence values and the
     * disambiguation scores contributing to the final disambiguated fise:confidence
     *
     * TODO: make configurable
     */
    /**
     * Default ratio for Disambiguation (2.0)
     */
    public static final double DEFAULT_DISAMBIGUATION_RATIO = 2.0;
    /**
     * Default ratio for the original fise:confidence of suggested entities (1.0).
     * <p>
     * NOTE: the name misspells "CONFIDENCE" but is kept as-is because it is part of the public API.
     */
    public static final double DEFAULT_CONFIDNECE_RATIO = 1.0;

    /**
     * The weight for disambiguation scores <code>:= disRatio/(disRatio+confRatio)</code>
     */
    private double disambiguationWeight = DEFAULT_DISAMBIGUATION_RATIO
            / (DEFAULT_DISAMBIGUATION_RATIO + DEFAULT_CONFIDNECE_RATIO);
    /**
     * The weight for the original confidence scores <code>:= confRatio/(disRatio+confRatio)</code>
     */
    private double confidenceWeight = DEFAULT_CONFIDNECE_RATIO
            / (DEFAULT_DISAMBIGUATION_RATIO + DEFAULT_CONFIDNECE_RATIO);

    /**
     * The {@link LiteralFactory} used to create typed RDF literals
     */
    private final LiteralFactory literalFactory = LiteralFactory.getInstance();
| |
| /** |
| * Returns the properties containing the {@link ServiceProperties#ENHANCEMENT_ENGINE_ORDERING} |
| */ |
| @Override |
| public Map<String,Object> getServiceProperties() { |
| return Collections.unmodifiableMap(Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING, |
| (Object) defaultOrder)); |
| } |
| |
| @Override |
| public int canEnhance(ContentItem ci) throws EngineException { |
| // check if content is present |
| try { |
| if ((ContentItemHelper.getText(ci.getBlob()) == null) |
| || (ContentItemHelper.getText(ci.getBlob()).trim().isEmpty())) { |
| return CANNOT_ENHANCE; |
| } |
| } catch (IOException e) { |
| log.error("Failed to get the text for " + "enhancement of content: " + ci.getUri(), e); |
| throw new InvalidContentException(this, ci, e); |
| } |
| // default enhancement is synchronous enhancement |
| return ENHANCE_SYNCHRONOUS; |
| } |
| |
| /* |
| * This function first evaluates all the possible ambiguations of each text annotation detected. the text |
| * of all entities detected is used for making a Dbpedia query with all string for MLT that contain all |
| * the other entities. The results obtained are used to calcualte new confidence values which are updated |
| * in the metadata. |
| */ |
| @Override |
| public void computeEnhancements(ContentItem ci) throws EngineException { |
| |
| String textContent; |
| Entry<UriRef,Blob> textBlob = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES); |
| if (textBlob != null) { |
| try { |
| textContent = ContentItemHelper.getText(textBlob.getValue()); |
| } catch (IOException e) { |
| log.warn("Unable to retieve plain text content for ContentItem " + ci.getUri(), e); |
| textContent = null; |
| } |
| } else { |
| textContent = null; |
| } |
| |
| MGraph graph = ci.getMetadata(); |
| |
| // (1) read the data from the content item |
| String contentLangauge; |
| DisambiguationData disData; |
| ci.getLock().readLock().lock(); |
| try { |
| contentLangauge = EnhancementEngineHelper.getLanguage(ci); |
| // NOTE (rwesten): moved the parsing of the information from the |
| // contentItem to static method of the Class holding those information |
| // (similar as it already was for SavedEntity) |
| // readEntities(loseConfidence, allEntities, textAnnotations, graph); |
| disData = DisambiguationData.createFromContentItem(ci); |
| } finally { |
| ci.getLock().readLock().unlock(); |
| } |
| |
| // (2) Disambiguate the SavedEntities |
| for (SavedEntity savedEntity : disData.textAnnotations.values()) { |
| if (savedEntity.getSuggestions().size() <= 1) { |
| // we need not to disambiguate if only one suggestion is present |
| continue; |
| } |
| // NOTE: the site is determined from the |
| // fise:TextAnnotation <-- dc:relation -- |
| // fise:EntityAnnotation -- entityhub:ste --> "{siteName}"^^xsd:string |
| // data. |
| // TODO: add configuration to include/exclude Sites by name |
| Site site = siteManager.getSite(savedEntity.getSite()); |
| Collection<String> types = null; // potential types of entities |
| boolean casesensitive = false; // TODO: make configurable |
| String savedEntityLabel = |
| casesensitive ? savedEntity.getName() : savedEntity.getName().toLowerCase(); |
| |
| // Determine the context used for disambiguation |
| // TODO: make this configurable options |
| |
| String disambiguationContext; |
| // (0.a) The easiest way is to just use the selection context |
| // disambiguationContext = savedEntity.getContext(); |
| // (0.b) Calculate a context based on a moving window |
| String window = |
| getDisambiguationContext(textContent, savedEntity.getName(), savedEntity.getStart(), 100); |
| log.info("Use Window: '{}' for '{}'", window, savedEntity.getName()); |
| |
| // (1) The contextSelections: |
| // All other selected text within the selection context |
| List<String> contextSelections = |
| getSelectionsInContext(savedEntity.getName(), disData.allSelectedTexts, window); |
| // savedEntity.getContext()); |
| disambiguationContext = unionString(false, contextSelections); |
| |
| // (2) I do not understand this variant (see comment for the |
| // EntitiesInRange(..) method |
| // List<String> L = EntitiesInRange(disData.directoryTextAnotation, |
| // (savedEntity.getStart() + savedEntity.getEnd()) / 2); |
| // disambiguationContext = unionString(false,contextSelections); |
| |
| // (3) one can build a combination of the above |
| // disambiguationContext = unionString(true, //unique adds |
| // Collections.singleton(savedEntity.getName()), //the selected text |
| // Collections.singleton(context), //the context |
| // contextSelections); //other selected parsed in the context |
| |
| // or just the name of the entity AND the context |
| // disambiguationContext = unionString(false, |
| // Collections.singleton(savedEntity.getName()), |
| // contextSelections); |
| |
| // (4) TODO: I would also like to have the possibility to disambiguate |
| // using URIs of Entities suggested for other TextAnnotations |
| // within the context. |
| |
| // make the similarity query on the Entityhub using the collected |
| // information |
| QueryResultList<Entity> results; |
| log.info(" - Query '{}' for {}@{} with context '{}'", new Object[] {site.getId(), |
| savedEntityLabel, contentLangauge, disambiguationContext}); |
| if (!StringUtils.isBlank(disambiguationContext)) { |
| try { |
| results = query(site, savedEntityLabel, contentLangauge, disambiguationContext); |
| } catch (SiteException e) { |
| // TODO we could also try to catch those errors ... |
| throw new EngineException("Unable to disambiguate Mention of '" + savedEntity.getName() |
| + "' on Entityhub Site '" + site.getId() + "!", e); |
| } |
| log.debug(" - {} results returned by query {}", results.size(), results.getQuery()); |
| // match the results with the suggestions |
| disambiguateSuggestions(results, savedEntity); |
| } else { |
| log.debug(" - not disambiguated because of empty context!"); |
| } |
| } |
| // (3) Write back the Results of the Disambiguation process |
| // NOTE (rwesten): In the original version of Kritarth this was done as |
| // part of (2) - disambiguation. This is now changed as in (2) the |
| // disambiguation results are stored in the Suggestions and only |
| // applied to the EnhancementStructure in (3). This allows to reduce the |
| // coverage of the wirte lock needed to be applied to the ContentItem. |
| ci.getLock().writeLock().lock(); |
| try { |
| applyDisambiguationResults(graph, disData); |
| } finally { |
| ci.getLock().writeLock().unlock(); |
| } |
| } |
| |
| /* |
| * Is used to query the Dbpedia with a entity as main constraint and then add string of all other entities |
| * detected as similarity constraints |
| */ |
| |
| protected QueryResultList<Entity> query(Site dbpediaSite, String savedEntityLabel, String language, |
| String extractionContext) throws SiteException { |
| FieldQuery query = dbpediaSite.getQueryFactory().createFieldQuery(); |
| if (savedEntityLabel != null && !savedEntityLabel.isEmpty()) { |
| Constraint labelConstraint; |
| if (language != null) { |
| labelConstraint = new TextConstraint(savedEntityLabel, false, language, null); |
| } else { |
| labelConstraint = new TextConstraint(savedEntityLabel, false); |
| } |
| // TODO: what happens if a recommendation was not based on rdfs:label? |
| query.setConstraint(RDFS_LABEL.getUnicodeString(), labelConstraint); |
| } else { |
| log.warn("parsed label {} was empty or NULL. Will use Similarity constraint only!", |
| savedEntityLabel); |
| } |
| query.setConstraint(SpecialFieldEnum.fullText.getUri(), new SimilarityConstraint(extractionContext)); |
| query.setLimit(25); |
| |
| return dbpediaSite.findEntities(query); |
| } |
| |
| /* |
| * If for an entity the Dbpedia query results in suggestion none of which match the already present |
| * ambiguations, we go with the ambiguations found earlier that is the ones we have with. |
| */ |
| // NOTE (rwesten): The disambiguateSuggestions now reduces confidence |
| // values of Suggestions that are not within the disambiguation result |
| // by the #confidenceWeight. So if not a single suggestion do match with |
| // the disambiguation result the ambiguation is kept but the overall |
| // fise:confidence values are reduced by #confidenceWeight (ensured to be |
| // less than 1) |
| // protected List<Triple> unchangedConfidences(List<UriRef> subsumed, |
| // MGraph graph, |
| // List<Triple> loseConfidence) { |
| // for (int i = 0; i < subsumed.size(); i++) { |
| // UriRef uri = subsumed.get(i); |
| // Iterator<Triple> confidenceTriple = graph.filter(uri, ENHANCER_CONFIDENCE, null); |
| // while (confidenceTriple.hasNext()) { |
| // loseConfidence.remove(confidenceTriple.next()); |
| // } |
| // } |
| // return loseConfidence; |
| // } |
| |
| /** |
| * Applies the disambiguation results to the suggestions of the {@link SavedEntity}. |
| * <p> |
| * This method modifies the state of the {@link SavedEntity#getSuggestions()} |
| * |
| * @param results |
| * the results of the disambiguation request |
| * @param savedEntity |
| * the saved entity to be disambiguated |
| **/ |
| protected void disambiguateSuggestions(QueryResultList<Entity> results, SavedEntity savedEntity) { |
| // NOTE (rwesten) We should not score disambiguation results based on |
| // how well the labels match. |
| // Either use directly the scores of the disambiguation results OR |
| // do combine the confidence of the original suggestion with the |
| // scores of the disambiguation |
| |
| /* |
| * Algorithm: Combine original confidence with Disambiguation results |
| * |
| * Parameter(s): |
| * |
| * * ratio configured as '{dr}:{cr}' where 'dr' stands for the ratio for the disambiguation score and |
| * 'cr' stand for the ratio for the original fise:confidence of a suggestion (default 1:1) * |
| * disambiguation weight (dw) := dr/(dr+cr) ... already calculated based on the configured ratio in |
| * #disambiguationWeight * confidence weight (cw) := cw/(dr+cr) ... already calculated based on the |
| * configured ratio in #confidenceWeight |
| * |
| * Input(s): |
| * |
| * * confidence (c): the original confidence of a suggestion (range [0..1]) * score (s): the score of |
| * the disambiguation * maximum score (ms): the maximum disambiguation score |
| * |
| * Output |
| * |
| * * disambiguated confidence (dc): the confidence after disambiguation |
| * |
| * Algorithm: |
| * |
| * * normalized score (ns) := s/ms ... ensures range [0..1] for disambiguation scores * disambiguated |
| * confidence = c*cw+ns*dw ... guaranteed to be [0..1] |
| */ |
| List<Suggestion> matches = new ArrayList<Suggestion>(results.size()); |
| Float maxScore = null; |
| Float maxSuggestedScore = null; |
| Iterator<Entity> guesses = results.iterator(); |
| log.info("disambiguate {}: ", savedEntity.getName()); |
| while (guesses.hasNext()) { |
| Entity guess = guesses.next(); |
| Float score = |
| guess.getRepresentation().getFirst(RdfResourceEnum.resultScore.getUri(), Float.class); |
| if (score == null) { |
| log.warn("Missing Score for Entityhub Query Result {}!", guess.getId()); |
| continue; |
| } |
| if (maxScore == null) { |
| maxScore = score; |
| } |
| UriRef uri = new UriRef(guess.getId()); |
| Suggestion suggestion = savedEntity.getSuggestion(uri); |
| if (suggestion == null) { |
| log.info(" - not found {}", guess.getId()); |
| continue; |
| } |
| if (maxSuggestedScore == null) { |
| maxSuggestedScore = score; |
| } |
| double c = suggestion.getOriginalConfidnece() == null ? 0 : suggestion.getOriginalConfidnece(); |
| // TODO (rwesten) we need to find out if we should normalize based on the |
| // maximum score or the maximum score of an suggested one |
| double ns = score / maxSuggestedScore; |
| suggestion.setNormalizedDisambiguationScore(ns); |
| double dc = c * confidenceWeight + ns * disambiguationWeight; |
| suggestion.setDisambiguatedConfidence(dc); |
| log.info(" - found {}, origConf:{}, disScore:{}, disConf:{}", |
| new Object[] {suggestion.getEntityUri(), c, ns, dc}); |
| } |
| // if at least one suggestion was also in the disambiguation result |
| if (maxSuggestedScore != null) { |
| // adapt the confidence of suggestions that where not part of the |
| // disambiguation result |
| for (Suggestion suggestion : savedEntity.getSuggestions()) { |
| if (suggestion.getDisambiguatedConfidence() == null) { |
| double c = |
| suggestion.getOriginalConfidnece() == null ? 0 : suggestion |
| .getOriginalConfidnece(); |
| suggestion.setDisambiguatedConfidence(c * confidenceWeight); |
| } |
| } |
| } else { // else keep the original results |
| log.info(" - none found"); |
| } |
| } |
| |
| /* |
| * Checks if there is any common elements amongst the ambiguations amongst latest dbpedia query and intial |
| * ambiguations |
| */ |
| // NOTE (rwesten): now done as part of the disambiguateSuggestions(..) |
| // method. |
| // protected boolean intersectionCheck(List<Suggestion> matches, |
| // List<UriRef> subsumed, |
| // MGraph graph, |
| // String contentLangauge) { |
| // for (int i = 0; i < subsumed.size(); i++) { |
| // UriRef uri = subsumed.get(i); |
| // |
| // UriRef uri1 = EnhancementEngineHelper.getReference(graph, uri, new UriRef(NamespaceEnum.fise |
| // + "entity-reference")); |
| // |
| // String selectedText = EnhancementEngineHelper.getString(graph, uri, ENHANCER_ENTITY_LABEL); |
| // |
| // if (selectedText == null) { |
| // continue; |
| // } |
| // |
| // for (int j = 0; j < matches.size(); j++) { |
| // Suggestion suggestion = matches.get(j); |
| // String suggestName = suggestion.getURI(); |
| // if (suggestName.compareToIgnoreCase(uri1.getUnicodeString()) == 0) return true; |
| // } |
| // } |
| // return false; |
| // } |
| |
| // NOTE (rwesten): one MUST NOT store information of processed ContentItems |
| // as member variables, as one EnhancementEngine instance is |
| // concurrently used to process multiple ContentItems. Because |
| // of that member variables will have data of different |
| // ContentItems! |
| // All those data need to be hold in information that are local |
| // to the processing of a single ContentItem (similar to |
| // SavedEntity). |
| // NOTE moved the DisambiguationData#directoryTextAnotation |
| // public Map<Integer,String> directoryTextAnotation = new HashMap<Integer,String>(); |
| |
| // TODO: make configureable |
| int radii = 23; |
| |
| // Value to be configured |
| |
| public boolean toInclude(int k, int s) { |
| if (Math.abs(k - s) < radii && Math.abs(k - s) > 0) { |
| return true; |
| } |
| return false; |
| } |
| |
| /* |
| * TODO: rwesten I do not understand what is the intension of this Adding the fise:selection-context of |
| * all entities within a range of #radii characters seams not to be a great way to build a context (or do |
| * i miss something? |
| */ |
| @Deprecated |
| // for now until someone can answer the anove question |
| public List<String> EntitiesInRange(NavigableMap<Integer,SavedEntity> map, int radius) { |
| List<String> temp = new ArrayList<String>(); |
| // TODO: reimplement using subMap of the parsed NavigableMap map |
| for (Entry<Integer,SavedEntity> entry : map.entrySet()) { |
| Integer s = entry.getKey(); |
| String subs = entry.getValue().getContext(); |
| if (toInclude(s, radius)) { |
| temp.add(subs); |
| } |
| } |
| |
| return temp; // if(Cal(f,k)) |
| } |
| |
| /** |
| * Returns a list of all fise:selected-text values occurring in the parsed context (excluding the parsed |
| * label if not null |
| * |
| * @param label |
| * The label of the current Entity. parse <code>null</code> if the current label should not be |
| * ignored (and included in the context) |
| * @param allEntities |
| * The collections with all the fise:selection-text values of all fise:TextAnnotations |
| * @param context |
| * @return |
| */ |
| protected List<String> getSelectionsInContext(String label, Collection<String> allEntities, String context) { |
| List<String> allEntityString = new ArrayList<String>(); |
| |
| for (String selectedText : allEntities) { |
| if (context.contains(selectedText) && selectedText.compareToIgnoreCase(label) != 0) { |
| allEntityString.add(selectedText); |
| } |
| |
| } |
| |
| return allEntityString; |
| } |
| |
| public String unionString(boolean unique, Collection<?>... lists) { |
| StringBuilder union = new StringBuilder(); |
| HashSet<String> added = new HashSet<String>(); |
| for (Collection<?> list : lists) { |
| for (Object entry : list) { |
| if (!unique || added.add(entry.toString())) { |
| union.append(entry); |
| union.append(' '); |
| } |
| } |
| } |
| return union.toString(); |
| } |
| |
| /* |
| * Finds values the lie in intersection of both the set of disambiguations( the one intially suggested and |
| * the one from dpedia). Update the confidence values of those and make the confidence values of others as |
| * 0 in gainconfidence list |
| */ |
| // NOTE (rwesten): intersection is calculated as part of the disambiguateSuggestions(..) |
| // method. Results are stored in the Suggestions (member of SavedEntiy) and |
| // than written back to the EnhancementStructure in a separate step |
| // protected List<Triple> intersection(List<Suggestion> matches, |
| // List<UriRef> subsumed, |
| // MGraph graph, |
| // List<Triple> gainConfidence, |
| // String contentLangauge) { |
| // |
| // for (int i = 0; i < subsumed.size(); i++) { |
| // boolean matchFound = false; |
| // UriRef uri = subsumed.get(i); |
| // |
| // UriRef uri1 = EnhancementEngineHelper.getReference(graph, uri, new UriRef(NamespaceEnum.fise |
| // + "entity-reference")); |
| // |
| // for (int j = 0; j < matches.size(); j++) { |
| // Suggestion suggestion = matches.get(j); |
| // String suggestName = suggestion.getURI(); |
| // |
| // if (suggestName != null && uri1 != null |
| // && suggestName.compareToIgnoreCase(uri1.getUnicodeString()) == 0) { |
| // Triple confidenceTriple = new TripleImpl(uri, ENHANCER_CONFIDENCE, LiteralFactory |
| // .getInstance().createTypedLiteral(suggestion.getScore())); |
| // Triple contributorTriple = new TripleImpl((UriRef) confidenceTriple.getSubject(), |
| // new UriRef(NamespaceEnum.dc + "contributor"), LiteralFactory.getInstance() |
| // .createTypedLiteral(this.getClass().getName())); |
| // gainConfidence.add(confidenceTriple); |
| // gainConfidence.add(contributorTriple); |
| // matchFound = true; |
| // } |
| // } |
| // |
| // if (!matchFound) { |
| // Triple confidenceTriple = new TripleImpl(uri, ENHANCER_CONFIDENCE, LiteralFactory |
| // .getInstance().createTypedLiteral(0.0)); |
| // Triple contributorTriple = new TripleImpl((UriRef) confidenceTriple.getSubject(), new UriRef( |
| // NamespaceEnum.dc + "contributor"), LiteralFactory.getInstance().createTypedLiteral( |
| // this.getClass().getName())); |
| // gainConfidence.add(confidenceTriple); |
| // gainConfidence.add(contributorTriple); |
| // } |
| // } |
| // |
| // return gainConfidence; |
| // } |
| |
| /* Removes the value in lose confidence from the graph */ |
| protected void removeOldConfidenceFromGraph(MGraph graph, List<Triple> loseConfidence) { |
| for (int i = 0; i < loseConfidence.size(); i++) { |
| Triple elementToRemove = loseConfidence.get(i); |
| graph.remove(elementToRemove); |
| } |
| } |
| |
| /** |
| * Adds the disambiguation results to the enhancement structure |
| * |
| * @param graph |
| * the metadata of the {@link ContentItem} |
| * @param disData |
| * the disambiguation data |
| */ |
| protected void applyDisambiguationResults(MGraph graph, DisambiguationData disData) { |
| for (SavedEntity savedEntity : disData.textAnnotations.values()) { |
| for (Suggestion s : savedEntity.getSuggestions()) { |
| if (s.getDisambiguatedConfidence() != null) { |
| if (disData.suggestionMap.get(s.getEntityAnnotation()).size() > 1) { |
| // already encountered AND disambiguated -> we need to clone!! |
| log.info("clone {} suggesting {} for {}[{},{}]({})", |
| new Object[] {s.getEntityAnnotation(), s.getEntityUri(), savedEntity.getName(), |
| savedEntity.getStart(), savedEntity.getEnd(), savedEntity.getUri()}); |
| s.setEntityAnnotation(cloneTextAnnotation(graph, s.getEntityAnnotation(), |
| savedEntity.getUri())); |
| log.info(" - cloned {}", s.getEntityAnnotation()); |
| } |
| // change the confidence |
| EnhancementEngineHelper.set(graph, s.getEntityAnnotation(), ENHANCER_CONFIDENCE, |
| s.getDisambiguatedConfidence(), literalFactory); |
| EnhancementEngineHelper.addContributingEngine(graph, s.getEntityAnnotation(), this); |
| } |
| } |
| } |
| } |
| |
| /** |
| * This creates a 'clone' of the fise:EntityAnnotation where the original does no longer have a |
| * dc:relation to the parsed fise:TextAnnotation and the created clone does only have a dc:relation to the |
| * parsed fise:TextAnnotation. |
| * <p> |
| * This is required by disambiguation because other engines typically only create a single |
| * fise:EntityAnnotation instance if several fise:TextAnnotation do have the same fise:selected-text |
| * values. So for a text that multiple times mentions the same Entity (e.g. "Paris") there will be |
| * multiple fise:TextAnnotations selecting the different mentions of that Entity, but there will be only a |
| * single set of suggestions - fise:EntityAnnotations (e.g. "Paris, France" and "Paris, Texas"). Now lets |
| * assume a text like |
| * |
| * <pre> |
| * Paris is the capital of France and it is worth a visit for sure. But |
| * one can also visit Paris without leaving the United States as there |
| * is also a city with the same name in Texas. |
| * </pre> |
| * |
| * Entity Disambiguation need to be able to have different fise:confidence values for the first and second |
| * mention of Paris and this is only possible of the fise:TextAnnotations of those mentions do NOT refer |
| * to the same set of fise:EntityAnnotations. |
| * <p> |
| * This methods accomplished exactly that as it |
| * <ul> |
| * <li>creates a clone of a fise:EntityAnnotation |
| * <li>removes the dc:relation link to the 2nd mention of Paris from the original |
| * <li>only adds the dc:relation of the end mention to the clone |
| * </ul> |
| * So in the end you will have two fise:EntityAnnotation |
| * <ul> |
| * <li>the original fise:EntityAnnotation with dc:relation to all fise:TextAnnotations other than the 2nd |
| * mention (the one this method was called for) |
| * <li>the cloned fise:EntityAnnnotation with a dc:relation to the 2nd mention. |
| * </ul> |
| * |
| * @param graph |
| * @param entityAnnotation |
| * @param textAnnotation |
| * @return |
| */ |
| public static UriRef cloneTextAnnotation(MGraph graph, UriRef entityAnnotation, UriRef textAnnotation) { |
| UriRef copy = new UriRef("urn:enhancement-" + EnhancementEngineHelper.randomUUID()); |
| Iterator<Triple> it = graph.filter(entityAnnotation, null, null); |
| // we can not add triples to the graph while iterating. So store them |
| // in a list and add later |
| List<Triple> added = new ArrayList<Triple>(32); |
| while (it.hasNext()) { |
| Triple triple = it.next(); |
| if (DC_RELATION.equals(triple.getPredicate())) { |
| if (triple.getObject().equals(textAnnotation)) { |
| // remove the dc relation to the currently processed |
| // textAnnotation from the original |
| it.remove(); |
| // and add it to the copy |
| added.add(new TripleImpl(copy, // use the copy as subject! |
| triple.getPredicate(), triple.getObject())); |
| } // else it is not the currently processed TextAnnotation |
| // so we need to keep in in the original and NOT add |
| // it to the copy |
| } else { // we can copy all other information 1:1 |
| added.add(new TripleImpl(copy, // use the copy as subject! |
| triple.getPredicate(), triple.getObject())); |
| } |
| } |
| graph.addAll(added); |
| return copy; |
| } |
| |
| /* Returns a string on appended text annotations seperated by spaces */ |
| protected String getEntitiesfromContext(String label, List<String> allEntities, String context) { |
| String allEntityString = ""; |
| |
| for (int i = 0; i < allEntities.size(); i++) { |
| |
| if (label.compareToIgnoreCase(allEntities.get(i)) != 0 && (context != null) |
| && (context.contains(allEntities.get(i)))) { |
| allEntityString = allEntityString + " " + allEntities.get(i); |
| } |
| |
| } |
| |
| return allEntityString; |
| } |
| |
    /*
     * Derives the sentence containing Context[a..b): extends the span to the last '.' before 'a'
     * and the first '.' after 'b'.
     *
     * NOTE(review) - several oddities to confirm before relying on this method:
     * - 'allEntityString' is unused.
     * - 'e' is an index into the substring starting at 'b'; it is only valid because it is always
     *   added back to 'b'.
     * - the branches are inconsistent: when a preceding '.' exists but no following one, the result
     *   starts AT the '.' (index s), while the last branch starts AFTER it (s + 1). Looks like a
     *   bug, but the method appears unused so the intended behavior can not be verified from here.
     */
    protected String deriveSentence(String Context, int a, int b) {
        String allEntityString = "";
        String start = Context.substring(0, a); // text before the span
        String end = Context.substring(b);      // text after the span
        int s = start.lastIndexOf('.'); // last sentence boundary before 'a' (-1 if none)
        int e = end.indexOf('.');       // first sentence boundary after 'b', relative to 'b' (-1 if none)
        if (s < 0) {
            if (e < 0) return Context;
            else return Context.substring(0, b + e);
        } else {
            if (e < 0) return Context.substring(s);
            else return Context.substring(s + 1, b + e);
        }

    }
| |
| /** |
| * Extracts the selection context based on the content, selection and the start char offset of the |
| * selection |
| * |
| * @param content |
| * the content |
| * @param selection |
| * the selected text |
| * @param selectionStartPos |
| * the start char position of the selection |
| * @param contextSize |
| * the size of the context in characters |
| * @return the context |
| */ |
| public static String getDisambiguationContext(String content, String selection, int selectionStartPos, |
| int contextSize) { |
| // extract the selection context |
| int beginPos; |
| if (selectionStartPos <= contextSize) { |
| beginPos = 0; |
| } else { |
| int start = selectionStartPos - contextSize; |
| beginPos = start; |
| int c; |
| do { |
| c = content.codePointAt(beginPos); |
| beginPos++; |
| } while (beginPos <= selectionStartPos || Character.isWhitespace(c) |
| || Character.getType(c) == Character.SPACE_SEPARATOR); |
| if (beginPos < 0 || beginPos >= selectionStartPos) { // no words |
| beginPos = start; // begin within a word |
| } |
| } |
| int endPos; |
| if (selectionStartPos + selection.length() + contextSize >= content.length()) { |
| endPos = content.length(); |
| } else { |
| int selectionEndPos = selectionStartPos + selection.length(); |
| int end = selectionEndPos + contextSize; |
| endPos = end; |
| int c; |
| do { |
| c = content.codePointAt(endPos); |
| endPos--; |
| } while (endPos > selectionEndPos || Character.isWhitespace(c) |
| || Character.getType(c) == Character.SPACE_SEPARATOR); |
| if (endPos <= selectionStartPos + selection.length()) { |
| endPos = end; // end within a word; |
| } |
| } |
| return content.substring(beginPos, endPos); |
| } |
| |
| /** |
| * Activate and read the properties |
| * |
| * @param ce |
| * the {@link ComponentContext} |
| */ |
| @Activate |
| protected void activate(ComponentContext ce) throws ConfigurationException { |
| try { |
| super.activate(ce); |
| } catch (IOException e) { |
| // log |
| log.error("Failed to update the configuration", e); |
| } |
| @SuppressWarnings("unchecked") |
| Dictionary<String,Object> properties = ce.getProperties(); |
| // update the service URL if it is defined |
| // if (properties.get(FORMCEPT_SERVICE_URL) != null) { |
| // this.serviceURL = (String) properties.get(FORMCEPT_SERVICE_URL); |
| // } |
| } |
| |
| /** |
| * Deactivate |
| * |
| * @param ce |
| * the {@link ComponentContext} |
| */ |
| @Deactivate |
| protected void deactivate(ComponentContext ce) { |
| super.deactivate(ce); |
| } |
| |
| /** |
| * Gets the Service URL |
| * |
| * @return |
| */ |
| public String getServiceURL() { |
| return serviceURL; |
| } |
| |
| // private static double levenshtein(String s1, String s2) { |
| // if (s1 == null || s2 == null) { |
| // throw new IllegalArgumentException("NONE of the parsed String MUST BE NULL!"); |
| // } |
| // s1 = StringUtils.trim(s1); |
| // s2 = StringUtils.trim(s2); |
| // return s1.isEmpty() || s2.isEmpty() ? 0 |
| // : 1.0 - (((double) getLevenshteinDistance(s1, s2)) / ((double) (Math.max(s1.length(), |
| // s2.length())))); |
| // } |
| |
| } |