// enhancement-engines/opennlp/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java - stanbol - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.stanbol.enhancer.engines.opennlp.impl;

 import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.NER_ANNOTATION;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTION_CONTEXT;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;

 import java.io.IOException;
 import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
 import java.util.Set;

 import opennlp.tools.namefind.NameFinderME;
 import opennlp.tools.namefind.TokenNameFinderModel;
 import opennlp.tools.sentdetect.SentenceDetectorME;
 import opennlp.tools.sentdetect.SentenceModel;
 import opennlp.tools.tokenize.Tokenizer;
 import opennlp.tools.util.InvalidFormatException;
 import opennlp.tools.util.Span;

 import org.apache.clerezza.commons.rdf.Language;
 import org.apache.clerezza.rdf.core.LiteralFactory;
 import org.apache.clerezza.commons.rdf.Graph;
 import org.apache.clerezza.commons.rdf.IRI;
 import org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl;
 import org.apache.clerezza.commons.rdf.impl.utils.TripleImpl;
 import org.apache.commons.lang.StringUtils;
 import org.apache.stanbol.commons.opennlp.OpenNLP;
 import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider;
 import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
 import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
 import org.apache.stanbol.enhancer.nlp.model.AnalysedTextUtils;
 import org.apache.stanbol.enhancer.nlp.model.Chunk;
 import org.apache.stanbol.enhancer.nlp.model.Section;
 import org.apache.stanbol.enhancer.nlp.model.Sentence;
 import org.apache.stanbol.enhancer.nlp.model.Token;
 import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
 import org.apache.stanbol.enhancer.nlp.ner.NerTag;
 import org.apache.stanbol.enhancer.servicesapi.Blob;
 import org.apache.stanbol.enhancer.servicesapi.ContentItem;
 import org.apache.stanbol.enhancer.servicesapi.EngineException;
 import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
 import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
 import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
 import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
 import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 /**
  * Core of the NER EnhancementEngine(s), separated from the OSGi service to make
  * it easier to test this.
  */
 public abstract class NEREngineCore
         extends AbstractEnhancementEngine<IOException,RuntimeException>
         implements EnhancementEngine {
     /** Mime type of the plain text content parts this engine falls back to. */
     protected static final String TEXT_PLAIN_MIMETYPE = "text/plain";
     /**
      * Contains the only supported mimetype {@link #TEXT_PLAIN_MIMETYPE}
      */
     protected static final Set<String> SUPPORTED_MIMETYPES =
             Collections.singleton(TEXT_PLAIN_MIMETYPE);

     /** Logger named after the runtime class of the (sub)class instance. */
     private final Logger log = LoggerFactory.getLogger(getClass());

     /**
      * Provides the name finder models, sentence models and tokenizers used
      * by this class. Set by the constructors or by the subclass.
      */
     protected OpenNLP openNLP;

     /**
      * The engine configuration: processed languages, default model types,
      * language specific models, NER tag mappings and the default language.
      * Set by the constructors or by the subclass.
      */
     protected NEREngineConfig config;


     /** Comments about our models */
     public static final Map<String, String> DATA_FILE_COMMENTS;
     static {
         // NOTE(review): the map is publicly visible and mutable
         DATA_FILE_COMMENTS = new HashMap<String, String>();
         DATA_FILE_COMMENTS.put("Default data files", "provided by the org.apache.stanbol.defaultdata bundle");
     }
     /**
      * Constructor for sub classes that initialise the state themselves.
      * <p>
      * If used sub classes MUST ensure that {@link #openNLP} and {@link #config}
      * are set before calling {@link #canEnhance(ContentItem)} or
      * {@link #computeEnhancements(ContentItem)}, as both methods read those
      * fields without checking them for <code>null</code>.
      */
     protected NEREngineCore(){}

     /**
      * Creates the engine core for an existing {@link OpenNLP} instance.
      * @param openNLP the OpenNLP instance used to load models
      * @param config the NER engine configuration
      * @throws IllegalArgumentException if any of the two parameters is
      * <code>null</code>
      */
     NEREngineCore(OpenNLP openNLP, NEREngineConfig config) throws InvalidFormatException, IOException{
         // the OpenNLP instance is validated first so that it is the one
         // reported when both arguments are missing
         if (null == openNLP) {
             throw new IllegalArgumentException("The parsed OpenNLP instance MUST NOT be NULL!");
         }
         if (null == config) {
             throw new IllegalArgumentException("The parsed NER engine configuration MUST NOT be NULL!");
         }
         this.config = config;
         this.openNLP = openNLP;
     }

     /**
      * Convenience constructor that wraps the parsed {@link DataFileProvider}
      * in a new {@link OpenNLP} instance and delegates to
      * {@link #NEREngineCore(OpenNLP, NEREngineConfig)}.
      * @param dfp the provider handed to the created OpenNLP instance
      * @param config the NER engine configuration
      */
     NEREngineCore(DataFileProvider dfp,NEREngineConfig config) throws InvalidFormatException, IOException {
         this(new OpenNLP(dfp),config);
     }


     /**
      * Extracts named entities from the parsed ContentItem and writes them
      * as text annotations to its metadata.
      * <p>
      * If an {@link AnalysedText} with tokens is present it is used as input.
      * Otherwise the text of the <code>text/plain</code> content part is
      * processed. The default models are only applied if the language is a
      * processed one; models configured specifically for the language are
      * always applied. A failure to load or apply one of the language
      * specific models is logged and does not abort the processing.
      *
      * @param ci the content item to enhance
      * @throws EngineException if reading the text fails
      * ({@link InvalidContentException}) or a checked exception occurs while
      * applying the default models
      * @throws IllegalStateException if no language, no NER model or no
      * plain text content part is available (all of this is expected to be
      * checked by {@link #canEnhance(ContentItem)})
      */
     public void computeEnhancements(ContentItem ci) throws EngineException {
         //first check the language before processing the content (text)
         String language = extractLanguage(ci);
         if(language == null){
             throw new IllegalStateException("Unable to extract Language for "
                 + "ContentItem "+ci.getUri()+": This is also checked in the canEnhance "
                 + "method! -> This indicated an Bug in the implementation of the "
                 + "EnhancementJobManager!");
         }
         if(!isNerModel(language)){
             throw new IllegalStateException("For the language '"+language+"' of ContentItem "+ci.getUri()
                 + " no NER model is configured: This is also checked in the canEnhance "
                 + "method! -> This indicated an Bug in the implementation of the "
                 + "EnhancementJobManager!");
         }
         final AnalysedText at = AnalysedTextUtils.getAnalysedText(ci);
         //validate data in the AnalysedText
         final String text;
         if(at != null && at.getTokens().hasNext()){ //if the AnalysedText is present and tokens are present
             if(log.isDebugEnabled()){
                 log.debug("computeEnhancements from AnalysedText ContentPart of ContentItem {}: text={}",
                     ci.getUri().getUnicodeString(), StringUtils.abbreviate(at.getSpan(), 100));
             }
             text = null; //the AnalysedText is used instead of the plain text
         } else { //no AnalysedText with tokens ...
             //fallback to processing the plain text is still supported
             Entry<IRI,Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
             if(contentPart == null){
                 throw new IllegalStateException("No ContentPart with Mimetype '"
                     + TEXT_PLAIN_MIMETYPE+"' found for ContentItem "+ci.getUri()
                     + ": This is also checked in the canEnhance method! -> This "
                     + "indicated an Bug in the implementation of the "
                     + "EnhancementJobManager!");
             }
             try {
                 text = ContentItemHelper.getText(contentPart.getValue());
             } catch (IOException e) {
                 throw new InvalidContentException(this, ci, e);
             }
             if (text.trim().length() == 0) {
                 // TODO: make the length of the data a field of the ContentItem
                 // interface to be able to filter out empty items in the canEnhance
                 // method
                 // one placeholder per argument (the message used to have a
                 // third, unfilled one and was missing a space)
                 log.warn("ContentPart {} of ContentItem {} does not contain any text "
                         + "to extract knowledge from",
                         contentPart.getKey(), ci.getUri());
                 return;
             }
             if(log.isDebugEnabled()){
                 log.debug("computeEnhancements from ContentPart {} of ContentItem {}: text={}",
                     new Object[]{contentPart.getKey(),ci.getUri().getUnicodeString(),
                                  StringUtils.abbreviate(text, 100)});
             }
         }
         try {
             if(config.isProcessedLangage(language)){
                 for (String defaultModelType : config.getDefaultModelTypes()) {
                     TokenNameFinderModel nameFinderModel = openNLP.getNameModel(defaultModelType, language);
                     if(nameFinderModel == null){
                         log.info("No NER Model for {} and language {} available!",defaultModelType,language);
                     } else {
                         findNamedEntities(ci, at, text, language, nameFinderModel);
                     }
                 }
             } //else do not use default models for languages other than the processed one
             //process for additional models
             for(String additionalModel : config.getSpecificNerModles(language)){
                 TokenNameFinderModel nameFinderModel;
                 try {
                     nameFinderModel = openNLP.getModel(TokenNameFinderModel.class,
                         additionalModel, null);
                     findNamedEntities(ci, at, text, language, nameFinderModel);
                 } catch (IOException e) {
                     log.warn("Unable to load TokenNameFinderModel model for language '"+language
                         + "' (model: "+additionalModel+")",e);
                 } catch (RuntimeException e){
                     //problems with one additional model must not prevent
                     //the processing of the remaining ones
                     log.warn("Error while using TokenNameFinderModel for language '"+language
                         + "' (model: "+additionalModel+")",e);
                 }
             }
         } catch (RuntimeException e) {
             //unchecked exceptions are forwarded unchanged
             throw e;
         } catch (Exception e) {
             //checked exceptions are wrapped
             throw new EngineException(this, ci, e);
         }
     }

     /**
      * Applies the parsed name finder model to the {@link AnalysedText} (if
      * present) or otherwise to the plain text, and writes one text
      * annotation per found occurrence to the metadata of the ContentItem.
      * <p>
      * The first annotation written for a name becomes the target of
      * <code>dc:relation</code> links from later occurrences of the same
      * name, as well as from names contained in an earlier annotated name.
      * The write lock of the ContentItem is held while the metadata are
      * written.
      *
      * @param ci the content item (MUST NOT be <code>null</code>)
      * @param at the analysed text or <code>null</code> to process the text
      * @param text the plain text (only used if <code>at</code> is <code>null</code>)
      * @param lang the language or <code>null</code>/empty if unknown
      * @param nameFinderModel the model used to find the named entities
      * @throws IllegalArgumentException if the content item is <code>null</code>
      */
     protected void findNamedEntities(final ContentItem ci,
                                      final AnalysedText at,
                                      final String text,
                                      final String lang,
                                      final TokenNameFinderModel nameFinderModel) {

         if (ci == null) {
             throw new IllegalArgumentException("Parsed ContentItem MUST NOT be NULL");
         }
         if (at == null && text == null) {
             log.warn("NULL was parsed as AnalysedText AND Text for content item "
                     + ci.getUri() + ". One of the two MUST BE present! -> call ignored");
             return;
         }
         //literals are only language tagged if a language is known
         final Language language;
         if(lang != null && !lang.isEmpty()){
             language = new Language(lang);
         } else {
             language = null;
         }
         if(log.isDebugEnabled()){
             //NOTE: the placeholder for the text was missing in the message
             log.debug("findNamedEntities model={},  language={}, text={}",
                     new Object[]{ nameFinderModel, language,
                                   StringUtils.abbreviate(at != null ? at.getSpan() : text, 100) });
         }
         LiteralFactory literalFactory = LiteralFactory.getInstance();
         Graph g = ci.getMetadata();
         //the extraction itself is done before the write lock is acquired
         Map<String,List<NameOccurrence>> entityNames;
         if(at != null){
             entityNames = extractNameOccurrences(nameFinderModel, at, lang);
         } else {
             entityNames = extractNameOccurrences(nameFinderModel, text,lang);
         }
         //lock the ContentItem while writing the RDF data for found Named Entities
         ci.getLock().writeLock().lock();
         try {
             //name -> first annotation written for that name
             Map<String,IRI> previousAnnotations = new LinkedHashMap<String,IRI>();
             for (Map.Entry<String,List<NameOccurrence>> nameInContext : entityNames.entrySet()) {

                 String name = nameInContext.getKey();
                 List<NameOccurrence> occurrences = nameInContext.getValue();

                 IRI firstOccurrenceAnnotation = null;

                 for (NameOccurrence occurrence : occurrences) {
                     IRI textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
                     g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT,
                         new PlainLiteralImpl(name, language)));
                     g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT,
                         new PlainLiteralImpl(occurrence.context, language)));
                     //type, confidence and position are optional
                     if(occurrence.type != null){
                         g.add(new TripleImpl(textAnnotation, DC_TYPE, occurrence.type));
                     }
                     if(occurrence.confidence != null){
                         g.add(new TripleImpl(textAnnotation, ENHANCER_CONFIDENCE, literalFactory
                                 .createTypedLiteral(occurrence.confidence)));
                     }
                     if (occurrence.start != null && occurrence.end != null) {
                         g.add(new TripleImpl(textAnnotation, ENHANCER_START, literalFactory
                                 .createTypedLiteral(occurrence.start)));
                         g.add(new TripleImpl(textAnnotation, ENHANCER_END, literalFactory
                                 .createTypedLiteral(occurrence.end)));
                     }

                     // add the subsumption relationship among occurrences of the same
                     // name
                     if (firstOccurrenceAnnotation == null) {
                         // check already extracted annotations to find a first most
                         // specific occurrence
                         for (Map.Entry<String,IRI> entry : previousAnnotations.entrySet()) {
                             if (entry.getKey().contains(name)) {
                                 // we have found a most specific previous
                                 // occurrence, use it as subsumption target
                                 firstOccurrenceAnnotation = entry.getValue();
                                 g.add(new TripleImpl(textAnnotation, DC_RELATION, firstOccurrenceAnnotation));
                                 break;
                             }
                         }
                         if (firstOccurrenceAnnotation == null) {
                             // no most specific previous occurrence, I am the first,
                             // most specific occurrence to be later used as a target
                             firstOccurrenceAnnotation = textAnnotation;
                             previousAnnotations.put(name, textAnnotation);
                         }
                     } else {
                         // I am referring to a most specific first occurrence of the
                         // same name
                         g.add(new TripleImpl(textAnnotation, DC_RELATION, firstOccurrenceAnnotation));
                     }
                 }
             }
         } finally {
             ci.getLock().writeLock().unlock();
         }
     }

     /**
      * Extracts the person names of a text that is processed as English.
      * @param text the text
      * @return the extracted names
      * @deprecated use {@link #extractPersonNames(String, String)} instead
      */
     @Deprecated
     public Collection<String> extractPersonNames(String text) {
         final String english = "en";
         return extractPersonNames(text, english);
     }
     /**
      * Extracts the person names by using the "person" model of the parsed
      * language.
      * @param text the text
      * @param lang the language of the text
      * @return the extracted names
      * @throws IllegalStateException if the model is not available
      */
     public Collection<String> extractPersonNames(String text,String lang) {
         final TokenNameFinderModel personModel = getNameModel("person", lang);
         return extractNames(personModel, text);
     }

     /**
      * Extracts the location names of a text that is processed as English.
      * @param text the text
      * @return the extracted names
      * @deprecated use {@link #extractLocationNames(String, String)} instead
      */
     @Deprecated
     public Collection<String> extractLocationNames(String text) {
         final String english = "en";
         return extractLocationNames(text, english);
     }

     /**
      * Extracts the location names by using the "location" model of the
      * parsed language.
      * @param text the text
      * @param lang the language of the text
      * @return the extracted names
      * @throws IllegalStateException if the model is not available
      */
     public Collection<String> extractLocationNames(String text,String lang) {
         final TokenNameFinderModel locationModel = getNameModel("location", lang);
         return extractNames(locationModel, text);
     }

     /**
      * Extracts the organization names of a text that is processed as English.
      * @param text the text
      * @return the extracted names
      * @deprecated use {@link #extractOrganizationNames(String, String)} instead
      */
     @Deprecated
     public Collection<String> extractOrganizationNames(String text) {
         final String english = "en";
         return extractOrganizationNames(text, english);
     }
     /**
      * Extracts the organization names by using the "organization" model of
      * the parsed language.
      * @param text the text
      * @param lang the language of the text
      * @return the extracted names
      * @throws IllegalStateException if the model is not available
      */
     public Collection<String> extractOrganizationNames(String text,String lang) {
         final TokenNameFinderModel organizationModel = getNameModel("organization", lang);
         return extractNames(organizationModel, text);
     }
     /**
      * Extracts the person name occurrences of a text that is processed as
      * English.
      * @param text the text
      * @return the occurrences grouped by the name
      * @deprecated use {@link #extractPersonNameOccurrences(String,String)} instead
      */
     @Deprecated
     public Map<String,List<NameOccurrence>> extractPersonNameOccurrences(String text) {
         final String english = "en";
         return this.extractPersonNameOccurrences(text, english);
     }
     /**
      * Extracts the person name occurrences by using the "person" model of
      * the parsed language.
      * @param text the text
      * @param lang the language of the text
      * @return the occurrences grouped by the name
      * @throws IllegalStateException if the model is not available
      */
     public Map<String,List<NameOccurrence>> extractPersonNameOccurrences(String text, String lang) {
         final TokenNameFinderModel personModel = getNameModel("person", lang);
         return extractNameOccurrences(personModel, text, lang);
     }
     /**
      * Extracts the location name occurrences of a text that is processed
      * as English.
      * @param text the text
      * @return the occurrences grouped by the name
      * @deprecated use {@link #extractLocationNameOccurrences(String,String)} instead
      */
     @Deprecated
     public Map<String,List<NameOccurrence>> extractLocationNameOccurrences(String text) {
         final String english = "en";
         return extractLocationNameOccurrences(text, english);
     }

     /**
      * Extracts the location name occurrences by using the "location" model
      * of the parsed language.
      * @param text the text
      * @param lang the language of the text
      * @return the occurrences grouped by the name
      * @throws IllegalStateException if the model is not available
      */
     public Map<String,List<NameOccurrence>> extractLocationNameOccurrences(String text,String lang) {
         final TokenNameFinderModel locationModel = getNameModel("location", lang);
         return extractNameOccurrences(locationModel, text, lang);
     }

     /**
      * Extracts the organization name occurrences of a text that is
      * processed as English.
      * @param text the text
      * @return the occurrences grouped by the name
      * @deprecated use {@link #extractOrganizationNameOccurrences(String,String)} instead
      */
     @Deprecated
     public Map<String,List<NameOccurrence>> extractOrganizationNameOccurrences(String text) {
         final String english = "en";
         return extractOrganizationNameOccurrences(text, english);
     }
     /**
      * Extracts the organization name occurrences by using the
      * "organization" model of the parsed language.
      * @param text the text
      * @param lang the language of the text
      * @return the occurrences grouped by the name
      * @throws IllegalStateException if the model is not available
      */
     public Map<String,List<NameOccurrence>> extractOrganizationNameOccurrences(String text,String lang) {
         final TokenNameFinderModel organizationModel = getNameModel("organization", lang);
         return extractNameOccurrences(organizationModel, text, lang);
     }

     /**
      * Extracts the distinct names found in the text. The language reported
      * by the model is used as language of the text.
      * @param nameFinderModel the model used to find the names
      * @param text the text
      * @return the keys of the extracted name occurrences
      */
     protected Collection<String> extractNames(TokenNameFinderModel nameFinderModel, String text) {
         final String modelLanguage = nameFinderModel.getLanguage();
         final Map<String,List<NameOccurrence>> occurrencesByName =
                 extractNameOccurrences(nameFinderModel, text, modelLanguage);
         return occurrencesByName.keySet();
     }

     /**
      * Looks up the TokenNameFinderModel for a type and language via
      * {@link #openNLP}. Both a missing model and a failure while loading
      * it are reported as {@link IllegalStateException}.
      * @param type the type of the named finder model
      * @param language the language for the model
      * @return the model (never <code>null</code>)
      * @throws IllegalStateException if the model is not available or could
      * not be loaded
      */
     private TokenNameFinderModel getNameModel(String type,String language) {
         final TokenNameFinderModel model;
         try {
             model = openNLP.getNameModel(type, language);
         } catch (IOException e) {
             //also covers InvalidFormatException as it is an IOException
             throw new IllegalStateException(String.format(
                 "Unable to built Model for extracting %s from '%s' language texts.",
                 type,language),e);
         }
         if (model == null) {
             throw new IllegalStateException(String.format(
                 "Unable to built Model for extracting %s from '%s' language " +
                 "texts because the model data could not be loaded.",
                 type,language));
         }
         return model;
     }
     /**
      * Loads the {@link SentenceModel} for the parsed language or
      * English as fallback if one for the language is not available
      * @param language the language of the requested sentence model
      * @return the model for the language or the English one (never
      * <code>null</code>)
      * @throws IllegalStateException if neither the model for the language
      * nor the English one is available, or if loading a model fails
      */
     private SentenceModel getSentenceModel(String language) {
         try {
             SentenceModel model = openNLP.getSentenceModel(language);
             if(model != null){
                 return model;
             }
             //fallback to english
             //NOTE: the language was not parsed as argument for the placeholder
             log.info("No sentence detection model for {}. fallback to English", language);
             model = openNLP.getSentenceModel("en");
             if(model == null){
                 throw new IllegalStateException(String.format(
                     "Unable to built Model for extracting sentences neither for '%s' " +
                     "nor the fallback language 'en'.",
                     language));
             }
             return model;
         } catch (IOException e) {
             //also covers InvalidFormatException as it is an IOException
             throw new IllegalStateException(String.format(
                 "Unable to built Model for extracting sentences from '%s' language texts.",
                 language),e);
         }
     }
     /**
      * This method extracts NamedEntity occurrences by using existing {@link Token}s and
      * {@link Sentence}s in the parsed {@link AnalysedText}. If no sentences
      * are present the whole text is processed as a single section.
      * <p>
      * Every found occurrence is also added as {@link Chunk} with an
      * {@link NlpAnnotations#NER_ANNOTATION} to the AnalysedText.
      * @param nameFinderModel the model used to find NamedEntities
      * @param at the Analysed Text
      * @param language the language of the text (not used by this method)
      * @return the found named Entity Occurrences grouped by the name
      */
     protected Map<String,List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel,
         AnalysedText at, String language) {
         NameFinderME finder = new NameFinderME(nameFinderModel);
         Map<String,List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String,List<NameOccurrence>>();
         List<Section> sentences = new ArrayList<Section>();
         AnalysedTextUtils.appandToList(at.getSentences(), sentences);
         if(sentences.isEmpty()){ //no sentence annotations
             sentences.add(at); //process as a single section
         }
         final String fullText = at.getSpan();
         final int lastSentence = sentences.size() - 1;
         for (int i = 0; i <= lastSentence; i++) {
             // the context spans from the start of the previous to the end
             // of the next sentence (clamped at the first/last sentence)
             String context = fullText.substring(
                 sentences.get(Math.max(0, i-1)).getStart(),
                 sentences.get(Math.min(lastSentence, i+1)).getEnd());

             // get the tokens, words of the current sentence
             List<Token> tokens = new ArrayList<Token>(32);
             List<String> words = new ArrayList<String>(32);
             for(Iterator<Token> it = sentences.get(i).getTokens(); it.hasNext();){
                 Token t = it.next();
                 tokens.add(t);
                 words.add(t.getSpan());
             }
             Span[] nameSpans = finder.find(words.toArray(new String[words.size()]));
             double[] probs = finder.probs();
             for (Span nameSpan : nameSpans) {
                 //name spans are token indexes with an exclusive end
                 Token firstToken = tokens.get(nameSpan.getStart());
                 Token lastToken = tokens.get(nameSpan.getEnd()-1);
                 String name = fullText.substring(firstToken.getStart(), lastToken.getEnd());
                 //the confidence is the product of the token probabilities
                 Double confidence = 1.0;
                 for (int k = nameSpan.getStart(); k < nameSpan.getEnd(); k++) {
                     confidence *= probs[k];
                 }
                 int start = firstToken.getStart();
                 int end = start + name.length();
                 NerTag nerTag = config.getNerTag(nameSpan.getType());
                 //create the occurrence for writing fise:TextAnnotations
                 NameOccurrence occurrence = new NameOccurrence(name, start, end, nerTag.getType(),
                     context, confidence);
                 List<NameOccurrence> occurrences = nameOccurrences.get(name);
                 if (occurrences == null) {
                     occurrences = new ArrayList<NameOccurrence>();
                     nameOccurrences.put(name, occurrences);
                 }
                 occurrences.add(occurrence);
                 //add also the NerAnnotation to the AnalysedText
                 Chunk chunk = at.addChunk(start, end);
                 //TODO: build AnnotationModel based on the configured Mappings
                 chunk.addAnnotation(NER_ANNOTATION, Value.value(nerTag, confidence));
             }
         }
         finder.clearAdaptiveData();
         log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences);
         return nameOccurrences;
     }

     /**
      * Extracts NamedEntity occurrences from plain text. Sentences are
      * detected and tokenized by OpenNLP before the name finder is applied
      * to each sentence.
      * <p>
      * The reported start/end offsets refer to the parsed text. The context
      * of an occurrence consists of the previous, the current and the next
      * sentence (trimmed and joined by a single space).
      * @param nameFinderModel the model used to find NamedEntities
      * @param text the text (MUST NOT be <code>null</code>)
      * @param language the language used to select the tokenizer
      * @return the found named Entity Occurrences grouped by the name
      * @throws IllegalStateException if no sentence model is available
      */
     protected Map<String,List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel, String text, String language) {
         // version with explicit sentence endings to reflect heading / paragraph
         // structure of an HTML or PDF document converted to text
         // NOTE: both the replacement below and the clean-up keep the length
         // of the text, so the sentence spans are also valid for 'text'
         String textWithDots = text.replaceAll("\\n\\n", ".\n");
         text = removeNonUtf8CompliantCharacters(text);

         // NOTE(review): the sentence model is always requested for "en"
         // although a language is parsed - confirm whether this is intended
         SentenceDetectorME sentenceDetector = new SentenceDetectorME(getSentenceModel("en"));

         Span[] sentenceSpans = sentenceDetector.sentPosDetect(textWithDots);

         NameFinderME finder = new NameFinderME(nameFinderModel);
         Tokenizer tokenizer = openNLP.getTokenizer(language);
         Map<String,List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String,List<NameOccurrence>>();
         for (int i = 0; i < sentenceSpans.length; i++) {
             String coveredText = sentenceSpans[i].getCoveredText(text).toString();
             // trim() removes leading chars <= ' '. Count them so that token
             // offsets (relative to the trimmed sentence) can be mapped back
             // to offsets in the text
             int leadingTrimmed = 0;
             while (leadingTrimmed < coveredText.length()
                     && coveredText.charAt(leadingTrimmed) <= ' ') {
                 leadingTrimmed++;
             }
             String sentence = coveredText.trim();

             // build a context by concatenating three sentences to be used for
             // similarity ranking / disambiguation + contextual snippet in the
             // extraction structure
             List<String> contextElements = new ArrayList<String>();
             if (i > 0) {
                 CharSequence previousSentence = sentenceSpans[i - 1].getCoveredText(text);
                 contextElements.add(previousSentence.toString().trim());
             }
             contextElements.add(sentence);
             if (i + 1 < sentenceSpans.length) {
                 CharSequence nextSentence = sentenceSpans[i + 1].getCoveredText(text);
                 contextElements.add(nextSentence.toString().trim());
             }
             String context = StringUtils.join(contextElements, " ");

             // extract the names in the current sentence and
             // store them with the current context
             Span[] tokenSpans = tokenizer.tokenizePos(sentence);
             String[] tokens = Span.spansToStrings(tokenSpans, sentence);
             Span[] nameSpans = finder.find(tokens);
             double[] probs = finder.probs();
             for (int j = 0; j < nameSpans.length; j++) {
                 String name = sentence.substring(tokenSpans[nameSpans[j].getStart()].getStart(),
                     tokenSpans[nameSpans[j].getEnd()-1].getEnd());
                 //NOTE: With OpenNLP 1.6 the probability is now stored in the span
                 double prob = nameSpans[j].getProb();
                 Double confidence;
                 if(prob == 0.0){ //prob == 0.0 := unspecified
                     //fall back to the product of the token probabilities.
                     //The product MUST start with 1.0 (multiplying into the
                     //unspecified 0.0 would always result in 0.0)
                     double product = 1.0;
                     for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
                         product *= probs[k];
                     }
                     confidence = Double.valueOf(product);
                 } else if(prob < 0.5d){
                     //It looks like as if perceptron based models do return
                     //invalid probabilities. As it is expected the Named Entities
                     //with a probability < 50% are not even returned by finder.find(..)
                     //we will just ignore confidence values < 0.5 here
                     confidence = null;
                 } else {
                     confidence = Double.valueOf(prob);
                 }
                 int start = tokenSpans[nameSpans[j].getStart()].getStart();
                 int absoluteStart = sentenceSpans[i].getStart() + leadingTrimmed + start;
                 int absoluteEnd = absoluteStart + name.length();
                 NerTag nerTag = config.getNerTag(nameSpans[j].getType());
                 NameOccurrence occurrence = new NameOccurrence(name, absoluteStart, absoluteEnd,
                     nerTag.getType(),context, confidence);

                 List<NameOccurrence> occurrences = nameOccurrences.get(name);
                 if (occurrences == null) {
                     occurrences = new ArrayList<NameOccurrence>();
                     nameOccurrences.put(name, occurrences);
                 }
                 occurrences.add(occurrence);
             }
         }
         finder.clearAdaptiveData();
         log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences);
         return nameOccurrences;
     }

     /**
      * Checks if the parsed ContentItem can be processed: a content part
      * with one of the {@link #SUPPORTED_MIMETYPES} needs to be present and
      * a NER model needs to be configured for the language of the item.
      * @param ci the content item
      * @return <code>ENHANCE_ASYNC</code> if both conditions are met,
      * otherwise <code>CANNOT_ENHANCE</code>
      */
     public int canEnhance(ContentItem ci) {
         if (ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES) == null) {
             //no plain text content part; the language is not looked up
             return CANNOT_ENHANCE;
         }
         return isNerModel(extractLanguage(ci)) ? ENHANCE_ASYNC : CANNOT_ENHANCE;
     }

     /**
      * Replaces characters that are not allowed in XML (typically control
      * characters and unpaired surrogates) by a space so as to avoid polluting the
      * annotation graph with snippets that are not serializable as XML.
      * <p>
      * The length of the text is not changed, so character offsets stay
      * valid. Valid surrogate pairs (code points &gt;= 0x10000) are kept.
      * @param text the text or <code>null</code>
      * @return <code>null</code> if <code>null</code> was parsed, the parsed
      * instance if nothing needed to be replaced, otherwise the cleaned copy
      */
     protected static String removeNonUtf8CompliantCharacters(final String text) {
         if (null == text) {
             return null;
         }
         StringBuilder sb = null; //initialised on the first replacement
         int i = 0;
         while (i < text.length()) {
             final int ch = text.codePointAt(i);
             // remove any characters outside the valid UTF-8 range as well as all control characters
             // except tabs and new lines
             //NOTE: rewesten (2012-11-21) replaced the original check with the one
             // found at http://blog.mark-mclaren.info/2007/02/invalid-xml-characters-when-valid-utf8_5873.html
             final boolean valid = (ch == 0x9) ||
                     (ch == 0xA) ||
                     (ch == 0xD) ||
                     ((ch >= 0x20) && (ch <= 0xD7FF)) ||
                     ((ch >= 0xE000) && (ch <= 0xFFFD)) ||
                     ((ch >= 0x10000) && (ch <= 0x10FFFF));
             if (!valid) {
                 //all invalid code points are < 0x10000 and therefore a
                 //single char, so replacing the char at i is sufficient
                 if(sb == null){
                     sb = new StringBuilder(text);
                 }
                 sb.setCharAt(i, ' ');
             }
             //step over both chars of a surrogate pair. Stepping by one char
             //would re-read the low surrogate on its own and replace it
             i += Character.charCount(ch);
         }
         return sb == null ? text : sb.toString();
     }

     /**
      * Extracts the language of the parsed ContentItem by using
      * {@link EnhancementEngineHelper#getLanguage(ContentItem)} and the
      * default language of the {@link #config} as fallback
      * @param ci the content item
      * @return the language of the content item or the configured default
      * language if none could be extracted
      */
     private String extractLanguage(ContentItem ci) {
         String lang = EnhancementEngineHelper.getLanguage(ci);
         if(lang != null){
             return lang;
         }
         String defaultLanguage = config.getDefaultLanguage();
         //NOTE: SLF4J uses '{}' as placeholder ('%s' is not replaced)
         log.info("Unable to extract language for ContentItem {}!",ci.getUri().getUnicodeString());
         log.info(" ... return '{}' as default",defaultLanguage);
         return defaultLanguage;
     }
     /**
      * Checks if the {@link #config} provides a NER model for the parsed
      * language. This is the case if the language is a processed one and at
      * least one default model type is configured, OR if at least one
      * specific NER model is configured for the language.
      * @param lang The language to check
      * @return if there is any NER model configured for the parsed language
      */
     public boolean isNerModel(String lang){
         if (config.isProcessedLangage(lang) && !config.getDefaultModelTypes().isEmpty()) {
             return true;
         }
         //no default model applies: look for language specific ones
         return !config.getSpecificNerModles(lang).isEmpty();
     }
 }