/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.opennlp.impl;
import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.NER_ANNOTATION;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTION_CONTEXT;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.Span;
import org.apache.clerezza.commons.rdf.Language;
import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl;
import org.apache.clerezza.commons.rdf.impl.utils.TripleImpl;
import org.apache.commons.lang.StringUtils;
import org.apache.stanbol.commons.opennlp.OpenNLP;
import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider;
import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.AnalysedTextUtils;
import org.apache.stanbol.enhancer.nlp.model.Chunk;
import org.apache.stanbol.enhancer.nlp.model.Section;
import org.apache.stanbol.enhancer.nlp.model.Sentence;
import org.apache.stanbol.enhancer.nlp.model.Token;
import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
import org.apache.stanbol.enhancer.nlp.ner.NerTag;
import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Core of the NER EnhancementEngine(s), separated from the OSGi service to
* make it easier to test.
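* <p>
* A minimal usage sketch (hedged: {@code dfp} stands for an available
* {@link DataFileProvider}, {@code config} for a configured
* {@link NEREngineConfig} and {@code contentItem} for a parsed ContentItem;
* exception handling is omitted):
* <pre>{@code
* NEREngineCore engine = new NEREngineCore(dfp, config) { };
* if (engine.canEnhance(contentItem) != CANNOT_ENHANCE) {
*     engine.computeEnhancements(contentItem);
* }
* }</pre>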
*/
public abstract class NEREngineCore
extends AbstractEnhancementEngine<IOException,RuntimeException>
implements EnhancementEngine {
protected static final String TEXT_PLAIN_MIMETYPE = "text/plain";
/**
* Contains the only supported mimetype {@link #TEXT_PLAIN_MIMETYPE}
*/
protected static final Set<String> SUPPORTED_MIMETYPES =
Collections.singleton(TEXT_PLAIN_MIMETYPE);
private final Logger log = LoggerFactory.getLogger(getClass());
protected OpenNLP openNLP;
protected NEREngineConfig config;
/** Comments about our models */
public static final Map<String, String> DATA_FILE_COMMENTS;
static {
DATA_FILE_COMMENTS = new HashMap<String, String>();
DATA_FILE_COMMENTS.put("Default data files", "provided by the org.apache.stanbol.defaultdata bundle");
}
/**
* If this constructor is used, subclasses MUST ensure that {@link #openNLP}
* and {@link #config} are set before calling {@link #canEnhance(ContentItem)}
* or {@link #computeEnhancements(ContentItem)}.
*/
protected NEREngineCore(){}
NEREngineCore(OpenNLP openNLP, NEREngineConfig config) throws InvalidFormatException, IOException{
if(openNLP == null){
throw new IllegalArgumentException("The parsed OpenNLP instance MUST NOT be NULL!");
}
if(config == null){
throw new IllegalArgumentException("The parsed NER engine configuration MUST NOT be NULL!");
}
this.openNLP = openNLP;
this.config = config;
}
NEREngineCore(DataFileProvider dfp,NEREngineConfig config) throws InvalidFormatException, IOException {
this(new OpenNLP(dfp),config);
}
public void computeEnhancements(ContentItem ci) throws EngineException {
//first check the language before processing the content (text)
String language = extractLanguage(ci);
if(language == null){
throw new IllegalStateException("Unable to extract Language for "
+ "ContentItem "+ci.getUri()+": This is also checked in the canEnhance "
+ "method! -> This indicated an Bug in the implementation of the "
+ "EnhancementJobManager!");
}
if(!isNerModel(language)){
throw new IllegalStateException("For the language '"+language+"' of ContentItem "+ci.getUri()
+ " no NER model is configured: This is also checked in the canEnhance "
+ "method! -> This indicated an Bug in the implementation of the "
+ "EnhancementJobManager!");
}
final AnalysedText at = AnalysedTextUtils.getAnalysedText(ci);
//validate data in the AnalysedText
final String text;
if(at != null && at.getTokens().hasNext()){ //if the AnalysedText is present and tokens are present
if(log.isDebugEnabled()){
log.debug("computeEnhancements from AnalysedText ContentPart of ContentItem {}: text={}",
ci.getUri().getUnicodeString(), StringUtils.abbreviate(at.getSpan(), 100));
}
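//the AnalysedText is used as the source; the plain text is not needed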
text = null;
} else { //no AnalysedText with tokens ...
//fallback to processing the plain text is still supported
Entry<IRI,Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
if(contentPart == null){
throw new IllegalStateException("No ContentPart with Mimetype '"
+ TEXT_PLAIN_MIMETYPE+"' found for ContentItem "+ci.getUri()
+ ": This is also checked in the canEnhance method! -> This "
+ "indicated an Bug in the implementation of the "
+ "EnhancementJobManager!");
}
try {
text = ContentItemHelper.getText(contentPart.getValue());
} catch (IOException e) {
throw new InvalidContentException(this, ci, e);
}
if (text.trim().length() == 0) {
// TODO: make the length of the data a field of the ContentItem
// interface to be able to filter out empty items in the canEnhance
// method
log.warn("ContentPart {} of ContentItem {} does not contain any text" +
"to extract knowledge from in ContentItem {}",
contentPart.getKey(),ci);
return;
}
if(log.isDebugEnabled()){
log.debug("computeEnhancements from ContentPart {} of ContentItem {}: text={}",
new Object[]{contentPart.getKey(),ci.getUri().getUnicodeString(),
StringUtils.abbreviate(text, 100)});
}
}
try {
if(config.isProcessedLangage(language)){
for (String defaultModelType : config.getDefaultModelTypes()) {
TokenNameFinderModel nameFinderModel = openNLP.getNameModel(defaultModelType, language);
if(nameFinderModel == null){
log.info("No NER Model for {} and language {} available!",defaultModelType,language);
} else {
findNamedEntities(ci, at, text, language, nameFinderModel);
}
}
} //else do not use default models for languages other than the processed one
//process the additionally configured models
for(String additionalModel : config.getSpecificNerModles(language)){
TokenNameFinderModel nameFinderModel;
try {
nameFinderModel = openNLP.getModel(TokenNameFinderModel.class,
additionalModel, null);
findNamedEntities(ci, at, text, language, nameFinderModel);
} catch (IOException e) {
log.warn("Unable to load TokenNameFinderModel model for language '"+language
+ "' (model: "+additionalModel+")",e);
} catch (RuntimeException e){
log.warn("Error while processing the TokenNameFinderModel for language '"+language
+ "' (model: "+additionalModel+")",e);
}
}
} catch (Exception e) {
if (e instanceof RuntimeException) {
throw (RuntimeException)e;
} else {
throw new EngineException(this, ci, e);
}
}
}
protected void findNamedEntities(final ContentItem ci,
final AnalysedText at,
final String text,
final String lang,
final TokenNameFinderModel nameFinderModel) {
if (ci == null) {
throw new IllegalArgumentException("Parsed ContentItem MUST NOT be NULL");
}
if (at == null && text == null) {
log.warn("NULL was parsed as AnalysedText AND Text for content item "
+ ci.getUri() + ". One of the two MUST BE present! -> call ignored");
return;
}
final Language language;
if(lang != null && !lang.isEmpty()){
language = new Language(lang);
} else {
language = null;
}
if(log.isDebugEnabled()){
log.debug("findNamedEntities model={}, language={}, text=",
new Object[]{ nameFinderModel, language,
StringUtils.abbreviate(at != null ? at.getSpan() : text, 100) });
}
LiteralFactory literalFactory = LiteralFactory.getInstance();
Graph g = ci.getMetadata();
Map<String,List<NameOccurrence>> entityNames;
if(at != null){
entityNames = extractNameOccurrences(nameFinderModel, at, lang);
} else {
entityNames = extractNameOccurrences(nameFinderModel, text,lang);
}
//lock the ContentItem while writing the RDF data for found Named Entities
ci.getLock().writeLock().lock();
try {
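//maps each name to the TextAnnotation of its first (most specific) occurrence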
Map<String,IRI> previousAnnotations = new LinkedHashMap<String,IRI>();
for (Map.Entry<String,List<NameOccurrence>> nameInContext : entityNames.entrySet()) {
String name = nameInContext.getKey();
List<NameOccurrence> occurrences = nameInContext.getValue();
IRI firstOccurrenceAnnotation = null;
for (NameOccurrence occurrence : occurrences) {
IRI textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT,
new PlainLiteralImpl(name, language)));
g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT,
new PlainLiteralImpl(occurrence.context, language)));
if(occurrence.type != null){
g.add(new TripleImpl(textAnnotation, DC_TYPE, occurrence.type));
}
if(occurrence.confidence != null){
g.add(new TripleImpl(textAnnotation, ENHANCER_CONFIDENCE, literalFactory
.createTypedLiteral(occurrence.confidence)));
}
if (occurrence.start != null && occurrence.end != null) {
g.add(new TripleImpl(textAnnotation, ENHANCER_START, literalFactory
.createTypedLiteral(occurrence.start)));
g.add(new TripleImpl(textAnnotation, ENHANCER_END, literalFactory
.createTypedLiteral(occurrence.end)));
}
// add the subsumption relationship among occurrences of the same
// name
if (firstOccurrenceAnnotation == null) {
// check already extracted annotations to find a first most
// specific occurrence
for (Map.Entry<String,IRI> entry : previousAnnotations.entrySet()) {
if (entry.getKey().contains(name)) {
// we have found a most specific previous
// occurrence, use it as subsumption target
firstOccurrenceAnnotation = entry.getValue();
g.add(new TripleImpl(textAnnotation, DC_RELATION, firstOccurrenceAnnotation));
break;
}
}
if (firstOccurrenceAnnotation == null) {
// no most specific previous occurrence, I am the first,
// most specific occurrence to be later used as a target
firstOccurrenceAnnotation = textAnnotation;
previousAnnotations.put(name, textAnnotation);
}
} else {
// I am referring to a most specific first occurrence of the
// same name
g.add(new TripleImpl(textAnnotation, DC_RELATION, firstOccurrenceAnnotation));
}
}
}
} finally {
ci.getLock().writeLock().unlock();
}
}
@Deprecated
public Collection<String> extractPersonNames(String text) {
return extractPersonNames(text, "en");
}
public Collection<String> extractPersonNames(String text,String lang) {
return extractNames(getNameModel("person",lang),text);
}
@Deprecated
public Collection<String> extractLocationNames(String text) {
return extractLocationNames(text,"en");
}
public Collection<String> extractLocationNames(String text,String lang) {
return extractNames(getNameModel("location",lang), text);
}
@Deprecated
public Collection<String> extractOrganizationNames(String text) {
return extractOrganizationNames(text,"en");
}
public Collection<String> extractOrganizationNames(String text,String lang) {
return extractNames(getNameModel("organization",lang), text);
}
/**
* Extracts the person name occurrences for English language texts.
* @param text the text to process
* @return the extracted name occurrences
* @deprecated use {@link #extractPersonNameOccurrences(String,String)} instead
*/
@Deprecated
public Map<String,List<NameOccurrence>> extractPersonNameOccurrences(String text) {
return this.extractPersonNameOccurrences(text, "en");
}
public Map<String,List<NameOccurrence>> extractPersonNameOccurrences(String text, String lang) {
return extractNameOccurrences(getNameModel("person",lang), text, lang);
}
/**
* Extracts the location name occurrences for English language texts.
* @param text the text to process
* @return the extracted name occurrences
* @deprecated use {@link #extractLocationNameOccurrences(String,String)} instead
*/
@Deprecated
public Map<String,List<NameOccurrence>> extractLocationNameOccurrences(String text) {
return extractLocationNameOccurrences(text, "en");
}
public Map<String,List<NameOccurrence>> extractLocationNameOccurrences(String text,String lang) {
return extractNameOccurrences(getNameModel("location",lang), text,lang);
}
/**
* Extracts the organization name occurrences for English language texts.
* @param text the text to process
* @return the extracted name occurrences
* @deprecated use {@link #extractOrganizationNameOccurrences(String,String)} instead
*/
@Deprecated
public Map<String,List<NameOccurrence>> extractOrganizationNameOccurrences(String text) {
return extractOrganizationNameOccurrences(text,"en");
}
public Map<String,List<NameOccurrence>> extractOrganizationNameOccurrences(String text,String lang) {
return extractNameOccurrences(getNameModel("organization",lang), text,lang);
}
protected Collection<String> extractNames(TokenNameFinderModel nameFinderModel, String text) {
return extractNameOccurrences(nameFinderModel, text, nameFinderModel.getLanguage()).keySet();
}
/**
* Gets/builds a {@link TokenNameFinderModel} by using {@link #openNLP} and throws
* an {@link IllegalStateException} in case the model could not be built or
* the data for the model were not found.
* @param type the type of the name finder model
* @param language the language for the model
* @return the model (never <code>null</code>)
* @throws IllegalStateException if the model is not available
*/
private TokenNameFinderModel getNameModel(String type,String language) {
try {
TokenNameFinderModel model = openNLP.getNameModel(type, language);
if(model != null){
return model;
} else {
throw new IllegalStateException(String.format(
"Unable to build the model for extracting %s from '%s' language " +
"texts because the model data could not be loaded.",
type,language));
}
} catch (InvalidFormatException e) {
throw new IllegalStateException(String.format(
"Unable to build the model for extracting %s from '%s' language texts.",
type,language),e);
} catch (IOException e) {
throw new IllegalStateException(String.format(
"Unable to build the model for extracting %s from '%s' language texts.",
type,language),e);
}
}
/**
* Loads the {@link SentenceModel} for the parsed language or
* English as fallback if one for the language is not available.
* @param language the language
* @return the sentence model (never <code>null</code>)
*/
private SentenceModel getSentenceModel(String language) {
try {
SentenceModel model = openNLP.getSentenceModel(language);
if(model != null){
return model;
} else { //fallback to english
log.info("No sentence detection modle for {}. fallback to English");
model = openNLP.getSentenceModel("en");
if(model == null){
throw new IllegalStateException(String.format(
"Unable to build a model for extracting sentences, neither for '%s' " +
"nor for the fallback language 'en'.",
language));
} else {
return model;
}
}
} catch (InvalidFormatException e) {
throw new IllegalStateException(String.format(
"Unable to build the model for extracting sentences from '%s' language texts.",
language),e);
} catch (IOException e) {
throw new IllegalStateException(String.format(
"Unable to build the model for extracting sentences from '%s' language texts.",
language),e);
}
}
/**
* This method extracts NamedEntity occurrences by using existing {@link Token}s and
* {@link Sentence}s of the parsed {@link AnalysedText}.
* @param nameFinderModel the model used to find NamedEntities
* @param at the AnalysedText
* @param language the language of the text
* @return the found NamedEntity occurrences
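* <p>
* For illustration (the tokens are an assumed example, not output of a
* specific model): {@code finder.find(..)} returns {@link Span}s over
* <em>token indices</em> with an exclusive end:
* <pre>{@code
* Span[] names = finder.find(new String[]{"Barack","Obama","visited","Paris"});
* // a span with getStart()==0 and getEnd()==2 covers the tokens
* // "Barack" and "Obama"; the exclusive end is why the code below uses
* // nameSpans[j].getEnd()-1 to address the last token of a name
* }</pre>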
*/
protected Map<String,List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel,
AnalysedText at, String language) {
// this variant uses the Sentences and Tokens already present in the
// AnalysedText instead of running sentence detection and tokenization
NameFinderME finder = new NameFinderME(nameFinderModel);
Map<String,List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String,List<NameOccurrence>>();
List<Section> sentences = new ArrayList<Section>();
//collect the sentences of the AnalysedText (if any)
AnalysedTextUtils.appandToList(at.getSentences(), sentences);
if(sentences.isEmpty()){ //no sentence annotations
sentences.add(at); //process as a single section
}
for (int i=0;i<sentences.size();i++) {
String sentence = sentences.get(i).getSpan();
// build a context by concatenating three sentences to be used for
// similarity ranking / disambiguation + contextual snippet in the
// extraction structure
List<String> contextElements = new ArrayList<String>();
contextElements.add(sentence);
//three sentences as context
String context = at.getSpan().substring(
sentences.get(Math.max(0, i-1)).getStart(),
sentences.get(Math.min(sentences.size()-1, i+1)).getEnd());
// get the tokens, words of the current sentence
List<Token> tokens = new ArrayList<Token>(32);
List<String> words = new ArrayList<String>(32);
for(Iterator<Token> it =sentences.get(i).getTokens();it.hasNext();){
Token t = it.next();
tokens.add(t);
words.add(t.getSpan());
}
Span[] nameSpans = finder.find(words.toArray(new String[words.size()]));
double[] probs = finder.probs();
for (int j = 0; j < nameSpans.length; j++) {
String name = at.getSpan().substring(tokens.get(nameSpans[j].getStart()).getStart(),
tokens.get(nameSpans[j].getEnd()-1).getEnd());
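//the confidence of a multi-token name is the product of the per-token probabilities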
Double confidence = 1.0;
for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
confidence *= probs[k];
}
int start = tokens.get(nameSpans[j].getStart()).getStart();
int end = start + name.length();
NerTag nerTag = config.getNerTag(nameSpans[j].getType());
//create the occurrence for writing fise:TextAnnotations
NameOccurrence occurrence = new NameOccurrence(name, start, end, nerTag.getType(),
context, confidence);
List<NameOccurrence> occurrences = nameOccurrences.get(name);
if (occurrences == null) {
occurrences = new ArrayList<NameOccurrence>();
}
occurrences.add(occurrence);
nameOccurrences.put(name, occurrences);
//add also the NerAnnotation to the AnalysedText
Chunk chunk = at.addChunk(start, end);
//TODO: build AnnotationModel based on the configured Mappings
chunk.addAnnotation(NER_ANNOTATION, Value.value(nerTag, confidence));
}
}
finder.clearAdaptiveData();
log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences);
return nameOccurrences;
}
protected Map<String,List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel, String text, String language) {
// version with explicit sentence endings to reflect heading / paragraph
// structure of an HTML or PDF document converted to text
String textWithDots = text.replaceAll("\\n\\n", ".\n");
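//NOTE: "\n\n" -> ".\n" replaces two chars with two chars, so character
//offsets detected on textWithDots remain valid for the (cleaned) text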
text = removeNonUtf8CompliantCharacters(text);
SentenceDetectorME sentenceDetector = new SentenceDetectorME(getSentenceModel(language));
Span[] sentenceSpans = sentenceDetector.sentPosDetect(textWithDots);
NameFinderME finder = new NameFinderME(nameFinderModel);
Tokenizer tokenizer = openNLP.getTokenizer(language);
Map<String,List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String,List<NameOccurrence>>();
for (int i = 0; i < sentenceSpans.length; i++) {
String sentence = sentenceSpans[i].getCoveredText(text).toString().trim();
// build a context by concatenating three sentences to be used for
// similarity ranking / disambiguation + contextual snippet in the
// extraction structure
List<String> contextElements = new ArrayList<String>();
if (i > 0) {
CharSequence previousSentence = sentenceSpans[i - 1].getCoveredText(text);
contextElements.add(previousSentence.toString().trim());
}
contextElements.add(sentence.trim());
if (i + 1 < sentenceSpans.length) {
CharSequence nextSentence = sentenceSpans[i + 1].getCoveredText(text);
contextElements.add(nextSentence.toString().trim());
}
String context = StringUtils.join(contextElements, " ");
// extract the names in the current sentence and
// store them with the current context
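//tokenizePos(..) returns character offsets relative to the sentence,
//while finder.find(..) returns spans over token indices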
Span[] tokenSpans = tokenizer.tokenizePos(sentence);
String[] tokens = Span.spansToStrings(tokenSpans, sentence);
Span[] nameSpans = finder.find(tokens);
double[] probs = finder.probs();
for (int j = 0; j < nameSpans.length; j++) {
String name = sentence.substring(tokenSpans[nameSpans[j].getStart()].getStart(),
tokenSpans[nameSpans[j].getEnd()-1].getEnd());
//NOTE: With OpenNLP 1.6 the probability is now stored in the span
double prob = nameSpans[j].getProb();
//prob == 0.0 := unspecified
Double confidence = prob != 0.0 ? Double.valueOf(prob) : null;
if(confidence == null){ //fall back to the old computation if the span provides none
for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
prob *= probs[k];
}
confidence = Double.valueOf(prob);
} else if(confidence < 0.5d){
//It looks as if perceptron based models may return invalid
//probabilities. As Named Entities with a probability < 50% are
//not expected to be returned by finder.find(..) at all, confidence
//values < 0.5 are ignored here.
confidence = null;
}
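//convert the sentence-relative character offsets to absolute offsets in the text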
int start = tokenSpans[nameSpans[j].getStart()].getStart();
int absoluteStart = sentenceSpans[i].getStart() + start;
int absoluteEnd = absoluteStart + name.length();
NerTag nerTag = config.getNerTag(nameSpans[j].getType());
NameOccurrence occurrence = new NameOccurrence(name, absoluteStart, absoluteEnd,
nerTag.getType(),context, confidence);
List<NameOccurrence> occurrences = nameOccurrences.get(name);
if (occurrences == null) {
occurrences = new ArrayList<NameOccurrence>();
}
occurrences.add(occurrence);
nameOccurrences.put(name, occurrences);
}
}
finder.clearAdaptiveData();
log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences);
return nameOccurrences;
}
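/**
* Returns {@link #ENHANCE_ASYNC} if the ContentItem has a "text/plain" Blob
* and a NER model is available for its language.
*/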
public int canEnhance(ContentItem ci) {
if(ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES) != null &&
isNerModel(extractLanguage(ci))){
return ENHANCE_ASYNC;
} else {
return CANNOT_ENHANCE;
}
}
/**
* Removes non UTF-8 compliant characters (typically control characters) so as
* to avoid polluting the annotation graph with snippets that are not
* serializable as XML.
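* <p>
* For illustration: control characters other than tabs and line breaks are
* replaced by spaces, e.g.
* <pre>{@code
* removeNonUtf8CompliantCharacters("a" + (char) 0x0 + "b\tc"); // -> "a b\tc"
* }</pre>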
*/
protected static String removeNonUtf8CompliantCharacters(final String text) {
if (null == text) {
return null;
}
StringBuilder sb = null; //initialised on the first replacement
//iterate over code points (not chars) so that valid supplementary
//characters (surrogate pairs) are not corrupted
for (int i = 0; i < text.length();) {
int ch = text.codePointAt(i);
// remove any characters outside the valid UTF-8 range as well as all control characters
// except tabs and new lines
//NOTE: rewesten (2012-11-21) replaced the original check with the one
// found at http://blog.mark-mclaren.info/2007/02/invalid-xml-characters-when-valid-utf8_5873.html
if (!((ch == 0x9) ||
(ch == 0xA) ||
(ch == 0xD) ||
((ch >= 0x20) && (ch <= 0xD7FF)) ||
((ch >= 0xE000) && (ch <= 0xFFFD)) ||
((ch >= 0x10000) && (ch <= 0x10FFFF)))){
if(sb == null){
sb = new StringBuilder(text);
}
//invalid code points are always a single char (valid surrogate
//pairs decode to code points >= 0x10000 and pass the check above)
sb.setCharAt(i, ' ');
}
i += Character.charCount(ch);
}
return sb == null ? text : sb.toString();
}
/**
* Extracts the language of the parsed ContentItem by using
* {@link EnhancementEngineHelper#getLanguage(ContentItem)} and the
* default language of the {@link NEREngineConfig} as fallback.
* @param ci the content item
* @return the language
*/
private String extractLanguage(ContentItem ci) {
String lang = EnhancementEngineHelper.getLanguage(ci);
if(lang != null){
return lang;
} else {
log.info("Unable to extract language for ContentItem %s!",ci.getUri().getUnicodeString());
log.info(" ... return '{}' as default",config.getDefaultLanguage());
return config.getDefaultLanguage();
}
}
/**
* This method checks if this configuration does have a NER model for the
* parsed language. It checks if the parsed language is a processed language
* (see {@code NEREngineConfig#isProcessedLangage(String)}) and any default
* model type is configured, OR if any specific NER model (see
* {@code NEREngineConfig#getSpecificNerModles(String)}) is configured for the
* parsed language.
* @param lang The language to check
* @return if there is any NER model configured for the parsed language
*/
public boolean isNerModel(String lang){
return (config.isProcessedLangage(lang) && !config.getDefaultModelTypes().isEmpty()) ||
!config.getSpecificNerModles(lang).isEmpty();
}
}