| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.stanbol.enhancer.engines.entitylinking.engine; |
| |
| import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText; |
| import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage; |
| import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_CONTRIBUTOR; |
| import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END; |
| import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE; |
| import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START; |
| import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE; |
| import static org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.ENHANCER_TEXTANNOTATION; |
| |
| import java.util.ArrayList; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.HashSet; |
| import java.util.Iterator; |
| import java.util.Map; |
| import java.util.Map.Entry; |
| import java.util.Set; |
| |
| import org.apache.clerezza.commons.rdf.Language; |
| import org.apache.clerezza.commons.rdf.Literal; |
| import org.apache.clerezza.rdf.core.LiteralFactory; |
| import org.apache.clerezza.commons.rdf.Graph; |
| import org.apache.clerezza.commons.rdf.RDFTerm; |
| import org.apache.clerezza.commons.rdf.Triple; |
| import org.apache.clerezza.commons.rdf.IRI; |
| import org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl; |
| import org.apache.clerezza.commons.rdf.impl.utils.TripleImpl; |
| import org.apache.clerezza.commons.rdf.impl.utils.TypedLiteralImpl; |
| import org.apache.commons.lang.StringUtils; |
| import org.apache.felix.scr.annotations.ReferenceCardinality; |
| import org.apache.felix.scr.annotations.ReferencePolicy; |
| import org.apache.felix.scr.annotations.ReferenceStrategy; |
| import org.apache.stanbol.commons.stanboltools.offline.OfflineMode; |
| import org.apache.stanbol.enhancer.engines.entitylinking.Entity; |
| import org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcher; |
| import org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcherException; |
| import org.apache.stanbol.enhancer.engines.entitylinking.LabelTokenizer; |
| import org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig; |
| import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig; |
| import org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig; |
| import org.apache.stanbol.enhancer.engines.entitylinking.impl.EntityLinker; |
| import org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity; |
| import org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity.Occurrence; |
| import org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion; |
| import org.apache.stanbol.enhancer.nlp.model.AnalysedText; |
| import org.apache.stanbol.enhancer.servicesapi.ContentItem; |
| import org.apache.stanbol.enhancer.servicesapi.EngineException; |
| import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine; |
| import org.apache.stanbol.enhancer.servicesapi.ServiceProperties; |
| import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper; |
| import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum; |
| import org.apache.stanbol.enhancer.servicesapi.rdf.Properties; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| /** |
| * Engine that consumes NLP processing results from the {@link AnalysedText} |
| * content part of processed {@link ContentItem}s and links them with |
| * Entities as provided by the configured {@link EntitySearcher} instance. |
| * @author Rupert Westenthaler |
| * |
| */ |
| public class EntityLinkingEngine implements EnhancementEngine, ServiceProperties { |
| |
| private final Logger log = LoggerFactory.getLogger(EntityLinkingEngine.class); |
| /** |
| * This is used to check the content type of parsed {@link ContentItem}s for |
| * plain text |
| */ |
| protected static final String TEXT_PLAIN_MIMETYPE = "text/plain"; |
| /** |
| * Contains the only supported mime type {@link #TEXT_PLAIN_MIMETYPE} |
| */ |
| protected static final Set<String> SUPPORTED_MIMETYPES = Collections.singleton(TEXT_PLAIN_MIMETYPE); |
| /** |
| * The default value for the Execution of this Engine. |
| * This Engine creates TextAnnotations that should not be processed by other Engines. |
| * Therefore it uses a lower rank than {@link ServiceProperties#ORDERING_DEFAULT} |
| * to ensure that other engines do not get confused |
| */ |
| public static final Integer DEFAULT_ORDER = ServiceProperties.ORDERING_DEFAULT - 10; |
| |
| private static final IRI XSD_DOUBLE = new IRI("http://www.w3.org/2001/XMLSchema#double"); |
| |
| private static final IRI ENHANCER_ENTITY_RANKING = new IRI(NamespaceEnum.fise + "entity-ranking"); |
| |
| /** |
| * The name of this engine |
| */ |
| protected final String name; |
| /** |
| * The entitySearcher used for linking |
| */ |
| protected final EntitySearcher entitySearcher; |
| /** |
| * configuration for entity linking |
| */ |
| protected final EntityLinkerConfig linkerConfig; |
| /** |
| * The label tokenizer |
| */ |
| protected LabelTokenizer labelTokenizer; |
| |
| /** |
| * The text processing configuration |
| */ |
| protected final TextProcessingConfig textProcessingConfig; |
| /** |
| * The literalFactory used to create typed literals |
| */ |
| private LiteralFactory literalFactory = LiteralFactory.getInstance(); |
| |
| /** |
| * The {@link OfflineMode} is used by Stanbol to indicate that no external service should be referenced. |
| * For this engine that means it is necessary to check if the used {@link ReferencedSite} can operate |
| * offline or not. |
| * |
| * @see #enableOfflineMode(OfflineMode) |
| * @see #disableOfflineMode(OfflineMode) |
| */ |
| @org.apache.felix.scr.annotations.Reference( |
| cardinality = ReferenceCardinality.OPTIONAL_UNARY, |
| policy = ReferencePolicy.DYNAMIC, |
| bind = "enableOfflineMode", |
| unbind = "disableOfflineMode", |
| strategy = ReferenceStrategy.EVENT) |
| private OfflineMode offlineMode; |
| |
| /** |
| * Called by the ConfigurationAdmin to bind the {@link #offlineMode} if the service becomes available |
| * |
| * @param mode |
| */ |
| protected final void enableOfflineMode(OfflineMode mode) { |
| this.offlineMode = mode; |
| } |
| |
| /** |
| * Called by the ConfigurationAdmin to unbind the {@link #offlineMode} if the service becomes unavailable |
| * |
| * @param mode |
| */ |
| protected final void disableOfflineMode(OfflineMode mode) { |
| this.offlineMode = null; |
| } |
| |
| /** |
| * Returns <code>true</code> only if Stanbol operates in {@link OfflineMode}. |
| * |
| * @return the offline state |
| */ |
| protected final boolean isOfflineMode() { |
| return offlineMode != null; |
| } |
| |
| /** |
| * Internal Constructor used by {@link #createInstance(EntitySearcher, LanguageProcessingConfig, EntityLinkerConfig)} |
| * @param entitySearcher The component used to lookup Entities |
| * @param textProcessingConfig The configuration on how to use the {@link AnalysedText} content part of |
| * processed {@link ContentItem}s |
| * @param linkingConfig the configuration for the EntityLinker |
| */ |
| public EntityLinkingEngine(String name, EntitySearcher entitySearcher,TextProcessingConfig textProcessingConfig, |
| EntityLinkerConfig linkingConfig, LabelTokenizer labelTokenizer){ |
| if(name == null || name.isEmpty()){ |
| throw new IllegalArgumentException("The parsed EnhancementEngine name MUST NOT be NULL!"); |
| } |
| this.name = name; |
| this.linkerConfig = linkingConfig != null ? linkingConfig : new EntityLinkerConfig(); |
| this.textProcessingConfig = textProcessingConfig; |
| this.entitySearcher = entitySearcher; |
| this.labelTokenizer = labelTokenizer; |
| } |
| /** |
| * Getter for the {@link LabelTokenizer} used by this Engine |
| * @return the labelTokenizer |
| */ |
| public final LabelTokenizer getLabelTokenizer() { |
| return labelTokenizer; |
| } |
| |
| /** |
| * Setter for the {@link LabelTokenizer} used by this Engine |
| * @param labelTokenizer the labelTokenizer to set |
| */ |
| public final void setLabelTokenizer(LabelTokenizer labelTokenizer) { |
| this.labelTokenizer = labelTokenizer; |
| } |
| @Override |
| public Map<String,Object> getServiceProperties() { |
| return Collections.unmodifiableMap(Collections.singletonMap( |
| ENHANCEMENT_ENGINE_ORDERING, |
| (Object) DEFAULT_ORDER)); |
| } |
| |
| @Override |
| public String getName() { |
| return name; |
| } |
| |
| @Override |
| public int canEnhance(ContentItem ci) throws EngineException { |
| log.trace("canEnhancer {}",ci.getUri()); |
| if(isOfflineMode() && !entitySearcher.supportsOfflineMode()){ |
| log.warn("{} '{}' is inactive because EntitySearcher does not support Offline mode!", |
| getClass().getSimpleName(),getName()); |
| return CANNOT_ENHANCE; |
| } |
| String language = getLanguage(this, ci, false); |
| if(language == null || textProcessingConfig.getConfiguration(language) == null){ |
| log.debug("Engine {} ignores ContentItem {} becuase language {} is not condigured.", |
| new Object[]{ getName(), ci.getUri(), language}); |
| return CANNOT_ENHANCE; |
| } |
| //we need a detected language, the AnalyzedText contentPart with |
| //Tokens. |
| AnalysedText at = getAnalysedText(this, ci, false); |
| return at != null && at.getTokens().hasNext() ? |
| ENHANCE_ASYNC : CANNOT_ENHANCE; |
| } |
| |
| @Override |
| public void computeEnhancements(ContentItem ci) throws EngineException { |
| log.trace(" enhance ci {}",ci.getUri()); |
| if(isOfflineMode() && !entitySearcher.supportsOfflineMode()){ |
| throw new EngineException(this,ci,"Offline mode is not supported by the used EntitySearcher!",null); |
| } |
| AnalysedText at = getAnalysedText(this, ci, true); |
| log.debug(" > AnalysedText {}",at); |
| String language = getLanguage(this, ci, true); |
| if(log.isDebugEnabled()){ |
| log.debug("computeEnhancements for ContentItem {} language {} text={}", |
| new Object []{ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(at.getSpan(), 100)}); |
| } |
| log.debug(" > Language {}",language); |
| LanguageProcessingConfig languageConfig = textProcessingConfig.getConfiguration(language); |
| if(languageConfig == null){ |
| throw new IllegalStateException("The language '"+language+"' is not configured " |
| + "to be processed by this Engine. As this is already checked within the " |
| + "canEnhance(..) method this may indicate an bug in the used " |
| + "EnhanceemntJobManager implementation!"); |
| } |
| EntityLinker entityLinker = new EntityLinker(at,language, |
| languageConfig, entitySearcher, linkerConfig, labelTokenizer); |
| //process |
| try { |
| entityLinker.process(); |
| } catch (EntitySearcherException e) { |
| log.error("Unable to link Entities with "+entityLinker,e); |
| throw new EngineException(this, ci, "Unable to link Entities with "+entityLinker, e); |
| } |
| if(log.isInfoEnabled()){ |
| entityLinker.logStatistics(log); |
| } |
| //write results (requires a write lock) |
| ci.getLock().writeLock().lock(); |
| try { |
| writeEnhancements(ci, entityLinker.getLinkedEntities().values(), language, |
| linkerConfig.isWriteEntityRankings()); |
| } finally { |
| ci.getLock().writeLock().unlock(); |
| } |
| } |
| |
| /** |
| * Writes the Enhancements for the {@link LinkedEntity LinkedEntities} |
| * extracted from the parsed ContentItem |
| * @param ci |
| * @param linkedEntities |
| * @param language |
| */ |
| private void writeEnhancements(ContentItem ci, Collection<LinkedEntity> linkedEntities, |
| String language, boolean writeRankings) { |
| Language languageObject = null; |
| if(language != null && !language.isEmpty()){ |
| languageObject = new Language(language); |
| } |
| Set<IRI> dereferencedEntitis = new HashSet<IRI>(); |
| |
| Graph metadata = ci.getMetadata(); |
| for(LinkedEntity linkedEntity : linkedEntities){ |
| Collection<IRI> textAnnotations = new ArrayList<IRI>(linkedEntity.getOccurrences().size()); |
| //first create the TextAnnotations for the Occurrences |
| for(Occurrence occurrence : linkedEntity.getOccurrences()){ |
| Literal startLiteral = literalFactory.createTypedLiteral(occurrence.getStart()); |
| Literal endLiteral = literalFactory.createTypedLiteral(occurrence.getEnd()); |
| //search for existing text annotation |
| Iterator<Triple> it = metadata.filter(null, ENHANCER_START, startLiteral); |
| IRI textAnnotation = null; |
| while(it.hasNext()){ |
| Triple t = it.next(); |
| if(metadata.filter(t.getSubject(), ENHANCER_END, endLiteral).hasNext() && |
| metadata.filter(t.getSubject(), RDF_TYPE, ENHANCER_TEXTANNOTATION).hasNext()){ |
| textAnnotation = (IRI)t.getSubject(); |
| break; |
| } |
| } |
| if(textAnnotation == null){ //not found ... create a new one |
| textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this); |
| metadata.add(new TripleImpl(textAnnotation, |
| Properties.ENHANCER_START, |
| startLiteral)); |
| metadata.add(new TripleImpl(textAnnotation, |
| Properties.ENHANCER_END, |
| endLiteral)); |
| metadata.add(new TripleImpl(textAnnotation, |
| Properties.ENHANCER_SELECTION_CONTEXT, |
| new PlainLiteralImpl(occurrence.getContext(),languageObject))); |
| metadata.add(new TripleImpl(textAnnotation, |
| Properties.ENHANCER_SELECTED_TEXT, |
| new PlainLiteralImpl(occurrence.getSelectedText(),languageObject))); |
| metadata.add(new TripleImpl(textAnnotation, |
| Properties.ENHANCER_CONFIDENCE, |
| literalFactory.createTypedLiteral(linkedEntity.getScore()))); |
| } else { //if existing add this engine as contributor |
| metadata.add(new TripleImpl(textAnnotation, DC_CONTRIBUTOR, |
| new PlainLiteralImpl(this.getClass().getName()))); |
| } |
| //add dc:types (even to existing) |
| for(IRI dcType : linkedEntity.getTypes()){ |
| metadata.add(new TripleImpl( |
| textAnnotation, Properties.DC_TYPE, dcType)); |
| } |
| textAnnotations.add(textAnnotation); |
| } |
| //now the EntityAnnotations for the Suggestions |
| for(Suggestion suggestion : linkedEntity.getSuggestions()){ |
| IRI entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(ci, this); |
| //should we use the label used for the match, or search the |
| //representation for the best label ... currently its the matched one |
| Literal label = suggestion.getBestLabel(linkerConfig.getNameField(),language); |
| Entity entity = suggestion.getEntity(); |
| metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_ENTITY_LABEL, label)); |
| metadata.add(new TripleImpl(entityAnnotation,ENHANCER_ENTITY_REFERENCE, entity.getUri())); |
| Iterator<IRI> suggestionTypes = entity.getReferences(linkerConfig.getTypeField()); |
| while(suggestionTypes.hasNext()){ |
| metadata.add(new TripleImpl(entityAnnotation, |
| Properties.ENHANCER_ENTITY_TYPE, suggestionTypes.next())); |
| } |
| metadata.add(new TripleImpl(entityAnnotation, |
| Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(suggestion.getScore()))); |
| for(IRI textAnnotation : textAnnotations){ |
| metadata.add(new TripleImpl(entityAnnotation, Properties.DC_RELATION, textAnnotation)); |
| } |
| //add origin information of the EntiySearcher |
| for(Entry<IRI,Collection<RDFTerm>> originInfo : entitySearcher.getOriginInformation().entrySet()){ |
| for(RDFTerm value : originInfo.getValue()){ |
| metadata.add(new TripleImpl(entityAnnotation, |
| originInfo.getKey(),value)); |
| } |
| } |
| if(writeRankings){ |
| Float ranking = suggestion.getEntity().getEntityRanking(); |
| if(ranking != null){ |
| metadata.add(new TripleImpl(entityAnnotation, |
| ENHANCER_ENTITY_RANKING, |
| //write the float as double |
| new TypedLiteralImpl(ranking.toString(), XSD_DOUBLE))); |
| } |
| } |
| //in case dereferencing of Entities is enabled we need also to |
| //add the RDF data for entities |
| if(linkerConfig.isDereferenceEntitiesEnabled() && |
| dereferencedEntitis.add(entity.getUri())){ //not yet dereferenced |
| //add all outgoing triples for this entity |
| //NOTE: do not add all triples as there might be other data in the graph |
| for(Iterator<Triple> triples = entity.getData().filter(entity.getUri(), null, null); |
| triples.hasNext();metadata.add(triples.next())); |
| } |
| } |
| } |
| } |
| |
| } |