| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.stanbol.enhancer.engines.entitytagging.impl; |
| |
| import static org.apache.commons.lang.StringUtils.getLevenshteinDistance; |
| import static org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses.DBPEDIA_ORGANISATION; |
| import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE; |
| |
| import java.util.ArrayList; |
| import java.util.Collections; |
| import java.util.Dictionary; |
| import java.util.HashMap; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Map.Entry; |
| |
| import org.apache.clerezza.rdf.core.LiteralFactory; |
| import org.apache.clerezza.rdf.core.MGraph; |
| import org.apache.clerezza.rdf.core.NonLiteral; |
| import org.apache.clerezza.rdf.core.Triple; |
| import org.apache.clerezza.rdf.core.UriRef; |
| import org.apache.commons.lang.StringUtils; |
| import org.apache.felix.scr.annotations.Activate; |
| import org.apache.felix.scr.annotations.Component; |
| import org.apache.felix.scr.annotations.ConfigurationPolicy; |
| import org.apache.felix.scr.annotations.Deactivate; |
| import org.apache.felix.scr.annotations.Property; |
| import org.apache.felix.scr.annotations.Reference; |
| import org.apache.felix.scr.annotations.ReferenceCardinality; |
| import org.apache.felix.scr.annotations.ReferencePolicy; |
| import org.apache.felix.scr.annotations.ReferenceStrategy; |
| import org.apache.felix.scr.annotations.Service; |
| import org.apache.stanbol.commons.namespaceprefix.NamespaceMappingUtils; |
| import org.apache.stanbol.commons.namespaceprefix.NamespacePrefixService; |
| import org.apache.stanbol.commons.stanboltools.offline.OfflineMode; |
| import org.apache.stanbol.enhancer.servicesapi.ContentItem; |
| import org.apache.stanbol.enhancer.servicesapi.EngineException; |
| import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine; |
| import org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager; |
| import org.apache.stanbol.enhancer.servicesapi.ServiceProperties; |
| import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper; |
| import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine; |
| import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses; |
| import org.apache.stanbol.enhancer.servicesapi.rdf.Properties; |
| import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses; |
| import org.apache.stanbol.entityhub.model.clerezza.RdfValueFactory; |
| import org.apache.stanbol.entityhub.servicesapi.Entityhub; |
| import org.apache.stanbol.entityhub.servicesapi.EntityhubException; |
| import org.apache.stanbol.entityhub.servicesapi.model.Entity; |
| import org.apache.stanbol.entityhub.servicesapi.model.Representation; |
| import org.apache.stanbol.entityhub.servicesapi.model.Text; |
| import org.apache.stanbol.entityhub.servicesapi.model.rdf.RdfResourceEnum; |
| import org.apache.stanbol.entityhub.servicesapi.query.Constraint; |
| import org.apache.stanbol.entityhub.servicesapi.query.FieldQuery; |
| import org.apache.stanbol.entityhub.servicesapi.query.FieldQueryFactory; |
| import org.apache.stanbol.entityhub.servicesapi.query.QueryResultList; |
| import org.apache.stanbol.entityhub.servicesapi.query.ReferenceConstraint; |
| import org.apache.stanbol.entityhub.servicesapi.query.TextConstraint; |
| import org.apache.stanbol.entityhub.servicesapi.site.Site; |
| import org.apache.stanbol.entityhub.servicesapi.site.SiteException; |
| import org.apache.stanbol.entityhub.servicesapi.site.SiteManager; |
| import org.osgi.framework.Constants; |
| import org.osgi.service.cm.ConfigurationException; |
| import org.osgi.service.component.ComponentContext; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| /** |
| * Engine that uses a {@link Site} to search for entities matching the existing TextAnnotations of a ContentItem. |
| * |
| * @author ogrisel, rwesten |
| */ |
| @Component(configurationFactory = true, |
| policy = ConfigurationPolicy.REQUIRE, // the baseUri is required! |
| specVersion = "1.1", metatype = true, immediate = true, inherit = true) |
| @Service |
| @org.apache.felix.scr.annotations.Properties(value = {@Property(name = EnhancementEngine.PROPERTY_NAME)}) |
| public class NamedEntityTaggingEngine extends AbstractEnhancementEngine<RuntimeException,RuntimeException> |
| implements EnhancementEngine, ServiceProperties { |
| |
| private final Logger log = LoggerFactory.getLogger(getClass()); |
| |
| @Property // (value = "dbpedia") |
| public static final String REFERENCED_SITE_ID = "org.apache.stanbol.enhancer.engines.entitytagging.referencedSiteId"; |
| |
| @Property(boolValue = false) |
| public static final String PERSON_STATE = "org.apache.stanbol.enhancer.engines.entitytagging.personState"; |
| |
| @Property // (value = "dbp-ont:Person") |
| public static final String PERSON_TYPE = "org.apache.stanbol.enhancer.engines.entitytagging.personType"; |
| |
| @Property(boolValue = false) |
| public static final String ORG_STATE = "org.apache.stanbol.enhancer.engines.entitytagging.organisationState"; |
| |
| @Property // (value = "dbp-ont:Organisation") |
| public static final String ORG_TYPE = "org.apache.stanbol.enhancer.engines.entitytagging.organisationType"; |
| |
| @Property(boolValue = false) |
| public static final String PLACE_STATE = "org.apache.stanbol.enhancer.engines.entitytagging.placeState"; |
| |
| @Property // (value = "dbp-ont:Place") |
| public static final String PLACE_TYPE = "org.apache.stanbol.enhancer.engines.entitytagging.placeType"; |
| /** |
| * Use the RDFS label as default |
| */ |
| @Property(value = "rdfs:label") |
| public static final String NAME_FIELD = "org.apache.stanbol.enhancer.engines.entitytagging.nameField"; |
| |
| /** |
| * Use the RDFS label as default |
| */ |
| @Property(boolValue = true) |
| public static final String DEREFERENCE_ENTITIES = "org.apache.stanbol.enhancer.engines.entitytagging.dereference"; |
| |
| @Property(intValue = 0) |
| public static final String SERVICE_RANKING = Constants.SERVICE_RANKING; |
| /** |
| * The default language for labels included in the enhancement metadata (if not available for the parsed |
| * content). |
| */ |
| private static final String DEFAULT_LANGUAGE = "en"; |
| |
| /** |
| * Service of the Entityhub that manages all the active referenced Site. This Service is used to lookup |
| * the configured Referenced Site when we need to enhance a content item. |
| */ |
| @Reference |
| protected SiteManager siteManager; |
| |
| /** |
| * Used to lookup Entities if the {@link #REFERENCED_SITE_ID} property is set to "entityhub" or "local" |
| */ |
| @Reference |
| protected Entityhub entityhub; |
| |
| @Reference(cardinality=ReferenceCardinality.OPTIONAL_UNARY) |
| protected NamespacePrefixService nsPrefixService; |
| |
| /** |
| * This holds the id of the {@link Site} used to lookup Entities or <code>null</code> if the |
| * {@link Entityhub} is used. |
| */ |
| protected String referencedSiteID; |
| |
| /** |
| * The default value for the Execution of this Engine. Currently set to |
| * {@link EnhancementJobManager#DEFAULT_ORDER} |
| */ |
| public static final Integer defaultOrder = ORDERING_EXTRACTION_ENHANCEMENT; |
| |
| /** |
| * State if text annotations of type {@link OntologicalClasses#DBPEDIA_PERSON} are enhanced by this engine |
| */ |
| protected boolean personState; |
| |
| /** |
| * State if text annotations of type {@link OntologicalClasses#DBPEDIA_ORGANISATION} are enhanced by this |
| * engine |
| */ |
| protected boolean orgState; |
| |
| /** |
| * State if text annotations of type {@link OntologicalClasses#DBPEDIA_PLACE} are enhanced by this engine |
| */ |
| protected boolean placeState; |
| |
| /** |
| * The rdf:type constraint used to search for persons or <code>null</code> if no type constraint should be |
| * used |
| */ |
| protected String personType; |
| |
| /** |
| * The rdf:type constraint used to search for organisations or <code>null</code> if no type constraint |
| * should be used |
| */ |
| protected String orgType; |
| |
| /** |
| * The rdf:type constraint used to search for places or <code>null</code> if no type constraint should be |
| * used |
| */ |
| protected String placeType; |
| |
| /** |
| * The field used to search for the selected text of the TextAnnotation. |
| */ |
| protected String nameField; |
| |
| /** |
| * The number of Suggestions to be added |
| */ |
| protected Integer numSuggestions = 3; |
| |
| protected boolean dereferenceEntities = true; |
| |
| /** |
| * The {@link OfflineMode} is used by Stanbol to indicate that no external service should be referenced. |
| * For this engine that means it is necessary to check if the used {@link Site} can operate offline or |
| * not. |
| * |
| * @see #enableOfflineMode(OfflineMode) |
| * @see #disableOfflineMode(OfflineMode) |
| */ |
| @Reference(cardinality = ReferenceCardinality.OPTIONAL_UNARY, policy = ReferencePolicy.DYNAMIC, bind = "enableOfflineMode", unbind = "disableOfflineMode", strategy = ReferenceStrategy.EVENT) |
| private OfflineMode offlineMode; |
| |
| /** |
| * Called by the ConfigurationAdmin to bind the {@link #offlineMode} if the service becomes available |
| * |
| * @param mode |
| */ |
| protected final void enableOfflineMode(OfflineMode mode) { |
| this.offlineMode = mode; |
| } |
| |
| /** |
| * Called by the ConfigurationAdmin to unbind the {@link #offlineMode} if the service becomes unavailable |
| * |
| * @param mode |
| */ |
| protected final void disableOfflineMode(OfflineMode mode) { |
| this.offlineMode = null; |
| } |
| |
| /** |
| * Returns <code>true</code> only if Stanbol operates in {@link OfflineMode} . |
| * |
| * @return the offline state |
| */ |
| protected final boolean isOfflineMode() { |
| return offlineMode != null; |
| } |
| |
| @SuppressWarnings("unchecked") |
| @Activate |
| protected void activate(ComponentContext context) throws ConfigurationException { |
| super.activate(context); |
| Dictionary<String,Object> config = context.getProperties(); |
| Object referencedSiteID = config.get(REFERENCED_SITE_ID); |
| if (referencedSiteID == null) { |
| throw new ConfigurationException(REFERENCED_SITE_ID, |
| "The ID of the Referenced Site is a required Parameter and MUST NOT be NULL!"); |
| } |
| |
| this.referencedSiteID = referencedSiteID.toString(); |
| if (this.referencedSiteID.isEmpty()) { |
| throw new ConfigurationException(REFERENCED_SITE_ID, |
| "The ID of the Referenced Site is a required Parameter and MUST NOT be an empty String!"); |
| } |
| if (Entityhub.ENTITYHUB_IDS.contains(this.referencedSiteID.toLowerCase())) { |
| log.debug("Init NamedEntityTaggingEngine instance for the Entityhub"); |
| this.referencedSiteID = null; |
| } |
| Object state = config.get(PERSON_STATE); |
| personState = state == null ? true : Boolean.parseBoolean(state.toString()); |
| state = config.get(ORG_STATE); |
| orgState = state == null ? true : Boolean.parseBoolean(state.toString()); |
| state = config.get(PLACE_STATE); |
| placeState = state == null ? true : Boolean.parseBoolean(state.toString()); |
| Object type = config.get(PERSON_TYPE); |
| personType = type == null || type.toString().isEmpty() ? null : |
| NamespaceMappingUtils.getConfiguredUri(nsPrefixService,PERSON_TYPE, type.toString()); |
| type = config.get(ORG_TYPE); |
| orgType = type == null || type.toString().isEmpty() ? null : |
| NamespaceMappingUtils.getConfiguredUri(nsPrefixService,ORG_TYPE,type.toString()); |
| type = config.get(PLACE_TYPE); |
| placeType = type == null || type.toString().isEmpty() ? null : |
| NamespaceMappingUtils.getConfiguredUri(nsPrefixService,PLACE_TYPE,type.toString()); |
| Object nameField = config.get(NAME_FIELD); |
| this.nameField = nameField == null || nameField.toString().isEmpty() ? |
| "http://www.w3.org/2000/01/rdf-schema#label" : |
| NamespaceMappingUtils.getConfiguredUri(nsPrefixService,NAME_FIELD,nameField.toString()); |
| Object dereferenceEntities = config.get(DEREFERENCE_ENTITIES); |
| this.dereferenceEntities = state == null ? true : Boolean |
| .parseBoolean(dereferenceEntities.toString()); |
| } |
| |
| @Deactivate |
| protected void deactivate(ComponentContext context) { |
| super.deactivate(context); |
| referencedSiteID = null; |
| personType = null; |
| orgType = null; |
| placeType = null; |
| nameField = null; |
| } |
| |
| |
| |
| public void computeEnhancements(ContentItem ci) throws EngineException { |
| final Site site; |
| if (referencedSiteID != null) { // lookup the referenced site |
| site = siteManager.getSite(referencedSiteID); |
| // ensure that it is present |
| if (site == null) { |
| String msg = String.format( |
| "Unable to enhance %s because Referenced Site %s is currently not active!", ci.getUri() |
| .getUnicodeString(), referencedSiteID); |
| log.warn(msg); |
| // TODO: throwing Exceptions is currently deactivated. We need a |
| // more clear |
| // policy what do to in such situations |
| // throw new EngineException(msg); |
| return; |
| } |
| // and that it supports offline mode if required |
| if (isOfflineMode() && !site.supportsLocalMode()) { |
| log.warn( |
| "Unable to enhance ci {} because OfflineMode is not supported by ReferencedSite {}.", ci |
| .getUri().getUnicodeString(), site.getId()); |
| return; |
| } |
| } else { // null indicates to use the Entityhub to lookup Entities |
| site = null; |
| } |
| MGraph graph = ci.getMetadata(); |
| LiteralFactory literalFactory = LiteralFactory.getInstance(); |
| // Retrieve the existing text annotations (requires read lock) |
| Map<NamedEntity,List<UriRef>> textAnnotations = new HashMap<NamedEntity,List<UriRef>>(); |
| // the language extracted for the parsed content or NULL if not |
| // available |
| String contentLangauge; |
| ci.getLock().readLock().lock(); |
| try { |
| contentLangauge = EnhancementEngineHelper.getLanguage(ci); |
| for (Iterator<Triple> it = graph.filter(null, RDF_TYPE, TechnicalClasses.ENHANCER_TEXTANNOTATION); it |
| .hasNext();) { |
| UriRef uri = (UriRef) it.next().getSubject(); |
| if (graph.filter(uri, Properties.DC_RELATION, null).hasNext()) { |
| // this is not the most specific occurrence of this name: |
| // skip |
| continue; |
| } |
| NamedEntity namedEntity = NamedEntity.createFromTextAnnotation(graph, uri); |
| if (namedEntity != null) { |
| // This is a first occurrence, collect any subsumed |
| // annotations |
| List<UriRef> subsumed = new ArrayList<UriRef>(); |
| for (Iterator<Triple> it2 = graph.filter(null, Properties.DC_RELATION, uri); it2 |
| .hasNext();) { |
| subsumed.add((UriRef) it2.next().getSubject()); |
| } |
| textAnnotations.put(namedEntity, subsumed); |
| } |
| } |
| } finally { |
| ci.getLock().readLock().unlock(); |
| } |
| // search the suggestions |
| Map<NamedEntity,List<Suggestion>> suggestions = new HashMap<NamedEntity,List<Suggestion>>( |
| textAnnotations.size()); |
| for (Entry<NamedEntity,List<UriRef>> entry : textAnnotations.entrySet()) { |
| try { |
| List<Suggestion> entitySuggestions = computeEntityRecommentations(site, entry.getKey(), |
| entry.getValue(), contentLangauge); |
| if (entitySuggestions != null && !entitySuggestions.isEmpty()) { |
| suggestions.put(entry.getKey(), entitySuggestions); |
| } |
| } catch (EntityhubException e) { |
| throw new EngineException(this, ci, e); |
| } |
| } |
| // now write the results (requires write lock) |
| ci.getLock().writeLock().lock(); |
| try { |
| RdfValueFactory factory = RdfValueFactory.getInstance(); |
| Map<String,Representation> entityData = new HashMap<String,Representation>(); |
| for (Entry<NamedEntity,List<Suggestion>> entitySuggestions : suggestions.entrySet()) { |
| List<UriRef> subsumed = textAnnotations.get(entitySuggestions.getKey()); |
| List<NonLiteral> annotationsToRelate = new ArrayList<NonLiteral>(subsumed); |
| annotationsToRelate.add(entitySuggestions.getKey().getEntity()); |
| for (Suggestion suggestion : entitySuggestions.getValue()) { |
| log.debug("Add Suggestion {} for {}", suggestion.getEntity().getId(), |
| entitySuggestions.getKey()); |
| EnhancementRDFUtils.writeEntityAnnotation(this, literalFactory, graph, ci.getUri(), |
| annotationsToRelate, suggestion, nameField, |
| // TODO: maybe we want labels in a different |
| // language than the |
| // language of the content (e.g. Accept-Language |
| // header)?! |
| contentLangauge == null ? DEFAULT_LANGUAGE : contentLangauge); |
| if (dereferenceEntities) { |
| entityData.put(suggestion.getEntity().getId(), suggestion.getEntity() |
| .getRepresentation()); |
| } |
| } |
| } |
| // if dereferneceEntities is true the entityData will also contain |
| // all |
| // Representations to add! If false entityData will be empty |
| for (Representation rep : entityData.values()) { |
| graph.addAll(factory.toRdfRepresentation(rep).getRdfGraph()); |
| } |
| } finally { |
| ci.getLock().writeLock().unlock(); |
| } |
| |
| } |
| |
| /** |
| * Computes the Enhancements |
| * |
| * @param site |
| * The {@link SiteException} id or <code>null</code> to use the {@link Entityhub} |
| * @param literalFactory |
| * the {@link LiteralFactory} used to create RDF Literals |
| * @param contentItemId |
| * the id of the contentItem |
| * @param textAnnotation |
| * the text annotation to enhance |
| * @param subsumedAnnotations |
| * other text annotations for the same entity |
| * @param language |
| * the language of the analysed text or <code>null</code> if not available. |
| * @return the suggestions for the parsed {@link NamedEntity} |
| * @throws EntityhubException |
| * On any Error while looking up Entities via the Entityhub |
| */ |
| protected final List<Suggestion> computeEntityRecommentations(Site site, |
| NamedEntity namedEntity, |
| List<UriRef> subsumedAnnotations, |
| String language) throws EntityhubException { |
| // First get the required properties for the parsed textAnnotation |
| // ... and check the values |
| |
| log.debug("Process {}", namedEntity); |
| // if site is NULL use |
| // the Entityhub |
| FieldQueryFactory queryFactory = site == null ? entityhub.getQueryFactory() : site.getQueryFactory(); |
| |
| log.trace("Will use a query-factory of type [{}].", queryFactory.getClass().toString()); |
| |
| FieldQuery query = queryFactory.createFieldQuery(); |
| |
| // replace spaces with plus to create an AND search for all words in the |
| // name! |
| Constraint labelConstraint; |
| // TODO: make case sensitivity configurable |
| boolean casesensitive = false; |
| String namedEntityLabel = casesensitive ? namedEntity.getName() : namedEntity.getName().toLowerCase(); |
| if (language != null) { |
| // search labels in the language and without language |
| labelConstraint = new TextConstraint(namedEntityLabel, casesensitive, language, null); |
| } else { |
| labelConstraint = new TextConstraint(namedEntityLabel, casesensitive); |
| } |
| query.setConstraint(nameField, labelConstraint); |
| if (OntologicalClasses.DBPEDIA_PERSON.equals(namedEntity.getType())) { |
| if (personState) { |
| if (personType != null) { |
| query.setConstraint(RDF_TYPE.getUnicodeString(), new ReferenceConstraint(personType)); |
| } |
| // else no type constraint |
| } else { |
| // ignore people |
| return Collections.emptyList(); |
| } |
| } else if (DBPEDIA_ORGANISATION.equals(namedEntity.getType())) { |
| if (orgState) { |
| if (orgType != null) { |
| query.setConstraint(RDF_TYPE.getUnicodeString(), new ReferenceConstraint(orgType)); |
| } |
| // else no type constraint |
| } else { |
| // ignore people |
| return Collections.emptyList(); |
| } |
| } else if (OntologicalClasses.DBPEDIA_PLACE.equals(namedEntity.getType())) { |
| if (this.placeState) { |
| if (this.placeType != null) { |
| query.setConstraint(RDF_TYPE.getUnicodeString(), new ReferenceConstraint(placeType)); |
| } |
| // else no type constraint |
| } else { |
| // ignore people |
| return Collections.emptyList(); |
| } |
| } |
| query.setLimit(Math.max(20, this.numSuggestions * 3)); |
| |
| log.trace("A query has been created of type [{}] and the following settings:\n{}", query.getClass() |
| .toString(), query.toString()); |
| |
| if (null == site) log.trace("A query will be sent to the entity-hub of type [{}].", entityhub |
| .getClass()); |
| else log.trace("A query will be sent to a site [id :: {}][type :: {}].", site.getId(), site |
| .getClass()); |
| |
| QueryResultList<Entity> results = site == null ? // if site is NULL |
| entityhub.findEntities(query) |
| : // use the Entityhub |
| site.findEntities(query); // else the referenced site |
| log.debug(" - {} results returned by query {}", results.size(), results.getQuery()); |
| if (results.isEmpty()) { // no results nothing to do |
| return Collections.emptyList(); |
| } |
| // we need to normalise the confidence values from [0..1] |
| // * levenshtein distance as absolute (1.0 for exact match) |
| // * Solr scores * levenshtein to rank entities relative to each other |
| Float maxScore = null; |
| Float maxExactScore = null; |
| List<Suggestion> matches = new ArrayList<Suggestion>(numSuggestions); |
| // assumes entities are sorted by score |
| for (Iterator<Entity> guesses = results.iterator(); guesses.hasNext();) { |
| Suggestion match = new Suggestion(guesses.next()); |
| Representation rep = match.getEntity().getRepresentation(); |
| Float score = rep.getFirst(RdfResourceEnum.resultScore.getUri(), Float.class); |
| if (maxScore == null) { |
| maxScore = score; |
| } |
| Iterator<Text> labels = rep.getText(nameField); |
| while (labels.hasNext() && match.getLevenshtein() < 1.0) { |
| Text label = labels.next(); |
| if (language == null || // if the content language is unknown -> |
| // accept all labels |
| label.getLanguage() == null || // accept labels with no |
| // language |
| // and labels in the same language as the content |
| (language != null && label.getLanguage().startsWith(language))) { |
| double actMatch = levenshtein( |
| casesensitive ? label.getText() : label.getText().toLowerCase(), namedEntityLabel); |
| if (actMatch > match.getLevenshtein()) { |
| match.setLevenshtein(actMatch); |
| match.setMatchedLabel(label); |
| } |
| } |
| } |
| if (match.getMatchedLabel() != null) { |
| if (match.getLevenshtein() == 1.0) { |
| if (maxExactScore == null) { |
| maxExactScore = score; |
| } |
| // normalise exact matches against the best exact score |
| match.setScore(score.doubleValue() / maxExactScore.doubleValue()); |
| } else { |
| // normalise partial matches against the best match and the |
| // Levenshtein similarity with the label |
| match.setScore(score.doubleValue() * match.getLevenshtein() / maxScore.doubleValue()); |
| } |
| matches.add(match); |
| } else { |
| log.debug("No value of {} for Entity {}!", nameField, match.getEntity().getId()); |
| } |
| } |
| // now sort the results |
| Collections.sort(matches); |
| return matches.subList(0, Math.min(matches.size(), numSuggestions)); |
| } |
| |
| /** |
| * This EnhancementEngine can enhance any ContentItem as it does consume existing TextAnnotations with the |
| * configured dc:type's |
| * |
| * @see org.apache.stanbol.enhancer.servicesapi.EnhancementEngine#canEnhance(org.apache.stanbol.enhancer.servicesapi.ContentItem) |
| */ |
| public int canEnhance(ContentItem ci) { |
| return ENHANCE_ASYNC; // Entity tagging now supports asyc processing |
| } |
| |
| @Override |
| public Map<String,Object> getServiceProperties() { |
| return Collections.unmodifiableMap(Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING, |
| (Object) defaultOrder)); |
| } |
| |
| /** |
| * Compares two strings (after {@link StringUtils#trim(String) trimming}) by using the Levenshtein's Edit |
| * Distance of the two strings. Does not return the {@link Integer} number of changes but |
| * <code>1-(changes/maxStringSizeAfterTrim)</code> |
| * <p> |
| * |
| * @param s1 |
| * the first string |
| * @param s2 |
| * the second string |
| * @return the distance |
| * @throws IllegalArgumentException |
| * if any of the two parsed strings is NULL |
| */ |
| private static double levenshtein(String s1, String s2) { |
| if (s1 == null || s2 == null) { |
| throw new IllegalArgumentException("NONE of the parsed String MUST BE NULL!"); |
| } |
| s1 = StringUtils.trim(s1); |
| s2 = StringUtils.trim(s2); |
| return s1.isEmpty() || s2.isEmpty() ? 0 |
| : 1.0 - (((double) getLevenshteinDistance(s1, s2)) / ((double) (Math.max(s1.length(), |
| s2.length())))); |
| } |
| } |