| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.stanbol.enhancer.engines.entitylinking.config; |
| |
| import java.net.URI; |
| import java.net.URISyntaxException; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.Dictionary; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| |
| import org.apache.clerezza.commons.rdf.IRI; |
| import org.apache.stanbol.commons.namespaceprefix.NamespaceMappingUtils; |
| import org.apache.stanbol.commons.namespaceprefix.NamespacePrefixService; |
| import org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcher; |
| import org.apache.stanbol.enhancer.engines.entitylinking.impl.EntityLinker; |
| import org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion; |
| import org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion.MATCH; |
| import org.apache.stanbol.enhancer.nlp.NlpAnnotations; |
| import org.apache.stanbol.enhancer.nlp.model.Chunk; |
| import org.apache.stanbol.enhancer.nlp.model.Token; |
| import org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures; |
| import org.apache.stanbol.enhancer.nlp.pos.Pos; |
| import org.apache.stanbol.enhancer.nlp.pos.PosTag; |
| import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses; |
| import org.osgi.service.cm.ConfigurationException; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| /** |
| * The configuration for the {@link EntityLinker}. Typically this |
| * configuration does not change often. Therefore it will be used for |
| * several {@link EntityLinker} instances processing different |
| * contents. |
| * @author Rupert Westenthaler |
| * |
| */ |
| public class EntityLinkerConfig { |
| |
| private static final Logger log = LoggerFactory.getLogger(EntityLinkerConfig.class); |
| |
| /** |
| * Configuration key for the field used to search for labels in the |
| * vocabulary linked against. If configured, the value MUST NOT be empty. |
| */ |
| public static final String NAME_FIELD = "enhancer.engines.linking.labelField"; |
| /** |
| * Configuration key for the field used as types for entities. While the type |
| * does not influence the suggestions it is used for the |
| * <code>fise:entity-type</code> value of <code>fise:EntityAnnotation</code>s |
| * and also to determine the <code>dc:type</code> value of |
| * <code>fise:TextAnnotation</code>s via the configured {@link #TYPE_MAPPINGS}. |
| * If configured, the value MUST NOT be empty. |
| */ |
| public static final String TYPE_FIELD = "enhancer.engines.linking.typeField"; |
| /** |
| * Allows to configure a list of entity types that are white/black listed. |
| */ |
| public static final String ENTITY_TYPES = "enhancer.engines.linking.entityTypes"; |
| /** |
| * Allows to enable/disable case sensitive matching. A missing or empty |
| * value keeps the default. |
| * @see EntityLinkerConfig#DEFAULT_CASE_SENSITIVE_MATCHING_STATE |
| */ |
| public static final String CASE_SENSITIVE = "enhancer.engines.linking.caseSensitive"; |
| /** |
| * Configuration key for the field used to look up redirects. If configured, |
| * the value MUST NOT be empty. |
| */ |
| public static final String REDIRECT_FIELD = "enhancer.engines.linking.redirectField"; |
| /** |
| * If/how redirects (provided by the {@link #REDIRECT_FIELD}) are processed. |
| * Valid values are the names of the {@link RedirectProcessingMode} constants. |
| */ |
| public static final String REDIRECT_MODE = "enhancer.engines.linking.redirectMode"; |
| /** |
| * The maximum number of fise:EntityAnnotations created as suggestions for a |
| * fise:TextAnnotation. Values MUST be integers &gt; 0. |
| * @see EntityLinkerConfig#DEFAULT_SUGGESTIONS |
| */ |
| public static final String SUGGESTIONS = "enhancer.engines.linking.suggestions"; |
| /** |
| * If enabled, suggestions with similar scores are included. This also means |
| * that there might be more than {@link #SUGGESTIONS} results returned by the engine. |
| * @see EntityLinkerConfig#DEFAULT_INCLUDE_SIMILAR_SCORE |
| */ |
| public static final String INCLUDE_SIMILAR_SCORE = "enhancer.engines.linking.includeSimilarScore"; |
| /** |
| * If enabled {@link MorphoFeatures#getLemma()} values are used instead of the {@link Token#getSpan()} to |
| * search/match Entities within the Vocabulary linked against. |
| * @see EntityLinkerConfig#isLemmaMatching() |
| * @see EntityLinkerConfig#DEFAULT_LEMMA_MATCHING_STATE |
| */ |
| public static final String LEMMA_MATCHING_STATE = "enhancer.engines.linking.lemmaMatching"; |
| /** |
| * Can be used to change the "default language" from <code>null</code> |
| * (labels without language tag) to another value (e.g. "en").<p> |
| * The "default language" is used in addition to the language of the |
| * processed text to search for labels. |
| */ |
| public static final String DEFAULT_MATCHING_LANGUAGE = "enhancer.engines.linking.defaultMatchingLanguage"; |
| /** |
| * Allows to configure entity type -&gt; dc:type mappings as used for created |
| * fise:TextAnnotations |
| */ |
| public static final String TYPE_MAPPINGS = "enhancer.engines.linking.typeMappings"; |
| /** |
| * How well a single token of the label needs to match a token of the text so |
| * that they are considered to match. Matching only allows differences at the |
| * end of the token (e.g. "London" -&gt; "Londons major") |
| */ |
| public static final String MIN_TOKEN_SCORE = "enhancer.engines.linking.minTokenScore"; |
| /** |
| * The minimum number of matching tokens. Only "matchable" tokens are counted. |
| * For full matches (where all tokens of the label match tokens in the text) |
| * this parameter is ignored. Values MUST be integers &gt; 0.<p> |
| * This parameter is strongly related to the {@link #MIN_LABEL_SCORE}. |
| * Typical settings are<ul> |
| * <li> <code>{@link #MIN_FOUND_TOKENS}=1</code> and <code>{@link #MIN_LABEL_SCORE} &gt; 0.5</code> (e.g. 0.75) |
| * <li> <code>{@link #MIN_FOUND_TOKENS}=2</code> and <code>{@link #MIN_LABEL_SCORE} &lt;= 0.5</code> (e.g. 0.5) |
| * </ul> |
| * as both settings ensure that labels with two tokens, where only a single one |
| * matches the text, are not suggested.<p> |
| * If used in combination with a disambiguation engine one might want to consider |
| * Entities whose labels match only a single token. In such cases |
| * <code>{@link #MIN_FOUND_TOKENS}=1</code> and <code>{@link #MIN_LABEL_SCORE} &lt;= 0.5</code> |
| * might also be a meaningful configuration. In such cases users will also want to set |
| * <code>{@link #SUGGESTIONS} &gt; 10</code>. |
| */ |
| public static final String MIN_FOUND_TOKENS = "enhancer.engines.linking.minFoundTokens"; |
| /** |
| * The "Label Score" [0..1] represents how much of the |
| * label of an Entity matches the text. It compares the number |
| * of tokens of the label with the number of tokens matched to the |
| * text. Inexact matches of tokens, or tokens of the |
| * label that appear in a different order than in the text, also |
| * reduce this score. <p> |
| * The default is {@link EntityLinkerConfig#DEFAULT_MIN_LABEL_SCORE} |
| * (value: {@value EntityLinkerConfig#DEFAULT_MIN_LABEL_SCORE}) |
| */ |
| public static final String MIN_LABEL_SCORE = "enhancer.engines.linking.minLabelScore"; |
| /** |
| * The "Text Score" [0..1] represents how well the |
| * label of an Entity matches the selected span in the text. |
| * It compares the number of matched {@link Token}s from |
| * the label with the number of tokens enclosed by the span |
| * in the text an Entity is suggested for. Inexact matches |
| * of tokens, or tokens of the label that appear in |
| * a different order than in the text, also reduce this score.<p> |
| * The default is {@link EntityLinkerConfig#DEFAULT_MIN_TEXT_SCORE} |
| * (value: {@value EntityLinkerConfig#DEFAULT_MIN_TEXT_SCORE}) |
| */ |
| public static final String MIN_TEXT_SCORE = "enhancer.engines.linking.minTextScore"; |
| /** |
| * Defined as the product of the "Text Score" with the |
| * "Label Score" - meaning that this value represents |
| * both how well the label matches the text and how much of the |
| * label is matched with the text.<p> |
| * The default is {@link EntityLinkerConfig#DEFAULT_MIN_MATCH_SCORE} |
| * (value: {@value EntityLinkerConfig#DEFAULT_MIN_MATCH_SCORE})<p> |
| * NOTE: the constant is named <code>MIN_MATCH_FACTOR</code> while the |
| * configuration key ends with <code>minMatchScore</code>. |
| * @see #MIN_TEXT_SCORE |
| * @see #MIN_LABEL_SCORE |
| */ |
| public static final String MIN_MATCH_FACTOR = "enhancer.engines.linking.minMatchScore"; |
| /** |
| * The minimum score with which an Entity must match the matchable {@link Token}s |
| * within a processable {@link Chunk}. By {@link #DEFAULT_MIN_CHUNK_MATCH_SCORE default} |
| * this is set to <code>51%</code> to filter Entities that only match a single token |
| * within a NounPhrase of two words. This feature was introduced with |
| * <a href="https://issues.apache.org/jira/browse/STANBOL-1211">STANBOL-1211</a> |
| */ |
| public static final String MIN_CHUNK_MATCH_SCORE = "enhancer.engines.linking.minChunkMatchScore"; |
| /** |
| * The maximum number of {@link Token}s used as search terms with the |
| * {@link EntitySearcher#lookup(String, Set, java.util.List, String[], Integer)} |
| * method. Values MUST be integers &gt; 0. |
| * @see EntityLinkerConfig#DEFAULT_MAX_SEARCH_TOKENS |
| */ |
| public static final String MAX_SEARCH_TOKENS = "enhancer.engines.linking.maxSearchTokens"; |
| /** |
| * The maximum number of {@link Token}s searched around a linkable Token for |
| * additional search tokens. Values MUST be integers &gt; 0.<p> |
| * As an example, in the text section "at the University of Munich a new procedure to" |
| * only "Munich" would be classified as {@link Pos#ProperNoun} and considered as |
| * linkable. However for searching it makes sense to use additional Tokens to |
| * reduce (or correctly rank) the expected high number of results for "Munich". |
| * Because of that "matchable" words surrounding the linkable Token are considered as |
| * included for searches.<p> |
| * This parameter allows to configure the maximum distance around the current |
| * linkable Token within which other tokens can be included in searches. |
| * @see EntityLinkerConfig#DEFAULT_MAX_SEARCH_DISTANCE |
| */ |
| public static final String MAX_SEARCH_TOKEN_DISTANCE = "enhancer.engines.linking.maxSearchTokenDistance"; |
| /** |
| * Adds the dereference feature (STANBOL-333) also to this engine. |
| * @deprecated Use a Dereference Engine instead (STANBOL-336) |
| */ |
| @Deprecated |
| public static final String DEREFERENCE_ENTITIES = "enhancer.engines.linking.dereference"; |
| /** |
| * Allows to add a list of fields that are included when dereferencing Entities |
| * @deprecated Use a Dereference Engine instead (STANBOL-336) |
| */ |
| @Deprecated |
| public static final String DEREFERENCE_ENTITIES_FIELDS = "enhancer.engines.linking.dereferenceFields"; |
| /** |
| * Allows to enable/disable sorting of suggestions that have the same score |
| * based on the entity ranking (popularity of the entity within the knowledge base) |
| */ |
| public static final String RANK_EQUAL_SCORES_BASED_ON_ENTITY_RANKINGS = "enhancer.engines.linking.useEntityRankings"; |
| /** |
| * Allows to enable/disable the inclusion of the <code>fise:entity-ranking</code> |
| * property to <code>fise:EntityAnnotation</code>s created by the linking engine. |
| */ |
| public static final String WRITE_ENTITY_RANKINGS = "enhancer.engines.linking.writeEntityRankings"; |
| |
| |
| /** |
| * The default for the maximum number of terms suggested for a word |
| */ |
| public static final int DEFAULT_SUGGESTIONS = 3; |
| /** |
| * By default {@link #INCLUDE_SIMILAR_SCORE} is deactivated |
| */ |
| public static final boolean DEFAULT_INCLUDE_SIMILAR_SCORE = false; |
| /** |
| * Default value for the number of tokens that must be contained in |
| * suggested terms. The default is <code>1</code> |
| */ |
| public static final int DEFAULT_MIN_FOUND_TOKENS = 1; |
| /** |
| * Multiple Tokens can be sent to the {@link EntitySearcher} service. The |
| * service uses them as optional parameters for the search. Therefore |
| * returned Concepts MUST contain at least one of the passed |
| * tokens. <p> |
| * The default value of <code>2</code> should be enough for nearly all |
| * Taxonomies to sufficiently reduce the number of results.<p> |
| * NOTE that the labels (nameField) of the results are compared as a |
| * whole. So even if only 2 Tokens are used for the search there may be |
| * more mapped to the actual label of a result. |
| */ |
| public static final int DEFAULT_MAX_SEARCH_TOKENS = 2; |
| /** |
| * Default value for the maximum distance within which tokens are |
| * considered to be used (in addition to the currently processed one) |
| * for searches of Entities.<p> |
| * The default is set to <code>3</code> |
| */ |
| public static final int DEFAULT_MAX_SEARCH_DISTANCE = 3; |
| |
| /** |
| * Default value for {@link #getNameField()} (rdfs:label) |
| */ |
| public static final IRI DEFAULT_NAME_FIELD = new IRI( |
| "http://www.w3.org/2000/01/rdf-schema#label"); |
| /** |
| * Default value for {@link #getTypeField()} (rdf:type) |
| */ |
| public static final IRI DEFAULT_TYPE_FIELD = new IRI( |
| "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"); |
| /** |
| * Default value for {@link #getRedirectField()} (rdfs:seeAlso) |
| */ |
| public static final IRI DEFAULT_REDIRECT_FIELD = new IRI( |
| "http://www.w3.org/2000/01/rdf-schema#seeAlso"); |
| /** |
| * The default language used to search for labels regardless of the language |
| * of the text. The default value is <code>null</code>, causing labels |
| * that do not have a language assigned to be included. |
| */ |
| public static final String DEFAULT_LANGUAGE = null; |
| /** |
| * The default for case sensitive matching is set to <code>false</code> |
| */ |
| public static final boolean DEFAULT_CASE_SENSITIVE_MATCHING_STATE = false; |
| /** |
| * By default Lemma based matching is deactivated. |
| */ |
| public static final boolean DEFAULT_LEMMA_MATCHING_STATE = false; |
| public static final double DEFAULT_MIN_LABEL_SCORE = 0.75; |
| public static final double DEFAULT_MIN_TEXT_SCORE = 0.4; |
| public static final double DEFAULT_MIN_MATCH_SCORE = 0.3; |
| /** |
| * By default more than 50% of the matchable tokens of a processable chunk |
| * need to match so that an Entity is considered to be mentioned in the text |
| * (STANBOL-1211) |
| */ |
| public static final double DEFAULT_MIN_CHUNK_MATCH_SCORE = 0.51; |
| |
| /** |
| * Default mapping for Concept types to dc:type values added for |
| * TextAnnotations. Keys are the entity types, values the dc:type they are |
| * mapped to. The map is unmodifiable and does not contain a mapping for |
| * the <code>null</code> key. |
| */ |
| public static final Map<IRI,IRI> DEFAULT_ENTITY_TYPE_MAPPINGS; |
| |
| static { //the default mappings for the organisation, person, place and skos:Concept types used by the Stanbol Enhancement Structure |
| Map<IRI,IRI> mappings = new HashMap<IRI,IRI>(); |
| mappings.put(OntologicalClasses.DBPEDIA_ORGANISATION, OntologicalClasses.DBPEDIA_ORGANISATION); |
| mappings.put(new IRI("http://dbpedia.org/ontology/Newspaper"), OntologicalClasses.DBPEDIA_ORGANISATION); |
| mappings.put(new IRI("http://schema.org/Organization"), OntologicalClasses.DBPEDIA_ORGANISATION); |
| // mappings.put(NamespaceEnum.dailymed+"organization",OntologicalClasses.DBPEDIA_ORGANISATION); |
| |
| mappings.put(OntologicalClasses.DBPEDIA_PERSON, OntologicalClasses.DBPEDIA_PERSON); |
| mappings.put(new IRI("http://xmlns.com/foaf/0.1/Person"), OntologicalClasses.DBPEDIA_PERSON); |
| mappings.put(new IRI("http://schema.org/Person"), OntologicalClasses.DBPEDIA_PERSON); |
| |
| mappings.put(OntologicalClasses.DBPEDIA_PLACE, OntologicalClasses.DBPEDIA_PLACE); |
| mappings.put(new IRI("http://schema.org/Place"), OntologicalClasses.DBPEDIA_PLACE); |
| mappings.put(new IRI("http://www.opengis.net/gml/_Feature"), OntologicalClasses.DBPEDIA_PLACE); |
| |
| mappings.put(OntologicalClasses.SKOS_CONCEPT, OntologicalClasses.SKOS_CONCEPT); |
| |
| // IRI DRUG = new IRI(NamespaceEnum.drugbank+"drugs"); |
| // mappings.put(DRUG.getUnicodeString(), DRUG); |
| // mappings.put(NamespaceEnum.dbpediaOnt+"Drug", DRUG); |
| // mappings.put(NamespaceEnum.dailymed+"drugs", DRUG); |
| // mappings.put(NamespaceEnum.sider+"drugs", DRUG); |
| // mappings.put(NamespaceEnum.tcm+"Medicine", DRUG); |
| // |
| // IRI DISEASE = new IRI(NamespaceEnum.diseasome+"diseases"); |
| // mappings.put(DISEASE.getUnicodeString(), DISEASE); |
| // mappings.put(NamespaceEnum.linkedct+"condition", DISEASE); |
| // mappings.put(NamespaceEnum.tcm+"Disease", DISEASE); |
| // |
| // IRI SIDE_EFFECT = new IRI(NamespaceEnum.sider+"side_effects"); |
| // mappings.put(SIDE_EFFECT.getUnicodeString(), SIDE_EFFECT); |
| // |
| // IRI INGREDIENT = new IRI(NamespaceEnum.dailymed+"ingredients"); |
| // mappings.put(INGREDIENT.getUnicodeString(), INGREDIENT); |
| |
| DEFAULT_ENTITY_TYPE_MAPPINGS = Collections.unmodifiableMap(mappings); |
| } |
| /** |
| * Enumeration over the different possibilities on how to deal with |
| * redirects (similar to Browsers following HTTP status 303 and RDF defining |
| * the "rdf:seeAlso" relation. |
| * @author Rupert Westenthaler |
| */ |
| public static enum RedirectProcessingMode { |
| /** |
| * Ignore redirects |
| */ |
| IGNORE, |
| /** |
| * Follow redirects, but only add the values (e.g. labels, types) such |
| * entities to the original one. |
| */ |
| ADD_VALUES, |
| /** |
| * Follow the redirect. |
| */ |
| FOLLOW |
| } |
| /** |
| * The default value for how to process redirect is set to |
| * {@link RedirectProcessingMode#IGNORE} |
| */ |
| public static RedirectProcessingMode DEFAULT_REDIRECT_PROCESSING_MODE = |
| RedirectProcessingMode.IGNORE; |
| |
| /** |
| * The dereferenceEntitiesState as set in {@link #activateEntityDereference(Dictionary)} |
| */ |
| private boolean dereferenceEntitiesState; |
| /** |
| * The maximum number of terms suggested for a word |
| */ |
| private int maxSuggestions = DEFAULT_SUGGESTIONS; |
| |
| private boolean includeSuggestionsWithSimilarScore = DEFAULT_INCLUDE_SIMILAR_SCORE; |
| /** |
| * The minimum number of Tokens in the text that must match with |
| * a label of the Entity so that also non-exact matches are |
| * used for suggestions |
| */ |
| private int minFoundTokens = DEFAULT_MIN_FOUND_TOKENS; |
| /** |
| * The maximum number of Tokens sent to the {@link EntitySearcher} to search |
| * for concepts. <p> |
| * NOTE that the labels (nameField) of the results are compared as a |
| * whole. So even if only e.g. 2 tokens are used for the search there may be |
| * more mapped to the actual label of a result. |
| */ |
| private int maxSearchTokens = DEFAULT_MAX_SEARCH_TOKENS; |
| /** |
| * Defines the maximum distance within which tokens are |
| * considered to be used (in addition to the currently processed one) |
| * for searches of Entities.<p> |
| */ |
| private int maxSearchDistance = DEFAULT_MAX_SEARCH_DISTANCE; |
| |
| private boolean caseSensitiveMatchingState = DEFAULT_CASE_SENSITIVE_MATCHING_STATE; |
| /** |
| * Holds the mappings of rdf:type used by concepts to dc:type values used |
| * by TextAnnotations. The constructor initialises it with a copy of the |
| * {@link #DEFAULT_ENTITY_TYPE_MAPPINGS} and wraps it in the unmodifiable |
| * view held by <code>unmodTypeMappings</code>. |
| */ |
| private Map<IRI,IRI> typeMappings; |
| private Map<IRI, IRI> unmodTypeMappings; |
| /** |
| * The mode on how to process redirects for Entities. |
| */ |
| private RedirectProcessingMode redirectProcessingMode; |
| /** |
| * The default dc:type |
| */ |
| private IRI defaultDcType; |
| private IRI nameField; |
| private IRI redirectField; |
| private IRI typeField; |
| private Map<IRI,Integer> blacklistedTypes = new HashMap<IRI,Integer>(); |
| private Map<IRI,Integer> whitelistedTypes = new HashMap<IRI,Integer>(); |
| private Boolean defaultWhitelistTypes = null; |
| private Set<IRI> dereferencedFields = new HashSet<IRI>(); |
| |
| private Set<IRI> __selectedFields; |
| /** |
| * The language always included in searches (regardless of the language |
| * detected for the text). |
| */ |
| private String defaultLanguage = DEFAULT_LANGUAGE; |
| |
| /** |
| * Default for the maximum number of non-processable tokens that are |
| * allowed to not match before no further tokens are matched against a label |
| * of an Entity. <p> |
| * This allows e.g. to match "Dr. Richard Dogles" with "Dr Richard Dogles" |
| * as '.' is a non-processable token in the text that is missing in the |
| * label.<p> |
| * The default is set to <code>1</code> |
| */ |
| public final static int DEFAULT_MAX_NOT_FOUND = 1; |
| /** |
| * Value of the maximum number of non-processable tokens that are |
| * allowed to not match before no further tokens are matched against a label |
| * of an Entity. <p> |
| * This allows e.g. to match "Dr. Richard Dogles" with "Dr Richard Dogles" |
| * as '.' is a non-processable token in the text that is missing in the |
| * label. |
| */ |
| private int maxNotFound; |
| /** |
| * Default value for the minimum token match factor. |
| * Whether Tokens match is determined by comparing them using some algorithm. |
| * Results need to be in the range [0..1]. This factor defines the minimum |
| * similarity value so that a match is assumed. Note that this factor is only |
| * used for filtering out non-matching tokens. The similarity value is |
| * still used for calculating the confidence.<p> |
| * The default is set to <code>0.7</code>. |
| */ |
| public final static float DEFAULT_MIN_TOKEN_SCORE = 0.7f; |
| |
| /** |
| * Default state for dereferencing Entities. The default was changed to |
| * <code>false</code> (Entities are not dereferenced) as this feature is |
| * now deprecated |
| * @deprecated Use a Dereference Engine instead (STANBOL-336) |
| */ |
| @Deprecated |
| public static final boolean DEFAULT_DEREFERENCE_ENTITIES_STATE = false; |
| |
| /** |
| * The default value for the state if entities that would have the same score |
| * should get their score slightly changed to ensure that entities with a |
| * higher ranking (popularity) do have a higher score. |
| */ |
| public static final boolean DEFAULT_RANK_EQUAL_SCORES_BASED_ON_ENTITY_RANKINGS = true; |
| |
| /** |
| * By default the <code>fise:entity-ranking</code> property is not added to |
| * <code>fise:EntityAnnotation</code>. |
| */ |
| public static final boolean DEFAULT_WRITE_ENTITY_RANKINGS = false; |
| |
| /** |
| * Whether Tokens match is determined by comparing them using some algorithm. |
| * Results need to be in the range [0..1]. This factor defines the minimum |
| * similarity value so that a match is assumed. Note that this factor is only |
| * used for filtering out non-matching tokens. The similarity value is |
| * still used for calculating the confidence |
| */ |
| private float minTokenMatchFactor; |
| /** |
| * If lemmas are used instead of the Tokens as present in the text to search |
| * and match Entities within the linked vocabulary |
| */ |
| private boolean lemmaMatchingState = DEFAULT_LEMMA_MATCHING_STATE; |
| private double minLabelScore = DEFAULT_MIN_LABEL_SCORE; |
| private double minTextScore = DEFAULT_MIN_TEXT_SCORE; |
| private double minMatchScore = DEFAULT_MIN_MATCH_SCORE; |
| /** |
| * The minimum score with which an entity needs to match the matchable tokens |
| * within a chunk so that it is considered a mention (STANBOL-1211) |
| */ |
| private double minChunkMatchScore = DEFAULT_MIN_CHUNK_MATCH_SCORE; |
| |
| private boolean rankEqualScoresBasedOnEntityRankings = DEFAULT_RANK_EQUAL_SCORES_BASED_ON_ENTITY_RANKINGS; |
| |
| private boolean writeEntityRankings = DEFAULT_WRITE_ENTITY_RANKINGS; |
| |
| /** |
| * Default constructor that initializes the configuration with the |
| * default values.<p> |
| * The type mappings are initialised with a modifiable copy of the |
| * {@link #DEFAULT_ENTITY_TYPE_MAPPINGS}; the unmodifiable type mappings |
| * are a view on that copy. The default dc:type is set to the value mapped |
| * to the <code>null</code> key, which is removed from the copied mappings |
| * (<code>null</code> if there is no such mapping).<p> |
| * All other defaults are applied by calling the setters of this class in |
| * the order listed below. |
| */ |
| public EntityLinkerConfig(){ |
| setMaxSuggestions(DEFAULT_SUGGESTIONS); |
| setMaxSearchTokens(DEFAULT_MAX_SEARCH_TOKENS); |
| setRedirectProcessingMode(DEFAULT_REDIRECT_PROCESSING_MODE); |
| typeMappings = new HashMap<IRI,IRI>(DEFAULT_ENTITY_TYPE_MAPPINGS); |
| unmodTypeMappings = Collections.unmodifiableMap(typeMappings); |
| setDefaultDcType(typeMappings.remove(null)); |
| setNameField(DEFAULT_NAME_FIELD); |
| setRedirectField(DEFAULT_REDIRECT_FIELD); |
| setTypeField(DEFAULT_TYPE_FIELD); |
| setMaxNotFound(DEFAULT_MAX_NOT_FOUND); |
| setMinTokenMatchFactor(DEFAULT_MIN_TOKEN_SCORE); |
| setDereferenceEntitiesState(DEFAULT_DEREFERENCE_ENTITIES_STATE); |
| } |
| |
| /** |
| * Creates a new {@link EntityLinkerConfig} based on the properties |
| * in the passed {@link Dictionary}. A new instance holding the default |
| * values is created and then handed to |
| * {@link #setConfiguration(EntityLinkerConfig, Dictionary, NamespacePrefixService)}. |
| * @param configuration the configuration |
| * @param prefixService Optionally a namespace prefix service used to |
| * convert '{prefix}:{localname}' parameters in the configuration to URIs. |
| * If <code>null</code> is passed this feature is not supported and parameters |
| * are not changed. |
| * @return the configured {@link EntityLinkerConfig} |
| * @throws ConfigurationException if the passed configuration is not valid |
| */ |
| public static EntityLinkerConfig createInstance(Dictionary<String,Object> configuration, |
| NamespacePrefixService prefixService) throws ConfigurationException { |
| EntityLinkerConfig elc = new EntityLinkerConfig(); |
| setConfiguration(elc, configuration, prefixService); |
| return elc; |
| } |
| /** |
| * Sets the configuration as parsed by the {@link Dictionary} to the |
| * parsed {@link EntityLinkerConfig}. |
| * @param linkerConfig the instance to apply the configuration to |
| * @param configuration the configuration |
| * @param prefixService Optionally a namespace prefix service used to |
| * convert '{prefix}:{localname}' parameters in the configuration to URIs. |
| * If <code>null</code> is parsed this feature is not supported and parameters |
| * are not changed. |
| * @throws ConfigurationException in case the configuration is invalid |
| */ |
| public static void setConfiguration(EntityLinkerConfig linkerConfig,Dictionary<String,Object> configuration,NamespacePrefixService prefixService) throws ConfigurationException { |
| Object value; |
| value = configuration.get(NAME_FIELD); |
| if(value != null){ |
| if(value.toString().isEmpty()){ |
| throw new ConfigurationException(NAME_FIELD,"The configured name field MUST NOT be empty"); |
| } |
| linkerConfig.setNameField(new IRI( |
| getFullName(prefixService,NAME_FIELD,value.toString()))); |
| } |
| |
| //init case sensitivity |
| value = configuration.get(CASE_SENSITIVE); |
| if(value instanceof Boolean){ |
| linkerConfig.setCaseSensitiveMatchingState((Boolean)value); |
| } else if(value != null && !value.toString().isEmpty()){ |
| linkerConfig.setCaseSensitiveMatchingState(Boolean.valueOf(value.toString())); |
| } //if NULL or empty use default |
| |
| //init TYPE_FIELD |
| value = configuration.get(TYPE_FIELD); |
| if(value != null){ |
| if(value.toString().isEmpty()){ |
| throw new ConfigurationException(TYPE_FIELD,"The configured name field MUST NOT be empty"); |
| } |
| linkerConfig.setTypeField(new IRI( |
| getFullName(prefixService, TYPE_FIELD, value.toString()))); |
| } |
| |
| //init REDIRECT_FIELD |
| value = configuration.get(REDIRECT_FIELD); |
| if(value != null){ |
| if(value.toString().isEmpty()){ |
| throw new ConfigurationException(NAME_FIELD,"The configured name field MUST NOT be empty"); |
| } |
| linkerConfig.setRedirectField(new IRI( |
| getFullName(prefixService,REDIRECT_FIELD,value.toString()))); |
| } |
| |
| //init MAX_SUGGESTIONS |
| value = configuration.get(SUGGESTIONS); |
| Integer maxSuggestions; |
| if(value instanceof Integer){ |
| maxSuggestions = (Integer)value; |
| } else if (value != null){ |
| try { |
| maxSuggestions = Integer.valueOf(value.toString()); |
| } catch(NumberFormatException e){ |
| throw new ConfigurationException(SUGGESTIONS, "Values MUST be valid Integer values > 0",e); |
| } |
| } else { |
| maxSuggestions = null; |
| } |
| if(maxSuggestions != null){ |
| if(maxSuggestions < 1){ |
| throw new ConfigurationException(SUGGESTIONS, "Values MUST be valid Integer values > 0"); |
| } |
| linkerConfig.setMaxSuggestions(maxSuggestions); |
| } |
| //init INCLUDE_SIMILAR_SCORE |
| value = configuration.get(INCLUDE_SIMILAR_SCORE); |
| if(value instanceof Boolean){ |
| linkerConfig.setIncludeSuggestionsWithSimilarScore((Boolean)value); |
| } else if(value != null){ |
| linkerConfig.setIncludeSuggestionsWithSimilarScore(Boolean.parseBoolean(value.toString())); |
| } |
| |
| //init MIN_FOUND_TOKENS |
| value = configuration.get(MIN_FOUND_TOKENS); |
| Integer minFoundTokens; |
| if(value instanceof Integer){ |
| minFoundTokens = (Integer)value; |
| } else if(value != null){ |
| try { |
| minFoundTokens = Integer.valueOf(value.toString()); |
| } catch(NumberFormatException e){ |
| throw new ConfigurationException(MIN_FOUND_TOKENS, "Values MUST be valid Integer values > 0",e); |
| } |
| } else { |
| minFoundTokens = null; |
| } |
| if(minFoundTokens != null){ |
| if(minFoundTokens < 1){ |
| throw new ConfigurationException(MIN_FOUND_TOKENS, "Values MUST be valid Integer values > 0"); |
| } |
| linkerConfig.setMinFoundTokens(minFoundTokens); |
| } |
| |
| //init Label Score parameters |
| value = configuration.get(MIN_LABEL_SCORE); |
| Double minLabelMatchFactor = null; |
| if(value instanceof Number){ |
| minLabelMatchFactor = Double.valueOf(((Number)value).doubleValue()); |
| } else if(value != null){ |
| try { |
| minLabelMatchFactor = Double.valueOf(value.toString()); |
| } catch (NumberFormatException e) { |
| throw new ConfigurationException(MIN_LABEL_SCORE, "Parsed value '" |
| +value+"' is not an valid double!"); |
| } |
| } |
| try { |
| linkerConfig.setMinLabelScore(minLabelMatchFactor); |
| } catch (IllegalArgumentException e){ |
| throw new ConfigurationException(MIN_LABEL_SCORE, e.getMessage()); |
| } |
| value = configuration.get(MIN_TEXT_SCORE); |
| Double minTextMatchFactor = null; |
| if(value instanceof Number){ |
| minTextMatchFactor = Double.valueOf(((Number)value).doubleValue()); |
| } else if(value != null){ |
| try { |
| minTextMatchFactor = Double.valueOf(value.toString()); |
| } catch (NumberFormatException e) { |
| throw new ConfigurationException(MIN_TEXT_SCORE, "Parsed value '" |
| +value+"' is not an valid double!"); |
| } |
| } |
| try { |
| linkerConfig.setMinTextScore(minTextMatchFactor); |
| } catch (IllegalArgumentException e){ |
| throw new ConfigurationException(MIN_TEXT_SCORE, e.getMessage()); |
| } |
| value = configuration.get(MIN_MATCH_FACTOR); |
| Double minMatchFactor = null; |
| if(value instanceof Number){ |
| minMatchFactor = Double.valueOf(((Number)value).doubleValue()); |
| } else if(value != null){ |
| try { |
| minMatchFactor = Double.valueOf(value.toString()); |
| } catch (NumberFormatException e) { |
| throw new ConfigurationException(MIN_MATCH_FACTOR, "Parsed value '" |
| +value+"' is not an valid double!"); |
| } |
| } |
| try { |
| linkerConfig.setMinMatchScore(minMatchFactor); |
| } catch (IllegalArgumentException e){ |
| throw new ConfigurationException(MIN_MATCH_FACTOR, e.getMessage()); |
| } |
| |
| value = configuration.get(MIN_CHUNK_MATCH_SCORE); |
| Double minChunkMatchScore = null; |
| if(value instanceof Number){ |
| minChunkMatchScore = Double.valueOf(((Number)value).doubleValue()); |
| } else if(value != null){ |
| try { |
| minChunkMatchScore = Double.valueOf(value.toString()); |
| } catch (NumberFormatException e) { |
| throw new ConfigurationException(MIN_CHUNK_MATCH_SCORE, "Parsed value '" |
| +value+"' is not an valid double!"); |
| } |
| } |
| try { |
| linkerConfig.setMinChunkMatchScore(minChunkMatchScore); |
| } catch (IllegalArgumentException e){ |
| throw new ConfigurationException(MIN_CHUNK_MATCH_SCORE, e.getMessage()); |
| } |
| |
| //init LEMMA_MATCHING_STATE |
| value = configuration.get(LEMMA_MATCHING_STATE); |
| if(value instanceof Boolean){ |
| linkerConfig.setLemmaMatchingState((Boolean)value); |
| } else if (value != null){ |
| linkerConfig.setLemmaMatchingState(Boolean.parseBoolean(value.toString())); |
| } |
| |
| //init MAX_SEARCH_TOKENS |
| value = configuration.get(MAX_SEARCH_TOKENS); |
| Integer maxSearchTokens; |
| if(value instanceof Integer){ |
| maxSearchTokens = (Integer)value; |
| } else if (value != null){ |
| try { |
| maxSearchTokens = Integer.valueOf(value.toString()); |
| } catch(NumberFormatException e){ |
| throw new ConfigurationException(MAX_SEARCH_TOKENS, "Values MUST be valid Integer values > 0",e); |
| } |
| } else { |
| maxSearchTokens = null; |
| } |
| if(maxSearchTokens != null){ |
| if(maxSearchTokens < 1){ |
| throw new ConfigurationException(MAX_SEARCH_TOKENS, "Values MUST be valid Integer values > 0"); |
| } |
| linkerConfig.setMaxSearchTokens(maxSearchTokens); |
| } |
| |
| //init the MAX_SEARCH_TOKEN_DISTANCE |
| value = configuration.get(MAX_SEARCH_TOKEN_DISTANCE); |
| Integer maxSearchDistance; |
| if(value instanceof Integer){ |
| maxSearchDistance = (Integer)value; |
| } else if (value != null){ |
| try { |
| maxSearchDistance = Integer.valueOf(value.toString()); |
| } catch(NumberFormatException e){ |
| throw new ConfigurationException(MAX_SEARCH_TOKEN_DISTANCE, "Values MUST be valid Integer values > 0",e); |
| } |
| } else { |
| maxSearchDistance = null; |
| } |
| if(maxSearchDistance != null){ |
| if(maxSearchDistance < 1){ |
| throw new ConfigurationException(MAX_SEARCH_TOKEN_DISTANCE, "Values MUST be valid Integer values > 0"); |
| } |
| linkerConfig.setMaxSearchDistance(maxSearchDistance); |
| } |
| |
| //init the REDIRECT_PROCESSING_MODE |
| value = configuration.get(REDIRECT_MODE); |
| if(value != null){ |
| try { |
| linkerConfig.setRedirectProcessingMode(RedirectProcessingMode.valueOf(value.toString())); |
| } catch (IllegalArgumentException e) { |
| throw new ConfigurationException(REDIRECT_MODE, "Values MUST be one of "+ |
| Arrays.toString(RedirectProcessingMode.values())); |
| } |
| } |
| |
| //init the DEFAULT_LANGUAGE |
| value = configuration.get(DEFAULT_MATCHING_LANGUAGE); |
| if(value != null){ |
| String defaultLang = value.toString().trim(); |
| if(defaultLang.isEmpty()){ |
| linkerConfig.setDefaultLanguage(null); |
| } else if(defaultLang.length() == 1){ |
| throw new ConfigurationException(DEFAULT_MATCHING_LANGUAGE, "Illegal language code '"+ |
| defaultLang+"'! Language Codes MUST BE at least 2 chars long."); |
| } else { |
| linkerConfig.setDefaultLanguage(defaultLang); |
| } |
| } |
| |
| // init MIN_TOKEN_MATCH_FACTOR |
| value=configuration.get(MIN_TOKEN_SCORE); |
| float minTokenMatchFactor; |
| if(value instanceof Number){ |
| minTokenMatchFactor = ((Number)value).floatValue(); |
| } else if(value != null){ |
| try { |
| minTokenMatchFactor = Float.valueOf(value.toString()); |
| } catch (NumberFormatException e) { |
| throw new ConfigurationException(MIN_TOKEN_SCORE, |
| "Unable to parse the minimum token match factor from the parsed value "+value,e); |
| } |
| if(minTokenMatchFactor < 0){ |
| minTokenMatchFactor = EntityLinkerConfig.DEFAULT_MIN_TOKEN_SCORE; |
| } |
| } else { |
| minTokenMatchFactor = EntityLinkerConfig.DEFAULT_MIN_TOKEN_SCORE; |
| } |
| if(minTokenMatchFactor == 0 || minTokenMatchFactor > 1){ |
| throw new ConfigurationException(MIN_TOKEN_SCORE, |
| "The minimum token match factor MUST be > 0 and <= 1 (negative values for the default)"); |
| } |
| linkerConfig.setMinTokenMatchFactor(minTokenMatchFactor); |
| |
| //init type mappings |
| value = configuration.get(TYPE_MAPPINGS); |
| if(value instanceof String[]){ //support array |
| value = Arrays.asList((String[])value); |
| } else if(value instanceof String) { //single value |
| value = Collections.singleton(value); |
| } |
| if(value instanceof Collection<?>){ //and collection |
| log.info("Init Type Mappings"); |
| configs : |
| for(Object o : (Iterable<?>)value){ |
| if(o != null){ |
| StringBuilder usage = new StringBuilder("useages: "); |
| usage.append("a: '{uri}' short for {uri} > {uri} | "); |
| usage.append("b: '{source1};{source2};..;{sourceN} > {target}'"); |
| String[] config = o.toString().split(">"); |
| if(config[0].isEmpty()){ |
| log.warn("Invalid Type Mapping Config '{}': Missing Source Type ({}) -> ignore this config", |
| o,usage); |
| continue configs; |
| } |
| String[] sourceTypes = config[0].split(";"); |
| if(sourceTypes.length > 1 && (config.length < 2 || config[1].isEmpty())){ |
| log.warn("Invalid Type Mapping Config '{}': Missing Target Type '{}' ({}) -> ignore this config", |
| o,usage); |
| continue configs; |
| } |
| String targetType = config.length < 2 ? sourceTypes[0] : config[1]; |
| targetType = getFullName(prefixService,TYPE_MAPPINGS,targetType.trim()); //support for ns:localName |
| try { //validate |
| new URI(targetType); |
| } catch (URISyntaxException e) { |
| log.warn("Invalid URI '{}' in Type Mapping Config '{}' -> ignore this config", |
| sourceTypes[0],o); |
| continue configs; |
| } |
| IRI targetUri = new IRI(targetType); |
| for(String sourceType : sourceTypes){ |
| if(!sourceType.isEmpty()){ |
| sourceType = getFullName(prefixService,TYPE_MAPPINGS,sourceType.trim()); //support for ns:localName |
| try { //validate |
| new URI(sourceType); |
| IRI old = linkerConfig.setTypeMapping(sourceType, targetUri); |
| if(old == null){ |
| log.info(" > add type mapping {} > {}", sourceType,targetType); |
| } else { |
| log.info(" > set type mapping {} > {} (old: {})", |
| new Object[]{sourceType,targetType,old.getUnicodeString()}); |
| } |
| } catch (URISyntaxException e) { |
| log.warn("Invalid URI '{}' in Type Mapping Config '{}' -> ignore this source type", |
| sourceTypes[0],o); |
| } |
| } |
| } |
| } |
| } |
| } else { |
| log.debug("No Type mappings configured"); |
| } |
| //dereference entities |
| value = configuration.get(DEREFERENCE_ENTITIES); |
| if(value instanceof Boolean){ |
| linkerConfig.setDereferenceEntitiesState(((Boolean)value).booleanValue()); |
| } else if(value != null && !value.toString().isEmpty()){ |
| linkerConfig.setDereferenceEntitiesState(Boolean.parseBoolean(value.toString())); |
| } |
| if(linkerConfig.isDereferenceEntitiesEnabled()){ |
| log.warn("DereferenceEntities is deprecated for the Engine. Please use the " |
| + "EntityhubDereferenceEngine instead (see STANBOL-1223 for details)"); |
| } |
| if(linkerConfig.isDereferenceEntitiesEnabled()){ |
| value = configuration.get(DEREFERENCE_ENTITIES_FIELDS); |
| if(value instanceof String[]){ |
| for(String field : (String[])value){ |
| if(field != null && !field.isEmpty()){ |
| linkerConfig.getDereferencedFields().add( |
| new IRI(getFullName(prefixService,DEREFERENCE_ENTITIES_FIELDS,field))); |
| } |
| } |
| } else if(value instanceof Collection<?>){ |
| for(Object field : (Collection<?>)value){ |
| if(field != null && !field.toString().isEmpty()){ |
| linkerConfig.getDereferencedFields().add( |
| new IRI(getFullName(prefixService,DEREFERENCE_ENTITIES_FIELDS,field.toString()))); |
| } |
| } |
| } else if(value instanceof String){ |
| if(!value.toString().isEmpty()){ |
| linkerConfig.getDereferencedFields().add( |
| new IRI(getFullName(prefixService,DEREFERENCE_ENTITIES_FIELDS,value.toString()))); |
| } |
| } else if(value != null){ |
| throw new ConfigurationException(DEREFERENCE_ENTITIES_FIELDS, |
| "Dereference Entities_Fields MUST BE parsed as String[], Collection<String> or " |
| + "String (single value). The actual value '"+value+"'(type: '"+value.getClass() |
| + "') is NOT supported"); |
| }else { //value == null |
| log.debug("No deference fields for entity configured"); |
| } |
| } |
| |
| //init USE ENTITY RANKINGS (STANBOL-1030) |
| value = configuration.get(RANK_EQUAL_SCORES_BASED_ON_ENTITY_RANKINGS); |
| if(value instanceof Boolean){ |
| linkerConfig.setRankEqualScoresBasedOnEntityRankings(((Boolean)value).booleanValue()); |
| } else if (value != null){ |
| linkerConfig.setRankEqualScoresBasedOnEntityRankings( |
| Boolean.parseBoolean(value.toString())); |
| } else { |
| linkerConfig.setRankEqualScoresBasedOnEntityRankings( |
| DEFAULT_RANK_EQUAL_SCORES_BASED_ON_ENTITY_RANKINGS); |
| } |
| //init WRITE ENTITY RANKINGS (STANBOL-1292) |
| value = configuration.get(WRITE_ENTITY_RANKINGS); |
| if(value instanceof Boolean){ |
| linkerConfig.setWriteEntityRankings(((Boolean)value).booleanValue()); |
| } else if (value != null){ |
| linkerConfig.setWriteEntityRankings(Boolean.parseBoolean(value.toString())); |
| } else { |
| linkerConfig.setWriteEntityRankings(DEFAULT_WRITE_ENTITY_RANKINGS); |
| } |
| |
| //init the list of whitelisted/blacklisted types |
| value = configuration.get(ENTITY_TYPES); |
| List<String> entityTypesConfig; //first collect and cleanup the config |
| if(value == null){ |
| entityTypesConfig = Collections.emptyList(); |
| } else if(value instanceof String[]){ |
| entityTypesConfig = new ArrayList<String>(); |
| for(String type : (String[])value){ |
| if(type != null){ |
| type = type.trim(); |
| if(!type.isEmpty()){ |
| entityTypesConfig.add(type); |
| } |
| } |
| } |
| } else if(value instanceof Collection<?>){ |
| entityTypesConfig = new ArrayList<String>(); |
| for(Object o : (Collection<Object>)value){ |
| if(o != null){ |
| String type = o.toString().trim(); |
| if(!type.isEmpty()){ |
| entityTypesConfig.add(type); |
| } |
| } |
| } |
| } else if(value instanceof String){ //support parsing single values as string |
| String type = value.toString().trim(); |
| if(type.isEmpty()){ |
| entityTypesConfig = Collections.emptyList(); |
| } else { |
| entityTypesConfig = Collections.singletonList(type); |
| } |
| } else { |
| throw new ConfigurationException(ENTITY_TYPES, "The list of ignored types (if present) " |
| + "MUST BE a collection or a string array (present: "+value.getClass().getName()+")!"); |
| } |
| //apply the config |
| for(int i = 0; i < entityTypesConfig.size(); i++){ |
| String type = entityTypesConfig.get(i); |
| if("*".equals(type)){ |
| linkerConfig.setDefaultWhitelistTypes(Boolean.TRUE); |
| } else { |
| boolean blacklisted = type.charAt(0) == '!'; |
| if(blacklisted && type.length() < 2){ |
| throw new ConfigurationException(ENTITY_TYPES, "The list of whitelisted/blacklisted " |
| + "MUST NOT contain '!' (configured: "+entityTypesConfig+")!"); |
| } |
| IRI uri = new IRI(getFullName(prefixService, ENTITY_TYPES, |
| blacklisted ? type.substring(1) : type)); |
| if(blacklisted){ |
| linkerConfig.addBlacklistType(uri, Integer.valueOf(i)); |
| } else { |
| linkerConfig.addWhitelistType(uri, Integer.valueOf(i)); |
| } |
| } |
| } |
| } |
| /** |
| * Gets the full URI for the parsed value by using the parsed {@link NamespacePrefixService} |
| * @param prefixService the {@link NamespacePrefixService} used to lookup the full URI |
| * @param property the config property (just used to create a {@link ConfigurationException} |
| * in case the used namespace prefix is unknown by the namespace prefix service) |
| * @param value the configured value (might be both a short or a full URI) |
| * @return the full URI |
| * @throws ConfigurationException |
| */ |
| private static String getFullName(NamespacePrefixService prefixService, String property,String value) throws ConfigurationException { |
| String prefix = NamespaceMappingUtils.getPrefix(value); |
| if(prefixService == null){ |
| if(prefix != null){ |
| throw new ConfigurationException(property, "'{prefix}:{localname}' tpye configurations " |
| + "are not supported if no "+NamespacePrefixService.class.getSimpleName() |
| + "is present (configured value='"+value+"')!"); |
| } else { |
| return value; |
| } |
| } else { |
| String uri = prefixService.getFullName(value); |
| if(uri == null){ |
| throw new ConfigurationException(property, "The prefix '"+prefix |
| + "' as used by the configured value '"+value+"' is unknow to the" |
| + NamespacePrefixService.class.getSimpleName()); |
| } |
| log.debug("mapped '{}' -> '{}'",value,uri); |
| return uri; |
| } |
| } |
| |
| /** |
| * Getter for the uri of the field used for the names in the taxonomy |
| * (e.g. rdfs:label, skos:prefLabel). Needs to return the full URI |
| * @return the field used for the names of in the Taxonomy. |
| */ |
| public final IRI getNameField() { |
| return nameField; |
| } |
| /** |
| * Setter for the uri of the field used for the names in the taxonomy |
| * (e.g. rdfs:label, skos:prefLabel). |
| * @param nameField the nameField to set |
| */ |
| public final void setNameField(IRI nameField) { |
| this.nameField = nameField; |
| __selectedFields = null; |
| } |
| /** |
| * Getter for the dereferencedFields. This is a read- and write-able |
| * set that allows to configure the fields that should be dereferenced |
| * @return |
| */ |
| public final Set<IRI> getDereferencedFields(){ |
| return dereferencedFields; |
| } |
| /** |
| * The field used to follow redirects (typically rdf:seeAlso) |
| * @return the redirect field |
| */ |
| public final IRI getRedirectField() { |
| return redirectField; |
| } |
| /** |
| * The field used to follow redirects (typically rdf:seeAlso) |
| * @param redirectField the redirectField to set |
| */ |
| public final void setRedirectField(IRI redirectField) { |
| this.redirectField = redirectField; |
| __selectedFields = null; |
| } |
| /** |
| * The field used to lookup the types (typically rdf:type) |
| * @return the field name used to lookup types |
| */ |
| public final IRI getTypeField() { |
| return typeField; |
| } |
| /** |
| * The field used to lookup the types (typically rdf:type) |
| * @param typeField the typeField to set |
| */ |
| public final void setTypeField(IRI typeField) { |
| this.typeField = typeField; |
| __selectedFields = null; |
| } |
| /** |
| * Setter for the maximum number of suggestion returned. |
| * @param maxSuggestions the maxSuggestions to set |
| */ |
| public void setMaxSuggestions(int maxSuggestions) { |
| this.maxSuggestions = maxSuggestions; |
| } |
| /** |
| * Getter for the maximum number of suggestion returned. |
| * @return the maxSuggestions |
| */ |
| public int getMaxSuggestions() { |
| return maxSuggestions; |
| } |
| |
| public boolean isIncludeSuggestionsWithSimilarScore(){ |
| return includeSuggestionsWithSimilarScore; |
| } |
| public void setIncludeSuggestionsWithSimilarScore(Boolean state){ |
| if(state == null){ |
| includeSuggestionsWithSimilarScore = DEFAULT_INCLUDE_SIMILAR_SCORE; |
| } else { |
| includeSuggestionsWithSimilarScore = state; |
| } |
| } |
| |
| /** |
| * Setter for the minimum number of Tokens (of the content) that MUST match |
| * with a {@link EntitySearcher#getNameField() label} of a |
| * {@link EntitySearcher#lookup(java.util.List, String...) concept of the taxonomy} |
| * so that it is {@link Suggestion suggested} even if the match is only |
| * {@link MATCH#PARTIAL}. Entities that match less than that are only included |
| * if a label is an {@link MATCH#EXACT EXACT} match with the current position |
| * in the text. |
| * @param minFoundTokens the minFoundTokens to set |
| */ |
| public void setMinFoundTokens(int minFoundTokens) { |
| this.minFoundTokens = minFoundTokens; |
| } |
| /** |
| * Getter for the minimum number of Tokens (of the content) that MUST match |
| * with a {@link EntitySearcher#getNameField() label} of a |
| * {@link EntitySearcher#lookup(java.util.List, String...) concept of the taxonomy} |
| * so that it is {@link Suggestion suggested} even if the match is only |
| * {@link MATCH#PARTIAL}. Entities that match less than that are only included |
| * if a label is an {@link MATCH#EXACT EXACT} match with the current position |
| * in the text. |
| * @return the minFoundTokens |
| */ |
| public int getMinFoundTokens() { |
| return minFoundTokens; |
| } |
| /** |
| * Getter for the maximum number of tokens parsed to |
| * {@link EntitySearcher#lookup(java.util.List, String...)} |
| * @return the maxSearchTokens |
| */ |
| public final int getMaxSearchTokens() { |
| return maxSearchTokens; |
| } |
| /** |
| * The maximum number of tokens parsed to |
| * {@link EntitySearcher#lookup(java.util.List, String...)}. This is NOT the |
| * maximum number of Tokens mapped for Entities returned by such queries.<p> |
| * In case {@link Chunk}s are available in the parsed {@link AnalysedText} |
| * searches can be scoped by such chunks. However if no chunks are available, |
| * than this value is used to collect this number of words in the text.<p> |
| * The {@link #DEFAULT_MAX_SEARCH_TOKENS default value} of <code>2</code> |
| * should be ok in most cases. |
| * @param maxSearchTokens the maxSearchTokens to set |
| */ |
| public final void setMaxSearchTokens(int maxSearchTokens) { |
| if(maxSearchTokens == 0){ |
| this.maxSearchTokens = DEFAULT_MAX_SEARCH_TOKENS; |
| } else if (maxSearchTokens < 0){ |
| throw new IllegalArgumentException("The maxSearchToken value MUST BE >= 0 (0 for setting the default)"); |
| } else { |
| this.maxSearchTokens = maxSearchTokens; |
| } |
| } |
| /** |
| * Getter for the case sensitive matching state |
| * @return the state |
| */ |
| public boolean isCaseSensitiveMatching() { |
| return caseSensitiveMatchingState; |
| } |
| /** |
| * Setter for the case sensitive matching state |
| * @param caseSensitiveMatchingState the state |
| */ |
| public void setCaseSensitiveMatchingState(boolean state) { |
| this.caseSensitiveMatchingState = state; |
| } |
| /* REMOVED because getTypemappings.remove(conceptType) can be used anyway |
| * Removes the mapping for the parsed concept type |
| * @param conceptType the concept type to remove the mapping |
| * @return the previously mapped dc:type value or <code>null</code> if |
| * no mapping for the parsed concept type was present |
| public IRI removeTypeMapping(IRI conceptType){ |
| return typeMappings.remove(conceptType); |
| } |
| */ |
| /** |
| * |
| * @param conceptType the type of the concept or <code>null</code> to |
| * add the default dc:type mapping. See also {@link #setDefaultDcType(IRI)} |
| * @param dcType the dc:type for the parsed concept type |
| * @return the previously mapped dc:type value if an existing mapping |
| * was updated or <code>null</code> if a new mapping was added. |
| */ |
| public IRI setTypeMapping(String conceptType, IRI dcType){ |
| if(dcType == null) { |
| return typeMappings.remove(conceptType == null ? null : new IRI(conceptType)); |
| } else { |
| if(conceptType == null){ //handle setting of the default dc:type value |
| IRI oldDefault = getDefaultDcType(); |
| setDefaultDcType(dcType); |
| return oldDefault; |
| } |
| return typeMappings.put(new IRI(conceptType), dcType); |
| } |
| } |
| |
| /** |
| * Setter for the default dc:type of linked entities if for none of the |
| * types of the suggestions a {@link #getTypeMappings()} exists. Set this |
| * to <code>null</code> to specify that no dc:type should be set in such |
| * cases. |
| * @param defaultDcType the defaultDcType to set |
| */ |
| public void setDefaultDcType(IRI defaultDcType) { |
| this.defaultDcType = defaultDcType; |
| } |
| /** |
| * The default type for Entities if no {@link #getTypeMappings() type mapping} |
| * is present. <code>null</code> means that no type should be set if no |
| * explicit mapping exists |
| * @return the defaultDcType |
| */ |
| public IRI getDefaultDcType() { |
| return defaultDcType; |
| } |
| /** |
| * Setter for the mode on how to deal with redirects |
| * @param redirectProcessingMode the redirectProcessingMode to set |
| */ |
| public void setRedirectProcessingMode(RedirectProcessingMode redirectProcessingMode) { |
| this.redirectProcessingMode = redirectProcessingMode; |
| __selectedFields = null; |
| } |
| /** |
| * Getter for the mode how to deal with redirects |
| * @return the redirectProcessingMode |
| */ |
| public RedirectProcessingMode getRedirectProcessingMode() { |
| return redirectProcessingMode; |
| } |
| /** |
| * Getter for the read only mappings of type mappings |
| * @return the type mappings (read only) |
| */ |
| public Map<IRI,IRI> getTypeMappings() { |
| return unmodTypeMappings; |
| } |
| /** |
| * Setter for the language of labels searched in addition to the current |
| * language of the text. Setting this to <code>null</code> (also the default) |
| * will cause to search labels without any defined language.<p> |
| * Changing this makes only sense if a dataset (such as dbpedia.org) adds |
| * language tags to labels even if they are typically used in any language. |
| * @param defaultLanguage the default language |
| */ |
| public void setDefaultLanguage(String defaultLanguage) { |
| this.defaultLanguage = defaultLanguage; |
| } |
| /** |
| * Getter for the language of labels searched in addition to the current |
| * language of the text. |
| * @return the default language |
| */ |
| public String getDefaultLanguage() { |
| return defaultLanguage; |
| } |
| /** |
| * Getter for the maximum number of non-processable tokens that are |
| * allowed to not match before no further tokens are matched against a label |
| * of an Entity. <p> |
| * This allows e.g. to match "Dr. Richard Dogles" with "Dr Richard Dogles" |
| * as '.' is a non-processable token in the text that is missing in the |
| * label. |
| * @return the maxNotFound |
| */ |
| public int getMaxNotFound() { |
| return maxNotFound; |
| } |
| /** |
| * Setter for the maximum number of non-processable tokens that are |
| * allowed to not match before no further tokens are matched against a label |
| * of an Entity. <p> |
| * This allows e.g. to match "Dr. Richard Dogles" with "Dr Richard Dogles" |
| * as '.' is a non-processable token in the text that is missing in the |
| * label. |
| * @param maxNotFound the maxNotFound to set |
| */ |
| public void setMaxNotFound(int maxNotFound) { |
| if(maxNotFound < 0){ |
| this.maxNotFound = DEFAULT_MAX_NOT_FOUND; |
| } else { |
| this.maxNotFound = maxNotFound; |
| } |
| } |
| /** |
| * Getter for the minimum token match Factor. |
| * If Tokens match is determined by comparing them using some algorithm. |
| * Results need to be in the range [0..1]. This factor defines the minimum |
| * similarity value so that a match is assumed. Not that this factor only |
| * is used for filtering out non-matching tokens. The similarity value will |
| * still used for calculating the confidence |
| * @return the minTokenMatchFactor |
| */ |
| public float getMinTokenMatchFactor() { |
| return minTokenMatchFactor; |
| } |
| /** |
| * Setter for the minimum token match Factor. |
| * If Tokens match is determined by comparing them using some algorithm. |
| * Results need to be in the range [0..1]. This factor defines the minimum |
| * similarity value so that a match is assumed. Not that this factor only |
| * is used for filtering out non-matching tokens. The similarity value will |
| * still used for calculating the confidence |
| * @param minTokenMatchFactor the minTokenMatchFactor to set |
| */ |
| public void setMinTokenMatchFactor(float minTokenMatchFactor) { |
| if(minTokenMatchFactor < 0 ){ |
| this.minTokenMatchFactor = DEFAULT_MIN_TOKEN_SCORE; |
| } else if(minTokenMatchFactor == 0 || minTokenMatchFactor > 1){ |
| throw new IllegalArgumentException("minimum Token Match Facter MUST be > 0 <= 1 (parsed: "+minTokenMatchFactor+")!"); |
| } else { |
| this.minTokenMatchFactor = minTokenMatchFactor; |
| } |
| } |
| /** |
| * Getter for the maximum distance tokens are |
| * considered to be used (in addition to the currently processed on) |
| * for searches of Entities. |
| * @return the maximum search token distance |
| */ |
| public int getMaxSearchDistance() { |
| return maxSearchDistance; |
| } |
| /** |
| /** |
| * Getter for the maximum distance tokens are |
| * considered to be used (in addition to the currently processed on) |
| * for searches of Entities. |
| * @param maxSearchDistance the maximum search token distance. If |
| * values <= 0 are parsed the value is set to |
| * {@link #DEFAULT_MAX_SEARCH_DISTANCE} |
| */ |
| public void setMaxSearchDistance(int maxSearchDistance) { |
| if(maxSearchDistance <= 0){ |
| maxSearchDistance = DEFAULT_MAX_SEARCH_DISTANCE; |
| } else { |
| this.maxSearchDistance = maxSearchDistance; |
| } |
| } |
| public boolean isLemmaMatching() { |
| return lemmaMatchingState; |
| } |
| |
| public void setLemmaMatchingState(Boolean lemmaMatchingState) { |
| if(lemmaMatchingState == null){ |
| this.lemmaMatchingState = DEFAULT_LEMMA_MATCHING_STATE; |
| } else { |
| this.lemmaMatchingState = lemmaMatchingState; |
| } |
| } |
| /** |
| * The minimum LabelScore required to suggest an Entity.<p> |
| * The "Label Score" [0..1] represents how much of the |
| * Label of an Entity matches with the Text. It compares the number |
| * of Tokens of the Label with the number of Tokens matched to the |
| * Text. Not exact matches for Tokens, or if the Tokens within the |
| * label do appear in an other order than in the text do also |
| * reduce this score. |
| * @return the minimum required LabelScore |
| */ |
| public double getMinLabelScore() { |
| return minLabelScore; |
| } |
| /** |
| * Setter for the minimum label score for suggested entities |
| * @param score the score [0..1] or <code>null</code> to reset |
| * to the default. |
| */ |
| public void setMinLabelScore(Double score){ |
| if(score == null){ |
| minLabelScore = DEFAULT_MIN_LABEL_SCORE; |
| } else if(score > 1 || score < 0) { |
| throw new IllegalArgumentException("The parsed MinLabelScore '" |
| + score + "' MUST BE in the range [0..1]!"); |
| } else { |
| minLabelScore = score; |
| } |
| } |
| /** |
| * The minimum Text Score required to suggest an Entity.<p> |
| * The "Text Score" [0..1] represents how well the |
| * Label of an Entity matches to the selected Span in the Text. |
| * It compares the number of matched {@link Token} from |
| * the label with the number of Tokens enclosed by the Span |
| * in the Text an Entity is suggested for. Not exact matches |
| * for Tokens, or if the Tokens within the label do appear in |
| * an other order than in the text do also reduce this score |
| * @return the minimum required Text Score for labels of suggested |
| * Entities |
| */ |
| public double getMinTextScore() { |
| return minTextScore; |
| } |
| /** |
| * Setter for the minimum text score for suggested entities |
| * @param score the score [0..1] or <code>null</code> to reset |
| * to the default. |
| */ |
| public void setMinTextScore(Double score){ |
| if(score == null){ |
| minTextScore = DEFAULT_MIN_TEXT_SCORE; |
| } else if(score > 1 || score < 0) { |
| throw new IllegalArgumentException("The parsed MinTextScore '" |
| + score + "' MUST BE in the range [0..1]!"); |
| } else { |
| minTextScore = score; |
| } |
| } |
| /** |
| * Getter for the minimum amount of matchable {@link Token}s an Entity must match |
| * within an {@link Chunk} to be considered (see STANBOL-1211).<p> |
| * The default is <code>>0.5</code> to omit matches for a single token |
| * in a chunk - typically a noun phrase - including two words. |
| * @return the minimum chunk match score. |
| */ |
| public double getMinChunkMatchScore() { |
| return minChunkMatchScore; |
| } |
| /** |
| * Setter for the minimum amount of matchable {@link Token}s an Entity must match |
| * within an {@link Chunk} to be considered (see STANBOL-1211).<p> |
| * The default is <code>>0.5</code> to omit matches for a single token |
| * in a chunk - typically a noun phrase - including two words. |
| * @param minChunkMatchScore the minimum chunk match score or <code>null</code> |
| * to reset to the default value |
| */ |
| public void setMinChunkMatchScore(Double minChunkMatchScore) { |
| if(minChunkMatchScore == null){ |
| this.minChunkMatchScore = DEFAULT_MIN_CHUNK_MATCH_SCORE; |
| } else if(minChunkMatchScore < 0.0 || minChunkMatchScore > 1.0){ |
| throw new IllegalArgumentException("The minChunkMatchScore MUST BE " |
| + "in the range [0..1] (parsed: "+minChunkMatchScore+")!"); |
| } else { |
| this.minChunkMatchScore = minChunkMatchScore; |
| } |
| } |
| /** |
| * Getter for the minimum match Score of Entity labels against the |
| * Text.<p> |
| * This is the product of the {@link #getMinLabelScore()} with the |
| * {@link #getMinTextScore()} - meaning that this value represents |
| * both how well the label matches the text and how much of the |
| * label is matched with the text. |
| * @return |
| */ |
| public double getMinMatchScore() { |
| return minMatchScore; |
| } |
| /** |
| * Setter for the minimum text score for suggested entities |
| * @param score the score [0..1] or <code>null</code> to reset |
| * to the default. |
| */ |
| public void setMinMatchScore(Double score){ |
| if(score == null){ |
| minMatchScore = DEFAULT_MIN_MATCH_SCORE; |
| } else if(score > 1 || score < 0) { |
| throw new IllegalArgumentException("The parsed MinMatchScore '" |
| + score + "' MUST BE in the range [0..1]!"); |
| } else { |
| minMatchScore = score; |
| } |
| } |
| /** |
| * Setter for the dereference entities state. |
| * @param state the state or <code>null</code> to set the |
| * default. |
| * @deprecated Use a Dereference Engine instead (STANBOL-336) |
| */ |
| public void setDereferenceEntitiesState(Boolean state) { |
| if(state == null){ |
| this.dereferenceEntitiesState = DEFAULT_DEREFERENCE_ENTITIES_STATE; |
| } else { |
| this.dereferenceEntitiesState = state; |
| } |
| __selectedFields = null; |
| } |
| /** |
| * Getter for the dereference entities state |
| * @return <code>true</code> if enabled otherwise <code>false</code> |
| * @deprecated Use a Dereference Engine instead (STANBOL-336) |
| */ |
| public boolean isDereferenceEntitiesEnabled(){ |
| return dereferenceEntitiesState; |
| } |
| |
| /** |
| * Getter for all fields that need to be selected based on the |
| * current EntityLinker configuration. This includes<ul> |
| * <li> {@link #getNameField()} |
| * <li> {@link #getTypeField()} |
| * <li> {@link #getRedirectField()} if {@link #getRedirectProcessingMode()} |
| * != {@link RedirectProcessingMode#IGNORE} |
| * <li> {@link #getDereferencedFields()} if {@link #isDereferenceEntitiesEnabled()} |
| * </ul> |
| * @return the selected fields for queries against the linked vocabulary. |
| * @deprecated Use a Dereference Engine instead (STANBOL-336) |
| */ |
| public Set<IRI> getSelectedFields() { |
| if(__selectedFields == null){ |
| Set<IRI> fields = new HashSet<IRI>(); |
| fields.add(nameField); |
| fields.add(typeField); |
| if(redirectProcessingMode != RedirectProcessingMode.IGNORE){ |
| fields.add(redirectField); |
| } |
| if(dereferenceEntitiesState){ |
| fields.addAll(dereferencedFields); |
| } |
| __selectedFields = Collections.unmodifiableSet(fields); |
| return __selectedFields; |
| } else { |
| return __selectedFields; |
| } |
| } |
| /** |
| * If suggested entities that would have the same score (e.g. 1.0 - for a |
| * perfect match) should have their score slightly adapted so that they |
| * are sorted based on their entity ranking.<p> |
| * The entity ranking is defined as the importance (popularity, connectivity, ...) |
| * of an entity within the knowledge base |
| * @return the state |
| */ |
| public boolean isRankEqualScoresBasedOnEntityRankings() { |
| return rankEqualScoresBasedOnEntityRankings; |
| } |
| /** |
| * Setter for the state if suggested that would have the same score (e.g. 1.0 - for a |
| * perfect match) should have their score slightly adapted so that they |
| * are sorted based on their entity ranking.<p> |
| * The entity ranking is defined as the importance (popularity, connectivity, ...) |
| * of an entity within the knowledge base |
| * @param state the state |
| */ |
| public void setRankEqualScoresBasedOnEntityRankings(boolean state) { |
| this.rankEqualScoresBasedOnEntityRankings = state; |
| } |
| |
| /** |
| * getter for the state if <code>fise:entity-ranking</code> values should |
| * be added to <code>fise:EntityAnnotation</code> (if rankings are available |
| * for the linked datasets |
| * @return the write entity ranking state |
| */ |
| public boolean isWriteEntityRankings() { |
| return writeEntityRankings; |
| } |
| /** |
| * Setter for the {@link #WRITE_ENTITY_RANKINGS} state. |
| * @param writeEntityRankings the state. Parse <code>null</code> to set |
| * to the default |
| */ |
| public void setWriteEntityRankings(Boolean writeEntityRankings) { |
| if(writeEntityRankings == null){ |
| this.writeEntityRankings = DEFAULT_WRITE_ENTITY_RANKINGS; |
| } else { |
| this.writeEntityRankings = writeEntityRankings; |
| } |
| } |
| |
| |
| /** |
| * Adds an type to the blacklist |
| */ |
| public final void addBlacklistType(IRI type, Integer order) { |
| if(type != null && order != null){ |
| blacklistedTypes.put(type, order); |
| } |
| } |
| /** |
| * Adds an type to the blacklist |
| */ |
| public final void addWhitelistType(IRI type, Integer order) { |
| if(type != null && order != null){ |
| whitelistedTypes.put(type, order); |
| } |
| } |
| |
| public final void setDefaultWhitelistTypes(Boolean state){ |
| this.defaultWhitelistTypes = state; |
| } |
| |
| |
| public final boolean isDefaultWhitelistTypes(){ |
| if(Boolean.FALSE.equals(defaultWhitelistTypes) && whitelistedTypes.isEmpty()){ |
| //illegal configuration ... ignore |
| return true; |
| } else { |
| return defaultWhitelistTypes != null ? defaultWhitelistTypes.booleanValue() : |
| whitelistedTypes.isEmpty(); //if whitelist is empty ... true |
| } |
| } |
| |
| /** |
| * @param ignoredTypes the ignoredTypes to set |
| */ |
| public final Map<IRI, Integer> getBlacklistedTypes() { |
| return blacklistedTypes; |
| } |
| |
| |
| /** |
| * @param ignoredTypes the ignoredTypes to set |
| */ |
| public final Map<IRI, Integer> getWhitelistedTypes() { |
| return whitelistedTypes; |
| } |
| /** |
| * checks if EntityType filtering is active or not |
| */ |
| public final boolean isEntityTypeFilteringActive(){ |
| if(whitelistedTypes.isEmpty() && blacklistedTypes.isEmpty()){ |
| return false; |
| } else { |
| return true; |
| } |
| } |
| |
| } |