enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java - stanbol - Git at Google

 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.stanbol.enhancer.engines.entitylinking.config;

 import java.util.Collections;
 import java.util.EnumSet;
 import java.util.HashSet;
 import java.util.Set;

 import org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcher;
 import org.apache.stanbol.enhancer.nlp.model.Chunk;
 import org.apache.stanbol.enhancer.nlp.model.Token;
 import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
 import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
 import org.apache.stanbol.enhancer.nlp.pos.Pos;
 import org.apache.stanbol.enhancer.nlp.pos.PosTag;

 /**
  * Mutable holder for the language specific text processing configuration of
  * the EntityLinking engine: which POS/phrase annotations are linked, matched
  * or treated as chunkable and the probabilities required for them.
  * <p>
  * All setters that accept a {@link Set} store a copy of the parsed set.
  * Getters return the set held by this instance (not a copy), so changes made
  * through a getter only affect this instance. Instances are not synchronised.
  */
 public class LanguageProcessingConfig implements Cloneable{

     /**
      * The linked Phrase types. Includes {@link LexicalCategory#Noun} phrases
      */
     public static final Set<LexicalCategory> DEFAULT_PROCESSED_PHRASE_CATEGORIES =
             EnumSet.of(LexicalCategory.Noun);
     /**
      * The default set of {@link LexicalCategory LexicalCategories} used to
      * lookup (link) Entities within the {@link EntitySearcher}
      */
     public static final Set<LexicalCategory> DEFAULT_LINKED_LEXICAL_CATEGORIES =
             EnumSet.of(LexicalCategory.Noun, LexicalCategory.Residual);

     /**
      * The default set of {@link LexicalCategory LexicalCategories} used to
      * match (and search) for Entities.<p>
      * Matched Tokens are not used for linking, but are considered when matching
      * label tokens of Entities with the Text.
      */
     public static final Set<LexicalCategory> DEFAULT_MATCHED_LEXICAL_CATEGORIES =
             EnumSet.of(LexicalCategory.Noun, LexicalCategory.Quantifier,LexicalCategory.Residual);

     /**
      * The default set of {@link Pos} types that are used to lookup (link) Entities.
      * By default only {@link Pos#ProperNoun}s and two
      * {@link LexicalCategory#Residual} types: {@link Pos#Acronym acronyms} and
      * {@link Pos#Foreign words marked as foreign material}.
      */
     public static final Set<Pos> DEFAULT_LINKED_POS =
             EnumSet.of(Pos.ProperNoun, Pos.Foreign, Pos.Acronym);

     /**
      * Default value for POS annotation confidence required for processed POS tags.
      * Used for <ul>
      * <li> {@link #getLinkedLexicalCategories()}
      * <li> {@link #getLinkedPosTags()} and
      * <li> {@link #getMatchedLexicalCategories()}
      * </ul>
      */
     public static final double DEFAULT_MIN_POS_ANNOTATION_PROBABILITY = 0.75;

     /**
      * Default {@link LexicalCategory LexicalCategories} that allow the EntityLinker
      * to step-over non matchable tokens when determining search tokens for
      * Entityhub lookups (Defaults: {@link LexicalCategory#Noun},
      * {@link LexicalCategory#Punctuation} and {@link LexicalCategory#Conjuction}).
      */
     public static final Set<LexicalCategory> DEFAULT_CHUNKABLE_CATEGORIES = EnumSet.of(
         LexicalCategory.Noun, LexicalCategory.Punctuation, LexicalCategory.Conjuction);

     /**
      * Default {@link Pos} tags that allow the EntityLinker to step-over non matchable
      * tokens when determining search tokens for Entityhub lookups
      * (default: {@link Pos#Preposition}).
      */
     private static final Set<Pos> DEFAULT_CHUNKABLE_POS = EnumSet.of(Pos.Preposition);
     /**
      * Default string tags that allow the EntityLinker to step-over non matchable
      * tokens when determining search tokens for Entityhub lookups (default: empty).
      */
     private static final Set<String> DEFAULT_CHUNKABKE_TAGS = Collections.emptySet();

     /**
      * Default value for POS annotation confidence required for not-processed POS tags
      * (not contained in both {@link #getLinkedLexicalCategories()} and
      * {@link #getLinkedPosTags()}). <br> The default is
      * <code>{@link #DEFAULT_MIN_POS_ANNOTATION_PROBABILITY}/2</code>
      */
     public static final double DEFAULT_MIN_EXCLUDE_POS_ANNOTATION_PROBABILITY = DEFAULT_MIN_POS_ANNOTATION_PROBABILITY/2;

     /**
      * By default {@link Chunk}s are considered
      */
     public static final boolean DEFAULT_IGNORE_CHUNK_STATE = false;
     /**
      * the minimum probability so that a phrase is processed based on the Phrase Annotation
      */
     public static final double DEFAULT_MIN_PHRASE_ANNOTATION_PROBABILITY = 0.75;
     /**
      * the minimum probability so that a phrase is rejected based on the Phrase Annotation
      */
     public static final double DEFAULT_MIN_EXCLUDE_PHRASE_ANNOTATION_PROBABILITY =
             DEFAULT_MIN_PHRASE_ANNOTATION_PROBABILITY/2;
     /**
      * The default for linking upper case tokens (regardless of length and POS)
      * The default is <code>false</code> as some languages (like German) use upper
      * case for Nouns and so this would also affect configurations that only
      * link {@link Pos#ProperNoun}s
      */
     public static final boolean DEFAULT_LINK_UPPER_CASE_TOKEN_STATE = false;
     /**
      * The default for matching upper case tokens (regardless of length and POS)
      * is <code>true</code>
      */
     public static final boolean DEFAULT_MATCH_UPPER_CASE_TOKEN_STATE = true;
     /**
      * By default linking of chunks with multiple matchable tokens is enabled.
      * This is useful to link Entities represented by two common nouns.
      */
     public static final boolean DEFAULT_LINK_MULTIPLE_MATCHABLE_TOKENS_IN_CHUNKS_STATE = true;

     /**
      * The set of {@link PosTag#getCategory()} considered for EntityLinking.
      * Initialised with a private copy of the defaults so that changes to this
      * instance can not alter the shared default constant.
      * @see #DEFAULT_LINKED_LEXICAL_CATEGORIES
      */
     private Set<LexicalCategory> linkedLexicalCategories =
             copyEnumSet(LexicalCategory.class, DEFAULT_LINKED_LEXICAL_CATEGORIES);

     /**
      * The set of {@link PosTag#getCategory()} considered for matching
      * @see #DEFAULT_MATCHED_LEXICAL_CATEGORIES
      */
     private Set<LexicalCategory> matchedLexicalCategories =
             copyEnumSet(LexicalCategory.class, DEFAULT_MATCHED_LEXICAL_CATEGORIES);

     /**
      * The linked {@link Pos} categories
      */
     private Set<Pos> linkedPos = copyEnumSet(Pos.class, DEFAULT_LINKED_POS);
     /**
      * The set of {@link PosTag#getTag()} values that are processed
      */
     private Set<String> linkedPosTags = Collections.emptySet();
     /**
      * The minimum confidence of POS annotations for {@link #getLinkedLexicalCategories()}
      * and {@link #getLinkedPosTags()}
      */
     private double minPosAnnotationProbability = DEFAULT_MIN_POS_ANNOTATION_PROBABILITY;

     /**
      * The minimum confidence of POS annotations that are not contained in
      * {@link #getLinkedLexicalCategories()} nor {@link #getLinkedPosTags()}
      * @see #DEFAULT_MIN_EXCLUDE_POS_ANNOTATION_PROBABILITY
      */
     private double minExcludePosAnnotationProbability = DEFAULT_MIN_EXCLUDE_POS_ANNOTATION_PROBABILITY;

     private boolean ignoreChunksState = DEFAULT_IGNORE_CHUNK_STATE;

     private Set<LexicalCategory> chunkableCategories =
             copyEnumSet(LexicalCategory.class, DEFAULT_CHUNKABLE_CATEGORIES);
     private Set<Pos> chunkablePos = copyEnumSet(Pos.class, DEFAULT_CHUNKABLE_POS);
     // the default is the immutable empty set and can therefore be shared
     private Set<String> chunkableTags = DEFAULT_CHUNKABKE_TAGS;

     private double minPhraseAnnotationProbability = DEFAULT_MIN_PHRASE_ANNOTATION_PROBABILITY;

     private double minExcludePhraseAnnotationProbability = DEFAULT_MIN_EXCLUDE_PHRASE_ANNOTATION_PROBABILITY;

     private Set<LexicalCategory> processedPhraseCategories =
             copyEnumSet(LexicalCategory.class, DEFAULT_PROCESSED_PHRASE_CATEGORIES);

     private Set<String> processedPhraseTags = Collections.emptySet();
     /**
      * If upper case tokens are linked (and matched)
      */
     private boolean linkUpperCaseTokensState = DEFAULT_LINK_UPPER_CASE_TOKEN_STATE;
     /**
      * If upper case tokens are matched
      */
     private boolean matchUpperCaseTokensState = DEFAULT_MATCH_UPPER_CASE_TOKEN_STATE;
     /**
      * If for {@link Chunk}s with multiple matchable Tokens those should be
      * linked.
      */
     private boolean linkMultiMatchableTokensInChunkState = DEFAULT_LINK_MULTIPLE_MATCHABLE_TOKENS_IN_CHUNKS_STATE;
     /**
      * Minimum token length for lookups; no explicit default is assigned
      * (so the initial value is <code>0</code>).
      */
     private int minSearchTokenLength;
     /**
      * If only upper case tokens without POS tag are linkable; no explicit
      * default is assigned (so the initial value is <code>false</code>).
      */
     private boolean linkOnlyUpperCaseTokenWithUnknownPos;


     /**
      * Creates a configuration where every property is initialised with
      * its default value.
      */
     public LanguageProcessingConfig(){
     }

     /**
      * Creates a new mutable {@link EnumSet} holding the elements of the
      * parsed set. {@link EnumSet#copyOf(java.util.Collection)} is not used
      * as it rejects empty collections that are not EnumSets.
      * @param type the enum type
      * @param source the elements to copy. MUST NOT be <code>null</code>
      * @return the copy
      * @throws NullPointerException if the source contains <code>null</code>
      */
     private static <E extends Enum<E>> Set<E> copyEnumSet(Class<E> type, Set<E> source){
         Set<E> copy = EnumSet.noneOf(type);
         copy.addAll(source);
         return copy;
     }

     /**
      * Creates a new mutable copy of the parsed set of string tags.
      * @param source the tags to copy. MUST NOT be <code>null</code>
      * @return the copy
      */
     private static Set<String> copyTags(Set<String> source){
         return new HashSet<String>(source);
     }

     /**
      * Validates that the parsed set does not contain <code>null</code>.
      * Iterates instead of calling <code>contains(null)</code> because null-hostile
      * sets throw a {@link NullPointerException} for that call.
      * @param elements the set to check. MUST NOT be <code>null</code>
      * @param message the message of the exception thrown on a <code>null</code> element
      * @throws IllegalArgumentException if a <code>null</code> element is present
      */
     private static void requireNoNullElements(Set<?> elements, String message){
         for(Object element : elements){
             if(element == null){
                 throw new IllegalArgumentException(message);
             }
         }
     }

     /**
      * Getter for the ignore {@link Chunk} state.
      * @return <code>true</code> if {@link Chunk}s are ignored
      */
     public final boolean isIgnoreChunks() {
         return ignoreChunksState;
     }

     /**
      * Setter for the ignore {@link Chunk} state.
      * @param state the state or <code>null</code> to set the
      * {@link #DEFAULT_IGNORE_CHUNK_STATE}
      */
     public final void setIgnoreChunksState(Boolean state){
         if(state == null){
             this.ignoreChunksState = DEFAULT_IGNORE_CHUNK_STATE;
         } else {
             this.ignoreChunksState = state;
         }
     }

     /**
      * Getter for the set of {@link LexicalCategory LexicalCategories} used
      * to link Entities in the configured Vocabulary.
      * @return the set of {@link LexicalCategory LexicalCategories} used
      * for linking.
      * @see #DEFAULT_LINKED_LEXICAL_CATEGORIES
      */
     public final Set<LexicalCategory> getLinkedLexicalCategories() {
         return linkedLexicalCategories;
     }
     /**
      * Getter for the set of {@link LexicalCategory LexicalCategories} used
      * to match label tokens of suggested Entities.
      * @return the set of {@link LexicalCategory LexicalCategories} used for
      * matching
      */
     public final Set<LexicalCategory> getMatchedLexicalCategories(){
         return matchedLexicalCategories;
     }
     /**
      * Setter for the matched lexical categories. The parsed set is copied.
      * @param matchedLexicalCategories the set or <code>null</code>
      * to set the {@link #DEFAULT_MATCHED_LEXICAL_CATEGORIES}
      */
     public void setMatchedLexicalCategories(Set<LexicalCategory> matchedLexicalCategories) {
         if(matchedLexicalCategories == null){
             this.matchedLexicalCategories =
                     copyEnumSet(LexicalCategory.class, DEFAULT_MATCHED_LEXICAL_CATEGORIES);
         } else {
             this.matchedLexicalCategories =
                     copyEnumSet(LexicalCategory.class, matchedLexicalCategories);
         }
     }
     /**
      * The set of tags used for linking. This is useful if the string tags
      * used by the POS tagger are not mapped to {@link LexicalCategory} nor
      * {@link Pos} enum members.
      * @return the set of pos tags used for linking entities
      */
     public final Set<String> getLinkedPosTags() {
         return linkedPosTags;
     }

     /**
      * Getter for the minimum probability of POS annotations for
      * {@link #getLinkedLexicalCategories()} or {@link #getLinkedPosTags()}
      * @return the probability
      */
     public final double getMinPosAnnotationProbability() {
         return minPosAnnotationProbability ;
     }


     /**
      * Getter for the minimum probability of POS annotations not included in
      * {@link #getLinkedLexicalCategories()} or {@link #getLinkedPosTags()}
      * @return the probability
      */
     public final double getMinExcludePosAnnotationProbability() {
         return minExcludePosAnnotationProbability;
     }

     /**
      * Setter for the minimum probability of POS annotations for
      * {@link #getLinkedLexicalCategories()} or {@link #getLinkedPosTags()}
      * @param minPosAnnotationProbability the probability or <code>null</code> to set
      * {@value #DEFAULT_MIN_POS_ANNOTATION_PROBABILITY}
      * @throws IllegalArgumentException if the parsed value is not in the
      * range [0..1]
      */
     public final void setMinPosAnnotationProbability(Double minPosAnnotationProbability) {
         if(minPosAnnotationProbability == null){
             this.minPosAnnotationProbability = DEFAULT_MIN_POS_ANNOTATION_PROBABILITY;
         } else if(minPosAnnotationProbability >= 0 && minPosAnnotationProbability <= 1) {
             this.minPosAnnotationProbability = minPosAnnotationProbability;
         } else {
             throw new IllegalArgumentException("parsed value MUST BE in the range 0..1 or NULL to set the default");
         }
     }

     /**
      * Setter for the minimum probability of POS annotations not included in
      * {@link #getLinkedLexicalCategories()} or {@link #getLinkedPosTags()}
      * @param minExcludePosAnnotationProbability the probability or <code>null</code> to set
      * {@value #DEFAULT_MIN_EXCLUDE_POS_ANNOTATION_PROBABILITY}
      * @throws IllegalArgumentException if the parsed value is not in the
      * range [0..1]
      */
     public final void setMinExcludePosAnnotationProbability(Double minExcludePosAnnotationProbability){
         if(minExcludePosAnnotationProbability == null){
             this.minExcludePosAnnotationProbability = DEFAULT_MIN_EXCLUDE_POS_ANNOTATION_PROBABILITY;
         } else if(minExcludePosAnnotationProbability >= 0 && minExcludePosAnnotationProbability <= 1) {
             this.minExcludePosAnnotationProbability = minExcludePosAnnotationProbability;
         } else {
             throw new IllegalArgumentException("parsed value MUST BE in the range 0..1 or NULL to set the default");
         }
     }
     /**
      * Setter for the linked {@link LexicalCategory LexicalCategories}.
      * The parsed set is copied.
      * @param linkedLexicalCategories the set or <code>null</code> to set
      * the {@link #DEFAULT_LINKED_LEXICAL_CATEGORIES}.
      * @throws IllegalArgumentException if the parsed set contains the
      * <code>null</code> element
      */
     public final void setLinkedLexicalCategories(Set<LexicalCategory> linkedLexicalCategories) {
         if(linkedLexicalCategories == null){
             this.linkedLexicalCategories =
                     copyEnumSet(LexicalCategory.class, DEFAULT_LINKED_LEXICAL_CATEGORIES);
         } else {
             requireNoNullElements(linkedLexicalCategories,
                 "The parsed set with linked LexicalCategories MUST NOT contain the NULL element!");
             this.linkedLexicalCategories =
                     copyEnumSet(LexicalCategory.class, linkedLexicalCategories);
         }
     }
     /**
      * Setter for the linked {@link Pos} types. The parsed set is copied.
      * @param linkedPos the set of linked {@link Pos} types or <code>null</code>
      * to set the {@link #DEFAULT_LINKED_POS} types
      * @throws IllegalArgumentException if the parsed set contains the
      * <code>null</code> element
      */
     public final void setLinkedPos(Set<Pos> linkedPos) {
         if(linkedPos == null){
             this.linkedPos = copyEnumSet(Pos.class, DEFAULT_LINKED_POS);
         } else {
             requireNoNullElements(linkedPos,
                 "The parsed set with linked Pos types MUST NOT contain the NULL element!");
             this.linkedPos = copyEnumSet(Pos.class, linkedPos);
         }
     }
     /**
      * Setter for the linked Pos Tags. This should only be used if the
      * used POS tagger uses {@link PosTag}s that are not mapped to
      * {@link LexicalCategory LexicalCategories} nor {@link Pos} types.
      * The parsed set is copied.
      * @param processedPosTags the linked Pos tags. if <code>null</code>
      * the value is set to an empty set.
      * @throws IllegalArgumentException if the parsed set contains the
      * <code>null</code> element
      */
     public final void setLinkedPosTags(Set<String> processedPosTags) {
         if(processedPosTags == null){
             this.linkedPosTags = Collections.emptySet();
         } else {
             requireNoNullElements(processedPosTags,
                 "The parsed set with processed POS tags MUST NOT contain the NULL element!");
             this.linkedPosTags = copyTags(processedPosTags);
         }
     }
     /**
      * Getter for the processed phrase categories.
      * {@link Chunk}s of other types will be ignored.
      * @return the processed phrase categories
      */
     public Set<LexicalCategory> getProcessedPhraseCategories() {
         return processedPhraseCategories;
     }
     /**
      * Setter for the processable phrase categories. The parsed set is copied.
      * @param processablePhraseCategories the processable categories or
      * <code>null</code> to set the {@link #DEFAULT_PROCESSED_PHRASE_CATEGORIES}.
      */
     public void setProcessedPhraseCategories(Set<LexicalCategory> processablePhraseCategories){
         if(processablePhraseCategories == null){
             this.processedPhraseCategories =
                     copyEnumSet(LexicalCategory.class, DEFAULT_PROCESSED_PHRASE_CATEGORIES);
         } else {
             this.processedPhraseCategories =
                     copyEnumSet(LexicalCategory.class, processablePhraseCategories);
         }
     }
     /**
      * Getter for the processed phrase Tags. This should be only
      * used if the {@link PhraseTag}s used by the Chunker are not
      * mapped to {@link LexicalCategory LexicalCategories}.
      * @return the processed phrase tags
      */
     public Set<String> getProcessedPhraseTags() {
         return processedPhraseTags;
     }
     /**
      * Setter for the Processed Phrase Tags. The parsed set is copied.
      * @param processedPhraseTags the set with the tags. If <code>null</code>
      * the value is set to an empty set.
      */
     public void setProcessedPhraseTags(Set<String> processedPhraseTags) {
         if(processedPhraseTags == null || processedPhraseTags.isEmpty()){
             this.processedPhraseTags = Collections.emptySet();
         } else {
             this.processedPhraseTags = new HashSet<String>(processedPhraseTags);
         }
     }
     /**
      * Getter for the minimum required probability so that {@link PhraseTag}s
      * are accepted.
      * @return the probability [0..1)
      */
     public double getMinPhraseAnnotationProbability() {
         return minPhraseAnnotationProbability;
     }
     /**
      * Getter for the minimum required probability so that {@link PhraseTag}s
      * are considered for rejecting (e.g. to skip a VerbPhrase if
      * {@link LexicalCategory#Verb} is not present in
      * {@link #getProcessedPhraseCategories()}). Typically this value is
      * lower than {@link #getMinPhraseAnnotationProbability()}
      * @return the probability [0..1)
      */
     public double getMinExcludePhraseAnnotationProbability() {
         return minExcludePhraseAnnotationProbability;
     }
     /**
      * Setter for the minimum phrase annotation probability [0..1)
      * @param prob the probability [0..1) or <code>null</code> to set
      * the {@value #DEFAULT_MIN_PHRASE_ANNOTATION_PROBABILITY}
      * @throws IllegalArgumentException if the parsed value is not
      * in the range [0..1). This includes <code>NaN</code>.
      */
     public void setMinPhraseAnnotationProbability(Double prob) {
         if(prob == null){
             this.minPhraseAnnotationProbability = DEFAULT_MIN_PHRASE_ANNOTATION_PROBABILITY;
         } else if (!(prob >= 0 && prob < 1)){
             // negated in-range test: every comparison with NaN is false, so
             // NaN is rejected here (it would slip through "prob >= 1 || prob < 0")
             throw new IllegalArgumentException("The parsed minimum phrase annotation probability '"
                 + prob +" MUST be in the range [0..1)!");
         } else {
             this.minPhraseAnnotationProbability = prob;
         }
     }

     /**
      * Setter for the minimum excluded phrase annotation probability [0..1)
      * @param prob the probability [0..1) or <code>null</code> to set
      * the {@value #DEFAULT_MIN_EXCLUDE_PHRASE_ANNOTATION_PROBABILITY}
      * @throws IllegalArgumentException if the parsed value is not
      * in the range [0..1). This includes <code>NaN</code>.
      */
     public void setMinExcludePhraseAnnotationProbability(Double prob) {
         if(prob == null){
             this.minExcludePhraseAnnotationProbability = DEFAULT_MIN_EXCLUDE_PHRASE_ANNOTATION_PROBABILITY;
         } else if (!(prob >= 0 && prob < 1)){
             // negated in-range test so that NaN is rejected as well
             throw new IllegalArgumentException("The parsed minimum exclude phrase annotation probability '"
                 + prob +" MUST be in the range [0..1)!");
         } else {
             this.minExcludePhraseAnnotationProbability = prob;
         }
     }
     /**
      * Getter for the set of {@link Pos} types used for linking Entities
      * @return the linked {@link Pos} types
      */
     public Set<Pos> getLinkedPos() {
         return linkedPos;
     }

     /**
      * If upper case Tokens should be linked regardless
      * of the POS type and length
      * @return the state
      */
     public boolean isLinkUpperCaseTokens(){
         return linkUpperCaseTokensState;
     }
     /**
      * Setter for the state if upper case token should be
      * linked regardless of the POS type and length
      * @param linkUpperCaseTokensState the state or <code>null</code>
      * to set the {@link #DEFAULT_LINK_UPPER_CASE_TOKEN_STATE}
      */
     public void setLinkUpperCaseTokensState(Boolean linkUpperCaseTokensState) {
         if(linkUpperCaseTokensState == null){
             this.linkUpperCaseTokensState = DEFAULT_LINK_UPPER_CASE_TOKEN_STATE;
         } else {
             this.linkUpperCaseTokensState = linkUpperCaseTokensState;
         }
     }
     /**
      * If upper case Tokens should be matched regardless
      * of the POS type and length
      * @return the state
      */
     public boolean isMatchUpperCaseTokens(){
         return matchUpperCaseTokensState;
     }
     /**
      * Setter for the state if upper case token should be
      * matched regardless of the POS type and length
      * @param matchUpperCaseTokensState the state or <code>null</code>
      * to set the {@link #DEFAULT_MATCH_UPPER_CASE_TOKEN_STATE}
      */
     public void setMatchUpperCaseTokensState(Boolean matchUpperCaseTokensState) {
         if(matchUpperCaseTokensState == null){
             this.matchUpperCaseTokensState = DEFAULT_MATCH_UPPER_CASE_TOKEN_STATE;
         } else {
             this.matchUpperCaseTokensState = matchUpperCaseTokensState;
         }
     }
     /**
      * If {@link #isIgnoreChunks()} is disabled then this allows
      * to convert matchable {@link Token}s to linked one in
      * case a {@link Chunk} contains more than one matchable
      * Token. <p>
      * This is especially useful in cases where only
      * {@link Pos#ProperNoun}s are processed to also detect
      * Entities that are named by using multiple Common Nouns.
      * In cases where all {@link LexicalCategory#Noun}s are
      * processed this option has usually no influence on the
      * results.
      * @return the state
      */
     public boolean isLinkMultiMatchableTokensInChunk() {
         return linkMultiMatchableTokensInChunkState;
     }
     /**
      * Setter for state if for {@link Chunk}s with multiple
      * matchable {@link Token}s those Tokens should be treated
      * as linkable.<p>
      * This is especially useful in cases where only
      * {@link Pos#ProperNoun}s are linked to also detect
      * Entities that are named by using multiple Common Nouns.
      * In cases where all {@link LexicalCategory#Noun}s are
      * processed this option has usually no influence on the
      * results.
      * @param state the state or <code>null</code> to reset to the
      * {@link #DEFAULT_LINK_MULTIPLE_MATCHABLE_TOKENS_IN_CHUNKS_STATE default}
      */
     public void setLinkMultiMatchableTokensInChunkState(Boolean state){
         if(state == null){
             this.linkMultiMatchableTokensInChunkState = DEFAULT_LINK_MULTIPLE_MATCHABLE_TOKENS_IN_CHUNKS_STATE;
         } else {
             this.linkMultiMatchableTokensInChunkState = state;
         }
     }
     /**
      * The minimum number of character a {@link Token} (word) must have to be
      * used {@link EntitySearcher#lookup(java.util.List, String...) lookup} concepts
      * in the taxonomy. Note that this parameter is only used if no POS (Part-
      * of-speech) tags are available in the {@code AnalysedText}.
      * @param minSearchTokenLength the minSearchTokenLength to set
      */
     public void setMinSearchTokenLength(int minSearchTokenLength) {
         this.minSearchTokenLength = minSearchTokenLength;
     }

     /**
      * The minimum number of character a {@link Token} (word) must have to be
      * used {@link EntitySearcher#lookup(java.util.List, String...) lookup} concepts
      * in the taxonomy. Note that this parameter is only used if no POS (Part-
      * of-speech) tags are available in the {@code AnalysedText}.
      * @return the minSearchTokenLength
      */
     public int getMinSearchTokenLength() {
         return minSearchTokenLength;
     }

     /**
      * This returns the state if only upper case tokens should be marked as
      * 'linkable' if they do not have a POS tag
      * @return the state
      */
     public boolean isLinkOnlyUpperCaseTokensWithUnknownPos(){
         return linkOnlyUpperCaseTokenWithUnknownPos;
     }

     /**
      * This sets the state if only upper case tokens should be marked as
      * 'linkable' if they do not have a POS tag
      * @param linkOnlyUpperCaseTokenWithUnknownPos the state
      */
     public void setLinkOnlyUpperCaseTokenWithUnknownPos(boolean linkOnlyUpperCaseTokenWithUnknownPos) {
         this.linkOnlyUpperCaseTokenWithUnknownPos = linkOnlyUpperCaseTokenWithUnknownPos;
     }

     /**
      * Getter for the chunkable {@link LexicalCategory LexicalCategories}. Those
      * allow the EntityLinker to step-over non matchable tokens when determining
      * search tokens for Entityhub lookups.
      * @return the {@link LexicalCategory LexicalCategories} considered as chunkable
      */
     public Set<LexicalCategory> getChunkableCategories(){
         return chunkableCategories;
     }

     /**
      * Setter for the chunkable {@link LexicalCategory LexicalCategories}. Those
      * allow the EntityLinker to step-over non matchable tokens when determining
      * search tokens for Entityhub lookups. The parsed set is copied.
      * @param categories The list of {@link LexicalCategory LexicalCategories}
      * considered as chunkable or <code>null</code> to reset to the default
      * @throws IllegalArgumentException if the parsed set contains the
      * <code>null</code> element
      */
     public void setChunkableCategories(Set<LexicalCategory> categories){
         if(categories == null){
             this.chunkableCategories =
                     copyEnumSet(LexicalCategory.class, DEFAULT_CHUNKABLE_CATEGORIES);
         } else {
             requireNoNullElements(categories,
                 "The parsed set with chunkable LexicalCategories MUST NOT contain the NULL element!");
             this.chunkableCategories = copyEnumSet(LexicalCategory.class, categories);
         }
     }

     /**
      * Setter for the {@link Pos} tags considered by the EntityLinker to step-over
      * non matchable tokens when determining search tokens for Entityhub lookups.
      * The parsed set is copied.
      * @param pos The list of {@link Pos} tags considered as chunkable or
      * <code>null</code> to reset to the default
      * @throws IllegalArgumentException if the parsed set contains the
      * <code>null</code> element
      */
     public void setChunkablePos(Set<Pos> pos){
         if(pos == null){
             this.chunkablePos = copyEnumSet(Pos.class, DEFAULT_CHUNKABLE_POS);
         } else {
             requireNoNullElements(pos,
                 "The parsed set with chunkable Pos types MUST NOT contain the NULL element!");
             this.chunkablePos = copyEnumSet(Pos.class, pos);
         }
     }

     /**
      * Setter for the String tags considered by the EntityLinker to step-over
      * non matchable tokens when determining search tokens for Entityhub lookups.
      * The parsed set is copied.
      * @param tags The list of String tags considered as chunkable or
      * <code>null</code> to reset to the default
      */
     public void setChunkableTags(Set<String> tags){
         if(tags == null){
             this.chunkableTags = DEFAULT_CHUNKABKE_TAGS;
         } else {
             this.chunkableTags = copyTags(tags);
         }
     }
     /**
      * Getter for the {@link Pos} tags considered by the EntityLinker to step-over
      * non matchable tokens when determining search tokens for Entityhub lookups
      * @return the {@link Pos} tags considered as chunkable
      */
     public Set<Pos> getChunkablePos(){
         return chunkablePos;
     }

     /**
      * Getter for the String tags considered by the EntityLinker to step-over
      * non matchable tokens when determining search tokens for Entityhub lookups
      * @return the String tags considered as chunkable
      */
     public Set<String> getChunkableTags(){
         return chunkableTags;
     }

     /**
      * Clones the {@link LanguageProcessingConfig}. Intended to be used
      * to create language specific configs based on the default one.
      * <p>
      * Every set is copied, so that later changes to the returned config
      * do not affect this instance (and vice versa).
      * @return a new, independent config with the same values as this one
      */
     @Override
     public LanguageProcessingConfig clone() {
         LanguageProcessingConfig c = new LanguageProcessingConfig();
         // primitive state
         c.ignoreChunksState = ignoreChunksState;
         c.minExcludePhraseAnnotationProbability = minExcludePhraseAnnotationProbability;
         c.minExcludePosAnnotationProbability = minExcludePosAnnotationProbability;
         c.minPhraseAnnotationProbability = minPhraseAnnotationProbability;
         c.minPosAnnotationProbability = minPosAnnotationProbability;
         c.linkUpperCaseTokensState = linkUpperCaseTokensState;
         c.matchUpperCaseTokensState = matchUpperCaseTokensState;
         c.linkMultiMatchableTokensInChunkState = linkMultiMatchableTokensInChunkState;
         c.minSearchTokenLength = minSearchTokenLength;
         c.linkOnlyUpperCaseTokenWithUnknownPos = linkOnlyUpperCaseTokenWithUnknownPos;
         // sets are copied so that the clone does not share mutable state
         c.linkedLexicalCategories = copyEnumSet(LexicalCategory.class, linkedLexicalCategories);
         c.matchedLexicalCategories = copyEnumSet(LexicalCategory.class, matchedLexicalCategories);
         c.processedPhraseCategories = copyEnumSet(LexicalCategory.class, processedPhraseCategories);
         c.chunkableCategories = copyEnumSet(LexicalCategory.class, chunkableCategories);
         c.linkedPos = copyEnumSet(Pos.class, linkedPos);
         c.chunkablePos = copyEnumSet(Pos.class, chunkablePos);
         c.processedPhraseTags = copyTags(processedPhraseTags);
         c.linkedPosTags = copyTags(linkedPosTags);
         c.chunkableTags = copyTags(chunkableTags);
         return c;
     }


 }