enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/IndexConfiguration.java - stanbol - Git at Google

 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.stanbol.enhancer.engines.lucenefstlinking;

 import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.locks.ReadWriteLock;
 import java.util.concurrent.locks.ReentrantReadWriteLock;

 import org.apache.clerezza.rdf.core.Literal;
 import org.apache.clerezza.rdf.core.Resource;
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.FilenameUtils;
 import org.apache.commons.io.filefilter.WildcardFileFilter;
 import org.apache.commons.lang.StringUtils;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.AtomicReader;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.solr.core.SolrCore;
 import org.apache.solr.schema.FieldType;
 import org.apache.solr.schema.IndexSchema;
 import org.apache.solr.search.SolrIndexSearcher;
 import org.apache.solr.util.RefCounted;
 import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider;
 import org.apache.stanbol.enhancer.engines.lucenefstlinking.cache.EntityCacheManager;
 import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
 import org.opensextant.solrtexttagger.TaggerFstCorpus;
 import org.opensextant.solrtexttagger.UnsupportedTokenException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 /**
  * Holds the configuration of the index used by the FST linking engine.
  *
  * @author Rupert Westenthaler
  *
  */
 public class IndexConfiguration {

     private static final Logger log = LoggerFactory.getLogger(IndexConfiguration.class);

     /**
      * The {@link SolrCore} this configuration is built for. Set by the
      * constructor, which rejects <code>null</code> and closed cores.
      */
     private final SolrCore index;
     /**
      * The name of the type field. Stored in its encoded form
      * (see {@link #setTypeField(String)}).
      */
     private String typeField;

     /**
      * The name of the redirect field. Stored in its encoded form
      * (see {@link #setRedirectField(String)}).
      */
     private String redirectField;
     /**
      * The name of the entityRanking field. Stored in its encoded form
      * (see {@link #setRankingField(String)}).
      */
     private String rankingField;

     /**
      * Used to sync access to {@link #corpusInfos}
      */
     private ReadWriteLock corpusInfoLock = new ReentrantReadWriteLock();
     /**
      * FST corpus configuration: maps the language to the {@link CorpusInfo}
      * for that language. This is <code>null</code> until the FST
      * configuration is processed for the first time.
      */
     private Map<String,CorpusInfo> corpusInfos;
     /**
      * {@link ExecutorService} used to create {@link TaggerFstCorpus} instances
      * at runtime.
      */
     protected ExecutorService executorService;
     /**
      * The encoding used by SolrFields (e.g. to define label fields for different
      * languages). Never <code>null</code>: the constructor falls back to
      * {@link FieldEncodingEnum#None} if no encoding is parsed.
      */
     private final FieldEncodingEnum fieldEncoding;
     /**
      * The instance used to retrieve/create the cache for Lucene {@link Document}s
      * of Entities.
      */
     private EntityCacheManager entityCacheManager;

     /**
      * The FST language configuration as parsed to the constructor
      * (never <code>null</code>).
      */
     private final LanguageConfiguration fstConfig;

     /**
      * If runtime generation is enabled by default (Note: explicitly configured
      * languages might override this)
      */
     private final boolean runtimeGeneration;

     /**
      * used to track if this index configuration is active. Set to
      * <code>true</code> by {@link #activate()} and to <code>false</code> by
      * {@link #deactivate()}.
      */
     private boolean active = false;

     /**
      * The version of the Solr index the FST configuration was last processed
      * for. <code>-1</code> as long as no index version was processed.
      */
     private long indexVersion = -1;

     /**
      * The directory holding the FST files. If still <code>null</code> when
      * {@link #activate()} is called the '<code>fst</code>' folder within the
      * data directory of the {@link #index} is used.
      */
     private File fstDirectory;

     /**
      * The origin is added to <code>fise:TextAnnotation</code> created for
      * linked Entities. It is intended to be used for providing a reference to
      * dataset of the Entity. Both {@link UriRef URI}s and {@link Literal}s can
      * be used here
      */
     private Resource origin;

     /**
      * If alternate tokens (<code>posInc == 0</code>) can be skipped or if such
      * tokens should cause an {@link UnsupportedTokenException}. Initialized by
      * the constructor to <code>true</code> for
      * {@link FieldEncodingEnum#SolrYard} and <code>false</code> otherwise.
      */
     private boolean skipAltTokens;

     /**
      * The default language. Never <code>null</code>: the constructor uses the
      * empty String if no default language is parsed.
      */
     private String defaultLanguage;
     /**
      * If alternate tokens (<code>posInc == 0</code>) can be skipped or if such
      * tokens should cause an {@link UnsupportedTokenException}.
      * <p>
      * Enabling this allows to use FST linking with query time Lucene
      * {@link Analyzer}s that emit alternate tokens (e.g. the Kuromoji analyzers
      * for Japanese), but it also requires special care with index time
      * {@link Analyzer} configurations. If enabled the index time analyzer MUST
      * produce all possible tokens emitted by the query time analyzer, as only if
      * all such combinations are added to the FST model skipped alternate
      * tokens can not prevent mentions from being detected.
      * <p>
      * By default <code>skipAltTokens</code> is enabled for
      * {@link FieldEncodingEnum#SolrYard} and deactivated for all other field
      * encoding settings. This is because all Solr <code>schema.xml</code> used
      * by the Stanbol Entityhub SolrYard ensure the requirement stated above.
      * For other Solr configurations users will need to explicitly activate this.
      */
     public static final String SKIP_ALT_TOKENS = "enhancer.engines.linking.lucenefst.skipAltTokens";

     /**
      * Property used to configure the FieldName encoding of the SolrIndex. This
      * is mainly needed for label fields of different languages (e.g. by using
      * the iso language code as prefix/suffix of Solr fields). However this also
      * adds support for SolrIndexes encoded as specified by the Stanbol
      * Entityhub SolrYard implementation. See {@link FieldEncodingEnum} for
      * supported values
      */
     public static final String FIELD_ENCODING = "enhancer.engines.linking.lucenefst.fieldEncoding";
     /**
      * The name of the Solr field storing rankings for entities. Entities with a
      * higher value are considered as better (more popular).
      */
     public static final String SOLR_RANKING_FIELD = "enhancer.engines.linking.lucenefst.rankingField";
     /**
      * The name of the Solr field holding the entity type information
      */
     public static final String SOLR_TYPE_FIELD = "enhancer.engines.linking.lucenefst.typeField";
     /**
      * Language configuration defining the language, solr field and the name of the
      * FST file. The FST file is looked up using the {@link DataFileProvider}.
      */
     public static final String FST_CONFIG = "enhancer.engines.linking.lucenefst.fstconfig";
     /**
      * The folder used to store the FST files. The {@link #DEFAULT_FST_FOLDER default} is
      * '<code>${solr-data-dir}/fst</code>' - this is '<code>./fst</code>' relative to the
      * {@link SolrCore#getDataDir()} of the current SolrCore.
      */
     public static final String FST_FOLDER = "enhancer.engines.linking.lucenefst.fstfolder";
     /**
      * The default of the FST folder is '<code>${solr-data-dir}/fst</code>' -
      * this is '<code>./fst</code>' relative to the {@link SolrCore#getDataDir()}
      * of the current SolrCore.
      */
     public static final String DEFAULT_FST_FOLDER = "${solr-data-dir}/fst";
     /**
      * By default runtime generation for the FST is deactivated. Use the
      * {@link #PARAM_RUNTIME_GENERATION} to enable it.
      */
     public static final boolean DEFAULT_RUNTIME_GENERATION = false;
     /**
      * Parameter that specifies if FST files are allowed to be generated at runtime.
      * Enabling this will require (1) write access to the SolrCore directory and
      * (2) a lot of Memory and CPU usage during the generation.
      */
     public static final String PARAM_RUNTIME_GENERATION = "generate";
     /**
      * Parameter used by the {@link IndexConfiguration#FST_CONFIG} to configure the solrField with
      * the stored labels. If not defined this defaults to the configured
      * {@link #PARAM_FIELD}.
      */
     public static final String PARAM_STORE_FIELD = "stored";
     /**
      * Parameter used by the {@link IndexConfiguration#FST_CONFIG} to configure the Solr Field
      * with the indexed labels used to build the FST corpus.
      */
     public static final String PARAM_FIELD = "field";
     /**
      * The name of the index field used if the default parameters of the
      * {@link IndexConfiguration#FST_CONFIG} do not define a {@link #PARAM_FIELD}.
      */
     public static final String DEFAULT_FIELD = "rdfs:label";
     /**
      * Parameter used by the {@link IndexConfiguration#FST_CONFIG} to configure the name of the fst
      * file for a language
      */
     public static final String PARAM_FST = "fst";

     public IndexConfiguration(LanguageConfiguration fstConfig, SolrCore index, FieldEncodingEnum fieldEncoding, String defaultLanguage){
         if(fstConfig == null){
             throw new IllegalArgumentException("The parsed FST configuration MUST NOT be NULL!");
         }
         this.fstConfig = fstConfig;
         if(index == null || index.isClosed()){
             throw new IllegalArgumentException("The parsed SolrCore MUST NOT be NULL nore closed!");
         }
         //check if we have runtime generation
         String allowCreationString = fstConfig.getDefaultParameters().get(IndexConfiguration.PARAM_RUNTIME_GENERATION);
         if(allowCreationString == null){
             runtimeGeneration = IndexConfiguration.DEFAULT_RUNTIME_GENERATION;
         } else {
             runtimeGeneration = Boolean.parseBoolean(allowCreationString);
         }

         this.index = index;
         if(fieldEncoding == null){
             fieldEncoding = FieldEncodingEnum.None;
         }
         this.fieldEncoding = fieldEncoding;
         //In case of a SolrYard we can activate skipAltTokens (see javadoc for
         //#SKIP_ALT_TOKENS for more information)
         if(fieldEncoding == FieldEncodingEnum.SolrYard){
             this.skipAltTokens = true;
         } else {
             this.skipAltTokens = false;
         }
         this.defaultLanguage = defaultLanguage == null ? "" : defaultLanguage;
     }

     /**
      * Registers the parsed corpus under its language. <code>null</code>
      * values are ignored.
      * <p>
      * Assumed to be called in a write lock on {@link #corpusInfoLock}
      * @param corpus the corpus to add or <code>null</code>
      */
     private void addCorpusInfo(CorpusInfo corpus){
         if(corpus == null){
             return; //nothing to register
         }
         corpusInfos.put(corpus.language, corpus);
     }

     /**
      * Removes the corpus registered for the parsed language.
      * <p>
      * NOTE(review): this method does not acquire {@link #corpusInfoLock};
      * presumably callers are expected to hold the write lock - confirm.
      * @param language the language
      * @return the removed corpus or <code>null</code> if none was registered
      * for the parsed language
      */
     protected CorpusInfo removeCorpus(String language){
         final CorpusInfo removed = corpusInfos.remove(language);
         return removed;
     }
     /**
      * Getter for the encoding used for the names of Solr fields.
      * @return the fieldEncoding
      */
     public final FieldEncodingEnum getFieldEncoding() {
         return this.fieldEncoding;
     }

     /**
      * Getter for the type field as encoded by {@link #setTypeField(String)}.
      * @return the encoded typeField or <code>null</code> if none is set
      */
     public final String getEncodedTypeField() {
         return this.typeField;
     }

     /**
      * Encodes the parsed value as URI field (based on the configured
      * {@link #getFieldEncoding() FieldEncoding}) and stores the result.
      * <code>null</code> resets the field.
      * @param typeField the (not encoded) typeField to set
      */
     public final void setTypeField(String typeField) {
         if(typeField == null){
             this.typeField = null;
         } else {
             this.typeField = FieldEncodingEnum.encodeUri(typeField, fieldEncoding);
         }
     }
     /**
      * Getter for the redirect field as encoded by
      * {@link #setRedirectField(String)}.
      * @return the encoded redirectField or <code>null</code> if none is set
      */
     public final String getEncodedRedirectField() {
         return this.redirectField;
     }
     /**
      * Encodes the parsed value as URI field (based on the configured
      * {@link #getFieldEncoding() FieldEncoding}) and stores the result.
      * <code>null</code> resets the field.
      * @param redirectField the (not encoded) redirectField to set
      */
     public final void setRedirectField(String redirectField) {
         if(redirectField == null){
             this.redirectField = null;
         } else {
             this.redirectField = FieldEncodingEnum.encodeUri(redirectField, fieldEncoding);
         }
     }
     /**
      * Getter for the ranking field as encoded by
      * {@link #setRankingField(String)}.
      * @return the encoded rankingField or <code>null</code> if none is set
      */
     public final String getEncodedRankingField() {
         return this.rankingField;
     }
     /**
      * Encodes the parsed value as float field (based on the configured
      * {@link #getFieldEncoding() FieldEncoding}) and stores the result.
      * <code>null</code> resets the field.
      * @param rankingField the (not encoded) rankingField to set
      */
     public final void setRankingField(String rankingField) {
         if(rankingField == null){
             this.rankingField = null;
         } else {
             this.rankingField = FieldEncodingEnum.encodeFloat(rankingField, fieldEncoding);
         }
     }
     /**
      * The version of the {@link #getIndex() index} this configuration was
      * built for.
      * @return the index version this configuration was built for
      * (<code>-1</code> as long as no index version was processed).
      */
     public long getVersion() {
         return this.indexVersion;
     }
     /**
      * Returns the CorpusInfo for the parsed language. If the language has an
      * extension (e.g. en-US) it first tries to load the corpus for the exact
      * match and falls back to the main language (en) if such a corpus does not
      * exist.
      * <p>
      * The lookup is performed while holding the read lock on
      * {@link #corpusInfoLock}.
      * @param language the language. For <code>null</code> only the exact
      * lookup is performed (no fallback is possible).
      * @return the corpus information or <code>null</code> if not present
      */
     public CorpusInfo getCorpus(String language) {
         corpusInfoLock.readLock().lock();
         try {
             CorpusInfo langCorpusInfo = corpusInfos.get(language);
             //a NULL language has no main language to fall back to
             if(langCorpusInfo == null && language != null){
                 //search for the separator only once
                 int extIndex = language.indexOf('-');
                 if(extIndex > 0){
                     String rootLang = language.substring(0, extIndex);
                     log.debug(" - no FST corpus for {}. Fallback to {}", language,rootLang);
                     langCorpusInfo = corpusInfos.get(rootLang);
                 }
             }
             return langCorpusInfo;
         } finally {
             corpusInfoLock.readLock().unlock();
         }
     }
     /**
      * Getter for the languages of all configured FST corpora.
      * <p>
      * The copy is created while holding the read lock on
      * {@link #corpusInfoLock} so that it can not observe the corpus
      * configuration while it is rebuilt under the write lock.
      * @return a read-only copy of the languages of all configured FST corpora
      */
     public Set<String> getCorpusLanguages(){
         corpusInfoLock.readLock().lock();
         try {
             return Collections.unmodifiableSet(new HashSet<String>(corpusInfos.keySet()));
         } finally {
             corpusInfoLock.readLock().unlock();
         }
     }
     /**
      * Getter for all {@link CorpusInfo}s currently defined for this
      * configuration. The values are copied while holding the read lock on
      * {@link #corpusInfoLock}.
      * @return a read only copy of the current {@link CorpusInfo}s
      */
     public Collection<CorpusInfo> getCorpora(){
         final Collection<CorpusInfo> snapshot;
         corpusInfoLock.readLock().lock();
         try {
             snapshot = new ArrayList<CorpusInfo>(corpusInfos.values());
         } finally {
             corpusInfoLock.readLock().unlock();
         }
         return Collections.unmodifiableCollection(snapshot);
     }

     /**
      * The {@link CorpusInfo} registered for the default language. The lookup
      * is performed while holding the read lock on {@link #corpusInfoLock}.
      * @return the default corpus or <code>null</code> if no corpus is available
      * for the default language
      */
     public CorpusInfo getDefaultCorpus() {
         final CorpusInfo defaultCorpus;
         corpusInfoLock.readLock().lock();
         try {
             defaultCorpus = corpusInfos.get(defaultLanguage);
         } finally {
             corpusInfoLock.readLock().unlock();
         }
         return defaultCorpus;
     }

     /**
      * Setter for the {@link ExecutorService} used to create
      * {@link TaggerFstCorpus} instances at runtime.
      * @param executorService the executor service
      */
     public void setExecutorService(ExecutorService executorService) {
         this.executorService = executorService;
     }

     /**
      * Getter for the {@link ExecutorService} used to create
      * {@link TaggerFstCorpus} instances at runtime.
      * @return the executor service or <code>null</code> if none was set
      */
     public ExecutorService getExecutorService() {
         return this.executorService;
     }

     /**
      * Getter for the FST configuration parsed to the constructor.
      * @return the FST language configuration (never <code>null</code>)
      */
     public LanguageConfiguration getFstConfig() {
         return this.fstConfig;
     }

     /**
      * Getter for the {@link SolrCore} parsed to the constructor.
      * @return the SolrCore this configuration was created for
      */
     public SolrCore getIndex() {
         return this.index;
     }

     /**
      * Setter for the instance used to retrieve/create the cache for Lucene
      * {@link Document}s of Entities.
      * @param entityCacheManager the entity cache manager
      */
     public void setEntityCacheManager(EntityCacheManager entityCacheManager) {
         this.entityCacheManager = entityCacheManager;
     }

     /**
      * Getter for the instance used to retrieve/create the cache for Lucene
      * {@link Document}s of Entities.
      * @return the entity cache manager or <code>null</code> if none was set
      */
     public EntityCacheManager getEntityCacheManager() {
         return this.entityCacheManager;
     }

     /**
      * Getter for the directory holding the FST files.
      * @return the FST directory. May be <code>null</code> if none was set and
      * {@link #activate()} was not yet called.
      */
     public File getFstDirectory() {
         return this.fstDirectory;
     }

     /**
      * Setter for the directory holding the FST files. If no directory is set
      * {@link #activate()} uses the '<code>fst</code>' folder within the data
      * directory of the SolrCore.
      * @param fstDirectory the FST directory
      */
     public void setFstDirectory(File fstDirectory) {
         this.fstDirectory = fstDirectory;
     }

     /**
      * Setter for the origin of the dataset (see {@link #getOrigin()}).
      * @param origin the origin or <code>null</code> if none
      */
     public void setOrigin(Resource origin) {
         this.origin = origin;
     }
     /**
      * The Origin of the dataset or <code>null</code> if not defined. The
      * origin can be used to specify the dataset where the Entities described by
      * the configured FST originate from. It can be either a URI (e.g.
      * <code>http://dbpedia.org</code>) or a literal ("<code>dbpedia</code>").
      * If present the origin is added to any <code>fise:TextAnnotation</code>
      * created by the FstLinkingEngine with the property <code>fise:origin</code>
      *
      * @return the origin or <code>null</code> if none is configured
      */
     public Resource getOrigin() {
         return this.origin;
     }

     /**
      * Deactivates this {@link IndexConfiguration}. After this call
      * {@link #isActive()} returns <code>false</code>.
      */
     public void deactivate(){
         this.active = false;
     }

     /**
      * Checks if this {@link IndexConfiguration} is still in sync with the
      * version of the {@link #getIndex() SolrCore}. If
      * <code>{@link #isRuntimeGeneration()} == false</code> this always
      * returns <code>true</code> without inspecting the index.
      * @return <code>true</code> if still current. Otherwise <code>false</code>
      */
     public boolean isCurrent(){
         if(!runtimeGeneration){
             return true; //no version check without runtime generation
         }
         final RefCounted<SolrIndexSearcher> searcherRef = index.getSearcher();
         try {
             final long currentVersion = searcherRef.get().getIndexReader().getVersion();
             return currentVersion == indexVersion;
         } finally {
             searcherRef.decref(); //release the searcher
         }
     }

     /**
      * Reads the current version of the {@link #index} by using a searcher
      * obtained from the SolrCore. The searcher reference is released before
      * returning.
      * @return the version of the index reader of the current searcher
      */
     private long getIndexVersion(){
         final RefCounted<SolrIndexSearcher> searcherRef = index.getSearcher();
         try {
             final SolrIndexSearcher searcher = searcherRef.get();
             return getIndexVersion(searcher);
         } finally {
             searcherRef.decref(); //release the searcher
         }
     }

     /**
      * Reads the index version from the parsed searcher.
      * @param searcher the searcher
      * @return the version of the index reader of the parsed searcher
      */
     private long getIndexVersion(SolrIndexSearcher searcher){
         final long version = searcher.getIndexReader().getVersion();
         return version;
     }

     /**
      * If runtime generation of FST files is enabled by default.
      * @return the default state for the runtime generation
      */
     public boolean isRuntimeGeneration() {
         return this.runtimeGeneration;
     }

     /**
      * If this configuration is active: <code>true</code> after
      * {@link #activate()} and <code>false</code> initially and after
      * {@link #deactivate()}.
      * @return the active state
      */
     public boolean isActive() {
         return this.active;
     }

     /**
      * Activates this index configuration: initializes the FST directory and
      * inspects the {@link SolrCore} based on the provided configuration (see
      * {@link #update()}).
      * <p>
      * If no FST directory was set, the '<code>fst</code>' folder within the
      * data directory of the SolrCore is used. A missing directory is created.
      * <p>
      * If the activation fails with an exception this configuration is marked
      * as not {@link #isActive() active} again.
      * @throws IOException if the FST directory is a file or can not be created
      * @throws IllegalArgumentException if no SolrCore is set
      */
     public void activate() throws IOException {
         //NOTE(review): the active state is set before update() as in the
         //original implementation; presumably work triggered by update()
         //checks isActive() - confirm before moving this to the end.
         active = true;
         boolean activated = false;
         try {
             if(index == null){ //do we have an SolrCore
                 throw new IllegalArgumentException("No SolrCore set for this configuration");
             }
             //if no fstDirectory is configured
             if(fstDirectory == null){ //use the default
                 fstDirectory = new File(index.getDataDir(),"fst");
             }
             //init the fstDirectory (may throw IOException)
             if(fstDirectory.isFile()){
                 //the directory may be the default or an explicitly set one
                 throw new IOException("FST directory '" + fstDirectory.getAbsolutePath()
                         + "' exists and is a File. Use #setFstDirectory() to set different one");
             } else if(!fstDirectory.exists()){
                 FileUtils.forceMkdir(fstDirectory);
             }
             //acquire the initial index configuration
             update();
             activated = true;
         } finally {
             if(!activated){
                 //do not report a configuration as active that failed to activate
                 active = false;
             }
         }
     }

     /**
      * Updates the configuration based on the current version of the
      * {@link #getIndex() index}. If the SolrCore was not updated this will do
      * nothing. The searcher obtained from the SolrCore is released before
      * this method returns.
      */
     public void update(){
         final RefCounted<SolrIndexSearcher> searcherRef = index.getSearcher();
         try {
             final SolrIndexSearcher searcher = searcherRef.get();
             final long currentVersion = getIndexVersion(searcher);
             update(currentVersion, searcher);
         } finally {
             searcherRef.decref(); //decrease the count on the searcher
         }
     }
     /**
      * Variant of {@link #update()} for callers that already have the
      * indexVersion and a Solr searcher available. Processes the FST
      * configuration with the atomic reader of the parsed searcher.
      * @param indexVersion the version of the index
      * @param searcher the searcher. MUST NOT be <code>null</code> and is
      * expected to belong to the {@link #getIndex() SolrCore} of this
      * configuration (both only checked by assertions)
      */
     protected void update(long indexVersion, SolrIndexSearcher searcher){
         assert searcher != null;
         assert searcher.getCore().equals(index);
         final AtomicReader reader = searcher.getAtomicReader();
         processFstConfig(indexVersion, reader);
     }

     /**
      * This method combines the {@link #fstConfig} with the data present in the
      * {@link SolrCore}.
      * <p>
      * As information for fields are only available when a
      * field was actually used by a document stored in the index one needs to
      * inspect the index after every change.
      * <p>
      * An empty Solr index will result in
      * an empty {@link #corpusInfos} map. The first document with an value
      * for the English field will cause an {@link CorpusInfo} for the English
      * language to be created. As soon as the last document with an label for
      * a given language will be deleted the {@link CorpusInfo} for that language
      * will also disappear.
      * @param indexVersion the current version of the {@link #index} to process
      * the FST config for.
      * <p>
      * This method acquires a write lock on {@link #corpusInfoLock} while it
      * inspects the Solr index
      * @param indexReader The {@link AtomicReader} has access to the actual
      * fields present in the {@link SolrCore}. It is used to compare field
      * configurations in the {@link #fstConfig} with fields present in the Solr
      * {@link #index}.
      * @return If any {@link CorpusInfo FST configurations} were found while
      * inspecting the Solr {@link #index}
      */
     private boolean processFstConfig(long indexVersion, AtomicReader indexReader) {
         //first check if the Solr index was updated
         corpusInfoLock.readLock().lock();
         try {
             if(indexVersion == this.indexVersion){ //no update?
                 return !corpusInfos.isEmpty(); //nothing to do
             }
         } finally {
             corpusInfoLock.readLock().unlock();
         }
         log.debug("> {} FST config for {} (FST dir: {})",
             corpusInfos == null ? "create" : "update",
             index.getName(), fstDirectory.getAbsolutePath());

         boolean foundCorpus = false;

         corpusInfoLock.writeLock().lock();
         try {
             this.indexVersion = indexVersion;
             IndexSchema schema = index.getLatestSchema();
             Map<String,CorpusInfo> corpusInfosCopy;
             if(corpusInfos == null){ //first call
                 corpusInfos = new HashMap<String,CorpusInfo>(); //init the field
                 corpusInfosCopy = new HashMap<String,CorpusInfo>();
             } else {
                 corpusInfosCopy = new HashMap<String,CorpusInfo>(corpusInfos);
                 corpusInfos.clear(); //clear the old data
             }
             //(0) get basic parameters of the default configuration
             log.debug(" - default config");
             Map<String,String> defaultParams = fstConfig.getDefaultParameters();
             String fstName = defaultParams.get(IndexConfiguration.PARAM_FST);
             String indexField = defaultParams.get(IndexConfiguration.PARAM_FIELD);
             String storeField = defaultParams.get(IndexConfiguration.PARAM_STORE_FIELD);
             if(storeField == null){
                 //apply indexField as default if indexField is NOT NULL
                 storeField = indexField;
             }
             if(indexField == null){ //apply the defaults if null
                 indexField = IndexConfiguration.DEFAULT_FIELD;
             }
             if(fstName == null){ //use default
                 fstName = getDefaultFstFileName(indexField);
             }
             //This are all fields actually present in the index (distinguished with
             //those defined in the schema). This also includes actual instances of
             //dynamic field definition in the schema.
             FieldInfos fieldInfos = indexReader.getFieldInfos(); //we need this twice

             //(1) in case the fstConfig uses a wildcard we need to search for
             //    languages present in the SolrIndex. For that we use the indexReader
             //    to get the FieldInfos and match them against FST files in the FST
             //    directory and FieldType definitions in the schema of the SolrCore
             //NOTE: this needs only do be done if wildcards are enabled in the fstConfig
             if(fstConfig.useWildcard()){
                 //(1.a) search for present FST files in the FST directory
                 Map<String,File> presentFstFiles = new HashMap<String,File>();
                 WildcardFileFilter fstFilter = new WildcardFileFilter(
                     fstName+".*.fst");
                 Iterator<File> fstFiles = FileUtils.iterateFiles(fstDirectory, fstFilter, null);
                 while(fstFiles.hasNext()){
                     File fstFile = fstFiles.next();
                     String fstFileName = fstFile.getName();
                     //files are named such as "{name}.{lang}.fst"
                     String language = FilenameUtils.getExtension(
                         FilenameUtils.getBaseName(fstFileName));
                     presentFstFiles.put(language, fstFile);
                 }
                 //(1.b) iterate over the fields in the Solr index and search for
                 //      matches against the configured indexField name
                 String fieldWildcard = FieldEncodingEnum.encodeLanguage(indexField,
                     fieldEncoding, "*");
                 for(FieldInfo fieldInfo : fieldInfos){
                     //try to match the field names against the wildcard
                     if(FilenameUtils.wildcardMatch(fieldInfo.name, fieldWildcard)){
                         //for matches parse the language from the field name
                         String language = FieldEncodingEnum.parseLanguage(
                             fieldInfo.name, fieldEncoding, indexField);
                         if(language != null && //successfully parsed language
                                 //is current language is enabled?
                                 fstConfig.isLanguage(language) &&
                                 //is there no explicit configuration for this language?
                                 !fstConfig.getExplicitlyIncluded().contains(language)){
                             //generate the FST file name
                             StringBuilder fstFileName = new StringBuilder(fstName);
                             if(!language.isEmpty()){
                                 fstFileName.append('.').append(language);
                             }
                             fstFileName.append(".fst");
                             File fstFile = new File(fstDirectory,fstFileName.toString());
                             //get the FieldType of the field from the Solr schema
                             FieldType fieldType = schema.getFieldTypeNoEx(fieldInfo.name);
                             if(fieldType != null){ //if the fieldType is present
                                 if(runtimeGeneration || fstFile.isFile()){ //and FST is present or can be created
                                     //we need also to check if the stored field with
                                     //the labels is present
                                     //get the stored Field and check if it is present!
                                     String storeFieldName;
                                     if(storeField == null){ //storeField == indexField
                                         storeFieldName = fieldInfo.name;
                                     } else { // check that the storeField is present in the index
                                         storeFieldName = FieldEncodingEnum.encodeLanguage(
                                             storeField, fieldEncoding, language);
                                         FieldInfo storedFieldInfos = fieldInfos.fieldInfo(storeFieldName);
                                         if(storedFieldInfos == null){
                                             log.debug(" ... ignore language {} because Stored Field {} "
                                                     + "for IndexField {} does not exist! ", new Object[]{
                                                     language,storeFieldName,fieldInfo.name});
                                             storeFieldName = null;
                                         }

                                     }
                                     if(storeFieldName != null){ // == valid configuration
                                         CorpusInfo fstInfo = corpusInfosCopy.get(language);
                                         if(fstInfo == null || //new one
                                                 !fstInfo.indexedField.equals(fieldInfo.name) || //index field compatible
                                                 !fstInfo.storedField.equals(storeFieldName)){ //store field compatible
                                             CorpusInfo newFstInfo = new CorpusInfo(language,
                                                 fieldInfo.name, storeFieldName,
                                                 fieldType, fstFile, runtimeGeneration);
                                             log.debug(" ... {} {} ", fstInfo == null ? "create" : "update", newFstInfo);
                                             addCorpusInfo(newFstInfo);
                                             corpusInfosCopy.put(language, newFstInfo);
                                         } else { //no change in the SolrIndex ... use the exsisting CorpusInfo
                                             addCorpusInfo(fstInfo);
                                         }
                                         foundCorpus = true;
                                     }
                                 } else {
                                     log.debug(" ... ignore language {} (field: {}) because "
                                         + "FST file '{}' does not exist and runtime creation "
                                         + "is deactivated!",new Object[]{ language,
                                                 fieldInfo.name, fstFile.getAbsolutePath()});
                                 }
                             } else {
                                 log.debug(" ... ignore language {} becuase unknown fieldtype "
                                     + "for SolrFied {}",language,fieldInfo.name);
                             }
                         } //else the field matched the wildcard, but has not passed the
                         //encoding test.
                     } //Solr field does not match the field definition in the config
                 } // end iterate over all fields in the SolrIndex
             } //else Wildcard not enabled in the fstConfig

             //(2) process explicit configuration for configured languages
             for(String language : fstConfig.getExplicitlyIncluded()){
                 //(2.a) get the language specific config (with fallback to default)
                 Map<String,String> config = fstConfig.getParameters(language);
                 String langIndexField = config.get(IndexConfiguration.PARAM_FIELD);
                 String langStoreField = config.get(IndexConfiguration.PARAM_STORE_FIELD);
                 String langFstFileName = config.get(IndexConfiguration.PARAM_FST);
                 final boolean langAllowCreation;
                 final String langAllowCreationString = config.get(IndexConfiguration.PARAM_RUNTIME_GENERATION);
                 if(langIndexField != null){
                     //also consider explicit field names as default for the fst name
                     if(langFstFileName == null){
                         StringBuilder fileName = new StringBuilder(
                             getDefaultFstFileName(langIndexField));
                         if(!language.isEmpty()){
                             fileName.append('.').append(language);
                         }
                         fileName.append(".fst");
                         langFstFileName = fileName.toString();
                     }
                 } else {
                     langIndexField = indexField;
                 }
                 if(langStoreField == null){ //fallbacks
                     if(storeField != null){ //first to default store field
                         langStoreField = storeField;
                     } else { //else to the lang index field
                         langStoreField = langIndexField;
                     }
                 }
                 if(langFstFileName == null){ //no fstFileName config
                     // ... use the default
                     langFstFileName = new StringBuilder(fstName).append('.')
                             .append(language).append(".fst").toString();
                 }
                 if(langAllowCreationString != null){
                     langAllowCreation = Boolean.parseBoolean(langAllowCreationString);
                 } else {
                     langAllowCreation = runtimeGeneration;
                 }
                 //(2.b) check if the Solr field is present
                 String encodedLangIndexField = FieldEncodingEnum.encodeLanguage(
                     langIndexField, fieldEncoding, language);
                 String encodedLangStoreField = FieldEncodingEnum.encodeLanguage(
                     langStoreField, fieldEncoding, language);
                 FieldInfo langIndexFieldInfo = fieldInfos.fieldInfo(encodedLangIndexField);
                 if(langIndexFieldInfo != null){
                     FieldInfo langStoreFieldInfo = fieldInfos.fieldInfo(encodedLangStoreField);
                     if(langStoreFieldInfo != null){
                         FieldType fieldType = schema.getFieldTypeNoEx(langIndexFieldInfo.name);
                         if(fieldType != null){
                             //(2.c) check the FST file
                             File langFstFile = new File(fstDirectory,langFstFileName);
                             if(langFstFile.isFile() || langAllowCreation){
                                 CorpusInfo langFstInfo = corpusInfosCopy.get(language);
                                 if(langFstInfo == null || //new one
                                         !langFstInfo.indexedField.equals(encodedLangIndexField) || //index field compatible
                                         !langFstInfo.storedField.equals(encodedLangStoreField)){ //store field compatible
                                     CorpusInfo newLangFstInfo = new CorpusInfo(language,
                                         encodedLangIndexField,encodedLangStoreField,
                                         fieldType, langFstFile, langAllowCreation);
                                     log.debug("   ... {} {} for explicitly configured language",
                                         langFstInfo == null ? "create" : "update", newLangFstInfo);
                                     addCorpusInfo(newLangFstInfo);
                                 } else { //we can use the existing instance
                                     addCorpusInfo(langFstInfo);
                                 }
                                 foundCorpus = true;
                             } else {
                                 log.debug(" ... ignore explicitly configured language {} (field: {}) because "
                                         + "FST file '{}' does not exist and runtime creation "
                                         + "is deactivated!",new Object[]{ language,
                                                 langIndexFieldInfo.name, langFstFile.getAbsolutePath()});
                             }
                         } else {
                             log.debug(" ... ignore explicitly configured language {} becuase unknown fieldtype "
                                     + "for SolrFied {}", language, langIndexFieldInfo.name);
                         }
                     } else {
                         log.debug(" ... ignore explicitly configured language {} because configured stored Field {} "
                                 + "for IndexField {} does not exist! ", new Object[]{
                                 language,langStoreField,langIndexFieldInfo.name});
                     }
                 } else {
                     log.debug(" ... ignore explicitly configured language {} because configured field {} (encoded: {}) "
                         + "is not present in the SolrIndex!", new Object[]{
                                 language, langIndexField, encodedLangIndexField });
                 }
             }
         } finally {
             corpusInfoLock.writeLock().unlock();
         }
         return foundCorpus;
     }

     /**
      * Getter for the default FST file name based on the configured field
      * name. This method returns the '<code>{name}</code>' part of the
      * '<code>{name}.{lang}.fst</code>' name.
      * <p>
      * Field names consisting only of letters and digits are returned
      * unchanged. Otherwise every code point that is not a letter or digit
      * is replaced by a single '<code>_</code>'.
      * @param fstFieldName the field name.
      * @return the '<code>{name}</code>' part of the'<code>{name}.{lang}.fst</code>' name
      */
     private String getDefaultFstFileName(final String fstFieldName) {
         if(StringUtils.isAlphanumeric(fstFieldName)) {
             return fstFieldName; //nothing to escape
         }
         StringBuilder escaped = new StringBuilder(fstFieldName.length());
         //Advance by code point (not by char): stepping one char at a time
         //would visit the low surrogate of a supplementary character a second
         //time and append a spurious '_' after the already copied code point.
         for(int i = 0; i < fstFieldName.length(); ){
             final int codepoint = fstFieldName.codePointAt(i);
             if(Character.isLetterOrDigit(codepoint)){
                 escaped.appendCodePoint(codepoint);
             } else {
                 escaped.append('_');
             }
             i += Character.charCount(codepoint);
         }
         return escaped.toString();
     }

     /**
      * Getter for the <code>skipAltTokens</code> state of this configuration.
      * @return the current value of the <code>skipAltTokens</code> field
      */
     public boolean isSkipAltTokens() {
         return this.skipAltTokens;
     }

     /**
      * Setter for the <code>skipAltTokens</code> state of this configuration.
      * @param skipAltTokens the new value stored in the
      * <code>skipAltTokens</code> field
      */
     public void setSkipAltTokens(boolean skipAltTokens) {
         this.skipAltTokens = skipAltTokens;
     }

 }