blob: 0ae469c896eb47dceb5cbf4a5389baefcb406027 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.lucenefstlinking;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import org.apache.clerezza.rdf.core.Literal;
import org.apache.clerezza.rdf.core.Resource;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.filefilter.WildcardFileFilter;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.search.IndexSearcher;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.RefCounted;
import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider;
import org.apache.stanbol.enhancer.engines.lucenefstlinking.cache.EntityCacheManager;
import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
import org.opensextant.solrtexttagger.TaggerFstCorpus;
import org.opensextant.solrtexttagger.UnsupportedTokenException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Holds the configuration of the index used by the FST linking engine.
*
* @author Rupert Westenthaler
*
*/
public class IndexConfiguration {
/**
 * Logger used by this class.
 */
private static final Logger log = LoggerFactory.getLogger(IndexConfiguration.class);
/**
 * The {@link SolrCore} this configuration is created for. The constructor
 * rejects <code>null</code> values and closed cores.
 */
private final SolrCore index;
/**
 * The type field. Holds the encoded field name as set by
 * {@link #setTypeField(String)}.
 */
private String typeField;
/**
 * The redirect field. Holds the encoded field name as set by
 * {@link #setRedirectField(String)}.
 */
private String redirectField;
/**
 * The entityRanking field. Holds the encoded field name as set by
 * {@link #setRankingField(String)}.
 */
private String rankingField;
/**
 * Used to sync access to {@link #corpusInfos}
 */
private ReadWriteLock corpusInfoLock = new ReentrantReadWriteLock();
/**
 * FST corpus configuration. Uses the language as key and the
 * {@link CorpusInfo} for that language as value. This field is
 * <code>null</code> until the FST configuration was processed for the
 * first time.
 */
private Map<String,CorpusInfo> corpusInfos;
/**
 * {@link ExecutorService} used to create {@link TaggerFstCorpus} instances
 * at runtime.
 */
protected ExecutorService executorService;
/**
 * The encoding used by SolrFields (e.g. to define label fields for different
 * languages).
 */
private final FieldEncodingEnum fieldEncoding;
/**
 * The instance used to retrieve/create the cache for Lucene {@link Document}s
 * of Entities.
 */
private EntityCacheManager entityCacheManager;
/**
 * The FST language configuration as passed to the constructor
 * (never <code>null</code>).
 */
private final LanguageConfiguration fstConfig;
/**
 * If runtime generation is enabled by default (Note: explicitly configured
 * languages might override this)
 */
private final boolean runtimeGeneration;
/**
 * used to track if this index configuration is active
 */
private boolean active = false;
/**
 * The version of the index the current {@link #corpusInfos} were built
 * for. <code>-1</code> as long as the FST configuration was not yet processed.
 */
private long indexVersion = -1;
/**
 * The directory holding the FST files. If <code>null</code> on
 * {@link #activate()} the '<code>fst</code>' folder within the data
 * directory of the {@link #index} is used.
 */
private File fstDirectory;
/**
 * The origin is added to <code>fise:TextAnnotation</code> created for
 * linked Entities. It is intended to be used for providing a reference to
 * dataset of the Entity. Both {@link UriRef URI}s and {@link Literal}s can
 * be used here
 */
private Resource origin;
/**
 * If alternate tokens (<code>posInc == 0</code>) can be skipped or if such
 * tokens should cause an {@link UnsupportedTokenException}.
 */
private boolean skipAltTokens;
/**
 * The default language. The constructor replaces <code>null</code> with
 * the empty String.
 */
private String defaultLanguage;
/**
 * If alternate tokens (<code>posInc == 0</code>) can be skipped or if such
 * tokens should cause an {@link UnsupportedTokenException}.
 * <p>
 * Enabling this allows to use FST linking with query time Lucene
 * {@link Analyzer}s that emit alternate tokens (e.g. the Kuromoji analyzers
 * for Japanese), but it also requires special care with index time
 * {@link Analyzer} configurations. If enabled the index time analyzer MUST
 * produce all possible tokens emitted by the query time analyzer, as only if
 * all such combinations are added to the FST model skipped alternate
 * tokens can not prevent mentions from being detected.
 * <p>
 * By default <code>skipAltTokens</code> is enabled for
 * {@link FieldEncodingEnum#SolrYard} and deactivated for all other field
 * encoding settings. This is because all Solr <code>schema.xml</code> used
 * by the Stanbol Entityhub SolrYard ensure the requirement stated above.
 * For other Solr configurations users will need to explicitly activate this.
 */
public static final String SKIP_ALT_TOKENS = "enhancer.engines.linking.lucenefst.skipAltTokens";
/**
 * Property used to configure the FieldName encoding of the SolrIndex. This
 * is mainly needed for label fields of different languages (e.g. by using
 * the iso language code as prefix/suffix of Solr fields). However this also
 * adds support for SolrIndexes encoded as specified by the Stanbol
 * Entityhub SolrYard implementation. See {@link FieldEncodingEnum} for
 * supported values
 */
public static final String FIELD_ENCODING = "enhancer.engines.linking.lucenefst.fieldEncoding";
/**
 * The name of the Solr field storing rankings for entities. Entities with a
 * higher value are considered as better (more popular).
 */
public static final String SOLR_RANKING_FIELD = "enhancer.engines.linking.lucenefst.rankingField";
/**
 * The name of the Solr field holding the entity type information
 */
public static final String SOLR_TYPE_FIELD = "enhancer.engines.linking.lucenefst.typeField";
/**
 * Language configuration defining the language, solr field and the name of the
 * FST file. The FST file is looked up using the {@link DataFileProvider}.
 */
public static final String FST_CONFIG = "enhancer.engines.linking.lucenefst.fstconfig";
/**
 * The folder used to store the FST files. The {@link #DEFAULT_FST_FOLDER default} is
 * '<code>${solr-data-dir}/fst</code>' - this is '<code>./fst</code>' relative to the
 * {@link SolrCore#getDataDir()} of the current SolrCore.
 */
public static final String FST_FOLDER = "enhancer.engines.linking.lucenefst.fstfolder";
/**
 * The default of the FST folder is '<code>${solr-data-dir}/fst</code>' -
 * this is '<code>./fst</code>' relative to the {@link SolrCore#getDataDir()}
 * of the current SolrCore.
 */
public static final String DEFAULT_FST_FOLDER = "${solr-data-dir}/fst";
/**
 * By default runtime generation for the FST is deactivated. Use the
 * {@link #PARAM_RUNTIME_GENERATION} to enable it.
 */
public static final boolean DEFAULT_RUNTIME_GENERATION = false;
/**
 * Parameter that specifies if FST files are allowed to be generated at runtime.
 * Enabling this will require (1) write access to the SolrCore directory and
 * (2) a lot of Memory and CPU usage during the generation.
 */
public static final String PARAM_RUNTIME_GENERATION = "generate";
/**
 * Parameter used by the {@link IndexConfiguration#FST_CONFIG} to configure the solrField with
 * the stored labels. If not defined this defaults to the configured
 * {@link #PARAM_FIELD}.
 */
public static final String PARAM_STORE_FIELD = "stored";
/**
 * Parameter used by the {@link IndexConfiguration#FST_CONFIG} to configure the Solr Field
 * with the indexed labels used to build the FST corpus.
 */
public static final String PARAM_FIELD = "field";
/**
 * The field name used if the default configuration does not define a
 * value for {@link #PARAM_FIELD}.
 */
public static final String DEFAULT_FIELD = "rdfs:label";
/**
 * Parameter used by the {@link IndexConfiguration#FST_CONFIG} to configure the name of the fst
 * file for a language
 */
public static final String PARAM_FST = "fst";
public IndexConfiguration(LanguageConfiguration fstConfig, SolrCore index, FieldEncodingEnum fieldEncoding, String defaultLanguage){
if(fstConfig == null){
throw new IllegalArgumentException("The parsed FST configuration MUST NOT be NULL!");
}
this.fstConfig = fstConfig;
if(index == null || index.isClosed()){
throw new IllegalArgumentException("The parsed SolrCore MUST NOT be NULL nore closed!");
}
//check if we have runtime generation
String allowCreationString = fstConfig.getDefaultParameters().get(IndexConfiguration.PARAM_RUNTIME_GENERATION);
if(allowCreationString == null){
runtimeGeneration = IndexConfiguration.DEFAULT_RUNTIME_GENERATION;
} else {
runtimeGeneration = Boolean.parseBoolean(allowCreationString);
}
this.index = index;
if(fieldEncoding == null){
fieldEncoding = FieldEncodingEnum.None;
}
this.fieldEncoding = fieldEncoding;
//In case of a SolrYard we can activate skipAltTokens (see javadoc for
//#SKIP_ALT_TOKENS for more information)
if(fieldEncoding == FieldEncodingEnum.SolrYard){
this.skipAltTokens = true;
} else {
this.skipAltTokens = false;
}
this.defaultLanguage = defaultLanguage == null ? "" : defaultLanguage;
}
/**
 * Registers the parsed corpus under its language in {@link #corpusInfos}.
 * <code>null</code> values are ignored. Assumed to be called while holding
 * the write lock on {@link #corpusInfoLock}.
 * @param corpus the corpus to add or <code>null</code>
 */
private void addCorpusInfo(CorpusInfo corpus){
    if(corpus == null){
        return; //nothing to register
    }
    corpusInfos.put(corpus.language, corpus);
}
/**
 * Removes the {@link CorpusInfo} registered for the parsed language from
 * {@link #corpusInfos}.
 * <p>
 * NOTE(review): this method does not acquire {@link #corpusInfoLock}.
 * Presumably callers are expected to hold the write lock - TODO confirm.
 * @param language the language
 * @return the removed corpus or <code>null</code> if none was registered
 */
protected CorpusInfo removeCorpus(String language){
    final CorpusInfo removed = corpusInfos.remove(language);
    return removed;
}
/**
 * Getter for the encoding used for the names of Solr fields.
 * @return the fieldEncoding as set by the constructor
 * ({@link FieldEncodingEnum#None} if <code>null</code> was passed)
 */
public final FieldEncodingEnum getFieldEncoding() {
return fieldEncoding;
}
/**
 * Getter for the type field in its encoded form (see
 * {@link #setTypeField(String)}).
 * @return the encoded typeField or <code>null</code> if none is set
 */
public final String getEncodedTypeField() {
return typeField;
}
/**
 * Setter for the type field. The parsed value is encoded as URI based on
 * the configured {@link #getFieldEncoding() FieldEncoding} before it is
 * stored.
 * @param typeField the (not encoded) typeField to set or <code>null</code>
 * to remove the current value
 */
public final void setTypeField(String typeField) {
    if(typeField == null){
        this.typeField = null;
    } else {
        this.typeField = FieldEncodingEnum.encodeUri(typeField, fieldEncoding);
    }
}
/**
 * Getter for the redirect field in its encoded form (see
 * {@link #setRedirectField(String)}).
 * @return the encoded redirectField or <code>null</code> if none is set
 */
public final String getEncodedRedirectField() {
return redirectField;
}
/**
 * Setter for the redirect field. The parsed value is encoded as URI based
 * on the configured {@link #getFieldEncoding() FieldEncoding} before it is
 * stored.
 * @param redirectField the (not encoded) redirectField to set or
 * <code>null</code> to remove the current value
 */
public final void setRedirectField(String redirectField) {
    if(redirectField == null){
        this.redirectField = null;
    } else {
        this.redirectField = FieldEncodingEnum.encodeUri(redirectField, fieldEncoding);
    }
}
/**
 * Getter for the ranking field in its encoded form (see
 * {@link #setRankingField(String)}).
 * @return the encoded rankingField or <code>null</code> if none is set
 */
public final String getEncodedRankingField() {
return rankingField;
}
/**
 * Setter for the ranking field. The parsed value is encoded as float field
 * based on the configured {@link #getFieldEncoding() FieldEncoding} before
 * it is stored.
 * @param rankingField the (not encoded) rankingField to set or
 * <code>null</code> to remove the current value
 */
public final void setRankingField(String rankingField) {
    if(rankingField == null){
        this.rankingField = null;
    } else {
        this.rankingField = FieldEncodingEnum.encodeFloat(rankingField, fieldEncoding);
    }
}
/**
 * The version of the {@link #getIndex()} this configuration was built for.
 * @return the index version this configuration was built for.
 * <code>-1</code> as long as the FST configuration was not yet processed.
 */
public long getVersion() {
return indexVersion;
}
/**
 * Returns the CorpusInfo for the parsed language. If the language has an
 * extension (e.g. en-US) it first tries to load the corpus for the exact
 * match and falls back to the main language (en) if such a corpus does not
 * exist.
 * <p>
 * This method acquires the read lock on the corpus information. It returns
 * <code>null</code> (instead of failing) if the FST configuration was not
 * yet processed (see {@link #activate()}) or if <code>null</code> is
 * parsed as language and no corpus is registered for it.
 * @param language the language
 * @return the corpus information or <code>null</code> if not present
 */
public CorpusInfo getCorpus(String language) {
    corpusInfoLock.readLock().lock();
    try {
        if(corpusInfos == null){ //FST config not yet processed
            return null;
        }
        CorpusInfo langCorpusInfo = corpusInfos.get(language);
        //the fallback requires a language to parse the root language from
        if(langCorpusInfo == null && language != null){
            final int extIndex = language.indexOf('-');
            if(extIndex > 0){
                String rootLang = language.substring(0, extIndex);
                log.debug(" - no FST corpus for {}. Fallback to {}", language,rootLang);
                langCorpusInfo = corpusInfos.get(rootLang);
            }
        }
        return langCorpusInfo;
    } finally {
        corpusInfoLock.readLock().unlock();
    }
}
/**
 * Getter for the languages of all configured FST corpora.
 * <p>
 * As all other getters for corpus information this acquires the read lock
 * so that it does not observe the map while the FST configuration is
 * rebuilt. An empty set is returned if the FST configuration was not yet
 * processed (see {@link #activate()}).
 * @return a read-only copy of the languages of all configured FST corpora
 */
public Set<String> getCorpusLanguages(){
    corpusInfoLock.readLock().lock();
    try {
        if(corpusInfos == null){ //FST config not yet processed
            return Collections.<String>emptySet();
        }
        return Collections.unmodifiableSet(new HashSet<String>(corpusInfos.keySet()));
    } finally {
        corpusInfoLock.readLock().unlock();
    }
}
/**
 * Read-only collection of all {@link CorpusInfo}s defined for this
 * configuration. The copy is created while holding the read lock. An
 * empty collection is returned if the FST configuration was not yet
 * processed (see {@link #activate()}).
 * @return a read only copy of the current {@link CorpusInfo}s
 */
public Collection<CorpusInfo> getCorpora(){
    corpusInfoLock.readLock().lock();
    try {
        if(corpusInfos == null){ //FST config not yet processed
            return Collections.<CorpusInfo>emptyList();
        }
        return Collections.unmodifiableCollection(new ArrayList<CorpusInfo>(corpusInfos.values()));
    } finally {
        corpusInfoLock.readLock().unlock();
    }
}
/**
 * The {@link CorpusInfo} for the default language. The lookup is done
 * while holding the read lock.
 * @return the default corpus or <code>null</code> if no corpus is available
 * for the default language. This includes the case that the FST
 * configuration was not yet processed (see {@link #activate()}).
 */
public CorpusInfo getDefaultCorpus() {
    corpusInfoLock.readLock().lock();
    try {
        if(corpusInfos == null){ //FST config not yet processed
            return null;
        }
        return corpusInfos.get(defaultLanguage);
    } finally {
        corpusInfoLock.readLock().unlock();
    }
}
/**
 * Setter for the {@link ExecutorService} used to create
 * {@link TaggerFstCorpus} instances at runtime.
 * @param executorService the executor service
 */
public void setExecutorService(ExecutorService executorService) {
this.executorService = executorService;
}
/**
 * Getter for the {@link ExecutorService} used to create
 * {@link TaggerFstCorpus} instances at runtime.
 * @return the executor service or <code>null</code> if none was set
 */
public ExecutorService getExecutorService() {
return executorService;
}
/**
 * The FST configuration
 * @return the FST language configuration as passed to the constructor
 * (never <code>null</code>)
 */
public LanguageConfiguration getFstConfig() {
return fstConfig;
}
/**
 * Getter for the {@link SolrCore} this configuration was created for.
 * @return the SolrCore as passed to the constructor
 * (never <code>null</code>)
 */
public SolrCore getIndex() {
return index;
}
/**
 * Setter for the instance used to retrieve/create the cache for Lucene
 * {@link Document}s of Entities.
 * @param entityCacheManager the entity cache manager
 */
public void setEntityCacheManager(EntityCacheManager entityCacheManager) {
this.entityCacheManager = entityCacheManager;
}
/**
 * Getter for the instance used to retrieve/create the cache for Lucene
 * {@link Document}s of Entities.
 * @return the entity cache manager or <code>null</code> if none was set
 */
public EntityCacheManager getEntityCacheManager() {
return entityCacheManager;
}
/**
 * Getter for the directory holding the FST files.
 * @return the FST directory. <code>null</code> if none was set and
 * {@link #activate()} did not yet apply the default.
 */
public File getFstDirectory() {
return fstDirectory;
}
/**
 * Setter for the directory holding the FST files. If no directory is set
 * {@link #activate()} uses the '<code>fst</code>' folder within the data
 * directory of the SolrCore.
 * @param fstDirectory the FST directory
 */
public void setFstDirectory(File fstDirectory) {
this.fstDirectory = fstDirectory;
}
/**
 * Setter for the origin of the dataset (see {@link #getOrigin()}).
 * @param origin the origin or <code>null</code> if not defined
 */
public void setOrigin(Resource origin) {
this.origin = origin;
}
/**
 * The Origin of the dataset or <code>null</code> if not defined. The
 * origin can be used to specify the dataset where the Entities described by
 * the configured FST originate from. It can be both a URI (e.g.
 * <code>http://dbpedia.org</code>) or a literal (e.g. "<code>dbpedia</code>").
 * If present the origin is added to any <code>fise:TextAnnotation</code>
 * created by the FstLinkingEngine with the property <code>fise:origin</code>
 *
 * @return the origin or <code>null</code> if none is configured
 */
public Resource getOrigin() {
return origin;
}
/**
 * Deactivates this {@link IndexConfiguration}. This only resets the
 * flag returned by {@link #isActive()}; no other state is changed.
 */
public void deactivate(){
active = false;
}
/**
 * Checks if this {@link IndexConfiguration} is still in sync with the
 * version of the {@link #getIndex() SolrCore}. If
 * <code>{@link #isRuntimeGeneration()} == false</code> the index is not
 * inspected and <code>true</code> is returned.
 * @return <code>true</code> if the version this configuration was built
 * for equals the current version of the index (or runtime generation is
 * disabled). Otherwise <code>false</code>
 */
public boolean isCurrent(){
    if(!runtimeGeneration){
        return true; //no version check without runtime generation
    }
    final RefCounted<SolrIndexSearcher> ref = index.getSearcher();
    try {
        final long currentVersion = ref.get().getIndexReader().getVersion();
        return currentVersion == indexVersion;
    } finally {
        ref.decref(); //release the searcher acquired above
    }
}
/**
 * Getter for the current version of the {@link #index}. Acquires a
 * searcher from the SolrCore and releases it after the version was read.
 * @return the version of the index reader of the current searcher
 */
private long getIndexVersion(){
    final RefCounted<SolrIndexSearcher> ref = index.getSearcher();
    try {
        final SolrIndexSearcher searcher = ref.get();
        return getIndexVersion(searcher);
    } finally {
        ref.decref(); //release the searcher acquired above
    }
}
/**
 * Getter for the index version of the parsed searcher. The reference
 * count of the searcher is not changed by this method.
 * @param searcher the searcher
 * @return the version of the index reader of the parsed searcher
 */
private long getIndexVersion(SolrIndexSearcher searcher){
return searcher.getIndexReader().getVersion();
}
/**
 * If runtime generation of FST files is enabled by default. The value is
 * read by the constructor from the {@link #PARAM_RUNTIME_GENERATION}
 * default parameter of the FST configuration, with
 * {@link #DEFAULT_RUNTIME_GENERATION} as fallback.
 * @return the default state for the runtime generation
 */
public boolean isRuntimeGeneration() {
return runtimeGeneration;
}
/**
 * If this configuration is active. The flag is set by {@link #activate()}
 * and reset by {@link #deactivate()}.
 * @return the active state
 */
public boolean isActive() {
return active;
}
/**
 * Activates this index configuration by inspecting the {@link SolrCore}
 * based on the provided configuration.
 * <p>
 * If no FST directory was set the '<code>fst</code>' folder within the
 * data directory of the SolrCore is used. A missing FST directory is
 * created. Finally the FST configuration is initialised by calling
 * {@link #update()}.
 * <p>
 * If the activation fails with an exception this configuration is NOT
 * marked as {@link #isActive() active}.
 * @throws IOException if the FST directory is an existing file or can not
 * be created
 * @throws IllegalArgumentException if no SolrCore is set
 */
public void activate() throws IOException {
    if(index == null){ //do we have an SolrCore
        throw new IllegalArgumentException("No SolrCore set for this configuration");
    }
    //mark as active while initialising, but revert this on any failure so
    //that a failed activation does not leave an 'active' configuration
    active = true;
    boolean activated = false;
    try {
        //if no fstDirectory is configured
        if(fstDirectory == null){ //use the default
            fstDirectory = new File(index.getDataDir(),"fst");
        }
        //init the fstDirectory (may throw IOException)
        if(fstDirectory.isFile()){
            //NOTE: the directory might also be a user configured one
            throw new IOException("FST directory '" + fstDirectory.getAbsolutePath()
                + "' exists and is a File. Use #setFstDirectory() to set different one");
        } else if(!fstDirectory.exists()){
            FileUtils.forceMkdir(fstDirectory);
        }
        //acquire the initial index configuration
        update();
        activated = true;
    } finally {
        if(!activated){
            active = false;
        }
    }
}
/**
 * Updates the configuration based on the current version of the
 * {@link #getIndex()}. If the SolrCore was not updated this will do
 * nothing. The searcher acquired from the SolrCore is released before
 * this method returns.
 */
public void update(){
    final RefCounted<SolrIndexSearcher> ref = index.getSearcher();
    try {
        final SolrIndexSearcher searcher = ref.get();
        final long currentVersion = getIndexVersion(searcher);
        update(currentVersion, searcher);
    } finally {
        ref.decref(); //decrease the count on the searcher
    }
}
/**
 * Version of {@link #update()} to be used in cases where the indexVersion
 * and a Solr searcher is already available in the calling method. The
 * reference count of the parsed searcher is not changed by this method.
 * @param indexVersion the version of the index the parsed searcher was
 * opened for
 * @param searcher the searcher. MUST NOT be <code>null</code> and MUST
 * be a searcher of the {@link #getIndex() SolrCore} of this configuration
 * (both only checked if assertions are enabled)
 */
protected void update(long indexVersion, SolrIndexSearcher searcher){
assert searcher != null;
assert searcher.getCore().equals(index);
//the result (if any corpus was found) is not needed here
processFstConfig(indexVersion, searcher.getAtomicReader());
}
/**
 * This method combines the {@link #fstConfig} with the data present in the
 * {@link SolrCore}.
 * <p>
 * As information for fields are only available when a
 * field was actually used by a document stored in the index one needs to
 * inspect the index after every change.
 * <p>
 * An empty Solr index will result in
 * an empty {@link #corpusInfos} map. The first document with a value
 * for the English field will cause a {@link CorpusInfo} for the English
 * language to be created. As soon as the last document with a label for
 * a given language will be deleted the {@link CorpusInfo} for that language
 * will also disappear.
 * <p>
 * This method acquires a write lock on {@link #corpusInfoLock} while it
 * inspects the Solr index. If the configuration was already processed for
 * the parsed index version the method returns without changes. The parsed
 * version is only recorded after the index was processed successfully, so
 * that a failed run is repeated by the next call.
 * @param indexVersion the current version of the {@link #index} to process
 * the FST config for.
 * @param indexReader The {@link AtomicReader} has access to the actual
 * fields present in the {@link SolrCore}. It is used to compare field
 * configurations in the {@link #fstConfig} with fields present in the Solr
 * {@link #index}.
 * @return If any {@link CorpusInfo FST configuration} were found during
 * inspecting the Solr {@link #index}
 */
private boolean processFstConfig(long indexVersion, AtomicReader indexReader) {
    //first check if the Solr index was updated
    corpusInfoLock.readLock().lock();
    try {
        //NOTE: corpusInfos is null until the first call has initialised it.
        //      In that case we need to process the index regardless of the version
        if(corpusInfos != null && indexVersion == this.indexVersion){ //no update?
            return !corpusInfos.isEmpty(); //nothing to do
        }
    } finally {
        corpusInfoLock.readLock().unlock();
    }
    boolean foundCorpus = false;
    corpusInfoLock.writeLock().lock();
    try {
        //re-check with the write lock as an other thread might have processed
        //this index version while this one was waiting for the lock
        if(corpusInfos != null && indexVersion == this.indexVersion){
            return !corpusInfos.isEmpty();
        }
        log.debug("> {} FST config for {} (FST dir: {})",
            corpusInfos == null ? "create" : "update",
            index.getName(), fstDirectory.getAbsolutePath());
        IndexSchema schema = index.getLatestSchema();
        Map<String,CorpusInfo> corpusInfosCopy;
        if(corpusInfos == null){ //first call
            corpusInfos = new HashMap<String,CorpusInfo>(); //init the field
            corpusInfosCopy = new HashMap<String,CorpusInfo>();
        } else {
            corpusInfosCopy = new HashMap<String,CorpusInfo>(corpusInfos);
            corpusInfos.clear(); //clear the old data
        }
        //(0) get basic parameters of the default configuration
        log.debug(" - default config");
        Map<String,String> defaultParams = fstConfig.getDefaultParameters();
        String fstName = defaultParams.get(IndexConfiguration.PARAM_FST);
        String indexField = defaultParams.get(IndexConfiguration.PARAM_FIELD);
        String storeField = defaultParams.get(IndexConfiguration.PARAM_STORE_FIELD);
        if(storeField == null){
            //apply indexField as default if indexField is NOT NULL
            //NOTE: this is done before the default for the indexField is
            //      applied. So storeField stays null if no field is configured
            storeField = indexField;
        }
        if(indexField == null){ //apply the defaults if null
            indexField = IndexConfiguration.DEFAULT_FIELD;
        }
        if(fstName == null){ //use default
            fstName = getDefaultFstFileName(indexField);
        }
        //This are all fields actually present in the index (distinguished with
        //those defined in the schema). This also includes actual instances of
        //dynamic field definition in the schema.
        FieldInfos fieldInfos = indexReader.getFieldInfos(); //we need this twice
        //(1) in case the fstConfig uses a wildcard we need to search for
        //    languages present in the SolrIndex. For that we use the indexReader
        //    to get the FieldInfos and match them against FST files in the FST
        //    directory and FieldType definitions in the schema of the SolrCore
        //NOTE: this needs only do be done if wildcards are enabled in the fstConfig
        if(fstConfig.useWildcard()){
            //iterate over the fields in the Solr index and search for
            //matches against the configured indexField name
            //NOTE: the presence of FST files is checked for each matching
            //      field. No need to scan the FST directory upfront
            String fieldWildcard = FieldEncodingEnum.encodeLanguage(indexField,
                fieldEncoding, "*");
            for(FieldInfo fieldInfo : fieldInfos){
                //try to match the field names against the wildcard
                if(FilenameUtils.wildcardMatch(fieldInfo.name, fieldWildcard)){
                    //for matches parse the language from the field name
                    String language = FieldEncodingEnum.parseLanguage(
                        fieldInfo.name, fieldEncoding, indexField);
                    if(language != null && //successfully parsed language
                            //is current language is enabled?
                            fstConfig.isLanguage(language) &&
                            //is there no explicit configuration for this language?
                            !fstConfig.getExplicitlyIncluded().contains(language)){
                        //generate the FST file name
                        StringBuilder fstFileName = new StringBuilder(fstName);
                        if(!language.isEmpty()){
                            fstFileName.append('.').append(language);
                        }
                        fstFileName.append(".fst");
                        File fstFile = new File(fstDirectory,fstFileName.toString());
                        //get the FieldType of the field from the Solr schema
                        FieldType fieldType = schema.getFieldTypeNoEx(fieldInfo.name);
                        if(fieldType != null){ //if the fieldType is present
                            if(runtimeGeneration || fstFile.isFile()){ //and FST is present or can be created
                                //we need also to check if the stored field with
                                //the labels is present
                                //get the stored Field and check if it is present!
                                String storeFieldName;
                                if(storeField == null){ //storeField == indexField
                                    storeFieldName = fieldInfo.name;
                                } else { // check that the storeField is present in the index
                                    storeFieldName = FieldEncodingEnum.encodeLanguage(
                                        storeField, fieldEncoding, language);
                                    FieldInfo storedFieldInfos = fieldInfos.fieldInfo(storeFieldName);
                                    if(storedFieldInfos == null){
                                        log.debug(" ... ignore language {} because Stored Field {} "
                                                + "for IndexField {} does not exist! ", new Object[]{
                                                language,storeFieldName,fieldInfo.name});
                                        storeFieldName = null;
                                    }
                                }
                                if(storeFieldName != null){ // == valid configuration
                                    CorpusInfo fstInfo = corpusInfosCopy.get(language);
                                    if(fstInfo == null || //new one
                                            !fstInfo.indexedField.equals(fieldInfo.name) || //index field compatible
                                            !fstInfo.storedField.equals(storeFieldName)){ //store field compatible
                                        CorpusInfo newFstInfo = new CorpusInfo(language,
                                            fieldInfo.name, storeFieldName,
                                            fieldType, fstFile, runtimeGeneration);
                                        log.debug(" ... {} {} ", fstInfo == null ? "create" : "update", newFstInfo);
                                        addCorpusInfo(newFstInfo);
                                        corpusInfosCopy.put(language, newFstInfo);
                                    } else { //no change in the SolrIndex ... use the existing CorpusInfo
                                        addCorpusInfo(fstInfo);
                                    }
                                    foundCorpus = true;
                                }
                            } else {
                                log.debug(" ... ignore language {} (field: {}) because "
                                    + "FST file '{}' does not exist and runtime creation "
                                    + "is deactivated!",new Object[]{ language,
                                            fieldInfo.name, fstFile.getAbsolutePath()});
                            }
                        } else {
                            log.debug(" ... ignore language {} becuase unknown fieldtype "
                                + "for SolrFied {}",language,fieldInfo.name);
                        }
                    } //else the field matched the wildcard, but has not passed the
                    //encoding test.
                } //Solr field does not match the field definition in the config
            } // end iterate over all fields in the SolrIndex
        } //else Wildcard not enabled in the fstConfig
        //(2) process explicit configuration for configured languages
        for(String language : fstConfig.getExplicitlyIncluded()){
            //(2.a) get the language specific config (with fallback to default)
            Map<String,String> config = fstConfig.getParameters(language);
            String langIndexField = config.get(IndexConfiguration.PARAM_FIELD);
            String langStoreField = config.get(IndexConfiguration.PARAM_STORE_FIELD);
            //NOTE(review): an explicitly configured value is used as the
            //     complete file name (no language or '.fst' is appended) -
            //     TODO confirm that this is intended for values inherited
            //     from the default configuration
            String langFstFileName = config.get(IndexConfiguration.PARAM_FST);
            final boolean langAllowCreation;
            final String langAllowCreationString = config.get(IndexConfiguration.PARAM_RUNTIME_GENERATION);
            if(langIndexField != null){
                //also consider explicit field names as default for the fst name
                if(langFstFileName == null){
                    StringBuilder fileName = new StringBuilder(
                        getDefaultFstFileName(langIndexField));
                    if(!language.isEmpty()){
                        fileName.append('.').append(language);
                    }
                    fileName.append(".fst");
                    langFstFileName = fileName.toString();
                }
            } else {
                langIndexField = indexField;
            }
            if(langStoreField == null){ //fallbacks
                if(storeField != null){ //first to default store field
                    langStoreField = storeField;
                } else { //else to the lang index field
                    langStoreField = langIndexField;
                }
            }
            if(langFstFileName == null){ //no fstFileName config
                // ... use the default. The language is only added if not
                // empty (same naming as used for the other FST files)
                StringBuilder fileName = new StringBuilder(fstName);
                if(!language.isEmpty()){
                    fileName.append('.').append(language);
                }
                fileName.append(".fst");
                langFstFileName = fileName.toString();
            }
            if(langAllowCreationString != null){
                langAllowCreation = Boolean.parseBoolean(langAllowCreationString);
            } else {
                langAllowCreation = runtimeGeneration;
            }
            //(2.b) check if the Solr field is present
            String encodedLangIndexField = FieldEncodingEnum.encodeLanguage(
                langIndexField, fieldEncoding, language);
            String encodedLangStoreField = FieldEncodingEnum.encodeLanguage(
                langStoreField, fieldEncoding, language);
            FieldInfo langIndexFieldInfo = fieldInfos.fieldInfo(encodedLangIndexField);
            if(langIndexFieldInfo != null){
                FieldInfo langStoreFieldInfo = fieldInfos.fieldInfo(encodedLangStoreField);
                if(langStoreFieldInfo != null){
                    FieldType fieldType = schema.getFieldTypeNoEx(langIndexFieldInfo.name);
                    if(fieldType != null){
                        //(2.c) check the FST file
                        File langFstFile = new File(fstDirectory,langFstFileName);
                        if(langFstFile.isFile() || langAllowCreation){
                            CorpusInfo langFstInfo = corpusInfosCopy.get(language);
                            if(langFstInfo == null || //new one
                                    !langFstInfo.indexedField.equals(encodedLangIndexField) || //index field compatible
                                    !langFstInfo.storedField.equals(encodedLangStoreField)){ //store field compatible
                                CorpusInfo newLangFstInfo = new CorpusInfo(language,
                                    encodedLangIndexField,encodedLangStoreField,
                                    fieldType, langFstFile, langAllowCreation);
                                log.debug("   ... {} {} for explicitly configured language",
                                    langFstInfo == null ? "create" : "update", newLangFstInfo);
                                addCorpusInfo(newLangFstInfo);
                            } else { //we can use the existing instance
                                addCorpusInfo(langFstInfo);
                            }
                            foundCorpus = true;
                        } else {
                            log.debug(" ... ignore explicitly configured language {} (field: {}) because "
                                    + "FST file '{}' does not exist and runtime creation "
                                    + "is deactivated!",new Object[]{ language,
                                            langIndexFieldInfo.name, langFstFile.getAbsolutePath()});
                        }
                    } else {
                        log.debug(" ... ignore explicitly configured language {} becuase unknown fieldtype "
                                + "for SolrFied {}", language, langIndexFieldInfo.name);
                    }
                } else {
                    //log the encoded name as this is the field that was
                    //looked up in the index
                    log.debug(" ... ignore explicitly configured language {} because configured stored Field {} "
                            + "for IndexField {} does not exist! ", new Object[]{
                            language,encodedLangStoreField,langIndexFieldInfo.name});
                }
            } else {
                log.debug(" ... ignore explicitly configured language {} because configured field {} (encoded: {}) "
                    + "is not present in the SolrIndex!", new Object[]{
                            language, langIndexField, encodedLangIndexField });
            }
        }
        //the index was processed successfully ... remember the version
        this.indexVersion = indexVersion;
    } finally {
        corpusInfoLock.writeLock().unlock();
    }
    return foundCorpus;
}
/**
 * Getter for the default FST file name based on the configured field
 * name. This method returns the '<code>{name}</code>' part of the
 * '<code>{name}.{lang}.fst</code>' name.
 * <p>
 * Letters and digits of the field name are kept. Every other code point
 * is replaced by a single '<code>_</code>'. The field name is processed
 * by code point, so that letters and digits outside of the Basic
 * Multilingual Plane (encoded as surrogate pairs) are kept as a whole.
 * @param fstFieldName the field name. MUST NOT be <code>null</code>
 * @return the '<code>{name}</code>' part of the'<code>{name}.{lang}.fst</code>' name
 */
private String getDefaultFstFileName(final String fstFieldName) {
    final int length = fstFieldName.length();
    final StringBuilder escaped = new StringBuilder(length);
    int index = 0;
    while(index < length){
        final int codepoint = fstFieldName.codePointAt(index);
        if(Character.isLetterOrDigit(codepoint)){
            escaped.appendCodePoint(codepoint);
        } else {
            escaped.append('_');
        }
        //advance by the number of chars used by the code point. Advancing
        //by one would process the low surrogate of a pair a second time
        index += Character.charCount(codepoint);
    }
    return escaped.toString();
}
/**
 * If alternate tokens (<code>posInc == 0</code>) can be skipped or if such
 * tokens should cause an {@link UnsupportedTokenException}. See
 * {@link #SKIP_ALT_TOKENS} for details. The constructor enables this for
 * {@link FieldEncodingEnum#SolrYard} and disables it for all other field
 * encodings.
 * @return the skip alternate tokens state
 */
public boolean isSkipAltTokens() {
return skipAltTokens;
}
/**
 * Setter for the skip alternate tokens state. This overrides the default
 * set by the constructor based on the field encoding. See
 * {@link #SKIP_ALT_TOKENS} for details.
 * @param skipAltTokens the state
 */
public void setSkipAltTokens(boolean skipAltTokens) {
this.skipAltTokens = skipAltTokens;
}
}