blob: ba691329de488ac78c697bc1c9fd0bc166eae2dd [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.lucenefstlinking;
import org.apache.commons.lang.StringUtils;
/**
 * Enumeration that describes supported Solr Field encodings for multiple
 * languages. In Solr this might be represented by multiple explicitly
 * defined fields, or by a dynamic field configuration. With the Lucene FST
 * based entity linking engine this is used to allow the configuration
 * of a single field (e.g. rdfs:label) and an algorithm based generation
 * of field names for specific languages (e.g. "rdfs:label@en",
 * "rdfs:label@de", "rdfs:label@de-AT", ...).
 * <p>
 * This enumeration also defines some static utility methods for encoding
 * and decoding of field names. All of them treat a <code>null</code>
 * encoding like {@link #None}.
 *
 * @author Rupert Westenthaler
 *
 */
public enum FieldEncodingEnum {
    /**
     * No encoding is used. This will require to explicitly set a field for
     * all processed languages.
     */
    None,
    /**
     * The encoding as used by the SolrYard implementation of the Entityhub
     * ('<code>@{lang}/{field-name}/</code>' for language specific fields and
     * '<code>{type}/{field-name}/</code>' for typed fields)
     */
    SolrYard,
    /**
     * '<code>{lang}-{field-name}</code>' encoding
     */
    MinusPrefix(true, false, '-'),
    /**
     * '<code>{lang}_{field-name}</code>' encoding
     */
    UnderscorePrefix(true, false, '_'),
    /**
     * '<code>{field-name}-{lang}</code>' encoding
     */
    MinusSuffix(false, true, '-'),
    /**
     * '<code>{field-name}_{lang}</code>' encoding
     */
    UnderscoreSuffix(false, true, '_'),
    /**
     * '<code>{lang}@{field-name}</code>' encoding
     */
    AtPrefix(true, false, '@'),
    /**
     * '<code>{field-name}@{lang}</code>' encoding
     */
    AtSuffix(false, true, '@');

    /** <code>true</code> if the language is written in front of the field name */
    private final boolean prefix;
    /** <code>true</code> if the language is written after the field name */
    private final boolean suffix;
    /** the character separating the language from the field name */
    private final char sep;

    /**
     * Used by the constants that do not follow the generic
     * prefix/suffix scheme ({@link #None} and {@link #SolrYard}).
     */
    private FieldEncodingEnum() {
        this(false, false, ' ');
    }

    private FieldEncodingEnum(boolean prefix, boolean suffix, char sep) {
        this.prefix = prefix;
        this.suffix = suffix;
        this.sep = sep;
    }

    /**
     * Encodes a Solr index field holding {@link Double} values.
     * @param field the field (as configured by the user)
     * @param encoding the Encoding
     * @return '<code>dou/{field}/</code>' if the encoding is
     * {@link FieldEncodingEnum#SolrYard}. Otherwise the parsed field is
     * returned unchanged.
     */
    public static String encodeDouble(String field, FieldEncodingEnum encoding) {
        return encodeTyped("dou", field, encoding);
    }

    /**
     * Encodes a Solr index field holding {@link Float} values.
     * @param field the field (as configured by the user)
     * @param encoding the Encoding
     * @return '<code>flo/{field}/</code>' if the encoding is
     * {@link FieldEncodingEnum#SolrYard}. Otherwise the parsed field is
     * returned unchanged.
     */
    public static String encodeFloat(String field, FieldEncodingEnum encoding) {
        return encodeTyped("flo", field, encoding);
    }

    /**
     * Encodes a Solr index field holding URIs.
     * @param field the field (as configured by the user)
     * @param encoding the Encoding
     * @return '<code>ref/{field}/</code>' if the encoding is
     * {@link FieldEncodingEnum#SolrYard}. Otherwise the parsed field is
     * returned unchanged.
     */
    public static String encodeUri(String field, FieldEncodingEnum encoding) {
        return encodeTyped("ref", field, encoding);
    }

    /**
     * Shared implementation of the typed field encoders: builds
     * '<code>{typePrefix}/{field}/</code>' for {@link #SolrYard} and returns
     * the field unchanged for every other (including <code>null</code>)
     * encoding.
     */
    private static String encodeTyped(String typePrefix, String field, FieldEncodingEnum encoding) {
        if (encoding != SolrYard) {
            return field;
        }
        return new StringBuilder(typePrefix).append('/').append(field).append('/').toString();
    }

    /**
     * Encodes the parsed field and language based on the parsed encoding.
     * @param field the field
     * @param encoding the encoding. <code>null</code> is treated like
     * {@link #None}
     * @param language the language. <code>null</code> and the empty string
     * both represent "no language": prefix/suffix encodings then return the
     * plain field and {@link #SolrYard} returns '<code>@/{field}/</code>'
     * @return the encoded field
     */
    public static String encodeLanguage(String field, FieldEncodingEnum encoding, String language) {
        if (encoding == null || encoding == None) {
            return field;
        }
        final boolean hasLanguage = language != null && !language.isEmpty();
        StringBuilder encoded = new StringBuilder();
        if (encoding == SolrYard) {
            encoded.append('@');
            if (hasLanguage) {
                // appending a null language would emit the literal "null"
                encoded.append(language);
            }
            return encoded.append('/').append(field).append('/').toString();
        }
        if (encoding.prefix && hasLanguage) {
            encoded.append(language).append(encoding.sep);
        }
        encoded.append(field);
        if (encoding.suffix && hasLanguage) {
            encoded.append(encoding.sep).append(language);
        }
        return encoded.toString();
    }

    /**
     * Extracts the language from an encoded field name. This is the inverse
     * of {@link #encodeLanguage(String, FieldEncodingEnum, String)}.
     * @param value the encoded field name (e.g. as read from the index)
     * @param encoding the encoding. <code>null</code> is treated like
     * {@link #None}
     * @param field the (not encoded) field the value is expected to refer to
     * @return the language encoded in the value. The empty string if the
     * value represents the field without a language. <code>null</code> if
     * the value is not an encoding of the parsed field, if the encoding is
     * {@link #None} or if any of the parameters is <code>null</code>.
     */
    public static String parseLanguage(String value, FieldEncodingEnum encoding, String field) {
        if (value == null || field == null || encoding == null || encoding == None) {
            return null;
        }
        if (encoding == SolrYard) {
            // expect @{lang}/{field}/
            String tail = "/" + field + "/";
            if (value.isEmpty() || value.charAt(0) != '@' || !value.endsWith(tail)) {
                return null; // no match
            }
            String language = value.substring(1, value.length() - tail.length());
            // a '/' within the language section means that the value
            // refers to an other (longer) field name
            return language.indexOf('/') < 0 ? language : null;
        }
        // number of chars of the value that are not part of the field name
        final int extra = value.length() - field.length();
        if (encoding.prefix) {
            if (!value.endsWith(field)) {
                return null; // no match
            }
            if (extra == 0) {
                return ""; // the plain field: no language encoded
            }
            if (value.charAt(extra - 1) != encoding.sep) {
                return null; // an other field that just ends with the same chars
            }
            // just subtract the field and the sep from the value
            return value.substring(0, extra - 1);
        }
        if (encoding.suffix) {
            if (!value.startsWith(field)) {
                return null; // no match
            }
            if (extra == 0) {
                return ""; // the plain field: no language encoded
            }
            if (value.charAt(field.length()) != encoding.sep) {
                return null; // an other field that just starts with the same chars
            }
            // just cut the field and the sep from the value
            return value.substring(field.length() + 1);
        }
        return null;
    }
}