blob: 96d6d62fbd5337e35572dd691975bebeb66f5d47 [file] [log] [blame]
package org.apache.solr.update.processor;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.update.AddUpdateCommand;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.regex.Pattern;
/**
* Identifies the language of a set of input fields.
* Also supports mapping of field names based
* on detected language.
* <p>
* See <a href="http://wiki.apache.org/solr/LanguageDetection">http://wiki.apache.org/solr/LanguageDetection</a>
* @since 3.5
* @lucene.experimental
*/
public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestProcessor implements LangIdParams {
protected final static Logger log = LoggerFactory
.getLogger(LanguageIdentifierUpdateProcessor.class);
protected boolean enabled;
protected String[] inputFields = {};
protected String[] mapFields = {};
protected Pattern mapPattern;
protected String mapReplaceStr;
protected String langField;
protected String langsField; // MultiValued, contains all languages detected
protected String docIdField;
protected String fallbackValue;
protected String[] fallbackFields = {};
protected boolean enableMapping;
protected boolean mapKeepOrig;
protected boolean overwrite;
protected boolean mapOverwrite;
protected boolean mapIndividual;
protected boolean enforceSchema;
protected double threshold;
protected HashSet<String> langWhitelist;
protected HashSet<String> mapIndividualFieldsSet;
protected HashSet<String> allMapFieldsSet;
protected HashMap<String,String> lcMap;
protected IndexSchema schema;
// Regex patterns
protected final Pattern tikaSimilarityPattern = Pattern.compile(".*\\((.*?)\\)");
protected final Pattern langPattern = Pattern.compile("\\{lang\\}");
public LanguageIdentifierUpdateProcessor(SolrQueryRequest req,
SolrQueryResponse rsp, UpdateRequestProcessor next) {
super(next);
schema = req.getSchema();
initParams(req.getParams());
}
private void initParams(SolrParams params) {
if (params != null) {
// Document-centric langId params
setEnabled(params.getBool(LANGUAGE_ID, true));
if(params.get(FIELDS_PARAM, "").length() > 0) {
inputFields = params.get(FIELDS_PARAM, "").split(",");
}
langField = params.get(LANG_FIELD, DOCID_LANGFIELD_DEFAULT);
langsField = params.get(LANGS_FIELD, DOCID_LANGSFIELD_DEFAULT);
SchemaField uniqueKeyField = schema.getUniqueKeyField();
docIdField = params.get(DOCID_PARAM, uniqueKeyField == null ? DOCID_FIELD_DEFAULT : uniqueKeyField.getName());
fallbackValue = params.get(FALLBACK);
if(params.get(FALLBACK_FIELDS, "").length() > 0) {
fallbackFields = params.get(FALLBACK_FIELDS).split(",");
}
overwrite = params.getBool(OVERWRITE, false);
langWhitelist = new HashSet<String>();
threshold = params.getDouble(THRESHOLD, DOCID_THRESHOLD_DEFAULT);
if(params.get(LANG_WHITELIST, "").length() > 0) {
for(String lang : params.get(LANG_WHITELIST, "").split(",")) {
langWhitelist.add(lang);
}
}
// Mapping params (field centric)
enableMapping = params.getBool(MAP_ENABLE, false);
if(params.get(MAP_FL, "").length() > 0) {
mapFields = params.get(MAP_FL, "").split(",");
} else {
mapFields = inputFields;
}
mapKeepOrig = params.getBool(MAP_KEEP_ORIG, false);
mapOverwrite = params.getBool(MAP_OVERWRITE, false);
mapIndividual = params.getBool(MAP_INDIVIDUAL, false);
// Process individual fields
String[] mapIndividualFields = {};
if(params.get(MAP_INDIVIDUAL_FL, "").length() > 0) {
mapIndividualFields = params.get(MAP_INDIVIDUAL_FL, "").split(",");
} else {
mapIndividualFields = mapFields;
}
mapIndividualFieldsSet = new HashSet<String>(Arrays.asList(mapIndividualFields));
// Compile a union of the lists of fields to map
allMapFieldsSet = new HashSet<String>(Arrays.asList(mapFields));
if(Arrays.equals(mapFields, mapIndividualFields)) {
allMapFieldsSet.addAll(mapIndividualFieldsSet);
}
// Language Code mapping
lcMap = new HashMap<String,String>();
if(params.get(MAP_LCMAP) != null) {
for(String mapping : params.get(MAP_LCMAP).split("[, ]")) {
String[] keyVal = mapping.split(":");
if(keyVal.length == 2) {
lcMap.put(keyVal[0], keyVal[1]);
} else {
log.error("Unsupported format for langid.map.lcmap: "+mapping+". Skipping this mapping.");
}
}
}
enforceSchema = params.getBool(ENFORCE_SCHEMA, true);
mapPattern = Pattern.compile(params.get(MAP_PATTERN, MAP_PATTERN_DEFAULT));
mapReplaceStr = params.get(MAP_REPLACE, MAP_REPLACE_DEFAULT);
}
log.debug("LangId configured");
if (inputFields.length == 0) {
throw new SolrException(ErrorCode.BAD_REQUEST,
"Missing or faulty configuration of LanguageIdentifierUpdateProcessor. Input fields must be specified as a comma separated list");
}
}
@Override
public void processAdd(AddUpdateCommand cmd) throws IOException {
if (isEnabled()) {
process(cmd.getSolrInputDocument());
} else {
log.debug("Processor not enabled, not running");
}
super.processAdd(cmd);
}
/**
* This is the main, testable process method called from processAdd()
* @param doc the SolrInputDocument to work on
* @return the modified SolrInputDocument
*/
protected SolrInputDocument process(SolrInputDocument doc) {
String docLang = null;
HashSet<String> docLangs = new HashSet<String>();
String fallbackLang = getFallbackLang(doc, fallbackFields, fallbackValue);
if(langField == null || !doc.containsKey(langField) || (doc.containsKey(langField) && overwrite)) {
String allText = concatFields(doc, inputFields);
List<DetectedLanguage> languagelist = detectLanguage(allText);
docLang = resolveLanguage(languagelist, fallbackLang);
docLangs.add(docLang);
log.debug("Detected main document language from fields "+inputFields+": "+docLang);
if(doc.containsKey(langField) && overwrite) {
log.debug("Overwritten old value "+doc.getFieldValue(langField));
}
if(langField != null && langField.length() != 0) {
doc.setField(langField, docLang);
}
} else {
// langField is set, we sanity check it against whitelist and fallback
docLang = resolveLanguage((String) doc.getFieldValue(langField), fallbackLang);
docLangs.add(docLang);
log.debug("Field "+langField+" already contained value "+docLang+", not overwriting.");
}
if(enableMapping) {
for (String fieldName : allMapFieldsSet) {
if(doc.containsKey(fieldName)) {
String fieldLang;
if(mapIndividual && mapIndividualFieldsSet.contains(fieldName)) {
String text = (String) doc.getFieldValue(fieldName);
List<DetectedLanguage> languagelist = detectLanguage(text);
fieldLang = resolveLanguage(languagelist, docLang);
docLangs.add(fieldLang);
log.debug("Mapping field "+fieldName+" using individually detected language "+fieldLang);
} else {
fieldLang = docLang;
log.debug("Mapping field "+fieldName+" using document global language "+fieldLang);
}
String mappedOutputField = getMappedField(fieldName, fieldLang);
if (mappedOutputField != null) {
log.debug("Mapping field {} to {}", doc.getFieldValue(docIdField), fieldLang);
SolrInputField inField = doc.getField(fieldName);
doc.setField(mappedOutputField, inField.getValue(), inField.getBoost());
if(!mapKeepOrig) {
log.debug("Removing old field {}", fieldName);
doc.removeField(fieldName);
}
} else {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Invalid output field mapping for "
+ fieldName + " field and language: " + fieldLang);
}
}
}
}
// Set the languages field to an array of all detected languages
if(langsField != null && langsField.length() != 0) {
doc.setField(langsField, docLangs.toArray());
}
return doc;
}
/**
* Decides the fallback language, either from content of fallback field or fallback value
* @param doc the Solr document
* @param fallbackFields an array of strings with field names containing fallback language codes
* @param fallbackValue a language code to use in case no fallbackFields are found
*/
private String getFallbackLang(SolrInputDocument doc, String[] fallbackFields, String fallbackValue) {
String lang = null;
for(String field : fallbackFields) {
if(doc.containsKey(field)) {
lang = (String) doc.getFieldValue(field);
log.debug("Language fallback to field "+field);
break;
}
}
if(lang == null) {
log.debug("Language fallback to value "+fallbackValue);
lang = fallbackValue;
}
return lang;
}
/*
* Concatenates content from multiple fields
*/
protected String concatFields(SolrInputDocument doc, String[] fields) {
StringBuffer sb = new StringBuffer();
for (String fieldName : inputFields) {
log.debug("Appending field "+fieldName);
if (doc.containsKey(fieldName)) {
Object content = doc.getFieldValue(fieldName);
if(content instanceof String) {
sb.append((String) doc.getFieldValue(fieldName));
sb.append(" ");
} else {
log.warn("Field "+fieldName+" not a String value, not including in detection");
}
}
}
return sb.toString();
}
/**
* Detects language(s) from a string.
* Classes wishing to implement their own language detection module should override this method.
* @param content The content to identify
* @return List of detected language(s) according to RFC-3066
*/
protected abstract List<DetectedLanguage> detectLanguage(String content);
/**
* Chooses a language based on the list of candidates detected
* @param language language code as a string
* @param fallbackLang the language code to use as a fallback
* @return a string of the chosen language
*/
protected String resolveLanguage(String language, String fallbackLang) {
List<DetectedLanguage> l = new ArrayList<DetectedLanguage>();
l.add(new DetectedLanguage(language, 1.0));
return resolveLanguage(l, fallbackLang);
}
/**
* Chooses a language based on the list of candidates detected
* @param languages a List of DetectedLanguages with certainty score
* @param fallbackLang the language code to use as a fallback
* @return a string of the chosen language
*/
protected String resolveLanguage(List<DetectedLanguage> languages, String fallbackLang) {
String langStr;
if(languages.size() == 0) {
log.debug("No language detected, using fallback {}", fallbackLang);
langStr = fallbackLang;
} else {
DetectedLanguage lang = languages.get(0);
if(langWhitelist.isEmpty() || langWhitelist.contains(lang.getLangCode())) {
log.debug("Language detected {} with certainty {}", lang.getLangCode(), lang.getCertainty());
if(lang.getCertainty() >= threshold) {
langStr = lang.getLangCode();
} else {
log.debug("Detected language below threshold {}, using fallback {}", threshold, fallbackLang);
langStr = fallbackLang;
}
} else {
log.debug("Detected a language not in whitelist ({}), using fallback {}", lang.getLangCode(), fallbackLang);
langStr = fallbackLang;
}
}
if(langStr == null || langStr.length() == 0) {
log.warn("Language resolved to null or empty string. Fallback not configured?");
langStr = "";
}
return langStr;
}
/**
* Returns the name of the field to map the current contents into, so that they are properly analyzed. For instance
* if the currentField is "text" and the code is "en", the new field would by default be "text_en".
* This method also performs custom regex pattern replace if configured. If enforceSchema=true
* and the resulting field name doesn't exist, then null is returned.
*
* @param currentField The current field name
* @param language the language code
* @return The new schema field name, based on pattern and replace, or null if illegal
*/
protected String getMappedField(String currentField, String language) {
String lc = lcMap.containsKey(language) ? lcMap.get(language) : language;
String newFieldName = langPattern.matcher(mapPattern.matcher(currentField).replaceFirst(mapReplaceStr)).replaceFirst(lc);
if(enforceSchema && schema.getFieldOrNull(newFieldName) == null) {
log.warn("Unsuccessful field name mapping from {} to {}, field does not exist and enforceSchema=true; skipping mapping.", currentField, newFieldName);
return null;
} else {
log.debug("Doing mapping from "+currentField+" with language "+language+" to field "+newFieldName);
}
return newFieldName;
}
/**
* Tells if this processor is enabled or not
* @return true if enabled, else false
*/
public boolean isEnabled() {
return enabled;
}
public void setEnabled(boolean enabled) {
this.enabled = enabled;
}
}