| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.solr.update.processor; |
| |
| import java.lang.invoke.MethodHandles; |
| import java.util.HashMap; |
| import java.util.Map; |
| |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.document.FieldType; |
| import org.apache.lucene.index.IndexableField; |
| import org.apache.solr.common.SolrInputField; |
| import org.apache.solr.common.util.NamedList; |
| import org.apache.solr.core.SolrCore; |
| import org.apache.solr.request.SolrQueryRequest; |
| import org.apache.solr.response.SolrQueryResponse; |
| import org.apache.solr.schema.IndexSchema; |
| import org.apache.solr.schema.JsonPreAnalyzedParser; |
| import org.apache.solr.schema.PreAnalyzedField; |
| import org.apache.solr.schema.PreAnalyzedField.PreAnalyzedParser; |
| import org.apache.solr.schema.SchemaField; |
| import org.apache.solr.schema.SimplePreAnalyzedParser; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| /** |
| * <p>An update processor that parses configured fields of any document being added |
| * using {@link PreAnalyzedField} with the configured format parser.</p> |
| * |
| * <p>Fields are specified using the same patterns as in {@link FieldMutatingUpdateProcessorFactory}. |
| * They are then checked whether they follow a pre-analyzed format defined by <code>parser</code>. |
| * Valid fields are then parsed. The original {@link SchemaField} is used for the initial |
| * creation of {@link IndexableField}, which is then modified to add the results from |
| * parsing (token stream value and/or string value) and then it will be directly added to |
| * the final Lucene {@link Document} to be indexed.</p> |
| * <p>Fields that are declared in the patterns list but are not present |
| * in the current schema will be removed from the input document.</p> |
| * <h3>Implementation details</h3> |
| * <p>This update processor uses {@link PreAnalyzedParser} |
| * to parse the original field content (interpreted as a string value), and thus |
| * obtain the stored part and the token stream part. Then it creates the "template" |
| * {@link Field}-s using the original {@link SchemaField#createFields(Object)} |
| * as declared in the current schema. Finally it sets the pre-analyzed parts if |
| * available (string value and the token |
| * stream value) on the first field of these "template" fields. If the declared |
| * field type does not support stored or indexed parts then such parts are silently |
| * discarded. Finally the updated "template" {@link Field}-s are added to the resulting |
| * {@link SolrInputField}, and the original value of that field is removed.</p> |
| * <h3>Example configuration</h3> |
| * <p>In the example configuration below there are two update chains, one that |
| * uses the "simple" parser ({@link SimplePreAnalyzedParser}) and one that uses |
| * the "json" parser ({@link JsonPreAnalyzedParser}). Field "nonexistent" will be |
| * removed from input documents if not present in the schema. Other fields will be |
| * analyzed and if valid they will be converted to {@link IndexableField}-s or if |
| * they are not in a valid format that can be parsed with the selected parser they |
| * will be passed as-is. Assuming that <code>ssto</code> field is stored but not |
| * indexed, and <code>sind</code> field is indexed but not stored: if |
| * <code>ssto</code> input value contains the indexed part then this part will |
| * be discarded and only the stored value part will be retained. Similarly, |
| * if <code>sind</code> input value contains the stored part then it |
| * will be discarded and only the token stream part will be retained.</p> |
| * |
| * <pre class="prettyprint"> |
| * <updateRequestProcessorChain name="pre-analyzed-simple"> |
| * <processor class="solr.PreAnalyzedUpdateProcessorFactory"> |
| * <str name="fieldName">title</str> |
| * <str name="fieldName">nonexistent</str> |
| * <str name="fieldName">ssto</str> |
| * <str name="fieldName">sind</str> |
| * <str name="parser">simple</str> |
| * </processor> |
| * <processor class="solr.RunUpdateProcessorFactory" /> |
| * </updateRequestProcessorChain> |
| * |
| * <updateRequestProcessorChain name="pre-analyzed-json"> |
| * <processor class="solr.PreAnalyzedUpdateProcessorFactory"> |
| * <str name="fieldName">title</str> |
| * <str name="fieldName">nonexistent</str> |
| * <str name="fieldName">ssto</str> |
| * <str name="fieldName">sind</str> |
| * <str name="parser">json</str> |
| * </processor> |
| * <processor class="solr.RunUpdateProcessorFactory" /> |
| * </updateRequestProcessorChain> |
| * </pre> |
| * |
| * @since 4.3.0 |
| */ |
| public class PreAnalyzedUpdateProcessorFactory extends FieldMutatingUpdateProcessorFactory { |
| |
| private PreAnalyzedField parser; |
| private String parserImpl; |
| |
| @Override |
| public void init(@SuppressWarnings({"rawtypes"})final NamedList args) { |
| parserImpl = (String)args.get("parser"); |
| args.remove("parser"); |
| // initialize inclusion / exclusion patterns |
| super.init(args); |
| } |
| |
| @Override |
| public UpdateRequestProcessor getInstance(SolrQueryRequest req, |
| SolrQueryResponse rsp, UpdateRequestProcessor next) { |
| return new PreAnalyzedUpdateProcessor(getSelector(), next, req.getSchema(), parser); |
| } |
| |
| @Override |
| public void inform(SolrCore core) { |
| super.inform(core); |
| parser = new PreAnalyzedField(); |
| Map<String,String> args = new HashMap<>(); |
| if (parserImpl != null) { |
| args.put(PreAnalyzedField.PARSER_IMPL, parserImpl); |
| } |
| parser.init(core.getLatestSchema(), args); |
| } |
| } |
| |
| class PreAnalyzedUpdateProcessor extends FieldMutatingUpdateProcessor { |
| |
| private PreAnalyzedField parser; |
| private IndexSchema schema; |
| |
| private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); |
| |
| public PreAnalyzedUpdateProcessor(FieldNameSelector sel, UpdateRequestProcessor next, IndexSchema schema, PreAnalyzedField parser) { |
| super(sel, next); |
| this.schema = schema; |
| this.parser = parser; |
| } |
| |
| @Override |
| protected SolrInputField mutate(SolrInputField src) { |
| SchemaField sf = schema.getFieldOrNull(src.getName()); |
| if (sf == null) { // remove this field |
| return null; |
| } |
| FieldType type = PreAnalyzedField.createFieldType(sf); |
| if (type == null) { // neither indexed nor stored - skip |
| return null; |
| } |
| SolrInputField res = new SolrInputField(src.getName()); |
| for (Object o : src) { |
| if (o == null) { |
| continue; |
| } |
| Field pre = (Field)parser.createField(sf, o); |
| if (pre != null) { |
| res.addValue(pre); |
| } else { // restore the original value |
| log.warn("Could not parse field {} - using original value as is: {}", src.getName(), o); |
| res.addValue(o); |
| } |
| } |
| return res; |
| } |
| } |