| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.solr.update; |
| |
| import java.util.Iterator; |
| import java.util.LinkedList; |
| import java.util.List; |
| import java.util.Set; |
| |
| import com.google.common.collect.Sets; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.NumericDocValuesField; |
| import org.apache.lucene.index.IndexableField; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.solr.common.SolrDocumentBase; |
| import org.apache.solr.common.SolrException; |
| import org.apache.solr.common.SolrInputDocument; |
| import org.apache.solr.common.SolrInputField; |
| import org.apache.solr.schema.CopyField; |
| import org.apache.solr.schema.IndexSchema; |
| import org.apache.solr.schema.SchemaField; |
| |
| /** |
| * Builds a Lucene {@link Document} from a {@link SolrInputDocument}. |
| */ |
| public class DocumentBuilder { |
| |
| // accessible only for tests |
| static int MIN_LENGTH_TO_MOVE_LAST = Integer.getInteger("solr.docBuilder.minLengthToMoveLast", 4*1024); // internal setting |
| |
| /** |
| * Add a field value to a given document. |
| * @param doc Document that the field needs to be added to |
| * @param field The schema field object for the field |
| * @param val The value for the field to be added |
| * @param forInPlaceUpdate Whether the field is to be added for in-place update. If true, |
| * only numeric docValues based fields are added to the document. This can be true |
| * when constructing a Lucene document for writing an in-place update, and we don't need |
| * presence of non-updatable fields (non NDV) in such a document. |
| */ |
| private static void addField(Document doc, SchemaField field, Object val, |
| boolean forInPlaceUpdate) { |
| if (val instanceof IndexableField) { |
| if (forInPlaceUpdate) { |
| assert val instanceof NumericDocValuesField: "Expected in-place update to be done on" |
| + " NDV fields only."; |
| } |
| doc.add((IndexableField)val); |
| return; |
| } |
| for (IndexableField f : field.getType().createFields(field, val)) { |
| if (f != null) { // null fields are not added |
| // HACK: workaround for SOLR-9809 |
| // even though at this point in the code we know the field is single valued and DV only |
| // TrieField.createFields() may still return (usless) IndexableField instances that are not |
| // NumericDocValuesField instances. |
| // |
| // once SOLR-9809 is resolved, we should be able to replace this conditional with... |
| // assert f instanceof NumericDocValuesField |
| if (forInPlaceUpdate) { |
| if (f instanceof NumericDocValuesField) { |
| doc.add(f); |
| } |
| } else { |
| doc.add(f); |
| } |
| } |
| } |
| } |
| |
| private static String getID( SolrInputDocument doc, IndexSchema schema ) |
| { |
| String id = ""; |
| SchemaField sf = schema.getUniqueKeyField(); |
| if( sf != null ) { |
| id = "[doc="+doc.getFieldValue( sf.getName() )+"] "; |
| } |
| return id; |
| } |
| |
| /** |
| * @see DocumentBuilder#toDocument(SolrInputDocument, IndexSchema, boolean, boolean) |
| */ |
| public static Document toDocument( SolrInputDocument doc, IndexSchema schema ) |
| { |
| return toDocument(doc, schema, false, true); |
| } |
| |
| /** |
| * Convert a SolrInputDocument to a lucene Document. |
| * |
| * This function should go elsewhere. This builds the Document without an |
| * extra Map<> checking for multiple values. For more discussion, see: |
| * http://www.nabble.com/Re%3A-svn-commit%3A-r547493---in--lucene-solr-trunk%3A-.--src-java-org-apache-solr-common--src-java-org-apache-solr-schema--src-java-org-apache-solr-update--src-test-org-apache-solr-common--tf3931539.html |
| * |
| * TODO: /!\ NOTE /!\ This semantics of this function are still in flux. |
| * Something somewhere needs to be able to fill up a SolrDocument from |
| * a lucene document - this is one place that may happen. It may also be |
| * moved to an independent function |
| * |
| * @since solr 1.3 |
| * |
| * @param doc SolrInputDocument from which the document has to be built |
| * @param schema Schema instance |
| * @param forInPlaceUpdate Whether the output document would be used for an in-place update or not. When this is true, |
| * default fields values and copy fields targets are not populated. |
| * @param ignoreNestedDocs if nested child documents should be ignored. If false then an exception will be thrown. |
| * @return Built Lucene document |
| */ |
| public static Document toDocument(SolrInputDocument doc, IndexSchema schema, boolean forInPlaceUpdate, boolean ignoreNestedDocs) { |
| if (!ignoreNestedDocs && doc.hasChildDocuments()) { |
| throw unexpectedNestedDocException(schema, forInPlaceUpdate); |
| } |
| |
| final SchemaField uniqueKeyField = schema.getUniqueKeyField(); |
| final String uniqueKeyFieldName = null == uniqueKeyField ? null : uniqueKeyField.getName(); |
| |
| Document out = new Document(); |
| Set<String> usedFields = Sets.newHashSet(); |
| |
| // Load fields from SolrDocument to Document |
| for( SolrInputField field : doc ) { |
| |
| // when in-place update, don't process the id & _root_; they won't change |
| if (forInPlaceUpdate) { |
| if (field.getName().equals(uniqueKeyFieldName) || field.getName().equals(IndexSchema.ROOT_FIELD_NAME)) { |
| continue; |
| } |
| } |
| |
| if (field.getFirstValue() instanceof SolrDocumentBase) { |
| if (ignoreNestedDocs) { |
| continue; |
| } |
| throw unexpectedNestedDocException(schema, forInPlaceUpdate); |
| } |
| |
| String name = field.getName(); |
| SchemaField sfield = schema.getFieldOrNull(name); |
| boolean used = false; |
| |
| // Make sure it has the correct number |
| if( sfield!=null && !sfield.multiValued() && field.getValueCount() > 1 ) { |
| throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, |
| "ERROR: "+getID(doc, schema)+"multiple values encountered for non multiValued field " + |
| sfield.getName() + ": " +field.getValue() ); |
| } |
| |
| List<CopyField> copyFields = schema.getCopyFieldsList(name); |
| if( copyFields.size() == 0 ) copyFields = null; |
| |
| // load each field value |
| boolean hasField = false; |
| try { |
| @SuppressWarnings({"rawtypes"}) |
| Iterator it = field.iterator(); |
| while (it.hasNext()) { |
| Object v = it.next(); |
| if( v == null ) { |
| continue; |
| } |
| hasField = true; |
| if (sfield != null) { |
| used = true; |
| addField(out, sfield, v, forInPlaceUpdate); |
| // record the field as having a value |
| usedFields.add(sfield.getName()); |
| } |
| |
| // Check if we should copy this field value to any other fields. |
| // This could happen whether it is explicit or not. |
| if (copyFields != null) { |
| for (CopyField cf : copyFields) { |
| SchemaField destinationField = cf.getDestination(); |
| |
| final boolean destHasValues = usedFields.contains(destinationField.getName()); |
| |
| // check if the copy field is a multivalued or not |
| if (!destinationField.multiValued() && destHasValues) { |
| throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, |
| "Multiple values encountered for non multiValued copy field " + |
| destinationField.getName() + ": " + v); |
| } |
| |
| used = true; |
| |
| // Perhaps trim the length of a copy field |
| Object val = v; |
| if( val instanceof CharSequence && cf.getMaxChars() > 0 ) { |
| val = cf.getLimitedValue(val.toString()); |
| } |
| |
| // TODO ban copyField populating uniqueKeyField; too problematic to support |
| addField(out, destinationField, val, |
| destinationField.getName().equals(uniqueKeyFieldName) ? false : forInPlaceUpdate); |
| // record the field as having a value |
| usedFields.add(destinationField.getName()); |
| } |
| } |
| } |
| } |
| catch( SolrException ex ) { |
| throw new SolrException(SolrException.ErrorCode.getErrorCode(ex.code()), |
| "ERROR: "+getID(doc, schema)+"Error adding field '" + |
| field.getName() + "'='" +field.getValue()+"' msg=" + ex.getMessage(), ex ); |
| } |
| catch( Exception ex ) { |
| throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, |
| "ERROR: "+getID(doc, schema)+"Error adding field '" + |
| field.getName() + "'='" +field.getValue()+"' msg=" + ex.getMessage(), ex ); |
| } |
| |
| // make sure the field was used somehow... |
| if( !used && hasField ) { |
| throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, |
| "ERROR: "+getID(doc, schema)+"unknown field '" +name + "'"); |
| } |
| } |
| |
| |
| // Now validate required fields or add default values |
| // fields with default values are defacto 'required' |
| |
| // Note: We don't need to add required fields if this document is to be used for |
| // in-place updates, since this validation and population of required fields would've happened |
| // during the full indexing initially. |
| if (!forInPlaceUpdate) { |
| for (SchemaField field : schema.getRequiredFields()) { |
| if (out.getField(field.getName() ) == null) { |
| if (field.getDefaultValue() != null) { |
| addField(out, field, field.getDefaultValue(), false); |
| } |
| else { |
| String msg = getID(doc, schema) + "missing required field: " + field.getName(); |
| throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, msg ); |
| } |
| } |
| } |
| } |
| |
| if (!forInPlaceUpdate) { |
| moveLargestFieldLast(out); |
| } |
| |
| return out; |
| } |
| |
| private static SolrException unexpectedNestedDocException(IndexSchema schema, boolean forInPlaceUpdate) { |
| if (! schema.isUsableForChildDocs()) { |
| return new SolrException(SolrException.ErrorCode.BAD_REQUEST, |
| "Unable to index docs with children: the schema must " + |
| "include definitions for both a uniqueKey field and the '" + IndexSchema.ROOT_FIELD_NAME + |
| "' field, using the exact same fieldType"); |
| } else if (forInPlaceUpdate) { |
| return new SolrException(SolrException.ErrorCode.BAD_REQUEST, |
| "Unable to index docs with children: for an in-place update, just provide the doc by itself"); |
| } else { |
| return new SolrException(SolrException.ErrorCode.SERVER_ERROR, |
| "A document unexpectedly contained nested child documents"); |
| } |
| } |
| |
| /** Move the largest stored field last, because Lucene can avoid loading that one if it's not needed. */ |
| private static void moveLargestFieldLast(Document doc) { |
| String largestField = null; |
| int largestFieldLen = -1; |
| boolean largestIsLast = true; |
| for (IndexableField field : doc) { |
| if (!field.fieldType().stored()) { |
| continue; |
| } |
| if (largestIsLast && !field.name().equals(largestField)) { |
| largestIsLast = false; |
| } |
| if (field.numericValue() != null) { // just ignore these as non-competitive (avoid toString'ing their number) |
| continue; |
| } |
| String strVal = field.stringValue(); |
| if (strVal != null) { |
| if (strVal.length() > largestFieldLen) { |
| largestField = field.name(); |
| largestFieldLen = strVal.length(); |
| largestIsLast = true; |
| } |
| } else { |
| BytesRef bytesRef = field.binaryValue(); |
| if (bytesRef != null && bytesRef.length > largestFieldLen) { |
| largestField = field.name(); |
| largestFieldLen = bytesRef.length; |
| largestIsLast = true; |
| } |
| } |
| } |
| if (!largestIsLast && largestField != null && largestFieldLen > MIN_LENGTH_TO_MOVE_LAST) { // only bother if the value isn't tiny |
| LinkedList<IndexableField> addToEnd = new LinkedList<>(); |
| Iterator<IndexableField> iterator = doc.iterator(); |
| while (iterator.hasNext()) { |
| IndexableField field = iterator.next(); |
| if (field.name().equals(largestField)) { |
| addToEnd.add(field); |
| iterator.remove(); // Document may not have "remove" but it's iterator allows mutation |
| } |
| } |
| for (IndexableField field : addToEnd) { |
| doc.add(field); |
| } |
| } |
| } |
| } |