| package org.apache.lucene.index; |
| |
| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import java.io.IOException; |
| import java.util.Collection; |
| import java.util.Comparator; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Map; |
| |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Fieldable; |
| import org.apache.lucene.index.DocumentsWriterPerThread.DocState; |
| import org.apache.lucene.index.codecs.Codec; |
| import org.apache.lucene.index.codecs.PerDocConsumer; |
| import org.apache.lucene.index.codecs.DocValuesConsumer; |
| import org.apache.lucene.util.ArrayUtil; |
| |
| |
| /** |
| * This is a DocConsumer that gathers all fields under the |
| * same name, and calls per-field consumers to process field |
| * by field. This class doesn't do any "real" work |
| * of its own: it just forwards the fields to a |
| * DocFieldConsumer. |
| */ |
| |
| final class DocFieldProcessor extends DocConsumer { |
| |
| final DocFieldConsumer consumer; |
| final StoredFieldsWriter fieldsWriter; |
| |
| // Holds all fields seen in current doc |
| DocFieldProcessorPerField[] fields = new DocFieldProcessorPerField[1]; |
| int fieldCount; |
| |
| // Hash table for all fields ever seen |
| DocFieldProcessorPerField[] fieldHash = new DocFieldProcessorPerField[2]; |
| int hashMask = 1; |
| int totalFieldCount; |
| |
| float docBoost; |
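| // Incremented once per document; compared against each field's |
| // lastGen to detect the first occurrence of that field in the current doc |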
| int fieldGen; |
| final DocumentsWriterPerThread.DocState docState; |
| |
| public DocFieldProcessor(DocumentsWriterPerThread docWriter, DocFieldConsumer consumer) { |
| this.docState = docWriter.docState; |
| this.consumer = consumer; |
| fieldsWriter = new StoredFieldsWriter(docWriter); |
| } |
| |
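| /** Flushes the stored fields writer and the downstream consumer for all |
| * fields seen so far, then writes the FieldInfos (after the consumer has |
| * had a chance to update them) and finishes any pending doc values consumers. */ |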
| @Override |
| public void flush(SegmentWriteState state) throws IOException { |
| |
| Map<FieldInfo, DocFieldConsumerPerField> childFields = new HashMap<FieldInfo, DocFieldConsumerPerField>(); |
| for (DocFieldConsumerPerField f : fields()) { |
| childFields.put(f.getFieldInfo(), f); |
| } |
| |
| fieldsWriter.flush(state); |
| consumer.flush(childFields, state); |
| |
| // Important to save after asking consumer to flush so |
| // consumer can alter the FieldInfo* if necessary. EG, |
| // FreqProxTermsWriter does this with |
| // FieldInfo.storePayload. |
| final String fileName = IndexFileNames.segmentFileName(state.segmentName, "", IndexFileNames.FIELD_INFOS_EXTENSION); |
| state.fieldInfos.write(state.directory, fileName); |
| for (DocValuesConsumer docValuesConsumer : docValues.values()) { |
| docValuesConsumer.finish(state.numDocs); |
| } |
| } |
| |
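| /** Aborts every per-field consumer, closes the per-doc consumers, and |
| * aborts the stored fields writer and the downstream consumer; the first |
| * throwable encountered is rethrown once everything has been aborted. */ |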
| @Override |
| public void abort() { |
| Throwable th = null; |
| |
| for (DocFieldProcessorPerField field : fieldHash) { |
| while (field != null) { |
| final DocFieldProcessorPerField next = field.next; |
| try { |
| field.abort(); |
| } catch (Throwable t) { |
| if (th == null) { |
| th = t; |
| } |
| } |
| field = next; |
| } |
| } |
| |
| for(PerDocConsumer consumer : perDocConsumers.values()) { |
| try { |
| consumer.close(); // TODO add abort to PerDocConsumer! |
| } catch (IOException e) { |
| // ignore on abort! |
| } |
| } |
| |
| try { |
| fieldsWriter.abort(); |
| } catch (Throwable t) { |
| if (th == null) { |
| th = t; |
| } |
| } |
| |
| try { |
| consumer.abort(); |
| } catch (Throwable t) { |
| if (th == null) { |
| th = t; |
| } |
| } |
| |
| // If any error occurred, rethrow the first one. |
| if (th != null) { |
| if (th instanceof RuntimeException) throw (RuntimeException) th; |
| if (th instanceof Error) throw (Error) th; |
| // defensive code - we should not hit checked exceptions here, but wrap them just in case |
| throw new RuntimeException(th); |
| } |
| } |
| |
| @Override |
| public boolean freeRAM() { |
| return consumer.freeRAM(); |
| } |
| |
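| /** Returns the consumers for all fields currently in the field hash, |
| * i.e. every field seen since the last flush. */ |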
| public Collection<DocFieldConsumerPerField> fields() { |
| Collection<DocFieldConsumerPerField> fields = new HashSet<DocFieldConsumerPerField>(); |
| for (DocFieldProcessorPerField field : fieldHash) { |
| while (field != null) { |
| fields.add(field.consumer); |
| field = field.next; |
| } |
| } |
| assert fields.size() == totalFieldCount; |
| return fields; |
| } |
| |
| /** Called after a flush: resets the field hash and closes/clears the |
| * per-doc consumers so no per-field state is maintained across segments */ |
| @Override |
| void doAfterFlush() { |
| fieldHash = new DocFieldProcessorPerField[2]; |
| hashMask = 1; |
| totalFieldCount = 0; |
| for(PerDocConsumer consumer : perDocConsumers.values()) { |
| try { |
| consumer.close(); |
| } catch (IOException e) { |
| // ignore and continue closing remaining consumers |
| } |
| } |
| perDocConsumers.clear(); |
| docValues.clear(); |
| } |
| |
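| /** Doubles the field hash table and redistributes the existing entries |
| * under the new mask. */ |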
| private void rehash() { |
| final int newHashSize = (fieldHash.length*2); |
| assert newHashSize > fieldHash.length; |
| |
| final DocFieldProcessorPerField[] newHashArray = new DocFieldProcessorPerField[newHashSize]; |
| |
| // Rehash |
| int newHashMask = newHashSize-1; |
| for(int j=0;j<fieldHash.length;j++) { |
| DocFieldProcessorPerField fp0 = fieldHash[j]; |
| while(fp0 != null) { |
| final int hashPos2 = fp0.fieldInfo.name.hashCode() & newHashMask; |
| DocFieldProcessorPerField nextFP0 = fp0.next; |
| fp0.next = newHashArray[hashPos2]; |
| newHashArray[hashPos2] = fp0; |
| fp0 = nextFP0; |
| } |
| } |
| |
| fieldHash = newHashArray; |
| hashMask = newHashMask; |
| } |
| |
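| /** Processes a single document: absorbs new or changed fields into the |
| * field hash and FieldInfos, buffers stored fields and doc values, and |
| * then hands each field's occurrences to its consumer in name order. */ |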
| @Override |
| public void processDocument(FieldInfos fieldInfos) throws IOException { |
| |
| consumer.startDocument(); |
| fieldsWriter.startDocument(); |
| |
| final Document doc = docState.doc; |
| |
| fieldCount = 0; |
| |
| final int thisFieldGen = fieldGen++; |
| |
| final List<Fieldable> docFields = doc.getFields(); |
| final int numDocFields = docFields.size(); |
| |
| // Absorb any new fields first seen in this document. |
| // Also absorb any changes to fields we had already |
| // seen before (eg suddenly turning on norms or |
| // vectors, etc.): |
| |
| for(int i=0;i<numDocFields;i++) { |
| Fieldable field = docFields.get(i); |
| final String fieldName = field.name(); |
| |
| // Make sure we have a PerField allocated |
| final int hashPos = fieldName.hashCode() & hashMask; |
| DocFieldProcessorPerField fp = fieldHash[hashPos]; |
| while(fp != null && !fp.fieldInfo.name.equals(fieldName)) { |
| fp = fp.next; |
| } |
| |
| if (fp == null) { |
| |
| // TODO FI: we need to genericize the "flags" that a |
| // field holds, and, how these flags are merged; it |
| // needs to be more "pluggable" such that if I want |
| // to have a new "thing" my Fields can do, I can |
| // easily add it |
| FieldInfo fi = fieldInfos.addOrUpdate(fieldName, field.isIndexed(), field.isTermVectorStored(), |
| field.isStorePositionWithTermVector(), field.isStoreOffsetWithTermVector(), |
| field.getOmitNorms(), false, field.getOmitTermFreqAndPositions(), field.docValuesType()); |
| |
| fp = new DocFieldProcessorPerField(this, fi); |
| fp.next = fieldHash[hashPos]; |
| fieldHash[hashPos] = fp; |
| totalFieldCount++; |
| |
| if (totalFieldCount >= fieldHash.length/2) { |
| rehash(); |
| } |
| } else { |
| fieldInfos.addOrUpdate(fp.fieldInfo.name, field.isIndexed(), field.isTermVectorStored(), |
| field.isStorePositionWithTermVector(), field.isStoreOffsetWithTermVector(), |
| field.getOmitNorms(), false, field.getOmitTermFreqAndPositions(), field.docValuesType()); |
| } |
| |
| if (thisFieldGen != fp.lastGen) { |
| |
| // First time we're seeing this field for this doc |
| fp.fieldCount = 0; |
| |
| if (fieldCount == fields.length) { |
| final int newSize = fields.length*2; |
| DocFieldProcessorPerField[] newArray = new DocFieldProcessorPerField[newSize]; |
| System.arraycopy(fields, 0, newArray, 0, fieldCount); |
| fields = newArray; |
| } |
| |
| fields[fieldCount++] = fp; |
| fp.lastGen = thisFieldGen; |
| } |
| |
| fp.addField(field); |
| |
| if (field.isStored()) { |
| fieldsWriter.addField(field, fp.fieldInfo); |
| } |
| if (field.hasDocValues()) { |
| final DocValuesConsumer docValuesConsumer = docValuesConsumer(docState, fp.fieldInfo); |
| docValuesConsumer.add(docState.docID, field.getDocValues()); |
| } |
| } |
| |
| // If we are writing vectors then we must visit |
| // fields in sorted order so they are written in |
| // sorted order. TODO: we actually only need to |
| // sort the subset of fields that have vectors |
| // enabled; we could save [small amount of] CPU |
| // here. |
| ArrayUtil.quickSort(fields, 0, fieldCount, fieldsComp); |
| for(int i=0;i<fieldCount;i++) { |
| final DocFieldProcessorPerField perField = fields[i]; |
| perField.consumer.processFields(perField.fields, perField.fieldCount); |
| } |
| |
| if (docState.maxTermPrefix != null && docState.infoStream != null) { |
| docState.infoStream.println("WARNING: document contains at least one immense term (whose UTF8 encoding is longer than the max length " + DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8 + "), all of which were skipped. Please correct the analyzer to not produce such terms. The prefix of the first immense term is: '" + docState.maxTermPrefix + "...'"); |
| docState.maxTermPrefix = null; |
| } |
| } |
| |
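| /** Orders fields by name so that term vectors are written in sorted |
| * field order (see processDocument). */ |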
| private static final Comparator<DocFieldProcessorPerField> fieldsComp = new Comparator<DocFieldProcessorPerField>() { |
| public int compare(DocFieldProcessorPerField o1, DocFieldProcessorPerField o2) { |
| return o1.fieldInfo.name.compareTo(o2.fieldInfo.name); |
| } |
| }; |
| |
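| /** Finishes the current document in the stored fields writer and then, |
| * even if that fails, in the downstream consumer. */ |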
| @Override |
| void finishDocument() throws IOException { |
| try { |
| fieldsWriter.finishDocument(); |
| } finally { |
| consumer.finishDocument(); |
| } |
| } |
| |
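| // Doc values consumers keyed by field name and per-codec consumers keyed |
| // by codec id; both are created lazily in docValuesConsumer() and cleared |
| // after each flush |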
| private final Map<String, DocValuesConsumer> docValues = new HashMap<String, DocValuesConsumer>(); |
| private final Map<Integer, PerDocConsumer> perDocConsumers = new HashMap<Integer, PerDocConsumer>(); |
| |
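| /** Returns the DocValuesConsumer for the given field, creating it (and, |
| * if necessary, the codec's PerDocConsumer) on first use and caching it |
| * for subsequent documents. */ |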
| DocValuesConsumer docValuesConsumer(DocState docState, FieldInfo fieldInfo) |
| throws IOException { |
| DocValuesConsumer docValuesConsumer = docValues.get(fieldInfo.name); |
| if (docValuesConsumer != null) { |
| return docValuesConsumer; |
| } |
| PerDocConsumer perDocConsumer = perDocConsumers.get(fieldInfo.getCodecId()); |
| if (perDocConsumer == null) { |
| PerDocWriteState perDocWriteState = docState.docWriter.newPerDocWriteState(fieldInfo.getCodecId()); |
| SegmentCodecs codecs = perDocWriteState.segmentCodecs; |
| assert codecs.codecs.length > fieldInfo.getCodecId(); |
| Codec codec = codecs.codecs[fieldInfo.getCodecId()]; |
| perDocConsumer = codec.docsConsumer(perDocWriteState); |
| perDocConsumers.put(Integer.valueOf(fieldInfo.getCodecId()), perDocConsumer); |
| } |
| boolean success = false; |
| try { |
| docValuesConsumer = perDocConsumer.addValuesField(fieldInfo); |
| fieldInfo.commitDocValues(); |
| success = true; |
| } finally { |
| if (!success) { |
| fieldInfo.revertUncommitted(); |
| } |
| } |
| docValues.put(fieldInfo.name, docValuesConsumer); |
| return docValuesConsumer; |
| } |
| |
| } |