package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.DocumentsWriterPerThread.DocState;
import org.apache.lucene.index.codecs.Codec;
import org.apache.lucene.index.codecs.PerDocConsumer;
import org.apache.lucene.index.codecs.DocValuesConsumer;
import org.apache.lucene.util.ArrayUtil;
/**
* This is a DocConsumer that gathers all fields under the
* same name, and calls per-field consumers to process field
* by field. This class doesn't do any "real" work
* of its own: it just forwards the fields to a
* DocFieldConsumer.
*/
final class DocFieldProcessor extends DocConsumer {

  final DocFieldConsumer consumer;
  final StoredFieldsWriter fieldsWriter;

  // Holds all fields seen in current doc
  DocFieldProcessorPerField[] fields = new DocFieldProcessorPerField[1];
  int fieldCount;

  // Hash table for all fields ever seen
  DocFieldProcessorPerField[] fieldHash = new DocFieldProcessorPerField[2];
  int hashMask = 1;
  int totalFieldCount;

  float docBoost;
  int fieldGen;
  final DocumentsWriterPerThread.DocState docState;

  public DocFieldProcessor(DocumentsWriterPerThread docWriter, DocFieldConsumer consumer) {
    this.docState = docWriter.docState;
    this.consumer = consumer;
    fieldsWriter = new StoredFieldsWriter(docWriter);
  }
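
  /**
   * Flushes the segment: collects the per-field consumers, flushes the
   * stored fields writer and the consumer chain, writes the FieldInfos
   * file, and finishes every doc values consumer created for this segment.
   */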
  @Override
  public void flush(SegmentWriteState state) throws IOException {

    Map<FieldInfo, DocFieldConsumerPerField> childFields = new HashMap<FieldInfo, DocFieldConsumerPerField>();
    Collection<DocFieldConsumerPerField> fields = fields();
    for (DocFieldConsumerPerField f : fields) {
      childFields.put(f.getFieldInfo(), f);
    }

    fieldsWriter.flush(state);
    consumer.flush(childFields, state);

    // Important to save after asking consumer to flush so
    // consumer can alter the FieldInfo* if necessary. EG,
    // FreqProxTermsWriter does this with
    // FieldInfo.storePayload.
    final String fileName = IndexFileNames.segmentFileName(state.segmentName, "", IndexFileNames.FIELD_INFOS_EXTENSION);
    state.fieldInfos.write(state.directory, fileName);

    for (DocValuesConsumer consumers : docValues.values()) {
      consumers.finish(state.numDocs);
    }
  }
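
  /**
   * Aborts all per-field consumers, per-doc consumers, the stored fields
   * writer and the downstream consumer. Each step is attempted even if an
   * earlier one throws; the first Throwable seen is rethrown at the end.
   */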
  @Override
  public void abort() {
    Throwable th = null;

    for (DocFieldProcessorPerField field : fieldHash) {
      while (field != null) {
        final DocFieldProcessorPerField next = field.next;
        try {
          field.abort();
        } catch (Throwable t) {
          if (th == null) {
            th = t;
          }
        }
        field = next;
      }
    }

    for (PerDocConsumer consumer : perDocConsumers.values()) {
      try {
        consumer.close(); // TODO add abort to PerDocConsumer!
      } catch (IOException e) {
        // ignore on abort!
      }
    }

    try {
      fieldsWriter.abort();
    } catch (Throwable t) {
      if (th == null) {
        th = t;
      }
    }

    try {
      consumer.abort();
    } catch (Throwable t) {
      if (th == null) {
        th = t;
      }
    }

    // If any error occurred, rethrow the first one.
    if (th != null) {
      if (th instanceof RuntimeException) throw (RuntimeException) th;
      if (th instanceof Error) throw (Error) th;
      // defensive code - we should not get here with a checked exception,
      // but wrap it just in case
      throw new RuntimeException(th);
    }
  }
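
  /** Asks the consumer chain to free buffered RAM, if it can. */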
  @Override
  public boolean freeRAM() {
    return consumer.freeRAM();
  }
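
  /**
   * Returns the downstream DocFieldConsumerPerField of every field
   * currently held in the field hash, one per field name seen so far.
   */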
  public Collection<DocFieldConsumerPerField> fields() {
    Collection<DocFieldConsumerPerField> fields = new HashSet<DocFieldConsumerPerField>();
    for (int i = 0; i < fieldHash.length; i++) {
      DocFieldProcessorPerField field = fieldHash[i];
      while (field != null) {
        fields.add(field.consumer);
        field = field.next;
      }
    }
    assert fields.size() == totalFieldCount;
    return fields;
  }
  /** In flush we reset the fieldHash so per-field state is not
   *  maintained across segments. */
  @Override
  void doAfterFlush() {
    fieldHash = new DocFieldProcessorPerField[2];
    hashMask = 1;
    totalFieldCount = 0;
    for (PerDocConsumer consumer : perDocConsumers.values()) {
      try {
        consumer.close();
      } catch (IOException e) {
        // ignore and continue closing remaining consumers
      }
    }
    perDocConsumers.clear();
    docValues.clear();
  }
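
  /**
   * Doubles the size of the field hash table and redistributes the
   * existing entries; hashMask is updated to match the new table length.
   */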
  private void rehash() {
    final int newHashSize = (fieldHash.length * 2);
    assert newHashSize > fieldHash.length;

    final DocFieldProcessorPerField newHashArray[] = new DocFieldProcessorPerField[newHashSize];

    // Rehash
    int newHashMask = newHashSize - 1;
    for (int j = 0; j < fieldHash.length; j++) {
      DocFieldProcessorPerField fp0 = fieldHash[j];
      while (fp0 != null) {
        final int hashPos2 = fp0.fieldInfo.name.hashCode() & newHashMask;
        DocFieldProcessorPerField nextFP0 = fp0.next;
        fp0.next = newHashArray[hashPos2];
        newHashArray[hashPos2] = fp0;
        fp0 = nextFP0;
      }
    }

    fieldHash = newHashArray;
    hashMask = newHashMask;
  }
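
  /**
   * Processes all fields of the current document: looks up (or creates)
   * the per-field processor for each field, updates FieldInfos, buffers
   * stored fields and doc values, then hands the fields to the consumers
   * in sorted field-name order.
   */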
  @Override
  public void processDocument(FieldInfos fieldInfos) throws IOException {

    consumer.startDocument();
    fieldsWriter.startDocument();

    final Document doc = docState.doc;

    fieldCount = 0;

    final int thisFieldGen = fieldGen++;

    final List<Fieldable> docFields = doc.getFields();
    final int numDocFields = docFields.size();

    // Absorb any new fields first seen in this document.
    // Also absorb any changes to fields we had already
    // seen before (eg suddenly turning on norms or
    // vectors, etc.):

    for (int i = 0; i < numDocFields; i++) {
      Fieldable field = docFields.get(i);
      final String fieldName = field.name();

      // Make sure we have a PerField allocated
      final int hashPos = fieldName.hashCode() & hashMask;
      DocFieldProcessorPerField fp = fieldHash[hashPos];
      while (fp != null && !fp.fieldInfo.name.equals(fieldName)) {
        fp = fp.next;
      }

      if (fp == null) {

        // TODO FI: we need to genericize the "flags" that a
        // field holds, and, how these flags are merged; it
        // needs to be more "pluggable" such that if I want
        // to have a new "thing" my Fields can do, I can
        // easily add it
        FieldInfo fi = fieldInfos.addOrUpdate(fieldName, field.isIndexed(), field.isTermVectorStored(),
                                              field.isStorePositionWithTermVector(), field.isStoreOffsetWithTermVector(),
                                              field.getOmitNorms(), false, field.getOmitTermFreqAndPositions(), field.docValuesType());

        fp = new DocFieldProcessorPerField(this, fi);
        fp.next = fieldHash[hashPos];
        fieldHash[hashPos] = fp;
        totalFieldCount++;

        if (totalFieldCount >= fieldHash.length/2)
          rehash();
      } else {
        fieldInfos.addOrUpdate(fp.fieldInfo.name, field.isIndexed(), field.isTermVectorStored(),
                               field.isStorePositionWithTermVector(), field.isStoreOffsetWithTermVector(),
                               field.getOmitNorms(), false, field.getOmitTermFreqAndPositions(), field.docValuesType());
      }

      if (thisFieldGen != fp.lastGen) {

        // First time we're seeing this field for this doc
        fp.fieldCount = 0;

        if (fieldCount == fields.length) {
          final int newSize = fields.length * 2;
          DocFieldProcessorPerField newArray[] = new DocFieldProcessorPerField[newSize];
          System.arraycopy(fields, 0, newArray, 0, fieldCount);
          fields = newArray;
        }

        fields[fieldCount++] = fp;
        fp.lastGen = thisFieldGen;
      }

      fp.addField(field);

      if (field.isStored()) {
        fieldsWriter.addField(field, fp.fieldInfo);
      }

      if (field.hasDocValues()) {
        final DocValuesConsumer docValuesConsumer = docValuesConsumer(docState, fp.fieldInfo);
        docValuesConsumer.add(docState.docID, field.getDocValues());
      }
    }

    // If we are writing vectors then we must visit
    // fields in sorted order so they are written in
    // sorted order. TODO: we actually only need to
    // sort the subset of fields that have vectors
    // enabled; we could save [small amount of] CPU
    // here.
    ArrayUtil.quickSort(fields, 0, fieldCount, fieldsComp);

    for (int i = 0; i < fieldCount; i++) {
      final DocFieldProcessorPerField perField = fields[i];
      perField.consumer.processFields(perField.fields, perField.fieldCount);
    }

    if (docState.maxTermPrefix != null && docState.infoStream != null) {
      docState.infoStream.println("WARNING: document contains at least one immense term (whose UTF8 encoding is longer than the max length " + DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8 + "), all of which were skipped. Please correct the analyzer to not produce such terms. The prefix of the first immense term is: '" + docState.maxTermPrefix + "...'");
      docState.maxTermPrefix = null;
    }
  }
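
  // Sorts per-field processors by field name so fields are always visited
  // (and term vectors written) in a consistent order.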
  private static final Comparator<DocFieldProcessorPerField> fieldsComp = new Comparator<DocFieldProcessorPerField>() {
    public int compare(DocFieldProcessorPerField o1, DocFieldProcessorPerField o2) {
      return o1.fieldInfo.name.compareTo(o2.fieldInfo.name);
    }
  };
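
  /**
   * Finishes the current document in the stored fields writer; the
   * consumer chain is finished even if the stored fields writer throws.
   */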
  @Override
  void finishDocument() throws IOException {
    try {
      fieldsWriter.finishDocument();
    } finally {
      consumer.finishDocument();
    }
  }
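
  // Doc values state, created lazily in docValuesConsumer(...) and cleared
  // in doAfterFlush(): one DocValuesConsumer per field name and one
  // PerDocConsumer per codec id.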
  final private Map<String, DocValuesConsumer> docValues = new HashMap<String, DocValuesConsumer>();
  final private Map<Integer, PerDocConsumer> perDocConsumers = new HashMap<Integer, PerDocConsumer>();
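
  /**
   * Returns the DocValuesConsumer for the given field, creating and
   * caching the per-codec PerDocConsumer and the per-field
   * DocValuesConsumer on first use. If adding the values field fails, the
   * uncommitted doc values state on the FieldInfo is reverted.
   */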
  DocValuesConsumer docValuesConsumer(DocState docState, FieldInfo fieldInfo)
      throws IOException {
    DocValuesConsumer docValuesConsumer = docValues.get(fieldInfo.name);
    if (docValuesConsumer != null) {
      return docValuesConsumer;
    }

    PerDocConsumer perDocConsumer = perDocConsumers.get(fieldInfo.getCodecId());
    if (perDocConsumer == null) {
      PerDocWriteState perDocWriteState = docState.docWriter.newPerDocWriteState(fieldInfo.getCodecId());
      SegmentCodecs codecs = perDocWriteState.segmentCodecs;
      assert codecs.codecs.length > fieldInfo.getCodecId();
      Codec codec = codecs.codecs[fieldInfo.getCodecId()];
      perDocConsumer = codec.docsConsumer(perDocWriteState);
      perDocConsumers.put(Integer.valueOf(fieldInfo.getCodecId()), perDocConsumer);
    }

    boolean success = false;
    try {
      docValuesConsumer = perDocConsumer.addValuesField(fieldInfo);
      fieldInfo.commitDocValues();
      success = true;
    } finally {
      if (!success) {
        fieldInfo.revertUncommitted();
      }
    }

    docValues.put(fieldInfo.name, docValuesConsumer);
    return docValuesConsumer;
  }
}