package org.apache.lucene.index;
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.codecs.DimensionalFormat;
import org.apache.lucene.codecs.DimensionalWriter;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.NormsConsumer;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.StoredFieldsWriter;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash.MaxBytesLengthExceededException;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;
/** Default general purpose indexing chain, which handles
* indexing all types of fields. */
final class DefaultIndexingChain extends DocConsumer {
// RAM accounting counter; assigned from docWriter.bytesUsed in the constructor
// and handed to the doc values writers created in indexDocValue.
final Counter bytesUsed;
// Per-document state (doc, docID, analyzer, similarity, infoStream are read
// from it in this class); assigned from docWriter.docState.
final DocumentsWriterPerThread.DocState docState;
// The owning per-thread writer; also the source of the codec and segment info.
final DocumentsWriterPerThread docWriter;
// Builder for the segment's FieldInfo entries; obtained from
// docWriter.getFieldInfosBuilder().
final FieldInfos.Builder fieldInfos;
// Writes postings and term vectors:
final TermsHash termsHash;
// lazy init (see initStoredFieldsWriter):
private StoredFieldsWriter storedFieldsWriter;
// Compared against the current docID in fillStoredFields.
private int lastStoredDocID;
// NOTE: I tried using HashMap<String,PerField>
// but it was ~2% slower on Wiki and Geonames with Java
// 1.7.0_25:
// Hand-rolled hash table of PerField entries; collisions are chained through
// PerField.next. rehash() doubles the length, so with the initial length of 2
// it stays a power of two and hashMask is always length-1.
private PerField[] fieldHash = new PerField[2];
private int hashMask = 1;
// Compared against fieldHash.length/2 in getOrAddField to cap the load
// factor at 50%, and against fields.length to size the fields array.
private int totalFieldCount;
// Incremented once per processDocument call; the value handed out is stored in
// PerField.fieldGen to detect the first occurrence of a field in a document.
private long nextFieldGen;
// Holds fields seen in each document
private PerField[] fields = new PerField[1];
/** Creates the indexing chain for one {@link DocumentsWriterPerThread},
* borrowing its field-infos builder, doc state and byte counter, and wiring
* a {@link FreqProxTermsWriter} in front of a {@link TermVectorsConsumer}.
*
* @param docWriter the per-thread writer this chain belongs to
* @throws IOException declared by the signature; no visible statement here throws it */
public DefaultIndexingChain(DocumentsWriterPerThread docWriter) throws IOException {
this.docWriter = docWriter;
this.fieldInfos = docWriter.getFieldInfosBuilder();
this.docState = docWriter.docState;
this.bytesUsed = docWriter.bytesUsed;
// The term vectors consumer is passed into the postings writer's constructor,
// so termsHash is the single entry point for both (see the field comment).
TermsHash termVectorsWriter = new TermVectorsConsumer(docWriter);
termsHash = new FreqProxTermsWriter(docWriter, termVectorsWriter);
/** Creates {@link #storedFieldsWriter} from the codec's stored fields format
* the first time it is called; later calls are no-ops.
*
* @throws IOException if the codec fails to create the writer */
// TODO: can we remove this lazy-init / make cleaner / do it another way...?
private void initStoredFieldsWriter() throws IOException {
if (storedFieldsWriter == null) {
// NOTE(review): the first argument to fieldsWriter is missing in this view.
storedFieldsWriter = docWriter.codec.storedFieldsFormat().fieldsWriter(, docWriter.getSegmentInfo(), IOContext.DEFAULT);
/** Flushes the buffered state of this chain for the segment described by
* {@code state}: finishes stored fields, passes the {@link TermsHashPerField}
* of every inverted field to {@link #termsHash}, and finally writes the
* field infos.
*
* @param state write state of the segment being flushed
* @throws IOException on a low-level I/O failure
* @throws AbortingException declared by the signature; the caller handles aborting */
public void flush(SegmentWriteState state) throws IOException, AbortingException {
// NOTE: caller (DocumentsWriterPerThread) handles
// aborting on any exception from this method
int maxDoc = state.segmentInfo.maxDoc();
// it's possible all docs hit non-aborting exceptions...
storedFieldsWriter.finish(state.fieldInfos, maxDoc);
// Collect the postings consumer of every field that was inverted in this
// segment; fields without an invertState have nothing to hand to termsHash.
Map<String,TermsHashPerField> fieldsToFlush = new HashMap<>();
for (int i=0;i<fieldHash.length;i++) {
PerField perField = fieldHash[i];
while (perField != null) {
if (perField.invertState != null) {
// NOTE(review): the map key expression is missing in this view.
fieldsToFlush.put(, perField.termsHashPerField);
// NOTE(review): right-hand side missing in this view; presumably advances
// along the bucket's chain — confirm.
perField =;
termsHash.flush(fieldsToFlush, state);
// Important to save after asking consumer to flush so
// consumer can alter the FieldInfo* if necessary. EG,
// FreqProxTermsWriter does this with
// FieldInfo.storePayload.
docWriter.codec.fieldInfosFormat().write(, state.segmentInfo, "", state.fieldInfos, IOContext.DEFAULT);
/** Writes all buffered dimensional values.
*
* <p>Visits every {@link PerField} in the hash table, flushes each non-null
* {@code dimensionalValuesWriter} into a lazily created
* {@link DimensionalWriter}, and cross-checks the writer's presence against
* the field's dimension count.
*
* @param state write state of the segment being flushed
* @throws IOException on a low-level I/O failure
* @throws AssertionError if a field's buffered writer and its
*         {@code FieldInfo} dimension count disagree */
private void writeDimensionalValues(SegmentWriteState state) throws IOException {
DimensionalWriter dimensionalWriter = null;
boolean success = false;
try {
for (int i=0;i<fieldHash.length;i++) {
PerField perField = fieldHash[i];
while (perField != null) {
if (perField.dimensionalValuesWriter != null) {
if (perField.fieldInfo.getDimensionCount() == 0) {
// BUG: values were buffered but the FieldInfo records zero dimensions
throw new AssertionError("segment=" + state.segmentInfo + ": field=\"" + + "\" has no dimensional values but wrote them");
if (dimensionalWriter == null) {
// lazy init: only ask the codec for a writer once some field needs it
DimensionalFormat fmt = state.segmentInfo.getCodec().dimensionalFormat();
dimensionalWriter = fmt.fieldsWriter(state);
perField.dimensionalValuesWriter.flush(state, dimensionalWriter);
// Drop the reference to the buffered writer once it has been flushed.
perField.dimensionalValuesWriter = null;
} else if (perField.fieldInfo.getDimensionCount() != 0) {
// BUG: the FieldInfo records dimensions but nothing was buffered
throw new AssertionError("segment=" + state.segmentInfo + ": field=\"" + + "\" has dimensional values but did not write them");
perField =;
success = true;
} finally {
// NOTE(review): both branches are empty in this view; confirm that
// dimensionalWriter is closed on the success and the failure path.
if (success) {
} else {
/** Writes all buffered doc values (called from {@link #flush}).
*
* <p>Visits every {@link PerField} in the hash table, flushes each non-null
* {@code docValuesWriter} into a lazily created {@link DocValuesConsumer},
* and then checks that whether a consumer was created agrees with
* {@code state.fieldInfos.hasDocValues()}.
*
* @param state write state of the segment being flushed
* @throws IOException on a low-level I/O failure
* @throws AssertionError if buffered writers and the recorded doc values
*         types disagree, per field or for the segment as a whole */
private void writeDocValues(SegmentWriteState state) throws IOException {
// NOTE(review): maxDoc is not used by any statement visible in this view.
int maxDoc = state.segmentInfo.maxDoc();
DocValuesConsumer dvConsumer = null;
boolean success = false;
try {
for (int i=0;i<fieldHash.length;i++) {
PerField perField = fieldHash[i];
while (perField != null) {
if (perField.docValuesWriter != null) {
if (perField.fieldInfo.getDocValuesType() == DocValuesType.NONE) {
// BUG: values were buffered but the FieldInfo says the field has none
throw new AssertionError("segment=" + state.segmentInfo + ": field=\"" + + "\" has no docValues but wrote them");
if (dvConsumer == null) {
// lazy init: only ask the codec for a consumer once some field needs it
DocValuesFormat fmt = state.segmentInfo.getCodec().docValuesFormat();
dvConsumer = fmt.fieldsConsumer(state);
perField.docValuesWriter.flush(state, dvConsumer);
// Drop the reference to the buffered writer once it has been flushed.
perField.docValuesWriter = null;
} else if (perField.fieldInfo.getDocValuesType() != DocValuesType.NONE) {
// BUG: the FieldInfo records a doc values type but nothing was buffered
throw new AssertionError("segment=" + state.segmentInfo + ": field=\"" + + "\" has docValues but did not write them");
perField =;
// TODO: catch missing DV fields here? else we have
// null/"" depending on how docs landed in segments?
// but we can't detect all cases, and we should leave
// this behavior undefined. dv is not "schemaless": it's column-stride.
success = true;
} finally {
// NOTE(review): both branches are empty in this view; confirm that
// dvConsumer is closed on the success and the failure path.
if (success) {
} else {
// Segment-level consistency check between fieldInfos and what was written.
if (state.fieldInfos.hasDocValues() == false) {
if (dvConsumer != null) {
// BUG: a consumer was created although no field declares doc values
throw new AssertionError("segment=" + state.segmentInfo + ": fieldInfos has no docValues but wrote them");
} else if (dvConsumer == null) {
// BUG: some field declares doc values but no consumer was ever created
// NOTE(review): the message below reads "did not wrote them"; it likely
// should be "did not write them", matching the per-field message above.
throw new AssertionError("segment=" + state.segmentInfo + ": fieldInfos has docValues but did not wrote them");
/** Catch up for all docs before us that had no stored
* fields, or hit non-aborting exceptions before writing
* stored fields.
*
* @param docID the loop runs while {@code lastStoredDocID} is below this value
* @throws IOException on a low-level I/O failure
* @throws AbortingException declared by the signature */
private void fillStoredFields(int docID) throws IOException, AbortingException {
// NOTE(review): the loop body is not visible in this view; it must advance
// lastStoredDocID for the loop to terminate.
while (lastStoredDocID < docID) {
/** Writes buffered norms for the segment. Nothing is written unless
* {@code state.fieldInfos.hasNorms()}; otherwise a {@link NormsConsumer} is
* obtained from the codec and every field that is indexed and does not omit
* norms has its {@code norms} writer flushed into it.
*
* @param state write state of the segment being flushed
* @throws IOException on a low-level I/O failure */
private void writeNorms(SegmentWriteState state) throws IOException {
boolean success = false;
NormsConsumer normsConsumer = null;
try {
if (state.fieldInfos.hasNorms()) {
NormsFormat normsFormat = state.segmentInfo.getCodec().normsFormat();
assert normsFormat != null;
normsConsumer = normsFormat.normsConsumer(state);
for (FieldInfo fi : state.fieldInfos) {
// NOTE(review): the lookup argument is truncated in this view.
PerField perField = getPerField(;
assert perField != null;
// we must check the final value of omitNorms for the fieldinfo: it could have
// changed for this field since the first time we added it.
if (fi.omitsNorms() == false && fi.getIndexOptions() != IndexOptions.NONE) {
assert perField.norms != null: "field=" +;
perField.norms.flush(state, normsConsumer);
success = true;
} finally {
// NOTE(review): both branches are empty in this view; confirm that
// normsConsumer is closed on the success and the failure path.
if (success) {
} else {
/** Discards this chain's buffered per-field state by clearing every bucket
* of the field hash table. */
public void abort() {
try {
// E.g. close any open files in the term vectors writer:
// NOTE(review): the statement inside this try and the catch body are not
// visible in this view.
} catch (Throwable t) {
// Forget every PerField; the table keeps its current length.
Arrays.fill(fieldHash, null);
/** Doubles the size of the field hash table and moves every
* {@link PerField} into a bucket of the new array, then installs the new
* array and mask. */
private void rehash() {
int newHashSize = (fieldHash.length*2);
// Guards against int overflow of the doubled size.
assert newHashSize > fieldHash.length;
PerField newHashArray[] = new PerField[newHashSize];
// Rehash
int newHashMask = newHashSize-1;
for(int j=0;j<fieldHash.length;j++) {
PerField fp0 = fieldHash[j];
while(fp0 != null) {
// NOTE(review): the hashed expression and the link updates on the next
// two lines are truncated in this view.
final int hashPos2 = & newHashMask;
PerField nextFP0 =; = newHashArray[hashPos2];
// fp0 becomes the new head of its bucket in the resized table.
newHashArray[hashPos2] = fp0;
fp0 = nextFP0;
fieldHash = newHashArray;
hashMask = newHashMask;
/** Calls StoredFieldsWriter.startDocument, aborting the
* segment if it hits any exception.
*
* @throws IOException declared by the signature
* @throws AbortingException wrapping any {@link Throwable} raised in the try block */
private void startStoredFields() throws IOException, AbortingException {
try {
// NOTE(review): the statement inside this try is not visible in this view.
} catch (Throwable th) {
// Any failure here is treated as aborting rather than per-document.
throw AbortingException.wrap(th);
/** Calls StoredFieldsWriter.finishDocument, aborting the
* segment if it hits any exception.
*
* @throws IOException declared by the signature
* @throws AbortingException wrapping any {@link Throwable} raised in the try block */
private void finishStoredFields() throws IOException, AbortingException {
try {
// NOTE(review): the statement inside this try is not visible in this view.
} catch (Throwable th) {
// Any failure here is treated as aborting rather than per-document.
throw AbortingException.wrap(th);
/** Indexes the current document ({@code docState.doc}) in two passes: first
* every indexable field is inverted through its {@link PerField}, then every
* storable field is verified, written to the stored fields writer if stored,
* and routed to {@link #indexDocValue} and/or {@link #indexDimensionalValue}
* according to its field type.
*
* @throws IOException on a low-level I/O failure
* @throws AbortingException when a failure is wrapped via {@code AbortingException.wrap}
* @throws UnsupportedOperationException if a field omits norms but carries a boost other than 1.0
* @throws NullPointerException if a storable field's doc values type is null */
public void processDocument() throws IOException, AbortingException {
// How many indexed field names we've seen (collapses
// multiple field instances by the same name):
int fieldCount = 0;
// Generation stamp for this document; a PerField whose fieldGen differs has
// not yet been seen in this document.
long fieldGen = nextFieldGen++;
// NOTE: we need two passes here, in case there are
// multi-valued fields, because we must process all
// instances of a given field at once, since the
// analyzer is free to reuse TokenStream across fields
// (i.e., we cannot have more than one TokenStream
// running "at once"):
// Invert indexed fields:
try {
for (IndexableField field : docState.doc.indexableFields()) {
IndexableFieldType fieldType = field.fieldType();
// if the field omits norms, the boost cannot be indexed.
if (fieldType.omitNorms() && field.boost() != 1.0f) {
throw new UnsupportedOperationException("You cannot set an index-time boost: norms are omitted for field '" + + "'");
PerField fp = getOrAddField(, fieldType, true);
boolean first = fp.fieldGen != fieldGen;
fp.invert(field, first);
if (first) {
// Remember each distinct field once so the finally block can finish it.
fields[fieldCount++] = fp;
fp.fieldGen = fieldGen;
} finally {
// Finish each field name seen in the document:
for (int i=0;i<fieldCount;i++) {
try {
// NOTE(review): the statement inside this try is not visible in this view.
} catch (Throwable th) {
// Must abort, on the possibility that on-disk term
// vectors are now corrupt:
throw AbortingException.wrap(th);
// Add stored fields:
// TODO: clean up this loop, it's bogus that docvalues are treated as stored fields...
// Set only when writing a stored field fails; the finally block below
// branches on it.
boolean abort = false;
try {
for (StorableField field : docState.doc.storableFields()) {
String fieldName =;
IndexableFieldType fieldType = field.fieldType();
verifyFieldType(fieldName, fieldType);
// invert=false: this pass must not create inversion state for the field.
PerField fp = getOrAddField(fieldName, fieldType, false);
if (fieldType.stored()) {
try {
storedFieldsWriter.writeField(fp.fieldInfo, field);
} catch (Throwable th) {
abort = true;
throw AbortingException.wrap(th);
DocValuesType dvType = fieldType.docValuesType();
if (dvType == null) {
throw new NullPointerException("docValuesType cannot be null (field: \"" + fieldName + "\")");
if (dvType != DocValuesType.NONE) {
indexDocValue(fp, dvType, field);
if (fieldType.dimensionCount() != 0) {
indexDimensionalValue(fp, field);
} finally {
// NOTE(review): the body of this branch is not visible in this view.
if (abort == false) {
/** Validates the type of a field: its index options must be non-null, and a
* field whose index options are {@code IndexOptions.NONE} must not request
* term vectors or any term vector detail (positions, offsets, payloads).
*
* @param name field name, used only in the exception messages
* @param ft the field type to check
* @throws NullPointerException if {@code ft.indexOptions()} is null
* @throws IllegalArgumentException if a non-indexed field asks for term vectors */
private static void verifyFieldType(String name, IndexableFieldType ft) {
if (ft.indexOptions() == null) {
throw new NullPointerException("IndexOptions must not be null (field: \"" + name + "\")");
// The remaining checks apply only to fields that are not indexed.
if (ft.indexOptions() == IndexOptions.NONE) {
if (ft.storeTermVectors()) {
throw new IllegalArgumentException("cannot store term vectors "
+ "for a field that is not indexed (field=\"" + name + "\")");
if (ft.storeTermVectorPositions()) {
throw new IllegalArgumentException("cannot store term vector positions "
+ "for a field that is not indexed (field=\"" + name + "\")");
if (ft.storeTermVectorOffsets()) {
throw new IllegalArgumentException("cannot store term vector offsets "
+ "for a field that is not indexed (field=\"" + name + "\")");
if (ft.storeTermVectorPayloads()) {
throw new IllegalArgumentException("cannot store term vector payloads "
+ "for a field that is not indexed (field=\"" + name + "\")");
/** Called from processDocument to index one field's dimensional value.
*
* <p>Records the field's dimension count and bytes-per-dimension, lazily
* creates the field's {@link DimensionalValuesWriter}, and buffers the
* field's binary value under the current doc ID.
*
* @param fp per-field state of the field being indexed
* @param field the field instance supplying the type and the packed value
* @throws IOException declared by the signature */
private void indexDimensionalValue(PerField fp, StorableField field) throws IOException {
int dimensionCount = field.fieldType().dimensionCount();
int dimensionNumBytes = field.fieldType().dimensionNumBytes();
// Record dimensions for this field; this setter will throw IllegalArgExc if
// the dimensions were already set to something different:
if (fp.fieldInfo.getDimensionCount() == 0) {
fieldInfos.globalFieldNumbers.setDimensions(fp.fieldInfo.number,, dimensionCount, dimensionNumBytes);
fp.fieldInfo.setDimensions(dimensionCount, dimensionNumBytes);
// One writer per field, created on the first dimensional value seen.
if (fp.dimensionalValuesWriter == null) {
fp.dimensionalValuesWriter = new DimensionalValuesWriter(docWriter, fp.fieldInfo);
fp.dimensionalValuesWriter.addPackedValue(docState.docID, field.binaryValue());
/** Called from processDocument to index one field's doc value.
*
* <p>Registers the doc values type for a field seen with doc values for the
* first time, then buffers the value in a writer chosen by {@code dvType};
* the writer is created on first use and kept in {@code fp.docValuesWriter}.
*
* @param fp per-field state of the field being indexed
* @param dvType doc values type declared by the field's type
* @param field the field instance supplying the numeric or binary value
* @throws IOException declared by the signature
* @throws AssertionError if {@code dvType} is not handled by the switch */
private void indexDocValue(PerField fp, DocValuesType dvType, StorableField field) throws IOException {
if (fp.fieldInfo.getDocValuesType() == DocValuesType.NONE) {
// This is the first time we are seeing this field indexed with doc values, so we
// now record the DV type so that any future attempt to (illegally) change
// the DV type of this field, will throw an IllegalArgExc:
fieldInfos.globalFieldNumbers.setDocValuesType(fp.fieldInfo.number,, dvType);
int docID = docState.docID;
// NOTE(review): several case labels, break statements and the default label
// are missing in this view; each branch is described by the writer it uses.
switch(dvType) {
// Buffers field.numericValue() as a long in a NumericDocValuesWriter.
if (fp.docValuesWriter == null) {
fp.docValuesWriter = new NumericDocValuesWriter(fp.fieldInfo, bytesUsed);
((NumericDocValuesWriter) fp.docValuesWriter).addValue(docID, field.numericValue().longValue());
case BINARY:
if (fp.docValuesWriter == null) {
fp.docValuesWriter = new BinaryDocValuesWriter(fp.fieldInfo, bytesUsed);
((BinaryDocValuesWriter) fp.docValuesWriter).addValue(docID, field.binaryValue());
case SORTED:
if (fp.docValuesWriter == null) {
fp.docValuesWriter = new SortedDocValuesWriter(fp.fieldInfo, bytesUsed);
((SortedDocValuesWriter) fp.docValuesWriter).addValue(docID, field.binaryValue());
// Buffers field.numericValue() as a long in a SortedNumericDocValuesWriter.
if (fp.docValuesWriter == null) {
fp.docValuesWriter = new SortedNumericDocValuesWriter(fp.fieldInfo, bytesUsed);
((SortedNumericDocValuesWriter) fp.docValuesWriter).addValue(docID, field.numericValue().longValue());
// Buffers field.binaryValue() in a SortedSetDocValuesWriter.
if (fp.docValuesWriter == null) {
fp.docValuesWriter = new SortedSetDocValuesWriter(fp.fieldInfo, bytesUsed);
((SortedSetDocValuesWriter) fp.docValuesWriter).addValue(docID, field.binaryValue());
// Reached for a type none of the branches above handles.
throw new AssertionError("unrecognized DocValues.Type: " + dvType);
/** Returns a previously created {@link PerField}, or null
* if this field name wasn't seen yet.
*
* @param name field name to look up; its {@code hashCode()} selects the bucket
* @return the matching {@link PerField}, or null */
private PerField getPerField(String name) {
// hashMask is table length minus one, so this maps the hash to a bucket index.
final int hashPos = name.hashCode() & hashMask;
PerField fp = fieldHash[hashPos];
// NOTE(review): the match condition and the chain step are truncated in this
// view; the loop ends with fp == null when nothing in the bucket matches.
while (fp != null && ! {
fp =;
return fp;
/** Returns a previously created {@link PerField},
* absorbing the type information from {@link FieldType},
* and creates a new {@link PerField} if this field name
* wasn't seen yet.
*
* @param name field name; its {@code hashCode()} selects the bucket
* @param fieldType type of the field instance being processed
* @param invert whether the caller is about to invert the field; passed to
*        the {@link PerField} constructor for new fields and checked for
*        existing fields that have no {@code invertState} yet
* @return the existing or newly created {@link PerField} */
private PerField getOrAddField(String name, IndexableFieldType fieldType, boolean invert) {
// Make sure we have a PerField allocated
final int hashPos = name.hashCode() & hashMask;
PerField fp = fieldHash[hashPos];
// NOTE(review): the match condition and the chain step are truncated in this view.
while (fp != null && ! {
fp =;
if (fp == null) {
// First time we are seeing this field in this segment
FieldInfo fi = fieldInfos.getOrAdd(name);
// Messy: must set this here because e.g. FreqProxTermsWriterPerField looks at the initial
// IndexOptions to decide what arrays it must create. Then, we also must set it in
// PerField.invert to allow for later downgrading of the index options:
fp = new PerField(fi, invert); = fieldHash[hashPos];
// The new entry becomes the head of its bucket.
fieldHash[hashPos] = fp;
// At most 50% load factor:
if (totalFieldCount >= fieldHash.length/2) {
// Keep the per-document fields array large enough to hold every known field.
if (totalFieldCount > fields.length) {
PerField[] newFields = new PerField[ArrayUtil.oversize(totalFieldCount, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
System.arraycopy(fields, 0, newFields, 0, fields.length);
fields = newFields;
} else if (invert && fp.invertState == null) {
// Existing field that has no inversion state yet but is now being inverted.
// Messy: must set this here because e.g. FreqProxTermsWriterPerField looks at the initial
// IndexOptions to decide what arrays it must create. Then, we also must set it in
// PerField.invert to allow for later downgrading of the index options:
return fp;
/** NOTE: not static: accesses at least docState, termsHash. */
private final class PerField implements Comparable<PerField> {
// Schema entry for this field; set once in the constructor.
final FieldInfo fieldInfo;
// Taken from docState.similarity in the constructor; used by finish() to
// compute the norm.
final Similarity similarity;
// Inversion state, created by setInvertState(); flush() treats a null value
// as "no postings to flush" for this field.
FieldInvertState invertState;
// Postings consumer obtained from termsHash.addField in setInvertState().
TermsHashPerField termsHashPerField;
// Non-null if this field ever had doc values in this
// segment:
DocValuesWriter docValuesWriter;
// Non-null if this field ever had dimensional values in this segment:
DimensionalValuesWriter dimensionalValuesWriter;
/** We use this to know when a PerField is seen for the
* first time in the current document. */
long fieldGen = -1;
// Used by the hash table
PerField next;
// Lazy init'd:
NormValuesWriter norms;
// reused
TokenStream tokenStream;
// NOTE(review): no assignment to this field is visible in this view.
IndexOptions indexOptions;
/** Creates the per-field state for {@code fieldInfo}, taking the similarity
* from the enclosing chain's {@code docState}.
*
* @param fieldInfo schema entry of the field
* @param invert whether inversion state should be set up now */
public PerField(FieldInfo fieldInfo, boolean invert) {
this.fieldInfo = fieldInfo;
similarity = docState.similarity;
// NOTE(review): the body of this branch is not visible in this view;
// presumably it calls setInvertState() — confirm.
if (invert) {
/** Creates this field's inversion state, registers the field with
* {@code termsHash} to obtain its {@link TermsHashPerField}, and, unless the
* field omits norms, creates its {@link NormValuesWriter}. */
void setInvertState() {
// NOTE(review): the constructor argument is truncated in this view.
invertState = new FieldInvertState(;
termsHashPerField = termsHash.addField(invertState, fieldInfo);
if (fieldInfo.omitsNorms() == false) {
// The norms writer must not already exist when this runs.
assert norms == null;
// Even if no documents actually succeed in setting a norm, we still write norms for this segment:
norms = new NormValuesWriter(fieldInfo, docState.docWriter.bytesUsed);
// Ordering required by Comparable<PerField>.
// NOTE(review): the method body is not visible in this view, so the sort key
// cannot be stated here.
public int compareTo(PerField other) {
/** Completes this field for the current document: when the field keeps norms
* and {@code invertState.length} is non-zero, the norm computed by the
* similarity is buffered under the current doc ID.
*
* @throws IOException declared by the signature */
public void finish() throws IOException {
// A zero invertState.length means there is no norm to record for this document.
if (fieldInfo.omitsNorms() == false && invertState.length != 0) {
norms.addValue(docState.docID, similarity.computeNorm(invertState));
/** Inverts one field for one document; first is true
* if this is the first time we are seeing this field
* name in this document. */
public void invert(IndexableField field, boolean first) throws IOException, AbortingException {
if (first) {
// First time we're seeing this field (indexed) in
// this document:
IndexableFieldType fieldType = field.fieldType();
IndexOptions indexOptions = fieldType.indexOptions();
if (fieldType.omitNorms()) {
final boolean analyzed = fieldType.tokenized() && docState.analyzer != null;
// only bother checking offsets if something will consume them.
// TODO: after we fix analyzers, also check if termVectorOffsets will be indexed.
final boolean checkOffsets = indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
* To assist people in tracking down problems in analysis components, we wish to write the field name to the infostream
* when we fail. We expect some caller to eventually deal with the real exception, so we don't want any 'catch' clauses,
* but rather a finally that takes note of the problem.
boolean succeededInProcessingField = false;
try (TokenStream stream = tokenStream = field.tokenStream(docState.analyzer, tokenStream)) {
// reset the TokenStream to the first token
termsHashPerField.start(field, first);
while (stream.incrementToken()) {
// If we hit an exception in below
// (which is fairly common, e.g. if analyzer
// chokes on a given document), then it's
// non-aborting and (above) this one document
// will be marked as deleted, but still
// consume a docID
int posIncr = invertState.posIncrAttribute.getPositionIncrement();
invertState.position += posIncr;
if (invertState.position < invertState.lastPosition) {
if (posIncr == 0) {
throw new IllegalArgumentException("first position increment must be > 0 (got 0) for field '" + + "'");
} else {
throw new IllegalArgumentException("position increments (and gaps) must be >= 0 (got " + posIncr + ") for field '" + + "'");
} else if (invertState.position > IndexWriter.MAX_POSITION) {
throw new IllegalArgumentException("position " + invertState.position + " is too large for field '" + + "': max allowed position is " + IndexWriter.MAX_POSITION);
invertState.lastPosition = invertState.position;
if (posIncr == 0) {
if (checkOffsets) {
int startOffset = invertState.offset + invertState.offsetAttribute.startOffset();
int endOffset = invertState.offset + invertState.offsetAttribute.endOffset();
if (startOffset < invertState.lastStartOffset || endOffset < startOffset) {
throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, and offsets must not go backwards "
+ "startOffset=" + startOffset + ",endOffset=" + endOffset + ",lastStartOffset=" + invertState.lastStartOffset + " for field '" + + "'");
invertState.lastStartOffset = startOffset;
if (invertState.length < 0) {
throw new IllegalArgumentException("too many tokens in field '" + + "'");
//System.out.println(" term=" + invertState.termAttribute);
// If we hit an exception in here, we abort
// all buffered documents since the last
// flush, on the likelihood that the
// internal state of the terms hash is now
// corrupt and should not be flushed to a
// new segment:
try {
} catch (MaxBytesLengthExceededException e) {
byte[] prefix = new byte[30];
BytesRef bigTerm = invertState.termAttribute.getBytesRef();
System.arraycopy(bigTerm.bytes, bigTerm.offset, prefix, 0, 30);
String msg = "Document contains at least one immense term in field=\"" + + "\" (whose UTF8 encoding is longer than the max length " + DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8 + "), all of which were skipped. Please correct the analyzer to not produce such terms. The prefix of the first immense term is: '" + Arrays.toString(prefix) + "...', original message: " + e.getMessage();
if (docState.infoStream.isEnabled("IW")) {
docState.infoStream.message("IW", "ERROR: " + msg);
// Document will be deleted above:
throw new IllegalArgumentException(msg, e);
} catch (Throwable th) {
throw AbortingException.wrap(th);
// trigger streams to perform end-of-stream operations
// TODO: maybe add some safety? then again, it's already checked
// when we come back around to the field...
invertState.position += invertState.posIncrAttribute.getPositionIncrement();
invertState.offset += invertState.offsetAttribute.endOffset();
/* if there is an exception coming through, we won't set this to true here:*/
succeededInProcessingField = true;
} finally {
if (!succeededInProcessingField && docState.infoStream.isEnabled("DW")) {
docState.infoStream.message("DW", "An exception was thrown while processing field " +;
if (analyzed) {
invertState.position += docState.analyzer.getPositionIncrementGap(;
invertState.offset += docState.analyzer.getOffsetGap(;
invertState.boost *= field.boost();