/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
 */

package org.apache.lucene.index;

import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.codecs.TermVectorsWriter;
import org.apache.lucene.codecs.compressing.CompressingTermVectorsFormat;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FlushInfo;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntBlockPool;
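
/**
 * A {@link TermVectorsConsumer} for index-sorted segments: term vectors are first written to a
 * temporary, uncompressed format and then rewritten in the codec's real term vectors format in
 * sorted document order when the segment is flushed.
 */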
final class SortingTermVectorsConsumer extends TermVectorsConsumer {
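  // Temporary format used to buffer term vectors until flush; compression is disabled because
  // the data is short-lived and will be read back in random (sorted) order.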
private static final TermVectorsFormat TEMP_TERM_VECTORS_FORMAT =
new CompressingTermVectorsFormat(
"TempTermVectors", "", SortingStoredFieldsConsumer.NO_COMPRESSION, 8 * 1024, 128, 10);
TrackingTmpOutputDirectoryWrapper tmpDirectory;

  SortingTermVectorsConsumer(
      final IntBlockPool.Allocator intBlockAllocator,
      final ByteBlockPool.Allocator byteBlockAllocator,
      Directory directory,
      SegmentInfo info,
      Codec codec) {
super(intBlockAllocator, byteBlockAllocator, directory, info, codec);
}

  @Override
  void flush(
      Map<String, TermsHashPerField> fieldsToFlush,
      final SegmentWriteState state,
      Sorter.DocMap sortMap,
      NormsProducer norms)
      throws IOException {
super.flush(fieldsToFlush, state, sortMap, norms);
if (tmpDirectory != null) {
TermVectorsReader reader = TEMP_TERM_VECTORS_FORMAT
.vectorsReader(tmpDirectory, state.segmentInfo, state.fieldInfos, IOContext.DEFAULT);
// Don't pull a merge instance, since merge instances optimize for
// sequential access while term vectors will likely be accessed in random
// order here.
TermVectorsWriter writer = codec.termVectorsFormat()
.vectorsWriter(state.directory, state.segmentInfo, IOContext.DEFAULT);
try {
reader.checkIntegrity();
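        // Copy documents in the new (sorted) order, reading each one's vectors from its old
        // (unsorted) position in the temporary file.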
for (int docID = 0; docID < state.segmentInfo.maxDoc(); docID++) {
Fields vectors = reader.get(sortMap == null ? docID : sortMap.newToOld(docID));
writeTermVectors(writer, vectors, state.fieldInfos);
}
writer.finish(state.fieldInfos, state.segmentInfo.maxDoc());
} finally {
IOUtils.close(reader, writer);
IOUtils.deleteFiles(tmpDirectory,
tmpDirectory.getTemporaryFiles().values());
}
}
}

  @Override
void initTermVectorsWriter() throws IOException {
if (writer == null) {
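      // Buffer the still-unsorted vectors in tracked temporary files so flush() can find and
      // delete them after rewriting the vectors in sorted order.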
IOContext context = new IOContext(new FlushInfo(lastDocID, bytesUsed.get()));
tmpDirectory = new TrackingTmpOutputDirectoryWrapper(directory);
writer = TEMP_TERM_VECTORS_FORMAT.vectorsWriter(tmpDirectory, info, context);
lastDocID = 0;
}
}

  @Override
public void abort() {
try {
super.abort();
} finally {
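      // Delete any temporary term vector files even if the base abort throws.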
if (tmpDirectory != null) {
IOUtils.deleteFilesIgnoringExceptions(tmpDirectory,
tmpDirectory.getTemporaryFiles().values());
}
}
}

  /**
   * Safe (but slowish) default method that copies every term vector field of a document to the
   * provided {@link TermVectorsWriter}.
   */
  private static void writeTermVectors(
      TermVectorsWriter writer, Fields vectors, FieldInfos fieldInfos) throws IOException {
if (vectors == null) {
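      // The document has no term vectors; still emit an empty entry so document IDs stay aligned.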
writer.startDocument(0);
writer.finishDocument();
return;
}
int numFields = vectors.size();
if (numFields == -1) {
// count manually! TODO: Maybe enforce that Fields.size() returns something valid?
numFields = 0;
for (final Iterator<String> it = vectors.iterator(); it.hasNext(); ) {
it.next();
numFields++;
}
}
writer.startDocument(numFields);
String lastFieldName = null;
TermsEnum termsEnum = null;
PostingsEnum docsAndPositionsEnum = null;
int fieldCount = 0;
    for (String fieldName : vectors) {
fieldCount++;
final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldName);
      assert lastFieldName == null || fieldName.compareTo(lastFieldName) > 0
          : "lastFieldName=" + lastFieldName + " fieldName=" + fieldName;
lastFieldName = fieldName;
final Terms terms = vectors.terms(fieldName);
if (terms == null) {
        // the Fields iterator shouldn't lie about which fields exist...
continue;
}
final boolean hasPositions = terms.hasPositions();
final boolean hasOffsets = terms.hasOffsets();
final boolean hasPayloads = terms.hasPayloads();
assert !hasPayloads || hasPositions;
int numTerms = (int) terms.size();
if (numTerms == -1) {
        // Count manually. It is wasteful, but needed, as Terms.size() is not a mandatory statistic.
numTerms = 0;
termsEnum = terms.iterator();
        while (termsEnum.next() != null) {
numTerms++;
}
}
writer.startField(fieldInfo, numTerms, hasPositions, hasOffsets, hasPayloads);
termsEnum = terms.iterator();
int termCount = 0;
      while (termsEnum.next() != null) {
termCount++;
final int freq = (int) termsEnum.totalTermFreq();
writer.startTerm(termsEnum.term(), freq);
if (hasPositions || hasOffsets) {
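          // A term vector is an inverted index over a single document, so each term's postings
          // match exactly one doc, carrying the positions/offsets/payloads as indexed.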
          docsAndPositionsEnum =
              termsEnum.postings(
                  docsAndPositionsEnum, PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS);
assert docsAndPositionsEnum != null;
final int docID = docsAndPositionsEnum.nextDoc();
assert docID != DocIdSetIterator.NO_MORE_DOCS;
assert docsAndPositionsEnum.freq() == freq;
          for (int posUpto = 0; posUpto < freq; posUpto++) {
final int pos = docsAndPositionsEnum.nextPosition();
final int startOffset = docsAndPositionsEnum.startOffset();
final int endOffset = docsAndPositionsEnum.endOffset();
final BytesRef payload = docsAndPositionsEnum.getPayload();
            assert !hasPositions || pos >= 0;
writer.addPosition(pos, startOffset, endOffset, payload);
}
}
writer.finishTerm();
}
assert termCount == numTerms;
writer.finishField();
}
assert fieldCount == numFields;
writer.finishDocument();
}
}