| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.index; |
| |
| |
| import java.io.IOException; |
| import java.util.Iterator; |
| import java.util.Map; |
| |
| import org.apache.lucene.codecs.Codec; |
| import org.apache.lucene.codecs.NormsProducer; |
| import org.apache.lucene.codecs.TermVectorsFormat; |
| import org.apache.lucene.codecs.TermVectorsReader; |
| import org.apache.lucene.codecs.TermVectorsWriter; |
| import org.apache.lucene.codecs.compressing.CompressingTermVectorsFormat; |
| import org.apache.lucene.search.DocIdSetIterator; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.store.FlushInfo; |
| import org.apache.lucene.store.IOContext; |
| import org.apache.lucene.util.ByteBlockPool; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.IOUtils; |
| import org.apache.lucene.util.IntBlockPool; |
| |
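/**
 * A {@link TermVectorsConsumer} used when an index sort is configured: term vectors are
 * buffered in a temporary format at flush time, then read back and written to the codec's
 * real term vectors format in the document order defined by the sort.
 */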
| final class SortingTermVectorsConsumer extends TermVectorsConsumer { |
| |
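  // Temporary format used to buffer vectors during flush. Compression is disabled because
  // the temp files are read back exactly once, right after flush, and then deleted.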
| private static final TermVectorsFormat TEMP_TERM_VECTORS_FORMAT = |
| new CompressingTermVectorsFormat( |
| "TempTermVectors", "", SortingStoredFieldsConsumer.NO_COMPRESSION, 8 * 1024, 128, 10); |

  // Wraps the segment directory during flush so the temporary term vector files can be
  // tracked and deleted once they have been copied in sorted order (or on abort).
  TrackingTmpOutputDirectoryWrapper tmpDirectory;
| |
  SortingTermVectorsConsumer(final IntBlockPool.Allocator intBlockAllocator,
      final ByteBlockPool.Allocator byteBlockAllocator, Directory directory, SegmentInfo info,
      Codec codec) {
| super(intBlockAllocator, byteBlockAllocator, directory, info, codec); |
| } |
| |
| @Override |
  void flush(Map<String, TermsHashPerField> fieldsToFlush, final SegmentWriteState state,
      Sorter.DocMap sortMap, NormsProducer norms) throws IOException {
| super.flush(fieldsToFlush, state, sortMap, norms); |
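    // tmpDirectory is only set by initTermVectorsWriter, so null means no document in this
    // segment had term vectors and there is nothing to sort.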
| if (tmpDirectory != null) { |
| TermVectorsReader reader = TEMP_TERM_VECTORS_FORMAT |
| .vectorsReader(tmpDirectory, state.segmentInfo, state.fieldInfos, IOContext.DEFAULT); |
| // Don't pull a merge instance, since merge instances optimize for |
| // sequential access while term vectors will likely be accessed in random |
| // order here. |
| TermVectorsWriter writer = codec.termVectorsFormat() |
| .vectorsWriter(state.directory, state.segmentInfo, IOContext.DEFAULT); |
| try { |
| reader.checkIntegrity(); |
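        // Copy the vectors in the new (sorted) document order; sortMap maps each new docID
        // back to the slot where that document's vectors were originally buffered.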
| for (int docID = 0; docID < state.segmentInfo.maxDoc(); docID++) { |
| Fields vectors = reader.get(sortMap == null ? docID : sortMap.newToOld(docID)); |
| writeTermVectors(writer, vectors, state.fieldInfos); |
| } |
| writer.finish(state.fieldInfos, state.segmentInfo.maxDoc()); |
| } finally { |
| IOUtils.close(reader, writer); |
| IOUtils.deleteFiles(tmpDirectory, |
| tmpDirectory.getTemporaryFiles().values()); |
| } |
| } |
| } |
| |
| @Override |
| void initTermVectorsWriter() throws IOException { |
| if (writer == null) { |
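      // Write the vectors through a tracking wrapper so the temporary files can be located
      // again in flush (for the sorted copy) and deleted afterwards.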
| IOContext context = new IOContext(new FlushInfo(lastDocID, bytesUsed.get())); |
| tmpDirectory = new TrackingTmpOutputDirectoryWrapper(directory); |
| writer = TEMP_TERM_VECTORS_FORMAT.vectorsWriter(tmpDirectory, info, context); |
| lastDocID = 0; |
| } |
| } |
| |
| @Override |
| public void abort() { |
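    // always attempt to delete the temporary files, even if the superclass abort throws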
| try { |
| super.abort(); |
| } finally { |
| if (tmpDirectory != null) { |
| IOUtils.deleteFilesIgnoringExceptions(tmpDirectory, |
| tmpDirectory.getTemporaryFiles().values()); |
| } |
| } |
| } |
| |
  /** Safe (but slow) default method that copies every term vector field into the provided {@link TermVectorsWriter}. */
| private static void writeTermVectors(TermVectorsWriter writer, Fields vectors, FieldInfos fieldInfos) throws IOException { |
| if (vectors == null) { |
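      // this document has no term vectors: write an empty entry so docIDs stay aligned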
| writer.startDocument(0); |
| writer.finishDocument(); |
| return; |
| } |
| |
| int numFields = vectors.size(); |
| if (numFields == -1) { |
| // count manually! TODO: Maybe enforce that Fields.size() returns something valid? |
| numFields = 0; |
| for (final Iterator<String> it = vectors.iterator(); it.hasNext(); ) { |
| it.next(); |
| numFields++; |
| } |
| } |
| writer.startDocument(numFields); |
| |
| String lastFieldName = null; |
| |
| TermsEnum termsEnum = null; |
| PostingsEnum docsAndPositionsEnum = null; |
| |
| int fieldCount = 0; |
    for (String fieldName : vectors) {
| fieldCount++; |
| final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldName); |
| |
      assert lastFieldName == null || fieldName.compareTo(lastFieldName) > 0
          : "lastFieldName=" + lastFieldName + " fieldName=" + fieldName;
| lastFieldName = fieldName; |
| |
| final Terms terms = vectors.terms(fieldName); |
| if (terms == null) { |
        // the Fields iterator listed this field, so terms() should not be null; skip defensively if it is
| continue; |
| } |
| |
| final boolean hasPositions = terms.hasPositions(); |
| final boolean hasOffsets = terms.hasOffsets(); |
| final boolean hasPayloads = terms.hasPayloads(); |
| assert !hasPayloads || hasPositions; |
| |
| int numTerms = (int) terms.size(); |
      if (numTerms == -1) {
        // count manually; Terms.size() is an optional statistic and may be unavailable
        numTerms = 0;
        termsEnum = terms.iterator();
        while (termsEnum.next() != null) {
          numTerms++;
        }
      }
| |
| writer.startField(fieldInfo, numTerms, hasPositions, hasOffsets, hasPayloads); |
| termsEnum = terms.iterator(); |
| |
| int termCount = 0; |
      while (termsEnum.next() != null) {
| termCount++; |
| |
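        // a term vector is a single-document inverted index, so totalTermFreq is the
        // term's frequency within this document and fits in an int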
| final int freq = (int) termsEnum.totalTermFreq(); |
| |
| writer.startTerm(termsEnum.term(), freq); |
| |
| if (hasPositions || hasOffsets) { |
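          // request positions, offsets and payloads; the enum only returns what was
          // actually indexed for this field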
| docsAndPositionsEnum = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS); |
| assert docsAndPositionsEnum != null; |
| |
| final int docID = docsAndPositionsEnum.nextDoc(); |
| assert docID != DocIdSetIterator.NO_MORE_DOCS; |
| assert docsAndPositionsEnum.freq() == freq; |
| |
          for (int posUpto = 0; posUpto < freq; posUpto++) {
| final int pos = docsAndPositionsEnum.nextPosition(); |
| final int startOffset = docsAndPositionsEnum.startOffset(); |
| final int endOffset = docsAndPositionsEnum.endOffset(); |
| |
| final BytesRef payload = docsAndPositionsEnum.getPayload(); |
| |
            assert !hasPositions || pos >= 0;
| writer.addPosition(pos, startOffset, endOffset, payload); |
| } |
| } |
| writer.finishTerm(); |
| } |
| assert termCount == numTerms; |
| writer.finishField(); |
| } |
| assert fieldCount == numFields; |
| writer.finishDocument(); |
| } |
| } |