package org.apache.lucene.codecs.lucene40;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexFormatTooNewException;
import org.apache.lucene.index.IndexFormatTooOldException;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
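/**
 * Reads term vectors written in the Lucene 4.0 format. A segment's
 * vectors are split across three files: the index file (.tvx) holds two
 * longs per document, pointing into the documents file (.tvd) and the
 * fields file (.tvf); the .tvd file records which fields of each document
 * have vectors; the .tvf file holds each field's terms with their
 * frequencies and, optionally, positions and offsets.
 *
 * <p>A minimal usage sketch ({@code reader} is an already-open
 * Lucene40TermVectorsReader, and the field name "body" is hypothetical):
 * <pre>
 * Fields vectors = reader.get(docID);
 * if (vectors != null) {
 *   Terms terms = vectors.terms("body");
 *   if (terms != null) {
 *     TermsEnum termsEnum = terms.iterator(null);
 *     // iterate terms; use termsEnum.docsAndPositions(...) for positions/offsets
 *   }
 * }
 * </pre>
 */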
public class Lucene40TermVectorsReader extends TermVectorsReader {
// NOTE: if you make a new format, it must be larger than
// the current format
// Changed strings to UTF8 with length-in-bytes not length-in-chars
static final int FORMAT_UTF8_LENGTH_IN_BYTES = 4;
// NOTE: always change this if you switch to a new format!
// whenever you add a new format, make it 1 larger (positive version logic)!
static final int FORMAT_CURRENT = FORMAT_UTF8_LENGTH_IN_BYTES;
// when removing support for old versions, leave the last supported version here
static final int FORMAT_MINIMUM = FORMAT_UTF8_LENGTH_IN_BYTES;
// The size in bytes that the format version takes up at the beginning of each file
static final int FORMAT_SIZE = 4;
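// Bits of the per-field flag byte written in each field's tvf entry:
// bit 0 means positions are stored, bit 1 means offsets are stored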
static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1;
static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x2;
/** Extension of vectors fields file */
static final String VECTORS_FIELDS_EXTENSION = "tvf";
/** Extension of vectors documents file */
static final String VECTORS_DOCUMENTS_EXTENSION = "tvd";
/** Extension of vectors index file */
static final String VECTORS_INDEX_EXTENSION = "tvx";
private FieldInfos fieldInfos;
private IndexInput tvx;
private IndexInput tvd;
private IndexInput tvf;
private int size;
private int numTotalDocs;
private final int format;
// used by clone
Lucene40TermVectorsReader(FieldInfos fieldInfos, IndexInput tvx, IndexInput tvd, IndexInput tvf, int size, int numTotalDocs, int format) {
this.fieldInfos = fieldInfos;
this.tvx = tvx;
this.tvd = tvd;
this.tvf = tvf;
this.size = size;
this.numTotalDocs = numTotalDocs;
this.format = format;
}
public Lucene40TermVectorsReader(Directory d, SegmentInfo si, FieldInfos fieldInfos, IOContext context)
throws CorruptIndexException, IOException {
final String segment = si.name;
final int size = si.docCount;
boolean success = false;
try {
String idxName = IndexFileNames.segmentFileName(segment, "", VECTORS_INDEX_EXTENSION);
tvx = d.openInput(idxName, context);
format = checkValidFormat(tvx);
String fn = IndexFileNames.segmentFileName(segment, "", VECTORS_DOCUMENTS_EXTENSION);
tvd = d.openInput(fn, context);
final int tvdFormat = checkValidFormat(tvd);
fn = IndexFileNames.segmentFileName(segment, "", VECTORS_FIELDS_EXTENSION);
tvf = d.openInput(fn, context);
final int tvfFormat = checkValidFormat(tvf);
assert format == tvdFormat;
assert format == tvfFormat;
numTotalDocs = (int) (tvx.length() >> 4);
this.size = numTotalDocs;
assert size == 0 || numTotalDocs == size;
this.fieldInfos = fieldInfos;
success = true;
} finally {
// With lock-less commits, it's entirely possible (and
// fine) to hit a FileNotFound exception above. In
// this case, we want to explicitly close any subset
// of things that were opened so that we don't have to
// wait for a GC to do so.
if (!success) {
close();
}
}
}
// Used for bulk copy when merging
IndexInput getTvdStream() {
return tvd;
}
// Used for bulk copy when merging
IndexInput getTvfStream() {
return tvf;
}
// Not private to avoid synthetic access$NNN methods
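// Each document's tvx entry is two longs (a pointer into tvd and a
// pointer into tvf), i.e. 16 bytes, following the 4-byte format header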
void seekTvx(final int docNum) throws IOException {
tvx.seek(docNum * 16L + FORMAT_SIZE);
}
boolean canReadRawDocs() {
// we can always read raw docs, unless the term vectors
// didn't exist
return format != 0;
}
/** Retrieve the length (in bytes) of the tvd and tvf
* entries for the next numDocs starting with
* startDocID. This is used for bulk copying when
* merging segments, if the field numbers are
* congruent. Once this returns, the tvd and tvf streams
* are positioned at startDocID. */
final void rawDocs(int[] tvdLengths, int[] tvfLengths, int startDocID, int numDocs) throws IOException {
if (tvx == null) {
Arrays.fill(tvdLengths, 0);
Arrays.fill(tvfLengths, 0);
return;
}
seekTvx(startDocID);
long tvdPosition = tvx.readLong();
tvd.seek(tvdPosition);
long tvfPosition = tvx.readLong();
tvf.seek(tvfPosition);
long lastTvdPosition = tvdPosition;
long lastTvfPosition = tvfPosition;
int count = 0;
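// Entry lengths are recovered by differencing consecutive tvx pointers;
// the last document's entries extend to the end of the tvd/tvf files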
while (count < numDocs) {
final int docID = startDocID + count + 1;
assert docID <= numTotalDocs;
if (docID < numTotalDocs) {
tvdPosition = tvx.readLong();
tvfPosition = tvx.readLong();
} else {
tvdPosition = tvd.length();
tvfPosition = tvf.length();
assert count == numDocs-1;
}
tvdLengths[count] = (int) (tvdPosition-lastTvdPosition);
tvfLengths[count] = (int) (tvfPosition-lastTvfPosition);
count++;
lastTvdPosition = tvdPosition;
lastTvfPosition = tvfPosition;
}
}
private int checkValidFormat(IndexInput in) throws CorruptIndexException, IOException {
int format = in.readInt();
if (format < FORMAT_MINIMUM) {
throw new IndexFormatTooOldException(in, format, FORMAT_MINIMUM, FORMAT_CURRENT);
}
if (format > FORMAT_CURRENT) {
throw new IndexFormatTooNewException(in, format, FORMAT_MINIMUM, FORMAT_CURRENT);
}
return format;
}
public void close() throws IOException {
IOUtils.close(tvx, tvd, tvf);
}
/** @return the number of documents in the reader */
int size() {
return size;
}
private class TVFields extends Fields {
private final int[] fieldNumbers;
private final long[] fieldFPs;
private final Map<Integer,Integer> fieldNumberToIndex = new HashMap<Integer,Integer>();
public TVFields(int docID) throws IOException {
seekTvx(docID);
tvd.seek(tvx.readLong());
final int fieldCount = tvd.readVInt();
assert fieldCount >= 0;
if (fieldCount != 0) {
fieldNumbers = new int[fieldCount];
fieldFPs = new long[fieldCount];
for(int fieldUpto=0;fieldUpto<fieldCount;fieldUpto++) {
final int fieldNumber = tvd.readVInt();
fieldNumbers[fieldUpto] = fieldNumber;
fieldNumberToIndex.put(fieldNumber, fieldUpto);
}
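// The first field's position in tvf comes from tvx; the remaining
// positions are delta-coded as VLongs in tvd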
long position = tvx.readLong();
fieldFPs[0] = position;
for(int fieldUpto=1;fieldUpto<fieldCount;fieldUpto++) {
position += tvd.readVLong();
fieldFPs[fieldUpto] = position;
}
} else {
// TODO: we can improve writer here, eg write 0 into
// tvx file, so we know on first read from tvx that
// this doc has no TVs
fieldNumbers = null;
fieldFPs = null;
}
}
@Override
public FieldsEnum iterator() throws IOException {
return new FieldsEnum() {
private int fieldUpto;
@Override
public String next() throws IOException {
if (fieldNumbers != null && fieldUpto < fieldNumbers.length) {
return fieldInfos.fieldName(fieldNumbers[fieldUpto++]);
} else {
return null;
}
}
@Override
public Terms terms() throws IOException {
return TVFields.this.terms(fieldInfos.fieldName(fieldNumbers[fieldUpto-1]));
}
};
}
@Override
public Terms terms(String field) throws IOException {
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
if (fieldInfo == null) {
// No such field
return null;
}
final Integer fieldIndex = fieldNumberToIndex.get(fieldInfo.number);
if (fieldIndex == null) {
// Term vectors were not indexed for this field
return null;
}
return new TVTerms(fieldFPs[fieldIndex]);
}
@Override
public int getUniqueFieldCount() {
if (fieldNumbers == null) {
return 0;
} else {
return fieldNumbers.length;
}
}
}
private class TVTerms extends Terms {
private final int numTerms;
private final long tvfFPStart;
public TVTerms(long tvfFP) throws IOException {
tvf.seek(tvfFP);
numTerms = tvf.readVInt();
tvfFPStart = tvf.getFilePointer();
}
@Override
public TermsEnum iterator(TermsEnum reuse) throws IOException {
TVTermsEnum termsEnum;
if (reuse instanceof TVTermsEnum) {
termsEnum = (TVTermsEnum) reuse;
if (!termsEnum.canReuse(tvf)) {
termsEnum = new TVTermsEnum();
}
} else {
termsEnum = new TVTermsEnum();
}
termsEnum.reset(numTerms, tvfFPStart);
return termsEnum;
}
@Override
public long getUniqueTermCount() {
return numTerms;
}
@Override
public long getSumTotalTermFreq() {
return -1;
}
@Override
public long getSumDocFreq() {
// Every term occurs in just one doc:
return numTerms;
}
@Override
public int getDocCount() {
return 1;
}
@Override
public Comparator<BytesRef> getComparator() {
// TODO: really indexer hardwires
// this...? I guess codec could buffer and re-sort...
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
}
private class TVTermsEnum extends TermsEnum {
private final IndexInput origTVF;
private final IndexInput tvf;
private int numTerms;
private int nextTerm;
private int freq;
private BytesRef lastTerm = new BytesRef();
private BytesRef term = new BytesRef();
private boolean storePositions;
private boolean storeOffsets;
private long tvfFP;
private int[] positions;
private int[] startOffsets;
private int[] endOffsets;
// NOTE: tvf is pre-positioned by caller
public TVTermsEnum() throws IOException {
this.origTVF = Lucene40TermVectorsReader.this.tvf;
tvf = (IndexInput) origTVF.clone();
}
public boolean canReuse(IndexInput tvf) {
return tvf == origTVF;
}
public void reset(int numTerms, long tvfFPStart) throws IOException {
this.numTerms = numTerms;
nextTerm = 0;
tvf.seek(tvfFPStart);
final byte bits = tvf.readByte();
storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
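// remember the position just past the 1-byte flags so seekCeil can
// rewind to the first term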
tvfFP = 1+tvfFPStart;
positions = null;
startOffsets = null;
endOffsets = null;
}
// NOTE: slow! (linear scan)
@Override
public SeekStatus seekCeil(BytesRef text, boolean useCache)
throws IOException {
if (nextTerm != 0) {
final int cmp = text.compareTo(term);
if (cmp < 0) {
nextTerm = 0;
tvf.seek(tvfFP);
} else if (cmp == 0) {
return SeekStatus.FOUND;
}
}
while (next() != null) {
final int cmp = text.compareTo(term);
if (cmp < 0) {
return SeekStatus.NOT_FOUND;
} else if (cmp == 0) {
return SeekStatus.FOUND;
}
}
return SeekStatus.END;
}
@Override
public void seekExact(long ord) {
throw new UnsupportedOperationException();
}
@Override
public BytesRef next() throws IOException {
if (nextTerm >= numTerms) {
return null;
}
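// Terms are prefix-compressed: read the length of the prefix shared
// with the previous term, then the suffix bytes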
term.copyBytes(lastTerm);
final int start = tvf.readVInt();
final int deltaLen = tvf.readVInt();
term.length = start + deltaLen;
term.grow(term.length);
tvf.readBytes(term.bytes, start, deltaLen);
freq = tvf.readVInt();
if (storePositions) {
// TODO: we could maybe reuse last array, if we can
// somehow be careful about consumer never using two
// D&PEnums at once...
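// position deltas are VInts; accumulate to absolute positions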
positions = new int[freq];
int pos = 0;
for(int posUpto=0;posUpto<freq;posUpto++) {
pos += tvf.readVInt();
positions[posUpto] = pos;
}
}
if (storeOffsets) {
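// offsets are delta-encoded too: each start is a delta from the
// previous end offset, each end a delta from its start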
startOffsets = new int[freq];
endOffsets = new int[freq];
int offset = 0;
for(int posUpto=0;posUpto<freq;posUpto++) {
startOffsets[posUpto] = offset + tvf.readVInt();
offset = endOffsets[posUpto] = startOffsets[posUpto] + tvf.readVInt();
}
}
lastTerm.copyBytes(term);
nextTerm++;
return term;
}
@Override
public BytesRef term() {
return term;
}
@Override
public long ord() {
throw new UnsupportedOperationException();
}
@Override
public int docFreq() {
return 1;
}
@Override
public long totalTermFreq() {
return freq;
}
@Override
public DocsEnum docs(Bits liveDocs, DocsEnum reuse, boolean needsFreqs /* ignored */) throws IOException {
TVDocsEnum docsEnum;
if (reuse instanceof TVDocsEnum) {
docsEnum = (TVDocsEnum) reuse;
} else {
docsEnum = new TVDocsEnum();
}
docsEnum.reset(liveDocs, freq);
return docsEnum;
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
if (needsOffsets && !storeOffsets) {
return null;
}
if (!storePositions && !storeOffsets) {
return null;
}
TVDocsAndPositionsEnum docsAndPositionsEnum;
if (reuse instanceof TVDocsAndPositionsEnum) {
docsAndPositionsEnum = (TVDocsAndPositionsEnum) reuse;
} else {
docsAndPositionsEnum = new TVDocsAndPositionsEnum();
}
docsAndPositionsEnum.reset(liveDocs, positions, startOffsets, endOffsets);
return docsAndPositionsEnum;
}
@Override
public Comparator<BytesRef> getComparator() {
// TODO: really indexer hardwires
// this...? I guess codec could buffer and re-sort...
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
}
// NOTE: sort of a silly class, since you can get the
// freq() already by TermsEnum.totalTermFreq
private static class TVDocsEnum extends DocsEnum {
private boolean didNext;
private int doc = -1;
private int freq;
private Bits liveDocs;
@Override
public int freq() {
return freq;
}
@Override
public int docID() {
return doc;
}
@Override
public int nextDoc() {
if (!didNext && (liveDocs == null || liveDocs.get(0))) {
didNext = true;
return (doc = 0);
} else {
return (doc = NO_MORE_DOCS);
}
}
@Override
public int advance(int target) {
if (!didNext && target == 0) {
return nextDoc();
} else {
return (doc = NO_MORE_DOCS);
}
}
public void reset(Bits liveDocs, int freq) {
this.liveDocs = liveDocs;
this.freq = freq;
this.doc = -1;
didNext = false;
}
}
private static class TVDocsAndPositionsEnum extends DocsAndPositionsEnum {
private boolean didNext;
private int doc = -1;
private int nextPos;
private Bits liveDocs;
private int[] positions;
private int[] startOffsets;
private int[] endOffsets;
@Override
public int freq() {
if (positions != null) {
return positions.length;
} else {
assert startOffsets != null;
return startOffsets.length;
}
}
@Override
public int docID() {
return doc;
}
@Override
public int nextDoc() {
if (!didNext && (liveDocs == null || liveDocs.get(0))) {
didNext = true;
return (doc = 0);
} else {
return (doc = NO_MORE_DOCS);
}
}
@Override
public int advance(int target) {
if (!didNext && target == 0) {
return nextDoc();
} else {
return (doc = NO_MORE_DOCS);
}
}
public void reset(Bits liveDocs, int[] positions, int[] startOffsets, int[] endOffsets) {
this.liveDocs = liveDocs;
this.positions = positions;
this.startOffsets = startOffsets;
this.endOffsets = endOffsets;
this.doc = -1;
didNext = false;
nextPos = 0;
}
@Override
public BytesRef getPayload() {
return null;
}
@Override
public boolean hasPayload() {
return false;
}
@Override
public int nextPosition() {
assert (positions != null && nextPos < positions.length) ||
(startOffsets != null && nextPos < startOffsets.length);
if (positions != null) {
return positions[nextPos++];
} else {
nextPos++;
return -1;
}
}
@Override
public int startOffset() {
assert startOffsets != null;
return startOffsets[nextPos-1];
}
@Override
public int endOffset() {
assert endOffsets != null;
return endOffsets[nextPos-1];
}
}
@Override
public Fields get(int docID) throws IOException {
if (docID < 0 || docID >= numTotalDocs) {
throw new IllegalArgumentException("docID=" + docID + " is out of bounds [0.." + (numTotalDocs-1) + "]");
}
if (tvx != null) {
Fields fields = new TVFields(docID);
if (fields.getUniqueFieldCount() == 0) {
// TODO: we can improve writer here, eg write 0 into
// tvx file, so we know on first read from tvx that
// this doc has no TVs
return null;
} else {
return fields;
}
} else {
return null;
}
}
@Override
public TermVectorsReader clone() {
IndexInput cloneTvx = null;
IndexInput cloneTvd = null;
IndexInput cloneTvf = null;
// These are null when a TermVectorsReader was created
// on a segment that did not have term vectors saved
if (tvx != null && tvd != null && tvf != null) {
cloneTvx = (IndexInput) tvx.clone();
cloneTvd = (IndexInput) tvd.clone();
cloneTvf = (IndexInput) tvf.clone();
}
return new Lucene40TermVectorsReader(fieldInfos, cloneTvx, cloneTvd, cloneTvf, size, numTotalDocs, format);
}
public static void files(SegmentInfo info, Set<String> files) throws IOException {
if (info.getHasVectors()) {
files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_INDEX_EXTENSION));
files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_FIELDS_EXTENSION));
files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_DOCUMENTS_EXTENSION));
}
}
}