/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using BufferedIndexInput = Lucene.Net.Store.BufferedIndexInput;
using Directory = Lucene.Net.Store.Directory;
using IndexInput = Lucene.Net.Store.IndexInput;
namespace Lucene.Net.Index
{
class TermVectorsReader : System.ICloneable, IDisposable
{
// NOTE: if you make a new format, it must be larger than
// the current format
internal const int FORMAT_VERSION = 2;
// Changes to speed up bulk merging of term vectors:
internal const int FORMAT_VERSION2 = 3;
// Changed strings to UTF8 with length-in-bytes not length-in-chars
internal const int FORMAT_UTF8_LENGTH_IN_BYTES = 4;
// NOTE: always change this if you switch to a new format!
internal static readonly int FORMAT_CURRENT = FORMAT_UTF8_LENGTH_IN_BYTES;
//The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
internal const int FORMAT_SIZE = 4;
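// Bits in the per-field flags byte stored in the tvf file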
internal const byte STORE_POSITIONS_WITH_TERMVECTOR = (byte) (0x1);
internal const byte STORE_OFFSET_WITH_TERMVECTOR = (byte) (0x2);
private FieldInfos fieldInfos;
private IndexInput tvx;
private IndexInput tvd;
private IndexInput tvf;
private int size;
private int numTotalDocs;
// The docID offset where our docs begin in the index
// file. This will be 0 if we have our own private file.
private int docStoreOffset;
private int format;
private bool isDisposed;
internal TermVectorsReader(Directory d, System.String segment, FieldInfos fieldInfos):this(d, segment, fieldInfos, BufferedIndexInput.BUFFER_SIZE)
{
}
internal TermVectorsReader(Directory d, System.String segment, FieldInfos fieldInfos, int readBufferSize):this(d, segment, fieldInfos, readBufferSize, -1, 0)
{
}
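/// <summary>
/// Opens the term vector files (tvx, tvd, tvf) for the given segment.
/// A docStoreOffset of -1 means the segment has its own private term
/// vector files; otherwise our docs begin at that offset within shared
/// doc-store files and size is the number of docs in this segment.
/// </summary>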
internal TermVectorsReader(Directory d, System.String segment, FieldInfos fieldInfos, int readBufferSize, int docStoreOffset, int size)
{
bool success = false;
try
{
if (d.FileExists(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION))
{
tvx = d.OpenInput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION, readBufferSize);
format = CheckValidFormat(tvx);
tvd = d.OpenInput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION, readBufferSize);
int tvdFormat = CheckValidFormat(tvd);
tvf = d.OpenInput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION, readBufferSize);
int tvfFormat = CheckValidFormat(tvf);
System.Diagnostics.Debug.Assert(format == tvdFormat);
System.Diagnostics.Debug.Assert(format == tvfFormat);
if (format >= FORMAT_VERSION2)
{
System.Diagnostics.Debug.Assert((tvx.Length() - FORMAT_SIZE) % 16 == 0);
numTotalDocs = (int)(tvx.Length() >> 4);
}
else
{
System.Diagnostics.Debug.Assert((tvx.Length() - FORMAT_SIZE) % 8 == 0);
numTotalDocs = (int)(tvx.Length() >> 3);
}
if (-1 == docStoreOffset)
{
this.docStoreOffset = 0;
this.size = numTotalDocs;
System.Diagnostics.Debug.Assert(size == 0 || numTotalDocs == size);
}
else
{
this.docStoreOffset = docStoreOffset;
this.size = size;
// Verify the file is long enough to hold all of our
// docs
System.Diagnostics.Debug.Assert(numTotalDocs >= size + docStoreOffset, "numTotalDocs=" + numTotalDocs + " size=" + size + " docStoreOffset=" + docStoreOffset);
}
}
else
{
// If all documents flushed in a segment had hit
// non-aborting exceptions, it's possible that
// FieldInfos.hasVectors returns true yet the term
// vector files don't exist.
format = 0;
}
this.fieldInfos = fieldInfos;
success = true;
}
finally
{
// With lock-less commits, it's entirely possible (and
// fine) to hit a FileNotFound exception above. In
// this case, we want to explicitly close any subset
// of things that were opened so that we don't have to
// wait for a GC to do so.
if (!success)
{
Dispose();
}
}
}
// Used for bulk copy when merging
internal virtual IndexInput GetTvdStream()
{
return tvd;
}
// Used for bulk copy when merging
internal virtual IndexInput GetTvfStream()
{
return tvf;
}
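// Positions tvx at the entry for docNum. After the 4-byte format
// header, each tvx entry is one long (the tvd pointer) in formats
// before FORMAT_VERSION2, or two longs (the tvd and tvf pointers)
// from FORMAT_VERSION2 on.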
private void SeekTvx(int docNum)
{
if (format < FORMAT_VERSION2)
tvx.Seek((docNum + docStoreOffset) * 8L + FORMAT_SIZE);
else
tvx.Seek((docNum + docStoreOffset) * 16L + FORMAT_SIZE);
}
internal virtual bool CanReadRawDocs()
{
return format >= FORMAT_UTF8_LENGTH_IN_BYTES;
}
/// <summary>Retrieve the length (in bytes) of the tvd and tvf
/// entries for the next numDocs starting with
/// startDocID. This is used for bulk copying when
/// merging segments, if the field numbers are
/// congruent. Once this returns, the tvf &amp; tvd streams
/// are seeked to the startDocID.
/// </summary>
internal void RawDocs(int[] tvdLengths, int[] tvfLengths, int startDocID, int numDocs)
{
if (tvx == null)
{
for (int i = 0; i < tvdLengths.Length; i++)
{
tvdLengths[i] = 0;
}
for (int i = 0; i < tvfLengths.Length; i++)
{
tvfLengths[i] = 0;
}
return;
}
// SegmentMerger calls canReadRawDocs() first and should
// not call us if that returns false.
if (format < FORMAT_VERSION2)
throw new System.SystemException("cannot read raw docs with older term vector formats");
SeekTvx(startDocID);
long tvdPosition = tvx.ReadLong();
tvd.Seek(tvdPosition);
long tvfPosition = tvx.ReadLong();
tvf.Seek(tvfPosition);
long lastTvdPosition = tvdPosition;
long lastTvfPosition = tvfPosition;
int count = 0;
while (count < numDocs)
{
int docID = docStoreOffset + startDocID + count + 1;
System.Diagnostics.Debug.Assert(docID <= numTotalDocs);
if (docID < numTotalDocs)
{
tvdPosition = tvx.ReadLong();
tvfPosition = tvx.ReadLong();
}
else
{
tvdPosition = tvd.Length();
tvfPosition = tvf.Length();
System.Diagnostics.Debug.Assert(count == numDocs - 1);
}
tvdLengths[count] = (int) (tvdPosition - lastTvdPosition);
tvfLengths[count] = (int) (tvfPosition - lastTvfPosition);
count++;
lastTvdPosition = tvdPosition;
lastTvfPosition = tvfPosition;
}
}
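// Reads the format header and rejects anything newer than
// FORMAT_CURRENT; older formats remain readable for back-compat.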
private int CheckValidFormat(IndexInput input)
{
int format = input.ReadInt();
if (format > FORMAT_CURRENT)
{
throw new CorruptIndexException("Incompatible format version: " + format + " expected " + FORMAT_CURRENT + " or less");
}
return format;
}
public void Dispose()
{
Dispose(true);
}
protected virtual void Dispose(bool disposing)
{
if (isDisposed) return;
if (disposing)
{
// make all effort to close up. Keep the first exception
// and throw it as a new one.
System.IO.IOException keep = null;
if (tvx != null)
try
{
tvx.Close();
}
catch (System.IO.IOException e)
{
if (keep == null)
keep = e;
}
if (tvd != null)
try
{
tvd.Close();
}
catch (System.IO.IOException e)
{
if (keep == null)
keep = e;
}
if (tvf != null)
try
{
tvf.Close();
}
catch (System.IO.IOException e)
{
if (keep == null)
keep = e;
}
if (keep != null)
{
throw new System.IO.IOException(keep.Message, keep);
}
}
isDisposed = true;
}
/// <summary>The number of documents whose term vectors this reader can access.</summary>
/// <returns> The number of documents in the reader
/// </returns>
internal virtual int Size()
{
return size;
}
public virtual void Get(int docNum, System.String field, TermVectorMapper mapper)
{
if (tvx != null)
{
int fieldNumber = fieldInfos.FieldNumber(field);
//We need to account for the FORMAT_SIZE when seeking in the tvx.
//We don't need to do this in other seeks because we already have the
//file pointer that was written in another file.
SeekTvx(docNum);
//System.out.println("TVX Pointer: " + tvx.getFilePointer());
long tvdPosition = tvx.ReadLong();
tvd.Seek(tvdPosition);
int fieldCount = tvd.ReadVInt();
//System.out.println("Num Fields: " + fieldCount);
// There are only a few fields per document. We opt for a full scan
// rather than requiring that they be ordered. We need to read through
// all of the fields anyway to get to the tvf pointers.
int number = 0;
int found = -1;
for (int i = 0; i < fieldCount; i++)
{
if (format >= FORMAT_VERSION)
number = tvd.ReadVInt();
else
number += tvd.ReadVInt();
if (number == fieldNumber)
found = i;
}
// found == -1 means the field, although valid in the segment, was
// not present in this document
if (found != -1)
{
// Compute position in the tvf file
long position;
if (format >= FORMAT_VERSION2)
position = tvx.ReadLong();
else
position = tvd.ReadVLong();
for (int i = 1; i <= found; i++)
position += tvd.ReadVLong();
mapper.SetDocumentNumber(docNum);
ReadTermVector(field, position, mapper);
}
else
{
//System.out.println("Fieldable not found");
}
}
else
{
//System.out.println("No tvx file");
}
}
/// <summary> Retrieve the term vector for the given document and field</summary>
/// <param name="docNum">The document number to retrieve the vector for
/// </param>
/// <param name="field">The field within the document to retrieve
/// </param>
/// <returns> The TermFreqVector for the document and field or null if there is no termVector for this field.
/// </returns>
/// <throws> IOException if there is an error reading the term vector files </throws>
public /*internal*/ virtual ITermFreqVector Get(int docNum, System.String field)
{
// Check if no term vectors are available for this segment at all
ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
Get(docNum, field, mapper);
return mapper.MaterializeVector();
}
// Reads the String[] fields; you have to pre-seek tvd to
// the right point
private System.String[] ReadFields(int fieldCount)
{
int number = 0;
System.String[] fields = new System.String[fieldCount];
for (int i = 0; i < fieldCount; i++)
{
if (format >= FORMAT_VERSION)
number = tvd.ReadVInt();
else
number += tvd.ReadVInt();
fields[i] = fieldInfos.FieldName(number);
}
return fields;
}
// Reads the long[] offsets into TVF; you have to pre-seek
// tvx/tvd to the right point
private long[] ReadTvfPointers(int fieldCount)
{
// Compute position in the tvf file
long position;
if (format >= FORMAT_VERSION2)
position = tvx.ReadLong();
else
position = tvd.ReadVLong();
long[] tvfPointers = new long[fieldCount];
tvfPointers[0] = position;
for (int i = 1; i < fieldCount; i++)
{
position += tvd.ReadVLong();
tvfPointers[i] = position;
}
return tvfPointers;
}
/// <summary> Return all term vectors stored for this document or null if they could not be read in.
///
/// </summary>
/// <param name="docNum">The document number to retrieve the vector for
/// </param>
/// <returns> All term frequency vectors
/// </returns>
/// <throws> IOException if there is an error reading the term vector files </throws>
public /*internal*/ virtual ITermFreqVector[] Get(int docNum)
{
ITermFreqVector[] result = null;
if (tvx != null)
{
//We need to offset by the format header and docStoreOffset; SeekTvx handles both
SeekTvx(docNum);
long tvdPosition = tvx.ReadLong();
tvd.Seek(tvdPosition);
int fieldCount = tvd.ReadVInt();
// No fields are vectorized for this document
if (fieldCount != 0)
{
System.String[] fields = ReadFields(fieldCount);
long[] tvfPointers = ReadTvfPointers(fieldCount);
result = ReadTermVectors(docNum, fields, tvfPointers);
}
}
else
{
//System.out.println("No tvx file");
}
return result;
}
public virtual void Get(int docNumber, TermVectorMapper mapper)
{
// Check if no term vectors are available for this segment at all
if (tvx != null)
{
//We need to offset by the format header and docStoreOffset; SeekTvx handles both
SeekTvx(docNumber);
long tvdPosition = tvx.ReadLong();
tvd.Seek(tvdPosition);
int fieldCount = tvd.ReadVInt();
// No fields are vectorized for this document
if (fieldCount != 0)
{
System.String[] fields = ReadFields(fieldCount);
long[] tvfPointers = ReadTvfPointers(fieldCount);
mapper.SetDocumentNumber(docNumber);
ReadTermVectors(fields, tvfPointers, mapper);
}
}
else
{
//System.out.println("No tvx file");
}
}
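// Builds one SegmentTermVector per field by running each field's data
// through its own ParallelArrayTermVectorMapper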
private SegmentTermVector[] ReadTermVectors(int docNum, System.String[] fields, long[] tvfPointers)
{
SegmentTermVector[] res = new SegmentTermVector[fields.Length];
for (int i = 0; i < fields.Length; i++)
{
var mapper = new ParallelArrayTermVectorMapper();
mapper.SetDocumentNumber(docNum);
ReadTermVector(fields[i], tvfPointers[i], mapper);
res[i] = (SegmentTermVector) mapper.MaterializeVector();
}
return res;
}
private void ReadTermVectors(System.String[] fields, long[] tvfPointers, TermVectorMapper mapper)
{
for (int i = 0; i < fields.Length; i++)
{
ReadTermVector(fields[i], tvfPointers[i], mapper);
}
}
/// <summary>Reads the term vector for a single field from the tvf file into the mapper.</summary>
/// <param name="field">The field to read in
/// </param>
/// <param name="tvfPointer">The pointer within the tvf file where we should start reading
/// </param>
/// <param name="mapper">The mapper used to map the TermVector
/// </param>
/// <throws> IOException </throws>
private void ReadTermVector(System.String field, long tvfPointer, TermVectorMapper mapper)
{
// Now read the data from specified position
//We don't need to offset by the FORMAT here since the pointer already includes the offset
tvf.Seek(tvfPointer);
int numTerms = tvf.ReadVInt();
//System.out.println("Num Terms: " + numTerms);
// If there are no terms there is nothing to map; this should never occur in practice, though!
if (numTerms == 0)
return;
bool storePositions;
bool storeOffsets;
if (format >= FORMAT_VERSION)
{
byte bits = tvf.ReadByte();
storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
}
else
{
tvf.ReadVInt();
storePositions = false;
storeOffsets = false;
}
mapper.SetExpectations(field, numTerms, storeOffsets, storePositions);
int start = 0;
int deltaLength = 0;
int totalLength = 0;
byte[] byteBuffer;
char[] charBuffer;
bool preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES;
// init the buffers
if (preUTF8)
{
charBuffer = new char[10];
byteBuffer = null;
}
else
{
charBuffer = null;
byteBuffer = new byte[20];
}
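// Terms are stored prefix-compressed: each entry gives the length of
// the prefix shared with the previous term (start), the length of the
// suffix (deltaLength), and then the suffix itself.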
for (int i = 0; i < numTerms; i++)
{
start = tvf.ReadVInt();
deltaLength = tvf.ReadVInt();
totalLength = start + deltaLength;
System.String term;
if (preUTF8)
{
// Term stored as java chars
if (charBuffer.Length < totalLength)
{
char[] newCharBuffer = new char[(int) (1.5 * totalLength)];
Array.Copy(charBuffer, 0, newCharBuffer, 0, start);
charBuffer = newCharBuffer;
}
tvf.ReadChars(charBuffer, start, deltaLength);
term = new System.String(charBuffer, 0, totalLength);
}
else
{
// Term stored as utf8 bytes
if (byteBuffer.Length < totalLength)
{
byte[] newByteBuffer = new byte[(int) (1.5 * totalLength)];
Array.Copy(byteBuffer, 0, newByteBuffer, 0, start);
byteBuffer = newByteBuffer;
}
tvf.ReadBytes(byteBuffer, start, deltaLength);
term = System.Text.Encoding.UTF8.GetString(byteBuffer, 0, totalLength);
}
int freq = tvf.ReadVInt();
int[] positions = null;
if (storePositions)
{
//read in the positions
//does the mapper even care about positions?
if (mapper.IsIgnoringPositions == false)
{
positions = new int[freq];
int prevPosition = 0;
for (int j = 0; j < freq; j++)
{
positions[j] = prevPosition + tvf.ReadVInt();
prevPosition = positions[j];
}
}
else
{
//we need to skip over the positions. Since these are VInts, there is
//no way to know for sure how far to skip without reading them
for (int j = 0; j < freq; j++)
{
tvf.ReadVInt();
}
}
}
TermVectorOffsetInfo[] offsets = null;
if (storeOffsets)
{
//does the mapper even care about offsets?
if (mapper.IsIgnoringOffsets == false)
{
offsets = new TermVectorOffsetInfo[freq];
int prevOffset = 0;
for (int j = 0; j < freq; j++)
{
int startOffset = prevOffset + tvf.ReadVInt();
int endOffset = startOffset + tvf.ReadVInt();
offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset);
prevOffset = endOffset;
}
}
else
{
for (int j = 0; j < freq; j++)
{
tvf.ReadVInt();
tvf.ReadVInt();
}
}
}
mapper.Map(term, freq, offsets, positions);
}
}
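// Clones get their own IndexInput clones over the same underlying
// files, so each clone keeps an independent file position.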
public virtual System.Object Clone()
{
TermVectorsReader clone = (TermVectorsReader) base.MemberwiseClone();
// These are null when a TermVectorsReader was created
// on a segment that did not have term vectors saved
if (tvx != null && tvd != null && tvf != null)
{
clone.tvx = (IndexInput) tvx.Clone();
clone.tvd = (IndexInput) tvd.Clone();
clone.tvf = (IndexInput) tvf.Clone();
}
return clone;
}
}
/// <summary> Models the existing parallel array structure</summary>
class ParallelArrayTermVectorMapper : TermVectorMapper
{
private System.String[] terms;
private int[] termFreqs;
private int[][] positions;
private TermVectorOffsetInfo[][] offsets;
private int currentPosition;
private bool storingOffsets;
private bool storingPositions;
private System.String field;
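// Called once per field, before any Map calls, to size the arrays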
public override void SetExpectations(System.String field, int numTerms, bool storeOffsets, bool storePositions)
{
this.field = field;
terms = new System.String[numTerms];
termFreqs = new int[numTerms];
this.storingOffsets = storeOffsets;
this.storingPositions = storePositions;
if (storePositions)
this.positions = new int[numTerms][];
if (storeOffsets)
this.offsets = new TermVectorOffsetInfo[numTerms][];
}
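// Called once per term, in the order the terms appear in the tvf file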
public override void Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
{
terms[currentPosition] = term;
termFreqs[currentPosition] = frequency;
if (storingOffsets)
{
this.offsets[currentPosition] = offsets;
}
if (storingPositions)
{
this.positions[currentPosition] = positions;
}
currentPosition++;
}
/// <summary> Construct the vector</summary>
/// <returns> The <see cref="ITermFreqVector" /> based on the mappings.
/// </returns>
public virtual ITermFreqVector MaterializeVector()
{
SegmentTermVector tv = null;
if (field != null && terms != null)
{
if (storingPositions || storingOffsets)
{
tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
}
else
{
tv = new SegmentTermVector(field, terms, termFreqs);
}
}
return tv;
}
}
}