/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using BufferedIndexInput = Lucene.Net.Store.BufferedIndexInput;
using Directory = Lucene.Net.Store.Directory;
using IndexInput = Lucene.Net.Store.IndexInput;
namespace Lucene.Net.Index
{
class TermVectorsReader : System.ICloneable, IDisposable
{
// NOTE: if you make a new format, it must be larger than
// the current format
internal const int FORMAT_VERSION = 2;
// Changes to speed up bulk merging of term vectors:
internal const int FORMAT_VERSION2 = 3;
// Changed strings to UTF8 with length-in-bytes not length-in-chars
internal const int FORMAT_UTF8_LENGTH_IN_BYTES = 4;
// NOTE: always change this if you switch to a new format!
internal static readonly int FORMAT_CURRENT = FORMAT_UTF8_LENGTH_IN_BYTES;
//The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
internal const int FORMAT_SIZE = 4;
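// Bits in the per-field flags byte stored in the tvf file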
internal const byte STORE_POSITIONS_WITH_TERMVECTOR = (byte) (0x1);
internal const byte STORE_OFFSET_WITH_TERMVECTOR = (byte) (0x2);
private FieldInfos fieldInfos;
private IndexInput tvx;
private IndexInput tvd;
private IndexInput tvf;
private int size;
private int numTotalDocs;
// The docID offset where our docs begin in the index
// file. This will be 0 if we have our own private file.
private int docStoreOffset;
private int format;
private bool isDisposed;
internal TermVectorsReader(Directory d, System.String segment, FieldInfos fieldInfos):this(d, segment, fieldInfos, BufferedIndexInput.BUFFER_SIZE)
{
}
internal TermVectorsReader(Directory d, System.String segment, FieldInfos fieldInfos, int readBufferSize):this(d, segment, fieldInfos, readBufferSize, -1, 0)
{
}
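/// <summary>
/// Opens the term vector files (tvx, tvd, tvf) for the given segment.
/// A docStoreOffset of -1 means the segment has its own private term
/// vector files; otherwise our docs begin at that offset within shared
/// doc-store files and size is the number of docs in this segment.
/// </summary>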
internal TermVectorsReader(Directory d, System.String segment, FieldInfos fieldInfos, int readBufferSize, int docStoreOffset, int size)
{
bool success = false;
try
{
if (d.FileExists(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION))
{
tvx = d.OpenInput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION, readBufferSize);
format = CheckValidFormat(tvx);
tvd = d.OpenInput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION, readBufferSize);
int tvdFormat = CheckValidFormat(tvd);
tvf = d.OpenInput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION, readBufferSize);
int tvfFormat = CheckValidFormat(tvf);
System.Diagnostics.Debug.Assert(format == tvdFormat);
System.Diagnostics.Debug.Assert(format == tvfFormat);
if (format >= FORMAT_VERSION2)
{
System.Diagnostics.Debug.Assert((tvx.Length() - FORMAT_SIZE) % 16 == 0);
numTotalDocs = (int)(tvx.Length() >> 4);
}
else
{
System.Diagnostics.Debug.Assert((tvx.Length() - FORMAT_SIZE) % 8 == 0);
numTotalDocs = (int)(tvx.Length() >> 3);
}
if (-1 == docStoreOffset)
{
this.docStoreOffset = 0;
this.size = numTotalDocs;
System.Diagnostics.Debug.Assert(size == 0 || numTotalDocs == size);
}
else
{
this.docStoreOffset = docStoreOffset;
this.size = size;
// Verify the file is long enough to hold all of our
// docs
System.Diagnostics.Debug.Assert(numTotalDocs >= size + docStoreOffset, "numTotalDocs=" + numTotalDocs + " size=" + size + " docStoreOffset=" + docStoreOffset);
}
}
else
{
// If all documents flushed in a segment had hit
// non-aborting exceptions, it's possible that
// FieldInfos.hasVectors returns true yet the term
// vector files don't exist.
format = 0;
}
this.fieldInfos = fieldInfos;
success = true;
}
finally
{
// With lock-less commits, it's entirely possible (and
// fine) to hit a FileNotFound exception above. In
// this case, we want to explicitly close any subset
// of things that were opened so that we don't have to
// wait for a GC to do so.
if (!success)
{
Dispose();
}
}
}
// Used for bulk copy when merging
internal virtual IndexInput GetTvdStream()
{
return tvd;
}
// Used for bulk copy when merging
internal virtual IndexInput GetTvfStream()
{
return tvf;
}
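// Positions tvx at the entry for docNum. After the 4-byte format
// header, each tvx entry is one long (the tvd pointer) in formats
// before FORMAT_VERSION2, or two longs (the tvd and tvf pointers)
// from FORMAT_VERSION2 on.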
private void SeekTvx(int docNum)
{
if (format < FORMAT_VERSION2)
tvx.Seek((docNum + docStoreOffset) * 8L + FORMAT_SIZE);
else
tvx.Seek((docNum + docStoreOffset) * 16L + FORMAT_SIZE);
}
internal virtual bool CanReadRawDocs()
{
return format >= FORMAT_UTF8_LENGTH_IN_BYTES;
}
/// <summary>Retrieve the length (in bytes) of the tvd and tvf
/// entries for the next numDocs starting with
/// startDocID. This is used for bulk copying when
/// merging segments, if the field numbers are
/// congruent. Once this returns, the tvf &amp; tvd streams
/// are seeked to the startDocID.
/// </summary>
internal void RawDocs(int[] tvdLengths, int[] tvfLengths, int startDocID, int numDocs)
{
if (tvx == null)
{
for (int i = 0; i < tvdLengths.Length; i++)
{
tvdLengths[i] = 0;
}
for (int i = 0; i < tvfLengths.Length; i++)
{
tvfLengths[i] = 0;
}
return;
}
// SegmentMerger calls canReadRawDocs() first and should
// not call us if that returns false.
if (format < FORMAT_VERSION2)
throw new System.SystemException("cannot read raw docs with older term vector formats");
SeekTvx(startDocID);
long tvdPosition = tvx.ReadLong();
tvd.Seek(tvdPosition);
long tvfPosition = tvx.ReadLong();
tvf.Seek(tvfPosition);
long lastTvdPosition = tvdPosition;
long lastTvfPosition = tvfPosition;
int count = 0;
while (count < numDocs)
{
int docID = docStoreOffset + startDocID + count + 1;
System.Diagnostics.Debug.Assert(docID <= numTotalDocs);
if (docID < numTotalDocs)
{
tvdPosition = tvx.ReadLong();
tvfPosition = tvx.ReadLong();
}
else
{
tvdPosition = tvd.Length();
tvfPosition = tvf.Length();
System.Diagnostics.Debug.Assert(count == numDocs - 1);
}
tvdLengths[count] = (int) (tvdPosition - lastTvdPosition);
tvfLengths[count] = (int) (tvfPosition - lastTvfPosition);
count++;
lastTvdPosition = tvdPosition;
lastTvfPosition = tvfPosition;
}
}
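// Reads the format header and rejects anything newer than
// FORMAT_CURRENT; older formats remain readable for back-compat.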
private int CheckValidFormat(IndexInput input)
{
int format = input.ReadInt();
if (format > FORMAT_CURRENT)
{
throw new CorruptIndexException("Incompatible format version: " + format + " expected " + FORMAT_CURRENT + " or less");
}
return format;
}
public void Dispose()
{
Dispose(true);
}
protected virtual void Dispose(bool disposing)
{
if (isDisposed) return;
if (disposing)
{
// make all effort to close up. Keep the first exception
// and throw it as a new one.
System.IO.IOException keep = null;
if (tvx != null)
try
{
tvx.Close();
}
catch (System.IO.IOException e)
{
if (keep == null)
keep = e;
}
if (tvd != null)
try
{
tvd.Close();
}
catch (System.IO.IOException e)
{
if (keep == null)
keep = e;
}
if (tvf != null)
try
{
tvf.Close();
}
catch (System.IO.IOException e)
{
if (keep == null)
keep = e;
}
if (keep != null)
{
throw new System.IO.IOException(keep.Message, keep);
}
}
isDisposed = true;
}
/// <summary>The number of documents whose term vectors this reader can access.</summary>
/// <returns> The number of documents in the reader
/// </returns>
internal virtual int Size()
{
return size;
}
public virtual void Get(int docNum, System.String field, TermVectorMapper mapper)
{
if (tvx != null)
{
int fieldNumber = fieldInfos.FieldNumber(field);
//We need to account for the FORMAT_SIZE when seeking in the tvx.
//We don't need to do this in other seeks because we already have the
//file pointer that was written in another file.
SeekTvx(docNum);
//System.out.println("TVX Pointer: " + tvx.getFilePointer());
long tvdPosition = tvx.ReadLong();
tvd.Seek(tvdPosition);
int fieldCount = tvd.ReadVInt();
//System.out.println("Num Fields: " + fieldCount);
// There are only a few fields per document. We opt for a full scan
// rather than requiring that they be ordered. We need to read through
// all of the fields anyway to get to the tvf pointers.
int number = 0;
int found = -1;
for (int i = 0; i < fieldCount; i++)
{
if (format >= FORMAT_VERSION)
number = tvd.ReadVInt();
else
number += tvd.ReadVInt();
if (number == fieldNumber)
found = i;
}
// found == -1 means the field, although valid in the segment, was
// not present in this document
if (found != -1)
{
// Compute position in the tvf file
long position;
if (format >= FORMAT_VERSION2)
position = tvx.ReadLong();
else
position = tvd.ReadVLong();
for (int i = 1; i <= found; i++)
position += tvd.ReadVLong();
mapper.SetDocumentNumber(docNum);
ReadTermVector(field, position, mapper);
}
else
{
//System.out.println("Fieldable not found");
}
}
else
{
//System.out.println("No tvx file");
}
}
/// <summary> Retrieve the term vector for the given document and field</summary>
/// <param name="docNum">The document number to retrieve the vector for
/// </param>
/// <param name="field">The field within the document to retrieve
/// </param>
/// <returns> The TermFreqVector for the document and field or null if there is no termVector for this field.
/// </returns>
/// <throws> IOException if there is an error reading the term vector files </throws>
public /*internal*/ virtual ITermFreqVector Get(int docNum, System.String field)
{
// Check if no term vectors are available for this segment at all
ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
Get(docNum, field, mapper);
return mapper.MaterializeVector();
}
// Reads the String[] fields; you have to pre-seek tvd to
// the right point
private System.String[] ReadFields(int fieldCount)
{
int number = 0;
System.String[] fields = new System.String[fieldCount];
for (int i = 0; i < fieldCount; i++)
{
if (format >= FORMAT_VERSION)
number = tvd.ReadVInt();
else
number += tvd.ReadVInt();
fields[i] = fieldInfos.FieldName(number);
}
return fields;
}
// Reads the long[] offsets into TVF; you have to pre-seek
// tvx/tvd to the right point
private long[] ReadTvfPointers(int fieldCount)
{
// Compute position in the tvf file
long position;
if (format >= FORMAT_VERSION2)
position = tvx.ReadLong();
else
position = tvd.ReadVLong();
long[] tvfPointers = new long[fieldCount];
tvfPointers[0] = position;
for (int i = 1; i < fieldCount; i++)
{
position += tvd.ReadVLong();
tvfPointers[i] = position;
}
return tvfPointers;
}
/// <summary> Return all term vectors stored for this document or null if they could not be read in.
///
/// </summary>
/// <param name="docNum">The document number to retrieve the vector for
/// </param>
/// <returns> All term frequency vectors
/// </returns>
/// <throws> IOException if there is an error reading the term vector files </throws>
public /*internal*/ virtual ITermFreqVector[] Get(int docNum)
{
ITermFreqVector[] result = null;
if (tvx != null)
{
//We need to offset by the format header and docStoreOffset; SeekTvx handles both
SeekTvx(docNum);
long tvdPosition = tvx.ReadLong();
tvd.Seek(tvdPosition);
int fieldCount = tvd.ReadVInt();
// No fields are vectorized for this document
if (fieldCount != 0)
{
System.String[] fields = ReadFields(fieldCount);
long[] tvfPointers = ReadTvfPointers(fieldCount);
result = ReadTermVectors(docNum, fields, tvfPointers);
}
}
else
{
//System.out.println("No tvx file");
}
return result;
}
public virtual void Get(int docNumber, TermVectorMapper mapper)
{
// Check if no term vectors are available for this segment at all
if (tvx != null)
{
//We need to offset by the format header and docStoreOffset; SeekTvx handles both
SeekTvx(docNumber);
long tvdPosition = tvx.ReadLong();
tvd.Seek(tvdPosition);
int fieldCount = tvd.ReadVInt();
// No fields are vectorized for this document
if (fieldCount != 0)
{
System.String[] fields = ReadFields(fieldCount);
long[] tvfPointers = ReadTvfPointers(fieldCount);
mapper.SetDocumentNumber(docNumber);
ReadTermVectors(fields, tvfPointers, mapper);
}
}
else
{
//System.out.println("No tvx file");
}
}
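// Builds one SegmentTermVector per field by running each field's data
// through its own ParallelArrayTermVectorMapper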
private SegmentTermVector[] ReadTermVectors(int docNum, System.String[] fields, long[] tvfPointers)
{
SegmentTermVector[] res = new SegmentTermVector[fields.Length];
for (int i = 0; i < fields.Length; i++)
{
var mapper = new ParallelArrayTermVectorMapper();
mapper.SetDocumentNumber(docNum);
ReadTermVector(fields[i], tvfPointers[i], mapper);
res[i] = (SegmentTermVector) mapper.MaterializeVector();
}
return res;
}
private void ReadTermVectors(System.String[] fields, long[] tvfPointers, TermVectorMapper mapper)
{
for (int i = 0; i < fields.Length; i++)
{
ReadTermVector(fields[i], tvfPointers[i], mapper);
}
}
/// <summary>Reads the term vector for a single field from the tvf file into the mapper.</summary>
/// <param name="field">The field to read in
/// </param>
/// <param name="tvfPointer">The pointer within the tvf file where we should start reading
/// </param>
/// <param name="mapper">The mapper used to map the TermVector
/// </param>
/// <throws> IOException </throws>
private void ReadTermVector(System.String field, long tvfPointer, TermVectorMapper mapper)
{
// Now read the data from specified position
//We don't need to offset by the FORMAT here since the pointer already includes the offset
tvf.Seek(tvfPointer);
int numTerms = tvf.ReadVInt();
//System.out.println("Num Terms: " + numTerms);
// If there are no terms there is nothing to map; this should never occur in practice, though!
if (numTerms == 0)
return;
bool storePositions;
bool storeOffsets;
if (format >= FORMAT_VERSION)
{
byte bits = tvf.ReadByte();
storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
}
else
{
tvf.ReadVInt();
storePositions = false;
storeOffsets = false;
}
mapper.SetExpectations(field, numTerms, storeOffsets, storePositions);
int start = 0;
int deltaLength = 0;
int totalLength = 0;
byte[] byteBuffer;
char[] charBuffer;
bool preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES;
// init the buffers
if (preUTF8)
{
charBuffer = new char[10];
byteBuffer = null;
}
else
{
charBuffer = null;
byteBuffer = new byte[20];
}
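// Terms are stored prefix-compressed: each entry gives the length of
// the prefix shared with the previous term (start), the length of the
// suffix (deltaLength), and then the suffix itself.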
for (int i = 0; i < numTerms; i++)
{
start = tvf.ReadVInt();
deltaLength = tvf.ReadVInt();
totalLength = start + deltaLength;
System.String term;
if (preUTF8)
{
// Term stored as java chars
if (charBuffer.Length < totalLength)
{
char[] newCharBuffer = new char[(int) (1.5 * totalLength)];
Array.Copy(charBuffer, 0, newCharBuffer, 0, start);
charBuffer = newCharBuffer;
}
tvf.ReadChars(charBuffer, start, deltaLength);
term = new System.String(charBuffer, 0, totalLength);
}
else
{
// Term stored as utf8 bytes
if (byteBuffer.Length < totalLength)
{
byte[] newByteBuffer = new byte[(int) (1.5 * totalLength)];
Array.Copy(byteBuffer, 0, newByteBuffer, 0, start);
byteBuffer = newByteBuffer;
}
tvf.ReadBytes(byteBuffer, start, deltaLength);
term = System.Text.Encoding.UTF8.GetString(byteBuffer, 0, totalLength);
}
int freq = tvf.ReadVInt();
int[] positions = null;
if (storePositions)
{
//read in the positions
//does the mapper even care about positions?
if (mapper.IsIgnoringPositions == false)
{
positions = new int[freq];
int prevPosition = 0;
for (int j = 0; j < freq; j++)
{
positions[j] = prevPosition + tvf.ReadVInt();
prevPosition = positions[j];
}
}
else
{
//we need to skip over the positions. Since these are VInts, there is
//no way to know for sure how far to skip without reading them
for (int j = 0; j < freq; j++)
{
tvf.ReadVInt();
}
}
}
TermVectorOffsetInfo[] offsets = null;
if (storeOffsets)
{
//does the mapper even care about offsets?
if (mapper.IsIgnoringOffsets == false)
{
offsets = new TermVectorOffsetInfo[freq];
int prevOffset = 0;
for (int j = 0; j < freq; j++)
{
int startOffset = prevOffset + tvf.ReadVInt();
int endOffset = startOffset + tvf.ReadVInt();
offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset);
prevOffset = endOffset;
}
}
else
{
for (int j = 0; j < freq; j++)
{
tvf.ReadVInt();
tvf.ReadVInt();
}
}
}
mapper.Map(term, freq, offsets, positions);
}
}
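// Clones get their own IndexInput clones over the same underlying
// files, so each clone keeps an independent file position.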
public virtual System.Object Clone()
{
TermVectorsReader clone = (TermVectorsReader) base.MemberwiseClone();
// These are null when a TermVectorsReader was created
// on a segment that did not have term vectors saved
if (tvx != null && tvd != null && tvf != null)
{
clone.tvx = (IndexInput) tvx.Clone();
clone.tvd = (IndexInput) tvd.Clone();
clone.tvf = (IndexInput) tvf.Clone();
}
return clone;
}
}
/// <summary> Models the existing parallel array structure</summary>
class ParallelArrayTermVectorMapper : TermVectorMapper
{
private System.String[] terms;
private int[] termFreqs;
private int[][] positions;
private TermVectorOffsetInfo[][] offsets;
private int currentPosition;
private bool storingOffsets;
private bool storingPositions;
private System.String field;
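// Called once per field, before any Map calls, to size the arrays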
public override void SetExpectations(System.String field, int numTerms, bool storeOffsets, bool storePositions)
{
this.field = field;
terms = new System.String[numTerms];
termFreqs = new int[numTerms];
this.storingOffsets = storeOffsets;
this.storingPositions = storePositions;
if (storePositions)
this.positions = new int[numTerms][];
if (storeOffsets)
this.offsets = new TermVectorOffsetInfo[numTerms][];
}
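// Called once per term, in the order the terms appear in the tvf file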
public override void Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
{
terms[currentPosition] = term;
termFreqs[currentPosition] = frequency;
if (storingOffsets)
{
this.offsets[currentPosition] = offsets;
}
if (storingPositions)
{
this.positions[currentPosition] = positions;
}
currentPosition++;
}
/// <summary> Construct the vector</summary>
/// <returns> The <see cref="ITermFreqVector" /> based on the mappings.
/// </returns>
public virtual ITermFreqVector MaterializeVector()
{
SegmentTermVector tv = null;
if (field != null && terms != null)
{
if (storingPositions || storingOffsets)
{
tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
}
else
{
tv = new SegmentTermVector(field, terms, termFreqs);
}
}
return tv;
}
}
}