src/Lucene.Net/Codecs/Lucene3x/TermInfosReaderIndex.cs - lucenenet - Git at Google

 using J2N.Numerics;
 using J2N.Text;
 using System;
 using System.Collections.Generic;

 namespace Lucene.Net.Codecs.Lucene3x
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     using BytesRef = Lucene.Net.Util.BytesRef;
     using GrowableWriter = Lucene.Net.Util.Packed.GrowableWriter;
     using MathUtil = Lucene.Net.Util.MathUtil;
     using PackedInt32s = Lucene.Net.Util.Packed.PackedInt32s;
     using PagedBytes = Lucene.Net.Util.PagedBytes;
     using PagedBytesDataInput = Lucene.Net.Util.PagedBytes.PagedBytesDataInput;
     using PagedBytesDataOutput = Lucene.Net.Util.PagedBytes.PagedBytesDataOutput;
     using RamUsageEstimator = Lucene.Net.Util.RamUsageEstimator;
     using Term = Lucene.Net.Index.Term;

     /// <summary>
     /// This stores a monotonically increasing set of <c>Term, TermInfo</c> pairs in an
     /// index segment. Pairs are accessed either by <see cref="Term"/> or by ordinal position in the
     /// set. The <see cref="Index.Terms"/> and <see cref="TermInfo"/> are actually serialized and stored into a byte
     /// array and pointers to the position of each are stored in an <see cref="int"/> array. </summary>
     [Obsolete("Only for reading existing 3.x indexes")]
     internal class TermInfosReaderIndex
     {
         // Largest page-size exponent requested from PagedBytes (2^18 bytes).
         private const int MAX_PAGE_BITS = 18; // 256 KB block

         // One field-only Term per distinct field name, in the order the fields were first
         // encountered while building; serialized entries refer to a field by its position here.
         private readonly Term[] fields;

         // Multiplied by an index position in SeekEnum to derive the enumerator position.
         private readonly int totalIndexInterval;

         // Ordering applied to the term bytes once the field names compare equal.
         private readonly IComparer<BytesRef> comparer = BytesRef.UTF8SortedAsUTF16Comparer;

         // Reader over the frozen serialized entries; every lookup works on a clone of it.
         private readonly PagedBytesDataInput dataInput;

         // Maps an index position to the offset of its serialized entry within dataInput.
         private readonly PackedInt32s.Reader indexToDataOffset;

         // Number of entries kept: 1 + (enumerator size - 1) / indexDivisor.
         private readonly int indexSize;

         // Copied from the source enumerator; a skip offset is only stored for entries
         // whose doc freq is at least this value.
         private readonly int skipInterval;

         // Memory estimate computed once at the end of construction.
         private readonly long ramBytesUsed;

         /// <summary>
         /// Builds the in-memory index by draining <paramref name="indexEnum"/> and serializing
         /// one entry out of every <paramref name="indexDivisor"/> into a paged byte buffer.
         /// </summary>
         /// <param name="indexEnum">
         ///          The term enum; it is advanced until it reports no further entries. </param>
         /// <param name="indexDivisor">
         ///          The index divisor: after each stored entry, up to <c>indexDivisor - 1</c>
         ///          following entries are skipped. </param>
         /// <param name="tiiFileLength">
         ///          The size of the tii file, used to approximate the size of the
         ///          buffer and the initial bit width of the offset table. </param>
         /// <param name="totalIndexInterval">
         ///          The total index interval. </param>
         public TermInfosReaderIndex(SegmentTermEnum indexEnum, int indexDivisor, long tiiFileLength, int totalIndexInterval)
         {
             this.totalIndexInterval = totalIndexInterval;
             skipInterval = indexEnum.skipInterval;
             indexSize = 1 + ((int)indexEnum.size - 1) / indexDivisor;

             // This is only an initial size; the build-time buffer will be GCed once the build is complete.
             long estimatedBytes = (long)(tiiFileLength * 1.5) / indexDivisor;
             var pagedBytes = new PagedBytes(EstimatePageBits(estimatedBytes));
             PagedBytesDataOutput output = pagedBytes.GetDataOutput();

             var offsets = new GrowableWriter(1 + MathUtil.Log(tiiFileLength, 2), indexSize, PackedInt32s.DEFAULT);

             var fieldNames = new List<string>();
             string lastField = null;
             int fieldOrd = -1;
             int slot = 0;
             while (indexEnum.Next())
             {
                 Term term = indexEnum.Term();

                 // A new field id is allocated whenever the field name differs from the previous entry's.
                 bool sameField = lastField != null && lastField.Equals(term.Field, StringComparison.Ordinal);
                 if (!sameField)
                 {
                     lastField = term.Field;
                     fieldNames.Add(lastField);
                     fieldOrd++;
                 }

                 TermInfo info = indexEnum.TermInfo();
                 offsets.Set(slot, output.GetPosition());

                 // Entry layout: field id, term text, doc freq, [skip offset], freq pointer,
                 // prox pointer, index pointer. SeekEnum reads the fields back in this order.
                 output.WriteVInt32(fieldOrd);
                 output.WriteString(term.Text());
                 output.WriteVInt32(info.DocFreq);
                 if (info.DocFreq >= skipInterval)
                 {
                     output.WriteVInt32(info.SkipOffset);
                 }
                 output.WriteVInt64(info.FreqPointer);
                 output.WriteVInt64(info.ProxPointer);
                 output.WriteVInt64(indexEnum.indexPointer);

                 // Skip the entries the divisor tells us not to keep; stop early if the enum runs dry.
                 for (int skipped = 1; skipped < indexDivisor && indexEnum.Next(); skipped++)
                 {
                     // nothing to do: the loop condition performs the advance
                 }

                 slot++;
             }

             fields = new Term[fieldNames.Count];
             int next = 0;
             foreach (string name in fieldNames)
             {
                 fields[next++] = new Term(name);
             }

             pagedBytes.Freeze(true);
             dataInput = pagedBytes.GetDataInput();
             indexToDataOffset = offsets.Mutable;

             ramBytesUsed =
                 fields.Length * (RamUsageEstimator.NUM_BYTES_OBJECT_REF + RamUsageEstimator.ShallowSizeOfInstance(typeof(Term)))
                 + pagedBytes.RamBytesUsed()
                 + indexToDataOffset.RamBytesUsed();
         }

         /// <summary>
         /// Chooses the page-size exponent for the build buffer: the number of significant bits
         /// in <paramref name="estSize"/>, limited to the range <c>[4, MAX_PAGE_BITS]</c>.
         /// </summary>
         /// <param name="estSize"> Estimated number of bytes the buffer will hold. </param>
         /// <returns> The exponent to pass to <see cref="PagedBytes"/>. </returns>
         private static int EstimatePageBits(long estSize)
         {
             int significantBits = 64 - estSize.LeadingZeroCount();
             int capped = significantBits < MAX_PAGE_BITS ? significantBits : MAX_PAGE_BITS;
             return capped > 4 ? capped : 4;
         }

         /// <summary>
         /// Reads the entry stored at <paramref name="indexOffset"/> and seeks
         /// <paramref name="enumerator"/> to it.
         /// </summary>
         /// <param name="enumerator"> The enumerator to seek. </param>
         /// <param name="indexOffset"> Position of the entry within this index. </param>
         internal virtual void SeekEnum(SegmentTermEnum enumerator, int indexOffset)
         {
             // Read through a fresh clone of dataInput rather than the shared instance.
             var reader = (PagedBytesDataInput)dataInput.Clone();
             reader.SetPosition(indexToDataOffset.Get(indexOffset));

             // read the term: field id first, then the term text
             string fieldName = fields[reader.ReadVInt32()].Field;
             var term = new Term(fieldName, reader.ReadString());

             // read the terminfo; the skip offset is only present when the doc freq reaches skipInterval
             var termInfo = new TermInfo();
             termInfo.DocFreq = reader.ReadVInt32();
             termInfo.SkipOffset = termInfo.DocFreq >= skipInterval ? reader.ReadVInt32() : 0;
             termInfo.FreqPointer = reader.ReadVInt64();
             termInfo.ProxPointer = reader.ReadVInt64();

             long indexPointer = reader.ReadVInt64();

             // perform the seek (multiplication is done in 64 bits before subtracting one)
             long enumPosition = ((long)indexOffset * totalIndexInterval) - 1;
             enumerator.Seek(indexPointer, enumPosition, term, termInfo);
         }

         /// <summary>
         /// Binary search over the index entries for the given term.
         /// </summary>
         /// <param name="term">
         ///          The term to locate. </param>
         /// <returns> The position of the entry that compares equal to <paramref name="term"/> if there
         /// is one; otherwise the upper search bound at the time the search ends, which is one less
         /// than the position where the search stopped (<c>-1</c> if the term compares below every entry). </returns>
         /// <exception cref="System.IO.IOException"> If there is a low-level I/O error. </exception>
         internal virtual int GetIndexOffset(Term term)
         {
             // One clone and one scratch buffer are shared by all comparisons of this search.
             var reader = (PagedBytesDataInput)dataInput.Clone();
             var scratch = new BytesRef();

             int low = 0;
             int high = indexSize - 1;
             while (low <= high)
             {
                 // Midpoint computed from the gap so the sum of the bounds is never formed.
                 int middle = low + ((high - low) >> 1);
                 int cmp = CompareTo(term, middle, reader, scratch);
                 if (cmp == 0)
                 {
                     return middle;
                 }

                 if (cmp < 0)
                 {
                     high = middle - 1;
                 }
                 else
                 {
                     low = middle + 1;
                 }
             }
             return high;
         }

         /// <summary>
         /// Reconstructs the term stored at the given position.  For testing.
         /// </summary>
         /// <param name="termIndex">
         ///          The position to read the term from the index. </param>
         /// <returns> A new <see cref="Term"/> built from the stored field id and term text. </returns>
         /// <exception cref="System.IO.IOException"> If there is a low-level I/O error. </exception>
         internal virtual Term GetTerm(int termIndex)
         {
             var reader = (PagedBytesDataInput)dataInput.Clone();
             reader.SetPosition(indexToDataOffset.Get(termIndex));

             // The entry starts with the field id, immediately followed by the term text.
             string fieldName = fields[reader.ReadVInt32()].Field;
             string text = reader.ReadString();
             return new Term(fieldName, text);
         }

         /// <summary>
         /// Gets the number of entries in this index, i.e. the <c>indexSize</c> value
         /// computed when the index was built.
         /// </summary>
         /// <returns> int. </returns>
         internal virtual int Length => indexSize;

         /// <summary>
         /// Compares the given term against the term in the index specified by the
         /// term index, i.e. it returns a negative value when the term is less than the index term.
         /// </summary>
         /// <param name="term">
         ///          The given term. </param>
         /// <param name="termIndex">
         ///          The index of the term to compare. </param>
         /// <returns> int. </returns>
         /// <exception cref="System.IO.IOException"> If there is a low-level I/O error. </exception>
         internal virtual int CompareTo(Term term, int termIndex)
         {
             // Each call gets its own reader clone and scratch buffer.
             var reader = (PagedBytesDataInput)dataInput.Clone();
             var scratch = new BytesRef();
             return CompareTo(term, termIndex, reader, scratch);
         }

         /// <summary>
         /// Compares the fields of the terms first and returns that result when they differ;
         /// only when the fields are equal are the term bytes compared.
         /// </summary>
         /// <param name="term">
         ///          The term to compare. </param>
         /// <param name="termIndex">
         ///          The position of the term in the input to compare </param>
         /// <param name="input">
         ///          The input buffer. </param>
         /// <param name="reuse">
         ///          Scratch buffer that receives the indexed term's bytes; its length and
         ///          contents are overwritten. </param>
         /// <returns> int. </returns>
         /// <exception cref="System.IO.IOException"> If there is a low-level I/O error. </exception>
         private int CompareTo(Term term, int termIndex, PagedBytesDataInput input, BytesRef reuse)
         {
             int fieldOrder = CompareField(term, termIndex, input);
             if (fieldOrder != 0)
             {
                 // Different fields: the term text does not need to be read at all.
                 return fieldOrder;
             }

             // Same field: the reader now sits on the term text, stored as a length followed by that many bytes.
             // NOTE(review): the bytes are copied to index 0 of reuse.Bytes, which presumes reuse.Offset is 0 - confirm.
             reuse.Length = input.ReadVInt32();
             reuse.Grow(reuse.Length);
             input.ReadBytes(reuse.Bytes, 0, reuse.Length);
             return comparer.Compare(term.Bytes, reuse);
         }

         /// <summary>
         /// Positions <paramref name="input"/> at the start of the indexed entry and compares
         /// the field names, before the text of the terms is looked at.
         /// </summary>
         /// <param name="term">
         ///          The given term. </param>
         /// <param name="termIndex">
         ///          The term that exists in the data block. </param>
         /// <param name="input">
         ///          The data block; on return it has consumed the entry's field id. </param>
         /// <returns> The ordinal comparison of the given term's field with the indexed entry's field. </returns>
         /// <exception cref="System.IO.IOException"> If there is a low-level I/O error. </exception>
         private int CompareField(Term term, int termIndex, PagedBytesDataInput input)
         {
             input.SetPosition(indexToDataOffset.Get(termIndex));

             string targetField = term.Field;
             // The entry begins with the field id, which indexes into the fields array.
             string indexedField = fields[input.ReadVInt32()].Field;
             return targetField.CompareToOrdinal(indexedField);
         }

         /// <summary>
         /// Returns the memory estimate that was computed once when this index was built.
         /// </summary>
         /// <returns> The stored estimate. </returns>
         internal virtual long RamBytesUsed() => ramBytesUsed;
     }
 }
	using J2N.Numerics;
	using J2N.Text;
	using System;
	using System.Collections.Generic;

	namespace Lucene.Net.Codecs.Lucene3x
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	using BytesRef = Lucene.Net.Util.BytesRef;
	using GrowableWriter = Lucene.Net.Util.Packed.GrowableWriter;
	using MathUtil = Lucene.Net.Util.MathUtil;
	using PackedInt32s = Lucene.Net.Util.Packed.PackedInt32s;
	using PagedBytes = Lucene.Net.Util.PagedBytes;
	using PagedBytesDataInput = Lucene.Net.Util.PagedBytes.PagedBytesDataInput;
	using PagedBytesDataOutput = Lucene.Net.Util.PagedBytes.PagedBytesDataOutput;
	using RamUsageEstimator = Lucene.Net.Util.RamUsageEstimator;
	using Term = Lucene.Net.Index.Term;

	/// <summary>
	/// This stores a monotonically increasing set of <c>Term, TermInfo</c> pairs in an
	/// index segment. Pairs are accessed either by <see cref="Term"/> or by ordinal position in the
	/// set. The <see cref="Index.Terms"/> and <see cref="TermInfo"/> are actually serialized and stored into a byte
	/// array and pointers to the position of each are stored in an <see cref="int"/> array. </summary>
	[Obsolete("Only for reading existing 3.x indexes")]
	internal class TermInfosReaderIndex
	{
	// Largest page-size exponent requested from PagedBytes (2^18 bytes).
	private const int MAX_PAGE_BITS = 18; // 256 KB block

	// One field-only Term per distinct field name, in the order the fields were first
	// encountered while building; serialized entries refer to a field by its position here.
	private readonly Term[] fields;

	// Multiplied by an index position in SeekEnum to derive the enumerator position.
	private readonly int totalIndexInterval;

	// Ordering applied to the term bytes once the field names compare equal.
	private readonly IComparer<BytesRef> comparer = BytesRef.UTF8SortedAsUTF16Comparer;

	// Reader over the frozen serialized entries; every lookup works on a clone of it.
	private readonly PagedBytesDataInput dataInput;

	// Maps an index position to the offset of its serialized entry within dataInput.
	private readonly PackedInt32s.Reader indexToDataOffset;

	// Number of entries kept: 1 + (enumerator size - 1) / indexDivisor.
	private readonly int indexSize;

	// Copied from the source enumerator; a skip offset is only stored for entries
	// whose doc freq is at least this value.
	private readonly int skipInterval;

	// Memory estimate computed once at the end of construction.
	private readonly long ramBytesUsed;

	/// <summary>
	/// Loads the segment information at segment load time: drains <paramref name="indexEnum"/>
	/// and serializes one entry out of every <paramref name="indexDivisor"/> into a paged byte buffer.
	/// </summary>
	/// <param name="indexEnum">
	/// The term enum; it is advanced until it reports no further entries. </param>
	/// <param name="indexDivisor">
	/// The index divisor: after each stored entry, up to <c>indexDivisor - 1</c>
	/// following entries are skipped. </param>
	/// <param name="tiiFileLength">
	/// The size of the tii file, used to approximate the size of the
	/// buffer and the initial bit width of the offset table. </param>
	/// <param name="totalIndexInterval">
	/// The total index interval. </param>
	public TermInfosReaderIndex(SegmentTermEnum indexEnum, int indexDivisor, long tiiFileLength, int totalIndexInterval)
	{
		this.totalIndexInterval = totalIndexInterval;
		indexSize = 1 + ((int)indexEnum.size - 1) / indexDivisor;
		skipInterval = indexEnum.skipInterval;
		// this is only an initial size, it will be GCed once the build is complete
		long initialSize = (long)(tiiFileLength * 1.5) / indexDivisor;
		PagedBytes dataPagedBytes = new PagedBytes(EstimatePageBits(initialSize));
		PagedBytesDataOutput dataOutput = dataPagedBytes.GetDataOutput();

		int bitEstimate = 1 + MathUtil.Log(tiiFileLength, 2);
		GrowableWriter indexToTerms = new GrowableWriter(bitEstimate, indexSize, PackedInt32s.DEFAULT);

		string currentField = null;
		IList<string> fieldStrs = new List<string>();
		int fieldCounter = -1;
		for (int i = 0; indexEnum.Next(); i++)
		{
			Term term = indexEnum.Term();
			// A new field id is allocated whenever the field name differs from the previous entry's.
			if (currentField == null || !currentField.Equals(term.Field, StringComparison.Ordinal))
			{
				currentField = term.Field;
				fieldStrs.Add(currentField);
				fieldCounter++;
			}
			TermInfo termInfo = indexEnum.TermInfo();
			indexToTerms.Set(i, dataOutput.GetPosition());

			// Entry layout: field id, term text, doc freq, [skip offset], freq pointer,
			// prox pointer, index pointer. SeekEnum reads the fields back in this order.
			dataOutput.WriteVInt32(fieldCounter);
			dataOutput.WriteString(term.Text());
			dataOutput.WriteVInt32(termInfo.DocFreq);
			if (termInfo.DocFreq >= skipInterval)
			{
				dataOutput.WriteVInt32(termInfo.SkipOffset);
			}
			dataOutput.WriteVInt64(termInfo.FreqPointer);
			dataOutput.WriteVInt64(termInfo.ProxPointer);
			dataOutput.WriteVInt64(indexEnum.indexPointer);

			// Skip the entries the divisor tells us not to keep; stop early if the enum runs dry.
			for (int j = 1; j < indexDivisor; j++)
			{
				if (!indexEnum.Next())
				{
					break;
				}
			}
		}

		fields = new Term[fieldStrs.Count];
		for (int i = 0; i < fields.Length; i++)
		{
			fields[i] = new Term(fieldStrs[i]);
		}

		dataPagedBytes.Freeze(true);
		dataInput = dataPagedBytes.GetDataInput();
		indexToDataOffset = indexToTerms.Mutable;

		ramBytesUsed = fields.Length * (RamUsageEstimator.NUM_BYTES_OBJECT_REF + RamUsageEstimator.ShallowSizeOfInstance(typeof(Term))) + dataPagedBytes.RamBytesUsed() + indexToDataOffset.RamBytesUsed();
	}

	/// <summary>
	/// Chooses the page-size exponent for the build buffer: the number of significant bits
	/// in <paramref name="estSize"/>, limited to the range <c>[4, MAX_PAGE_BITS]</c>.
	/// </summary>
	/// <param name="estSize"> Estimated number of bytes the buffer will hold. </param>
	/// <returns> The exponent to pass to <see cref="PagedBytes"/>. </returns>
	private static int EstimatePageBits(long estSize)
	{
		int significantBits = 64 - estSize.LeadingZeroCount();
		int capped = significantBits < MAX_PAGE_BITS ? significantBits : MAX_PAGE_BITS;
		return capped > 4 ? capped : 4;
	}

	/// <summary>
	/// Reads the entry stored at <paramref name="indexOffset"/> and seeks
	/// <paramref name="enumerator"/> to it.
	/// </summary>
	/// <param name="enumerator"> The enumerator to seek. </param>
	/// <param name="indexOffset"> Position of the entry within this index. </param>
	internal virtual void SeekEnum(SegmentTermEnum enumerator, int indexOffset)
	{
		// Read through a fresh clone of dataInput rather than the shared instance.
		var reader = (PagedBytesDataInput)dataInput.Clone();
		reader.SetPosition(indexToDataOffset.Get(indexOffset));

		// read the term: field id first, then the term text
		string fieldName = fields[reader.ReadVInt32()].Field;
		var term = new Term(fieldName, reader.ReadString());

		// read the terminfo; the skip offset is only present when the doc freq reaches skipInterval
		var termInfo = new TermInfo();
		termInfo.DocFreq = reader.ReadVInt32();
		termInfo.SkipOffset = termInfo.DocFreq >= skipInterval ? reader.ReadVInt32() : 0;
		termInfo.FreqPointer = reader.ReadVInt64();
		termInfo.ProxPointer = reader.ReadVInt64();

		long indexPointer = reader.ReadVInt64();

		// perform the seek (multiplication is done in 64 bits before subtracting one)
		long enumPosition = ((long)indexOffset * totalIndexInterval) - 1;
		enumerator.Seek(indexPointer, enumPosition, term, termInfo);
	}

	/// <summary>
	/// Binary search over the index entries for the given term.
	/// </summary>
	/// <param name="term">
	/// The term to locate. </param>
	/// <returns> The position of the entry that compares equal to <paramref name="term"/> if there
	/// is one; otherwise the upper search bound at the time the search ends, which is one less
	/// than the position where the search stopped (<c>-1</c> if the term compares below every entry). </returns>
	/// <exception cref="System.IO.IOException"> If there is a low-level I/O error. </exception>
	internal virtual int GetIndexOffset(Term term)
	{
		// One clone and one scratch buffer are shared by all comparisons of this search.
		var reader = (PagedBytesDataInput)dataInput.Clone();
		var scratch = new BytesRef();

		int low = 0;
		int high = indexSize - 1;
		while (low <= high)
		{
			// Midpoint computed from the gap so the sum of the bounds is never formed.
			int middle = low + ((high - low) >> 1);
			int cmp = CompareTo(term, middle, reader, scratch);
			if (cmp == 0)
			{
				return middle;
			}

			if (cmp < 0)
			{
				high = middle - 1;
			}
			else
			{
				low = middle + 1;
			}
		}
		return high;
	}

	/// <summary>
	/// Reconstructs the term stored at the given position. For testing.
	/// </summary>
	/// <param name="termIndex">
	/// The position to read the term from the index. </param>
	/// <returns> A new <see cref="Term"/> built from the stored field id and term text. </returns>
	/// <exception cref="System.IO.IOException"> If there is a low-level I/O error. </exception>
	internal virtual Term GetTerm(int termIndex)
	{
		var reader = (PagedBytesDataInput)dataInput.Clone();
		reader.SetPosition(indexToDataOffset.Get(termIndex));

		// The entry starts with the field id, immediately followed by the term text.
		string fieldName = fields[reader.ReadVInt32()].Field;
		string text = reader.ReadString();
		return new Term(fieldName, text);
	}

	/// <summary>
	/// Gets the number of entries in this index, i.e. the <c>indexSize</c> value
	/// computed when the index was built.
	/// </summary>
	/// <returns> int. </returns>
	internal virtual int Length => indexSize;

	/// <summary>
	/// Compares the given term against the term in the index specified by the
	/// term index, i.e. it returns a negative value when the term is less than the index term.
	/// </summary>
	/// <param name="term">
	/// The given term. </param>
	/// <param name="termIndex">
	/// The index of the term to compare. </param>
	/// <returns> int. </returns>
	/// <exception cref="System.IO.IOException"> If there is a low-level I/O error. </exception>
	internal virtual int CompareTo(Term term, int termIndex)
	{
		// Each call gets its own reader clone and scratch buffer.
		var reader = (PagedBytesDataInput)dataInput.Clone();
		var scratch = new BytesRef();
		return CompareTo(term, termIndex, reader, scratch);
	}

	/// <summary>
	/// Compares the fields of the terms first and returns that result when they differ;
	/// only when the fields are equal are the term bytes compared.
	/// </summary>
	/// <param name="term">
	/// The term to compare. </param>
	/// <param name="termIndex">
	/// The position of the term in the input to compare </param>
	/// <param name="input">
	/// The input buffer. </param>
	/// <param name="reuse">
	/// Scratch buffer that receives the indexed term's bytes; its length and
	/// contents are overwritten. </param>
	/// <returns> int. </returns>
	/// <exception cref="System.IO.IOException"> If there is a low-level I/O error. </exception>
	private int CompareTo(Term term, int termIndex, PagedBytesDataInput input, BytesRef reuse)
	{
		int fieldOrder = CompareField(term, termIndex, input);
		if (fieldOrder != 0)
		{
			// Different fields: the term text does not need to be read at all.
			return fieldOrder;
		}

		// Same field: the reader now sits on the term text, stored as a length followed by that many bytes.
		// NOTE(review): the bytes are copied to index 0 of reuse.Bytes, which presumes reuse.Offset is 0 - confirm.
		reuse.Length = input.ReadVInt32();
		reuse.Grow(reuse.Length);
		input.ReadBytes(reuse.Bytes, 0, reuse.Length);
		return comparer.Compare(term.Bytes, reuse);
	}

	/// <summary>
	/// Positions <paramref name="input"/> at the start of the indexed entry and compares
	/// the field names, before the text of the terms is looked at.
	/// </summary>
	/// <param name="term">
	/// The given term. </param>
	/// <param name="termIndex">
	/// The term that exists in the data block. </param>
	/// <param name="input">
	/// The data block; on return it has consumed the entry's field id. </param>
	/// <returns> The ordinal comparison of the given term's field with the indexed entry's field. </returns>
	/// <exception cref="System.IO.IOException"> If there is a low-level I/O error. </exception>
	private int CompareField(Term term, int termIndex, PagedBytesDataInput input)
	{
		input.SetPosition(indexToDataOffset.Get(termIndex));

		string targetField = term.Field;
		// The entry begins with the field id, which indexes into the fields array.
		string indexedField = fields[input.ReadVInt32()].Field;
		return targetField.CompareToOrdinal(indexedField);
	}

	/// <summary>
	/// Returns the memory estimate that was computed once when this index was built.
	/// </summary>
	/// <returns> The stored estimate. </returns>
	internal virtual long RamBytesUsed() => ramBytesUsed;
	}
	}