src/Lucene.Net.Codecs/BlockTerms/VariableGapTermsIndexWriter.cs - lucenenet - Git at Google

 using Lucene.Net.Diagnostics;
 using Lucene.Net.Index;
 using Lucene.Net.Store;
 using Lucene.Net.Util;
 using Lucene.Net.Util.Fst;
 using System;
 using System.Collections.Generic;

 namespace Lucene.Net.Codecs.BlockTerms
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// Selects index terms according to provided pluggable
     /// <see cref="IndexTermSelector"/>, and stores them in a prefix trie that's
     /// loaded entirely in RAM stored as an <see cref="FST{T}"/>.  This terms
     /// index only supports unsigned byte term sort order
     /// (unicode codepoint order when the bytes are UTF8).
     /// <para/>
     /// @lucene.experimental
     /// </summary>
     public class VariableGapTermsIndexWriter : TermsIndexWriterBase
     {
         protected IndexOutput m_output;

         /// <summary>Extension of terms index file.</summary>
         internal readonly static string TERMS_INDEX_EXTENSION = "tiv";

         internal readonly static string CODEC_NAME = "VARIABLE_GAP_TERMS_INDEX";
         internal readonly static int VERSION_START = 0;
         internal readonly static int VERSION_APPEND_ONLY = 1;
         internal readonly static int VERSION_CHECKSUM = 2;
         internal readonly static int VERSION_CURRENT = VERSION_CHECKSUM;

         private readonly IList<FSTFieldWriter> fields = new List<FSTFieldWriter>();

         //private readonly FieldInfos fieldInfos; // unread  // LUCENENET: Not used
         private readonly IndexTermSelector policy;

         /// <summary>
         /// Hook for selecting which terms should be placed in the terms index.
         /// <para/>
         /// <see cref="NewField(FieldInfo)"/> is called at the start of each new field, and
         /// <see cref="IsIndexTerm(BytesRef, TermStats)"/> for each term in that field.
         /// <para/>
         /// @lucene.experimental
         /// </summary>
         public abstract class IndexTermSelector
         {
             /// <summary>
             /// Called sequentially on every term being written,
             /// returning <c>true</c> if this term should be indexed.
             /// </summary>
             public abstract bool IsIndexTerm(BytesRef term, TermStats stats);

             /// <summary>Called when a new field is started.</summary>
             public abstract void NewField(FieldInfo fieldInfo);
         }

         /// <remarks>
         /// Same policy as <see cref="FixedGapTermsIndexWriter"/>
         /// </remarks>
         public sealed class EveryNTermSelector : IndexTermSelector
         {
             private int count;
             private readonly int interval;

             public EveryNTermSelector(int interval)
             {
                 this.interval = interval;
                 // First term is first indexed term
                 count = interval;
             }

             public override bool IsIndexTerm(BytesRef term, TermStats stats)
             {
                 if (count >= interval)
                 {
                     count = 1;
                     return true;
                 }
                 else
                 {
                     count++;
                     return false;
                 }
             }

             public override void NewField(FieldInfo fieldInfo)
             {
                 count = interval;
             }
         }

         /// <summary>
         /// Sets an index term when docFreq >= docFreqThresh, or
         /// every interval terms.  This should reduce seek time
         /// to high docFreq terms.
         /// </summary>
         public sealed class EveryNOrDocFreqTermSelector : IndexTermSelector
         {
             private int count;
             private readonly int docFreqThresh;
             private readonly int interval;

             public EveryNOrDocFreqTermSelector(int docFreqThresh, int interval)
             {
                 this.interval = interval;
                 this.docFreqThresh = docFreqThresh;

                 // First term is first indexed term:
                 count = interval;
             }

             public override bool IsIndexTerm(BytesRef term, TermStats stats)
             {
                 if (stats.DocFreq >= docFreqThresh || count >= interval)
                 {
                     count = 1;
                     return true;
                 }
                 else
                 {
                     count++;
                     return false;
                 }
             }

             public override void NewField(FieldInfo fieldInfo)
             {
                 count = interval;
             }
         }

         // TODO: it'd be nice to let the FST builder prune based
         // on term count of each node (the prune1/prune2 that it
         // accepts), and build the index based on that.  This
         // should result in a more compact terms index, more like
         // a prefix trie than the other selectors, because it
         // only stores enough leading bytes to get down to N
         // terms that may complete that prefix.  It becomes
         // "deeper" when terms are dense, and "shallow" when they
         // are less dense.
         //
         // However, it's not easy to make that work this this
         // API, because that pruning doesn't immediately know on
         // seeing each term whether that term will be a seek point
         // or not.  It requires some non-causality in the API, ie
         // only on seeing some number of future terms will the
         // builder decide which past terms are seek points.
         // Somehow the API'd need to be able to return a "I don't
         // know" value, eg like a Future, which only later on is
         // flipped (frozen) to true or false.
         //
         // We could solve this with a 2-pass approach, where the
         // first pass would build an FSA (no outputs) solely to
         // determine which prefixes are the 'leaves' in the
         // pruning. The 2nd pass would then look at this prefix
         // trie to mark the seek points and build the FST mapping
         // to the true output.
         //
         // But, one downside to this approach is that it'd result
         // in uneven index term selection.  EG with prune1=10, the
         // resulting index terms could be as frequent as every 10
         // terms or as rare as every <maxArcCount> * 10 (eg 2560),
         // in the extremes.

         public VariableGapTermsIndexWriter(SegmentWriteState state, IndexTermSelector policy)
         {
             string indexFileName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, TERMS_INDEX_EXTENSION);
             m_output = state.Directory.CreateOutput(indexFileName, state.Context);
             bool success = false;
             try
             {
                 //fieldInfos = state.FieldInfos; // LUCENENET: Not used
                 this.policy = policy;
                 WriteHeader(m_output);
                 success = true;
             }
             finally
             {
                 if (!success)
                 {
                     IOUtils.DisposeWhileHandlingException(m_output);
                 }
             }
         }

         private void WriteHeader(IndexOutput output)
         {
             CodecUtil.WriteHeader(output, CODEC_NAME, VERSION_CURRENT);
         }

         public override FieldWriter AddField(FieldInfo field, long termsFilePointer)
         {
             ////System.out.println("VGW: field=" + field.name);
             policy.NewField(field);
             FSTFieldWriter writer = new FSTFieldWriter(this, field, termsFilePointer);
             fields.Add(writer);
             return writer;
         }

         /// <remarks>
         /// NOTE: If your codec does not sort in unicode code point order,
         /// you must override this method to simply return <c>indexedTerm.Length</c>.
         /// </remarks>
         protected virtual int IndexedTermPrefixLength(BytesRef priorTerm, BytesRef indexedTerm)
         {
             // As long as codec sorts terms in unicode codepoint
             // order, we can safely strip off the non-distinguishing
             // suffix to save RAM in the loaded terms index.
             int idxTermOffset = indexedTerm.Offset;
             int priorTermOffset = priorTerm.Offset;
             int limit = Math.Min(priorTerm.Length, indexedTerm.Length);
             for (int byteIdx = 0; byteIdx < limit; byteIdx++)
             {
                 if (priorTerm.Bytes[priorTermOffset + byteIdx] != indexedTerm.Bytes[idxTermOffset + byteIdx])
                 {
                     return byteIdx + 1;
                 }
             }
             return Math.Min(1 + priorTerm.Length, indexedTerm.Length);
         }

         private class FSTFieldWriter : FieldWriter
         {
             private readonly VariableGapTermsIndexWriter outerInstance;

             private readonly Builder<long?> fstBuilder;
             private readonly PositiveInt32Outputs fstOutputs;
             private readonly long startTermsFilePointer;

             internal readonly FieldInfo fieldInfo;
             internal FST<long?> fst;
             internal readonly long indexStart;

             private readonly BytesRef lastTerm = new BytesRef();
             private bool first = true;

             public FSTFieldWriter(VariableGapTermsIndexWriter outerInstance, FieldInfo fieldInfo, long termsFilePointer)
             {
                 this.outerInstance = outerInstance;

                 this.fieldInfo = fieldInfo;
                 fstOutputs = PositiveInt32Outputs.Singleton;
                 fstBuilder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, fstOutputs);
                 indexStart = outerInstance.m_output.GetFilePointer();
                 ////System.out.println("VGW: field=" + fieldInfo.name);

                 // Always put empty string in
                 fstBuilder.Add(new Int32sRef(), termsFilePointer);
                 startTermsFilePointer = termsFilePointer;
             }

             public override bool CheckIndexTerm(BytesRef text, TermStats stats)
             {
                 //System.out.println("VGW: index term=" + text.utf8ToString());
                 // NOTE: we must force the first term per field to be
                 // indexed, in case policy doesn't:
                 if (outerInstance.policy.IsIndexTerm(text, stats) || first)
                 {
                     first = false;
                     //System.out.println("  YES");
                     return true;
                 }
                 else
                 {
                     lastTerm.CopyBytes(text);
                     return false;
                 }
             }

             private readonly Int32sRef scratchIntsRef = new Int32sRef();

             public override void Add(BytesRef text, TermStats stats, long termsFilePointer)
             {
                 if (text.Length == 0)
                 {
                     // We already added empty string in ctor
                     if (Debugging.AssertsEnabled) Debugging.Assert(termsFilePointer == startTermsFilePointer);
                     return;
                 }
                 int lengthSave = text.Length;
                 text.Length = outerInstance.IndexedTermPrefixLength(lastTerm, text);
                 try
                 {
                     fstBuilder.Add(Util.Fst.Util.ToInt32sRef(text, scratchIntsRef), termsFilePointer);
                 }
                 finally
                 {
                     text.Length = lengthSave;
                 }
                 lastTerm.CopyBytes(text);
             }

             public override void Finish(long termsFilePointer)
             {
                 fst = fstBuilder.Finish();
                 if (fst != null)
                 {
                     fst.Save(outerInstance.m_output);
                 }
             }
         }

         protected override void Dispose(bool disposing)
         {
             if (disposing)
             {
                 if (m_output != null)
                 {
                     try
                     {
                         long dirStart = m_output.GetFilePointer();
                         int fieldCount = fields.Count;

                         int nonNullFieldCount = 0;
                         for (int i = 0; i < fieldCount; i++)
                         {
                             FSTFieldWriter field = fields[i];
                             if (field.fst != null)
                             {
                                 nonNullFieldCount++;
                             }
                         }

                         m_output.WriteVInt32(nonNullFieldCount);
                         for (int i = 0; i < fieldCount; i++)
                         {
                             FSTFieldWriter field = fields[i];
                             if (field.fst != null)
                             {
                                 m_output.WriteVInt32(field.fieldInfo.Number);
                                 m_output.WriteVInt64(field.indexStart);
                             }
                         }
                         WriteTrailer(dirStart);
                         CodecUtil.WriteFooter(m_output);
                     }
                     finally
                     {
                         m_output.Dispose();
                         m_output = null;
                     }
                 }
             }
         }

         private void WriteTrailer(long dirStart)
         {
             m_output.WriteInt64(dirStart);
         }
     }
 }
	using Lucene.Net.Diagnostics;
	using Lucene.Net.Index;
	using Lucene.Net.Store;
	using Lucene.Net.Util;
	using Lucene.Net.Util.Fst;
	using System;
	using System.Collections.Generic;

	namespace Lucene.Net.Codecs.BlockTerms
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// Selects index terms according to provided pluggable
	/// <see cref="IndexTermSelector"/>, and stores them in a prefix trie that's
	/// loaded entirely in RAM stored as an <see cref="FST{T}"/>. This terms
	/// index only supports unsigned byte term sort order
	/// (unicode codepoint order when the bytes are UTF8).
	/// <para/>
	/// @lucene.experimental
	/// </summary>
	public class VariableGapTermsIndexWriter : TermsIndexWriterBase
	{
	protected IndexOutput m_output;

	/// <summary>Extension of terms index file.</summary>
	internal readonly static string TERMS_INDEX_EXTENSION = "tiv";

	internal readonly static string CODEC_NAME = "VARIABLE_GAP_TERMS_INDEX";
	internal readonly static int VERSION_START = 0;
	internal readonly static int VERSION_APPEND_ONLY = 1;
	internal readonly static int VERSION_CHECKSUM = 2;
	internal readonly static int VERSION_CURRENT = VERSION_CHECKSUM;

	private readonly IList<FSTFieldWriter> fields = new List<FSTFieldWriter>();

	//private readonly FieldInfos fieldInfos; // unread // LUCENENET: Not used
	private readonly IndexTermSelector policy;

	/// <summary>
	/// Hook for selecting which terms should be placed in the terms index.
	/// <para/>
	/// <see cref="NewField(FieldInfo)"/> is called at the start of each new field, and
	/// <see cref="IsIndexTerm(BytesRef, TermStats)"/> for each term in that field.
	/// <para/>
	/// @lucene.experimental
	/// </summary>
	public abstract class IndexTermSelector
	{
	/// <summary>
	/// Called sequentially on every term being written,
	/// returning <c>true</c> if this term should be indexed.
	/// </summary>
	public abstract bool IsIndexTerm(BytesRef term, TermStats stats);

	/// <summary>Called when a new field is started.</summary>
	public abstract void NewField(FieldInfo fieldInfo);
	}

	/// <remarks>
	/// Same policy as <see cref="FixedGapTermsIndexWriter"/>
	/// </remarks>
	public sealed class EveryNTermSelector : IndexTermSelector
	{
	private int count;
	private readonly int interval;

	public EveryNTermSelector(int interval)
	{
	this.interval = interval;
	// First term is first indexed term
	count = interval;
	}

	public override bool IsIndexTerm(BytesRef term, TermStats stats)
	{
	if (count >= interval)
	{
	count = 1;
	return true;
	}
	else
	{
	count++;
	return false;
	}
	}

	public override void NewField(FieldInfo fieldInfo)
	{
	count = interval;
	}
	}

	/// <summary>
	/// Sets an index term when docFreq >= docFreqThresh, or
	/// every interval terms. This should reduce seek time
	/// to high docFreq terms.
	/// </summary>
	public sealed class EveryNOrDocFreqTermSelector : IndexTermSelector
	{
	private int count;
	private readonly int docFreqThresh;
	private readonly int interval;

	public EveryNOrDocFreqTermSelector(int docFreqThresh, int interval)
	{
	this.interval = interval;
	this.docFreqThresh = docFreqThresh;

	// First term is first indexed term:
	count = interval;
	}

	public override bool IsIndexTerm(BytesRef term, TermStats stats)
	{
	if (stats.DocFreq >= docFreqThresh \|\| count >= interval)
	{
	count = 1;
	return true;
	}
	else
	{
	count++;
	return false;
	}
	}

	public override void NewField(FieldInfo fieldInfo)
	{
	count = interval;
	}
	}

	// TODO: it'd be nice to let the FST builder prune based
	// on term count of each node (the prune1/prune2 that it
	// accepts), and build the index based on that. This
	// should result in a more compact terms index, more like
	// a prefix trie than the other selectors, because it
	// only stores enough leading bytes to get down to N
	// terms that may complete that prefix. It becomes
	// "deeper" when terms are dense, and "shallow" when they
	// are less dense.
	//
	// However, it's not easy to make that work this this
	// API, because that pruning doesn't immediately know on
	// seeing each term whether that term will be a seek point
	// or not. It requires some non-causality in the API, ie
	// only on seeing some number of future terms will the
	// builder decide which past terms are seek points.
	// Somehow the API'd need to be able to return a "I don't
	// know" value, eg like a Future, which only later on is
	// flipped (frozen) to true or false.
	//
	// We could solve this with a 2-pass approach, where the
	// first pass would build an FSA (no outputs) solely to
	// determine which prefixes are the 'leaves' in the
	// pruning. The 2nd pass would then look at this prefix
	// trie to mark the seek points and build the FST mapping
	// to the true output.
	//
	// But, one downside to this approach is that it'd result
	// in uneven index term selection. EG with prune1=10, the
	// resulting index terms could be as frequent as every 10
	// terms or as rare as every <maxArcCount> * 10 (eg 2560),
	// in the extremes.

	public VariableGapTermsIndexWriter(SegmentWriteState state, IndexTermSelector policy)
	{
	string indexFileName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, TERMS_INDEX_EXTENSION);
	m_output = state.Directory.CreateOutput(indexFileName, state.Context);
	bool success = false;
	try
	{
	//fieldInfos = state.FieldInfos; // LUCENENET: Not used
	this.policy = policy;
	WriteHeader(m_output);
	success = true;
	}
	finally
	{
	if (!success)
	{
	IOUtils.DisposeWhileHandlingException(m_output);
	}
	}
	}

	private void WriteHeader(IndexOutput output)
	{
	CodecUtil.WriteHeader(output, CODEC_NAME, VERSION_CURRENT);
	}

	public override FieldWriter AddField(FieldInfo field, long termsFilePointer)
	{
	////System.out.println("VGW: field=" + field.name);
	policy.NewField(field);
	FSTFieldWriter writer = new FSTFieldWriter(this, field, termsFilePointer);
	fields.Add(writer);
	return writer;
	}

	/// <remarks>
	/// NOTE: If your codec does not sort in unicode code point order,
	/// you must override this method to simply return <c>indexedTerm.Length</c>.
	/// </remarks>
	protected virtual int IndexedTermPrefixLength(BytesRef priorTerm, BytesRef indexedTerm)
	{
	// As long as codec sorts terms in unicode codepoint
	// order, we can safely strip off the non-distinguishing
	// suffix to save RAM in the loaded terms index.
	int idxTermOffset = indexedTerm.Offset;
	int priorTermOffset = priorTerm.Offset;
	int limit = Math.Min(priorTerm.Length, indexedTerm.Length);
	for (int byteIdx = 0; byteIdx < limit; byteIdx++)
	{
	if (priorTerm.Bytes[priorTermOffset + byteIdx] != indexedTerm.Bytes[idxTermOffset + byteIdx])
	{
	return byteIdx + 1;
	}
	}
	return Math.Min(1 + priorTerm.Length, indexedTerm.Length);
	}

	private class FSTFieldWriter : FieldWriter
	{
	private readonly VariableGapTermsIndexWriter outerInstance;

	private readonly Builder<long?> fstBuilder;
	private readonly PositiveInt32Outputs fstOutputs;
	private readonly long startTermsFilePointer;

	internal readonly FieldInfo fieldInfo;
	internal FST<long?> fst;
	internal readonly long indexStart;

	private readonly BytesRef lastTerm = new BytesRef();
	private bool first = true;

	public FSTFieldWriter(VariableGapTermsIndexWriter outerInstance, FieldInfo fieldInfo, long termsFilePointer)
	{
	this.outerInstance = outerInstance;

	this.fieldInfo = fieldInfo;
	fstOutputs = PositiveInt32Outputs.Singleton;
	fstBuilder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, fstOutputs);
	indexStart = outerInstance.m_output.GetFilePointer();
	////System.out.println("VGW: field=" + fieldInfo.name);

	// Always put empty string in
	fstBuilder.Add(new Int32sRef(), termsFilePointer);
	startTermsFilePointer = termsFilePointer;
	}

	public override bool CheckIndexTerm(BytesRef text, TermStats stats)
	{
	//System.out.println("VGW: index term=" + text.utf8ToString());
	// NOTE: we must force the first term per field to be
	// indexed, in case policy doesn't:
	if (outerInstance.policy.IsIndexTerm(text, stats) \|\| first)
	{
	first = false;
	//System.out.println(" YES");
	return true;
	}
	else
	{
	lastTerm.CopyBytes(text);
	return false;
	}
	}

	private readonly Int32sRef scratchIntsRef = new Int32sRef();

	public override void Add(BytesRef text, TermStats stats, long termsFilePointer)
	{
	if (text.Length == 0)
	{
	// We already added empty string in ctor
	if (Debugging.AssertsEnabled) Debugging.Assert(termsFilePointer == startTermsFilePointer);
	return;
	}
	int lengthSave = text.Length;
	text.Length = outerInstance.IndexedTermPrefixLength(lastTerm, text);
	try
	{
	fstBuilder.Add(Util.Fst.Util.ToInt32sRef(text, scratchIntsRef), termsFilePointer);
	}
	finally
	{
	text.Length = lengthSave;
	}
	lastTerm.CopyBytes(text);
	}

	public override void Finish(long termsFilePointer)
	{
	fst = fstBuilder.Finish();
	if (fst != null)
	{
	fst.Save(outerInstance.m_output);
	}
	}
	}

	protected override void Dispose(bool disposing)
	{
	if (disposing)
	{
	if (m_output != null)
	{
	try
	{
	long dirStart = m_output.GetFilePointer();
	int fieldCount = fields.Count;

	int nonNullFieldCount = 0;
	for (int i = 0; i < fieldCount; i++)
	{
	FSTFieldWriter field = fields[i];
	if (field.fst != null)
	{
	nonNullFieldCount++;
	}
	}

	m_output.WriteVInt32(nonNullFieldCount);
	for (int i = 0; i < fieldCount; i++)
	{
	FSTFieldWriter field = fields[i];
	if (field.fst != null)
	{
	m_output.WriteVInt32(field.fieldInfo.Number);
	m_output.WriteVInt64(field.indexStart);
	}
	}
	WriteTrailer(dirStart);
	CodecUtil.WriteFooter(m_output);
	}
	finally
	{
	m_output.Dispose();
	m_output = null;
	}
	}
	}
	}

	private void WriteTrailer(long dirStart)
	{
	m_output.WriteInt64(dirStart);
	}
	}
	}