// src/Lucene.Net.Suggest/Suggest/Analyzing/BlendedInfixSuggester.cs - lucenenet - Git at Google

 using Lucene.Net.Analysis;
 using Lucene.Net.Documents;
 using Lucene.Net.Index;
 using Lucene.Net.Util;
 using System;
 using System.Collections.Generic;
 using System.Diagnostics;
 using System.IO;
 using JCG = J2N.Collections.Generic;
 using Directory = Lucene.Net.Store.Directory;
 using Lucene.Net.Diagnostics;

 namespace Lucene.Net.Search.Suggest.Analyzing
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     // TODO:
     // - allow to use the search score

     /// <summary>
     /// Extension of the <see cref="AnalyzingInfixSuggester"/> which transforms the weight
     /// after search to take into account the position of the searched term into
     /// the indexed text.
     /// Please note that it increases the number of elements searched and applies the
     /// ponderation after. It might be costly for long suggestions.
     ///
     /// @lucene.experimental
     /// </summary>
     public class BlendedInfixSuggester : AnalyzingInfixSuggester
     {

         /// <summary>
         /// Coefficient used for linear blending: the per-position penalty applied by
         /// <see cref="BlenderType.POSITION_LINEAR"/>.
         /// </summary>
         protected internal static double LINEAR_COEF = 0.10;

         /// <summary>
         /// Default factor, used by the three-argument constructor.
         /// </summary>
         public static int DEFAULT_NUM_FACTOR = 10;

         /// <summary>
         /// Factor to multiply the number of searched elements.
         /// It is also used as a divisor in <see cref="CreateResults"/> to recover
         /// the originally requested number of results.
         /// </summary>
         private readonly int numFactor;

         /// <summary>
         /// Type of blender used by the suggester
         /// </summary>
         private readonly BlenderType blenderType;

         /// <summary>
         /// The different types of blender.
         /// </summary>
         public enum BlenderType
         {
             /// <summary>
             /// Application dependent; override <see cref="CalculateCoefficient(int)"/>
             /// to compute it.
             /// </summary>
             CUSTOM,
             /// <summary>
             /// weight*(1 - 0.10*position)
             /// </summary>
             POSITION_LINEAR,
             /// <summary>
             /// weight/(1+position)
             /// </summary>
             POSITION_RECIPROCAL,
             // TODO:
             //SCORE
         }

         /// <summary>
         /// Create a new instance, loading from a previously built
         /// directory, if it exists. Uses <see cref="BlenderType.POSITION_LINEAR"/>
         /// blending and <see cref="DEFAULT_NUM_FACTOR"/>.
         /// </summary>
         /// <param name="matchVersion"> Forwarded to the base class constructor </param>
         /// <param name="dir"> Forwarded to the base class constructor </param>
         /// <param name="analyzer"> Forwarded to the base class constructor </param>
         public BlendedInfixSuggester(LuceneVersion matchVersion, Directory dir, Analyzer analyzer)
             : base(matchVersion, dir, analyzer)
         {
             this.blenderType = BlenderType.POSITION_LINEAR;
             this.numFactor = DEFAULT_NUM_FACTOR;
         }

         /// <summary>
         /// Create a new instance, loading from a previously built
         /// directory, if it exists.
         /// </summary>
         /// <param name="matchVersion"> Forwarded to the base class constructor </param>
         /// <param name="dir"> Forwarded to the base class constructor </param>
         /// <param name="indexAnalyzer"> Forwarded to the base class constructor </param>
         /// <param name="queryAnalyzer"> Forwarded to the base class constructor </param>
         /// <param name="minPrefixChars"> Forwarded to the base class constructor </param>
         /// <param name="blenderType"> Type of blending strategy, see <see cref="BlenderType"/> for details </param>
         /// <param name="numFactor">   Factor to multiply the number of searched elements before the weights are
         ///                            blended. It is later used as a divisor, so it must not be zero. </param>
         /// <exception cref="IOException"> If there are problems opening the underlying Lucene index. </exception>
         public BlendedInfixSuggester(LuceneVersion matchVersion, Directory dir, Analyzer indexAnalyzer, Analyzer queryAnalyzer, int minPrefixChars, BlenderType blenderType, int numFactor)
             : base(matchVersion, dir, indexAnalyzer, queryAnalyzer, minPrefixChars)
         {
             this.blenderType = blenderType;
             this.numFactor = numFactor;
         }

         /// <summary>
         /// Delegates to the base lookup, asking for <c>num * numFactor</c> candidates so that
         /// enough hits are available once the weights have been blended.
         /// </summary>
         public override IList<Lookup.LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, bool onlyMorePopular, int num)
         {
             // here we multiply the number of searched element by the defined factor
             return base.DoLookup(key, contexts, onlyMorePopular, num * numFactor);
         }

         /// <summary>
         /// Delegates to the base lookup, asking for <c>num * numFactor</c> candidates so that
         /// enough hits are available once the weights have been blended.
         /// </summary>
         public override IList<Lookup.LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, int num, bool allTermsRequired, bool doHighlight)
         {
             // here we multiply the number of searched element by the defined factor
             return base.DoLookup(key, contexts, num * numFactor, allTermsRequired, doHighlight);
         }

         /// <summary>
         /// Returns the field type for the text field: a non-stored text field indexed with
         /// documents, frequencies and positions, with term vectors (including positions)
         /// stored and norms omitted. The term vector positions are what
         /// <c>CreateCoefficient</c> reads back at lookup time.
         /// </summary>
         protected override FieldType GetTextFieldType()
         {
             FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
             ft.IndexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
             ft.StoreTermVectors = true;
             ft.StoreTermVectorPositions = true;
             ft.OmitNorms = true;

             return ft;
         }

         /// <summary>
         /// Builds the lookup results from the search hits, multiplying each hit's weight by a
         /// position-based coefficient and keeping only the best <c>num / numFactor</c> results.
         /// </summary>
         /// <param name="searcher"> searcher whose reader provides the text, payload and term vector data </param>
         /// <param name="hits"> search hits; each entry is read as a <see cref="FieldDoc"/> whose first
         ///                     sort field is the weight </param>
         /// <param name="num"> number of hits that were requested from the search (already multiplied by the factor) </param>
         /// <param name="key"> the text being looked up </param>
         /// <param name="doHighlight"> whether the result keys should be highlighted </param>
         /// <param name="matchedTokens"> tokens found in the query </param>
         /// <param name="prefixToken"> unfinished token in the query, may be <c>null</c> </param>
         /// <returns> the results, ordered from highest to lowest blended score </returns>
         protected internal override IList<Lookup.LookupResult> CreateResults(IndexSearcher searcher, TopFieldDocs hits,
             int num, string key, bool doHighlight, ICollection<string> matchedTokens, string prefixToken)
         {

             BinaryDocValues textDV = MultiDocValues.GetBinaryValues(searcher.IndexReader, TEXT_FIELD_NAME);
             if (Debugging.AssertsEnabled) Debugging.Assert(textDV != null);

             // This will just be null if app didn't pass payloads to build():
             // TODO: maybe just stored fields?  they compress...
             BinaryDocValues payloadsDV = MultiDocValues.GetBinaryValues(searcher.IndexReader, "payloads");

             JCG.SortedSet<Lookup.LookupResult> results = new JCG.SortedSet<Lookup.LookupResult>(LOOKUP_COMP);

             // we reduce the num to the one initially requested
             int actualNum = num / numFactor;

             BytesRef scratch = new BytesRef();
             for (int i = 0; i < hits.ScoreDocs.Length; i++)
             {
                 FieldDoc fd = (FieldDoc)hits.ScoreDocs[i];

                 textDV.Get(fd.Doc, scratch);
                 string text = scratch.Utf8ToString();
                 long weight = (long)fd.Fields[0];

                 // each result gets its own payload instance; the scratch buffer is reused for the text only
                 BytesRef payload;
                 if (payloadsDV != null)
                 {
                     payload = new BytesRef();
                     payloadsDV.Get(fd.Doc, payload);
                 }
                 else
                 {
                     payload = null;
                 }

                 double coefficient;
                 if (text.StartsWith(key, StringComparison.Ordinal))
                 {
                     // if hit starts with the key, we don't change the score
                     coefficient = 1;
                 }
                 else
                 {
                     coefficient = CreateCoefficient(searcher, fd.Doc, matchedTokens, prefixToken);
                 }

                 long score = (long)(weight * coefficient);

                 LookupResult result;
                 if (doHighlight)
                 {
                     object highlightKey = Highlight(text, matchedTokens, prefixToken);
                     result = new LookupResult(highlightKey.ToString(), highlightKey, score, payload);
                 }
                 else
                 {
                     result = new LookupResult(text, score, payload);
                 }

                 BoundedTreeAdd(results, result, actualNum);
             }

             // the set is sorted ascending by score; callers get the best results first
             return new List<LookupResult>(results.Reverse());
         }

         /// <summary>
         /// Add an element to the tree respecting a size limit. Once the limit is reached, the
         /// new element only replaces the current smallest one if its value is strictly greater.
         /// </summary>
         /// <param name="results"> the tree to add in </param>
         /// <param name="result"> the result we try to add </param>
         /// <param name="num"> size limit </param>
         private static void BoundedTreeAdd(JCG.SortedSet<Lookup.LookupResult> results, Lookup.LookupResult result, int num)
         {
             if (results.Count >= num)
             {
                 // read the smallest element once, then decide whether the new one displaces it
                 var first = results.Min;
                 if (first.Value < result.Value)
                 {
                     // Code similar to the java TreeMap class
                     results.Remove(first);
                 }
                 else
                 {
                     return;
                 }
             }

             results.Add(result);
         }

         /// <summary>
         /// Create the coefficient to transform the weight, based on the smallest position
         /// at which one of the query tokens occurs in the document's text field.
         /// </summary>
         /// <param name="searcher"> searcher whose reader provides the term vector </param>
         /// <param name="doc"> id of the document </param>
         /// <param name="matchedTokens"> tokens found in the query </param>
         /// <param name="prefixToken"> unfinished token in the query; <c>null</c> is treated as "no prefix token" </param>
         /// <returns> the coefficient </returns>
         /// <exception cref="IOException"> If there are problems reading term vectors from the underlying Lucene index. </exception>
         private double CreateCoefficient(IndexSearcher searcher, int doc, ICollection<string> matchedTokens, string prefixToken)
         {
             Terms tv = searcher.IndexReader.GetTermVector(doc, TEXT_FIELD_NAME);
             TermsEnum it = tv.GetEnumerator(TermsEnum.EMPTY);

             // stays at int.MaxValue when no term of the document matches
             int position = int.MaxValue;
             // find the closest token position
             while (it.MoveNext())
             {
                 string docTerm = it.Term.Utf8ToString();

                 // string.StartsWith throws ArgumentNullException for a null value, so a missing
                 // prefix token must be skipped explicitly (presumably the case for queries that
                 // end with discarded characters such as whitespace - confirm against the base class).
                 if (matchedTokens.Contains(docTerm) || (prefixToken != null && docTerm.StartsWith(prefixToken, StringComparison.Ordinal)))
                 {
                     DocsAndPositionsEnum docPosEnum = it.DocsAndPositions(null, null, DocsAndPositionsFlags.OFFSETS);
                     // move onto the first document before reading positions
                     docPosEnum.NextDoc();

                     // use the first occurrence of the term
                     int p = docPosEnum.NextPosition();
                     if (p < position)
                     {
                         position = p;
                     }
                 }
             }

             // create corresponding coefficient based on position
             return CalculateCoefficient(position);
         }

         /// <summary>
         /// Calculate the weight coefficient based on the position of the first matching word.
         /// Subclass should override it to adapt it to particular needs.
         /// </summary>
         /// <param name="position"> of the first matching word in text; <see cref="int.MaxValue"/>
         ///                         when no matching word was found </param>
         /// <returns> the coefficient; 1 for blender types other than
         ///           <see cref="BlenderType.POSITION_LINEAR"/> and <see cref="BlenderType.POSITION_RECIPROCAL"/> </returns>
         protected internal virtual double CalculateCoefficient(int position)
         {
             double coefficient;
             switch (blenderType)
             {
                 case BlendedInfixSuggester.BlenderType.POSITION_LINEAR:
                     // becomes negative once position exceeds 1 / LINEAR_COEF
                     coefficient = 1 - LINEAR_COEF * position;
                     break;

                 case BlendedInfixSuggester.BlenderType.POSITION_RECIPROCAL:
                     // add in floating point: (position + 1) in int arithmetic wraps to
                     // int.MinValue when position is int.MaxValue, giving a negative coefficient
                     coefficient = 1.0 / (position + 1.0);
                     break;

                 default:
                     coefficient = 1;
                     break;
             }

             return coefficient;
         }

         private static readonly IComparer<Lookup.LookupResult> LOOKUP_COMP = new LookUpComparer(); // LUCENENET: marked readonly

         /// <summary>
         /// Orders results by ascending value, breaking ties by key.
         /// </summary>
         private class LookUpComparer : IComparer<Lookup.LookupResult>
         {
             public virtual int Compare(Lookup.LookupResult o1, Lookup.LookupResult o2)
             {
                 // order on weight
                 if (o1.Value > o2.Value)
                 {
                     return 1;
                 }
                 else if (o1.Value < o2.Value)
                 {
                     return -1;
                 }

                 // otherwise on alphabetic order
                 return CHARSEQUENCE_COMPARER.Compare(o1.Key, o2.Key);
             }
         }
     }
 }
	using Lucene.Net.Analysis;
	using Lucene.Net.Documents;
	using Lucene.Net.Index;
	using Lucene.Net.Util;
	using System;
	using System.Collections.Generic;
	using System.Diagnostics;
	using System.IO;
	using JCG = J2N.Collections.Generic;
	using Directory = Lucene.Net.Store.Directory;
	using Lucene.Net.Diagnostics;

	namespace Lucene.Net.Search.Suggest.Analyzing
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	// TODO:
	// - allow to use the search score

	/// <summary>
	/// Extension of the <see cref="AnalyzingInfixSuggester"/> which transforms the weight
	/// after search to take into account the position of the searched term into
	/// the indexed text.
	/// Please note that it increases the number of elements searched and applies the
	/// ponderation after. It might be costly for long suggestions.
	///
	/// @lucene.experimental
	/// </summary>
	public class BlendedInfixSuggester : AnalyzingInfixSuggester
	{

	    /// <summary>
	    /// Coefficient used for linear blending: the per-position penalty applied by
	    /// <see cref="BlenderType.POSITION_LINEAR"/>.
	    /// </summary>
	    protected internal static double LINEAR_COEF = 0.10;

	    /// <summary>
	    /// Default factor, used by the three-argument constructor.
	    /// </summary>
	    public static int DEFAULT_NUM_FACTOR = 10;

	    /// <summary>
	    /// Factor to multiply the number of searched elements.
	    /// It is also used as a divisor in <see cref="CreateResults"/> to recover
	    /// the originally requested number of results.
	    /// </summary>
	    private readonly int numFactor;

	    /// <summary>
	    /// Type of blender used by the suggester
	    /// </summary>
	    private readonly BlenderType blenderType;

	    /// <summary>
	    /// The different types of blender.
	    /// </summary>
	    public enum BlenderType
	    {
	        /// <summary>
	        /// Application dependent; override <see cref="CalculateCoefficient(int)"/>
	        /// to compute it.
	        /// </summary>
	        CUSTOM,
	        /// <summary>
	        /// weight*(1 - 0.10*position)
	        /// </summary>
	        POSITION_LINEAR,
	        /// <summary>
	        /// weight/(1+position)
	        /// </summary>
	        POSITION_RECIPROCAL,
	        // TODO:
	        //SCORE
	    }

	    /// <summary>
	    /// Create a new instance, loading from a previously built
	    /// directory, if it exists. Uses <see cref="BlenderType.POSITION_LINEAR"/>
	    /// blending and <see cref="DEFAULT_NUM_FACTOR"/>.
	    /// </summary>
	    /// <param name="matchVersion"> Forwarded to the base class constructor </param>
	    /// <param name="dir"> Forwarded to the base class constructor </param>
	    /// <param name="analyzer"> Forwarded to the base class constructor </param>
	    public BlendedInfixSuggester(LuceneVersion matchVersion, Directory dir, Analyzer analyzer)
	        : base(matchVersion, dir, analyzer)
	    {
	        this.blenderType = BlenderType.POSITION_LINEAR;
	        this.numFactor = DEFAULT_NUM_FACTOR;
	    }

	    /// <summary>
	    /// Create a new instance, loading from a previously built
	    /// directory, if it exists.
	    /// </summary>
	    /// <param name="matchVersion"> Forwarded to the base class constructor </param>
	    /// <param name="dir"> Forwarded to the base class constructor </param>
	    /// <param name="indexAnalyzer"> Forwarded to the base class constructor </param>
	    /// <param name="queryAnalyzer"> Forwarded to the base class constructor </param>
	    /// <param name="minPrefixChars"> Forwarded to the base class constructor </param>
	    /// <param name="blenderType"> Type of blending strategy, see <see cref="BlenderType"/> for details </param>
	    /// <param name="numFactor"> Factor to multiply the number of searched elements before the weights are
	    ///                          blended. It is later used as a divisor, so it must not be zero. </param>
	    /// <exception cref="IOException"> If there are problems opening the underlying Lucene index. </exception>
	    public BlendedInfixSuggester(LuceneVersion matchVersion, Directory dir, Analyzer indexAnalyzer, Analyzer queryAnalyzer, int minPrefixChars, BlenderType blenderType, int numFactor)
	        : base(matchVersion, dir, indexAnalyzer, queryAnalyzer, minPrefixChars)
	    {
	        this.blenderType = blenderType;
	        this.numFactor = numFactor;
	    }

	    /// <summary>
	    /// Delegates to the base lookup, asking for <c>num * numFactor</c> candidates so that
	    /// enough hits are available once the weights have been blended.
	    /// </summary>
	    public override IList<Lookup.LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, bool onlyMorePopular, int num)
	    {
	        // here we multiply the number of searched element by the defined factor
	        return base.DoLookup(key, contexts, onlyMorePopular, num * numFactor);
	    }

	    /// <summary>
	    /// Delegates to the base lookup, asking for <c>num * numFactor</c> candidates so that
	    /// enough hits are available once the weights have been blended.
	    /// </summary>
	    public override IList<Lookup.LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, int num, bool allTermsRequired, bool doHighlight)
	    {
	        // here we multiply the number of searched element by the defined factor
	        return base.DoLookup(key, contexts, num * numFactor, allTermsRequired, doHighlight);
	    }

	    /// <summary>
	    /// Returns the field type for the text field: a non-stored text field indexed with
	    /// documents, frequencies and positions, with term vectors (including positions)
	    /// stored and norms omitted. The term vector positions are what
	    /// <c>CreateCoefficient</c> reads back at lookup time.
	    /// </summary>
	    protected override FieldType GetTextFieldType()
	    {
	        FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
	        ft.IndexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
	        ft.StoreTermVectors = true;
	        ft.StoreTermVectorPositions = true;
	        ft.OmitNorms = true;

	        return ft;
	    }

	    /// <summary>
	    /// Builds the lookup results from the search hits, multiplying each hit's weight by a
	    /// position-based coefficient and keeping only the best <c>num / numFactor</c> results.
	    /// </summary>
	    /// <param name="searcher"> searcher whose reader provides the text, payload and term vector data </param>
	    /// <param name="hits"> search hits; each entry is read as a <see cref="FieldDoc"/> whose first
	    ///                     sort field is the weight </param>
	    /// <param name="num"> number of hits that were requested from the search (already multiplied by the factor) </param>
	    /// <param name="key"> the text being looked up </param>
	    /// <param name="doHighlight"> whether the result keys should be highlighted </param>
	    /// <param name="matchedTokens"> tokens found in the query </param>
	    /// <param name="prefixToken"> unfinished token in the query, may be <c>null</c> </param>
	    /// <returns> the results, ordered from highest to lowest blended score </returns>
	    protected internal override IList<Lookup.LookupResult> CreateResults(IndexSearcher searcher, TopFieldDocs hits,
	        int num, string key, bool doHighlight, ICollection<string> matchedTokens, string prefixToken)
	    {

	        BinaryDocValues textDV = MultiDocValues.GetBinaryValues(searcher.IndexReader, TEXT_FIELD_NAME);
	        if (Debugging.AssertsEnabled) Debugging.Assert(textDV != null);

	        // This will just be null if app didn't pass payloads to build():
	        // TODO: maybe just stored fields? they compress...
	        BinaryDocValues payloadsDV = MultiDocValues.GetBinaryValues(searcher.IndexReader, "payloads");

	        JCG.SortedSet<Lookup.LookupResult> results = new JCG.SortedSet<Lookup.LookupResult>(LOOKUP_COMP);

	        // we reduce the num to the one initially requested
	        int actualNum = num / numFactor;

	        BytesRef scratch = new BytesRef();
	        for (int i = 0; i < hits.ScoreDocs.Length; i++)
	        {
	            FieldDoc fd = (FieldDoc)hits.ScoreDocs[i];

	            textDV.Get(fd.Doc, scratch);
	            string text = scratch.Utf8ToString();
	            long weight = (long)fd.Fields[0];

	            // each result gets its own payload instance; the scratch buffer is reused for the text only
	            BytesRef payload;
	            if (payloadsDV != null)
	            {
	                payload = new BytesRef();
	                payloadsDV.Get(fd.Doc, payload);
	            }
	            else
	            {
	                payload = null;
	            }

	            double coefficient;
	            if (text.StartsWith(key, StringComparison.Ordinal))
	            {
	                // if hit starts with the key, we don't change the score
	                coefficient = 1;
	            }
	            else
	            {
	                coefficient = CreateCoefficient(searcher, fd.Doc, matchedTokens, prefixToken);
	            }

	            long score = (long)(weight * coefficient);

	            LookupResult result;
	            if (doHighlight)
	            {
	                object highlightKey = Highlight(text, matchedTokens, prefixToken);
	                result = new LookupResult(highlightKey.ToString(), highlightKey, score, payload);
	            }
	            else
	            {
	                result = new LookupResult(text, score, payload);
	            }

	            BoundedTreeAdd(results, result, actualNum);
	        }

	        // the set is sorted ascending by score; callers get the best results first
	        return new List<LookupResult>(results.Reverse());
	    }

	    /// <summary>
	    /// Add an element to the tree respecting a size limit. Once the limit is reached, the
	    /// new element only replaces the current smallest one if its value is strictly greater.
	    /// </summary>
	    /// <param name="results"> the tree to add in </param>
	    /// <param name="result"> the result we try to add </param>
	    /// <param name="num"> size limit </param>
	    private static void BoundedTreeAdd(JCG.SortedSet<Lookup.LookupResult> results, Lookup.LookupResult result, int num)
	    {
	        if (results.Count >= num)
	        {
	            // read the smallest element once, then decide whether the new one displaces it
	            var first = results.Min;
	            if (first.Value < result.Value)
	            {
	                // Code similar to the java TreeMap class
	                results.Remove(first);
	            }
	            else
	            {
	                return;
	            }
	        }

	        results.Add(result);
	    }

	    /// <summary>
	    /// Create the coefficient to transform the weight, based on the smallest position
	    /// at which one of the query tokens occurs in the document's text field.
	    /// </summary>
	    /// <param name="searcher"> searcher whose reader provides the term vector </param>
	    /// <param name="doc"> id of the document </param>
	    /// <param name="matchedTokens"> tokens found in the query </param>
	    /// <param name="prefixToken"> unfinished token in the query; <c>null</c> is treated as "no prefix token" </param>
	    /// <returns> the coefficient </returns>
	    /// <exception cref="IOException"> If there are problems reading term vectors from the underlying Lucene index. </exception>
	    private double CreateCoefficient(IndexSearcher searcher, int doc, ICollection<string> matchedTokens, string prefixToken)
	    {
	        Terms tv = searcher.IndexReader.GetTermVector(doc, TEXT_FIELD_NAME);
	        TermsEnum it = tv.GetEnumerator(TermsEnum.EMPTY);

	        // stays at int.MaxValue when no term of the document matches
	        int position = int.MaxValue;
	        // find the closest token position
	        while (it.MoveNext())
	        {
	            string docTerm = it.Term.Utf8ToString();

	            // string.StartsWith throws ArgumentNullException for a null value, so a missing
	            // prefix token must be skipped explicitly (presumably the case for queries that
	            // end with discarded characters such as whitespace - confirm against the base class).
	            if (matchedTokens.Contains(docTerm) || (prefixToken != null && docTerm.StartsWith(prefixToken, StringComparison.Ordinal)))
	            {
	                DocsAndPositionsEnum docPosEnum = it.DocsAndPositions(null, null, DocsAndPositionsFlags.OFFSETS);
	                // move onto the first document before reading positions
	                docPosEnum.NextDoc();

	                // use the first occurrence of the term
	                int p = docPosEnum.NextPosition();
	                if (p < position)
	                {
	                    position = p;
	                }
	            }
	        }

	        // create corresponding coefficient based on position
	        return CalculateCoefficient(position);
	    }

	    /// <summary>
	    /// Calculate the weight coefficient based on the position of the first matching word.
	    /// Subclass should override it to adapt it to particular needs.
	    /// </summary>
	    /// <param name="position"> of the first matching word in text; <see cref="int.MaxValue"/>
	    ///                         when no matching word was found </param>
	    /// <returns> the coefficient; 1 for blender types other than
	    ///           <see cref="BlenderType.POSITION_LINEAR"/> and <see cref="BlenderType.POSITION_RECIPROCAL"/> </returns>
	    protected internal virtual double CalculateCoefficient(int position)
	    {
	        double coefficient;
	        switch (blenderType)
	        {
	            case BlendedInfixSuggester.BlenderType.POSITION_LINEAR:
	                // becomes negative once position exceeds 1 / LINEAR_COEF
	                coefficient = 1 - LINEAR_COEF * position;
	                break;

	            case BlendedInfixSuggester.BlenderType.POSITION_RECIPROCAL:
	                // add in floating point: (position + 1) in int arithmetic wraps to
	                // int.MinValue when position is int.MaxValue, giving a negative coefficient
	                coefficient = 1.0 / (position + 1.0);
	                break;

	            default:
	                coefficient = 1;
	                break;
	        }

	        return coefficient;
	    }

	    private static readonly IComparer<Lookup.LookupResult> LOOKUP_COMP = new LookUpComparer(); // LUCENENET: marked readonly

	    /// <summary>
	    /// Orders results by ascending value, breaking ties by key.
	    /// </summary>
	    private class LookUpComparer : IComparer<Lookup.LookupResult>
	    {
	        public virtual int Compare(Lookup.LookupResult o1, Lookup.LookupResult o2)
	        {
	            // order on weight
	            if (o1.Value > o2.Value)
	            {
	                return 1;
	            }
	            else if (o1.Value < o2.Value)
	            {
	                return -1;
	            }

	            // otherwise on alphabetic order
	            return CHARSEQUENCE_COMPARER.Compare(o1.Key, o2.Key);
	        }
	    }
	}
	}