// src/Lucene.Net.Highlighter/Highlight/QueryScorer.cs - lucenenet - Git at Google

 using J2N.Text;
 using Lucene.Net.Analysis;
 using Lucene.Net.Analysis.TokenAttributes;
 using Lucene.Net.Index;
 using System;
 using System.Collections.Generic;
 using JCG = J2N.Collections.Generic;

 namespace Lucene.Net.Search.Highlight
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// <see cref="IScorer"/> implementation which scores text fragments by the number of
     /// unique query terms found. This class converts appropriate <see cref="Query"/>s to
     /// <see cref="Search.Spans.SpanQuery"/>s and attempts to score only those terms that participated in
     /// generating the 'hit' on the document.
     /// </summary>
     public class QueryScorer : IScorer
     {
         // Sum of the weights of the distinct query terms seen since the last StartFragment call.
         private float totalScore;
         // Query terms already counted towards totalScore; replaced on every StartFragment call.
         private ISet<string> foundTerms;
         // Term text -> weighted term. Filled by the WeightedSpanTerm[] constructor or by InitExtractor.
         private IDictionary<string, WeightedSpanTerm> fieldWeightedSpanTerms;
         // Only assigned by the WeightedSpanTerm[] constructor; stays 0 for query-based scorers.
         private readonly float maxTermWeight;
         // Running token position, advanced by each token's position increment.
         private int position = -1;
         private readonly string defaultField;
         private ICharTermAttribute termAtt;
         private IPositionIncrementAttribute posIncAtt;
         private bool expandMultiTermQuery = true;
         private Query query;
         private string field;
         private IndexReader reader;
         // True when the terms were supplied up front, so Init(TokenStream) must not rebuild them.
         private readonly bool skipInitExtractor;
         private bool wrapToCaching = true;
         private int maxCharsToAnalyze;

         /// <summary>
         /// Constructs a new <see cref="QueryScorer"/> instance
         /// </summary>
         /// <param name="query"><see cref="Query"/> to use for highlighting</param>
         public QueryScorer(Query query)
         {
             Init(query, null, null, true);
         }

         /// <summary>
         /// Constructs a new <see cref="QueryScorer"/> instance
         /// </summary>
         /// <param name="query"><see cref="Query"/> to use for highlighting</param>
         /// <param name="field">Field to highlight - pass null to ignore fields</param>
         public QueryScorer(Query query, string field)
         {
             Init(query, field, null, true);
         }

         /// <summary>
         /// Constructs a new <see cref="QueryScorer"/> instance
         /// </summary>
         /// <param name="query"><see cref="Query"/> to use for highlighting</param>
         /// <param name="reader"><see cref="IndexReader"/> to use for quasi tf/idf scoring</param>
         /// <param name="field">Field to highlight - pass null to ignore fields</param>
         public QueryScorer(Query query, IndexReader reader, string field)
         {
             Init(query, field, reader, true);
         }

         /// <summary>
         /// Constructs a new <see cref="QueryScorer"/> instance
         /// </summary>
         /// <param name="query"><see cref="Query"/> to use for highlighting</param>
         /// <param name="reader"><see cref="IndexReader"/> to use for quasi tf/idf scoring</param>
         /// <param name="field">Field to highlight - pass null to ignore fields</param>
         /// <param name="defaultField">The default field for queries with the field name unspecified.
         /// A <c>null</c> value is stored as-is and results in an extractor without a default field.</param>
         public QueryScorer(Query query, IndexReader reader, string field, string defaultField)
         {
             // NewTermExtractor already copes with a null default field, so do not fail on it here.
             this.defaultField = defaultField?.Intern();
             Init(query, field, reader, true);
         }

         /// <summary>
         /// Constructs a new <see cref="QueryScorer"/> instance
         /// </summary>
         /// <param name="query"><see cref="Query"/> to use for highlighting</param>
         /// <param name="field">Field to highlight - pass null to ignore fields</param>
         /// <param name="defaultField">The default field for queries with the field name unspecified.
         /// A <c>null</c> value is stored as-is and results in an extractor without a default field.</param>
         public QueryScorer(Query query, string field, string defaultField)
         {
             // NewTermExtractor already copes with a null default field, so do not fail on it here.
             this.defaultField = defaultField?.Intern();
             Init(query, field, null, true);
         }

         /// <summary>
         /// Constructs a new <see cref="QueryScorer"/> instance from pre-built terms. Scorers created
         /// this way keep these terms: <see cref="Init(TokenStream)"/> does not run the term extractor.
         /// </summary>
         /// <param name="weightedTerms">an array of pre-created <see cref="WeightedSpanTerm"/>s</param>
         /// <exception cref="ArgumentNullException"><paramref name="weightedTerms"/> is <c>null</c>.</exception>
         public QueryScorer(WeightedSpanTerm[] weightedTerms)
         {
             if (weightedTerms == null)
             {
                 throw new ArgumentNullException(nameof(weightedTerms));
             }

             this.fieldWeightedSpanTerms = new JCG.Dictionary<string, WeightedSpanTerm>(weightedTerms.Length);

             foreach (WeightedSpanTerm t in weightedTerms)
             {
                 if (!fieldWeightedSpanTerms.TryGetValue(t.Term, out WeightedSpanTerm existingTerm) ||
                     (existingTerm == null) ||
                     (existingTerm.Weight < t.Weight))
                 {
                     // if a term is defined more than once, always use the highest
                     // scoring Weight
                     fieldWeightedSpanTerms[t.Term] = t;
                     maxTermWeight = Math.Max(maxTermWeight, t.Weight);
                 }
             }
             skipInitExtractor = true;
         }

         /// <seealso cref="IScorer.FragmentScore"/>
         public virtual float FragmentScore => totalScore;

         /// <summary>
         /// The highest weighted term (useful for passing to <see cref="GradientFormatter"/> to set top end of coloring scale).
         /// </summary>
         public virtual float MaxTermWeight => maxTermWeight;

         /// <summary>
         /// Scores the token currently exposed by the attributes captured in <see cref="Init(TokenStream)"/>.
         /// </summary>
         /// <returns>
         /// The weight of the matching <see cref="WeightedSpanTerm"/>, or 0 when the token text is not a
         /// known term or a position-sensitive term does not accept the current position.
         /// </returns>
         /// <seealso cref="IScorer.GetTokenScore()"/>
         public virtual float GetTokenScore()
         {
             position += posIncAtt.PositionIncrement;
             string termText = termAtt.ToString();

             if (!fieldWeightedSpanTerms.TryGetValue(termText, out WeightedSpanTerm weightedSpanTerm) ||
                 weightedSpanTerm == null)
             {
                 return 0;
             }

             if (weightedSpanTerm.IsPositionSensitive &&
                 !weightedSpanTerm.CheckPosition(position))
             {
                 return 0;
             }

             float score = weightedSpanTerm.Weight;

             // found a query term - Add returns false if it was already counted for this
             // fragment, so each distinct term contributes to the total only once
             if (foundTerms.Add(termText))
             {
                 totalScore += score;
             }

             return score;
         }

         /// <summary>
         /// Resets the token position, captures the term and position-increment attributes of
         /// <paramref name="tokenStream"/> and, unless the terms were supplied to the constructor,
         /// re-extracts the weighted terms.
         /// </summary>
         /// <param name="tokenStream">the stream whose tokens will subsequently be scored</param>
         /// <returns>
         /// The term extractor's token stream when the extractor reports a cached token stream;
         /// otherwise <c>null</c>.
         /// </returns>
         /// <seealso cref="IScorer.Init"/>
         public virtual TokenStream Init(TokenStream tokenStream)
         {
             position = -1;
             termAtt = tokenStream.AddAttribute<ICharTermAttribute>();
             posIncAtt = tokenStream.AddAttribute<IPositionIncrementAttribute>();
             if (!skipInitExtractor)
             {
                 fieldWeightedSpanTerms?.Clear();
                 return InitExtractor(tokenStream);
             }
             return null;
         }

         /// <summary>
         /// Retrieve the <see cref="WeightedSpanTerm"/> for the specified token. Useful for passing
         /// Span information to a <see cref="IFragmenter"/>.
         /// </summary>
         /// <param name="token">token to get <see cref="WeightedSpanTerm"/> for</param>
         /// <returns><see cref="WeightedSpanTerm"/> for token, or <c>null</c> if the token is unknown
         /// or no terms have been extracted yet</returns>
         public virtual WeightedSpanTerm GetWeightedSpanTerm(string token)
         {
             // Query-based scorers have no term map until Init(TokenStream) has run.
             if (fieldWeightedSpanTerms == null)
             {
                 return null;
             }

             fieldWeightedSpanTerms.TryGetValue(token, out WeightedSpanTerm result);
             return result;
         }

         /// <summary>
         /// Stores the constructor arguments that <see cref="InitExtractor"/> reads later.
         /// </summary>
         private void Init(Query query, string field, IndexReader reader, bool expandMultiTermQuery)
         {
             this.reader = reader;
             this.expandMultiTermQuery = expandMultiTermQuery;
             this.query = query;
             this.field = field;
         }

         /// <summary>
         /// Creates a term extractor, configures it from this scorer's settings and replaces the
         /// term map with the terms it extracts for the stored query and field.
         /// </summary>
         /// <returns>The extractor's token stream if it reports a cached token stream; otherwise <c>null</c>.</returns>
         private TokenStream InitExtractor(TokenStream tokenStream)
         {
             WeightedSpanTermExtractor qse = NewTermExtractor(defaultField);

             qse.SetMaxDocCharsToAnalyze(maxCharsToAnalyze);
             qse.ExpandMultiTermQuery = expandMultiTermQuery;
             qse.SetWrapIfNotCachingTokenFilter(wrapToCaching);
             if (reader == null)
             {
                 this.fieldWeightedSpanTerms = qse.GetWeightedSpanTerms(query,
                                                                        tokenStream, field);
             }
             else
             {
                 this.fieldWeightedSpanTerms = qse.GetWeightedSpanTermsWithScores(query,
                                                              tokenStream, field, reader);
             }
             if (qse.IsCachedTokenStream)
             {
                 return qse.TokenStream;
             }

             return null;
         }

         /// <summary>
         /// Creates the <see cref="WeightedSpanTermExtractor"/> used to extract terms.
         /// Subclasses may override this to supply their own extractor.
         /// </summary>
         /// <param name="defaultField">default field for the extractor, or <c>null</c> for none</param>
         protected virtual WeightedSpanTermExtractor NewTermExtractor(string defaultField)
         {
             return defaultField == null ? new WeightedSpanTermExtractor()
                 : new WeightedSpanTermExtractor(defaultField);
         }

         /// <summary>
         /// Resets the per-fragment state: an empty set of found terms and a score of zero.
         /// </summary>
         /// <seealso cref="IScorer.StartFragment"/>
         public virtual void StartFragment(TextFragment newFragment)
         {
             foundTerms = new JCG.HashSet<string>();
             totalScore = 0;
         }

         /// <summary>
         /// Controls whether or not multi-term queries are expanded
         /// against a <see cref="Index.Memory.MemoryIndex"/> <see cref="IndexReader"/>.
         /// <c>true</c> if multi-term queries should be expanded
         /// </summary>
         public virtual bool ExpandMultiTermQuery
         {
             get => expandMultiTermQuery;
             set => this.expandMultiTermQuery = value;
         }

         /// <summary>
         /// By default, <see cref="TokenStream"/>s that are not of the type
         /// <see cref="CachingTokenFilter"/> are wrapped in a <see cref="CachingTokenFilter"/> to
         /// ensure an efficient reset - if you are already using a different caching
         /// <see cref="TokenStream"/> impl and you don't want it to be wrapped, set this to
         /// false.
         /// </summary>
         public virtual void SetWrapIfNotCachingTokenFilter(bool wrap)
         {
             this.wrapToCaching = wrap;
         }

         /// <summary>
         /// Sets the value handed to the term extractor's <c>SetMaxDocCharsToAnalyze</c>
         /// the next time the terms are extracted.
         /// </summary>
         public virtual void SetMaxDocCharsToAnalyze(int maxDocCharsToAnalyze)
         {
             this.maxCharsToAnalyze = maxDocCharsToAnalyze;
         }
     }
 }
	using J2N.Text;
	using Lucene.Net.Analysis;
	using Lucene.Net.Analysis.TokenAttributes;
	using Lucene.Net.Index;
	using System;
	using System.Collections.Generic;
	using JCG = J2N.Collections.Generic;

	namespace Lucene.Net.Search.Highlight
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// <see cref="IScorer"/> implementation which scores text fragments by the number of
	/// unique query terms found. This class converts appropriate <see cref="Query"/>s to
	/// <see cref="Search.Spans.SpanQuery"/>s and attempts to score only those terms that participated in
	/// generating the 'hit' on the document.
	/// </summary>
	public class QueryScorer : IScorer
	{
		// Sum of the weights of the distinct query terms seen since the last StartFragment call.
		private float totalScore;
		// Query terms already counted towards totalScore; replaced on every StartFragment call.
		private ISet<string> foundTerms;
		// Term text -> weighted term. Filled by the WeightedSpanTerm[] constructor or by InitExtractor.
		private IDictionary<string, WeightedSpanTerm> fieldWeightedSpanTerms;
		// Only assigned by the WeightedSpanTerm[] constructor; stays 0 for query-based scorers.
		private readonly float maxTermWeight;
		// Running token position, advanced by each token's position increment.
		private int position = -1;
		private readonly string defaultField;
		private ICharTermAttribute termAtt;
		private IPositionIncrementAttribute posIncAtt;
		private bool expandMultiTermQuery = true;
		private Query query;
		private string field;
		private IndexReader reader;
		// True when the terms were supplied up front, so Init(TokenStream) must not rebuild them.
		private readonly bool skipInitExtractor;
		private bool wrapToCaching = true;
		private int maxCharsToAnalyze;

		/// <summary>
		/// Constructs a new <see cref="QueryScorer"/> instance
		/// </summary>
		/// <param name="query"><see cref="Query"/> to use for highlighting</param>
		public QueryScorer(Query query)
		{
			Init(query, null, null, true);
		}

		/// <summary>
		/// Constructs a new <see cref="QueryScorer"/> instance
		/// </summary>
		/// <param name="query"><see cref="Query"/> to use for highlighting</param>
		/// <param name="field">Field to highlight - pass null to ignore fields</param>
		public QueryScorer(Query query, string field)
		{
			Init(query, field, null, true);
		}

		/// <summary>
		/// Constructs a new <see cref="QueryScorer"/> instance
		/// </summary>
		/// <param name="query"><see cref="Query"/> to use for highlighting</param>
		/// <param name="reader"><see cref="IndexReader"/> to use for quasi tf/idf scoring</param>
		/// <param name="field">Field to highlight - pass null to ignore fields</param>
		public QueryScorer(Query query, IndexReader reader, string field)
		{
			Init(query, field, reader, true);
		}

		/// <summary>
		/// Constructs a new <see cref="QueryScorer"/> instance
		/// </summary>
		/// <param name="query"><see cref="Query"/> to use for highlighting</param>
		/// <param name="reader"><see cref="IndexReader"/> to use for quasi tf/idf scoring</param>
		/// <param name="field">Field to highlight - pass null to ignore fields</param>
		/// <param name="defaultField">The default field for queries with the field name unspecified.
		/// A <c>null</c> value is stored as-is and results in an extractor without a default field.</param>
		public QueryScorer(Query query, IndexReader reader, string field, string defaultField)
		{
			// NewTermExtractor already copes with a null default field, so do not fail on it here.
			this.defaultField = defaultField?.Intern();
			Init(query, field, reader, true);
		}

		/// <summary>
		/// Constructs a new <see cref="QueryScorer"/> instance
		/// </summary>
		/// <param name="query"><see cref="Query"/> to use for highlighting</param>
		/// <param name="field">Field to highlight - pass null to ignore fields</param>
		/// <param name="defaultField">The default field for queries with the field name unspecified.
		/// A <c>null</c> value is stored as-is and results in an extractor without a default field.</param>
		public QueryScorer(Query query, string field, string defaultField)
		{
			// NewTermExtractor already copes with a null default field, so do not fail on it here.
			this.defaultField = defaultField?.Intern();
			Init(query, field, null, true);
		}

		/// <summary>
		/// Constructs a new <see cref="QueryScorer"/> instance from pre-built terms. Scorers created
		/// this way keep these terms: <see cref="Init(TokenStream)"/> does not run the term extractor.
		/// </summary>
		/// <param name="weightedTerms">an array of pre-created <see cref="WeightedSpanTerm"/>s</param>
		/// <exception cref="ArgumentNullException"><paramref name="weightedTerms"/> is <c>null</c>.</exception>
		public QueryScorer(WeightedSpanTerm[] weightedTerms)
		{
			if (weightedTerms == null)
			{
				throw new ArgumentNullException(nameof(weightedTerms));
			}

			this.fieldWeightedSpanTerms = new JCG.Dictionary<string, WeightedSpanTerm>(weightedTerms.Length);

			foreach (WeightedSpanTerm t in weightedTerms)
			{
				if (!fieldWeightedSpanTerms.TryGetValue(t.Term, out WeightedSpanTerm existingTerm) ||
					(existingTerm == null) ||
					(existingTerm.Weight < t.Weight))
				{
					// if a term is defined more than once, always use the highest
					// scoring Weight
					fieldWeightedSpanTerms[t.Term] = t;
					maxTermWeight = Math.Max(maxTermWeight, t.Weight);
				}
			}
			skipInitExtractor = true;
		}

		/// <seealso cref="IScorer.FragmentScore"/>
		public virtual float FragmentScore => totalScore;

		/// <summary>
		/// The highest weighted term (useful for passing to <see cref="GradientFormatter"/> to set top end of coloring scale).
		/// </summary>
		public virtual float MaxTermWeight => maxTermWeight;

		/// <summary>
		/// Scores the token currently exposed by the attributes captured in <see cref="Init(TokenStream)"/>.
		/// </summary>
		/// <returns>
		/// The weight of the matching <see cref="WeightedSpanTerm"/>, or 0 when the token text is not a
		/// known term or a position-sensitive term does not accept the current position.
		/// </returns>
		/// <seealso cref="IScorer.GetTokenScore()"/>
		public virtual float GetTokenScore()
		{
			position += posIncAtt.PositionIncrement;
			string termText = termAtt.ToString();

			if (!fieldWeightedSpanTerms.TryGetValue(termText, out WeightedSpanTerm weightedSpanTerm) ||
				weightedSpanTerm == null)
			{
				return 0;
			}

			if (weightedSpanTerm.IsPositionSensitive &&
				!weightedSpanTerm.CheckPosition(position))
			{
				return 0;
			}

			float score = weightedSpanTerm.Weight;

			// found a query term - Add returns false if it was already counted for this
			// fragment, so each distinct term contributes to the total only once
			if (foundTerms.Add(termText))
			{
				totalScore += score;
			}

			return score;
		}

		/// <summary>
		/// Resets the token position, captures the term and position-increment attributes of
		/// <paramref name="tokenStream"/> and, unless the terms were supplied to the constructor,
		/// re-extracts the weighted terms.
		/// </summary>
		/// <param name="tokenStream">the stream whose tokens will subsequently be scored</param>
		/// <returns>
		/// The term extractor's token stream when the extractor reports a cached token stream;
		/// otherwise <c>null</c>.
		/// </returns>
		/// <seealso cref="IScorer.Init"/>
		public virtual TokenStream Init(TokenStream tokenStream)
		{
			position = -1;
			termAtt = tokenStream.AddAttribute<ICharTermAttribute>();
			posIncAtt = tokenStream.AddAttribute<IPositionIncrementAttribute>();
			if (!skipInitExtractor)
			{
				fieldWeightedSpanTerms?.Clear();
				return InitExtractor(tokenStream);
			}
			return null;
		}

		/// <summary>
		/// Retrieve the <see cref="WeightedSpanTerm"/> for the specified token. Useful for passing
		/// Span information to a <see cref="IFragmenter"/>.
		/// </summary>
		/// <param name="token">token to get <see cref="WeightedSpanTerm"/> for</param>
		/// <returns><see cref="WeightedSpanTerm"/> for token, or <c>null</c> if the token is unknown
		/// or no terms have been extracted yet</returns>
		public virtual WeightedSpanTerm GetWeightedSpanTerm(string token)
		{
			// Query-based scorers have no term map until Init(TokenStream) has run.
			if (fieldWeightedSpanTerms == null)
			{
				return null;
			}

			fieldWeightedSpanTerms.TryGetValue(token, out WeightedSpanTerm result);
			return result;
		}

		/// <summary>
		/// Stores the constructor arguments that <see cref="InitExtractor"/> reads later.
		/// </summary>
		private void Init(Query query, string field, IndexReader reader, bool expandMultiTermQuery)
		{
			this.reader = reader;
			this.expandMultiTermQuery = expandMultiTermQuery;
			this.query = query;
			this.field = field;
		}

		/// <summary>
		/// Creates a term extractor, configures it from this scorer's settings and replaces the
		/// term map with the terms it extracts for the stored query and field.
		/// </summary>
		/// <returns>The extractor's token stream if it reports a cached token stream; otherwise <c>null</c>.</returns>
		private TokenStream InitExtractor(TokenStream tokenStream)
		{
			WeightedSpanTermExtractor qse = NewTermExtractor(defaultField);

			qse.SetMaxDocCharsToAnalyze(maxCharsToAnalyze);
			qse.ExpandMultiTermQuery = expandMultiTermQuery;
			qse.SetWrapIfNotCachingTokenFilter(wrapToCaching);
			if (reader == null)
			{
				this.fieldWeightedSpanTerms = qse.GetWeightedSpanTerms(query,
					tokenStream, field);
			}
			else
			{
				this.fieldWeightedSpanTerms = qse.GetWeightedSpanTermsWithScores(query,
					tokenStream, field, reader);
			}
			if (qse.IsCachedTokenStream)
			{
				return qse.TokenStream;
			}

			return null;
		}

		/// <summary>
		/// Creates the <see cref="WeightedSpanTermExtractor"/> used to extract terms.
		/// Subclasses may override this to supply their own extractor.
		/// </summary>
		/// <param name="defaultField">default field for the extractor, or <c>null</c> for none</param>
		protected virtual WeightedSpanTermExtractor NewTermExtractor(string defaultField)
		{
			return defaultField == null ? new WeightedSpanTermExtractor()
				: new WeightedSpanTermExtractor(defaultField);
		}

		/// <summary>
		/// Resets the per-fragment state: an empty set of found terms and a score of zero.
		/// </summary>
		/// <seealso cref="IScorer.StartFragment"/>
		public virtual void StartFragment(TextFragment newFragment)
		{
			foundTerms = new JCG.HashSet<string>();
			totalScore = 0;
		}

		/// <summary>
		/// Controls whether or not multi-term queries are expanded
		/// against a <see cref="Index.Memory.MemoryIndex"/> <see cref="IndexReader"/>.
		/// <c>true</c> if multi-term queries should be expanded
		/// </summary>
		public virtual bool ExpandMultiTermQuery
		{
			get => expandMultiTermQuery;
			set => this.expandMultiTermQuery = value;
		}

		/// <summary>
		/// By default, <see cref="TokenStream"/>s that are not of the type
		/// <see cref="CachingTokenFilter"/> are wrapped in a <see cref="CachingTokenFilter"/> to
		/// ensure an efficient reset - if you are already using a different caching
		/// <see cref="TokenStream"/> impl and you don't want it to be wrapped, set this to
		/// false.
		/// </summary>
		public virtual void SetWrapIfNotCachingTokenFilter(bool wrap)
		{
			this.wrapToCaching = wrap;
		}

		/// <summary>
		/// Sets the value handed to the term extractor's <c>SetMaxDocCharsToAnalyze</c>
		/// the next time the terms are extracted.
		/// </summary>
		public virtual void SetMaxDocCharsToAnalyze(int maxDocCharsToAnalyze)
		{
			this.maxCharsToAnalyze = maxDocCharsToAnalyze;
		}
	}
	}