using Lucene.Net.Diagnostics;
using System;
namespace Lucene.Net.Search
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using ArrayUtil = Lucene.Net.Util.ArrayUtil;
using ByteBlockPool = Lucene.Net.Util.ByteBlockPool;
using BytesRef = Lucene.Net.Util.BytesRef;
using BytesRefHash = Lucene.Net.Util.BytesRefHash;
using IndexReader = Lucene.Net.Index.IndexReader;
using RamUsageEstimator = Lucene.Net.Util.RamUsageEstimator;
using Term = Lucene.Net.Index.Term;
using TermContext = Lucene.Net.Index.TermContext;
using TermsEnum = Lucene.Net.Index.TermsEnum;
using TermState = Lucene.Net.Index.TermState;
/// <summary>
/// A rewrite method that tries to pick the best
/// constant-score rewrite method based on term and
/// document counts from the query. If both the number of
    /// terms and the number of documents are small enough, then
/// <see cref="MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE"/> is used.
/// Otherwise, <see cref="MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE"/> is
/// used.
/// </summary>
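    /// <example>
    /// A minimal usage sketch (the field name and prefix below are hypothetical):
    /// <code>
    /// var query = new PrefixQuery(new Term("body", "lucen"))
    /// {
    ///     MultiTermRewriteMethod = new ConstantScoreAutoRewrite()
    /// };
    /// </code>
    /// </example>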
// LUCENENET specific: made this class public. In Lucene there was a derived class
// with the same name that was nested within MultiTermQuery, but in .NET it is
// more intuitive if our classes are not nested.
public class ConstantScoreAutoRewrite : TermCollectingRewrite<BooleanQuery>
{
/// <summary>
/// Defaults derived from rough tests with a 20.0 million
/// doc Wikipedia index. With more than 350 terms in the
/// query, the filter method is fastest:
/// </summary>
public static int DEFAULT_TERM_COUNT_CUTOFF = 350;
/// <summary>
/// If the query will hit more than 1 in 1000 of the docs
/// in the index (0.1%), the filter method is fastest:
/// </summary>
public static double DEFAULT_DOC_COUNT_PERCENT = 0.1;
private int termCountCutoff = DEFAULT_TERM_COUNT_CUTOFF;
private double docCountPercent = DEFAULT_DOC_COUNT_PERCENT;
/// <summary>
/// If the number of terms in this query is equal to or
/// larger than this setting then
/// <see cref="MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE"/> is used.
/// </summary>
public virtual int TermCountCutoff
{
get => termCountCutoff;
set => termCountCutoff = value;
}
/// <summary>
/// If the number of documents to be visited in the
/// postings exceeds this specified percentage of the
/// <see cref="Index.IndexReader.MaxDoc"/> for the index, then
/// <see cref="MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE"/> is used.
/// Value may be 0.0 to 100.0.
/// </summary>
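        /// <example>
        /// An illustrative (not prescriptive) configuration that relaxes both cutoffs:
        /// <code>
        /// var rewrite = new ConstantScoreAutoRewrite { TermCountCutoff = 1024, DocCountPercent = 1.0 };
        /// </code>
        /// </example>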
public virtual double DocCountPercent
{
get => docCountPercent;
set => docCountPercent = value;
}
protected override BooleanQuery GetTopLevelQuery()
{
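            // 'true' disables coord scoring; the result is wrapped in a
            // ConstantScoreQuery by Rewrite anyway, so coord would be moot.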
return new BooleanQuery(true);
}
        protected override void AddClause(BooleanQuery topLevel, Term term, int docFreq, float boost /*ignored*/, TermContext states)
{
topLevel.Add(new TermQuery(term, states), Occur.SHOULD);
}
public override Query Rewrite(IndexReader reader, MultiTermQuery query)
{
// Get the enum and start visiting terms. If we
// exhaust the enum before hitting either of the
// cutoffs, we use ConstantBooleanQueryRewrite; else,
// ConstantFilterRewrite:
int docCountCutoff = (int)((docCountPercent / 100.0) * reader.MaxDoc);
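            // Respect BooleanQuery's global clause limit as a hard ceiling on
            // how many terms we are willing to collect.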
int termCountLimit = Math.Min(BooleanQuery.MaxClauseCount, termCountCutoff);
CutOffTermCollector col = new CutOffTermCollector(docCountCutoff, termCountLimit);
CollectTerms(reader, query, col);
int size = col.pendingTerms.Count;
if (col.hasCutOff)
{
return MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE.Rewrite(reader, query);
}
else
{
BooleanQuery bq = GetTopLevelQuery();
if (size > 0)
{
BytesRefHash pendingTerms = col.pendingTerms;
int[] sort = pendingTerms.Sort(col.termsEnum.Comparer);
for (int i = 0; i < size; i++)
{
int pos = sort[i];
                        // docFreq is not used for constant score here; we pass 1
                        // explicitly as a placeholder so it is not computed
AddClause(bq, new Term(query.m_field, pendingTerms.Get(pos, new BytesRef())), 1, 1.0f, col.array.termState[pos]);
}
}
// Strip scores
Query result = new ConstantScoreQuery(bq);
result.Boost = query.Boost;
return result;
}
}
internal sealed class CutOffTermCollector : TermCollector
{
internal CutOffTermCollector(int docCountCutoff, int termCountLimit)
{
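                // Collected terms are buffered in a BytesRefHash; 'array' keeps
                // a parallel TermContext slot per term (see TermStateByteStart).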
pendingTerms = new BytesRefHash(new ByteBlockPool(new ByteBlockPool.DirectAllocator()), 16, array);
this.docCountCutoff = docCountCutoff;
this.termCountLimit = termCountLimit;
}
public override void SetNextEnum(TermsEnum termsEnum)
{
this.termsEnum = termsEnum;
}
public override bool Collect(BytesRef bytes)
{
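                // BytesRefHash.Add returns the new slot index, or a negative
                // encoding of the existing slot if this term was already
                // collected (e.g. from a previous segment).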
int pos = pendingTerms.Add(bytes);
docVisitCount += termsEnum.DocFreq;
if (pendingTerms.Count >= termCountLimit || docVisitCount >= docCountCutoff)
{
hasCutOff = true;
return false;
}
TermState termState = termsEnum.GetTermState();
if (Debugging.AssertsEnabled) Debugging.Assert(termState != null);
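                // A negative pos means the term already has a TermContext;
                // decode the slot and merge this segment's state into it.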
if (pos < 0)
{
pos = (-pos) - 1;
array.termState[pos].Register(termState, m_readerContext.Ord, termsEnum.DocFreq, termsEnum.TotalTermFreq);
}
else
{
array.termState[pos] = new TermContext(m_topReaderContext, termState, m_readerContext.Ord, termsEnum.DocFreq, termsEnum.TotalTermFreq);
}
return true;
}
internal int docVisitCount = 0;
internal bool hasCutOff = false;
internal TermsEnum termsEnum;
internal readonly int docCountCutoff, termCountLimit;
internal readonly TermStateByteStart array = new TermStateByteStart(16);
internal BytesRefHash pendingTerms;
}
public override int GetHashCode()
{
const int prime = 1279;
return (int)(prime * termCountCutoff + J2N.BitConversion.DoubleToInt64Bits(docCountPercent));
}
public override bool Equals(object obj)
{
if (this == obj)
{
return true;
}
if (obj == null)
{
return false;
}
if (this.GetType() != obj.GetType())
{
return false;
}
ConstantScoreAutoRewrite other = (ConstantScoreAutoRewrite)obj;
if (other.termCountCutoff != termCountCutoff)
{
return false;
}
if (J2N.BitConversion.DoubleToInt64Bits(other.docCountPercent) != J2N.BitConversion.DoubleToInt64Bits(docCountPercent))
{
return false;
}
return true;
}
        /// <summary>
        /// Special implementation of <see cref="BytesRefHash.BytesStartArray"/>
        /// that keeps parallel arrays for <see cref="TermContext"/>.
        /// </summary>
internal sealed class TermStateByteStart : BytesRefHash.DirectBytesStartArray
{
internal TermContext[] termState;
public TermStateByteStart(int initSize)
: base(initSize)
{
}
public override int[] Init()
{
int[] ord = base.Init();
termState = new TermContext[ArrayUtil.Oversize(ord.Length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
if (Debugging.AssertsEnabled) Debugging.Assert(termState.Length >= ord.Length);
return ord;
}
public override int[] Grow()
{
int[] ord = base.Grow();
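                // Grow the parallel termState array in lock step with the ords
                // array so every hashed term keeps a TermContext slot.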
if (termState.Length < ord.Length)
{
TermContext[] tmpTermState = new TermContext[ArrayUtil.Oversize(ord.Length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
Array.Copy(termState, 0, tmpTermState, 0, termState.Length);
termState = tmpTermState;
}
if (Debugging.AssertsEnabled) Debugging.Assert(termState.Length >= ord.Length);
return ord;
}
public override int[] Clear()
{
termState = null;
return base.Clear();
}
}
}
}