// src/Lucene.Net.Analysis.Common/Analysis/Query/QueryAutoStopWordAnalyzer.cs - lucenenet - Git at Google

 using Lucene.Net.Analysis.Core;
 using Lucene.Net.Analysis.Util;
 using Lucene.Net.Index;
 using Lucene.Net.Util;
 using System.Collections.Generic;
 using System.Linq;

 namespace Lucene.Net.Analysis.Query
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// An <see cref="Analyzer"/> used primarily at query time to wrap another analyzer and provide a layer of protection
     /// which prevents very common words from being passed into queries.
     /// <para>
     /// For very large indexes the cost of reading TermDocs for a very common word can be high. This analyzer was
     /// created after experience with a 38 million doc index which had a term in around 50% of docs and was
     /// causing TermQueries for this term to take 2 seconds.
     /// </para>
     /// <para>
     /// The stop words are computed once, in the constructor, from the document frequencies reported by the
     /// supplied <see cref="IndexReader"/>; no member of this class modifies them afterwards.
     /// </para>
     /// </summary>
     public sealed class QueryAutoStopWordAnalyzer : AnalyzerWrapper
     {
         /// <summary>
         /// The default maximum percentage (40%) of index documents which
         /// can contain a term, after which the term is considered to be a stop word.
         /// </summary>
         public const float defaultMaxDocFreqPercent = 0.4f;

         private readonly Analyzer @delegate;
         private readonly IDictionary<string, HashSet<string>> stopWordsPerField = new Dictionary<string, HashSet<string>>();
         private readonly LuceneVersion matchVersion;

         /// <summary>
         /// Creates a new <see cref="QueryAutoStopWordAnalyzer"/> with stopwords calculated for all
         /// indexed fields from terms with a document frequency percentage greater than
         /// <see cref="defaultMaxDocFreqPercent"/>
         /// </summary>
         /// <param name="matchVersion"> Version to be used in <see cref="StopFilter"/> </param>
         /// <param name="delegate"> <see cref="Analyzer"/> whose <see cref="TokenStream"/> will be filtered </param>
         /// <param name="indexReader"> <see cref="IndexReader"/> to identify the stopwords from </param>
         /// <exception cref="System.IO.IOException"> Can be thrown while reading from the <see cref="IndexReader"/> </exception>
         public QueryAutoStopWordAnalyzer(LuceneVersion matchVersion, Analyzer @delegate, IndexReader indexReader)
             : this(matchVersion, @delegate, indexReader, defaultMaxDocFreqPercent)
         {
         }

         /// <summary>
         /// Creates a new <see cref="QueryAutoStopWordAnalyzer"/> with stopwords calculated for all
         /// indexed fields from terms with a document frequency greater than the given
         /// <paramref name="maxDocFreq"/>
         /// </summary>
         /// <param name="matchVersion"> Version to be used in <see cref="StopFilter"/> </param>
         /// <param name="delegate"> <see cref="Analyzer"/> whose <see cref="TokenStream"/> will be filtered </param>
         /// <param name="indexReader"> <see cref="IndexReader"/> to identify the stopwords from </param>
         /// <param name="maxDocFreq"> Document frequency terms should be above in order to be stopwords </param>
         /// <exception cref="System.IO.IOException"> Can be thrown while reading from the <see cref="IndexReader"/> </exception>
         public QueryAutoStopWordAnalyzer(LuceneVersion matchVersion, Analyzer @delegate, IndexReader indexReader, int maxDocFreq)
             : this(matchVersion, @delegate, indexReader, MultiFields.GetIndexedFields(indexReader), maxDocFreq)
         {
         }

         /// <summary>
         /// Creates a new <see cref="QueryAutoStopWordAnalyzer"/> with stopwords calculated for all
         /// indexed fields from terms with a document frequency percentage greater than
         /// the given <paramref name="maxPercentDocs"/>
         /// </summary>
         /// <param name="matchVersion"> Version to be used in <see cref="StopFilter"/> </param>
         /// <param name="delegate"> <see cref="Analyzer"/> whose <see cref="TokenStream"/> will be filtered </param>
         /// <param name="indexReader"> <see cref="IndexReader"/> to identify the stopwords from </param>
         /// <param name="maxPercentDocs"> The maximum percentage (between 0.0 and 1.0) of index documents which
         ///                      contain a term, after which the word is considered to be a stop word </param>
         /// <exception cref="System.IO.IOException"> Can be thrown while reading from the <see cref="IndexReader"/> </exception>
         public QueryAutoStopWordAnalyzer(LuceneVersion matchVersion, Analyzer @delegate, IndexReader indexReader, float maxPercentDocs)
             : this(matchVersion, @delegate, indexReader, MultiFields.GetIndexedFields(indexReader), maxPercentDocs)
         {
         }

         /// <summary>
         /// Creates a new <see cref="QueryAutoStopWordAnalyzer"/> with stopwords calculated for the
         /// given selection of fields from terms with a document frequency percentage
         /// greater than the given <paramref name="maxPercentDocs"/>
         /// </summary>
         /// <remarks>
         /// The percentage is converted to an absolute document count by multiplying it with
         /// <c>indexReader.NumDocs</c> and truncating the result to an <see cref="int"/>.
         /// </remarks>
         /// <param name="matchVersion"> Version to be used in <see cref="StopFilter"/> </param>
         /// <param name="delegate"> <see cref="Analyzer"/> whose <see cref="TokenStream"/> will be filtered </param>
         /// <param name="indexReader"> <see cref="IndexReader"/> to identify the stopwords from </param>
         /// <param name="fields"> Selection of fields to calculate stopwords for </param>
         /// <param name="maxPercentDocs"> The maximum percentage (between 0.0 and 1.0) of index documents which
         ///                      contain a term, after which the word is considered to be a stop word </param>
         /// <exception cref="System.IO.IOException"> Can be thrown while reading from the <see cref="IndexReader"/> </exception>
         public QueryAutoStopWordAnalyzer(LuceneVersion matchVersion, Analyzer @delegate, IndexReader indexReader, ICollection<string> fields, float maxPercentDocs)
             : this(matchVersion, @delegate, indexReader, fields, (int)(indexReader.NumDocs * maxPercentDocs))
         {
         }

         /// <summary>
         /// Creates a new <see cref="QueryAutoStopWordAnalyzer"/> with stopwords calculated for the
         /// given selection of fields from terms with a document frequency greater than
         /// the given <paramref name="maxDocFreq"/>
         /// </summary>
         /// <param name="matchVersion"> Version to be used in <see cref="StopFilter"/> </param>
         /// <param name="delegate"> <see cref="Analyzer"/> whose <see cref="TokenStream"/> will be filtered </param>
         /// <param name="indexReader"> <see cref="IndexReader"/> to identify the stopwords from </param>
         /// <param name="fields"> Selection of fields to calculate stopwords for </param>
         /// <param name="maxDocFreq"> Document frequency terms should be above in order to be stopwords </param>
         /// <exception cref="System.IO.IOException"> Can be thrown while reading from the <see cref="IndexReader"/> </exception>
         public QueryAutoStopWordAnalyzer(LuceneVersion matchVersion, Analyzer @delegate, IndexReader indexReader, ICollection<string> fields, int maxDocFreq)
             : base(@delegate.Strategy)
         {
             this.matchVersion = matchVersion;
             this.@delegate = @delegate;

             foreach (string field in fields)
             {
                 // Every requested field gets an entry, even when the index has no terms for it.
                 stopWordsPerField[field] = CollectStopWords(indexReader, field, maxDocFreq);
             }
         }

         /// <summary>
         /// Walks all terms of <paramref name="field"/> and returns those whose document frequency
         /// is strictly greater than <paramref name="maxDocFreq"/>.
         /// </summary>
         /// <returns> A (possibly empty) set of stop words; never <c>null</c> </returns>
         private static HashSet<string> CollectStopWords(IndexReader indexReader, string field, int maxDocFreq)
         {
             var stopWords = new HashSet<string>();
             Terms terms = MultiFields.GetTerms(indexReader, field);
             if (terms == null)
             {
                 return stopWords;
             }

             // One scratch buffer is reused for every term of the field.
             CharsRef spare = new CharsRef();
             TermsEnum te = terms.GetIterator(null);
             BytesRef text;
             while ((text = te.Next()) != null)
             {
                 if (te.DocFreq > maxDocFreq)
                 {
                     UnicodeUtil.UTF8toUTF16(text, spare);
                     stopWords.Add(spare.ToString());
                 }
             }
             return stopWords;
         }

         /// <summary>
         /// Returns the wrapped analyzer; the same delegate is used for every field.
         /// </summary>
         protected override Analyzer GetWrappedAnalyzer(string fieldName)
         {
             return @delegate;
         }

         /// <summary>
         /// Appends a <see cref="StopFilter"/> built from the stop words recorded for
         /// <paramref name="fieldName"/>. Fields without recorded stop words are returned unchanged.
         /// </summary>
         protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
         {
             if (!stopWordsPerField.TryGetValue(fieldName, out HashSet<string> stopWords) || stopWords == null)
             {
                 return components;
             }
             var stopFilter = new StopFilter(matchVersion, components.TokenStream, new CharArraySet(matchVersion, stopWords, false));
             return new TokenStreamComponents(components.Tokenizer, stopFilter);
         }

         /// <summary>
         /// Provides information on which stop words have been identified for a field
         /// </summary>
         /// <param name="fieldName"> The field whose stop words, as identified at construction time,
         ///                  will be returned </param>
         /// <returns> the stop words identified for a field, or an empty array when the field
         ///           has no recorded stop words </returns>
         public string[] GetStopWords(string fieldName)
         {
             // The dictionary indexer throws KeyNotFoundException for a missing key, so a
             // field that was never analyzed must be looked up with TryGetValue to reach the
             // empty-array fallback.
             if (stopWordsPerField.TryGetValue(fieldName, out HashSet<string> stopWords) && stopWords != null)
             {
                 return stopWords.ToArray();
             }
             return new string[0];
         }

         /// <summary>
         /// Provides information on which stop words have been identified for all fields
         /// </summary>
         /// <returns> the stop words (as terms) </returns>
         public Term[] GetStopWords()
         {
             var allStopWords = new List<Term>();
             foreach (KeyValuePair<string, HashSet<string>> entry in stopWordsPerField)
             {
                 foreach (string text in entry.Value)
                 {
                     allStopWords.Add(new Term(entry.Key, text));
                 }
             }
             return allStopWords.ToArray();
         }
     }
 }
	using Lucene.Net.Analysis.Core;
	using Lucene.Net.Analysis.Util;
	using Lucene.Net.Index;
	using Lucene.Net.Util;
	using System.Collections.Generic;
	using System.Linq;

	namespace Lucene.Net.Analysis.Query
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// An <see cref="Analyzer"/> used primarily at query time to wrap another analyzer and provide a layer of protection
	/// which prevents very common words from being passed into queries.
	/// <para>
	/// For very large indexes the cost of reading TermDocs for a very common word can be high. This analyzer was
	/// created after experience with a 38 million doc index which had a term in around 50% of docs and was
	/// causing TermQueries for this term to take 2 seconds.
	/// </para>
	/// <para>
	/// The stop words are computed once, in the constructor, from the document frequencies reported by the
	/// supplied <see cref="IndexReader"/>; no member of this class modifies them afterwards.
	/// </para>
	/// </summary>
	public sealed class QueryAutoStopWordAnalyzer : AnalyzerWrapper
	{
		/// <summary>
		/// The default maximum percentage (40%) of index documents which
		/// can contain a term, after which the term is considered to be a stop word.
		/// </summary>
		public const float defaultMaxDocFreqPercent = 0.4f;

		private readonly Analyzer @delegate;
		private readonly IDictionary<string, HashSet<string>> stopWordsPerField = new Dictionary<string, HashSet<string>>();
		private readonly LuceneVersion matchVersion;

		/// <summary>
		/// Creates a new <see cref="QueryAutoStopWordAnalyzer"/> with stopwords calculated for all
		/// indexed fields from terms with a document frequency percentage greater than
		/// <see cref="defaultMaxDocFreqPercent"/>
		/// </summary>
		/// <param name="matchVersion"> Version to be used in <see cref="StopFilter"/> </param>
		/// <param name="delegate"> <see cref="Analyzer"/> whose <see cref="TokenStream"/> will be filtered </param>
		/// <param name="indexReader"> <see cref="IndexReader"/> to identify the stopwords from </param>
		/// <exception cref="System.IO.IOException"> Can be thrown while reading from the <see cref="IndexReader"/> </exception>
		public QueryAutoStopWordAnalyzer(LuceneVersion matchVersion, Analyzer @delegate, IndexReader indexReader)
			: this(matchVersion, @delegate, indexReader, defaultMaxDocFreqPercent)
		{
		}

		/// <summary>
		/// Creates a new <see cref="QueryAutoStopWordAnalyzer"/> with stopwords calculated for all
		/// indexed fields from terms with a document frequency greater than the given
		/// <paramref name="maxDocFreq"/>
		/// </summary>
		/// <param name="matchVersion"> Version to be used in <see cref="StopFilter"/> </param>
		/// <param name="delegate"> <see cref="Analyzer"/> whose <see cref="TokenStream"/> will be filtered </param>
		/// <param name="indexReader"> <see cref="IndexReader"/> to identify the stopwords from </param>
		/// <param name="maxDocFreq"> Document frequency terms should be above in order to be stopwords </param>
		/// <exception cref="System.IO.IOException"> Can be thrown while reading from the <see cref="IndexReader"/> </exception>
		public QueryAutoStopWordAnalyzer(LuceneVersion matchVersion, Analyzer @delegate, IndexReader indexReader, int maxDocFreq)
			: this(matchVersion, @delegate, indexReader, MultiFields.GetIndexedFields(indexReader), maxDocFreq)
		{
		}

		/// <summary>
		/// Creates a new <see cref="QueryAutoStopWordAnalyzer"/> with stopwords calculated for all
		/// indexed fields from terms with a document frequency percentage greater than
		/// the given <paramref name="maxPercentDocs"/>
		/// </summary>
		/// <param name="matchVersion"> Version to be used in <see cref="StopFilter"/> </param>
		/// <param name="delegate"> <see cref="Analyzer"/> whose <see cref="TokenStream"/> will be filtered </param>
		/// <param name="indexReader"> <see cref="IndexReader"/> to identify the stopwords from </param>
		/// <param name="maxPercentDocs"> The maximum percentage (between 0.0 and 1.0) of index documents which
		/// contain a term, after which the word is considered to be a stop word </param>
		/// <exception cref="System.IO.IOException"> Can be thrown while reading from the <see cref="IndexReader"/> </exception>
		public QueryAutoStopWordAnalyzer(LuceneVersion matchVersion, Analyzer @delegate, IndexReader indexReader, float maxPercentDocs)
			: this(matchVersion, @delegate, indexReader, MultiFields.GetIndexedFields(indexReader), maxPercentDocs)
		{
		}

		/// <summary>
		/// Creates a new <see cref="QueryAutoStopWordAnalyzer"/> with stopwords calculated for the
		/// given selection of fields from terms with a document frequency percentage
		/// greater than the given <paramref name="maxPercentDocs"/>
		/// </summary>
		/// <remarks>
		/// The percentage is converted to an absolute document count by multiplying it with
		/// <c>indexReader.NumDocs</c> and truncating the result to an <see cref="int"/>.
		/// </remarks>
		/// <param name="matchVersion"> Version to be used in <see cref="StopFilter"/> </param>
		/// <param name="delegate"> <see cref="Analyzer"/> whose <see cref="TokenStream"/> will be filtered </param>
		/// <param name="indexReader"> <see cref="IndexReader"/> to identify the stopwords from </param>
		/// <param name="fields"> Selection of fields to calculate stopwords for </param>
		/// <param name="maxPercentDocs"> The maximum percentage (between 0.0 and 1.0) of index documents which
		/// contain a term, after which the word is considered to be a stop word </param>
		/// <exception cref="System.IO.IOException"> Can be thrown while reading from the <see cref="IndexReader"/> </exception>
		public QueryAutoStopWordAnalyzer(LuceneVersion matchVersion, Analyzer @delegate, IndexReader indexReader, ICollection<string> fields, float maxPercentDocs)
			: this(matchVersion, @delegate, indexReader, fields, (int)(indexReader.NumDocs * maxPercentDocs))
		{
		}

		/// <summary>
		/// Creates a new <see cref="QueryAutoStopWordAnalyzer"/> with stopwords calculated for the
		/// given selection of fields from terms with a document frequency greater than
		/// the given <paramref name="maxDocFreq"/>
		/// </summary>
		/// <param name="matchVersion"> Version to be used in <see cref="StopFilter"/> </param>
		/// <param name="delegate"> <see cref="Analyzer"/> whose <see cref="TokenStream"/> will be filtered </param>
		/// <param name="indexReader"> <see cref="IndexReader"/> to identify the stopwords from </param>
		/// <param name="fields"> Selection of fields to calculate stopwords for </param>
		/// <param name="maxDocFreq"> Document frequency terms should be above in order to be stopwords </param>
		/// <exception cref="System.IO.IOException"> Can be thrown while reading from the <see cref="IndexReader"/> </exception>
		public QueryAutoStopWordAnalyzer(LuceneVersion matchVersion, Analyzer @delegate, IndexReader indexReader, ICollection<string> fields, int maxDocFreq)
			: base(@delegate.Strategy)
		{
			this.matchVersion = matchVersion;
			this.@delegate = @delegate;

			foreach (string field in fields)
			{
				// Every requested field gets an entry, even when the index has no terms for it.
				stopWordsPerField[field] = CollectStopWords(indexReader, field, maxDocFreq);
			}
		}

		/// <summary>
		/// Walks all terms of <paramref name="field"/> and returns those whose document frequency
		/// is strictly greater than <paramref name="maxDocFreq"/>.
		/// </summary>
		/// <returns> A (possibly empty) set of stop words; never <c>null</c> </returns>
		private static HashSet<string> CollectStopWords(IndexReader indexReader, string field, int maxDocFreq)
		{
			var stopWords = new HashSet<string>();
			Terms terms = MultiFields.GetTerms(indexReader, field);
			if (terms == null)
			{
				return stopWords;
			}

			// One scratch buffer is reused for every term of the field.
			CharsRef spare = new CharsRef();
			TermsEnum te = terms.GetIterator(null);
			BytesRef text;
			while ((text = te.Next()) != null)
			{
				if (te.DocFreq > maxDocFreq)
				{
					UnicodeUtil.UTF8toUTF16(text, spare);
					stopWords.Add(spare.ToString());
				}
			}
			return stopWords;
		}

		/// <summary>
		/// Returns the wrapped analyzer; the same delegate is used for every field.
		/// </summary>
		protected override Analyzer GetWrappedAnalyzer(string fieldName)
		{
			return @delegate;
		}

		/// <summary>
		/// Appends a <see cref="StopFilter"/> built from the stop words recorded for
		/// <paramref name="fieldName"/>. Fields without recorded stop words are returned unchanged.
		/// </summary>
		protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
		{
			if (!stopWordsPerField.TryGetValue(fieldName, out HashSet<string> stopWords) || stopWords == null)
			{
				return components;
			}
			var stopFilter = new StopFilter(matchVersion, components.TokenStream, new CharArraySet(matchVersion, stopWords, false));
			return new TokenStreamComponents(components.Tokenizer, stopFilter);
		}

		/// <summary>
		/// Provides information on which stop words have been identified for a field
		/// </summary>
		/// <param name="fieldName"> The field whose stop words, as identified at construction time,
		/// will be returned </param>
		/// <returns> the stop words identified for a field, or an empty array when the field
		/// has no recorded stop words </returns>
		public string[] GetStopWords(string fieldName)
		{
			// The dictionary indexer throws KeyNotFoundException for a missing key, so a
			// field that was never analyzed must be looked up with TryGetValue to reach the
			// empty-array fallback.
			if (stopWordsPerField.TryGetValue(fieldName, out HashSet<string> stopWords) && stopWords != null)
			{
				return stopWords.ToArray();
			}
			return new string[0];
		}

		/// <summary>
		/// Provides information on which stop words have been identified for all fields
		/// </summary>
		/// <returns> the stop words (as terms) </returns>
		public Term[] GetStopWords()
		{
			var allStopWords = new List<Term>();
			foreach (KeyValuePair<string, HashSet<string>> entry in stopWordsPerField)
			{
				foreach (string text in entry.Value)
				{
					allStopWords.Add(new Term(entry.Key, text));
				}
			}
			return allStopWords.ToArray();
		}
	}
	}