src/contrib/Analyzers/Query/QueryAutoStopWordAnalyzer.cs - lucenenet - Git at Google

 /*
  *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  *
 */

 using System;
 using System.Collections.Generic;
 using System.IO;
 using System.Linq;
 using System.Text;
 using Lucene.Net.Index;
 using Lucene.Net.Support;
 using Lucene.Net.Util;
 using Version = Lucene.Net.Util.Version;

 namespace Lucene.Net.Analysis.Query
 {
 /*
  * An {@link Analyzer} used primarily at query time to wrap another analyzer and provide a layer of protection
  * which prevents very common words from being passed into queries.
  * <p>
  * For very large indexes the cost
  * of reading TermDocs for a very common word can be  high. This analyzer was created after experience with
  * a 38 million doc index which had a term in around 50% of docs and was causing TermQueries for
  * this term to take 2 seconds.
  * </p>
  * <p>
  * Use the various "addStopWords" methods in this class to automate the identification and addition of
  * stop words found in an already existing index.
  * </p>
  */
 public class QueryAutoStopWordAnalyzer : Analyzer {
   Analyzer _delegate;
   HashMap<String,ISet<String>> stopWordsPerField = new HashMap<String,ISet<String>>();
   //The default maximum percentage (40%) of index documents which
   //can contain a term, after which the term is considered to be a stop word.
   public const float defaultMaxDocFreqPercent = 0.4f;
   private readonly Version matchVersion;

   /*
    * Initializes this analyzer with the Analyzer object that actually produces the tokens
    *
    * @param _delegate The choice of {@link Analyzer} that is used to produce the token stream which needs filtering
    */
   public QueryAutoStopWordAnalyzer(Version matchVersion, Analyzer _delegate)
   {
     this._delegate = _delegate;
     SetOverridesTokenStreamMethod<QueryAutoStopWordAnalyzer>();
     this.matchVersion = matchVersion;
   }

   /*
    * Automatically adds stop words for all fields with terms exceeding the defaultMaxDocFreqPercent
    *
    * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
    *               exceed the required document frequency
    * @return The number of stop words identified.
    * @throws IOException
    */
   public int AddStopWords(IndexReader reader)
   {
     return AddStopWords(reader, defaultMaxDocFreqPercent);
   }

   /*
    * Automatically adds stop words for all fields with terms exceeding the maxDocFreqPercent
    *
    * @param reader     The {@link IndexReader} which will be consulted to identify potential stop words that
    *                   exceed the required document frequency
    * @param maxDocFreq The maximum number of index documents which can contain a term, after which
    *                   the term is considered to be a stop word
    * @return The number of stop words identified.
    * @throws IOException
    */
   public int AddStopWords(IndexReader reader, int maxDocFreq)
   {
     int numStopWords = 0;
     ICollection<String> fieldNames = reader.GetFieldNames(IndexReader.FieldOption.INDEXED);
     for (IEnumerator<String> iter = fieldNames.GetEnumerator(); iter.MoveNext();) {
       String fieldName = iter.Current;
       numStopWords += AddStopWords(reader, fieldName, maxDocFreq);
     }
     return numStopWords;
   }

   /*
    * Automatically adds stop words for all fields with terms exceeding the maxDocFreqPercent
    *
    * @param reader        The {@link IndexReader} which will be consulted to identify potential stop words that
    *                      exceed the required document frequency
    * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
    *                      contain a term, after which the word is considered to be a stop word.
    * @return The number of stop words identified.
    * @throws IOException
    */
   public int AddStopWords(IndexReader reader, float maxPercentDocs)
   {
     int numStopWords = 0;
     ICollection<String> fieldNames = reader.GetFieldNames(IndexReader.FieldOption.INDEXED);
     for (IEnumerator<String> iter = fieldNames.GetEnumerator(); iter.MoveNext();) {
       String fieldName = iter.Current;
       numStopWords += AddStopWords(reader, fieldName, maxPercentDocs);
     }
     return numStopWords;
   }

   /*
    * Automatically adds stop words for the given field with terms exceeding the maxPercentDocs
    *
    * @param reader         The {@link IndexReader} which will be consulted to identify potential stop words that
    *                       exceed the required document frequency
    * @param fieldName      The field for which stopwords will be added
    * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
    *                       contain a term, after which the word is considered to be a stop word.
    * @return The number of stop words identified.
    * @throws IOException
    */
   public int AddStopWords(IndexReader reader, String fieldName, float maxPercentDocs)
   {
     return AddStopWords(reader, fieldName, (int) (reader.NumDocs() * maxPercentDocs));
   }

   /*
    * Automatically adds stop words for the given field with terms exceeding the maxPercentDocs
    *
    * @param reader     The {@link IndexReader} which will be consulted to identify potential stop words that
    *                   exceed the required document frequency
    * @param fieldName  The field for which stopwords will be added
    * @param maxDocFreq The maximum number of index documents which
    *                   can contain a term, after which the term is considered to be a stop word.
    * @return The number of stop words identified.
    * @throws IOException
    */
   public int AddStopWords(IndexReader reader, String fieldName, int maxDocFreq)
   {
       var stopWords = Support.Compatibility.SetFactory.CreateHashSet<string>();
     String internedFieldName = StringHelper.Intern(fieldName);
     TermEnum te = reader.Terms(new Term(fieldName));
     Term term = te.Term;
     while (term != null) {
       if (term.Field != internedFieldName) {
         break;
       }
       if (te.DocFreq() > maxDocFreq) {
         stopWords.Add(term.Text);
       }
       if (!te.Next()) {
         break;
       }
       term = te.Term;
     }
     stopWordsPerField.Add(fieldName, stopWords);

     /* if the stopwords for a field are changed,
      * then saved streams for that field are erased.
      */
     IDictionary<String,SavedStreams> streamMap = (IDictionary<String,SavedStreams>) PreviousTokenStream;
     if (streamMap != null)
       streamMap.Remove(fieldName);

     return stopWords.Count;
   }

   public override TokenStream TokenStream(String fieldName, TextReader reader) {
     TokenStream result;
     try {
       result = _delegate.ReusableTokenStream(fieldName, reader);
     } catch (IOException) {
       result = _delegate.TokenStream(fieldName, reader);
     }
     var stopWords = stopWordsPerField[fieldName];
     if (stopWords != null) {
       result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                               result, stopWords);
     }
     return result;
   }

   private class SavedStreams {
     /* the underlying stream */
     protected internal TokenStream Wrapped;

     /*
      * when there are no stopwords for the field, refers to wrapped.
      * if there stopwords, it is a StopFilter around wrapped.
      */
     protected internal TokenStream WithStopFilter;
   };

   public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
 {
     if (overridesTokenStreamMethod) {
       // LUCENE-1678: force fallback to tokenStream() if we
       // have been subclassed and that subclass overrides
       // tokenStream but not reusableTokenStream
       return TokenStream(fieldName, reader);
     }

     /* map of SavedStreams for each field */
     IDictionary<String, SavedStreams> streamMap = (IDictionary<String, SavedStreams>)PreviousTokenStream;
     if (streamMap == null) {
       streamMap = new HashMap<String, SavedStreams>();
       PreviousTokenStream = streamMap;
     }

     SavedStreams streams = streamMap[fieldName];
     if (streams == null) {
       /* an entry for this field does not exist, create one */
       streams = new SavedStreams();
       streamMap.Add(fieldName, streams);
       streams.Wrapped = _delegate.ReusableTokenStream(fieldName, reader);

       /* if there are any stopwords for the field, save the stopfilter */
       var stopWords = stopWordsPerField[fieldName];
       if (stopWords != null)
         streams.WithStopFilter = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                                 streams.Wrapped, stopWords);
       else
         streams.WithStopFilter = streams.Wrapped;

     } else {
       /*
        * an entry for this field exists, verify the wrapped stream has not
        * changed. if it has not, reuse it, otherwise wrap the new stream.
        */
       TokenStream result = _delegate.ReusableTokenStream(fieldName, reader);
       if (result == streams.Wrapped) {
         /* the wrapped analyzer reused the stream */
         streams.WithStopFilter.Reset();
       } else {
         /*
          * the wrapped analyzer did not. if there are any stopwords for the
          * field, create a new StopFilter around the new stream
          */
         streams.Wrapped = result;
         var stopWords = stopWordsPerField[fieldName];
         if (stopWords != null)
           streams.WithStopFilter = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                                   streams.Wrapped, stopWords);
         else
           streams.WithStopFilter = streams.Wrapped;
       }
     }

     return streams.WithStopFilter;
   }

   /*
    * Provides information on which stop words have been identified for a field
    *
    * @param fieldName The field for which stop words identified in "addStopWords"
    *                  method calls will be returned
    * @return the stop words identified for a field
    */
   public String[] GetStopWords(String fieldName) {
     String[] result;
     var stopWords = stopWordsPerField[fieldName];
     if (stopWords != null) {
       result = stopWords.ToArray();
     } else {
       result = new String[0];
     }
     return result;
   }

   /*
    * Provides information on which stop words have been identified for all fields
    *
    * @return the stop words (as terms)
    */
   public Term[] GetStopWords() {
     List<Term> allStopWords = new List<Term>();
     foreach(var fieldName in stopWordsPerField.Keys)
     {
       var stopWords = stopWordsPerField[fieldName];
       foreach(var text in stopWords) {
         allStopWords.Add(new Term(fieldName, text));
       }
     }
     return allStopWords.ToArray();
     }

 }
 }
	/*
	*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*
	*/

	using System;
	using System.Collections.Generic;
	using System.IO;
	using System.Linq;
	using System.Text;
	using Lucene.Net.Index;
	using Lucene.Net.Support;
	using Lucene.Net.Util;
	using Version = Lucene.Net.Util.Version;

	namespace Lucene.Net.Analysis.Query
	{
	/*
	* An {@link Analyzer} used primarily at query time to wrap another analyzer and provide a layer of protection
	* which prevents very common words from being passed into queries.
	* <p>
	* For very large indexes the cost
	* of reading TermDocs for a very common word can be high. This analyzer was created after experience with
	* a 38 million doc index which had a term in around 50% of docs and was causing TermQueries for
	* this term to take 2 seconds.
	* </p>
	* <p>
	* Use the various "addStopWords" methods in this class to automate the identification and addition of
	* stop words found in an already existing index.
	* </p>
	*/
	public class QueryAutoStopWordAnalyzer : Analyzer {
	Analyzer _delegate;
	HashMap<String,ISet<String>> stopWordsPerField = new HashMap<String,ISet<String>>();
	//The default maximum percentage (40%) of index documents which
	//can contain a term, after which the term is considered to be a stop word.
	public const float defaultMaxDocFreqPercent = 0.4f;
	private readonly Version matchVersion;

	/*
	* Initializes this analyzer with the Analyzer object that actually produces the tokens
	*
	* @param _delegate The choice of {@link Analyzer} that is used to produce the token stream which needs filtering
	*/
	public QueryAutoStopWordAnalyzer(Version matchVersion, Analyzer _delegate)
	{
	this._delegate = _delegate;
	SetOverridesTokenStreamMethod<QueryAutoStopWordAnalyzer>();
	this.matchVersion = matchVersion;
	}

	/*
	* Automatically adds stop words for all fields with terms exceeding the defaultMaxDocFreqPercent
	*
	* @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
	* exceed the required document frequency
	* @return The number of stop words identified.
	* @throws IOException
	*/
	public int AddStopWords(IndexReader reader)
	{
	return AddStopWords(reader, defaultMaxDocFreqPercent);
	}

	/*
	* Automatically adds stop words for all fields with terms exceeding the maxDocFreqPercent
	*
	* @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
	* exceed the required document frequency
	* @param maxDocFreq The maximum number of index documents which can contain a term, after which
	* the term is considered to be a stop word
	* @return The number of stop words identified.
	* @throws IOException
	*/
	public int AddStopWords(IndexReader reader, int maxDocFreq)
	{
	int numStopWords = 0;
	ICollection<String> fieldNames = reader.GetFieldNames(IndexReader.FieldOption.INDEXED);
	for (IEnumerator<String> iter = fieldNames.GetEnumerator(); iter.MoveNext();) {
	String fieldName = iter.Current;
	numStopWords += AddStopWords(reader, fieldName, maxDocFreq);
	}
	return numStopWords;
	}

	/*
	* Automatically adds stop words for all fields with terms exceeding the maxDocFreqPercent
	*
	* @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
	* exceed the required document frequency
	* @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
	* contain a term, after which the word is considered to be a stop word.
	* @return The number of stop words identified.
	* @throws IOException
	*/
	public int AddStopWords(IndexReader reader, float maxPercentDocs)
	{
	int numStopWords = 0;
	ICollection<String> fieldNames = reader.GetFieldNames(IndexReader.FieldOption.INDEXED);
	for (IEnumerator<String> iter = fieldNames.GetEnumerator(); iter.MoveNext();) {
	String fieldName = iter.Current;
	numStopWords += AddStopWords(reader, fieldName, maxPercentDocs);
	}
	return numStopWords;
	}

	/*
	* Automatically adds stop words for the given field with terms exceeding the maxPercentDocs
	*
	* @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
	* exceed the required document frequency
	* @param fieldName The field for which stopwords will be added
	* @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
	* contain a term, after which the word is considered to be a stop word.
	* @return The number of stop words identified.
	* @throws IOException
	*/
	public int AddStopWords(IndexReader reader, String fieldName, float maxPercentDocs)
	{
	return AddStopWords(reader, fieldName, (int) (reader.NumDocs() * maxPercentDocs));
	}

	/*
	* Automatically adds stop words for the given field with terms exceeding the maxPercentDocs
	*
	* @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
	* exceed the required document frequency
	* @param fieldName The field for which stopwords will be added
	* @param maxDocFreq The maximum number of index documents which
	* can contain a term, after which the term is considered to be a stop word.
	* @return The number of stop words identified.
	* @throws IOException
	*/
	public int AddStopWords(IndexReader reader, String fieldName, int maxDocFreq)
	{
	var stopWords = Support.Compatibility.SetFactory.CreateHashSet<string>();
	String internedFieldName = StringHelper.Intern(fieldName);
	TermEnum te = reader.Terms(new Term(fieldName));
	Term term = te.Term;
	while (term != null) {
	if (term.Field != internedFieldName) {
	break;
	}
	if (te.DocFreq() > maxDocFreq) {
	stopWords.Add(term.Text);
	}
	if (!te.Next()) {
	break;
	}
	term = te.Term;
	}
	stopWordsPerField.Add(fieldName, stopWords);

	/* if the stopwords for a field are changed,
	* then saved streams for that field are erased.
	*/
	IDictionary<String,SavedStreams> streamMap = (IDictionary<String,SavedStreams>) PreviousTokenStream;
	if (streamMap != null)
	streamMap.Remove(fieldName);

	return stopWords.Count;
	}

	public override TokenStream TokenStream(String fieldName, TextReader reader) {
	TokenStream result;
	try {
	result = _delegate.ReusableTokenStream(fieldName, reader);
	} catch (IOException) {
	result = _delegate.TokenStream(fieldName, reader);
	}
	var stopWords = stopWordsPerField[fieldName];
	if (stopWords != null) {
	result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
	result, stopWords);
	}
	return result;
	}

	private class SavedStreams {
	/* the underlying stream */
	protected internal TokenStream Wrapped;

	/*
	* when there are no stopwords for the field, refers to wrapped.
	* if there stopwords, it is a StopFilter around wrapped.
	*/
	protected internal TokenStream WithStopFilter;
	};

	public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
	{
	if (overridesTokenStreamMethod) {
	// LUCENE-1678: force fallback to tokenStream() if we
	// have been subclassed and that subclass overrides
	// tokenStream but not reusableTokenStream
	return TokenStream(fieldName, reader);
	}

	/* map of SavedStreams for each field */
	IDictionary<String, SavedStreams> streamMap = (IDictionary<String, SavedStreams>)PreviousTokenStream;
	if (streamMap == null) {
	streamMap = new HashMap<String, SavedStreams>();
	PreviousTokenStream = streamMap;
	}

	SavedStreams streams = streamMap[fieldName];
	if (streams == null) {
	/* an entry for this field does not exist, create one */
	streams = new SavedStreams();
	streamMap.Add(fieldName, streams);
	streams.Wrapped = _delegate.ReusableTokenStream(fieldName, reader);

	/* if there are any stopwords for the field, save the stopfilter */
	var stopWords = stopWordsPerField[fieldName];
	if (stopWords != null)
	streams.WithStopFilter = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
	streams.Wrapped, stopWords);
	else
	streams.WithStopFilter = streams.Wrapped;

	} else {
	/*
	* an entry for this field exists, verify the wrapped stream has not
	* changed. if it has not, reuse it, otherwise wrap the new stream.
	*/
	TokenStream result = _delegate.ReusableTokenStream(fieldName, reader);
	if (result == streams.Wrapped) {
	/* the wrapped analyzer reused the stream */
	streams.WithStopFilter.Reset();
	} else {
	/*
	* the wrapped analyzer did not. if there are any stopwords for the
	* field, create a new StopFilter around the new stream
	*/
	streams.Wrapped = result;
	var stopWords = stopWordsPerField[fieldName];
	if (stopWords != null)
	streams.WithStopFilter = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
	streams.Wrapped, stopWords);
	else
	streams.WithStopFilter = streams.Wrapped;
	}
	}

	return streams.WithStopFilter;
	}

	/*
	* Provides information on which stop words have been identified for a field
	*
	* @param fieldName The field for which stop words identified in "addStopWords"
	* method calls will be returned
	* @return the stop words identified for a field
	*/
	public String[] GetStopWords(String fieldName) {
	String[] result;
	var stopWords = stopWordsPerField[fieldName];
	if (stopWords != null) {
	result = stopWords.ToArray();
	} else {
	result = new String[0];
	}
	return result;
	}

	/*
	* Provides information on which stop words have been identified for all fields
	*
	* @return the stop words (as terms)
	*/
	public Term[] GetStopWords() {
	List<Term> allStopWords = new List<Term>();
	foreach(var fieldName in stopWordsPerField.Keys)
	{
	var stopWords = stopWordsPerField[fieldName];
	foreach(var text in stopWords) {
	allStopWords.Add(new Term(fieldName, text));
	}
	}
	return allStopWords.ToArray();
	}

	}
	}