| using J2N.Collections.Generic.Extensions; |
| using Lucene.Net.Analysis; |
| using Lucene.Net.Analysis.TokenAttributes; |
| using Lucene.Net.Documents; |
| using Lucene.Net.Index; |
| using Lucene.Net.Search; |
| using Lucene.Net.Search.Similarities; |
| using Lucene.Net.Support; |
| using Lucene.Net.Util; |
| using System; |
| using System.Collections.Generic; |
| using System.Diagnostics.CodeAnalysis; |
| using System.IO; |
| using System.Text; |
| |
| namespace Lucene.Net.Queries.Mlt |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /// <summary> |
| /// Generate "more like this" similarity queries. |
| /// Based on this mail: |
| /// <code> |
| /// Lucene does let you access the document frequency of terms, with <see cref="IndexReader.DocFreq"/>. |
| /// Term frequencies can be computed by re-tokenizing the text, which, for a single document, |
| /// is usually fast enough. But looking up the <see cref="IndexReader.DocFreq"/> of every term in the document is |
| /// probably too slow. |
| /// <para/> |
| /// You can use some heuristics to prune the set of terms, to avoid calling <see cref="IndexReader.DocFreq"/> too much, |
| /// or at all. Since you're trying to maximize a tf*idf score, you're probably most interested |
| /// in terms with a high tf. Choosing a tf threshold even as low as two or three will radically |
| /// reduce the number of terms under consideration. Another heuristic is that terms with a |
| /// high idf (i.e., a low df) tend to be longer. So you could threshold the terms by the |
| /// number of characters, not selecting anything less than, e.g., six or seven characters. |
| /// With these sorts of heuristics you can usually find small set of, e.g., ten or fewer terms |
| /// that do a pretty good job of characterizing a document. |
| /// <para/> |
    /// It all depends on what you're trying to do. If you're trying to eke out that last percent
| /// of precision and recall regardless of computational difficulty so that you can win a TREC |
| /// competition, then the techniques I mention above are useless. But if you're trying to |
| /// provide a "more like this" button on a search results page that does a decent job and has |
| /// good performance, such techniques might be useful. |
| /// <para/> |
| /// An efficient, effective "more-like-this" query generator would be a great contribution, if |
| /// anyone's interested. I'd imagine that it would take a Reader or a String (the document's |
| /// text), analyzer Analyzer, and return a set of representative terms using heuristics like those |
| /// above. The frequency and length thresholds could be parameters, etc. |
| /// <para/> |
| /// Doug |
| /// </code> |
| /// <para/> |
| /// <para/> |
| /// <para/> |
| /// <b>Initial Usage</b> |
| /// <para/> |
| /// This class has lots of options to try to make it efficient and flexible. |
| /// The simplest possible usage is as follows. The bold |
| /// fragment is specific to this class. |
| /// <para/> |
| /// <code> |
| /// IndexReader ir = ... |
| /// IndexSearcher is = ... |
| /// |
| /// MoreLikeThis mlt = new MoreLikeThis(ir); |
| /// TextReader target = ... // orig source of doc you want to find similarities to |
| /// Query query = mlt.Like(target); |
| /// |
| /// Hits hits = is.Search(query); |
| /// // now the usual iteration thru 'hits' - the only thing to watch for is to make sure |
| /// //you ignore the doc if it matches your 'target' document, as it should be similar to itself |
| /// </code> |
| /// <para/> |
| /// Thus you: |
| /// <list type="bullet"> |
| /// <item><description>do your normal, Lucene setup for searching,</description></item> |
| /// <item><description>create a MoreLikeThis,</description></item> |
| /// <item><description>get the text of the doc you want to find similarities to</description></item> |
| /// <item><description>then call one of the <see cref="Like(TextReader, string)"/> calls to generate a similarity query</description></item> |
| /// <item><description>call the searcher to find the similar docs</description></item> |
| /// </list> |
| /// <para/> |
| /// <b>More Advanced Usage</b> |
| /// <para/> |
| /// You may want to use the setter for <see cref="FieldNames"/> so you can examine |
| /// multiple fields (e.g. body and title) for similarity. |
| /// <para/> |
| /// <para/> |
| /// Depending on the size of your index and the size and makeup of your documents you |
| /// may want to call the other set methods to control how the similarity queries are |
| /// generated: |
| /// <list type="bullet"> |
| /// <item><description><see cref="MinTermFreq"/></description></item> |
| /// <item><description><see cref="MinDocFreq"/></description></item> |
| /// <item><description><see cref="MaxDocFreq"/></description></item> |
| /// <item><description><see cref="SetMaxDocFreqPct(int)"/></description></item> |
| /// <item><description><see cref="MinWordLen"/></description></item> |
| /// <item><description><see cref="MaxWordLen"/></description></item> |
| /// <item><description><see cref="MaxQueryTerms"/></description></item> |
| /// <item><description><see cref="MaxNumTokensParsed"/></description></item> |
| /// <item><description><see cref="StopWords"/></description></item> |
| /// </list> |
| /// </summary> |
| /// <remarks> |
| /// Changes: Mark Harwood 29/02/04 |
| /// Some bugfixing, some refactoring, some optimisation. |
| /// - bugfix: retrieveTerms(int docNum) was not working for indexes without a termvector -added missing code |
| /// - bugfix: No significant terms being created for fields with a termvector - because |
| /// was only counting one occurrence per term/field pair in calculations(ie not including frequency info from TermVector) |
| /// - refactor: moved common code into isNoiseWord() |
| /// - optimise: when no termvector support available - used maxNumTermsParsed to limit amount of tokenization |
| /// </remarks> |
| public sealed class MoreLikeThis |
| { |
        /// <summary>
        /// Default maximum number of tokens to parse in each example doc field that is not stored with TermVector support.
        /// </summary>
        /// <seealso cref="MaxNumTokensParsed"/>
        public static readonly int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000;

        /// <summary>
        /// Ignore terms with less than this frequency in the source doc.
        /// </summary>
        /// <seealso cref="MinTermFreq"/>
        public static readonly int DEFAULT_MIN_TERM_FREQ = 2;

        /// <summary>
        /// Ignore words which do not occur in at least this many docs.
        /// </summary>
        /// <seealso cref="MinDocFreq"/>
        public static readonly int DEFAULT_MIN_DOC_FREQ = 5;

        /// <summary>
        /// Ignore words which occur in more than this many docs.
        /// </summary>
        /// <seealso cref="MaxDocFreq"/>
        /// <seealso cref="SetMaxDocFreqPct(int)"/>
        public static readonly int DEFAULT_MAX_DOC_FREQ = int.MaxValue;

        /// <summary>
        /// Boost terms in query based on score.
        /// </summary>
        /// <seealso cref="ApplyBoost"/>
        public static readonly bool DEFAULT_BOOST = false;

        /// <summary>
        /// Default field names ("contents"). Note that this default is NOT null; setting
        /// <see cref="FieldNames"/> to null is what specifies that the field names should be
        /// looked up at runtime from the provided reader.
        /// </summary>
        public static readonly string[] DEFAULT_FIELD_NAMES = new string[] { "contents" };

        /// <summary>
        /// Ignore words less than this length or if 0 then this has no effect.
        /// </summary>
        /// <seealso cref="MinWordLen"/>
        public static readonly int DEFAULT_MIN_WORD_LENGTH = 0;

        /// <summary>
        /// Ignore words greater than this length or if 0 then this has no effect.
        /// </summary>
        /// <seealso cref="MaxWordLen"/>
        public static readonly int DEFAULT_MAX_WORD_LENGTH = 0;

        /// <summary>
        /// Default set of stopwords.
        /// If null means to allow stop words.
        /// </summary>
        /// <seealso cref="StopWords"/>
        public static readonly ISet<string> DEFAULT_STOP_WORDS = null;

        /// <summary>
        /// Return a Query with no more than this many terms.
        /// </summary>
        /// <seealso cref="BooleanQuery.MaxClauseCount"/>
        /// <seealso cref="MaxQueryTerms"/>
        public static readonly int DEFAULT_MAX_QUERY_TERMS = 25;

        // LUCENENET NOTE: The following fields were made into auto-implemented properties:
        // analyzer, minTermFreq, minDocFreq, maxDocFreq, boost,
        // fieldNames, maxNumTokensParsed, minWordLen, maxWordLen,
        // maxQueryTerms, similarity

        /// <summary>
        /// <see cref="IndexReader"/> to use for docFreq/numDocs lookups; supplied via the constructor.
        /// </summary>
        private readonly IndexReader ir;

        /// <summary>
        /// Boost factor to use when boosting the terms (exposed as <see cref="BoostFactor"/>;
        /// only applied when <see cref="ApplyBoost"/> is true).
        /// </summary>
        private float boostFactor = 1;
| |
| /// <summary> |
| /// Gets or Sets the boost factor used when boosting terms |
| /// </summary> |
| public float BoostFactor |
| { |
| get => boostFactor; |
| set => this.boostFactor = value; |
| } |
| |
| |
        /// <summary>
        /// Constructor requiring an <see cref="IndexReader"/>.
        /// Uses a <see cref="DefaultSimilarity"/> for the idf() calculations.
        /// </summary>
        /// <param name="ir"> the index reader used for document frequency lookups </param>
        public MoreLikeThis(IndexReader ir)
            : this(ir, new DefaultSimilarity())
        {
        }
| |
        /// <summary>
        /// Constructor requiring an <see cref="IndexReader"/> and the <see cref="TFIDFSimilarity"/>
        /// to use for the idf() calculations. All tuning knobs start at their documented defaults.
        /// </summary>
        /// <param name="ir"> the index reader used for document frequency lookups </param>
        /// <param name="sim"> the similarity whose <c>Idf</c> method is used when scoring terms </param>
        public MoreLikeThis(IndexReader ir, TFIDFSimilarity sim)
        {
            this.ir = ir;
            this.Similarity = sim;

            // LUCENENET specific: Set Defaults
            StopWords = DEFAULT_STOP_WORDS;
            MinTermFreq = DEFAULT_MIN_TERM_FREQ;
            MinDocFreq = DEFAULT_MIN_DOC_FREQ;
            MaxDocFreq = DEFAULT_MAX_DOC_FREQ;
            ApplyBoost = DEFAULT_BOOST;
            FieldNames = DEFAULT_FIELD_NAMES;
            MaxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED;
            MinWordLen = DEFAULT_MIN_WORD_LENGTH;
            MaxWordLen = DEFAULT_MAX_WORD_LENGTH;
            MaxQueryTerms = DEFAULT_MAX_QUERY_TERMS;
        }
| |
        /// <summary>
        /// For idf() calculations. Set via the constructor; defaults to <see cref="DefaultSimilarity"/>.
        /// </summary>
        public TFIDFSimilarity Similarity { get; set; }


        /// <summary>
        /// Gets or Sets an analyzer that will be used to parse source doc with. The default analyzer
        /// is not set. An analyzer is not required for generating a query with the
        /// <see cref="Like(int)"/> method, all other 'like' methods require an analyzer.
        /// </summary>
        public Analyzer Analyzer { get; set; }


        /// <summary>
        /// Gets or Sets the frequency below which terms will be ignored in the source doc. The default
        /// frequency is the <see cref="DEFAULT_MIN_TERM_FREQ"/>.
        /// </summary>
        public int MinTermFreq { get; set; }


        /// <summary>
        /// Gets or Sets the frequency at which words will be ignored which do not occur in at least this
        /// many docs. The default frequency is <see cref="DEFAULT_MIN_DOC_FREQ"/>.
        /// </summary>
        public int MinDocFreq { get; set; }


        /// <summary>
        /// Gets or Sets the maximum frequency in which words may still appear.
        /// Words that appear in more than this many docs will be ignored. The default frequency is
        /// <see cref="DEFAULT_MAX_DOC_FREQ"/> (i.e. effectively no upper bound).
        /// </summary>
        public int MaxDocFreq { get; set; }
| |
| |
| /// <summary> |
| /// Set the maximum percentage in which words may still appear. Words that appear |
| /// in more than this many percent of all docs will be ignored. |
| /// </summary> |
| /// <param name="maxPercentage"> the maximum percentage of documents (0-100) that a term may appear |
| /// in to be still considered relevant </param> |
| public void SetMaxDocFreqPct(int maxPercentage) |
| { |
| this.MaxDocFreq = maxPercentage * ir.NumDocs / 100; |
| } |
| |
| |
        /// <summary>
        /// Gets or Sets whether to boost terms in query based on "score" or not. The default is
        /// <see cref="DEFAULT_BOOST"/>.
        /// </summary>
        public bool ApplyBoost { get; set; }


        /// <summary>
        /// Gets or Sets the field names that will be used when generating the 'More Like This' query.
        /// The default field names that will be used is <see cref="DEFAULT_FIELD_NAMES"/>.
        /// Set this to null for the field names to be determined at runtime from the <see cref="IndexReader"/>
        /// provided in the constructor.
        /// </summary>
        [WritableArray]
        [SuppressMessage("Microsoft.Performance", "CA1819", Justification = "Lucene's design requires some writable array properties")]
        public string[] FieldNames { get; set; }


        /// <summary>
        /// Gets or Sets the minimum word length below which words will be ignored. Set this to 0 for no
        /// minimum word length. The default is <see cref="DEFAULT_MIN_WORD_LENGTH"/>.
        /// </summary>
        public int MinWordLen { get; set; }


        /// <summary>
        /// Gets or Sets the maximum word length above which words will be ignored. Set this to 0 for no
        /// maximum word length. The default is <see cref="DEFAULT_MAX_WORD_LENGTH"/>.
        /// </summary>
        public int MaxWordLen { get; set; }


        /// <summary>
        /// Gets or Sets the set of stopwords.
        /// Any word in this set is considered "uninteresting" and ignored.
        /// Even if your <see cref="Analysis.Analyzer"/> allows stopwords, you might want to tell the <see cref="MoreLikeThis"/> code to ignore them, as
        /// for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting".
        /// </summary>
        public ISet<string> StopWords { get; set; }

        /// <summary>
        /// Gets or Sets the maximum number of query terms that will be included in any generated query.
        /// The default is <see cref="DEFAULT_MAX_QUERY_TERMS"/>.
        /// </summary>
        public int MaxQueryTerms { get; set; }


        /// <summary>
        /// Gets or Sets the maximum number of tokens to parse in each example doc field that is not
        /// stored with TermVector support. The default is <see cref="DEFAULT_MAX_NUM_TOKENS_PARSED"/>.
        /// </summary>
        /// <seealso cref="DEFAULT_MAX_NUM_TOKENS_PARSED"/>
        public int MaxNumTokensParsed { get; set; }
| |
| |
| |
| /// <summary> |
| /// Return a query that will return docs like the passed lucene document ID. |
| /// </summary> |
| /// <param name="docNum"> the documentID of the lucene doc to generate the 'More Like This" query for. </param> |
| /// <returns> a query that will return docs like the passed lucene document ID. </returns> |
| public Query Like(int docNum) |
| { |
| if (FieldNames == null) |
| { |
| // gather list of valid fields from lucene |
| ICollection<string> fields = MultiFields.GetIndexedFields(ir); |
| FieldNames = fields.ToArray(); |
| } |
| |
| return CreateQuery(RetrieveTerms(docNum)); |
| } |
| |
| /// <summary> |
| /// Return a query that will return docs like the passed <see cref="TextReader"/>. |
| /// </summary> |
| /// <returns> a query that will return docs like the passed <see cref="TextReader"/>. </returns> |
| public Query Like(TextReader r, string fieldName) |
| { |
| return CreateQuery(RetrieveTerms(r, fieldName)); |
| } |
| |
| /// <summary> |
| /// Create the More like query from a <see cref="T:Util.PriorityQueue{object[]}"/> |
| /// </summary> |
| private Query CreateQuery(Util.PriorityQueue<object[]> q) |
| { |
| BooleanQuery query = new BooleanQuery(); |
| object cur; |
| int qterms = 0; |
| float bestScore = 0; |
| |
| while ((cur = q.Pop()) != null) |
| { |
| var ar = (object[])cur; |
| var tq = new TermQuery(new Term((string)ar[1], (string)ar[0])); |
| |
| if (ApplyBoost) |
| { |
| if (qterms == 0) |
| { |
| bestScore = ((float)ar[2]); |
| } |
| float myScore = ((float)ar[2]); |
| |
| tq.Boost = boostFactor * myScore / bestScore; |
| } |
| |
| try |
| { |
| query.Add(tq, Occur.SHOULD); |
| } |
| catch (BooleanQuery.TooManyClausesException) |
| { |
| break; |
| } |
| |
| qterms++; |
| if (MaxQueryTerms > 0 && qterms >= MaxQueryTerms) |
| { |
| break; |
| } |
| } |
| |
| return query; |
| } |
| |
| /// <summary> |
| /// Create a <see cref="T:Util.PriorityQueue{object[]}"/> from a word->tf map. |
| /// </summary> |
| /// <param name="words"> a map of words keyed on the word(<see cref="string"/>) with <see cref="Int32"/> objects as the values. </param> |
| /// <exception cref="IOException"/> |
| private Util.PriorityQueue<object[]> CreateQueue(IDictionary<string, Int32> words) |
| { |
| // have collected all words in doc and their freqs |
| int numDocs = ir.NumDocs; |
| FreqQ res = new FreqQ(words.Count); // will order words by score |
| |
| foreach (string word in words.Keys) // for every word |
| { |
| int tf = words[word].x; // term freq in the source doc |
| if (MinTermFreq > 0 && tf < MinTermFreq) |
| { |
| continue; // filter out words that don't occur enough times in the source |
| } |
| |
| // go through all the fields and find the largest document frequency |
| string topField = FieldNames[0]; |
| int docFreq = 0; |
| foreach (string fieldName in FieldNames) |
| { |
| int freq = ir.DocFreq(new Term(fieldName, word)); |
| topField = (freq > docFreq) ? fieldName : topField; |
| docFreq = (freq > docFreq) ? freq : docFreq; |
| } |
| |
| if (MinDocFreq > 0 && docFreq < MinDocFreq) |
| { |
| continue; // filter out words that don't occur in enough docs |
| } |
| |
| if (docFreq > MaxDocFreq) |
| { |
| continue; // filter out words that occur in too many docs |
| } |
| |
| if (docFreq == 0) |
| { |
| continue; // index update problem? |
| } |
| |
| float idf = Similarity.Idf(docFreq, numDocs); |
| float score = tf * idf; |
| |
| // only really need 1st 3 entries, other ones are for troubleshooting |
| res.InsertWithOverflow(new object[] { word, topField, score, idf, docFreq, tf }); // freq in all docs - idf - overall score - the top field - the word |
| } |
| return res; |
| } |
| |
| /// <summary> |
| /// Describe the parameters that control how the "more like this" query is formed. |
| /// </summary> |
| public string DescribeParams() |
| { |
| StringBuilder sb = new StringBuilder(); |
| sb.Append("\t").Append("maxQueryTerms : ").Append(MaxQueryTerms).Append("\n"); |
| sb.Append("\t").Append("minWordLen : ").Append(MinWordLen).Append("\n"); |
| sb.Append("\t").Append("maxWordLen : ").Append(MaxWordLen).Append("\n"); |
| sb.Append("\t").Append("fieldNames : "); |
| string delim = ""; |
| foreach (string fieldName in FieldNames) |
| { |
| sb.Append(delim).Append(fieldName); |
| delim = ", "; |
| } |
| sb.Append("\n"); |
| sb.Append("\t").Append("boost : ").Append(ApplyBoost).Append("\n"); |
| sb.Append("\t").Append("minTermFreq : ").Append(MinTermFreq).Append("\n"); |
| sb.Append("\t").Append("minDocFreq : ").Append(MinDocFreq).Append("\n"); |
| return sb.ToString(); |
| } |
| |
| /// <summary> |
| /// Find words for a more-like-this query former. |
| /// </summary> |
| /// <param name="docNum"> the id of the lucene document from which to find terms </param> |
| /// <exception cref="IOException"/> |
| public Util.PriorityQueue<object[]> RetrieveTerms(int docNum) |
| { |
| IDictionary<string, Int32> termFreqMap = new Dictionary<string, Int32>(); |
| foreach (string fieldName in FieldNames) |
| { |
| Fields vectors = ir.GetTermVectors(docNum); |
| Terms vector; |
| if (vectors != null) |
| { |
| vector = vectors.GetTerms(fieldName); |
| } |
| else |
| { |
| vector = null; |
| } |
| |
| // field does not store term vector info |
| if (vector == null) |
| { |
| Document d = ir.Document(docNum); |
| IIndexableField[] fields = d.GetFields(fieldName); |
| foreach (IIndexableField field in fields) |
| { |
| string stringValue = field.GetStringValue(); |
| if (stringValue != null) |
| { |
| AddTermFrequencies(new StringReader(stringValue), termFreqMap, fieldName); |
| } |
| } |
| } |
| else |
| { |
| AddTermFrequencies(termFreqMap, vector); |
| } |
| } |
| |
| return CreateQueue(termFreqMap); |
| } |
| |
| /// <summary> |
| /// Adds terms and frequencies found in vector into the <see cref="T:IDictionary{string, Int}"/> <paramref name="termFreqMap"/> |
| /// </summary> |
| /// <param name="termFreqMap"> a <see cref="T:IDictionary{string, Int}"/> of terms and their frequencies </param> |
| /// <param name="vector"> List of terms and their frequencies for a doc/field </param> |
| private void AddTermFrequencies(IDictionary<string, Int32> termFreqMap, Terms vector) |
| { |
| var termsEnum = vector.GetEnumerator(); |
| var spare = new CharsRef(); |
| BytesRef text; |
| while (termsEnum.MoveNext()) |
| { |
| text = termsEnum.Term; |
| UnicodeUtil.UTF8toUTF16(text, spare); |
| var term = spare.ToString(); |
| if (IsNoiseWord(term)) |
| { |
| continue; |
| } |
| var freq = (int)termsEnum.TotalTermFreq; |
| |
| // increment frequency |
| if (!termFreqMap.TryGetValue(term, out Int32 cnt)) |
| { |
| cnt = new Int32(); |
| termFreqMap[term] = cnt; |
| cnt.x = freq; |
| } |
| else |
| { |
| cnt.x += freq; |
| } |
| } |
| } |
| |
| /// <summary> |
| /// Adds term frequencies found by tokenizing text from reader into the <see cref="T:IDictionary{string, Int}"/> words |
| /// </summary> |
| /// <param name="r"> a source of text to be tokenized </param> |
| /// <param name="termFreqMap"> a <see cref="T:IDictionary{string, Int}"/> of terms and their frequencies </param> |
| /// <param name="fieldName"> Used by analyzer for any special per-field analysis </param> |
| private void AddTermFrequencies(TextReader r, IDictionary<string, Int32> termFreqMap, string fieldName) |
| { |
| if (Analyzer == null) |
| { |
| throw new NotSupportedException("To use MoreLikeThis without " + "term vectors, you must provide an Analyzer"); |
| } |
| var ts = Analyzer.GetTokenStream(fieldName, r); |
| try |
| { |
| int tokenCount = 0; |
| // for every token |
| var termAtt = ts.AddAttribute<ICharTermAttribute>(); |
| ts.Reset(); |
| while (ts.IncrementToken()) |
| { |
| string word = termAtt.ToString(); |
| tokenCount++; |
| if (tokenCount > MaxNumTokensParsed) |
| { |
| break; |
| } |
| if (IsNoiseWord(word)) |
| { |
| continue; |
| } |
| |
| // increment frequency |
| if (!termFreqMap.TryGetValue(word, out Int32 cnt)) |
| { |
| termFreqMap[word] = new Int32(); |
| } |
| else |
| { |
| cnt.x++; |
| } |
| } |
| ts.End(); |
| } |
| finally |
| { |
| IOUtils.DisposeWhileHandlingException(ts); |
| } |
| } |
| |
| |
| /// <summary> |
| /// determines if the passed term is likely to be of interest in "more like" comparisons |
| /// </summary> |
| /// <param name="term"> The word being considered </param> |
| /// <returns> <c>true</c> if should be ignored, <c>false</c> if should be used in further analysis </returns> |
| private bool IsNoiseWord(string term) |
| { |
| int len = term.Length; |
| if (MinWordLen > 0 && len < MinWordLen) |
| { |
| return true; |
| } |
| if (MaxWordLen > 0 && len > MaxWordLen) |
| { |
| return true; |
| } |
| return StopWords != null && StopWords.Contains(term); |
| } |
| |
| |
| /// <summary> |
| /// Find words for a more-like-this query former. |
| /// The result is a priority queue of arrays with one entry for <b>every word</b> in the document. |
| /// Each array has 6 elements. |
| /// The elements are: |
| /// <list type="bullet"> |
| /// <item><description>The word (<see cref="string"/>)</description></item> |
| /// <item><description>The top field that this word comes from (<see cref="string"/>)</description></item> |
| /// <item><description>The score for this word (<see cref="float"/>)</description></item> |
| /// <item><description>The IDF value (<see cref="float"/>)</description></item> |
| /// <item><description>The frequency of this word in the index (<see cref="int"/>)</description></item> |
| /// <item><description>The frequency of this word in the source document (<see cref="int"/>)</description></item> |
| /// </list> |
| /// This is a somewhat "advanced" routine, and in general only the 1st entry in the array is of interest. |
| /// This method is exposed so that you can identify the "interesting words" in a document. |
| /// For an easier method to call see <see cref="RetrieveInterestingTerms(TextReader, string)"/>. |
| /// </summary> |
| /// <param name="r"> the reader that has the content of the document </param> |
| /// <param name="fieldName"> field passed to the analyzer to use when analyzing the content </param> |
| /// <returns> the most interesting words in the document ordered by score, with the highest scoring, or best entry, first </returns> |
| /// <exception cref="IOException"/> |
| /// <seealso cref="RetrieveInterestingTerms(TextReader, string)"/> |
| public Util.PriorityQueue<object[]> RetrieveTerms(TextReader r, string fieldName) |
| { |
| IDictionary<string, Int32> words = new Dictionary<string, Int32>(); |
| AddTermFrequencies(r, words, fieldName); |
| return CreateQueue(words); |
| } |
| |
| /// <seealso cref="RetrieveInterestingTerms(TextReader, string)"/> |
| public string[] RetrieveInterestingTerms(int docNum) |
| { |
| var al = new List<string>(MaxQueryTerms); |
| var pq = RetrieveTerms(docNum); |
| object cur; |
| int lim = MaxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller... |
| // we just want to return the top words |
| while (((cur = pq.Pop()) != null) && lim-- > 0) |
| { |
| var ar = (object[])cur; |
| al.Add(ar[0].ToString()); // the 1st entry is the interesting word |
| } |
| return al.ToArray(); |
| } |
| |
| /// <summary> |
| /// Convenience routine to make it easy to return the most interesting words in a document. |
| /// More advanced users will call <see cref="RetrieveTerms(TextReader, string)"/> directly. |
| /// </summary> |
| /// <param name="r"> the source document </param> |
| /// <param name="fieldName"> field passed to analyzer to use when analyzing the content </param> |
| /// <returns> the most interesting words in the document </returns> |
| /// <seealso cref="RetrieveTerms(TextReader, string)"/> |
| /// <seealso cref="MaxQueryTerms"/> |
| public string[] RetrieveInterestingTerms(TextReader r, string fieldName) |
| { |
| var al = new List<string>(MaxQueryTerms); |
| Util.PriorityQueue<object[]> pq = RetrieveTerms(r, fieldName); |
| object cur; |
| int lim = MaxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller... |
| // we just want to return the top words |
| while (((cur = pq.Pop()) != null) && lim-- > 0) |
| { |
| var ar = (object[])cur; |
| al.Add(ar[0].ToString()); // the 1st entry is the interesting word |
| } |
| return al.ToArray(); |
| } |
| |
| /// <summary> |
| /// <see cref="T:Util.PriorityQueue{object[]}"/> that orders words by score. |
| /// </summary> |
| private class FreqQ : Util.PriorityQueue<object[]> |
| { |
| internal FreqQ(int s) |
| : base(s) |
| { |
| } |
| |
| protected internal override bool LessThan(object[] aa, object[] bb) |
| { |
| float? fa = (float?)aa[2]; |
| float? fb = (float?)bb[2]; |
| return fa > fb; |
| } |
| } |
| |
| /// <summary> |
| /// Use for frequencies and to avoid renewing <see cref="int"/>s. |
| /// <para/> |
| /// NOTE: This was Int in Lucene |
| /// </summary> |
| private class Int32 |
| { |
| internal int x; |
| |
| internal Int32() |
| { |
| x = 1; |
| } |
| } |
| } |
| } |