blob: 5716f63b399899e0f050abfa6dca9f2e0d75a269 [file] [log] [blame]
using J2N.Text;
using Lucene.Net.Index;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using JCG = J2N.Collections.Generic;
namespace Lucene.Net.Search.Highlight
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Utility class used to extract the terms used in a query, plus any weights.
/// This class will not find terms for <see cref="MultiTermQuery"/>, RangeQuery and <see cref="PrefixQuery"/> classes
/// so the caller must pass a rewritten query (see <see cref="Query.Rewrite(IndexReader)"/>) to obtain a list of
/// expanded terms.
/// </summary>
public static class QueryTermExtractor
{
/// <summary>
/// Extracts all terms texts of a given <see cref="Query"/> into an array of WeightedTerms
/// </summary>
/// <param name="query"><see cref="Query"/> to extract term texts from</param>
/// <returns> an array of the terms used in a query, plus their weights.</returns>
public static WeightedTerm[] GetTerms(Query query)
{
return GetTerms(query, false);
}
/// <summary>
/// Extracts all terms texts of a given <see cref="Query"/> into an array of WeightedTerms
/// </summary>
/// <param name="query"><see cref="Query"/> to extract term texts from</param>
/// <param name="reader">used to compute IDF which can be used to a) score selected fragments better
/// b) use graded highlights eg chaning intensity of font color</param>
/// <param name="fieldName">the field on which Inverse Document Frequency (IDF) calculations are based</param>
/// <returns> an array of the terms used in a query, plus their weights.</returns>
public static WeightedTerm[] GetIdfWeightedTerms(Query query, IndexReader reader, string fieldName)
{
WeightedTerm[] terms = GetTerms(query, false, fieldName);
int totalNumDocs = reader.MaxDoc;
foreach (WeightedTerm t in terms)
{
try
{
int docFreq = reader.DocFreq(new Term(fieldName, t.Term));
//IDF algorithm taken from DefaultSimilarity class
var idf = (float)(Math.Log((float)totalNumDocs / (double)(docFreq + 1)) + 1.0);
t.Weight *= idf;
}
catch (IOException)
{
//ignore
}
}
return terms;
}
/// <summary>Extracts all terms texts of a given <see cref="Query"/> into an array of WeightedTerms</summary>
/// <param name="query"><see cref="Query"/> to extract term texts from</param>
/// <param name="prohibited"><c>true</c> to extract "prohibited" terms, too </param>
/// <param name="fieldName"> The fieldName used to filter query terms</param>
/// <returns>an array of the terms used in a query, plus their weights.</returns>
public static WeightedTerm[] GetTerms(Query query, bool prohibited, string fieldName)
{
var terms = new JCG.HashSet<WeightedTerm>();
if (fieldName != null)
{
fieldName = fieldName.Intern();
}
GetTerms(query, terms, prohibited, fieldName);
return terms.ToArray();
}
/// <summary>
/// Extracts all terms texts of a given <see cref="Query"/> into an array of WeightedTerms
/// </summary>
/// <param name="query"><see cref="Query"/> to extract term texts from</param>
/// <param name="prohibited"><c>true</c> to extract "prohibited" terms, too</param>
/// <returns> an array of the terms used in a query, plus their weights.</returns>
public static WeightedTerm[] GetTerms(Query query, bool prohibited)
{
return GetTerms(query, prohibited, null);
}
//fieldname MUST be interned prior to this call
private static void GetTerms(Query query, ISet<WeightedTerm> terms, bool prohibited, string fieldName)
{
try
{
if (query is BooleanQuery)
GetTermsFromBooleanQuery((BooleanQuery)query, terms, prohibited, fieldName);
else if (query is FilteredQuery)
GetTermsFromFilteredQuery((FilteredQuery)query, terms, prohibited, fieldName);
else
{
var nonWeightedTerms = new JCG.HashSet<Term>();
query.ExtractTerms(nonWeightedTerms);
foreach (var term in nonWeightedTerms)
{
if ((fieldName == null) || (term.Field.Equals(fieldName, StringComparison.Ordinal)))
{
terms.Add(new WeightedTerm(query.Boost, term.Text()));
}
}
}
}
#pragma warning disable 168
catch (NotSupportedException ignore)
#pragma warning restore 168
{
//this is non-fatal for our purposes
}
}
/// <summary> extractTerms is currently the only query-independent means of introspecting queries but it only reveals
/// a list of terms for that query - not the boosts each individual term in that query may or may not have.
/// "Container" queries such as BooleanQuery should be unwrapped to get at the boost info held
/// in each child element.
/// Some discussion around this topic here:
/// http://www.gossamer-threads.com/lists/lucene/java-dev/34208?search_string=introspection;#34208
/// Unfortunately there seemed to be limited interest in requiring all <see cref="Query"/> objects to implement
/// something common which would allow access to child queries so what follows here are query-specific
/// implementations for accessing embedded query elements.
/// </summary>
private static void GetTermsFromBooleanQuery(BooleanQuery query, ISet<WeightedTerm> terms, bool prohibited, string fieldName)
{
var queryClauses = query.Clauses;
for (int i = 0; i < queryClauses.Count; i++)
{
if (prohibited || queryClauses[i].Occur != Occur.MUST_NOT)
GetTerms(queryClauses[i].Query, terms, prohibited, fieldName);
}
}
private static void GetTermsFromFilteredQuery(FilteredQuery query, ISet<WeightedTerm> terms, bool prohibited, string fieldName)
{
GetTerms(query.Query, terms, prohibited, fieldName);
}
}
}