blob: 33142b5f123bf961f9542ba05a69dde7f18d9137 [file] [log] [blame]
using J2N.Collections.Generic.Extensions;
using Lucene.Net.Diagnostics;
using Lucene.Net.Index;
using Lucene.Net.Support;
using System;
using System.Collections;
using System.Collections.Generic;
using System.Text;
using JCG = J2N.Collections.Generic;
namespace Lucene.Net.Search
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using ArrayUtil = Lucene.Net.Util.ArrayUtil;
using AtomicReader = Lucene.Net.Index.AtomicReader;
using AtomicReaderContext = Lucene.Net.Index.AtomicReaderContext;
using DocsAndPositionsEnum = Lucene.Net.Index.DocsAndPositionsEnum;
using IBits = Lucene.Net.Util.IBits;
using IndexReader = Lucene.Net.Index.IndexReader;
using IndexReaderContext = Lucene.Net.Index.IndexReaderContext;
using Similarity = Lucene.Net.Search.Similarities.Similarity;
using SimScorer = Lucene.Net.Search.Similarities.Similarity.SimScorer;
using Term = Lucene.Net.Index.Term;
using TermContext = Lucene.Net.Index.TermContext;
using Terms = Lucene.Net.Index.Terms;
using TermsEnum = Lucene.Net.Index.TermsEnum;
using TermState = Lucene.Net.Index.TermState;
using ToStringUtils = Lucene.Net.Util.ToStringUtils;
/// <summary>
/// A <see cref="Query"/> that matches documents containing a particular sequence of terms.
/// A <see cref="PhraseQuery"/> is built by QueryParser for input like <c>"new york"</c>.
///
/// <para/>This query may be combined with other terms or queries with a <see cref="BooleanQuery"/>.
/// <para/>
/// Collection initializer note: To create and populate a <see cref="PhraseQuery"/>
/// in a single statement, you can use the following example as a guide:
///
/// <code>
/// var phraseQuery = new PhraseQuery() {
/// new Term("field", "microsoft"),
/// new Term("field", "office")
/// };
/// </code>
/// Note that as long as you specify all of the parameters, you can use either
/// <see cref="Add(Term)"/> or <see cref="Add(Term, int)"/>
/// as the method to use to initialize. If there are multiple parameters, each parameter set
/// must be surrounded by curly braces.
/// </summary>
public class PhraseQuery : Query, IEnumerable<Term> // LUCENENET specific - implemented IEnumerable<Term>, which allows for use of collection initializer. See: https://stackoverflow.com/a/9195144
{
private string field;
private IList<Term> terms = new JCG.List<Term>(4);
private IList<int?> positions = new JCG.List<int?>(4);
private int maxPosition = 0;
private int slop = 0;
/// <summary>
/// Constructs an empty phrase query. </summary>
public PhraseQuery()
{
}
/// <summary>
/// Sets the number of other words permitted between words in query phrase.
/// If zero, then this is an exact phrase search. For larger values this works
/// like a <c>WITHIN</c> or <c>NEAR</c> operator.
///
/// <para/>The slop is in fact an edit-distance, where the units correspond to
/// moves of terms in the query phrase out of position. For example, to switch
/// the order of two words requires two moves (the first move places the words
/// atop one another), so to permit re-orderings of phrases, the slop must be
/// at least two.
///
/// <para/>More exact matches are scored higher than sloppier matches, thus search
/// results are sorted by exactness.
///
/// <para/>The slop is zero by default, requiring exact matches.
/// </summary>
public virtual int Slop
{
get => slop;
set
{
if (value < 0)
throw new ArgumentException("slop value cannot be negative");
slop = value;
}
}
/// <summary>
/// Adds a term to the end of the query phrase.
/// The relative position of the term is the one immediately after the last term added.
/// </summary>
public virtual void Add(Term term)
{
int position = 0;
if (positions.Count > 0)
{
position = (int)positions[positions.Count - 1] + 1;
}
Add(term, position);
}
/// <summary>
/// Adds a term to the end of the query phrase.
/// The relative position of the term within the phrase is specified explicitly.
/// this allows e.g. phrases with more than one term at the same position
/// or phrases with gaps (e.g. in connection with stopwords).
/// </summary>
public virtual void Add(Term term, int position)
{
if (terms.Count == 0)
{
field = term.Field;
}
else if (!term.Field.Equals(field, StringComparison.Ordinal))
{
throw new ArgumentException("All phrase terms must be in the same field: " + term);
}
terms.Add(term);
positions.Add(position);
if (position > maxPosition)
{
maxPosition = position;
}
}
/// <summary>
/// Returns the set of terms in this phrase. </summary>
public virtual Term[] GetTerms()
{
return terms.ToArray();
}
/// <summary>
/// Returns the relative positions of terms in this phrase.
/// </summary>
public virtual int[] GetPositions()
{
int[] result = new int[positions.Count];
for (int i = 0; i < positions.Count; i++)
{
result[i] = (int)positions[i];
}
return result;
}
public override Query Rewrite(IndexReader reader)
{
if (terms.Count == 0)
{
BooleanQuery bq = new BooleanQuery();
bq.Boost = Boost;
return bq;
}
else if (terms.Count == 1)
{
TermQuery tq = new TermQuery(terms[0]);
tq.Boost = Boost;
return tq;
}
else
{
return base.Rewrite(reader);
}
}
internal class PostingsAndFreq : IComparable<PostingsAndFreq>
{
internal readonly DocsAndPositionsEnum postings;
internal readonly int docFreq;
internal readonly int position;
internal readonly Term[] terms;
internal readonly int nTerms; // for faster comparisons
public PostingsAndFreq(DocsAndPositionsEnum postings, int docFreq, int position, params Term[] terms)
{
this.postings = postings;
this.docFreq = docFreq;
this.position = position;
nTerms = terms == null ? 0 : terms.Length;
if (nTerms > 0)
{
if (terms.Length == 1)
{
this.terms = terms;
}
else
{
Term[] terms2 = new Term[terms.Length];
Array.Copy(terms, 0, terms2, 0, terms.Length);
Array.Sort(terms2);
this.terms = terms2;
}
}
else
{
this.terms = null;
}
}
public virtual int CompareTo(PostingsAndFreq other)
{
if (docFreq != other.docFreq)
{
return docFreq - other.docFreq;
}
if (position != other.position)
{
return position - other.position;
}
if (nTerms != other.nTerms)
{
return nTerms - other.nTerms;
}
if (nTerms == 0)
{
return 0;
}
for (int i = 0; i < terms.Length; i++)
{
int res = terms[i].CompareTo(other.terms[i]);
if (res != 0)
{
return res;
}
}
return 0;
}
public override int GetHashCode()
{
const int prime = 31;
int result = 1;
result = prime * result + docFreq;
result = prime * result + position;
for (int i = 0; i < nTerms; i++)
{
result = prime * result + terms[i].GetHashCode();
}
return result;
}
public override bool Equals(object obj)
{
if (this == obj)
{
return true;
}
if (obj == null)
{
return false;
}
if (this.GetType() != obj.GetType())
{
return false;
}
PostingsAndFreq other = (PostingsAndFreq)obj;
if (docFreq != other.docFreq)
{
return false;
}
if (position != other.position)
{
return false;
}
if (terms == null)
{
return other.terms == null;
}
return Arrays.Equals(terms, other.terms);
}
}
private class PhraseWeight : Weight
{
private readonly PhraseQuery outerInstance;
internal readonly Similarity similarity;
internal readonly Similarity.SimWeight stats;
internal TermContext[] states;
public PhraseWeight(PhraseQuery outerInstance, IndexSearcher searcher)
{
this.outerInstance = outerInstance;
this.similarity = searcher.Similarity;
IndexReaderContext context = searcher.TopReaderContext;
states = new TermContext[outerInstance.terms.Count];
TermStatistics[] termStats = new TermStatistics[outerInstance.terms.Count];
for (int i = 0; i < outerInstance.terms.Count; i++)
{
Term term = outerInstance.terms[i];
states[i] = TermContext.Build(context, term);
termStats[i] = searcher.TermStatistics(term, states[i]);
}
stats = similarity.ComputeWeight(outerInstance.Boost, searcher.CollectionStatistics(outerInstance.field), termStats);
}
public override string ToString()
{
return "weight(" + outerInstance + ")";
}
public override Query Query => outerInstance;
public override float GetValueForNormalization()
{
return stats.GetValueForNormalization();
}
public override void Normalize(float queryNorm, float topLevelBoost)
{
stats.Normalize(queryNorm, topLevelBoost);
}
public override Scorer GetScorer(AtomicReaderContext context, IBits acceptDocs)
{
if (Debugging.AssertsEnabled) Debugging.Assert(outerInstance.terms.Count > 0);
AtomicReader reader = context.AtomicReader;
IBits liveDocs = acceptDocs;
PostingsAndFreq[] postingsFreqs = new PostingsAndFreq[outerInstance.terms.Count];
Terms fieldTerms = reader.GetTerms(outerInstance.field);
if (fieldTerms == null)
{
return null;
}
// Reuse single TermsEnum below:
TermsEnum te = fieldTerms.GetEnumerator();
for (int i = 0; i < outerInstance.terms.Count; i++)
{
Term t = outerInstance.terms[i];
TermState state = states[i].Get(context.Ord);
if (state == null) // term doesnt exist in this segment
{
if (Debugging.AssertsEnabled) Debugging.Assert(TermNotInReader(reader, t), "no termstate found but term exists in reader");
return null;
}
te.SeekExact(t.Bytes, state);
DocsAndPositionsEnum postingsEnum = te.DocsAndPositions(liveDocs, null, DocsAndPositionsFlags.NONE);
// PhraseQuery on a field that did not index
// positions.
if (postingsEnum == null)
{
if (Debugging.AssertsEnabled) Debugging.Assert(te.SeekExact(t.Bytes), "termstate found but no term exists in reader");
// term does exist, but has no positions
throw new InvalidOperationException("field \"" + t.Field + "\" was indexed without position data; cannot run PhraseQuery (term=" + t.Text() + ")");
}
postingsFreqs[i] = new PostingsAndFreq(postingsEnum, te.DocFreq, (int)outerInstance.positions[i], t);
}
// sort by increasing docFreq order
if (outerInstance.slop == 0)
{
ArrayUtil.TimSort(postingsFreqs);
}
if (outerInstance.slop == 0) // optimize exact case
{
ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.GetSimScorer(stats, context));
if (s.noDocs)
{
return null;
}
else
{
return s;
}
}
else
{
return new SloppyPhraseScorer(this, postingsFreqs, outerInstance.slop, similarity.GetSimScorer(stats, context));
}
}
// only called from assert
private bool TermNotInReader(AtomicReader reader, Term term)
{
return reader.DocFreq(term) == 0;
}
public override Explanation Explain(AtomicReaderContext context, int doc)
{
Scorer scorer = GetScorer(context, context.AtomicReader.LiveDocs);
if (scorer != null)
{
int newDoc = scorer.Advance(doc);
if (newDoc == doc)
{
float freq = outerInstance.slop == 0 ? scorer.Freq : ((SloppyPhraseScorer)scorer).SloppyFreq;
SimScorer docScorer = similarity.GetSimScorer(stats, context);
ComplexExplanation result = new ComplexExplanation();
result.Description = "weight(" + Query + " in " + doc + ") [" + similarity.GetType().Name + "], result of:";
Explanation scoreExplanation = docScorer.Explain(doc, new Explanation(freq, "phraseFreq=" + freq));
result.AddDetail(scoreExplanation);
result.Value = scoreExplanation.Value;
result.Match = true;
return result;
}
}
return new ComplexExplanation(false, 0.0f, "no matching term");
}
}
public override Weight CreateWeight(IndexSearcher searcher)
{
return new PhraseWeight(this, searcher);
}
/// <seealso cref="Lucene.Net.Search.Query.ExtractTerms(ISet{Term})"/>
public override void ExtractTerms(ISet<Term> queryTerms)
{
queryTerms.UnionWith(terms);
}
/// <summary>
/// Prints a user-readable version of this query. </summary>
public override string ToString(string f)
{
StringBuilder buffer = new StringBuilder();
if (field != null && !field.Equals(f, StringComparison.Ordinal))
{
buffer.Append(field);
buffer.Append(":");
}
buffer.Append("\"");
string[] pieces = new string[maxPosition + 1];
for (int i = 0; i < terms.Count; i++)
{
int pos = (int)positions[i];
string s = pieces[pos];
if (s == null)
{
s = (terms[i]).Text();
}
else
{
s = s + "|" + (terms[i]).Text();
}
pieces[pos] = s;
}
for (int i = 0; i < pieces.Length; i++)
{
if (i > 0)
{
buffer.Append(' ');
}
string s = pieces[i];
if (s == null)
{
buffer.Append('?');
}
else
{
buffer.Append(s);
}
}
buffer.Append("\"");
if (slop != 0)
{
buffer.Append("~");
buffer.Append(slop);
}
buffer.Append(ToStringUtils.Boost(Boost));
return buffer.ToString();
}
/// <summary>
/// Returns <c>true</c> if <paramref name="o"/> is equal to this. </summary>
public override bool Equals(object o)
{
if (!(o is PhraseQuery))
{
return false;
}
PhraseQuery other = (PhraseQuery)o;
return (this.Boost == other.Boost)
&& (this.slop == other.slop)
&& this.terms.Equals(other.terms)
&& this.positions.Equals(other.positions);
}
/// <summary>
/// Returns a hash code value for this object. </summary>
public override int GetHashCode()
{
return J2N.BitConversion.SingleToInt32Bits(Boost)
^ slop
^ terms.GetHashCode()
^ positions.GetHashCode();
}
/// <summary>
/// Returns an enumerator that iterates through the <see cref="terms"/> collection.
/// </summary>
/// <returns>An enumerator that can be used to iterate through the <see cref="terms"/> collection.</returns>
// LUCENENET specific
public IEnumerator<Term> GetEnumerator()
{
return this.terms.GetEnumerator();
}
/// <summary>
/// Returns an enumerator that iterates through the <see cref="terms"/> collection.
/// </summary>
/// <returns>An enumerator that can be used to iterate through the <see cref="terms"/> collection.</returns>
// LUCENENET specific
IEnumerator IEnumerable.GetEnumerator()
{
return GetEnumerator();
}
}
}