// blob: 5de9348b0900899ae40df8dae43774e5a3cd22ea [file] [log] [blame]
using Lucene.Net.Index;
using Lucene.Net.Support;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Text;
using JCG = J2N.Collections.Generic;
namespace Lucene.Net.Search
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using ArrayUtil = Lucene.Net.Util.ArrayUtil;
using AtomicReader = Lucene.Net.Index.AtomicReader;
using AtomicReaderContext = Lucene.Net.Index.AtomicReaderContext;
using IBits = Lucene.Net.Util.IBits;
using BytesRef = Lucene.Net.Util.BytesRef;
using DocsAndPositionsEnum = Lucene.Net.Index.DocsAndPositionsEnum;
using DocsEnum = Lucene.Net.Index.DocsEnum;
using IndexReader = Lucene.Net.Index.IndexReader;
using IndexReaderContext = Lucene.Net.Index.IndexReaderContext;
using Similarity = Lucene.Net.Search.Similarities.Similarity;
using SimScorer = Lucene.Net.Search.Similarities.Similarity.SimScorer;
using Term = Lucene.Net.Index.Term;
using TermContext = Lucene.Net.Index.TermContext;
using Terms = Lucene.Net.Index.Terms;
using TermsEnum = Lucene.Net.Index.TermsEnum;
using TermState = Lucene.Net.Index.TermState;
using ToStringUtils = Lucene.Net.Util.ToStringUtils;
using System.Collections;
using J2N.Collections.Generic.Extensions;
/// <summary>
/// <see cref="MultiPhraseQuery"/> is a generalized version of <see cref="PhraseQuery"/>, with an added
/// method <see cref="Add(Term[])"/>.
/// <para/>
/// To use this class to search for the phrase "Microsoft app*", first call
/// <see cref="Add(Term)"/> on the term "Microsoft", then find all terms that have "app" as
/// prefix using <c>MultiFields.GetFields(IndexReader).GetTerms(string)</c>, and use <see cref="MultiPhraseQuery.Add(Term[])"/>
/// to add them to the query.
/// <para/>
/// Collection initializer note: To create and populate a <see cref="MultiPhraseQuery"/>
/// in a single statement, you can use the following example as a guide:
///
/// <code>
/// var multiPhraseQuery = new MultiPhraseQuery() {
/// new Term("field", "microsoft"),
/// new Term("field", "office")
/// };
/// </code>
/// Note that as long as you specify all of the parameters, you can use either
/// <see cref="Add(Term)"/>, <see cref="Add(Term[])"/>, or <see cref="Add(Term[], int)"/>
/// as the method to use to initialize. If there are multiple parameters, each parameter set
/// must be surrounded by curly braces.
/// </summary>
public class MultiPhraseQuery : Query, IEnumerable<Term[]> // LUCENENET specific - implemented IEnumerable<Term[]>, which allows for use of collection initializer. See: https://stackoverflow.com/a/9195144
{
// Field name shared by every term of the query; assigned from the first term of the
// first term array passed to Add(Term[], int).
private string field;
// One entry per phrase position; each entry holds the alternative terms for that position.
private IList<Term[]> termArrays = new JCG.List<Term[]>();
// Relative position of each entry of termArrays (parallel list, same index).
private readonly IList<int> positions = new JCG.List<int>();
// Phrase slop; a value of 0 makes MultiPhraseWeight.GetScorer use the exact phrase scorer.
private int slop = 0;
/// <summary>
/// Gets or sets the phrase slop for this query.
/// </summary>
/// <exception cref="ArgumentException">The assigned value is negative.</exception>
/// <seealso cref="PhraseQuery.Slop"/>
public virtual int Slop
{
    get { return slop; }
    set
    {
        // Reject negative values before touching the stored slop.
        bool isNegative = value < 0;
        if (isNegative)
        {
            throw new System.ArgumentException("slop value cannot be negative");
        }
        slop = value;
    }
}
/// <summary>
/// Adds a single term at the next position in the phrase. The term is wrapped in a
/// one-element array and forwarded to <see cref="Add(Term[])"/>.
/// </summary>
/// <param name="term">The term to match at the next position.</param>
/// <seealso cref="PhraseQuery.Add(Term)"/>
public virtual void Add(Term term)
{
    Term[] single = { term };
    Add(single);
}
/// <summary>
/// Adds several alternative terms at the next position in the phrase; any one of
/// them may match. The next position is 0 for the first call, otherwise one past the
/// most recently recorded position.
/// </summary>
/// <param name="terms">The alternative terms for the next position.</param>
/// <seealso cref="PhraseQuery.Add(Term)"/>
public virtual void Add(Term[] terms)
{
    int lastIndex = positions.Count - 1;
    int nextPosition = lastIndex < 0 ? 0 : positions[lastIndex] + 1;
    Add(terms, nextPosition);
}
/// <summary>
/// Adds several alternative terms at an explicit relative position within the phrase.
/// <para/>
/// The first array added fixes the field of the query (taken from its first term);
/// every term of every array must belong to that field.
/// </summary>
/// <param name="terms">The alternative terms for <paramref name="position"/>; must contain at least one term.</param>
/// <param name="position">The relative position of these terms within the phrase.</param>
/// <exception cref="ArgumentNullException"><paramref name="terms"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentException">
/// <paramref name="terms"/> is empty, or one of the terms is in a different field
/// than the one already established for this query.
/// </exception>
/// <seealso cref="PhraseQuery.Add(Term, int)"/>
public virtual void Add(Term[] terms, int position)
{
    if (terms == null)
    {
        throw new ArgumentNullException("terms");
    }
    if (terms.Length == 0)
    {
        // An empty position can never match and would later fail on terms[0]
        // inside the scorer and ToString(), so reject it up front.
        throw new ArgumentException("terms must contain at least one term", "terms");
    }

    // Validate against the candidate field first so that a rejected call
    // leaves this query completely unmodified.
    string expectedField = termArrays.Count == 0 ? terms[0].Field : field;
    for (var i = 0; i < terms.Length; i++)
    {
        if (!terms[i].Field.Equals(expectedField, StringComparison.Ordinal))
        {
            throw new System.ArgumentException("All phrase terms must be in the same field (" + expectedField + "): " + terms[i]);
        }
    }

    field = expectedField;
    termArrays.Add(terms);
    positions.Add(position);
}
/// <summary>
/// Returns the term arrays of this multiphrase, one array per phrase position,
/// as produced by calling <c>AsReadOnly()</c> on the internal list.
/// Do not modify the returned list or the arrays it contains.
/// </summary>
/// <returns>The list of term arrays in the order they were added.</returns>
public virtual IList<Term[]> GetTermArrays() // LUCENENET TODO: API - make into a property
{
    IList<Term[]> readOnlyView = termArrays.AsReadOnly();
    return readOnlyView;
}
/// <summary>
/// Returns the relative positions of the terms in this phrase.
/// </summary>
/// <returns>A newly allocated array holding a copy of the recorded positions.</returns>
public virtual int[] GetPositions()
{
    int[] snapshot = new int[positions.Count];
    positions.CopyTo(snapshot, 0);
    return snapshot;
}
/// <summary>
/// Expert: adds all terms occurring in this query to the <paramref name="terms"/> set.
/// This override performs no rewritten-state check; it simply adds every term of
/// every position it holds.
/// </summary>
/// <param name="terms">The set that receives the terms.</param>
public override void ExtractTerms(ISet<Term> terms)
{
    for (int p = 0; p < termArrays.Count; p++)
    {
        Term[] alternatives = termArrays[p];
        for (int t = 0; t < alternatives.Length; t++)
        {
            terms.Add(alternatives[t]);
        }
    }
}
private class MultiPhraseWeight : Weight
{
// The query this weight was created for; returned by the Query property.
private readonly MultiPhraseQuery outerInstance;
// Similarity taken from the searcher in the constructor.
private readonly Similarity similarity;
// Weight statistics computed once in the constructor via Similarity.ComputeWeight.
private readonly Similarity.SimWeight stats;
// Per-term contexts built in the constructor; looked up per segment in GetScorer.
private readonly IDictionary<Term, TermContext> termContexts = new Dictionary<Term, TermContext>();
/// <summary>
/// Creates the weight for <paramref name="outerInstance"/>: builds (and caches) a
/// <see cref="TermContext"/> for every distinct term, gathers the term statistics of
/// all terms, and asks the searcher's similarity to compute the weight statistics.
/// </summary>
/// <param name="outerInstance">The query being weighted.</param>
/// <param name="searcher">The searcher supplying similarity, reader context and statistics.</param>
public MultiPhraseWeight(MultiPhraseQuery outerInstance, IndexSearcher searcher)
{
    this.outerInstance = outerInstance;
    similarity = searcher.Similarity;
    IndexReaderContext topContext = searcher.TopReaderContext;

    // compute idf
    var collectedStats = new List<TermStatistics>();
    foreach (Term[] alternatives in outerInstance.termArrays)
    {
        foreach (Term term in alternatives)
        {
            TermContext cached;
            bool known = termContexts.TryGetValue(term, out cached) && cached != null;
            if (!known)
            {
                // First time this term is seen: build its context once and remember it.
                cached = TermContext.Build(topContext, term);
                termContexts[term] = cached;
            }
            collectedStats.Add(searcher.TermStatistics(term, cached));
        }
    }

    stats = similarity.ComputeWeight(
        outerInstance.Boost,
        searcher.CollectionStatistics(outerInstance.field),
        collectedStats.ToArray());
}
/// <summary>
/// Gets the <see cref="MultiPhraseQuery"/> this weight was created for.
/// </summary>
public override Query Query
{
    get { return outerInstance; }
}
/// <summary>
/// Returns the normalization value reported by the weight statistics computed in the constructor.
/// </summary>
public override float GetValueForNormalization()
{
    float normalizationValue = stats.GetValueForNormalization();
    return normalizationValue;
}
/// <summary>
/// Forwards the normalization factors to the weight statistics computed in the constructor.
/// </summary>
/// <param name="queryNorm">The query normalization factor.</param>
/// <param name="topLevelBoost">The boost propagated from the enclosing query.</param>
public override void Normalize(float queryNorm, float topLevelBoost)
{
    Similarity.SimWeight weightStats = stats;
    weightStats.Normalize(queryNorm, topLevelBoost);
}
/// <summary>
/// Builds the phrase scorer for one segment.
/// <para/>
/// For every phrase position a postings enum is obtained: a plain
/// <see cref="DocsAndPositionsEnum"/> when the position has a single term, or a
/// <see cref="UnionDocsAndPositionsEnum"/> when it has several alternatives.
/// With a slop of 0 an <see cref="ExactPhraseScorer"/> is returned, otherwise a
/// <see cref="SloppyPhraseScorer"/>.
/// </summary>
/// <param name="context">The segment to score.</param>
/// <param name="acceptDocs">The documents that may be returned; passed on to the postings enums.</param>
/// <returns>
/// The scorer, or <c>null</c> when the field is absent from the segment, when a position
/// has no term present in the segment, or when the exact scorer reports no documents.
/// </returns>
/// <exception cref="InvalidOperationException">The field was indexed without position data.</exception>
public override Scorer GetScorer(AtomicReaderContext context, IBits acceptDocs)
{
    Debug.Assert(outerInstance.termArrays.Count > 0);

    AtomicReader segmentReader = context.AtomicReader;
    int positionCount = outerInstance.termArrays.Count;
    var postingsFreqs = new PhraseQuery.PostingsAndFreq[positionCount];

    Terms fieldTerms = segmentReader.GetTerms(outerInstance.field);
    if (fieldTerms == null)
    {
        return null;
    }

    // Reuse single TermsEnum below:
    TermsEnum termsEnum = fieldTerms.GetIterator(null);

    for (int pos = 0; pos < positionCount; pos++)
    {
        Term[] alternatives = outerInstance.termArrays[pos];
        DocsAndPositionsEnum postings;
        int docFreq;

        if (alternatives.Length <= 1)
        {
            Term only = alternatives[0];
            TermState state = termContexts[only].Get(context.Ord);
            if (state == null)
            {
                // Term not in reader
                return null;
            }

            termsEnum.SeekExact(only.Bytes, state);
            postings = termsEnum.DocsAndPositions(acceptDocs, null, DocsAndPositionsFlags.NONE);
            if (postings == null)
            {
                // term does exist, but has no positions
                Debug.Assert(termsEnum.Docs(acceptDocs, null, DocsFlags.NONE) != null, "termstate found but no term exists in reader");
                throw new InvalidOperationException("field \"" + only.Field + "\" was indexed without position data; cannot run PhraseQuery (term=" + only.Text() + ")");
            }
            docFreq = termsEnum.DocFreq;
        }
        else
        {
            postings = new UnionDocsAndPositionsEnum(acceptDocs, context, alternatives, termContexts, termsEnum);

            // coarse -- this overcounts since a given doc can
            // have more than one term:
            docFreq = 0;
            foreach (Term candidate in alternatives)
            {
                TermState state = termContexts[candidate].Get(context.Ord);
                if (state != null)
                {
                    termsEnum.SeekExact(candidate.Bytes, state);
                    docFreq += termsEnum.DocFreq;
                }
                // else: term not in reader, contributes nothing
            }

            if (docFreq == 0)
            {
                // None of the terms are in this reader
                return null;
            }
        }

        postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postings, docFreq, outerInstance.positions[pos], alternatives);
    }

    if (outerInstance.slop != 0)
    {
        return new SloppyPhraseScorer(this, postingsFreqs, outerInstance.slop, similarity.GetSimScorer(stats, context));
    }

    // Exact phrase: sort by increasing docFreq order
    ArrayUtil.TimSort(postingsFreqs);
    ExactPhraseScorer exact = new ExactPhraseScorer(this, postingsFreqs, similarity.GetSimScorer(stats, context));
    return exact.noDocs ? null : exact;
}
/// <summary>
/// Explains the score of <paramref name="doc"/> in the given segment.
/// A scorer is created over the segment's live docs and advanced to
/// <paramref name="doc"/>; if it lands exactly there, the phrase frequency is
/// explained through the similarity's <see cref="SimScorer"/>.
/// </summary>
/// <param name="context">The segment containing the document.</param>
/// <param name="doc">The segment-relative document id.</param>
/// <returns>A matching explanation, or a non-matching one with description "no matching term".</returns>
public override Explanation Explain(AtomicReaderContext context, int doc)
{
    Scorer scorer = GetScorer(context, context.AtomicReader.LiveDocs);
    if (scorer == null || scorer.Advance(doc) != doc)
    {
        return new ComplexExplanation(false, 0.0f, "no matching term");
    }

    float phraseFreq;
    if (outerInstance.slop == 0)
    {
        phraseFreq = scorer.Freq;
    }
    else
    {
        phraseFreq = ((SloppyPhraseScorer)scorer).SloppyFreq;
    }

    SimScorer docScorer = similarity.GetSimScorer(stats, context);
    ComplexExplanation explanation = new ComplexExplanation();
    explanation.Description = "weight(" + Query + " in " + doc + ") [" + similarity.GetType().Name + "], result of:";

    Explanation scoreDetail = docScorer.Explain(doc, new Explanation(phraseFreq, "phraseFreq=" + phraseFreq));
    explanation.AddDetail(scoreDetail);
    explanation.Value = scoreDetail.Value;
    explanation.Match = true;
    return explanation;
}
}
/// <summary>
/// Rewrites degenerate forms of this query into simpler ones:
/// <list type="bullet">
/// <item><description>no positions: an empty <see cref="BooleanQuery"/> carrying this query's boost;</description></item>
/// <item><description>one position: a <see cref="BooleanQuery"/> (constructed with <c>true</c>) containing one
/// <see cref="Occur.SHOULD"/> <see cref="TermQuery"/> per alternative term, carrying this query's boost;</description></item>
/// <item><description>otherwise: this instance, unchanged.</description></item>
/// </list>
/// </summary>
/// <param name="reader">Not used by this implementation.</param>
public override Query Rewrite(IndexReader reader)
{
    switch (termArrays.Count)
    {
        case 0:
        {
            BooleanQuery empty = new BooleanQuery();
            empty.Boost = Boost;
            return empty;
        }
        case 1:
        {
            // optimize one-term case
            BooleanQuery anyOf = new BooleanQuery(true);
            foreach (Term alternative in termArrays[0])
            {
                anyOf.Add(new TermQuery(alternative), Occur.SHOULD);
            }
            anyOf.Boost = Boost;
            return anyOf;
        }
        default:
            return this;
    }
}
/// <summary>
/// Creates the <see cref="Weight"/> for this query against <paramref name="searcher"/>.
/// </summary>
/// <param name="searcher">The searcher the weight is computed for.</param>
public override Weight CreateWeight(IndexSearcher searcher)
{
    Weight weight = new MultiPhraseWeight(this, searcher);
    return weight;
}
/// <summary>
/// Prints a user-readable version of this query.
/// <para/>
/// The field prefix is omitted when the query's field equals <paramref name="f"/>.
/// Positions with several alternatives are printed in parentheses, gaps between
/// consecutive positions are printed as <c>?</c> placeholders, and a non-zero slop is
/// appended as <c>~slop</c>, followed by the boost.
/// </summary>
/// <param name="f">The default field; terms in this field are printed without a prefix.</param>
public override sealed string ToString(string f)
{
    StringBuilder sb = new StringBuilder();

    bool isDefaultField = field != null && field.Equals(f, StringComparison.Ordinal);
    if (!isDefaultField)
    {
        sb.Append(field).Append(":");
    }

    sb.Append("\"");
    int previousPosition = -1;
    for (int k = 0; k < termArrays.Count; k++)
    {
        Term[] alternatives = termArrays[k];
        int currentPosition = positions[k];

        if (k > 0)
        {
            sb.Append(" ");
            // One placeholder for every position skipped since the previous entry.
            for (int gap = currentPosition - previousPosition; gap > 1; gap--)
            {
                sb.Append("? ");
            }
        }

        if (alternatives.Length > 1)
        {
            sb.Append("(");
            for (int j = 0; j < alternatives.Length; j++)
            {
                if (j > 0)
                {
                    sb.Append(" ");
                }
                sb.Append(alternatives[j].Text());
            }
            sb.Append(")");
        }
        else
        {
            sb.Append(alternatives[0].Text());
        }

        previousPosition = currentPosition;
    }
    sb.Append("\"");

    if (slop != 0)
    {
        sb.Append("~").Append(slop);
    }

    sb.Append(ToStringUtils.Boost(Boost));
    return sb.ToString();
}
/// <summary>
/// Returns <c>true</c> if <paramref name="o"/> is a <see cref="MultiPhraseQuery"/> with the
/// same boost, slop, term arrays and positions as this instance.
/// </summary>
/// <param name="o">The object to compare with.</param>
public override bool Equals(object o)
{
    MultiPhraseQuery other = o as MultiPhraseQuery;
    if (object.ReferenceEquals(other, null))
    {
        return false;
    }

    // Cheapest comparisons first; each failing check short-circuits the rest.
    if (this.Boost != other.Boost)
    {
        return false;
    }
    if (this.slop != other.slop)
    {
        return false;
    }
    if (!TermArraysEquals(this.termArrays, other.termArrays))
    {
        return false;
    }
    return this.positions.Equals(other.positions);
}
/// <summary>
/// Returns a hash code value for this object, combining the boost bits, the slop,
/// the hash of the term arrays and, when any positions are recorded, the hash of
/// the positions list mixed with a fixed constant.
/// </summary>
public override int GetHashCode()
{
    //If this doesn't work hash all elements of positions. This was used to reduce time overhead
    int hash = J2N.BitConversion.SingleToInt32Bits(Boost);
    hash ^= slop;
    hash ^= TermArraysHashCode();
    if (positions.Count != 0)
    {
        // The constant is only mixed in together with the positions hash.
        hash ^= positions.GetHashCode() ^ 0x4AC65113;
    }
    return hash;
}
// Breakout calculation of the termArrays hashcode: a 31-based rolling hash over the
// per-position arrays, where a null array contributes 0.
private int TermArraysHashCode()
{
    int result = 1;
    for (int i = 0; i < termArrays.Count; i++)
    {
        Term[] alternatives = termArrays[i];
        int elementHash = alternatives == null ? 0 : Arrays.GetHashCode(alternatives);
        result = 31 * result + elementHash;
    }
    return result;
}
// Breakout calculation of the termArrays equals: the lists are equal when they have the
// same number of entries and every pair of entries is either both null or equal
// according to Arrays.Equals.
private bool TermArraysEquals(IList<Term[]> termArrays1, IList<Term[]> termArrays2)
{
    int count = termArrays1.Count;
    if (count != termArrays2.Count)
    {
        return false;
    }

    for (int i = 0; i < count; i++)
    {
        Term[] left = termArrays1[i];
        Term[] right = termArrays2[i];
        bool same = left == null ? right == null : Arrays.Equals(left, right);
        if (!same)
        {
            return false;
        }
    }
    return true;
}
/// <summary>
/// Returns an enumerator over the term arrays of this query, one array per phrase position.
/// </summary>
/// <returns>The enumerator obtained from the internal <see cref="termArrays"/> list.</returns>
// LUCENENET specific
public IEnumerator<Term[]> GetEnumerator()
{
    IEnumerator<Term[]> enumerator = termArrays.GetEnumerator();
    return enumerator;
}
/// <summary>
/// Non-generic counterpart of <see cref="GetEnumerator()"/>; delegates to it.
/// </summary>
/// <returns>An enumerator over the term arrays of this query.</returns>
// LUCENENET specific
IEnumerator IEnumerable.GetEnumerator()
{
    IEnumerator<Term[]> typed = GetEnumerator();
    return typed;
}
}
/// <summary>
/// Takes the logical union of multiple <see cref="DocsEnum"/> iterators.
/// </summary>
// TODO: if ever we allow subclassing of the *PhraseScorer
internal class UnionDocsAndPositionsEnum : DocsAndPositionsEnum
{
// Priority queue of postings enums ordered by their current document id.
private sealed class DocsQueue : Util.PriorityQueue<DocsAndPositionsEnum>
{
    // Sizes the queue for all given enums, moves each enum to its first document and
    // enqueues only those that have at least one document.
    internal DocsQueue(ICollection<DocsAndPositionsEnum> docsEnums)
        : base(docsEnums.Count)
    {
        foreach (DocsAndPositionsEnum candidate in docsEnums)
        {
            if (candidate.NextDoc() == DocIdSetIterator.NO_MORE_DOCS)
            {
                // Exhausted before the first document: nothing to merge.
                continue;
            }
            Add(candidate);
        }
    }

    // Orders enums by the document they are currently positioned on.
    protected internal override bool LessThan(DocsAndPositionsEnum a, DocsAndPositionsEnum b)
    {
        int docA = a.DocID;
        int docB = b.DocID;
        return docA < docB;
    }
}
/// <summary>
/// Growable first-in/first-out buffer of <see cref="int"/> values backed by an array.
/// Values are appended with <see cref="Add(int)"/>, consumed in order with
/// <see cref="Next()"/>, and the not-yet-consumed range can be sorted with <see cref="Sort()"/>.
/// <para/>
/// NOTE: This was IntQueue in Lucene
/// </summary>
private sealed class Int32Queue
{
    // Current capacity of _array; doubled by GrowArray().
    private int _arraySize = 16;
    // Index of the next value returned by Next().
    private int _index = 0;
    // Index one past the last value written by Add().
    private int _lastIndex = 0;
    private int[] _array;

    public Int32Queue()
    {
        InitializeInstanceFields();
    }

    // Allocates the backing array at the current capacity.
    internal void InitializeInstanceFields()
    {
        _array = new int[_arraySize];
    }

    // Appends a value, doubling the backing array first when it is full.
    internal void Add(int i)
    {
        if (_lastIndex == _arraySize)
        {
            GrowArray();
        }
        _array[_lastIndex++] = i;
    }

    // Returns the next unconsumed value. No bounds check is performed against
    // the number of values added; callers are expected to consult Count.
    internal int Next()
    {
        return _array[_index++];
    }

    // Sorts the unconsumed range [_index, _lastIndex) in ascending order.
    internal void Sort()
    {
        // Array.Sort takes (startIndex, length), unlike Java's Arrays.sort(from, to),
        // so the length must be computed from the two indices.
        Array.Sort(_array, _index, _lastIndex - _index);
    }

    // Discards all values; the backing array is kept for reuse.
    internal void Clear()
    {
        _index = 0;
        _lastIndex = 0;
    }

    // Number of values added but not yet consumed.
    internal int Count // LUCENENET NOTE: This was size() in Lucene.
    {
        get { return (_lastIndex - _index); }
    }

    // Doubles the capacity, preserving the existing contents.
    private void GrowArray()
    {
        var newArray = new int[_arraySize * 2];
        Array.Copy(_array, 0, newArray, 0, _arraySize);
        _array = newArray;
        _arraySize *= 2;
    }
}
// Document id returned by the last NextDoc() call.
private int _doc;
// Number of merged positions collected for _doc by NextDoc().
private int _freq;
// Sub-enums ordered by their current document id.
private readonly DocsQueue _queue;
// Merged positions of the current document, consumed by NextPosition().
private readonly Int32Queue _posList;
// Sum of GetCost() over all sub-enums, accumulated in the constructor.
private readonly long _cost;
/// <summary>
/// Creates the union over the postings of all <paramref name="terms"/> that exist in the
/// given segment. Terms without a <see cref="TermState"/> for the segment are skipped.
/// </summary>
/// <param name="liveDocs">Passed to each term's postings enum.</param>
/// <param name="context">The segment whose postings are read.</param>
/// <param name="terms">The alternative terms to merge.</param>
/// <param name="termContexts">Term contexts keyed by term; must contain every entry of <paramref name="terms"/>.</param>
/// <param name="termsEnum">Terms enum that is re-positioned for each term.</param>
/// <exception cref="InvalidOperationException">A term exists but its field was indexed without position data.</exception>
public UnionDocsAndPositionsEnum(IBits liveDocs, AtomicReaderContext context, Term[] terms, IDictionary<Term, TermContext> termContexts, TermsEnum termsEnum)
{
    ICollection<DocsAndPositionsEnum> subEnums = new LinkedList<DocsAndPositionsEnum>();
    foreach (Term term in terms)
    {
        TermState state = termContexts[term].Get(context.Ord);
        if (state != null)
        {
            termsEnum.SeekExact(term.Bytes, state);
            DocsAndPositionsEnum termPostings = termsEnum.DocsAndPositions(liveDocs, null, DocsAndPositionsFlags.NONE);
            if (termPostings == null)
            {
                // term does exist, but has no positions
                throw new InvalidOperationException("field \"" + term.Field + "\" was indexed without position data; cannot run PhraseQuery (term=" + term.Text() + ")");
            }
            _cost += termPostings.GetCost();
            subEnums.Add(termPostings);
        }
        // else: term doesn't exist in reader
    }

    _queue = new DocsQueue(subEnums);
    _posList = new Int32Queue();
}
/// <summary>
/// Moves to the next document that any sub-enum is positioned on and merges the
/// positions of all sub-enums on that document into a single sorted list.
/// </summary>
/// <returns>The new document id, or <c>NO_MORE_DOCS</c> when the queue is empty.</returns>
public override sealed int NextDoc()
{
if (_queue.Count == 0)
{
return NO_MORE_DOCS;
}
// TODO: move this init into positions(): if the search
// doesn't need the positions for this doc then don't
// waste CPU merging them:
_posList.Clear();
// The queue's top is the sub-enum with the smallest current document id.
_doc = _queue.Top.DocID;
// merge sort all positions together
DocsAndPositionsEnum postings;
do
{
postings = _queue.Top;
// Drain every position this sub-enum has on the current document.
int freq = postings.Freq;
for (int i = 0; i < freq; i++)
{
_posList.Add(postings.NextPosition());
}
// Step the sub-enum forward: re-order the queue if it has more documents,
// otherwise remove it.
if (postings.NextDoc() != NO_MORE_DOCS)
{
_queue.UpdateTop();
}
else
{
_queue.Pop();
}
// Keep going while another sub-enum is positioned on the same document.
} while (_queue.Count > 0 && _queue.Top.DocID == _doc);
_posList.Sort();
// The merged frequency is the number of collected positions.
_freq = _posList.Count;
return _doc;
}
/// <summary>
/// Returns the next merged position of the current document, in the order
/// established by the last <see cref="NextDoc()"/> call.
/// </summary>
public override int NextPosition()
{
    int position = _posList.Next();
    return position;
}
/// <summary>
/// Always <c>-1</c>; this enum does not report start offsets.
/// </summary>
public override int StartOffset
{
    get
    {
        return -1;
    }
}
/// <summary>
/// Always <c>-1</c>; this enum does not report end offsets.
/// </summary>
public override int EndOffset
{
    get
    {
        return -1;
    }
}
/// <summary>
/// Always returns <c>null</c>; this enum does not report payloads.
/// </summary>
public override BytesRef GetPayload()
{
    BytesRef noPayload = null;
    return noPayload;
}
/// <summary>
/// Advances every sub-enum that is positioned before <paramref name="target"/>,
/// dropping those that become exhausted, and then delegates to <see cref="NextDoc()"/>.
/// </summary>
/// <param name="target">The document id to advance to.</param>
/// <returns>The value returned by <see cref="NextDoc()"/>.</returns>
public override sealed int Advance(int target)
{
    for (DocsAndPositionsEnum top = _queue.Top; top != null && target > top.DocID; top = _queue.Top)
    {
        // Take the lagging enum out, move it forward, and only put it back
        // when it still has documents.
        DocsAndPositionsEnum lagging = _queue.Pop();
        if (lagging.Advance(target) != NO_MORE_DOCS)
        {
            _queue.Add(lagging);
        }
    }
    return NextDoc();
}
/// <summary>
/// Gets the number of merged positions collected for the current document by the
/// last <see cref="NextDoc()"/> call.
/// </summary>
public override sealed int Freq
{
    get
    {
        return _freq;
    }
}
/// <summary>
/// Gets the document id set by the last <see cref="NextDoc()"/> call.
/// </summary>
public override sealed int DocID
{
    get
    {
        return _doc;
    }
}
/// <summary>
/// Returns the sum of the costs of all sub-enums, as accumulated in the constructor.
/// </summary>
public override long GetCost()
{
    long totalCost = _cost;
    return totalCost;
}
}
}