// blob: 7973e11be14be8565df3cd4b04cb87f8052c068b [file] [log] [blame]
using Lucene.Net.Support;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
using System.Text;
using QueryPhraseMap = Lucene.Net.Search.VectorHighlight.FieldQuery.QueryPhraseMap;
using TermInfo = Lucene.Net.Search.VectorHighlight.FieldTermStack.TermInfo;
namespace Lucene.Net.Search.VectorHighlight
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// FieldPhraseList has a list of WeightedPhraseInfo that is used by FragListBuilder
/// to create a FieldFragList object.
/// </summary>
public class FieldPhraseList
{
/// <summary>
/// List of non-overlapping <see cref="WeightedPhraseInfo"/> objects.
/// This is the backing store returned by <see cref="PhraseList"/>; the merging
/// constructor replaces it with a freshly built list.
/// </summary>
internal List<WeightedPhraseInfo> phraseList = new List<WeightedPhraseInfo>();
/// <summary>
/// Creates a <see cref="FieldPhraseList"/> that has no limit on the number of phrases to analyze.
/// Delegates to the three-argument constructor, passing <see cref="int.MaxValue"/> as the phrase limit.
/// </summary>
/// <param name="fieldTermStack"><see cref="FieldTermStack"/> object</param>
/// <param name="fieldQuery"><see cref="FieldQuery"/> object</param>
public FieldPhraseList(FieldTermStack fieldTermStack, FieldQuery fieldQuery)
: this(fieldTermStack, fieldQuery, int.MaxValue)
{
}
/// <summary>
/// Gets the list of <see cref="WeightedPhraseInfo"/> collected by this instance.
/// </summary>
public virtual IList<WeightedPhraseInfo> PhraseList
{
    get { return phraseList; }
}
/// <summary>
/// Builds the phrase list by popping terms off <paramref name="fieldTermStack"/> and matching
/// them against the term/phrase maps of <paramref name="fieldQuery"/>. Processing stops when the
/// stack is empty or the list has reached <paramref name="phraseLimit"/> entries.
/// </summary>
/// <param name="fieldTermStack"><see cref="FieldTermStack"/> object</param>
/// <param name="fieldQuery"><see cref="FieldQuery"/> object</param>
/// <param name="phraseLimit">maximum size of phraseList</param>
public FieldPhraseList(FieldTermStack fieldTermStack, FieldQuery fieldQuery, int phraseLimit)
{
    string field = fieldTermStack.FieldName;

    List<TermInfo> phraseCandidate = new List<TermInfo>();
    QueryPhraseMap currMap = null;
    QueryPhraseMap nextMap = null;
    while (!fieldTermStack.IsEmpty && (phraseList.Count < phraseLimit))
    {
        phraseCandidate.Clear();

        // Pop the next term and look it up. If it has no map, try the terms reachable
        // through Next until the chain leads back to the popped term.
        TermInfo ti = null;
        TermInfo first = null;
        first = ti = fieldTermStack.Pop();
        currMap = fieldQuery.GetFieldTermMap(field, ti.Text);
        while (currMap == null && ti.Next != first)
        {
            ti = ti.Next;
            currMap = fieldQuery.GetFieldTermMap(field, ti.Text);
        }

        // if not found, discard top TermInfo from stack, then try next element
        if (currMap == null)
        {
            continue;
        }

        // if found, search the longest phrase
        phraseCandidate.Add(ti);
        while (true)
        {
            first = ti = fieldTermStack.Pop();
            nextMap = null;
            if (ti != null)
            {
                nextMap = currMap.GetTermMap(ti.Text);
                while (nextMap == null && ti.Next != first)
                {
                    ti = ti.Next;
                    nextMap = currMap.GetTermMap(ti.Text);
                }
            }

            if (ti == null || nextMap == null)
            {
                // The candidate cannot be extended any further: give the unmatched term back
                // to the stack and decide what to do with what has been collected so far.
                if (ti != null)
                {
                    fieldTermStack.Push(ti);
                }

                if (currMap.IsValidTermOrPhrase(phraseCandidate))
                {
                    AddIfNoOverlap(new WeightedPhraseInfo(phraseCandidate, currMap.Boost, currMap.TermOrPhraseNumber));
                }
                else
                {
                    // Shrink the candidate from the tail, returning each dropped term to the
                    // stack, until a shorter prefix matches a phrase or one term is left.
                    while (phraseCandidate.Count > 1)
                    {
                        // RemoveAt rather than Remove(item): Remove scans from the front and
                        // deletes the first element that compares equal, which is O(n) and is
                        // not guaranteed to be the tail element.
                        int lastIndex = phraseCandidate.Count - 1;
                        TermInfo last = phraseCandidate[lastIndex];
                        phraseCandidate.RemoveAt(lastIndex);
                        fieldTermStack.Push(last);

                        currMap = fieldQuery.SearchPhrase(field, phraseCandidate);
                        if (currMap != null)
                        {
                            AddIfNoOverlap(new WeightedPhraseInfo(phraseCandidate, currMap.Boost, currMap.TermOrPhraseNumber));
                            break;
                        }
                    }
                }
                break;
            }
            else
            {
                phraseCandidate.Add(ti);
                currMap = nextMap;
            }
        }
    }
}
/// <summary>
/// Merging constructor: builds this list from the phrase infos of several other lists,
/// combining infos whose offset ranges overlap or touch into a single entry.
/// </summary>
/// <param name="toMerge"><see cref="FieldPhraseList"/>s to merge to build this one</param>
public FieldPhraseList(FieldPhraseList[] toMerge)
{
    // Step 1. Feed one enumerator per source list into a MergedEnumerator
    // (ordering is by startOffset, endOffset, and boost, in that order).
    IEnumerator<WeightedPhraseInfo>[] sources = new IEnumerator<WeightedPhraseInfo>[toMerge.Length];
    try
    {
        for (int i = 0; i < toMerge.Length; i++)
        {
            sources[i] = toMerge[i].phraseList.GetEnumerator();
        }

        using (MergedEnumerator<WeightedPhraseInfo> merged = new MergedEnumerator<WeightedPhraseInfo>(false, sources))
        {
            // Step 2. Walk the merged sequence, grouping infos that overlap.
            phraseList = new List<WeightedPhraseInfo>();
            if (!merged.MoveNext())
            {
                return;
            }

            // 'group' holds the run of infos currently being combined;
            // 'groupEnd' is the furthest end offset seen within that run.
            List<WeightedPhraseInfo> group = new List<WeightedPhraseInfo>();
            WeightedPhraseInfo head = merged.Current;
            group.Add(head);
            int groupEnd = head.EndOffset;

            while (merged.MoveNext())
            {
                WeightedPhraseInfo candidate = merged.Current;
                if (candidate.StartOffset > groupEnd)
                {
                    // Gap found: emit the finished run (as-is when it is a single info,
                    // merged otherwise) and start a new run with the candidate.
                    phraseList.Add(group.Count == 1 ? group[0] : new WeightedPhraseInfo(group));
                    group.Clear();
                    group.Add(candidate);
                    groupEnd = candidate.EndOffset;
                }
                else
                {
                    groupEnd = Math.Max(groupEnd, candidate.EndOffset);
                    group.Add(candidate);
                }
            }

            // Emit whatever run is still pending.
            phraseList.Add(group.Count == 1 ? group[0] : new WeightedPhraseInfo(group));
        }
    }
    finally
    {
        IOUtils.Dispose(sources);
    }
}
/// <summary>
/// Adds <paramref name="wpi"/> to <see cref="PhraseList"/> unless its offsets overlap an entry
/// that is already present; in that case only its term infos are appended to that entry.
/// </summary>
/// <param name="wpi">the phrase info to add</param>
public virtual void AddIfNoOverlap(WeightedPhraseInfo wpi)
{
    IList<WeightedPhraseInfo> known = PhraseList;
    for (int i = 0; i < known.Count; i++)
    {
        WeightedPhraseInfo existWpi = known[i];
        if (!existWpi.IsOffsetOverlap(wpi))
        {
            continue;
        }

        // Without this, the overlapping phrase (for example the second part of a hyphenated
        // word such as "social-economics") would be dropped entirely and its TermInfos would
        // no longer be available for further operations.
        existWpi.TermsInfos.AddRange(wpi.TermsInfos);
        return;
    }

    PhraseList.Add(wpi);
}
/// <summary>
/// Represents the list of term offsets and boost for some text
/// </summary>
public class WeightedPhraseInfo : IComparable<WeightedPhraseInfo>
{
private List<Toffs> termsOffsets; // usually termsOffsets.Count == 1,
// but if position-gap > 1 and slop > 0 then Count could be greater than 1
private float boost; // query boost
private int seqnum; // sequence number supplied at construction; exposed through Seqnum
private List<TermInfo> termsInfos; // term infos kept for further operations; exposed through TermsInfos
/// <summary>
/// Concatenates the text of every stored term info, in order, with no separator.
/// Calculated on the fly; use for debugging only.
/// </summary>
/// <returns>the text</returns>
public virtual string GetText()
{
    StringBuilder buffer = new StringBuilder();
    for (int i = 0; i < termsInfos.Count; i++)
    {
        buffer.Append(termsInfos[i].Text);
    }
    return buffer.ToString();
}
/// <summary>
/// Gets the term offset ranges of this phrase.
/// </summary>
public virtual IList<Toffs> TermsOffsets
{
    get { return termsOffsets; }
}
/// <summary>
/// Gets the boost of this phrase.
/// </summary>
public virtual float Boost
{
    get { return boost; }
}
/// <summary>
/// Gets the term infos of this phrase.
/// </summary>
public virtual IList<TermInfo> TermsInfos
{
    get { return termsInfos; }
}
/// <summary>
/// Creates a phrase info from <paramref name="terms"/> with the given <paramref name="boost"/>
/// and a sequence number of 0.
/// </summary>
/// <param name="terms">the terms that make up the phrase</param>
/// <param name="boost">query boost</param>
public WeightedPhraseInfo(IList<TermInfo> terms, float boost)
: this(terms, boost, 0)
{
}
/// <summary>
/// Creates a phrase info from <paramref name="terms"/>. Terms at consecutive positions
/// (difference of exactly 1) share one offset range; any other position step starts a new range.
/// </summary>
/// <param name="terms">the terms that make up the phrase; the first element is always read</param>
/// <param name="boost">query boost</param>
/// <param name="seqnum">sequence number stored with this phrase</param>
public WeightedPhraseInfo(IList<TermInfo> terms, float boost, int seqnum)
{
    this.boost = boost;
    this.seqnum = seqnum;

    // We keep TermInfos for further operations
    termsInfos = new List<TermInfo>(terms);
    termsOffsets = new List<Toffs>(terms.Count);

    // 'span' always refers to the most recently added range.
    TermInfo previous = terms[0];
    Toffs span = new Toffs(previous.StartOffset, previous.EndOffset);
    termsOffsets.Add(span);

    for (int i = 1; i < terms.Count; i++)
    {
        TermInfo current = terms[i];
        if (current.Position - previous.Position == 1)
        {
            // Adjacent position: stretch the current range to cover this term.
            span.EndOffset = current.EndOffset;
        }
        else
        {
            span = new Toffs(current.StartOffset, current.EndOffset);
            termsOffsets.Add(span);
        }
        previous = current;
    }
}
/// <summary>
/// Merging constructor. Note that this just grabs seqnum from the first info.
/// </summary>
/// <remarks>
/// The boost of the new instance is the sum of the boosts of all elements of
/// <paramref name="toMerge"/>. The offset ranges of all elements are walked through a
/// MergedEnumerator and ranges whose start is at or before the end of the range being built
/// are folded into it.
/// </remarks>
/// <param name="toMerge">the infos to merge; must contain at least one element</param>
/// <exception cref="ArgumentException">if <paramref name="toMerge"/> contains no element</exception>
public WeightedPhraseInfo(ICollection<WeightedPhraseInfo> toMerge)
{
// One slot per merged info; the whole array is handed to IOUtils.Dispose in the finally block.
IEnumerator<Toffs>[] allToffs = new IEnumerator<Toffs>[toMerge.Count];
try
{
// Pretty much the same idea as merging FieldPhraseLists:
// Step 1. Sort by startOffset, endOffset
// While we are here merge the boosts and termInfos
using (IEnumerator<WeightedPhraseInfo> toMergeItr = toMerge.GetEnumerator())
{
if (!toMergeItr.MoveNext())
{
throw new ArgumentException("toMerge must contain at least one WeightedPhraseInfo.");
}
WeightedPhraseInfo first = toMergeItr.Current;
// NOTE(review): termsInfos starts empty and only the infos visited in the loop below add
// their TermInfos, so first.termsInfos is not part of the merged result — confirm intended.
termsInfos = new List<TermInfo>();
seqnum = first.seqnum;
boost = first.boost;
allToffs[0] = first.termsOffsets.GetEnumerator();
int index = 1;
while (toMergeItr.MoveNext())
{
WeightedPhraseInfo info = toMergeItr.Current;
boost += info.boost;
termsInfos.AddRange(info.termsInfos);
allToffs[index++] = info.termsOffsets.GetEnumerator();
}
// Step 2. Walk the sorted list merging overlaps
using (MergedEnumerator<Toffs> itr = new MergedEnumerator<Toffs>(false, allToffs))
{
termsOffsets = new List<Toffs>();
if (!itr.MoveNext())
{
return;
}
// 'work' is the range currently being extended.
Toffs work = itr.Current;
while (itr.MoveNext())
{
Toffs current = itr.Current;
if (current.StartOffset <= work.EndOffset)
{
// NOTE(review): 'work' comes straight from the enumerator; if that hands out the Toffs
// instances owned by the merged infos, this widens them in place — confirm.
work.EndOffset = Math.Max(work.EndOffset, current.EndOffset);
}
else
{
termsOffsets.Add(work);
work = current;
}
}
// Add the range that was still being built when the enumerator ran out.
termsOffsets.Add(work);
}
}
}
finally
{
IOUtils.Dispose(allToffs);
}
}
/// <summary>
/// Gets the start offset of the first offset range.
/// </summary>
public virtual int StartOffset
{
    get { return termsOffsets[0].StartOffset; }
}
/// <summary>
/// Gets the end offset of the last offset range.
/// </summary>
public virtual int EndOffset
{
    get { return termsOffsets[termsOffsets.Count - 1].EndOffset; }
}
/// <summary>
/// Tells whether the offset span of this phrase (<see cref="StartOffset"/> to
/// <see cref="EndOffset"/>) overlaps the offset span of <paramref name="other"/>.
/// </summary>
/// <param name="other">the phrase info to compare against</param>
/// <returns><c>true</c> if any of the four boundary checks below holds</returns>
public virtual bool IsOffsetOverlap(WeightedPhraseInfo other)
{
    int thisStart = StartOffset;
    int thisEnd = EndOffset;
    int otherStart = other.StartOffset;
    int otherEnd = other.EndOffset;

    return (thisStart <= otherStart && otherStart < thisEnd)   // other starts inside this
        || (thisStart < otherEnd && otherEnd <= thisEnd)       // other ends inside this
        || (otherStart <= thisStart && thisStart < otherEnd)   // this starts inside other
        || (otherStart < thisEnd && thisEnd <= otherEnd);      // this ends inside other
}
/// <summary>
/// Formats this phrase as <c>text(boost)(offsets...)</c>, where the offsets are the
/// string forms of each entry of the offset list written back to back.
/// </summary>
public override string ToString()
{
    StringBuilder result = new StringBuilder();
    result.Append(GetText());
    result.Append('(');
    result.Append(Number.ToString(boost));
    result.Append(")(");
    for (int i = 0; i < termsOffsets.Count; i++)
    {
        result.Append(termsOffsets[i]);
    }
    result.Append(')');
    return result.ToString();
}
/// <summary>
/// Gets the sequence number of this phrase.
/// </summary>
public virtual int Seqnum
{
    get { return seqnum; }
}
/// <summary>
/// Orders phrase infos by <see cref="StartOffset"/>, then <see cref="EndOffset"/>,
/// then <see cref="Boost"/>.
/// </summary>
/// <param name="other">the phrase info to compare with</param>
/// <returns>
/// The offset difference when start or end offsets differ; otherwise -1, 0 or 1 according to
/// the boosts. Boosts that cannot be ordered (NaN involved) compare as 0.
/// </returns>
public virtual int CompareTo(WeightedPhraseInfo other)
{
    int diff = StartOffset - other.StartOffset;
    if (diff != 0)
    {
        return diff;
    }
    diff = EndOffset - other.EndOffset;
    if (diff != 0)
    {
        return diff;
    }

    // Compare the boosts directly instead of taking Math.Sign of their difference:
    // Math.Sign(float) throws ArithmeticException when the difference is NaN
    // (a NaN boost, or two infinite boosts of the same sign).
    float thisBoost = Boost;
    float otherBoost = other.Boost;
    if (thisBoost < otherBoost)
    {
        return -1;
    }
    if (thisBoost > otherBoost)
    {
        return 1;
    }
    return 0;
}
/// <summary>
/// Computes a hash from <see cref="StartOffset"/>, <see cref="EndOffset"/> and the bit
/// pattern of <see cref="Boost"/> (widened to double), the same members compared by Equals.
/// </summary>
public override int GetHashCode()
{
    const int Prime = 31;

    int hash = 1;
    hash = hash * Prime + StartOffset;
    hash = hash * Prime + EndOffset;

    // Fold the 64-bit pattern of the boost into 32 bits by xor-ing its two halves.
    long boostBits = J2N.BitConversion.DoubleToInt64Bits(Boost);
    int foldedBoost = (int)(boostBits ^ TripleShift(boostBits, 32));
    hash = hash * Prime + foldedBoost;

    return hash;
}
// LUCENENET NOTE: equivalent of Java's unsigned right shift (>>>) for a 64-bit value.
// The shift is carried out on the ulong reinterpretation of n so that zeros, not copies of
// the sign bit, are shifted in from the left. Casting through uint instead would truncate the
// value to 32 bits first, and an int-typed correction term such as (2 << ~s) has its shift
// count masked to 5 bits, so neither gives the right answer for negative input.
// As with any 64-bit shift, only the low 6 bits of s are used.
private static long TripleShift(long n, int s)
{
    // unchecked: the long <-> ulong casts are bit reinterpretations and must not throw
    // even if the project is compiled with overflow checking enabled.
    return unchecked((long)((ulong)n >> s));
}
/// <summary>
/// Two phrase infos are equal when they have the same runtime type and the same
/// <see cref="StartOffset"/>, <see cref="EndOffset"/> and <see cref="Boost"/>.
/// </summary>
/// <param name="obj">the object to compare with</param>
public override bool Equals(object obj)
{
    if (ReferenceEquals(this, obj))
    {
        return true;
    }
    if (obj == null || GetType() != obj.GetType())
    {
        return false;
    }

    WeightedPhraseInfo that = (WeightedPhraseInfo)obj;
    return StartOffset == that.StartOffset
        && EndOffset == that.EndOffset
        && Boost == that.Boost;
}
/// <summary>
/// Term offsets (start + end). The start offset is fixed at construction;
/// the end offset can be changed afterwards.
/// </summary>
public class Toffs : IComparable<Toffs>
{
    private int startOffset;
    private int endOffset;

    /// <summary>
    /// Creates an offset pair.
    /// </summary>
    /// <param name="startOffset">the start offset</param>
    /// <param name="endOffset">the end offset</param>
    public Toffs(int startOffset, int endOffset)
    {
        this.startOffset = startOffset;
        this.endOffset = endOffset;
    }

    /// <summary>
    /// Gets the start offset.
    /// </summary>
    public virtual int StartOffset
    {
        get { return startOffset; }
    }

    /// <summary>
    /// Gets or sets the end offset.
    /// </summary>
    public virtual int EndOffset
    {
        get { return endOffset; }
        set { endOffset = value; }
    }

    /// <summary>
    /// Orders by start offset, then by end offset; the result is the difference
    /// of the first pair of offsets that differ, or 0.
    /// </summary>
    public virtual int CompareTo(Toffs other)
    {
        int byStart = StartOffset - other.StartOffset;
        return byStart != 0 ? byStart : EndOffset - other.EndOffset;
    }

    /// <summary>
    /// Hash built from the start and end offsets.
    /// </summary>
    public override int GetHashCode()
    {
        const int Prime = 31;

        int hash = 1;
        hash = hash * Prime + StartOffset;
        hash = hash * Prime + EndOffset;
        return hash;
    }

    /// <summary>
    /// Equal when <paramref name="obj"/> has the same runtime type and the same offsets.
    /// </summary>
    public override bool Equals(object obj)
    {
        if (ReferenceEquals(this, obj))
        {
            return true;
        }
        if (obj == null || GetType() != obj.GetType())
        {
            return false;
        }

        Toffs that = (Toffs)obj;
        return StartOffset == that.StartOffset && EndOffset == that.EndOffset;
    }

    /// <summary>
    /// Formats the pair as <c>(start,end)</c>.
    /// </summary>
    public override string ToString()
    {
        return "(" + startOffset + "," + endOffset + ")";
    }
}
}
}
}