/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.lucene.index.Impact;
import org.apache.lucene.index.Impacts;
import org.apache.lucene.index.ImpactsSource;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.util.FixedBitSet;
/**
* Find all slop-valid position-combinations (matches)
* encountered while traversing/hopping the PhrasePositions.
* <br> The sloppy frequency contribution of a match depends on the distance:
* <br> - highest freq for distance=0 (exact match).
* <br> - freq gets lower as distance gets higher.
* <br>Example: for query "a b"~2, a document "x a b a y" can be matched twice:
* once for "a b" (distance=0), and once for "b a" (distance=2).
* <br>Possibly not all valid combinations are encountered, because for efficiency
* we always propagate the least PhrasePosition. This allows the matcher to be
* based on a PriorityQueue and to move forward faster.
* As a result, for example, document "a b c b a"
* would score differently for queries "a b c"~4 and "c b a"~4, although
* they really are equivalent.
* Similarly, for doc "a b c b a f g", query "c b"~2
* would get the same score as "g f"~2, although "c b"~2 could be matched twice.
* We may want to fix this in the future (currently not, for performance reasons).
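* <br>Concretely, a match of length (distance) d contributes a sloppy weight of
* 1/(1+d) (see {@link #sloppyWeight()}); in the "a b"~2 example above, the two
* matches contribute 1/(1+0) = 1.0 and 1/(1+2) = 1/3.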
*/
final class SloppyPhraseMatcher extends PhraseMatcher {
private final PhrasePositions[] phrasePositions;
private final int slop;
private final int numPostings;
private final PhraseQueue pq; // for advancing min position
private final boolean captureLeadMatch;
private final DocIdSetIterator approximation;
private final ImpactsDISI impactsApproximation;
private int end; // current largest phrase position
private int leadPosition;
private int leadOffset;
private int leadEndOffset;
private int leadOrd;
private boolean hasRpts; // flag indicating that there are repetitions (as checked in first candidate doc)
private boolean checkedRpts; // flag to only check for repetitions in first candidate doc
private boolean hasMultiTermRpts; // flag indicating that repeating pps include multi-term postings
private PhrasePositions[][] rptGroups; // in each group are PPs that repeat each other (i.e. same term), sorted by (query) offset
private PhrasePositions[] rptStack; // temporary stack for switching colliding repeating pps
private boolean positioned;
private int matchLength;
SloppyPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, int slop, ScoreMode scoreMode, SimScorer scorer, float matchCost, boolean captureLeadMatch) {
super(matchCost);
this.slop = slop;
this.numPostings = postings.length;
this.captureLeadMatch = captureLeadMatch;
pq = new PhraseQueue(postings.length);
phrasePositions = new PhrasePositions[postings.length];
for (int i = 0; i < postings.length; ++i) {
phrasePositions[i] = new PhrasePositions(postings[i].postings, postings[i].position, i, postings[i].terms);
}
approximation = ConjunctionDISI.intersectIterators(Arrays.stream(postings).map(p -> p.postings).collect(Collectors.toList()));
// What would be a good upper bound of the sloppy frequency? A sum of the
// sub frequencies would be correct, but it is usually so much higher than
// the actual sloppy frequency that it doesn't help skip irrelevant
// documents. As a consequence for now, sloppy phrase queries use dummy
// impacts:
final ImpactsSource impactsSource = new ImpactsSource() {
@Override
public Impacts getImpacts() throws IOException {
return new Impacts() {
@Override
public int numLevels() {
return 1;
}
@Override
public List<Impact> getImpacts(int level) {
return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L));
}
@Override
public int getDocIdUpTo(int level) {
return DocIdSetIterator.NO_MORE_DOCS;
}
};
}
@Override
public void advanceShallow(int target) throws IOException {}
};
impactsApproximation = new ImpactsDISI(approximation, impactsSource, scorer);
}
@Override
DocIdSetIterator approximation() {
return approximation;
}
@Override
ImpactsDISI impactsApproximation() {
return impactsApproximation;
}
@Override
float maxFreq() throws IOException {
// every term position in each postings list can be at the head of at most
// one matching phrase, so the maximum possible phrase freq is the sum of
// the freqs of the postings lists.
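// For example (illustrative), for a two-term phrase whose postings have freqs
// 3 and 2 in this doc, the bound computed below is 3 + 2 = 5.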
float maxFreq = 0;
for (PhrasePositions phrasePosition : phrasePositions) {
maxFreq += phrasePosition.postings.freq();
}
return maxFreq;
}
@Override
public void reset() throws IOException {
this.positioned = initPhrasePositions();
this.matchLength = Integer.MAX_VALUE;
this.leadPosition = Integer.MAX_VALUE;
}
@Override
float sloppyWeight() {
return 1f / (1f + matchLength);
}
@Override
public boolean nextMatch() throws IOException {
if (!positioned) {
return false;
}
PhrasePositions pp = pq.pop();
assert pp != null; // if the pq is not full, then positioned == false
captureLead(pp);
matchLength = end - pp.position;
int next = pq.top().position;
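// advance the lead (minimal) pp; while it stays at or below the next-smallest
// position we are still shrinking the current window, and once it moves past
// that position the window is minimal and can be tested against the slop.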
while (advancePP(pp)) {
if (hasRpts && !advanceRpts(pp)) {
break; // pps exhausted
}
if (pp.position > next) { // done minimizing current match-length
pq.add(pp);
if (matchLength <= slop) {
return true;
}
pp = pq.pop();
next = pq.top().position;
assert pp != null; // if the pq is not full, then positioned == false
matchLength = end - pp.position;
} else {
int matchLength2 = end - pp.position;
if (matchLength2 < matchLength) {
matchLength = matchLength2;
}
}
captureLead(pp);
}
positioned = false;
return matchLength <= slop;
}
private void captureLead(PhrasePositions pp) throws IOException {
if (captureLeadMatch == false) {
return;
}
leadOrd = pp.ord;
leadPosition = pp.position + pp.offset;
leadOffset = pp.postings.startOffset();
leadEndOffset = pp.postings.endOffset();
}
@Override
public int startPosition() {
// when a match is detected, the top postings is advanced until it has moved
// beyond its successor, to ensure that the match is of minimal width. This
// means that we need to record the lead position before it is advanced.
// However, the priority queue doesn't guarantee that the top postings is in fact the
// earliest in the list, so we need to cycle through all terms to check.
// this is slow, but Matches is slow anyway...
int leadPosition = this.leadPosition;
for (PhrasePositions pp : phrasePositions) {
leadPosition = Math.min(leadPosition, pp.position + pp.offset);
}
return leadPosition;
}
@Override
public int endPosition() {
int endPosition = leadPosition;
for (PhrasePositions pp : phrasePositions) {
if (pp.ord != leadOrd) {
endPosition = Math.max(endPosition, pp.position + pp.offset);
}
}
return endPosition;
}
@Override
public int startOffset() throws IOException {
// when a match is detected, the top postings is advanced until it has moved
// beyond its successor, to ensure that the match is of minimal width. This
// means that we need to record the lead offset before it is advanced.
// However, the priority queue doesn't guarantee that the top postings is in fact the
// earliest in the list, so we need to cycle through all terms to check
// this is slow, but Matches is slow anyway...
int leadOffset = this.leadOffset;
for (PhrasePositions pp : phrasePositions) {
leadOffset = Math.min(leadOffset, pp.postings.startOffset());
}
return leadOffset;
}
@Override
public int endOffset() throws IOException {
int endOffset = leadEndOffset;
for (PhrasePositions pp : phrasePositions) {
if (pp.ord != leadOrd) {
endOffset = Math.max(endOffset, pp.postings.endOffset());
}
}
return endOffset;
}
/** advance a PhrasePosition and update 'end', return false if exhausted */
private boolean advancePP(PhrasePositions pp) throws IOException {
if (!pp.nextPosition()) {
return false;
}
if (pp.position > end) {
end = pp.position;
}
return true;
}
/** pp was just advanced. If that caused a repeater collision, resolve by advancing the lesser
* of the two colliding pps. Note that there can only be one collision, since by
* initialization there were no collisions before pp was advanced. */
private boolean advanceRpts(PhrasePositions pp) throws IOException {
if (pp.rptGroup < 0) {
return true; // not a repeater
}
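// Illustrative walk-through: for "a x a"~2 both pps for "a" repeat; if the
// just-advanced pp now sits on the same document position (tpPos) as its
// group mate, the lesser of the two is advanced until the collision clears,
// and any other pps advanced along the way are re-queued below.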
PhrasePositions[] rg = rptGroups[pp.rptGroup];
FixedBitSet bits = new FixedBitSet(rg.length); // for re-queuing after collisions are resolved
int k0 = pp.rptInd;
int k;
while((k=collide(pp)) >= 0) {
pp = lesser(pp, rg[k]); // always advance the lesser of the (only) two colliding pps
if (!advancePP(pp)) {
return false; // exhausted
}
if (k != k0) { // careful: mark only those currently in the queue
bits = FixedBitSet.ensureCapacity(bits, k);
bits.set(k); // mark that pp2 needs to be re-queued
}
}
// collisions resolved, now re-queue
// empty (partially) the queue until seeing all pps advanced for resolving collisions
int n = 0;
// TODO would be good if we can avoid calling cardinality() in each iteration!
int numBits = bits.length(); // upper bound (exclusive) on bit indices we may have set
while (bits.cardinality() > 0) {
PhrasePositions pp2 = pq.pop();
rptStack[n++] = pp2;
if (pp2.rptGroup >= 0
&& pp2.rptInd < numBits // this bit may not have been set
&& bits.get(pp2.rptInd)) {
bits.clear(pp2.rptInd);
}
}
// add back to queue
for (int i=n-1; i>=0; i--) {
pq.add(rptStack[i]);
}
return true;
}
/** compare two pps, but only by position and offset */
private PhrasePositions lesser(PhrasePositions pp, PhrasePositions pp2) {
if (pp.position < pp2.position ||
(pp.position == pp2.position && pp.offset < pp2.offset)) {
return pp;
}
return pp2;
}
/** index of a pp2 colliding with pp, or -1 if none */
private int collide(PhrasePositions pp) {
int tpPos = tpPos(pp);
PhrasePositions[] rg = rptGroups[pp.rptGroup];
for (int i=0; i<rg.length; i++) {
PhrasePositions pp2 = rg[i];
if (pp2 != pp && tpPos(pp2) == tpPos) {
return pp2.rptInd;
}
}
return -1;
}
/**
* Initialize PhrasePositions in place.
* A one-time initialization for this scorer (on the first doc matching all terms):
* <ul>
* <li>Check if there are repetitions
* <li>If there are, find groups of repetitions.
* </ul>
* Examples:
* <ol>
* <li>no repetitions: <b>"ho my"~2</b>
* <li>repetitions: <b>"ho my my"~2</b>
* <li>repetitions: <b>"my ho my"~2</b>
* </ol>
* @return false if PPs are exhausted (and so current doc will not be a match)
*/
private boolean initPhrasePositions() throws IOException {
end = Integer.MIN_VALUE;
if (!checkedRpts) {
return initFirstTime();
}
if (!hasRpts) {
initSimple();
return true; // PPs available
}
return initComplex();
}
/** no repeats: simplest case, and most common. It is important to keep this piece of the code simple and efficient */
private void initSimple() throws IOException {
//System.err.println("initSimple: doc: "+min.doc);
pq.clear();
// position pps and build queue from list
for (PhrasePositions pp : phrasePositions) {
pp.firstPosition();
if (pp.position > end) {
end = pp.position;
}
pq.add(pp);
}
}
/** with repeats: not so simple. */
private boolean initComplex() throws IOException {
//System.err.println("initComplex: doc: "+min.doc);
placeFirstPositions();
if (!advanceRepeatGroups()) {
return false; // PPs exhausted
}
fillQueue();
return true; // PPs available
}
/** move all PPs to their first position */
private void placeFirstPositions() throws IOException {
for (PhrasePositions pp : phrasePositions) {
pp.firstPosition();
}
}
/** Fill the queue (all pps are already placed) */
private void fillQueue() {
pq.clear();
for (PhrasePositions pp : phrasePositions) { // pps are already placed; find the max position and enqueue
if (pp.position > end) {
end = pp.position;
}
pq.add(pp);
}
}
/** At initialization (each doc), each repetition group is sorted by (query) offset.
* This provides the start condition: no collisions.
* <p>Case 1: no multi-term repeats<br>
* It is sufficient to advance each pp in the group by one less than its group index.
* So the lesser pp is not advanced, the 2nd one is advanced once, the 3rd one twice, etc.
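* Example (illustrative): for "my my my"~2 on a doc beginning "my a my b my",
* after this step the three pps sit on document positions 0, 2 and 4.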
* <p>Case 2: multi-term repeats<br>
* More involved, since some pps may not collide at all: collisions are detected
* explicitly (see collide()) and resolved by advancing the lesser of the two
* colliding pps until the group is collision-free.
*
* @return false if PPs are exhausted.
*/
private boolean advanceRepeatGroups() throws IOException {
for (PhrasePositions[] rg: rptGroups) {
if (hasMultiTermRpts) {
// more involved, some may not collide
int incr;
for (int i=0; i<rg.length; i+=incr) {
incr = 1;
PhrasePositions pp = rg[i];
int k;
while((k=collide(pp)) >= 0) {
PhrasePositions pp2 = lesser(pp, rg[k]);
if (!advancePP(pp2)) { // at initialization always advance pp with higher offset
return false; // exhausted
}
if (pp2.rptInd < i) { // should not happen?
incr = 0;
break;
}
}
}
} else {
// simpler, we know exactly how much to advance
for (int j=1; j<rg.length; j++) {
for (int k=0; k<j; k++) {
if (!rg[j].nextPosition()) {
return false; // PPs exhausted
}
}
}
}
}
return true; // PPs available
}
/** initialize with checking for repeats. Heavy work, but done only for the first candidate doc.<p>
* If there are repetitions, check if multi-term postings (MTP) are involved.<p>
* Without MTP, once PPs are placed in the first candidate doc, repeats (and groups) are visible.<br>
* With MTP, a more complex check is needed, up-front, as there may be "hidden collisions".<br>
* For example P1 has {A,B}, P2 has {B,C}, and the first doc is: "A C B". At start, P1 would point
* to "A", P2 to "C", and it will not be identified that P1 and P2 are repetitions of each other.<p>
* The more complex initialization has two parts:<br>
* (1) identification of repetition groups.<br>
* (2) advancing repeat groups at the start of the doc.<br>
* For (1), a possible solution is to just create a single repetition group,
* made of all repeating pps. But this would slow down the check for collisions,
* as all pps would need to be checked. Instead, we compute "connected regions"
* on the bipartite graph of postings and terms.
*/
private boolean initFirstTime() throws IOException {
//System.err.println("initFirstTime: doc: "+min.doc);
checkedRpts = true;
placeFirstPositions();
LinkedHashMap<Term,Integer> rptTerms = repeatingTerms();
hasRpts = !rptTerms.isEmpty();
if (hasRpts) {
rptStack = new PhrasePositions[numPostings]; // needed with repetitions
ArrayList<ArrayList<PhrasePositions>> rgs = gatherRptGroups(rptTerms);
sortRptGroups(rgs);
if (!advanceRepeatGroups()) {
return false; // PPs exhausted
}
}
fillQueue();
return true; // PPs available
}
/** sort each repetition group by (query) offset.
* Done only once (for the first doc) and allows faster initialization for each subsequent doc. */
private void sortRptGroups(ArrayList<ArrayList<PhrasePositions>> rgs) {
rptGroups = new PhrasePositions[rgs.size()][];
Comparator<PhrasePositions> cmprtr = Comparator.comparingInt(pp -> pp.offset);
for (int i=0; i<rptGroups.length; i++) {
PhrasePositions[] rg = rgs.get(i).toArray(new PhrasePositions[0]);
Arrays.sort(rg, cmprtr);
rptGroups[i] = rg;
for (int j=0; j<rg.length; j++) {
rg[j].rptInd = j; // we use this index for efficient re-queuing
}
}
}
/** Detect repetition groups. Done once - for first doc */
private ArrayList<ArrayList<PhrasePositions>> gatherRptGroups(LinkedHashMap<Term,Integer> rptTerms) throws IOException {
PhrasePositions[] rpp = repeatingPPs(rptTerms);
ArrayList<ArrayList<PhrasePositions>> res = new ArrayList<>();
if (!hasMultiTermRpts) {
// simpler - no multi-terms - can base on positions in first doc
for (int i=0; i<rpp.length; i++) {
PhrasePositions pp = rpp[i];
if (pp.rptGroup >=0) continue; // already marked as a repetition
int tpPos = tpPos(pp);
for (int j=i+1; j<rpp.length; j++) {
PhrasePositions pp2 = rpp[j];
if (
pp2.rptGroup >=0 // already marked as a repetition
|| pp2.offset == pp.offset // not a repetition: the two PPs are at the same (query) offset!
|| tpPos(pp2) != tpPos) { // not a repetition
continue;
}
// a repetition
int g = pp.rptGroup;
if (g < 0) {
g = res.size();
pp.rptGroup = g;
ArrayList<PhrasePositions> rl = new ArrayList<>(2);
rl.add(pp);
res.add(rl);
}
pp2.rptGroup = g;
res.get(g).add(pp2);
}
}
} else {
// more involved - has multi-terms
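// Illustrative example (cf. initFirstTime's javadoc): with P1={A,B} and
// P2={B,C}, only B repeats, so both pps' bit-sets have B's ordinal set; the
// bit-sets intersect and are unioned, placing P1 and P2 in the same
// repetition group even though they may never collide in the first doc.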
ArrayList<HashSet<PhrasePositions>> tmp = new ArrayList<>();
ArrayList<FixedBitSet> bb = ppTermsBitSets(rpp, rptTerms);
unionTermGroups(bb);
HashMap<Term,Integer> tg = termGroups(rptTerms, bb);
HashSet<Integer> distinctGroupIDs = new HashSet<>(tg.values());
for (int i=0; i<distinctGroupIDs.size(); i++) {
tmp.add(new HashSet<PhrasePositions>());
}
for (PhrasePositions pp : rpp) {
for (Term t: pp.terms) {
if (rptTerms.containsKey(t)) {
int g = tg.get(t);
tmp.get(g).add(pp);
assert pp.rptGroup==-1 || pp.rptGroup==g;
pp.rptGroup = g;
}
}
}
for (HashSet<PhrasePositions> hs : tmp) {
res.add(new ArrayList<>(hs));
}
}
return res;
}
/** Actual position in doc of a PhrasePosition; relies on the invariant position = tpPos - offset */
private final int tpPos(PhrasePositions pp) {
return pp.position + pp.offset;
}
/** find repeating terms and assign them ordinal values */
private LinkedHashMap<Term,Integer> repeatingTerms() {
LinkedHashMap<Term,Integer> tord = new LinkedHashMap<>();
HashMap<Term,Integer> tcnt = new HashMap<>();
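// Illustrative example: for query "my ho my", "my" reaches count 2 and gets
// ordinal 0, so the result is {my -> 0}; the non-repeating "ho" is left out.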
for (PhrasePositions pp : phrasePositions) {
for (Term t : pp.terms) {
Integer cnt = tcnt.compute(t, (key, old) -> old == null ? 1 : 1 + old);
if (cnt==2) {
tord.put(t,tord.size());
}
}
}
return tord;
}
/** find repeating pps, and for each, if it has multi-terms, update this.hasMultiTermRpts */
private PhrasePositions[] repeatingPPs(HashMap<Term,Integer> rptTerms) {
ArrayList<PhrasePositions> rp = new ArrayList<>();
for (PhrasePositions pp : phrasePositions) {
for (Term t : pp.terms) {
if (rptTerms.containsKey(t)) {
rp.add(pp);
hasMultiTermRpts |= (pp.terms.length > 1);
break;
}
}
}
return rp.toArray(new PhrasePositions[0]);
}
/** bit-sets - for each repeating pp, for each of its repeating terms, the term's ordinal value is set */
private ArrayList<FixedBitSet> ppTermsBitSets(PhrasePositions[] rpp, HashMap<Term,Integer> tord) {
ArrayList<FixedBitSet> bb = new ArrayList<>(rpp.length);
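// Illustrative example: if the repeating terms are A, B, C with ordinals
// 0, 1, 2, a pp over terms {A, B} produces a bit-set with bits 0 and 1 set.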
for (PhrasePositions pp : rpp) {
FixedBitSet b = new FixedBitSet(tord.size());
Integer ord;
for (Term t: pp.terms) {
if ((ord=tord.get(t))!=null) {
b.set(ord);
}
}
bb.add(b);
}
return bb;
}
/** union (term group) bit-sets until they are disjoint (O(n^2)); each resulting group has different terms */
private void unionTermGroups(ArrayList<FixedBitSet> bb) {
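// Illustrative sketch: starting from bb = [{0,1}, {1,2}, {3}], the first two
// sets intersect (bit 1) and are unioned into {0,1,2}; {3} stays separate,
// leaving two disjoint term groups.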
int incr;
for (int i=0; i<bb.size()-1; i+=incr) {
incr = 1;
int j = i+1;
while (j<bb.size()) {
if (bb.get(i).intersects(bb.get(j))) {
bb.get(i).or(bb.get(j));
bb.remove(j);
incr = 0;
} else {
++j;
}
}
}
}
/** map each term to the single group that contains it */
private HashMap<Term,Integer> termGroups(LinkedHashMap<Term,Integer> tord, ArrayList<FixedBitSet> bb) throws IOException {
HashMap<Term,Integer> tg = new HashMap<>();
Term[] t = tord.keySet().toArray(new Term[0]);
for (int i=0; i<bb.size(); i++) { // i is the group no.
FixedBitSet bits = bb.get(i);
for (int ord = bits.nextSetBit(0);
    ord != DocIdSetIterator.NO_MORE_DOCS;
    ord = ord + 1 >= bits.length() ? DocIdSetIterator.NO_MORE_DOCS : bits.nextSetBit(ord + 1)) {
tg.put(t[ord],i);
}
}
return tg;
}
}