| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.search; |
| |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collections; |
| import java.util.Comparator; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.LinkedHashMap; |
| import java.util.List; |
| import java.util.stream.Collectors; |
| |
| import org.apache.lucene.index.Impact; |
| import org.apache.lucene.index.Impacts; |
| import org.apache.lucene.index.ImpactsSource; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.search.similarities.Similarity.SimScorer; |
| import org.apache.lucene.util.FixedBitSet; |
| |
| /** |
| * Find all slop-valid position-combinations (matches) |
| * encountered while traversing/hopping the PhrasePositions. |
| * <br> The sloppy frequency contribution of a match depends on the distance: |
| * <br> - highest freq for distance=0 (exact match). |
| * <br> - freq gets lower as distance gets higher. |
| * <br>Example: for query "a b"~2, a document "x a b a y" can be matched twice: |
| * once for "a b" (distance=0), and once for "b a" (distance=2). |
 * <br>Possibly not all valid combinations are encountered, because for efficiency
 * we always propagate the least PhrasePosition. This allows the algorithm to be based on a
 * PriorityQueue and to move forward faster.
 * As a result, for example, document "a b c b a"
 * would score differently for queries "a b c"~4 and "c b a"~4, although
 * they really are equivalent.
| * Similarly, for doc "a b c b a f g", query "c b"~2 |
| * would get same score as "g f"~2, although "c b"~2 could be matched twice. |
| * We may want to fix this in the future (currently not, for performance reasons). |
| */ |
| final class SloppyPhraseMatcher extends PhraseMatcher { |
| |
| private final PhrasePositions[] phrasePositions; |
| |
| private final int slop; |
| private final int numPostings; |
| private final PhraseQueue pq; // for advancing min position |
| private final boolean captureLeadMatch; |
| |
| private final DocIdSetIterator approximation; |
| private final ImpactsDISI impactsApproximation; |
| |
| private int end; // current largest phrase position |
| |
| private int leadPosition; |
| private int leadOffset; |
| private int leadEndOffset; |
| private int leadOrd; |
| |
| private boolean hasRpts; // flag indicating that there are repetitions (as checked in first candidate doc) |
| private boolean checkedRpts; // flag to only check for repetitions in first candidate doc |
| private boolean hasMultiTermRpts; // |
| private PhrasePositions[][] rptGroups; // in each group are PPs that repeats each other (i.e. same term), sorted by (query) offset |
| private PhrasePositions[] rptStack; // temporary stack for switching colliding repeating pps |
| |
| private boolean positioned; |
| private int matchLength; |
| |
| SloppyPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, int slop, ScoreMode scoreMode, SimScorer scorer, float matchCost, boolean captureLeadMatch) { |
| super(matchCost); |
| this.slop = slop; |
| this.numPostings = postings.length; |
| this.captureLeadMatch = captureLeadMatch; |
| pq = new PhraseQueue(postings.length); |
| phrasePositions = new PhrasePositions[postings.length]; |
| for (int i = 0; i < postings.length; ++i) { |
| phrasePositions[i] = new PhrasePositions(postings[i].postings, postings[i].position, i, postings[i].terms); |
| } |
| |
| approximation = ConjunctionDISI.intersectIterators(Arrays.stream(postings).map(p -> p.postings).collect(Collectors.toList())); |
| // What would be a good upper bound of the sloppy frequency? A sum of the |
| // sub frequencies would be correct, but it is usually so much higher than |
| // the actual sloppy frequency that it doesn't help skip irrelevant |
| // documents. As a consequence for now, sloppy phrase queries use dummy |
| // impacts: |
| final ImpactsSource impactsSource = new ImpactsSource() { |
| @Override |
| public Impacts getImpacts() throws IOException { |
| return new Impacts() { |
| |
| @Override |
| public int numLevels() { |
| return 1; |
| } |
| |
| @Override |
| public List<Impact> getImpacts(int level) { |
| return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L)); |
| } |
| |
| @Override |
| public int getDocIdUpTo(int level) { |
| return DocIdSetIterator.NO_MORE_DOCS; |
| } |
| }; |
| } |
| |
| @Override |
| public void advanceShallow(int target) throws IOException {} |
| }; |
| impactsApproximation = new ImpactsDISI(approximation, impactsSource, scorer); |
| } |
| |
| @Override |
| DocIdSetIterator approximation() { |
| return approximation; |
| } |
| |
| @Override |
| ImpactsDISI impactsApproximation() { |
| return impactsApproximation; |
| } |
| |
| @Override |
| float maxFreq() throws IOException { |
| // every term position in each postings list can be at the head of at most |
| // one matching phrase, so the maximum possible phrase freq is the sum of |
| // the freqs of the postings lists. |
| float maxFreq = 0; |
| for (PhrasePositions phrasePosition : phrasePositions) { |
| maxFreq += phrasePosition.postings.freq(); |
| } |
| return maxFreq; |
| } |
| |
| @Override |
| public void reset() throws IOException { |
| this.positioned = initPhrasePositions(); |
| this.matchLength = Integer.MAX_VALUE; |
| this.leadPosition = Integer.MAX_VALUE; |
| } |
| |
| @Override |
| float sloppyWeight() { |
| return 1f / (1f + matchLength); |
| } |
| |
| @Override |
| public boolean nextMatch() throws IOException { |
| if (!positioned) { |
| return false; |
| } |
| PhrasePositions pp = pq.pop(); |
| assert pp != null; // if the pq is not full, then positioned == false |
| captureLead(pp); |
| matchLength = end - pp.position; |
| int next = pq.top().position; |
| while (advancePP(pp)) { |
| if (hasRpts && !advanceRpts(pp)) { |
| break; // pps exhausted |
| } |
| if (pp.position > next) { // done minimizing current match-length |
| pq.add(pp); |
| if (matchLength <= slop) { |
| return true; |
| } |
| pp = pq.pop(); |
| next = pq.top().position; |
| assert pp != null; // if the pq is not full, then positioned == false |
| matchLength = end - pp.position; |
| } else { |
| int matchLength2 = end - pp.position; |
| if (matchLength2 < matchLength) { |
| matchLength = matchLength2; |
| } |
| } |
| captureLead(pp); |
| } |
| positioned = false; |
| return matchLength <= slop; |
| } |
| |
| private void captureLead(PhrasePositions pp) throws IOException { |
| if (captureLeadMatch == false) { |
| return; |
| } |
| leadOrd = pp.ord; |
| leadPosition = pp.position + pp.offset; |
| leadOffset = pp.postings.startOffset(); |
| leadEndOffset = pp.postings.endOffset(); |
| } |
| |
| @Override |
| public int startPosition() { |
| // when a match is detected, the top postings is advanced until it has moved |
| // beyond its successor, to ensure that the match is of minimal width. This |
| // means that we need to record the lead position before it is advanced. |
| // However, the priority queue doesn't guarantee that the top postings is in fact the |
| // earliest in the list, so we need to cycle through all terms to check. |
| // this is slow, but Matches is slow anyway... |
| int leadPosition = this.leadPosition; |
| for (PhrasePositions pp : phrasePositions) { |
| leadPosition = Math.min(leadPosition, pp.position + pp.offset); |
| } |
| return leadPosition; |
| } |
| |
| @Override |
| public int endPosition() { |
| int endPosition = leadPosition; |
| for (PhrasePositions pp : phrasePositions) { |
| if (pp.ord != leadOrd) { |
| endPosition = Math.max(endPosition, pp.position + pp.offset); |
| } |
| } |
| return endPosition; |
| } |
| |
| @Override |
| public int startOffset() throws IOException { |
| // when a match is detected, the top postings is advanced until it has moved |
| // beyond its successor, to ensure that the match is of minimal width. This |
| // means that we need to record the lead offset before it is advanced. |
| // However, the priority queue doesn't guarantee that the top postings is in fact the |
| // earliest in the list, so we need to cycle through all terms to check |
| // this is slow, but Matches is slow anyway... |
| int leadOffset = this.leadOffset; |
| for (PhrasePositions pp : phrasePositions) { |
| leadOffset = Math.min(leadOffset, pp.postings.startOffset()); |
| } |
| return leadOffset; |
| } |
| |
| @Override |
| public int endOffset() throws IOException { |
| int endOffset = leadEndOffset; |
| for (PhrasePositions pp : phrasePositions) { |
| if (pp.ord != leadOrd) { |
| endOffset = Math.max(endOffset, pp.postings.endOffset()); |
| } |
| } |
| return endOffset; |
| } |
| |
| /** advance a PhrasePosition and update 'end', return false if exhausted */ |
| private boolean advancePP(PhrasePositions pp) throws IOException { |
| if (!pp.nextPosition()) { |
| return false; |
| } |
| if (pp.position > end) { |
| end = pp.position; |
| } |
| return true; |
| } |
| |
| /** pp was just advanced. If that caused a repeater collision, resolve by advancing the lesser |
| * of the two colliding pps. Note that there can only be one collision, as by the initialization |
| * there were no collisions before pp was advanced. */ |
| private boolean advanceRpts(PhrasePositions pp) throws IOException { |
| if (pp.rptGroup < 0) { |
| return true; // not a repeater |
| } |
| PhrasePositions[] rg = rptGroups[pp.rptGroup]; |
| FixedBitSet bits = new FixedBitSet(rg.length); // for re-queuing after collisions are resolved |
| int k0 = pp.rptInd; |
| int k; |
| while((k=collide(pp)) >= 0) { |
| pp = lesser(pp, rg[k]); // always advance the lesser of the (only) two colliding pps |
| if (!advancePP(pp)) { |
| return false; // exhausted |
| } |
| if (k != k0) { // careful: mark only those currently in the queue |
| bits = FixedBitSet.ensureCapacity(bits, k); |
| bits.set(k); // mark that pp2 need to be re-queued |
| } |
| } |
| // collisions resolved, now re-queue |
| // empty (partially) the queue until seeing all pps advanced for resolving collisions |
| int n = 0; |
| // TODO would be good if we can avoid calling cardinality() in each iteration! |
| int numBits = bits.length(); // larges bit we set |
| while (bits.cardinality() > 0) { |
| PhrasePositions pp2 = pq.pop(); |
| rptStack[n++] = pp2; |
| if (pp2.rptGroup >= 0 |
| && pp2.rptInd < numBits // this bit may not have been set |
| && bits.get(pp2.rptInd)) { |
| bits.clear(pp2.rptInd); |
| } |
| } |
| // add back to queue |
| for (int i=n-1; i>=0; i--) { |
| pq.add(rptStack[i]); |
| } |
| return true; |
| } |
| |
| /** compare two pps, but only by position and offset */ |
| private PhrasePositions lesser(PhrasePositions pp, PhrasePositions pp2) { |
| if (pp.position < pp2.position || |
| (pp.position == pp2.position && pp.offset < pp2.offset)) { |
| return pp; |
| } |
| return pp2; |
| } |
| |
| /** index of a pp2 colliding with pp, or -1 if none */ |
| private int collide(PhrasePositions pp) { |
| int tpPos = tpPos(pp); |
| PhrasePositions[] rg = rptGroups[pp.rptGroup]; |
| for (int i=0; i<rg.length; i++) { |
| PhrasePositions pp2 = rg[i]; |
| if (pp2 != pp && tpPos(pp2) == tpPos) { |
| return pp2.rptInd; |
| } |
| } |
| return -1; |
| } |
| |
| /** |
| * Initialize PhrasePositions in place. |
| * A one time initialization for this scorer (on first doc matching all terms): |
| * <ul> |
| * <li>Check if there are repetitions |
| * <li>If there are, find groups of repetitions. |
| * </ul> |
| * Examples: |
| * <ol> |
| * <li>no repetitions: <b>"ho my"~2</b> |
| * <li>repetitions: <b>"ho my my"~2</b> |
| * <li>repetitions: <b>"my ho my"~2</b> |
| * </ol> |
| * @return false if PPs are exhausted (and so current doc will not be a match) |
| */ |
| private boolean initPhrasePositions() throws IOException { |
| end = Integer.MIN_VALUE; |
| if (!checkedRpts) { |
| return initFirstTime(); |
| } |
| if (!hasRpts) { |
| initSimple(); |
| return true; // PPs available |
| } |
| return initComplex(); |
| } |
| |
| /** no repeats: simplest case, and most common. It is important to keep this piece of the code simple and efficient */ |
| private void initSimple() throws IOException { |
| //System.err.println("initSimple: doc: "+min.doc); |
| pq.clear(); |
| // position pps and build queue from list |
| for (PhrasePositions pp : phrasePositions) { |
| pp.firstPosition(); |
| if (pp.position > end) { |
| end = pp.position; |
| } |
| pq.add(pp); |
| } |
| } |
| |
| /** with repeats: not so simple. */ |
| private boolean initComplex() throws IOException { |
| //System.err.println("initComplex: doc: "+min.doc); |
| placeFirstPositions(); |
| if (!advanceRepeatGroups()) { |
| return false; // PPs exhausted |
| } |
| fillQueue(); |
| return true; // PPs available |
| } |
| |
| /** move all PPs to their first position */ |
| private void placeFirstPositions() throws IOException { |
| for (PhrasePositions pp : phrasePositions) { |
| pp.firstPosition(); |
| } |
| } |
| |
| /** Fill the queue (all pps are already placed */ |
| private void fillQueue() { |
| pq.clear(); |
| for (PhrasePositions pp : phrasePositions) { // iterate cyclic list: done once handled max |
| if (pp.position > end) { |
| end = pp.position; |
| } |
| pq.add(pp); |
| } |
| } |
| |
| /** At initialization (each doc), each repetition group is sorted by (query) offset. |
| * This provides the start condition: no collisions. |
| * <p>Case 1: no multi-term repeats<br> |
| * It is sufficient to advance each pp in the group by one less than its group index. |
| * So lesser pp is not advanced, 2nd one advance once, 3rd one advanced twice, etc. |
| * <p>Case 2: multi-term repeats<br> |
| * |
| * @return false if PPs are exhausted. |
| */ |
| private boolean advanceRepeatGroups() throws IOException { |
| for (PhrasePositions[] rg: rptGroups) { |
| if (hasMultiTermRpts) { |
| // more involved, some may not collide |
| int incr; |
| for (int i=0; i<rg.length; i+=incr) { |
| incr = 1; |
| PhrasePositions pp = rg[i]; |
| int k; |
| while((k=collide(pp)) >= 0) { |
| PhrasePositions pp2 = lesser(pp, rg[k]); |
| if (!advancePP(pp2)) { // at initialization always advance pp with higher offset |
| return false; // exhausted |
| } |
| if (pp2.rptInd < i) { // should not happen? |
| incr = 0; |
| break; |
| } |
| } |
| } |
| } else { |
| // simpler, we know exactly how much to advance |
| for (int j=1; j<rg.length; j++) { |
| for (int k=0; k<j; k++) { |
| if (!rg[j].nextPosition()) { |
| return false; // PPs exhausted |
| } |
| } |
| } |
| } |
| } |
| return true; // PPs available |
| } |
| |
| /** initialize with checking for repeats. Heavy work, but done only for the first candidate doc.<p> |
| * If there are repetitions, check if multi-term postings (MTP) are involved.<p> |
| * Without MTP, once PPs are placed in the first candidate doc, repeats (and groups) are visible.<br> |
| * With MTP, a more complex check is needed, up-front, as there may be "hidden collisions".<br> |
| * For example P1 has {A,B}, P1 has {B,C}, and the first doc is: "A C B". At start, P1 would point |
| * to "A", p2 to "C", and it will not be identified that P1 and P2 are repetitions of each other.<p> |
| * The more complex initialization has two parts:<br> |
| * (1) identification of repetition groups.<br> |
| * (2) advancing repeat groups at the start of the doc.<br> |
| * For (1), a possible solution is to just create a single repetition group, |
| * made of all repeating pps. But this would slow down the check for collisions, |
| * as all pps would need to be checked. Instead, we compute "connected regions" |
| * on the bipartite graph of postings and terms. |
| */ |
| private boolean initFirstTime() throws IOException { |
| //System.err.println("initFirstTime: doc: "+min.doc); |
| checkedRpts = true; |
| placeFirstPositions(); |
| |
| LinkedHashMap<Term,Integer> rptTerms = repeatingTerms(); |
| hasRpts = !rptTerms.isEmpty(); |
| |
| if (hasRpts) { |
| rptStack = new PhrasePositions[numPostings]; // needed with repetitions |
| ArrayList<ArrayList<PhrasePositions>> rgs = gatherRptGroups(rptTerms); |
| sortRptGroups(rgs); |
| if (!advanceRepeatGroups()) { |
| return false; // PPs exhausted |
| } |
| } |
| |
| fillQueue(); |
| return true; // PPs available |
| } |
| |
| /** sort each repetition group by (query) offset. |
| * Done only once (at first doc) and allows to initialize faster for each doc. */ |
| private void sortRptGroups(ArrayList<ArrayList<PhrasePositions>> rgs) { |
| rptGroups = new PhrasePositions[rgs.size()][]; |
| Comparator<PhrasePositions> cmprtr = new Comparator<PhrasePositions>() { |
| @Override |
| public int compare(PhrasePositions pp1, PhrasePositions pp2) { |
| return pp1.offset - pp2.offset; |
| } |
| }; |
| for (int i=0; i<rptGroups.length; i++) { |
| PhrasePositions[] rg = rgs.get(i).toArray(new PhrasePositions[0]); |
| Arrays.sort(rg, cmprtr); |
| rptGroups[i] = rg; |
| for (int j=0; j<rg.length; j++) { |
| rg[j].rptInd = j; // we use this index for efficient re-queuing |
| } |
| } |
| } |
| |
| /** Detect repetition groups. Done once - for first doc */ |
| private ArrayList<ArrayList<PhrasePositions>> gatherRptGroups(LinkedHashMap<Term,Integer> rptTerms) throws IOException { |
| PhrasePositions[] rpp = repeatingPPs(rptTerms); |
| ArrayList<ArrayList<PhrasePositions>> res = new ArrayList<>(); |
| if (!hasMultiTermRpts) { |
| // simpler - no multi-terms - can base on positions in first doc |
| for (int i=0; i<rpp.length; i++) { |
| PhrasePositions pp = rpp[i]; |
| if (pp.rptGroup >=0) continue; // already marked as a repetition |
| int tpPos = tpPos(pp); |
| for (int j=i+1; j<rpp.length; j++) { |
| PhrasePositions pp2 = rpp[j]; |
| if ( |
| pp2.rptGroup >=0 // already marked as a repetition |
| || pp2.offset == pp.offset // not a repetition: two PPs are originally in same offset in the query! |
| || tpPos(pp2) != tpPos) { // not a repetition |
| continue; |
| } |
| // a repetition |
| int g = pp.rptGroup; |
| if (g < 0) { |
| g = res.size(); |
| pp.rptGroup = g; |
| ArrayList<PhrasePositions> rl = new ArrayList<>(2); |
| rl.add(pp); |
| res.add(rl); |
| } |
| pp2.rptGroup = g; |
| res.get(g).add(pp2); |
| } |
| } |
| } else { |
| // more involved - has multi-terms |
| ArrayList<HashSet<PhrasePositions>> tmp = new ArrayList<>(); |
| ArrayList<FixedBitSet> bb = ppTermsBitSets(rpp, rptTerms); |
| unionTermGroups(bb); |
| HashMap<Term,Integer> tg = termGroups(rptTerms, bb); |
| HashSet<Integer> distinctGroupIDs = new HashSet<>(tg.values()); |
| for (int i=0; i<distinctGroupIDs.size(); i++) { |
| tmp.add(new HashSet<PhrasePositions>()); |
| } |
| for (PhrasePositions pp : rpp) { |
| for (Term t: pp.terms) { |
| if (rptTerms.containsKey(t)) { |
| int g = tg.get(t); |
| tmp.get(g).add(pp); |
| assert pp.rptGroup==-1 || pp.rptGroup==g; |
| pp.rptGroup = g; |
| } |
| } |
| } |
| for (HashSet<PhrasePositions> hs : tmp) { |
| res.add(new ArrayList<>(hs)); |
| } |
| } |
| return res; |
| } |
| |
| /** Actual position in doc of a PhrasePosition, relies on that position = tpPos - offset) */ |
| private final int tpPos(PhrasePositions pp) { |
| return pp.position + pp.offset; |
| } |
| |
| /** find repeating terms and assign them ordinal values */ |
| private LinkedHashMap<Term,Integer> repeatingTerms() { |
| LinkedHashMap<Term,Integer> tord = new LinkedHashMap<>(); |
| HashMap<Term,Integer> tcnt = new HashMap<>(); |
| for (PhrasePositions pp : phrasePositions) { |
| for (Term t : pp.terms) { |
| Integer cnt = tcnt.compute(t, (key, old) -> old == null ? 1 : 1 + old); |
| if (cnt==2) { |
| tord.put(t,tord.size()); |
| } |
| } |
| } |
| return tord; |
| } |
| |
| /** find repeating pps, and for each, if has multi-terms, update this.hasMultiTermRpts */ |
| private PhrasePositions[] repeatingPPs(HashMap<Term,Integer> rptTerms) { |
| ArrayList<PhrasePositions> rp = new ArrayList<>(); |
| for (PhrasePositions pp : phrasePositions) { |
| for (Term t : pp.terms) { |
| if (rptTerms.containsKey(t)) { |
| rp.add(pp); |
| hasMultiTermRpts |= (pp.terms.length > 1); |
| break; |
| } |
| } |
| } |
| return rp.toArray(new PhrasePositions[0]); |
| } |
| |
| /** bit-sets - for each repeating pp, for each of its repeating terms, the term ordinal values is set */ |
| private ArrayList<FixedBitSet> ppTermsBitSets(PhrasePositions[] rpp, HashMap<Term,Integer> tord) { |
| ArrayList<FixedBitSet> bb = new ArrayList<>(rpp.length); |
| for (PhrasePositions pp : rpp) { |
| FixedBitSet b = new FixedBitSet(tord.size()); |
| Integer ord; |
| for (Term t: pp.terms) { |
| if ((ord=tord.get(t))!=null) { |
| b.set(ord); |
| } |
| } |
| bb.add(b); |
| } |
| return bb; |
| } |
| |
| /** union (term group) bit-sets until they are disjoint (O(n^^2)), and each group have different terms */ |
| private void unionTermGroups(ArrayList<FixedBitSet> bb) { |
| int incr; |
| for (int i=0; i<bb.size()-1; i+=incr) { |
| incr = 1; |
| int j = i+1; |
| while (j<bb.size()) { |
| if (bb.get(i).intersects(bb.get(j))) { |
| bb.get(i).or(bb.get(j)); |
| bb.remove(j); |
| incr = 0; |
| } else { |
| ++j; |
| } |
| } |
| } |
| } |
| |
| /** map each term to the single group that contains it */ |
| private HashMap<Term,Integer> termGroups(LinkedHashMap<Term,Integer> tord, ArrayList<FixedBitSet> bb) throws IOException { |
| HashMap<Term,Integer> tg = new HashMap<>(); |
| Term[] t = tord.keySet().toArray(new Term[0]); |
| for (int i=0; i<bb.size(); i++) { // i is the group no. |
| FixedBitSet bits = bb.get(i); |
| for (int ord = bits.nextSetBit(0); ord != DocIdSetIterator.NO_MORE_DOCS; ord = ord + 1 >= bits.length() ? DocIdSetIterator.NO_MORE_DOCS : bits.nextSetBit(ord + 1)) { |
| tg.put(t[ord],i); |
| } |
| } |
| return tg; |
| } |
| |
| } |