| diff --git a/lucene/core/src/java/org/apache/lucene/search/ConjunctionDISI.java b/lucene/core/src/java/org/apache/lucene/search/ConjunctionDISI.java |
| index fd7cccd..07227d2 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/ConjunctionDISI.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/ConjunctionDISI.java |
| @@ -155,7 +155,7 @@ public class ConjunctionDISI extends DocIdSetIterator { |
| |
| @Override |
| public long cost() { |
| - return lead.cost(); |
| + return lead.cost(); // overestimate |
| } |
| |
| /** |
| @@ -164,16 +164,33 @@ public class ConjunctionDISI extends DocIdSetIterator { |
| private static class TwoPhaseConjunctionDISI extends TwoPhaseIterator { |
| |
| private final TwoPhaseIterator[] twoPhaseIterators; |
| + private final float matchCost; |
| |
| private TwoPhaseConjunctionDISI(List<? extends DocIdSetIterator> iterators, List<TwoPhaseIterator> twoPhaseIterators) { |
| super(new ConjunctionDISI(iterators)); |
| assert twoPhaseIterators.size() > 0; |
| + |
| + CollectionUtil.timSort(twoPhaseIterators, new Comparator<TwoPhaseIterator>() { |
| + @Override |
| + public int compare(TwoPhaseIterator o1, TwoPhaseIterator o2) { |
| + return Float.compare(o1.matchCost(), o2.matchCost()); |
| + } |
| + }); |
| + |
| this.twoPhaseIterators = twoPhaseIterators.toArray(new TwoPhaseIterator[twoPhaseIterators.size()]); |
| + |
| + // Compute the matchCost as the total matchCost of the sub iterators. |
| + // TODO: This could be too high because the matching is done cheapest first: give the lower matchCosts a higher weight. |
| + float totalMatchCost = 0; |
| + for (TwoPhaseIterator tpi : twoPhaseIterators) { |
| + totalMatchCost += tpi.matchCost(); |
| + } |
| + matchCost = totalMatchCost; |
| } |
| |
| @Override |
| public boolean matches() throws IOException { |
| - for (TwoPhaseIterator twoPhaseIterator : twoPhaseIterators) { |
| + for (TwoPhaseIterator twoPhaseIterator : twoPhaseIterators) { // match cheapest first |
| if (twoPhaseIterator.matches() == false) { |
| return false; |
| } |
| @@ -181,6 +198,11 @@ public class ConjunctionDISI extends DocIdSetIterator { |
| return true; |
| } |
| |
| + @Override |
| + public float matchCost() { |
| + return matchCost; |
| + } |
| + |
| } |
| |
| /** |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/DisjunctionScorer.java b/lucene/core/src/java/org/apache/lucene/search/DisjunctionScorer.java |
| index c32a520..e02efba 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/DisjunctionScorer.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/DisjunctionScorer.java |
| @@ -52,19 +52,25 @@ abstract class DisjunctionScorer extends Scorer { |
| |
| @Override |
| public TwoPhaseIterator asTwoPhaseIterator() { |
| - boolean hasApproximation = false; |
| + float sumMatchCost = 0; |
| + long sumApproxCost = 0; |
| + |
| + // Compute matchCost as the average over the matchCost of the subScorers. |
| + // This is weighted by the cost, which is an expected number of matching documents. |
| for (DisiWrapper<Scorer> w : subScorers) { |
| if (w.twoPhaseView != null) { |
| - hasApproximation = true; |
| - break; |
| + long costWeight = (w.cost <= 1) ? 1 : w.cost; |
| + sumMatchCost += w.twoPhaseView.matchCost() * costWeight; |
| + sumApproxCost += costWeight; |
| } |
| } |
| |
| - if (! hasApproximation) { |
| - // none of the sub scorers supports approximations |
| + if (sumApproxCost == 0) { // no sub scorer supports approximations |
| return null; |
| } |
| |
| + final float matchCost = sumMatchCost / sumApproxCost; |
| + |
| // note it is important to share the same pq as this scorer so that |
| // rebalancing the pq through the approximation will also rebalance |
| // the pq in this scorer. |
| @@ -105,6 +111,11 @@ abstract class DisjunctionScorer extends Scorer { |
| DisjunctionScorer.this.topScorers = topScorers; |
| return true; |
| } |
| + |
| + @Override |
| + public float matchCost() { |
| + return matchCost; |
| + } |
| }; |
| } |
| |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java b/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java |
| index 48060ef..248a948 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java |
| @@ -44,9 +44,11 @@ final class ExactPhraseScorer extends Scorer { |
| |
| private final Similarity.SimScorer docScorer; |
| private final boolean needsScores; |
| + private float matchCost; |
| |
| ExactPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, |
| - Similarity.SimScorer docScorer, boolean needsScores) throws IOException { |
| + Similarity.SimScorer docScorer, boolean needsScores, |
| + float matchCost) throws IOException { |
| super(weight); |
| this.docScorer = docScorer; |
| this.needsScores = needsScores; |
| @@ -59,6 +61,7 @@ final class ExactPhraseScorer extends Scorer { |
| } |
| conjunction = ConjunctionDISI.intersect(iterators); |
| this.postings = postingsAndPositions.toArray(new PostingsAndPosition[postingsAndPositions.size()]); |
| + this.matchCost = matchCost; |
| } |
| |
| @Override |
| @@ -68,6 +71,11 @@ final class ExactPhraseScorer extends Scorer { |
| public boolean matches() throws IOException { |
| return phraseFreq() > 0; |
| } |
| + |
| + @Override |
| + public float matchCost() { |
| + return matchCost; |
| + } |
| }; |
| } |
| |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java |
| index f29d86a..58620fa 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java |
| @@ -189,6 +189,7 @@ public class MultiPhraseQuery extends Query { |
| |
| // Reuse single TermsEnum below: |
| final TermsEnum termsEnum = fieldTerms.iterator(); |
| + float totalMatchCost = 0; |
| |
| for (int pos=0; pos<postingsFreqs.length; pos++) { |
| Term[] terms = termArrays.get(pos); |
| @@ -199,6 +200,7 @@ public class MultiPhraseQuery extends Query { |
| if (termState != null) { |
| termsEnum.seekExact(term.bytes(), termState); |
| postings.add(termsEnum.postings(null, PostingsEnum.POSITIONS)); |
| + totalMatchCost += PhraseQuery.termPositionsCost(termsEnum); |
| } |
| } |
| |
| @@ -222,9 +224,13 @@ public class MultiPhraseQuery extends Query { |
| } |
| |
| if (slop == 0) { |
| - return new ExactPhraseScorer(this, postingsFreqs, similarity.simScorer(stats, context), needsScores); |
| + return new ExactPhraseScorer(this, postingsFreqs, |
| + similarity.simScorer(stats, context), |
| + needsScores, totalMatchCost); |
| } else { |
| - return new SloppyPhraseScorer(this, postingsFreqs, slop, similarity.simScorer(stats, context), needsScores); |
| + return new SloppyPhraseScorer(this, postingsFreqs, slop, |
| + similarity.simScorer(stats, context), |
| + needsScores, totalMatchCost); |
| } |
| } |
| |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java |
| index fd3cddf..049cfbe 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java |
| @@ -405,6 +405,7 @@ public class PhraseQuery extends Query { |
| |
| // Reuse single TermsEnum below: |
| final TermsEnum te = fieldTerms.iterator(); |
| + float totalMatchCost = 0; |
| |
| for (int i = 0; i < terms.length; i++) { |
| final Term t = terms[i]; |
| @@ -416,6 +417,7 @@ public class PhraseQuery extends Query { |
| te.seekExact(t.bytes(), state); |
| PostingsEnum postingsEnum = te.postings(null, PostingsEnum.POSITIONS); |
| postingsFreqs[i] = new PostingsAndFreq(postingsEnum, positions[i], t); |
| + totalMatchCost += termPositionsCost(te); |
| } |
| |
| // sort by increasing docFreq order |
| @@ -424,9 +426,13 @@ public class PhraseQuery extends Query { |
| } |
| |
| if (slop == 0) { // optimize exact case |
| - return new ExactPhraseScorer(this, postingsFreqs, similarity.simScorer(stats, context), needsScores); |
| + return new ExactPhraseScorer(this, postingsFreqs, |
| + similarity.simScorer(stats, context), |
| + needsScores, totalMatchCost); |
| } else { |
| - return new SloppyPhraseScorer(this, postingsFreqs, slop, similarity.simScorer(stats, context), needsScores); |
| + return new SloppyPhraseScorer(this, postingsFreqs, slop, |
| + similarity.simScorer(stats, context), |
| + needsScores, totalMatchCost); |
| } |
| } |
| |
| @@ -456,6 +462,42 @@ public class PhraseQuery extends Query { |
| } |
| } |
| |
| + /** A guess of |
| + * the average number of simple operations for the initial seek and buffer refill |
| + * per document for the positions of a term. |
| + * See also {@link Lucene50PostingsReader.BlockPostingsEnum#nextPosition()}. |
| + * <p> |
| + * Aside: Instead of being constant this could depend among others on |
| + * {@link Lucene50PostingsFormat#BLOCK_SIZE}, |
| + * {@link TermsEnum#docFreq()}, |
| + * {@link TermsEnum#totalTermFreq()}, |
| + * {@link DocIdSetIterator#cost()} (expected number of matching docs), |
| + * {@link LeafReader#maxDoc()} (total number of docs in the segment), |
| + * and the seek time and block size of the device storing the index. |
| + */ |
| + private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128; |
| + |
| + /** Number of simple operations in {@link Lucene50PostingsReader.BlockPostingsEnum#nextPosition()} |
| + * when no seek or buffer refill is done. |
| + */ |
| + private static final int TERM_OPS_PER_POS = 7; |
| + |
| + /** Returns an expected cost in simple operations |
| + * of processing the occurrences of a term |
| + * in a document that contains the term. |
| + * This is for use by {@link TwoPhaseIterator#matchCost} implementations. |
| + * <br>This may be inaccurate when {@link TermsEnum#totalTermFreq()} is not available. |
| + * @param termsEnum The term is the term at which this TermsEnum is positioned. |
| + */ |
| + static float termPositionsCost(TermsEnum termsEnum) throws IOException { |
| + int docFreq = termsEnum.docFreq(); |
| + assert docFreq > 0; |
| + long totalTermFreq = termsEnum.totalTermFreq(); // -1 when not available |
| + float expOccurrencesInMatchingDoc = (totalTermFreq < docFreq) ? 1 : (totalTermFreq / (float) docFreq); |
| + return TERM_POSNS_SEEK_OPS_PER_DOC + expOccurrencesInMatchingDoc * TERM_OPS_PER_POS; |
| + } |
| + |
| + |
| @Override |
| public Weight createWeight(IndexSearcher searcher, boolean needsScores) throws IOException { |
| return new PhraseWeight(searcher, needsScores); |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/RandomAccessWeight.java b/lucene/core/src/java/org/apache/lucene/search/RandomAccessWeight.java |
| index 5e920cb..2d25e29 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/RandomAccessWeight.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/RandomAccessWeight.java |
| @@ -62,6 +62,11 @@ public abstract class RandomAccessWeight extends ConstantScoreWeight { |
| |
| return matchingDocs.get(doc); |
| } |
| + |
| + @Override |
| + public float matchCost() { |
| + return 10; // TODO: use some cost of matchingDocs |
| + } |
| }; |
| |
| return new ConstantScoreScorer(this, score(), twoPhase); |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/ReqExclScorer.java b/lucene/core/src/java/org/apache/lucene/search/ReqExclScorer.java |
| index 125d887..d401cde 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/ReqExclScorer.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/ReqExclScorer.java |
| @@ -149,6 +149,10 @@ class ReqExclScorer extends Scorer { |
| return ReqExclScorer.matches(doc, exclDoc, reqTwoPhaseIterator, exclTwoPhaseIterator); |
| } |
| |
| + @Override |
| + public float matchCost() { |
| + return reqTwoPhaseIterator.matchCost(); // TODO: also use cost of exclApproximation.advance() |
| + } |
| }; |
| } |
| } |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java b/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java |
| index 4ee2bf6..5a5cae6 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java |
| @@ -52,9 +52,11 @@ final class SloppyPhraseScorer extends Scorer { |
| |
| private int numMatches; |
| final boolean needsScores; |
| + private final float matchCost; |
| |
| SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, |
| - int slop, Similarity.SimScorer docScorer, boolean needsScores) { |
| + int slop, Similarity.SimScorer docScorer, boolean needsScores, |
| + float matchCost) { |
| super(weight); |
| this.docScorer = docScorer; |
| this.needsScores = needsScores; |
| @@ -68,6 +70,7 @@ final class SloppyPhraseScorer extends Scorer { |
| phrasePositions[i] = new PhrasePositions(postings[i].postings, postings[i].position, i, postings[i].terms); |
| } |
| conjunction = ConjunctionDISI.intersect(Arrays.asList(iterators)); |
| + this.matchCost = matchCost; |
| } |
| |
| /** |
| @@ -596,6 +599,16 @@ final class SloppyPhraseScorer extends Scorer { |
| sloppyFreq = phraseFreq(); // check for phrase |
| return sloppyFreq != 0F; |
| } |
| + |
| + @Override |
| + public float matchCost() { |
| + return matchCost; |
| + } |
| + |
| + @Override |
| + public String toString() { |
| + return "SloppyPhraseScorer@asTwoPhaseIterator(" + SloppyPhraseScorer.this + ")"; |
| + } |
| }; |
| } |
| } |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/TwoPhaseIterator.java b/lucene/core/src/java/org/apache/lucene/search/TwoPhaseIterator.java |
| index 3d774c5..ff22e5d 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/TwoPhaseIterator.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/TwoPhaseIterator.java |
| @@ -20,6 +20,7 @@ package org.apache.lucene.search; |
| import java.io.IOException; |
| import java.util.Objects; |
| |
| +import org.apache.lucene.index.TermsEnum; |
| /** |
| * Returned by {@link Scorer#asTwoPhaseIterator()} |
| * to expose an approximation of a {@link DocIdSetIterator}. |
| @@ -84,15 +85,23 @@ public abstract class TwoPhaseIterator { |
| return approximation; |
| } |
| |
| - /** Return whether the current doc ID that the iterator is on matches. This |
| + /** Return whether the current doc ID that {@link #approximation()} is on matches. This |
| * method should only be called when the iterator is positioned -- ie. not |
| * when {@link DocIdSetIterator#docID()} is {@code -1} or |
| * {@link DocIdSetIterator#NO_MORE_DOCS} -- and at most once. */ |
| public abstract boolean matches() throws IOException; |
| |
| + /** An estimate of the expected cost to determine that a single document {@link #matches()}. |
| + * This can be called before iterating the documents of {@link #approximation()}. |
| + * Returns an expected cost in number of simple operations like addition, multiplication, |
| + * comparing two numbers and indexing an array. |
| + * The returned value must be positive. |
| + */ |
| + public abstract float matchCost(); |
| + |
| /** |
| * Returns a {@link TwoPhaseIterator} for this {@link DocIdSetIterator} |
| - * when available * otherwise returns null. |
| + * when available, otherwise returns null. |
| */ |
| public static TwoPhaseIterator asTwoPhaseIterator(DocIdSetIterator iter) { |
| return (iter instanceof Scorer) |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/ConjunctionSpans.java b/lucene/core/src/java/org/apache/lucene/search/spans/ConjunctionSpans.java |
| index fcc2484..533714d 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/spans/ConjunctionSpans.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/ConjunctionSpans.java |
| @@ -88,14 +88,34 @@ abstract class ConjunctionSpans extends Spans { |
| */ |
| @Override |
| public TwoPhaseIterator asTwoPhaseIterator() { |
| - TwoPhaseIterator res = new TwoPhaseIterator(conjunction) { |
| + float totalMatchCost = 0; |
| + // Compute the matchCost as the total matchCost/positionsCost of the sub spans. |
| + for (Spans spans : subSpans) { |
| + TwoPhaseIterator tpi = spans.asTwoPhaseIterator(); |
| + if (tpi != null) { |
| + totalMatchCost += tpi.matchCost(); |
| + } else { |
| + totalMatchCost += spans.positionsCost(); |
| + } |
| + } |
| + final float matchCost = totalMatchCost; |
| |
| + return new TwoPhaseIterator(conjunction) { |
| @Override |
| public boolean matches() throws IOException { |
| return twoPhaseCurrentDocMatches(); |
| } |
| + |
| + @Override |
| + public float matchCost() { |
| + return matchCost; |
| + } |
| }; |
| - return res; |
| + } |
| + |
| + @Override |
| + public float positionsCost() { |
| + throw new UnsupportedOperationException(); // asTwoPhaseIterator never returns null here. |
| } |
| |
| public Spans[] getSubSpans() { |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/FilterSpans.java b/lucene/core/src/java/org/apache/lucene/search/spans/FilterSpans.java |
| index e4ec1b5..1db08aa 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/spans/FilterSpans.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/FilterSpans.java |
| @@ -142,6 +142,16 @@ public abstract class FilterSpans extends Spans { |
| public boolean matches() throws IOException { |
| return inner.matches() && twoPhaseCurrentDocMatches(); |
| } |
| + |
| + @Override |
| + public float matchCost() { |
| + return inner.matchCost(); // underestimate |
| + } |
| + |
| + @Override |
| + public String toString() { |
| + return "FilterSpans@asTwoPhaseIterator(inner=" + inner + ", in=" + in + ")"; |
| + } |
| }; |
| } else { |
| // wrapped instance has no approximation, but |
| @@ -151,10 +161,25 @@ public abstract class FilterSpans extends Spans { |
| public boolean matches() throws IOException { |
| return twoPhaseCurrentDocMatches(); |
| } |
| + |
| + @Override |
| + public float matchCost() { |
| + return in.positionsCost(); // overestimate |
| + } |
| + |
| + @Override |
| + public String toString() { |
| + return "FilterSpans@asTwoPhaseIterator(in=" + in + ")"; |
| + } |
| }; |
| } |
| } |
| |
| + @Override |
| + public float positionsCost() { |
| + throw new UnsupportedOperationException(); // asTwoPhaseIterator never returns null |
| + } |
| + |
| /** |
| * Returns true if the current document matches. |
| * <p> |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java b/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java |
| index bd40add..cf92e6f 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java |
| @@ -134,6 +134,11 @@ public class NearSpansUnordered extends ConjunctionSpans { |
| } |
| |
| @Override |
| + public float positionsCost() { |
| + return in.positionsCost(); |
| + } |
| + |
| + @Override |
| public int docID() { |
| return in.docID(); |
| } |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/ScoringWrapperSpans.java b/lucene/core/src/java/org/apache/lucene/search/spans/ScoringWrapperSpans.java |
| index a409477..6274466 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/spans/ScoringWrapperSpans.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/ScoringWrapperSpans.java |
| @@ -20,6 +20,7 @@ package org.apache.lucene.search.spans; |
| import java.io.IOException; |
| |
| import org.apache.lucene.search.similarities.Similarity; |
| +import org.apache.lucene.search.TwoPhaseIterator; |
| |
| /** |
| * A Spans that wraps another Spans with a different SimScorer |
| @@ -82,4 +83,14 @@ public class ScoringWrapperSpans extends Spans { |
| public long cost() { |
| return in.cost(); |
| } |
| + |
| + @Override |
| + public TwoPhaseIterator asTwoPhaseIterator() { |
| + return in.asTwoPhaseIterator(); |
| + } |
| + |
| + @Override |
| + public float positionsCost() { |
| + return in.positionsCost(); |
| + } |
| } |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanNearQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanNearQuery.java |
| index 3fd1703..33c7d92 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanNearQuery.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanNearQuery.java |
| @@ -384,6 +384,11 @@ public class SpanNearQuery extends SpanQuery implements Cloneable { |
| public long cost() { |
| return 0; |
| } |
| + |
| + @Override |
| + public float positionsCost() { |
| + throw new UnsupportedOperationException(); |
| + } |
| } |
| |
| } |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanOrQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanOrQuery.java |
| index 9c39f41..6fadd60 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanOrQuery.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanOrQuery.java |
| @@ -210,26 +210,58 @@ public final class SpanOrQuery extends SpanQuery { |
| |
| @Override |
| public TwoPhaseIterator asTwoPhaseIterator() { |
| - boolean hasApproximation = false; |
| + float sumMatchCost = 0; // See also DisjunctionScorer.asTwoPhaseIterator() |
| + long sumApproxCost = 0; |
| + |
| for (DisiWrapper<Spans> w : byDocQueue) { |
| if (w.twoPhaseView != null) { |
| - hasApproximation = true; |
| - break; |
| + long costWeight = (w.cost <= 1) ? 1 : w.cost; |
| + sumMatchCost += w.twoPhaseView.matchCost() * costWeight; |
| + sumApproxCost += costWeight; |
| } |
| } |
| |
| - if (!hasApproximation) { // none of the sub spans supports approximations |
| + if (sumApproxCost == 0) { // no sub spans support approximations |
| + computePositionsCost(); |
| return null; |
| } |
| |
| + final float matchCost = sumMatchCost / sumApproxCost; |
| + |
| return new TwoPhaseIterator(new DisjunctionDISIApproximation<Spans>(byDocQueue)) { |
| @Override |
| public boolean matches() throws IOException { |
| return twoPhaseCurrentDocMatches(); |
| } |
| + |
| + @Override |
| + public float matchCost() { |
| + return matchCost; |
| + } |
| }; |
| } |
| |
| + float positionsCost = -1; |
| + |
| + void computePositionsCost() { |
| + float sumPositionsCost = 0; |
| + long sumCost = 0; |
| + for (DisiWrapper<Spans> w : byDocQueue) { |
| + long costWeight = (w.cost <= 1) ? 1 : w.cost; |
| + sumPositionsCost += w.iterator.positionsCost() * costWeight; |
| + sumCost += costWeight; |
| + } |
| + positionsCost = sumPositionsCost / sumCost; |
| + } |
| + |
| + @Override |
| + public float positionsCost() { |
| + // This may be called when asTwoPhaseIterator returned null, |
| + // which happens when none of the sub spans supports approximations. |
| + assert positionsCost > 0; |
| + return positionsCost; |
| + } |
| + |
| int lastDocTwoPhaseMatched = -1; |
| |
| boolean twoPhaseCurrentDocMatches() throws IOException { |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java |
| index be75575..4799295 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java |
| @@ -33,6 +33,7 @@ import org.apache.lucene.index.TermState; |
| import org.apache.lucene.index.Terms; |
| import org.apache.lucene.index.TermsEnum; |
| import org.apache.lucene.search.IndexSearcher; |
| +import org.apache.lucene.search.TwoPhaseIterator; |
| |
| /** Matches spans containing a term. |
| * This should not be used for terms that are indexed at position Integer.MAX_VALUE. |
| @@ -117,10 +118,40 @@ public class SpanTermQuery extends SpanQuery { |
| termsEnum.seekExact(term.bytes(), state); |
| |
| final PostingsEnum postings = termsEnum.postings(null, requiredPostings.getRequiredPostings()); |
| - return new TermSpans(this, getSimScorer(context), postings, term); |
| + float positionsCost = termPositionsCost(termsEnum) * PHRASE_TO_SPAN_TERM_POSITIONS_COST; |
| + return new TermSpans(this, getSimScorer(context), postings, term, positionsCost); |
| } |
| } |
| |
| + /** A guess of |
| + * the relative cost of dealing with the term positions |
| + * when using a SpanNearQuery instead of a PhraseQuery. |
| + */ |
| + private static final float PHRASE_TO_SPAN_TERM_POSITIONS_COST = 4.0f; |
| + |
| + private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128; |
| + |
| + private static final int TERM_OPS_PER_POS = 7; |
| + |
| + /** Returns an expected cost in simple operations |
| + * of processing the occurrences of a term |
| + * in a document that contains the term. |
| + * <br>This may be inaccurate when {@link TermsEnum#totalTermFreq()} is not available. |
| + * @param termsEnum The term is the term at which this TermsEnum is positioned. |
| + * <p> |
| + * This is a copy of {@code org.apache.lucene.search.PhraseQuery#termPositionsCost(TermsEnum)}. |
| + * <br> |
| + * TODO: keep only a single copy of this method and the constants used in it |
| + * when SpanTermQuery moves to the o.a.l.search package. |
| + */ |
| + static float termPositionsCost(TermsEnum termsEnum) throws IOException { |
| + int docFreq = termsEnum.docFreq(); |
| + assert docFreq > 0; |
| + long totalTermFreq = termsEnum.totalTermFreq(); // -1 when not available |
| + float expOccurrencesInMatchingDoc = (totalTermFreq < docFreq) ? 1 : (totalTermFreq / (float) docFreq); |
| + return TERM_POSNS_SEEK_OPS_PER_DOC + expOccurrencesInMatchingDoc * TERM_OPS_PER_POS; |
| + } |
| + |
| @Override |
| public String toString(String field) { |
| StringBuilder buffer = new StringBuilder(); |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/Spans.java b/lucene/core/src/java/org/apache/lucene/search/spans/Spans.java |
| index fff328a..3f7ff4f 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/spans/Spans.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/Spans.java |
| @@ -86,6 +86,16 @@ public abstract class Spans extends Scorer { |
| */ |
| public abstract void collect(SpanCollector collector) throws IOException; |
| |
| + /** |
| + * Return an estimation of the cost of using the positions of |
| + * this {@link Spans} for any single document, but only after |
| + * {@link #asTwoPhaseIterator} returned {@code null}. |
| + * Otherwise this method should not be called. |
| + * The returned value is independent of the current document. |
| + * |
| + * @lucene.experimental |
| + */ |
| + public abstract float positionsCost(); |
| @Override |
| public String toString() { |
| StringBuilder sb = new StringBuilder(); |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/TermSpans.java b/lucene/core/src/java/org/apache/lucene/search/spans/TermSpans.java |
| index 802b761..68f3cd4 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/spans/TermSpans.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/TermSpans.java |
| @@ -37,13 +37,17 @@ public class TermSpans extends Spans { |
| protected int count; |
| protected int position; |
| protected boolean readPayload; |
| + private final float positionsCost; |
| |
| - public TermSpans(SpanWeight weight, Similarity.SimScorer scorer, PostingsEnum postings, Term term) { |
| + public TermSpans(SpanWeight weight, Similarity.SimScorer scorer, |
| + PostingsEnum postings, Term term, float positionsCost) { |
| super(weight, scorer); |
| this.postings = Objects.requireNonNull(postings); |
| this.term = Objects.requireNonNull(term); |
| this.doc = -1; |
| this.position = -1; |
| + assert positionsCost > 0; // otherwise the TermSpans should not be created. |
| + this.positionsCost = positionsCost; |
| } |
| |
| @Override |
| @@ -119,6 +123,11 @@ public class TermSpans extends Spans { |
| } |
| |
| @Override |
| + public float positionsCost() { |
| + return positionsCost; |
| + } |
| + |
| + @Override |
| public String toString() { |
| return "spans(" + term.toString() + ")@" + |
| (doc == -1 ? "START" : (doc == NO_MORE_DOCS) ? "ENDDOC" |
| @@ -128,5 +137,4 @@ public class TermSpans extends Spans { |
| public PostingsEnum getPostings() { |
| return postings; |
| } |
| - |
| } |
| diff --git a/lucene/core/src/test/org/apache/lucene/search/TestConjunctionDISI.java b/lucene/core/src/test/org/apache/lucene/search/TestConjunctionDISI.java |
| index f62b19d..c907e6e 100644 |
| --- a/lucene/core/src/test/org/apache/lucene/search/TestConjunctionDISI.java |
| +++ b/lucene/core/src/test/org/apache/lucene/search/TestConjunctionDISI.java |
| @@ -37,6 +37,11 @@ public class TestConjunctionDISI extends LuceneTestCase { |
| public boolean matches() throws IOException { |
| return confirmed.get(iterator.docID()); |
| } |
| + |
| + @Override |
| + public float matchCost() { |
| + return 5; // #operations in FixedBitSet#get() |
| + } |
| }; |
| } |
| |
| diff --git a/lucene/core/src/test/org/apache/lucene/search/spans/JustCompileSearchSpans.java b/lucene/core/src/test/org/apache/lucene/search/spans/JustCompileSearchSpans.java |
| index ed91bc6..f5680e9 100644 |
| --- a/lucene/core/src/test/org/apache/lucene/search/spans/JustCompileSearchSpans.java |
| +++ b/lucene/core/src/test/org/apache/lucene/search/spans/JustCompileSearchSpans.java |
| @@ -82,6 +82,11 @@ final class JustCompileSearchSpans { |
| public long cost() { |
| throw new UnsupportedOperationException(UNSUPPORTED_MSG); |
| } |
| + |
| + @Override |
| + public float positionsCost() { |
| + throw new UnsupportedOperationException(UNSUPPORTED_MSG); |
| + } |
| } |
| |
| static final class JustCompileSpanQuery extends SpanQuery { |
| diff --git a/lucene/facet/src/java/org/apache/lucene/facet/range/DoubleRange.java b/lucene/facet/src/java/org/apache/lucene/facet/range/DoubleRange.java |
| index a819f9b..ee51e2f 100644 |
| --- a/lucene/facet/src/java/org/apache/lucene/facet/range/DoubleRange.java |
| +++ b/lucene/facet/src/java/org/apache/lucene/facet/range/DoubleRange.java |
| @@ -176,6 +176,11 @@ public final class DoubleRange extends Range { |
| public boolean matches() throws IOException { |
| return range.accept(values.doubleVal(approximation.docID())); |
| } |
| + |
| + @Override |
| + public float matchCost() { |
| + return 100; // TODO: use cost of range.accept() |
| + } |
| }; |
| return new ConstantScoreScorer(this, score(), twoPhase); |
| } |
| diff --git a/lucene/facet/src/java/org/apache/lucene/facet/range/LongRange.java b/lucene/facet/src/java/org/apache/lucene/facet/range/LongRange.java |
| index 66f6e2e..254bc8a 100644 |
| --- a/lucene/facet/src/java/org/apache/lucene/facet/range/LongRange.java |
| +++ b/lucene/facet/src/java/org/apache/lucene/facet/range/LongRange.java |
| @@ -168,6 +168,11 @@ public final class LongRange extends Range { |
| public boolean matches() throws IOException { |
| return range.accept(values.longVal(approximation.docID())); |
| } |
| + |
| + @Override |
| + public float matchCost() { |
| + return 100; // TODO: use cost of range.accept() |
| + } |
| }; |
| return new ConstantScoreScorer(this, score(), twoPhase); |
| } |
| diff --git a/lucene/join/src/java/org/apache/lucene/search/join/GlobalOrdinalsQuery.java b/lucene/join/src/java/org/apache/lucene/search/join/GlobalOrdinalsQuery.java |
| index 366932b..e0c7880 100644 |
| --- a/lucene/join/src/java/org/apache/lucene/search/join/GlobalOrdinalsQuery.java |
| +++ b/lucene/join/src/java/org/apache/lucene/search/join/GlobalOrdinalsQuery.java |
| @@ -184,6 +184,11 @@ final class GlobalOrdinalsQuery extends Query { |
| } |
| return false; |
| } |
| + |
| + @Override |
| + public float matchCost() { |
| + return 100; // TODO: use cost of values.getOrd() and foundOrds.get() |
| + } |
| }; |
| } |
| } |
| @@ -225,6 +230,11 @@ final class GlobalOrdinalsQuery extends Query { |
| } |
| return false; |
| } |
| + |
| + @Override |
| + public float matchCost() { |
| + return 100; // fixed stand-in, not measured; TODO: use cost of values.getOrd() and foundOrds.get() |
| + } |
| }; |
| } |
| |
| diff --git a/lucene/join/src/java/org/apache/lucene/search/join/GlobalOrdinalsWithScoreQuery.java b/lucene/join/src/java/org/apache/lucene/search/join/GlobalOrdinalsWithScoreQuery.java |
| index 385b302..c7763b7 100644 |
| --- a/lucene/join/src/java/org/apache/lucene/search/join/GlobalOrdinalsWithScoreQuery.java |
| +++ b/lucene/join/src/java/org/apache/lucene/search/join/GlobalOrdinalsWithScoreQuery.java |
| @@ -211,6 +211,10 @@ final class GlobalOrdinalsWithScoreQuery extends Query { |
| return false; |
| } |
| |
| + @Override |
| + public float matchCost() { |
| + return 100; // fixed stand-in, not measured; TODO: use cost of values.getOrd() and collector.score() |
| + } |
| }; |
| } |
| } |
| @@ -253,6 +257,11 @@ final class GlobalOrdinalsWithScoreQuery extends Query { |
| } |
| return false; |
| } |
| + |
| + @Override |
| + public float matchCost() { |
| + return 100; // fixed stand-in, not measured; TODO: use cost of values.getOrd() and collector.score() |
| + } |
| }; |
| } |
| } |
| diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/ValueSourceScorer.java b/lucene/queries/src/java/org/apache/lucene/queries/function/ValueSourceScorer.java |
| index c8e946e..a071a95 100644 |
| --- a/lucene/queries/src/java/org/apache/lucene/queries/function/ValueSourceScorer.java |
| +++ b/lucene/queries/src/java/org/apache/lucene/queries/function/ValueSourceScorer.java |
| @@ -53,6 +53,11 @@ public abstract class ValueSourceScorer extends Scorer { |
| public boolean matches() throws IOException { |
| return ValueSourceScorer.this.matches(docID()); |
| } |
| + |
| + @Override |
| + public float matchCost() { /* cost estimate for the matches() check above */ |
| + return 100; // fixed stand-in, not measured; TODO: use cost of ValueSourceScorer.this.matches() |
| + } |
| }; |
| this.disi = TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator); |
| } |
| diff --git a/lucene/queries/src/java/org/apache/lucene/queries/payloads/PayloadScoreQuery.java b/lucene/queries/src/java/org/apache/lucene/queries/payloads/PayloadScoreQuery.java |
| index c805581..9602bd6 100644 |
| --- a/lucene/queries/src/java/org/apache/lucene/queries/payloads/PayloadScoreQuery.java |
| +++ b/lucene/queries/src/java/org/apache/lucene/queries/payloads/PayloadScoreQuery.java |
| @@ -274,6 +274,11 @@ public class PayloadScoreQuery extends SpanQuery { |
| public long cost() { |
| return in.cost(); |
| } |
| + |
| + @Override |
| + public float positionsCost() { |
| + return in.positionsCost(); /* passes through the value reported by the wrapped in, as cost() above does for in.cost() */ |
| + } |
| } |
| |
| } |
| diff --git a/lucene/spatial/src/java/org/apache/lucene/spatial/composite/CompositeVerifyQuery.java b/lucene/spatial/src/java/org/apache/lucene/spatial/composite/CompositeVerifyQuery.java |
| index d49fb41..a7ccfb5 100644 |
| --- a/lucene/spatial/src/java/org/apache/lucene/spatial/composite/CompositeVerifyQuery.java |
| +++ b/lucene/spatial/src/java/org/apache/lucene/spatial/composite/CompositeVerifyQuery.java |
| @@ -108,6 +108,11 @@ public class CompositeVerifyQuery extends Query { |
| public boolean matches() throws IOException { |
| return predFuncValues.boolVal(indexQueryScorer.docID()); |
| } |
| + |
| + @Override |
| + public float matchCost() { /* cost estimate for the matches() check above */ |
| + return 100; // fixed stand-in, not measured; TODO: use cost of predFuncValues.boolVal() |
| + } |
| }; |
| |
| return new ConstantScoreScorer(this, score(), twoPhaseIterator); |
| diff --git a/lucene/spatial/src/java/org/apache/lucene/spatial/composite/IntersectsRPTVerifyQuery.java b/lucene/spatial/src/java/org/apache/lucene/spatial/composite/IntersectsRPTVerifyQuery.java |
| index 798550f..7810c21 100644 |
| --- a/lucene/spatial/src/java/org/apache/lucene/spatial/composite/IntersectsRPTVerifyQuery.java |
| +++ b/lucene/spatial/src/java/org/apache/lucene/spatial/composite/IntersectsRPTVerifyQuery.java |
| @@ -130,6 +130,11 @@ public class IntersectsRPTVerifyQuery extends Query { |
| |
| return predFuncValues.boolVal(doc); |
| } |
| + |
| + @Override |
| + public float matchCost() { |
| + return 100; // fixed stand-in, not measured; TODO: use cost of exactIterator.advance() and predFuncValues.boolVal() |
| + } |
| }; |
| |
| return new ConstantScoreScorer(this, score(), twoPhaseIterator); |
| diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/AssertingScorer.java b/lucene/test-framework/src/java/org/apache/lucene/search/AssertingScorer.java |
| index 2bc61ef..78f6f6c 100644 |
| --- a/lucene/test-framework/src/java/org/apache/lucene/search/AssertingScorer.java |
| +++ b/lucene/test-framework/src/java/org/apache/lucene/search/AssertingScorer.java |
| @@ -195,6 +195,19 @@ public class AssertingScorer extends Scorer { |
| } |
| return matches; |
| } |
| + |
| + @Override |
| + public float matchCost() { /* returns the wrapped in's matchCost() after sanity-checking it */ |
| + float matchCost = in.matchCost(); |
| + assert ! Float.isNaN(matchCost) : "matchCost() should not be NaN"; |
| + assert matchCost >= 0 : "matchCost() must be non-negative"; /* zero is accepted */ |
| + return matchCost; |
| + } |
| + |
| + @Override |
| + public String toString() { |
| + return "AssertingScorer@asTwoPhaseIterator(" + in + ")"; /* fixed label wrapped around the string form of in */ |
| + } |
| }; |
| } |
| } |
| diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/RandomApproximationQuery.java b/lucene/test-framework/src/java/org/apache/lucene/search/RandomApproximationQuery.java |
| index 88cfd77..53a3610 100644 |
| --- a/lucene/test-framework/src/java/org/apache/lucene/search/RandomApproximationQuery.java |
| +++ b/lucene/test-framework/src/java/org/apache/lucene/search/RandomApproximationQuery.java |
| @@ -172,10 +172,12 @@ public class RandomApproximationQuery extends Query { |
| |
| private final DocIdSetIterator disi; |
| private int lastDoc = -1; |
| + private final float randomMatchCost; |
| |
| RandomTwoPhaseView(Random random, DocIdSetIterator disi) { |
| super(new RandomApproximation(random, disi)); |
| this.disi = disi; |
| + this.randomMatchCost = random.nextFloat() * 200; // in [0, 200): nextFloat() returns a value in [0, 1) |
| } |
| |
| @Override |
| @@ -190,6 +192,11 @@ public class RandomApproximationQuery extends Query { |
| return approximation.docID() == disi.docID(); |
| } |
| |
| + @Override |
| + public float matchCost() { |
| + TwoPhaseIterator tpi = TwoPhaseIterator.asTwoPhaseIterator(approximation); /* presumably null when approximation exposes no two-phase view -- confirm */ |
| + return (tpi == null) ? randomMatchCost : tpi.matchCost(); // uses the random cost picked in the constructor when tpi is null; TODO: is randomMatchCost ok? |
| + } |
| } |
| |
| private static class RandomApproximation extends DocIdSetIterator { |
| diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/spans/AssertingSpans.java b/lucene/test-framework/src/java/org/apache/lucene/search/spans/AssertingSpans.java |
| index 89a4ed2..18053a9 100644 |
| --- a/lucene/test-framework/src/java/org/apache/lucene/search/spans/AssertingSpans.java |
| +++ b/lucene/test-framework/src/java/org/apache/lucene/search/spans/AssertingSpans.java |
| @@ -191,6 +191,14 @@ class AssertingSpans extends Spans { |
| } |
| |
| @Override |
| + public float positionsCost() { |
| + float cost = in.positionsCost(); /* value reported by the wrapped in */ |
| + assert ! Float.isNaN(cost) : "positionsCost() should not be NaN"; |
| + assert cost > 0 : "positionsCost() must be positive"; /* strict comparison: zero is rejected as well */ |
| + return cost; |
| + } |
| + |
| + @Override |
| protected float scoreCurrentDoc() throws IOException { |
| assert in.docScorer != null : in.getClass() + " has no docScorer!"; |
| return in.scoreCurrentDoc(); |
| @@ -229,6 +237,18 @@ class AssertingSpans extends Spans { |
| } |
| return v; |
| } |
| + |
| + @Override |
| + public float matchCost() { |
| + float cost = in.matchCost(); /* value reported by the wrapped in */ |
| + if (Float.isNaN(cost)) { /* explicit throw rather than assert, so the check does not depend on -ea */ |
| + throw new AssertionError("matchCost()=" + cost + " should not be NaN on doc ID " + approximation.docID()); |
| + } |
| + if (cost < 0) { /* zero is accepted; only negative values are rejected */ |
| + throw new AssertionError("matchCost()=" + cost + " should be non negative on doc ID " + approximation.docID()); |
| + } |
| + return cost; |
| + } |
| } |
| |
| class AssertingDISI extends DocIdSetIterator { |
| diff --git a/solr/core/src/java/org/apache/solr/search/Filter.java b/solr/core/src/java/org/apache/solr/search/Filter.java |
| index 6f968a8..98c5d2d 100644 |
| --- a/solr/core/src/java/org/apache/solr/search/Filter.java |
| +++ b/solr/core/src/java/org/apache/solr/search/Filter.java |
| @@ -129,6 +129,11 @@ public abstract class Filter extends Query { |
| public boolean matches() throws IOException { |
| return bits.get(approximation.docID()); |
| } |
| + |
| + @Override |
| + public float matchCost() { /* cost estimate for the matches() check above */ |
| + return 10; // fixed stand-in, not measured; TODO: use cost of bits.get() |
| + } |
| }; |
| return new ConstantScoreScorer(this, 0f, twoPhase); |
| } |