| diff --git a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java |
| index c718dc9..ecca69a 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java |
| @@ -29,11 +29,14 @@ import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.LeafReaderContext; |
| import org.apache.lucene.index.PostingsEnum; |
| import org.apache.lucene.index.Term; |
| +import org.apache.lucene.index.Terms; |
| import org.apache.lucene.index.TermContext; |
| import org.apache.lucene.index.TermState; |
| import org.apache.lucene.index.TermsEnum; |
| import org.apache.lucene.search.similarities.Similarity; |
| import org.apache.lucene.search.similarities.Similarity.SimScorer; |
| +import org.apache.lucene.search.MatchNoDocsQuery; |
| + |
| |
| /** |
| * A query that treats multiple terms as synonyms. |
| @@ -45,6 +48,7 @@ import org.apache.lucene.search.similarities.Similarity.SimScorer; |
| */ |
| public final class SynonymQuery extends Query { |
| private final Term terms[]; |
| + private final String field; |
| |
| /** |
| * Creates a new SynonymQuery, matching any of the supplied terms. |
| @@ -62,16 +66,23 @@ public final class SynonymQuery extends Query { |
| throw new IllegalArgumentException("Synonyms must be across the same field"); |
| } |
| } |
| + this.field = field; |
| if (terms.length > BooleanQuery.getMaxClauseCount()) { |
| throw new BooleanQuery.TooManyClauses(); |
| } |
| Arrays.sort(this.terms); |
| } |
| |
| + /** The terms to be treated as synonyms. */ |
| public List<Term> getTerms() { |
| return Collections.unmodifiableList(Arrays.asList(terms)); |
| } |
| |
| + /** The field of the terms. */ |
| + public String getField() { |
| + return field; |
| + } |
| + |
| @Override |
| public String toString(String field) { |
| StringBuilder builder = new StringBuilder("Synonym("); |
| @@ -101,7 +112,7 @@ public final class SynonymQuery extends Query { |
| public Query rewrite(IndexReader reader) throws IOException { |
| // optimize zero and single term cases |
| if (terms.length == 0) { |
| - return new BooleanQuery.Builder().build(); |
| + return new MatchNoDocsQuery(); |
| } |
| if (terms.length == 1) { |
| return new TermQuery(terms[0]); |
| @@ -123,7 +134,7 @@ public final class SynonymQuery extends Query { |
| } |
| } |
| |
| - class SynonymWeight extends Weight { |
| + public class SynonymWeight extends Weight { |
| private final TermContext termContexts[]; |
| private final Similarity similarity; |
| private final Similarity.SimWeight simWeight; |
| @@ -183,20 +194,42 @@ public final class SynonymQuery extends Query { |
| return Explanation.noMatch("no matching term"); |
| } |
| |
| + /** |
| + * Expert: Return a SimScorer for this context. |
| + * Public only for use in the spans package. |
| + * @param context the LeafReaderContext |
| +     * @return a SimScorer |
| + * @throws IOException on error |
| + */ |
| + public Similarity.SimScorer getSimScorer(LeafReaderContext context) throws IOException { |
| + return similarity.simScorer(simWeight, context); |
| + } |
| + |
| + /** |
| + * Expert: Return a TermContext array in the same order as the terms. |
| + * Public only for use in the spans package, do not modify. |
| + */ |
| + public TermContext[] getTermContexts() { |
| + return termContexts; |
| + } |
| + |
| @Override |
| public Scorer scorer(LeafReaderContext context) throws IOException { |
| - Similarity.SimScorer simScorer = similarity.simScorer(simWeight, context); |
| + Similarity.SimScorer simScorer = getSimScorer(context); |
| // we use termscorers + disjunction as an impl detail |
| List<Scorer> subScorers = new ArrayList<>(); |
| + Terms fieldTerms = context.reader().terms(field); |
| + if (fieldTerms != null) { |
| + TermsEnum termsEnum = fieldTerms.iterator(); |
| for (int i = 0; i < terms.length; i++) { |
| TermState state = termContexts[i].get(context.ord); |
| if (state != null) { |
| - TermsEnum termsEnum = context.reader().terms(terms[i].field()).iterator(); |
| termsEnum.seekExact(terms[i].bytes(), state); |
| PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS); |
| subScorers.add(new TermScorer(this, postings, simScorer)); |
| } |
| } |
| + } |
| if (subScorers.isEmpty()) { |
| return null; |
| } else if (subScorers.size() == 1) { |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java |
| index 6763118..4f7f5a5 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java |
| @@ -75,6 +75,7 @@ public class BM25Similarity extends Similarity { |
| |
| /** Implemented as <code>1 / (distance + 1)</code>. */ |
| protected float sloppyFreq(int distance) { |
| + assert distance <= Integer.MAX_VALUE - 1; |
| return 1.0f / (distance + 1); |
| } |
| |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java |
| index 5a1e237..66b2cc1 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java |
| @@ -115,6 +115,7 @@ public class ClassicSimilarity extends TFIDFSimilarity { |
| /** Implemented as <code>1 / (distance + 1)</code>. */ |
| @Override |
| public float sloppyFreq(int distance) { |
| + assert distance <= Integer.MAX_VALUE - 1; |
| return 1.0f / (distance + 1); |
| } |
| |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java |
| index 7f0f27c..fdf8799 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java |
| @@ -143,7 +143,7 @@ public abstract class Similarity { |
| * API for scoring "sloppy" queries such as {@link TermQuery}, |
| * {@link SpanQuery}, and {@link PhraseQuery}. |
| * <p> |
| - * Frequencies are floating-point values: an approximate |
| + * Frequencies may be floating-point values to allow an approximate |
| * within-document frequency adjusted for "sloppiness" by |
| * {@link SimScorer#computeSlopFactor(int)}. |
| */ |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/AsSingleTermSpansDocScorer.java b/lucene/core/src/java/org/apache/lucene/search/spans/AsSingleTermSpansDocScorer.java |
| new file mode 100644 |
| index 0000000..0be25d8 |
| --- /dev/null |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/AsSingleTermSpansDocScorer.java |
| @@ -0,0 +1,166 @@ |
| +/* |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| +package org.apache.lucene.search.spans; |
| + |
| +import java.io.IOException; |
| +import static java.util.Arrays.sort; |
| + |
| +import org.apache.lucene.search.similarities.Similarity.SimScorer; |
| +import static org.apache.lucene.util.ArrayUtil.oversize; |
| + |
| + |
| +/** |
| + * For {@link SpansTreeQuery}. Public for extension. |
| + * |
| + * @lucene.experimental |
| + */ |
| +public abstract class AsSingleTermSpansDocScorer<SpansT extends Spans> |
| +extends SpansDocScorer<SpansT> { |
| + |
| + protected final SimScorer simScorer; |
| + protected final double nonMatchWeight; |
| + |
| + protected int currentDoc = -1; |
| + protected int tf; |
| + protected int matchTF; |
| + protected int lastCountedPosition; |
| + protected double[] occSlops; |
| + |
| + protected final int INIT_SLOPS_SIZE = 2; // CHECKME: use average term frequency? |
| + |
| + /** |
| + * @param spans Provides matching term occurrences. |
| + * @param simScorer Scores the matching and non matching term occurrences per document. |
| + * @param nonMatchWeight The non negative weight to be used for the non matching term occurrences. |
| + */ |
| + public AsSingleTermSpansDocScorer(SpansT spans, SimScorer simScorer, double nonMatchWeight) { |
| + super(spans); |
| + this.simScorer = simScorer; |
| + this.nonMatchWeight = nonMatchWeight; |
| + assert nonMatchWeight >= 0 : ("nonMatchWeight="+ nonMatchWeight); |
| + this.occSlops = new double[INIT_SLOPS_SIZE]; |
| + } |
| + |
| + /** The total number of occurrences of the term in the current document. |
| + */ |
| + public abstract int termFreqInDoc() throws IOException; |
| + |
| + @Override |
| + public void beginDoc() throws IOException { |
| + matchTF = 0; |
| + lastCountedPosition = -1; |
| +    currentDoc = docID(); // checked in asserts and passed to simScorer.score() in docScore() |
| + |
| + tf = termFreqInDoc(); |
| + assert tf >= 1; |
| + if (occSlops.length < tf) { |
| + occSlops = new double[oversize(tf, Double.BYTES)]; |
| + } |
| + } |
| + |
| + /** Record a matching term occurrence and record its slopFactor. |
| + * Keep the largest slop factor when the spans start position |
| + * has not changed. |
| + */ |
| + @Override |
| + public void recordMatch(double slopFactor) { |
| + assert docID() == currentDoc; |
| + assert slopFactor >= 0; |
| + int currentPosition = spans.startPosition(); |
| + assert currentPosition != Spans.NO_MORE_POSITIONS; |
| + if (lastCountedPosition < currentPosition) { |
| + occSlops[matchTF] = slopFactor; |
| + matchTF += 1; |
| + assert matchTF <= tf; |
| + lastCountedPosition = currentPosition; |
| + } else { |
| + assert lastCountedPosition == currentPosition; |
| + assert matchTF >= 1; |
| + if (slopFactor > occSlops[matchTF-1]) { |
| + occSlops[matchTF-1] = slopFactor; |
| + } |
| + } |
| + } |
| + |
| + @Override |
| + public int docMatchFreq() { |
| + assert docID() == currentDoc; |
| + return matchTF; |
| + } |
| + |
| + /** Compute the document score for the term. |
| + * <br> |
| + * For each matching occurrence determine the score contribution |
| + * and use the given slop factors in decreasing order as weights |
| + * on this contribution. |
| + * <br> |
| +   * Use the <code>nonMatchWeight</code> as the weight for the score contribution |
| + * of the non matching occurrences. |
| + * <br> |
| + * For this it is assumed that {@link SimScorer#score(int, float)} provides |
| + * a diminishing (at least non increasing) |
| + * score contribution for each extra term occurrence. |
| + * <br> |
| + * Return the sum of these weighted contributions over all term occurrences. |
| + * <p> |
| + * The implementation is not optimal, especially when there are many |
| + * matching occurrences with the same slop factors. |
| + * <p> |
| + * Aside: The purpose of using the given slop factors in decreasing order |
| + * is to provide scoring consistency |
| + * between span near queries that only differ in the maximum allowed slop. |
| + * This consistency requires that any extra match increases the score of the document, |
| + * even when an extra match has a bigger slop and corresponding lower slop factor. |
| + * It is not known whether such scoring consistency is always achieved. |
| + * <br> |
| + * Sorting the slop factors could be avoided if an actual score |
| + * of each single term occurrence was available. |
| + * In that case the given slop factor could be used as a weight on that score. |
| + * Perhaps it is possible to estimate an actual score for a single term |
| + * occurrence from the distances to other occurrences of the same term. |
| + */ |
| + @Override |
| + public double docScore() throws IOException { |
| + assert docID() == currentDoc; |
| + double docScore = 0; |
| + |
| + assert simScorer.score(currentDoc, 0) == 0; |
| + double cumulMatchTFScore = 0; |
| + |
| + if (matchTF > 0) { |
| + sort(occSlops, 0, matchTF); |
| + assert occSlops[0] >= nonMatchWeight; // non match distance large enough |
| + |
| + for (int matchOcc = 1; matchOcc <= matchTF; matchOcc++) { |
| + double prev = cumulMatchTFScore; |
| + cumulMatchTFScore = simScorer.score(currentDoc, (float) (matchOcc)); |
| + double matchTFScore = cumulMatchTFScore - prev; // matchTFScore should not increase |
| +        // use occurrence slop factors in decreasing order: |
| + docScore += matchTFScore * occSlops[matchTF - matchOcc]; |
| + } |
| + } |
| + |
| + if (matchTF < tf) { // non matching occurrences |
| + double tfScore = simScorer.score(currentDoc, (float) tf); |
| + double nonMatchingFreqScore = tfScore - cumulMatchTFScore; |
| + double nonMatchScore = nonMatchingFreqScore * nonMatchWeight; |
| + docScore += nonMatchScore; |
| + } |
| + |
| + return docScore; |
| + } |
| +} |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/ConjunctionNearSpans.java b/lucene/core/src/java/org/apache/lucene/search/spans/ConjunctionNearSpans.java |
| new file mode 100644 |
| index 0000000..f31b86b |
| --- /dev/null |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/ConjunctionNearSpans.java |
| @@ -0,0 +1,38 @@ |
| +/* |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| +package org.apache.lucene.search.spans; |
| + |
| +import java.util.List; |
| + |
| +import org.apache.lucene.search.similarities.Similarity; |
| + |
| +/** |
| + * Spans that are all present within a given slop. |
| + * |
| + * @lucene.experimental |
| + */ |
| +public abstract class ConjunctionNearSpans extends ConjunctionSpans { |
| + protected final Similarity.SimScorer simScorer; |
| + |
| + public ConjunctionNearSpans(List<Spans> subSpans, Similarity.SimScorer simScorer) { |
| + super(subSpans); |
| + this.simScorer = simScorer; |
| + } |
| + |
| + /** Compute the slop of the current match. */ |
| + public abstract int currentSlop(); |
| +} |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/ConjunctionNearSpansDocScorer.java b/lucene/core/src/java/org/apache/lucene/search/spans/ConjunctionNearSpansDocScorer.java |
| new file mode 100644 |
| index 0000000..1de1cdf |
| --- /dev/null |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/ConjunctionNearSpansDocScorer.java |
| @@ -0,0 +1,90 @@ |
| +/* |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| +package org.apache.lucene.search.spans; |
| + |
| +import java.io.IOException; |
| +import java.util.ArrayList; |
| + |
| +import org.apache.lucene.search.similarities.Similarity.SimScorer; |
| + |
| +/** |
| + * For {@link SpansTreeQuery}. Public for extension. |
| + * |
| + * @lucene.experimental |
| + */ |
| +public class ConjunctionNearSpansDocScorer extends SpansDocScorer<ConjunctionNearSpans> { |
| + protected final SimScorer simScorer; |
| + protected final ArrayList<SpansDocScorer<?>> subSpansDocScorers; |
| + |
| + /** Create a ConjunctionNearSpansDocScorer for a ConjunctionNearSpans and its subspans. |
| + * For the subspans use {@link SpansTreeScorer#createSpansDocScorer}. |
| + */ |
| + public ConjunctionNearSpansDocScorer( |
| + SpansTreeScorer spansTreeScorer, |
| + ConjunctionNearSpans nearSpans) |
| + { |
| + super(nearSpans); |
| + this.simScorer = nearSpans.simScorer; |
| + Spans[] subSpansArray = nearSpans.getSubSpans(); |
| + this.subSpansDocScorers = new ArrayList<>(subSpansArray.length); |
| + for (Spans subSpans : subSpansArray) { |
| + SpansDocScorer<?> spansDocScorer = spansTreeScorer.createSpansDocScorer(subSpans); |
| + subSpansDocScorers.add(spansDocScorer); |
| + } |
| + } |
| + |
| + @Override |
| + public void beginDoc() throws IOException { |
| + for (SpansDocScorer<?> spansDocScorer : subSpansDocScorers) { |
| + spansDocScorer.beginDoc(); |
| + } |
| + } |
| + |
| + /** Record a matching occurrence for all subspans. |
| + * Use a slop factor that is the product of the given slopFactor |
| + * and the slop factor of {@link ConjunctionNearSpans#currentSlop}. |
| + */ |
| + @Override |
| + public void recordMatch(double slopFactor) { |
| + int slop = Integer.max(spans.currentSlop(), 0); // avoid infinite localSlopFactor for negative slop |
| + double localSlopFactor = simScorer.computeSlopFactor(slop); |
| + double nestedSlopFactor = slopFactor * localSlopFactor; |
| + for (SpansDocScorer<?> spansDocScorer : subSpansDocScorers) { |
| + spansDocScorer.recordMatch(nestedSlopFactor); |
| + } |
| + } |
| + |
| + /** Return the sum of the matching frequencies of the subspans. */ |
| + @Override |
| + public int docMatchFreq() { |
| + int freq = 0; |
| + for (SpansDocScorer<?> spansDocScorer : subSpansDocScorers) { |
| + freq += spansDocScorer.docMatchFreq(); |
| + } |
| + return freq; |
| + } |
| + |
| + /** Return the sum of document scores of the subspans. */ |
| + @Override |
| + public double docScore() throws IOException { |
| + double score = 0; |
| + for (SpansDocScorer<?> spansDocScorer : subSpansDocScorers) { |
| + score += spansDocScorer.docScore(); |
| + } |
| + return score; |
| + } |
| +} |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionNearSpans.java b/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionNearSpans.java |
| new file mode 100644 |
| index 0000000..681cfc6 |
| --- /dev/null |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionNearSpans.java |
| @@ -0,0 +1,122 @@ |
| +/* |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| +package org.apache.lucene.search.spans; |
| + |
| +import java.util.List; |
| + |
| +import org.apache.lucene.search.similarities.Similarity.SimScorer; |
| + |
| +/** |
| + * A DisjunctionSpans that also provides a slop for each match. |
| + * |
| + * See also {@link SpanOrQuery#SpanOrQuery(int, SpanQuery...)}. |
| + * |
| + * @lucene.experimental |
| + */ |
| +public class DisjunctionNearSpans extends DisjunctionSpans { |
| + protected final int maxDistance; |
| + protected final SimScorer simScorer; |
| + |
| + /** Construct a DisjunctionNearSpans. |
| + * @param spanOrQuery The query that provides the subSpans. |
| + * @param subSpans Over which the disjunction is to be taken. |
| + * @param maxDistance The maximum distance to be returned as the current match slop. |
| + * @param simScorer For computing the slop factor from the slop. |
| + */ |
| + public DisjunctionNearSpans( |
| + SpanOrQuery spanOrQuery, |
| + List<Spans> subSpans, |
| + int maxDistance, |
| + SimScorer simScorer) |
| + { |
| + super(spanOrQuery, subSpans); |
| + this.maxDistance = maxDistance; |
| + this.simScorer = simScorer; |
| + } |
| + |
| + int currentSlop; |
| + int lastDoc = -1; |
| + |
| + Spans prevFirstSpans; |
| + int prevFirstSpansEndPosition; |
| + int lastDifferentSpansEndPosition; |
| + |
| + |
| + /** |
| + * Compute the minimum slop between the currently matching |
| + * sub spans and the previous and next matching other sub spans. |
| + * When this slop is bigger than maxDistance |
| + * or no other matching spans is available, return maxDistance. |
| + * <br> |
| + * The slop is computed from the end of a spans to the beginning |
| + * of the following different one. When this is negative, zero is used. |
| + * <br> |
| + * When this method is used in a document, it must be called once at each match |
| + * in the document. |
| + * <br> |
| + * See also {@link DisjunctionNearSpansDocScorer}. |
| + */ |
| + public int currentSlop() { |
| + Spans firstSpans = byPositionQueue.top(); |
| + assert firstSpans.startPosition() != NO_MORE_POSITIONS; // at a disjunction match |
| + |
| + int currentDoc = docID(); |
| + if (lastDoc != currentDoc) { // at first match in currentDoc |
| + lastDoc = currentDoc; |
| + prevFirstSpans = null; |
| + lastDifferentSpansEndPosition = -1; |
| + } |
| + |
| + int firstSpansEndPosition = firstSpans.endPosition(); // avoid calling more than once below, no spans is moved here. |
| + |
| + int slopBefore; |
| + if (prevFirstSpans == null) { // at first match in currentDoc |
| + slopBefore = maxDistance; |
| + } else if (prevFirstSpans == firstSpans) { // sequence of same subspans. |
| + if (lastDifferentSpansEndPosition == -1) { // initial sequence of same subspans |
| + slopBefore = maxDistance; |
| + } else { // later sequence of same subspans |
| + slopBefore = Math.max(0, firstSpans.startPosition() - lastDifferentSpansEndPosition); |
| + slopBefore = Math.min(slopBefore, maxDistance); |
| + } |
| + } else { // first spans is different from previous spans |
| + slopBefore = Math.max(0, firstSpans.startPosition() - prevFirstSpansEndPosition); |
| + slopBefore = Math.min(slopBefore, maxDistance); |
| + lastDifferentSpansEndPosition = prevFirstSpansEndPosition; |
| + } |
| + prevFirstSpans = firstSpans; |
| + prevFirstSpansEndPosition = firstSpansEndPosition; |
| + |
| + int slopAfter; |
| + if (byPositionQueue.size() == 1) { // no other spans at this document |
| + slopAfter = maxDistance; |
| + } else { |
| + Spans secondSpans = byPositionQueue.subTop(); |
| + assert secondSpans != null; // byPositionQueue.size() >= 2 |
| + assert secondSpans != firstSpans; |
| + if (secondSpans.startPosition() == NO_MORE_POSITIONS) { // second exhausted in current doc |
| + slopAfter = maxDistance; |
| + } else { |
| + slopAfter = Math.max(0, secondSpans.startPosition() - firstSpansEndPosition); |
| + slopAfter = Math.min(slopAfter, maxDistance); |
| + } |
| + } |
| + |
| + currentSlop = Math.min(slopBefore, slopAfter); |
| + return currentSlop; |
| + } |
| +} |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionNearSpansDocScorer.java b/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionNearSpansDocScorer.java |
| new file mode 100644 |
| index 0000000..5f5a4da |
| --- /dev/null |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionNearSpansDocScorer.java |
| @@ -0,0 +1,51 @@ |
| +/* |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| +package org.apache.lucene.search.spans; |
| + |
| +import org.apache.lucene.search.similarities.Similarity.SimScorer; |
| + |
| + |
| +/** |
| + * For {@link SpansTreeQuery}. Public for extension. |
| + * |
| + * @lucene.experimental |
| + */ |
| +public class DisjunctionNearSpansDocScorer |
| + extends DisjunctionSpansDocScorer<DisjunctionNearSpans> { |
| + protected final SimScorer simScorer; |
| + |
| + public DisjunctionNearSpansDocScorer( |
| + SpansTreeScorer spansTreeScorer, |
| + DisjunctionNearSpans orNearSpans) |
| + { |
| + super(spansTreeScorer, orNearSpans); |
| + this.simScorer = orNearSpans.simScorer; |
| + } |
| + |
| + /** Record a match for the subspans at the first position. |
| + * Use a slop factor that is the product of the given slopFactor |
| + * and the slop factor of {@link DisjunctionNearSpans#currentSlop}. |
| + */ |
| + @Override |
| + public void recordMatch(double slopFactor) { |
| + int slop = spans.currentSlop(); |
| + double localSlopFactor = simScorer.computeSlopFactor(slop); |
| + double nestedSlopFactor = slopFactor * localSlopFactor; |
| + super.recordMatch(nestedSlopFactor); |
| + } |
| + |
| +} |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionSpans.java b/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionSpans.java |
| new file mode 100644 |
| index 0000000..2e5d271 |
| --- /dev/null |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionSpans.java |
| @@ -0,0 +1,261 @@ |
| +/* |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| +package org.apache.lucene.search.spans; |
| + |
| +import java.io.IOException; |
| +import java.util.List; |
| + |
| +import org.apache.lucene.search.DisiPriorityQueue; |
| +import org.apache.lucene.search.DisiWrapper; |
| +import org.apache.lucene.search.TwoPhaseIterator; |
| +import org.apache.lucene.search.DisjunctionDISIApproximation; |
| + |
| + |
| +/** |
| + * A spans that merges given spans. |
| + * |
| + * @lucene.experimental |
| + */ |
| +public class DisjunctionSpans extends Spans { |
| + protected final SpanQuery spanQuery; |
| + protected final List<Spans> subSpans; |
| + protected final DisiPriorityQueue byDocQueue; |
| + protected final SpanPositionQueue byPositionQueue; |
| + protected Spans topPositionSpans; |
| + |
| + /** Construct a DisjunctionSpans. |
| + * @param spanQuery The query that provides the subSpans. |
| + * @param subSpans Over which the disjunction is to be taken. |
| + */ |
| + public DisjunctionSpans(SpanQuery spanQuery, List<Spans> subSpans) { |
| + this.spanQuery = spanQuery; // for toString() only |
| + this.subSpans = subSpans; |
| + byDocQueue = new DisiPriorityQueue(subSpans.size()); |
| + for (Spans spans : subSpans) { |
| + byDocQueue.add(new DisiWrapper(spans)); |
| + } |
| + byPositionQueue = new SpanPositionQueue(subSpans.size()); // when empty use -1 |
| + topPositionSpans = null; |
| + } |
| + |
| + |
| + /** For {@link DisjunctionSpansDocScorer}. */ |
| + public List<Spans> subSpans() { |
| + return subSpans; |
| + } |
| + |
| + /** For {@link DisjunctionSpansDocScorer}. */ |
| + public void extractSubSpansAtCurrentDoc(List<Spans> spansList) { |
| + byPositionQueue.extractSpansList(spansList); |
| + } |
| + |
| + /** For {@link DisjunctionSpansDocScorer}. */ |
| + public Spans getFirstPositionSpans() { |
| + return byPositionQueue.top(); |
| + } |
| + |
| + @Override |
| + public int nextDoc() throws IOException { |
| + topPositionSpans = null; |
| + DisiWrapper topDocSpans = byDocQueue.top(); |
| + int currentDoc = topDocSpans.doc; |
| + do { |
| + topDocSpans.doc = topDocSpans.iterator.nextDoc(); |
| + topDocSpans = byDocQueue.updateTop(); |
| + } while (topDocSpans.doc == currentDoc); |
| + return topDocSpans.doc; |
| + } |
| + |
| + @Override |
| + public int advance(int target) throws IOException { |
| + topPositionSpans = null; |
| + DisiWrapper topDocSpans = byDocQueue.top(); |
| + do { |
| + topDocSpans.doc = topDocSpans.iterator.advance(target); |
| + topDocSpans = byDocQueue.updateTop(); |
| + } while (topDocSpans.doc < target); |
| + return topDocSpans.doc; |
| + } |
| + |
| + @Override |
| + public int docID() { |
| + DisiWrapper topDocSpans = byDocQueue.top(); |
| + return topDocSpans.doc; |
| + } |
| + |
| + @Override |
| + public TwoPhaseIterator asTwoPhaseIterator() { |
| + float sumMatchCost = 0; // See also DisjunctionScorer.asTwoPhaseIterator() |
| + long sumApproxCost = 0; |
| + |
| + for (DisiWrapper w : byDocQueue) { |
| + if (w.twoPhaseView != null) { |
| + long costWeight = (w.cost <= 1) ? 1 : w.cost; |
| + sumMatchCost += w.twoPhaseView.matchCost() * costWeight; |
| + sumApproxCost += costWeight; |
| + } |
| + } |
| + |
| + if (sumApproxCost == 0) { // no sub spans supports approximations |
| + computePositionsCost(); |
| + return null; |
| + } |
| + |
| + final float matchCost = sumMatchCost / sumApproxCost; |
| + |
| + return new TwoPhaseIterator(new DisjunctionDISIApproximation(byDocQueue)) { |
| + @Override |
| + public boolean matches() throws IOException { |
| + return twoPhaseCurrentDocMatches(); |
| + } |
| + |
| + @Override |
| + public float matchCost() { |
| + return matchCost; |
| + } |
| + }; |
| + } |
| + |
| + float positionsCost = -1; |
| + |
| + void computePositionsCost() { // sets positionsCost to the cost-weighted average over the sub spans |
| + float weightedPositionsCost = 0; |
| + long totalWeight = 0; |
| + for (DisiWrapper wrapper : byDocQueue) { |
| + final long weight = Math.max(1, wrapper.cost); // a sub spans weighs at least 1 |
| + weightedPositionsCost += wrapper.spans.positionsCost() * weight; |
| + totalWeight += weight; |
| + } |
| + positionsCost = weightedPositionsCost / totalWeight; |
| + } |
| + |
| + @Override |
| + public float positionsCost() { |
| + // This may be called when asTwoPhaseIterator() returned null, which happens when none of the |
| + // sub spans supports approximations; that path runs computePositionsCost() before returning. |
| + assert positionsCost > 0; |
| + return positionsCost; |
| + } |
| + |
| + int lastDocTwoPhaseMatched = -1; |
| + |
| + boolean twoPhaseCurrentDocMatches() throws IOException { // confirms the current approximated doc |
| + DisiWrapper listAtCurrentDoc = byDocQueue.topList(); |
| + // remove the head of the list as long as it does not match |
| + final int currentDoc = listAtCurrentDoc.doc; |
| + while (listAtCurrentDoc.twoPhaseView != null) { // a wrapper without a two-phase view ends the loop as a match |
| + if (listAtCurrentDoc.twoPhaseView.matches()) { |
| + // use this spans for positions at current doc: |
| + listAtCurrentDoc.lastApproxMatchDoc = currentDoc; |
| + break; |
| + } |
| + // do not use this spans for positions at current doc: |
| + listAtCurrentDoc.lastApproxNonMatchDoc = currentDoc; |
| + listAtCurrentDoc = listAtCurrentDoc.next; |
| + if (listAtCurrentDoc == null) { |
| + return false; // every sub spans at this doc was rejected by its matches() |
| + } |
| + } |
| + lastDocTwoPhaseMatched = currentDoc; // lets fillPositionQueue() use the outcomes recorded above |
| + topPositionSpans = null; // positions start again at this doc |
| + return true; |
| + } |
| + |
| + void fillPositionQueue() throws IOException { // called at first nextStartPosition |
| + assert byPositionQueue.size() == 0; |
| + // add all matching Spans at current doc to byPositionQueue |
| + DisiWrapper listAtCurrentDoc = byDocQueue.topList(); |
| + while (listAtCurrentDoc != null) { |
| + Spans spansAtDoc = listAtCurrentDoc.spans; // set to null below to skip this sub spans at this doc |
| + if (lastDocTwoPhaseMatched == listAtCurrentDoc.doc) { // matched by DisjunctionDISIApproximation |
| + if (listAtCurrentDoc.twoPhaseView != null) { // matched by approximation |
| + if (listAtCurrentDoc.lastApproxNonMatchDoc == listAtCurrentDoc.doc) { // matches() returned false |
| + spansAtDoc = null; |
| + } else { |
| + if (listAtCurrentDoc.lastApproxMatchDoc != listAtCurrentDoc.doc) { // no match recorded at this doc |
| + if (!listAtCurrentDoc.twoPhaseView.matches()) { |
| + spansAtDoc = null; |
| + } |
| + } |
| + } |
| + } |
| + } |
| + |
| + if (spansAtDoc != null) { |
| + assert spansAtDoc.docID() == listAtCurrentDoc.doc; |
| + assert spansAtDoc.startPosition() == -1; |
| + spansAtDoc.nextStartPosition(); // move to its first position before it is queued |
| + assert spansAtDoc.startPosition() != NO_MORE_POSITIONS; |
| + byPositionQueue.add(spansAtDoc); |
| + } |
| + listAtCurrentDoc = listAtCurrentDoc.next; |
| + } |
| + assert byPositionQueue.size() > 0; // at least one sub spans is expected to match at the current doc |
| + } |
| + |
| + @Override |
| + public int nextStartPosition() throws IOException { |
| + if (topPositionSpans == null) { // no position was taken yet at the current doc |
| + byPositionQueue.clear(); |
| + fillPositionQueue(); // fills byPositionQueue at first position |
| + topPositionSpans = byPositionQueue.top(); |
| + } else { |
| + topPositionSpans.nextStartPosition(); // move the current top to its next position |
| + topPositionSpans = byPositionQueue.updateTop(); // and take the new top of the queue |
| + } |
| + return topPositionSpans.startPosition(); |
| + } |
| + |
| + @Override |
| + public int startPosition() { |
| + return (topPositionSpans != null) ? topPositionSpans.startPosition() : -1; // -1 before the first position |
| + } |
| + |
| + @Override |
| + public int endPosition() { |
| + return (topPositionSpans != null) ? topPositionSpans.endPosition() : -1; // -1 before the first position |
| + } |
| + |
| + @Override |
| + public int width() { // NOTE(review): no null guard, unlike startPosition()/endPosition(); presumably only called when positioned - confirm |
| + return topPositionSpans.width(); |
| + } |
| + |
| + @Override |
| + public void collect(SpanCollector collector) throws IOException { |
| + if (topPositionSpans == null) { return; } // not positioned: nothing to collect |
| + topPositionSpans.collect(collector); |
| + } |
| + |
| + @Override |
| + public String toString() { // shows the query, the current doc and the current position range |
| + return "DisjunctionSpans(" + spanQuery + ")@" + docID() + ": " + startPosition() + " - " + endPosition(); |
| + } |
| + |
| + long cost = -1; |
| + |
| + @Override |
| + public long cost() { |
| + if (cost != -1) { |
| + return cost; // already summed by an earlier call |
| + } |
| + long total = 0; |
| + for (Spans sub : subSpans) { total += sub.cost(); } |
| + cost = total; |
| + return cost; |
| + } |
| +} |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionSpansDocScorer.java b/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionSpansDocScorer.java |
| new file mode 100644 |
| index 0000000..4831580 |
| --- /dev/null |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionSpansDocScorer.java |
| @@ -0,0 +1,84 @@ |
| +/* |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| +package org.apache.lucene.search.spans; |
| + |
| +import java.io.IOException; |
| +import java.util.List; |
| +import java.util.ArrayList; |
| + |
| +/** |
| + * Scores a document for a {@link DisjunctionSpans} by delegating to the doc scorers of its sub spans. |
| + * For {@link SpansTreeQuery}. Public for extension. |
| + * @lucene.experimental |
| + */ |
| +public class DisjunctionSpansDocScorer<DisjunctionSpansT extends DisjunctionSpans> |
| + extends SpansDocScorer<DisjunctionSpansT> { |
| + protected final ArrayList<Spans> subSpansAtDoc; // refilled by beginDoc() for each doc |
| + |
| + /** Create a DisjunctionSpansDocScorer for a DisjunctionSpans and its subspans. |
| + * The given {@link SpansTreeScorer#createSpansDocScorer} is called for each of the subspans. |
| + */ |
| + public DisjunctionSpansDocScorer( |
| + SpansTreeScorer spansTreeScorer, |
| + DisjunctionSpansT orSpans) |
| + { |
| + super(orSpans); |
| + final List<Spans> allSubSpans = orSpans.subSpans(); |
| + this.subSpansAtDoc = new ArrayList<>(allSubSpans.size()); |
| + for (Spans sub : allSubSpans) { |
| + spansTreeScorer.createSpansDocScorer(sub); |
| + } |
| + } |
| + |
| + @Override |
| + public void beginDoc() throws IOException { // collect the sub spans at the current doc and begin each |
| + subSpansAtDoc.clear(); |
| + spans.extractSubSpansAtCurrentDoc(subSpansAtDoc); |
| + assert !subSpansAtDoc.isEmpty() : "empty subSpansAtDoc docID=" + docID(); |
| + for (Spans sub : subSpansAtDoc) { |
| + sub.spansDocScorer.beginDoc(); |
| + } |
| + } |
| + |
| + /** Pass a match with the given slop factor on to the subspans at the first position. */ |
| + @Override |
| + public void recordMatch(double slopFactor) { |
| + final Spans first = spans.getFirstPositionSpans(); |
| + assert subSpansAtDoc.contains(first); |
| + first.spansDocScorer.recordMatch(slopFactor); |
| + } |
| + |
| + /** Return the matching frequencies of the subspans at the current doc, added up. */ |
| + @Override |
| + public int docMatchFreq() { |
| + int total = 0; |
| + for (Spans sub : subSpansAtDoc) { |
| + total += sub.spansDocScorer.docMatchFreq(); |
| + } |
| + return total; |
| + } |
| + |
| + /** Return the document scores of the subspans at the current doc, added up. */ |
| + @Override |
| + public double docScore() throws IOException { |
| + double total = 0; |
| + for (Spans sub : subSpansAtDoc) { |
| + total += sub.spansDocScorer.docScore(); |
| + } |
| + return total; |
| + } |
| +} |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java b/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java |
| index f405473..257999e 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java |
| @@ -20,6 +20,8 @@ package org.apache.lucene.search.spans; |
| import java.io.IOException; |
| import java.util.List; |
| |
| +import org.apache.lucene.search.similarities.Similarity; |
| + |
| /** |
| * A Spans that is formed from the ordered subspans of a SpanNearQuery |
| * where the subspans do not overlap and have a maximum slop between them. |
| @@ -42,7 +44,7 @@ import java.util.List; |
| * Expert: |
| * Only public for subclassing. Most implementations should not need this class |
| */ |
| -public class NearSpansOrdered extends ConjunctionSpans { |
| +public class NearSpansOrdered extends ConjunctionNearSpans { |
| |
| protected int matchStart = -1; |
| protected int matchEnd = -1; |
| @@ -50,8 +52,12 @@ public class NearSpansOrdered extends ConjunctionSpans { |
| |
| private final int allowedSlop; |
| |
| - public NearSpansOrdered(int allowedSlop, List<Spans> subSpans) throws IOException { |
| - super(subSpans); |
| + public NearSpansOrdered( |
| + int allowedSlop, |
| + List<Spans> subSpans, |
| + Similarity.SimScorer simScorer) throws IOException |
| + { |
| + super(subSpans, simScorer); |
| this.atFirstInCurrentDoc = true; // -1 startPosition/endPosition also at doc -1 |
| this.allowedSlop = allowedSlop; |
| } |
| @@ -144,6 +150,11 @@ public class NearSpansOrdered extends ConjunctionSpans { |
| } |
| |
| @Override |
| + public int currentSlop() { |
| + return matchWidth; |
| + } |
| + |
| + @Override |
| public void collect(SpanCollector collector) throws IOException { |
| for (Spans span : subSpans) { |
| span.collect(collector); |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java b/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java |
| index c3402bc..c14f2fa 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java |
| @@ -23,6 +23,7 @@ import java.util.List; |
| |
| import org.apache.lucene.search.TwoPhaseIterator; |
| import org.apache.lucene.util.PriorityQueue; |
| +import org.apache.lucene.search.similarities.Similarity; |
| |
| /** |
| * Similar to {@link NearSpansOrdered}, but for the unordered case. |
| @@ -30,17 +31,19 @@ import org.apache.lucene.util.PriorityQueue; |
| * Expert: |
| * Only public for subclassing. Most implementations should not need this class |
| */ |
| -public class NearSpansUnordered extends ConjunctionSpans { |
| +public class NearSpansUnordered extends ConjunctionNearSpans { |
| |
| private List<SpansCell> subSpanCells; // in query order |
| private final int allowedSlop; |
| |
| private SpanPositionQueue spanPositionQueue; |
| |
| - public NearSpansUnordered(int allowedSlop, List<Spans> subSpans) |
| - throws IOException { |
| - super(subSpans); |
| - |
| + public NearSpansUnordered( |
| + int allowedSlop, |
| + List<Spans> subSpans, |
| + Similarity.SimScorer simScorer) throws IOException |
| + { |
| + super(subSpans, simScorer); |
| this.subSpanCells = new ArrayList<>(subSpans.size()); |
| for (Spans subSpan : subSpans) { // sub spans in query order |
| this.subSpanCells.add(new SpansCell(subSpan)); |
| @@ -190,9 +193,14 @@ public class NearSpansUnordered extends ConjunctionSpans { |
| return spanPositionQueue.top(); |
| } |
| |
| + @Override |
| + public int currentSlop() { // the value that atMatch() compares to allowedSlop |
| + return maxEndPositionCell.endPosition() - minPositionCell().startPosition() - totalSpanLength; |
| + } |
| + |
| private boolean atMatch() { |
| assert minPositionCell().docID() == maxEndPositionCell.docID(); |
| - return (maxEndPositionCell.endPosition() - minPositionCell().startPosition() - totalSpanLength) <= allowedSlop; |
| + return currentSlop() <= allowedSlop; |
| } |
| |
| @Override |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanNearQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanNearQuery.java |
| index 7958f47..75d14fe 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanNearQuery.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanNearQuery.java |
| @@ -48,6 +48,7 @@ public class SpanNearQuery extends SpanQuery implements Cloneable { |
| private final String field; |
| private final List<SpanQuery> clauses = new LinkedList<>(); |
| private int slop; |
| + private int nonMatchSlop = -1; |
| |
| /** |
| * Construct a new builder |
| @@ -88,10 +89,20 @@ public class SpanNearQuery extends SpanQuery implements Cloneable { |
| } |
| |
| /** |
| + * Set the non match slop for this query. With -1, the initial value, {@link #build()} uses the three argument constructor. |
| + */ |
| + public Builder setNonMatchSlop(int nonMatchSlop) { |
| + this.nonMatchSlop = nonMatchSlop; |
| + return this; |
| + } |
| + |
| + /** |
| * Build the query |
| */ |
| public SpanNearQuery build() { |
| - return new SpanNearQuery(clauses.toArray(new SpanQuery[clauses.size()]), slop, ordered); |
| + return (nonMatchSlop == -1) |
| + ? new SpanNearQuery(clauses.toArray(new SpanQuery[clauses.size()]), slop, ordered) |
| + : new SpanNearQuery(clauses.toArray(new SpanQuery[clauses.size()]), slop, ordered, nonMatchSlop); |
| } |
| |
| } |
| @@ -113,9 +124,21 @@ public class SpanNearQuery extends SpanQuery implements Cloneable { |
| protected List<SpanQuery> clauses; |
| protected int slop; |
| protected boolean inOrder; |
| + protected int nonMatchSlop; |
| |
| protected String field; |
| |
| + /** |
| + * Construct a SpanNearQuery. |
| + * See {@link SpanNearQuery#SpanNearQuery(SpanQuery[], int, boolean, int)} for the first three parameters. |
| + * This will use <code>Integer.MAX_VALUE-1</code> for the non matching slop. |
| + * @throws IllegalArgumentException when <code>slop</code> is larger than that non matching slop. |
| + */ |
| + public SpanNearQuery(SpanQuery[] clausesIn, int slop, boolean inOrder) { |
| + // Integer.MAX_VALUE causes overflow in sloppyFreq which adds 1. |
| + this(clausesIn, slop, inOrder, Integer.MAX_VALUE-1); |
| + } |
| + |
| /** Construct a SpanNearQuery. Matches spans matching a span from each |
| * clause, with up to <code>slop</code> total unmatched positions between |
| * them. |
| @@ -124,10 +147,30 @@ public class SpanNearQuery extends SpanQuery implements Cloneable { |
| * <br>When <code>inOrder</code> is false, the spans from each clause |
| * need not be ordered and may overlap. |
| * @param clausesIn the clauses to find near each other, in the same field, at least 2. |
| - * @param slop The slop value |
| + * @param slop The allowed slop. This should be non negative and at most Integer.MAX_VALUE-1. |
| * @param inOrder true if order is important |
| + * @param nonMatchSlop |
| + * The distance for determining the slop factor to be used for non matching |
| + * occurrences. This is used for scoring by {@link SpansTreeQuery}, and it |
| + * should not be smaller than <code>slop</code>. |
| + * <br> |
| + * Smaller values of <code>nonMatchSlop</code> will increase the |
| + * score contribution of non matching occurrences |
| + * via {@link org.apache.lucene.search.similarities.Similarity.SimScorer#computeSlopFactor}. |
| + * <br> |
| + * Smaller values may lead to a scoring inconsistency between two span near queries |
| + * that only differ in the allowed slop. |
| + * For example consider query A with a smaller allowed slop and query B with a larger one. |
| + * For query B there can be more matches, and these should increase the score of B |
| + * when compared to the score of A. |
| + * For each extra match at B, the non matching score for query A should be lower than |
| + * the matching score for query B. |
| + * <br> |
| + * To have consistent scoring between two such queries, choose |
| + * a non matching scoring distance that is larger than the largest allowed distance, |
| + * and provide that to both queries. |
| */ |
| - public SpanNearQuery(SpanQuery[] clausesIn, int slop, boolean inOrder) { |
| + public SpanNearQuery(SpanQuery[] clausesIn, int slop, boolean inOrder, int nonMatchSlop) { |
| this.clauses = new ArrayList<>(clausesIn.length); |
| for (SpanQuery clause : clausesIn) { |
| if (this.field == null) { // check field |
| @@ -137,8 +180,14 @@ public class SpanNearQuery extends SpanQuery implements Cloneable { |
| } |
| this.clauses.add(clause); |
| } |
| - this.slop = slop; |
| + if (nonMatchSlop != -1) { |
| + if (nonMatchSlop < slop) { |
| + throw new IllegalArgumentException("nonMatchSlop < slop: " + nonMatchSlop + " < " + slop); |
| + } |
| + } |
| this.inOrder = inOrder; |
| + this.slop = slop; |
| + this.nonMatchSlop = nonMatchSlop; |
| } |
| |
| /** Return the clauses whose spans are matched. */ |
| @@ -152,6 +201,9 @@ public class SpanNearQuery extends SpanQuery implements Cloneable { |
| /** Return true if matches are required to be in-order.*/ |
| public boolean isInOrder() { return inOrder; } |
| |
| + /** Return the slop used for scoring non matching occurrences. */ |
| + public int getNonMatchSlop() { return nonMatchSlop; } |
| + |
| @Override |
| public String getField() { return field; } |
| |
| @@ -171,6 +223,8 @@ public class SpanNearQuery extends SpanQuery implements Cloneable { |
| buffer.append(slop); |
| buffer.append(", "); |
| buffer.append(inOrder); |
| + buffer.append(", "); |
| + buffer.append(nonMatchSlop); |
| buffer.append(")"); |
| return buffer.toString(); |
| } |
| @@ -179,7 +233,7 @@ public class SpanNearQuery extends SpanQuery implements Cloneable { |
| public SpanWeight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException { |
| List<SpanWeight> subWeights = new ArrayList<>(); |
| for (SpanQuery q : clauses) { |
| - subWeights.add(q.createWeight(searcher, false, boost)); |
| + subWeights.add(q.createWeight(searcher, needsScores, boost)); |
| } |
| return new SpanNearWeight(subWeights, searcher, needsScores ? getTermContexts(subWeights) : null, boost); |
| } |
| @@ -219,8 +273,8 @@ public class SpanNearQuery extends SpanQuery implements Cloneable { |
| } |
| |
| // all NearSpans require at least two subSpans |
| - return (!inOrder) ? new NearSpansUnordered(slop, subSpans) |
| - : new NearSpansOrdered(slop, subSpans); |
| + return (!inOrder) ? new NearSpansUnordered(slop, subSpans, getSimScorer(context)) |
| + : new NearSpansOrdered(slop, subSpans, getSimScorer(context)); |
| } |
| |
| @Override |
| @@ -262,6 +316,7 @@ public class SpanNearQuery extends SpanQuery implements Cloneable { |
| private boolean equalsTo(SpanNearQuery other) { |
| return inOrder == other.inOrder && |
| slop == other.slop && |
| + nonMatchSlop == other.nonMatchSlop && |
| clauses.equals(other.clauses); |
| } |
| |
| @@ -270,6 +325,7 @@ public class SpanNearQuery extends SpanQuery implements Cloneable { |
| int result = classHash(); |
| result ^= clauses.hashCode(); |
| result += slop; |
| + result ^= 4 * nonMatchSlop; |
| int fac = 1 + (inOrder ? 8 : 4); |
| return fac * result; |
| } |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java |
| index 00bcc4c..7441319 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java |
| @@ -98,7 +98,7 @@ public final class SpanNotQuery extends SpanQuery { |
| |
| @Override |
| public SpanWeight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException { |
| - SpanWeight includeWeight = include.createWeight(searcher, false, boost); |
| + SpanWeight includeWeight = include.createWeight(searcher, needsScores, boost); |
| SpanWeight excludeWeight = exclude.createWeight(searcher, false, boost); |
| return new SpanNotWeight(searcher, needsScores ? getTermContexts(includeWeight, excludeWeight) : null, |
| includeWeight, excludeWeight, boost); |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanOrQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanOrQuery.java |
| index 15abc7d..264f1c8 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanOrQuery.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanOrQuery.java |
| @@ -16,7 +16,6 @@ |
| */ |
| package org.apache.lucene.search.spans; |
| |
| - |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Iterator; |
| @@ -28,25 +27,46 @@ import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.LeafReaderContext; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.index.TermContext; |
| -import org.apache.lucene.search.DisiPriorityQueue; |
| -import org.apache.lucene.search.DisiWrapper; |
| -import org.apache.lucene.search.DisjunctionDISIApproximation; |
| import org.apache.lucene.search.IndexSearcher; |
| import org.apache.lucene.search.Query; |
| -import org.apache.lucene.search.TwoPhaseIterator; |
| |
| +import org.apache.lucene.search.similarities.Similarity.SimScorer; |
| |
| /** Matches the union of its clauses. |
| */ |
| public final class SpanOrQuery extends SpanQuery { |
| private List<SpanQuery> clauses; |
| private String field; |
| + private final int maxDistance; |
| |
| /** Construct a SpanOrQuery merging the provided clauses. |
| * All clauses must have the same field. |
| */ |
| public SpanOrQuery(SpanQuery... clauses) { |
| this.clauses = new ArrayList<>(clauses.length); |
| + this.maxDistance = -1; |
| + for (SpanQuery seq : clauses) { |
| + addClause(seq); |
| + } |
| + } |
| + |
| + /** Construct a SpanOrQuery merging the provided clauses, |
| + * with the scoring depending on the distances between the successive clauses. |
| + * All clauses must have the same field. |
| + * When the actual distance is larger than maxDistance, or when no other clause is present, |
| + * maxDistance determines the slop factor. Otherwise each clause occurrence is scored with |
| + * a slop factor determined by the minimum distance to the occurrence of another clause. |
| + * <br> |
| + * This scoring works only when wrapped in a {@link SpansTreeQuery}. |
| + * @param maxDistance the non negative distance used for scoring the successive occurrences of the different clauses |
| + * @throws IllegalArgumentException when maxDistance is negative |
| + */ |
| + public SpanOrQuery(int maxDistance, SpanQuery... clauses) { |
| + this.clauses = new ArrayList<>(clauses.length); |
| + this.maxDistance = maxDistance; |
| + if (maxDistance < 0) { |
| + throw new IllegalArgumentException("maxDistance must be non negative: " + maxDistance); |
| + } |
| for (SpanQuery seq : clauses) { |
| addClause(seq); |
| } |
| @@ -67,6 +87,11 @@ public final class SpanOrQuery extends SpanQuery { |
| return clauses.toArray(new SpanQuery[clauses.size()]); |
| } |
| |
| + /** Return the maximum distance used to determine a slop factor for a clause occurrence. |
| + * When no maximum distance was given, -1 is returned. |
| + */ |
| + public int getMaxDistance() { return maxDistance; } |
| + |
| @Override |
| public String getField() { return field; } |
| |
| @@ -89,7 +114,13 @@ public final class SpanOrQuery extends SpanQuery { |
| @Override |
| public String toString(String field) { |
| StringBuilder buffer = new StringBuilder(); |
| - buffer.append("spanOr(["); |
| + buffer.append("spanOr("); |
| + if (maxDistance != -1) { |
| + buffer.append("maxDistance="); |
| + buffer.append(maxDistance); |
| + buffer.append(", "); |
| + } |
| + buffer.append("["); |
| Iterator<SpanQuery> i = clauses.iterator(); |
| while (i.hasNext()) { |
| SpanQuery clause = i.next(); |
| @@ -104,31 +135,47 @@ public final class SpanOrQuery extends SpanQuery { |
| |
| @Override |
| public boolean equals(Object other) { |
| - return sameClassAs(other) && |
| - clauses.equals(((SpanOrQuery) other).clauses); |
| + return sameClassAs(other) |
| + && maxDistance == ((SpanOrQuery) other).maxDistance |
| + && clauses.equals(((SpanOrQuery) other).clauses); |
| } |
| |
| @Override |
| public int hashCode() { |
| - return classHash() ^ clauses.hashCode(); |
| + return classHash() ^ clauses.hashCode() ^ (7 * maxDistance); |
| } |
| |
| @Override |
| public SpanWeight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException { |
| List<SpanWeight> subWeights = new ArrayList<>(clauses.size()); |
| for (SpanQuery q : clauses) { |
| - subWeights.add(q.createWeight(searcher, false, boost)); |
| + subWeights.add(q.createWeight(searcher, needsScores, boost)); |
| } |
| - return new SpanOrWeight(searcher, needsScores ? getTermContexts(subWeights) : null, subWeights, boost); |
| + return new SpanOrWeight(searcher, |
| + needsScores ? getTermContexts(subWeights) : null, |
| + subWeights, |
| + needsScores, |
| + boost); |
| } |
| |
| public class SpanOrWeight extends SpanWeight { |
| |
| final List<SpanWeight> subWeights; |
| - |
| - public SpanOrWeight(IndexSearcher searcher, Map<Term, TermContext> terms, List<SpanWeight> subWeights, float boost) throws IOException { |
| + final IndexSearcher searcher; |
| + final boolean needsScores; |
| + final float boost; |
| + |
| + public SpanOrWeight(IndexSearcher searcher, |
| + Map<Term, TermContext> terms, |
| + List<SpanWeight> subWeights, |
| + boolean needsScores, |
| + float boost) throws IOException |
| + { |
| super(SpanOrQuery.this, searcher, terms, boost); |
| this.subWeights = subWeights; |
| + this.searcher = searcher; |
| + this.needsScores = needsScores; |
| + this.boost = boost; |
| } |
| |
| @Override |
| @@ -151,222 +198,35 @@ public final class SpanOrQuery extends SpanQuery { |
| |
| ArrayList<Spans> subSpans = new ArrayList<>(clauses.size()); |
| |
| + SpanWeight lastSpanWeight = null; |
| for (SpanWeight w : subWeights) { |
| Spans spans = w.getSpans(context, requiredPostings); |
| if (spans != null) { |
| subSpans.add(spans); |
| + lastSpanWeight = w; |
| } |
| } |
| |
| if (subSpans.size() == 0) { |
| return null; |
| } else if (subSpans.size() == 1) { |
| + if (maxDistance == -1) { |
| return subSpans.get(0); |
| + } else { // only weigh by slop factor of maxDistance |
| + SimScorer simScorer = getSimScorer(context); |
| + float maxDistanceSlop = simScorer.computeSlopFactor(maxDistance); |
| + SpanQuery subQuery = (SpanQuery) lastSpanWeight.getQuery(); |
| + return subQuery.createWeight(searcher, needsScores, (boost * maxDistanceSlop)) |
| + .getSpans(context, requiredPostings); |
| } |
| - |
| - DisiPriorityQueue byDocQueue = new DisiPriorityQueue(subSpans.size()); |
| - for (Spans spans : subSpans) { |
| - byDocQueue.add(new DisiWrapper(spans)); |
| } |
| - |
| - SpanPositionQueue byPositionQueue = new SpanPositionQueue(subSpans.size()); // when empty use -1 |
| - |
| - return new Spans() { |
| - Spans topPositionSpans = null; |
| - |
| - @Override |
| - public int nextDoc() throws IOException { |
| - topPositionSpans = null; |
| - DisiWrapper topDocSpans = byDocQueue.top(); |
| - int currentDoc = topDocSpans.doc; |
| - do { |
| - topDocSpans.doc = topDocSpans.iterator.nextDoc(); |
| - topDocSpans = byDocQueue.updateTop(); |
| - } while (topDocSpans.doc == currentDoc); |
| - return topDocSpans.doc; |
| - } |
| - |
| - @Override |
| - public int advance(int target) throws IOException { |
| - topPositionSpans = null; |
| - DisiWrapper topDocSpans = byDocQueue.top(); |
| - do { |
| - topDocSpans.doc = topDocSpans.iterator.advance(target); |
| - topDocSpans = byDocQueue.updateTop(); |
| - } while (topDocSpans.doc < target); |
| - return topDocSpans.doc; |
| - } |
| - |
| - @Override |
| - public int docID() { |
| - DisiWrapper topDocSpans = byDocQueue.top(); |
| - return topDocSpans.doc; |
| - } |
| - |
| - @Override |
| - public TwoPhaseIterator asTwoPhaseIterator() { |
| - float sumMatchCost = 0; // See also DisjunctionScorer.asTwoPhaseIterator() |
| - long sumApproxCost = 0; |
| - |
| - for (DisiWrapper w : byDocQueue) { |
| - if (w.twoPhaseView != null) { |
| - long costWeight = (w.cost <= 1) ? 1 : w.cost; |
| - sumMatchCost += w.twoPhaseView.matchCost() * costWeight; |
| - sumApproxCost += costWeight; |
| - } |
| - } |
| - |
| - if (sumApproxCost == 0) { // no sub spans supports approximations |
| - computePositionsCost(); |
| - return null; |
| - } |
| - |
| - final float matchCost = sumMatchCost / sumApproxCost; |
| - |
| - return new TwoPhaseIterator(new DisjunctionDISIApproximation(byDocQueue)) { |
| - @Override |
| - public boolean matches() throws IOException { |
| - return twoPhaseCurrentDocMatches(); |
| - } |
| - |
| - @Override |
| - public float matchCost() { |
| - return matchCost; |
| - } |
| - }; |
| - } |
| - |
| - float positionsCost = -1; |
| - |
| - void computePositionsCost() { |
| - float sumPositionsCost = 0; |
| - long sumCost = 0; |
| - for (DisiWrapper w : byDocQueue) { |
| - long costWeight = (w.cost <= 1) ? 1 : w.cost; |
| - sumPositionsCost += w.spans.positionsCost() * costWeight; |
| - sumCost += costWeight; |
| - } |
| - positionsCost = sumPositionsCost / sumCost; |
| - } |
| - |
| - @Override |
| - public float positionsCost() { |
| - // This may be called when asTwoPhaseIterator returned null, |
| - // which happens when none of the sub spans supports approximations. |
| - assert positionsCost > 0; |
| - return positionsCost; |
| - } |
| - |
| - int lastDocTwoPhaseMatched = -1; |
| - |
| - boolean twoPhaseCurrentDocMatches() throws IOException { |
| - DisiWrapper listAtCurrentDoc = byDocQueue.topList(); |
| - // remove the head of the list as long as it does not match |
| - final int currentDoc = listAtCurrentDoc.doc; |
| - while (listAtCurrentDoc.twoPhaseView != null) { |
| - if (listAtCurrentDoc.twoPhaseView.matches()) { |
| - // use this spans for positions at current doc: |
| - listAtCurrentDoc.lastApproxMatchDoc = currentDoc; |
| - break; |
| - } |
| - // do not use this spans for positions at current doc: |
| - listAtCurrentDoc.lastApproxNonMatchDoc = currentDoc; |
| - listAtCurrentDoc = listAtCurrentDoc.next; |
| - if (listAtCurrentDoc == null) { |
| - return false; |
| - } |
| - } |
| - lastDocTwoPhaseMatched = currentDoc; |
| - topPositionSpans = null; |
| - return true; |
| - } |
| - |
| - void fillPositionQueue() throws IOException { // called at first nextStartPosition |
| - assert byPositionQueue.size() == 0; |
| - // add all matching Spans at current doc to byPositionQueue |
| - DisiWrapper listAtCurrentDoc = byDocQueue.topList(); |
| - while (listAtCurrentDoc != null) { |
| - Spans spansAtDoc = listAtCurrentDoc.spans; |
| - if (lastDocTwoPhaseMatched == listAtCurrentDoc.doc) { // matched by DisjunctionDisiApproximation |
| - if (listAtCurrentDoc.twoPhaseView != null) { // matched by approximation |
| - if (listAtCurrentDoc.lastApproxNonMatchDoc == listAtCurrentDoc.doc) { // matches() returned false |
| - spansAtDoc = null; |
| - } else { |
| - if (listAtCurrentDoc.lastApproxMatchDoc != listAtCurrentDoc.doc) { |
| - if (!listAtCurrentDoc.twoPhaseView.matches()) { |
| - spansAtDoc = null; |
| - } |
| - } |
| - } |
| - } |
| - } |
| - |
| - if (spansAtDoc != null) { |
| - assert spansAtDoc.docID() == listAtCurrentDoc.doc; |
| - assert spansAtDoc.startPosition() == -1; |
| - spansAtDoc.nextStartPosition(); |
| - assert spansAtDoc.startPosition() != NO_MORE_POSITIONS; |
| - byPositionQueue.add(spansAtDoc); |
| - } |
| - listAtCurrentDoc = listAtCurrentDoc.next; |
| - } |
| - assert byPositionQueue.size() > 0; |
| - } |
| - |
| - @Override |
| - public int nextStartPosition() throws IOException { |
| - if (topPositionSpans == null) { |
| - byPositionQueue.clear(); |
| - fillPositionQueue(); // fills byPositionQueue at first position |
| - topPositionSpans = byPositionQueue.top(); |
| + if (maxDistance == -1) { |
| + return new DisjunctionSpans(SpanOrQuery.this, subSpans); |
| } else { |
| - topPositionSpans.nextStartPosition(); |
| - topPositionSpans = byPositionQueue.updateTop(); |
| - } |
| - return topPositionSpans.startPosition(); |
| + SimScorer simScorer = getSimScorer(context); |
| + return new DisjunctionNearSpans(SpanOrQuery.this, subSpans, maxDistance, simScorer); |
| } |
| - |
| - @Override |
| - public int startPosition() { |
| - return topPositionSpans == null ? -1 : topPositionSpans.startPosition(); |
| - } |
| - |
| - @Override |
| - public int endPosition() { |
| - return topPositionSpans == null ? -1 : topPositionSpans.endPosition(); |
| - } |
| - |
| - @Override |
| - public int width() { |
| - return topPositionSpans.width(); |
| - } |
| - |
| - @Override |
| - public void collect(SpanCollector collector) throws IOException { |
| - if (topPositionSpans != null) |
| - topPositionSpans.collect(collector); |
| } |
| - |
| - @Override |
| - public String toString() { |
| - return "spanOr(" + SpanOrQuery.this + ")@" + docID() + ": " + startPosition() + " - " + endPosition(); |
| - } |
| - |
| - long cost = -1; |
| - |
| - @Override |
| - public long cost() { |
| - if (cost == -1) { |
| - cost = 0; |
| - for (Spans spans : subSpans) { |
| - cost += spans.cost(); |
| } |
| } |
| - return cost; |
| - } |
| - }; |
| - } |
| - } |
| - |
| -} |
| |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanPositionQueue.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanPositionQueue.java |
| index 2d2bd16..22bdb17 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanPositionQueue.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanPositionQueue.java |
| @@ -16,6 +16,8 @@ |
| */ |
| package org.apache.lucene.search.spans; |
| |
| +import java.util.List; |
| +import java.util.Iterator; |
| |
| import org.apache.lucene.util.PriorityQueue; |
| |
| @@ -31,5 +33,12 @@ class SpanPositionQueue extends PriorityQueue<Spans> { |
| : (start1 == start2) ? s1.endPosition() < s2.endPosition() |
| : false; |
| } |
| + |
| + void extractSpansList(List<Spans> spansList) { // append every Spans returned by iterator() to spansList |
| + Iterator<Spans> spansIter = iterator(); |
| + while (spansIter.hasNext()) { |
| + spansList.add(spansIter.next()); // elements are only read here, nothing is removed |
| + } |
| + } |
| } |
| |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanSynonymQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanSynonymQuery.java |
| new file mode 100644 |
| index 0000000..fb57ddb |
| --- /dev/null |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanSynonymQuery.java |
| @@ -0,0 +1,187 @@ |
| +/* |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| +package org.apache.lucene.search.spans; |
| + |
| +import java.io.IOException; |
| +import java.util.ArrayList; |
| +import java.util.List; |
| +import java.util.Set; |
| +import java.util.Map; |
| + |
| +import org.apache.lucene.index.IndexReader; |
| +import org.apache.lucene.index.LeafReaderContext; |
| +import org.apache.lucene.index.PostingsEnum; |
| +import org.apache.lucene.index.Term; |
| +import org.apache.lucene.index.Terms; |
| +import org.apache.lucene.index.TermContext; |
| +import org.apache.lucene.index.TermState; |
| +import org.apache.lucene.index.TermsEnum; |
| + |
| +import org.apache.lucene.search.similarities.Similarity.SimScorer; |
| + |
| +import org.apache.lucene.search.Query; |
| +import org.apache.lucene.search.SynonymQuery; |
| +import org.apache.lucene.search.SynonymQuery.SynonymWeight; |
| +import org.apache.lucene.search.IndexSearcher; |
| +import org.apache.lucene.search.Explanation; |
| +import org.apache.lucene.search.MatchNoDocsQuery; |
| + |
| +/** |
| + * A SpanQuery that treats terms as synonyms. |
| + * <p> |
| + * For scoring purposes, this query tries to score the terms as if you |
| + * had indexed them as one term: it will match any of the terms while |
| + * using the same scoring as {@link SynonymQuery}, as far as possible. |
| + */ |
| +public final class SpanSynonymQuery extends SpanQuery { |
| + final SynonymQuery synonymQuery; |
| + final List<Term> terms; |
| + |
| + /** |
| + * Creates a new SpanSynonymQuery, matching any of the supplied terms. |
| + * <p> |
| + * The terms must all have the same field. |
| + */ |
| + public SpanSynonymQuery(Term... terms) { |
| + this.synonymQuery = new SynonymQuery(terms); |
| + this.terms = synonymQuery.getTerms(); |
| + } |
| + |
| + @Override |
| + public String getField() { |
| + return synonymQuery.getField(); |
| + } |
| + |
| + @Override |
| + public String toString(String field) { |
| + // The wrapped SynonymQuery's string form, enclosed in "SpanSynonym(" and ")". |
| + final String inner = synonymQuery.toString(field); |
| + final String rendered = "SpanSynonym(" + inner + ")"; |
| + return rendered; |
| + } |
| + |
| + @Override |
| + public int hashCode() { |
| + return 31 * classHash() - synonymQuery.hashCode(); |
| + } |
| + |
| + @Override |
| + public boolean equals(Object other) { |
| + return sameClassAs(other) && |
| + synonymQuery.equals(((SpanSynonymQuery) other).synonymQuery); |
| + } |
| + |
| + @Override |
| + public Query rewrite(IndexReader reader) throws IOException { |
| + // Zero terms match nothing, one term is a plain SpanTermQuery; otherwise keep this query. |
| + switch (terms.size()) { |
| + case 0: |
| + return new MatchNoDocsQuery(); |
| + case 1: |
| + return new SpanTermQuery(terms.get(0)); |
| + default: |
| + return this; |
| + } |
| + } |
| + |
| + /** When scores are needed, the returned SpanWeight does not support {@link SpanWeight#explain}. */ |
| + @Override |
| + public SpanWeight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException { |
| + if (needsScores) { |
| + SynonymWeight synonymWeight = (SynonymWeight) // assumes SynonymQuery creates a SynonymWeight here - TODO confirm |
| + synonymQuery.createWeight(searcher, needsScores, boost); |
| + return new SpanSynonymWeight(searcher, boost, synonymWeight); |
| + } |
| + else { // scores not needed, use SpanOrQuery without scoring. |
| + SpanTermQuery[] clauses = new SpanTermQuery[terms.size()]; // one SpanTermQuery per synonym term |
| + int i = 0; |
| + for (Term term : terms) { |
| + clauses[i++] = new SpanTermQuery(term); |
| + } |
| + return new SpanOrQuery(clauses).createWeight(searcher, needsScores, boost); |
| + } |
| + } |
| + |
| + class SpanSynonymWeight extends SpanWeight { |
| + final SynonymWeight synonymWeight; |
| + |
| + SpanSynonymWeight( |
| + IndexSearcher searcher, |
| + float boost, |
| + SynonymWeight synonymWeight) |
| + throws IOException { |
| + super(SpanSynonymQuery.this, searcher, null, boost); // null: no term context map |
| + this.synonymWeight = synonymWeight; |
| + } |
| + |
| + @Override |
| + public void extractTerms(Set<Term> termSet) { |
| + for (Term t : terms) { |
| + termSet.add(t); |
| + } |
| + } |
| + |
| + @Override |
| + public void extractTermContexts(Map<Term, TermContext> termContextbyTerm) { |
| + TermContext[] termContexts = synonymWeight.getTermContexts(); |
| + int i = 0; |
| + for (Term term : terms) { |
| + TermContext termContext = termContexts[i++]; |
| + termContextbyTerm.put(term, termContext); |
| + } |
| + } |
| + |
| + @Override |
| + public Explanation explain(LeafReaderContext context, int doc) throws IOException { |
| + throw new UnsupportedOperationException(); |
| + } |
| + |
| + @Override |
| + public SimScorer getSimScorer(LeafReaderContext context) throws IOException { |
| + return synonymWeight.getSimScorer(context); |
| + } |
| + |
| + @Override |
| + public Spans getSpans(final LeafReaderContext context, Postings requiredPostings) throws IOException { |
| + // Build one TermSpans per synonym term that has a state in this segment, then merge them. |
| + SimScorer simScorer = getSimScorer(context); |
| + final String field = getField(); |
| + Terms fieldTerms = context.reader().terms(field); |
| + List<Spans> termSpans = new ArrayList<>(terms.size()); |
| + if (fieldTerms != null) { |
| + TermsEnum termsEnum = fieldTerms.iterator(); |
| + TermContext[] termContexts = synonymWeight.getTermContexts(); |
| + int i = 0; |
| + for (Term term : terms) { |
| + TermContext termContext = termContexts[i++]; // in term order |
| + TermState termState = termContext.get(context.ord); |
| + if (termState != null) { // skip terms without a state for this segment |
| + termsEnum.seekExact(term.bytes(), termState); |
| + // Request the postings detail the caller asked for, not just positions. |
| + PostingsEnum postings = termsEnum.postings(null, requiredPostings.getRequiredPostings()); |
| + float positionsCost = SpanTermQuery.termPositionsCost(termsEnum) * SpanTermQuery.PHRASE_TO_SPAN_TERM_POSITIONS_COST; |
| + termSpans.add(new TermSpans(simScorer, postings, term, positionsCost)); |
| + } |
| + } |
| + } |
| + // No term found: null; a single term: its spans unchanged; otherwise merge them as synonyms. |
| + return (termSpans.size() == 0) ? null |
| + : (termSpans.size() == 1) ? termSpans.get(0) |
| + : new SynonymSpans(SpanSynonymQuery.this, termSpans, simScorer); |
| + } |
| + } |
| +} |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java |
| index 2746a0c..9c28ac9 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java |
| @@ -126,7 +126,7 @@ public class SpanTermQuery extends SpanQuery { |
| * the relative cost of dealing with the term positions |
| * when using a SpanNearQuery instead of a PhraseQuery. |
| */ |
| - private static final float PHRASE_TO_SPAN_TERM_POSITIONS_COST = 4.0f; |
| + static final float PHRASE_TO_SPAN_TERM_POSITIONS_COST = 4.0f; |
| |
| private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128; |
| |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/Spans.java b/lucene/core/src/java/org/apache/lucene/search/spans/Spans.java |
| index 7857708..73dd083 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/spans/Spans.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/Spans.java |
| @@ -116,4 +116,6 @@ public abstract class Spans extends DocIdSetIterator { |
| */ |
| protected void doCurrentSpans() throws IOException {} |
| |
| + /** For {@link SpansTreeQuery}. */ |
| + SpansDocScorer<?> spansDocScorer; |
| } |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpansDocScorer.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpansDocScorer.java |
| new file mode 100644 |
| index 0000000..55d39fd |
| --- /dev/null |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpansDocScorer.java |
| @@ -0,0 +1,53 @@ |
| +/* |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| +package org.apache.lucene.search.spans; |
| + |
| +import java.io.IOException; |
| + |
| +/** |
| + * Record span matches in a document and compute a document score. |
| + * <br> |
| + * For {@link SpansTreeQuery}. Public for extension. |
| + * |
| + * @lucene.experimental |
| + */ |
| +public abstract class SpansDocScorer<SpansT extends Spans> { |
| + protected final SpansT spans; |
| + |
| + /** |
| + * Create a SpansDocScorer and make {@link Spans#spansDocScorer} refer to it. |
| + */ |
| + public SpansDocScorer(SpansT spans) { |
| + this.spans = spans; |
| + spans.spansDocScorer = this; |
| + } |
| + |
| + /** The document of the spans, see {@link Spans#docID}. */ |
| + public int docID() { return spans.docID(); } |
| + |
| + /** Called before the first match of the spans is to be recorded for the document. */ |
| + public abstract void beginDoc() throws IOException; |
| + |
| + /** Record a match with its slop factor. */ |
| + public abstract void recordMatch(double slopFactor); |
| + |
| + /** Return the matching frequency of the last {@link #beginDoc} document. */ |
| + public abstract int docMatchFreq(); |
| + |
| + /** Return the score of the last {@link #beginDoc} document. */ |
| + public abstract double docScore() throws IOException; |
| +} |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpansTreeQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpansTreeQuery.java |
| new file mode 100644 |
| index 0000000..2a73e38 |
| --- /dev/null |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpansTreeQuery.java |
| @@ -0,0 +1,328 @@ |
| +/* |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| +package org.apache.lucene.search.spans; |
| + |
| +import java.io.IOException; |
| +import java.util.Objects; |
| +import java.util.Set; |
| +import java.util.ArrayList; |
| + |
| +import org.apache.lucene.index.Term; |
| +import org.apache.lucene.index.IndexReader; |
| +import org.apache.lucene.index.LeafReaderContext; |
| + |
| +import org.apache.lucene.search.Query; |
| +import org.apache.lucene.search.BooleanQuery; |
| +import org.apache.lucene.search.BooleanClause; |
| +import org.apache.lucene.search.DisjunctionMaxQuery; |
| +import org.apache.lucene.search.BoostQuery; |
| +import org.apache.lucene.search.Weight; |
| +import org.apache.lucene.search.IndexSearcher; |
| +import org.apache.lucene.search.Explanation; |
| + |
| +import org.apache.lucene.search.similarities.Similarity.SimScorer; |
| + |
| +/** Wrapper class for scoring span queries via matching term occurrences. |
| + * |
| + * @lucene.experimental |
| + */ |
| +public class SpansTreeQuery extends Query { |
| + |
| + final SpanQuery spanQuery; |
| + final int TOP_LEVEL_SLOP = 0; |
| + |
| + /** Wrap a span query to score via its matching term occurrences. |
| + * <br> |
| + * For more details on scoring see {@link SpansTreeScorer#createSpansDocScorer}. |
| + * |
| + * @param spanQuery This can be any nested combination of |
| + * {@link org.apache.lucene.search.spans.SpanNearQuery}, |
| + * {@link org.apache.lucene.search.spans.SpanOrQuery}, |
| + * {@link org.apache.lucene.search.spans.SpanSynonymQuery}, |
| + * {@link org.apache.lucene.search.spans.SpanTermQuery}, |
| + * {@link org.apache.lucene.search.spans.SpanBoostQuery}, |
| + * {@link org.apache.lucene.search.spans.SpanNotQuery}, |
| + * {@link org.apache.lucene.search.spans.SpanFirstQuery}, |
| + * {@link org.apache.lucene.search.spans.SpanContainingQuery} and |
| + * {@link org.apache.lucene.search.spans.SpanWithinQuery}. |
| + */ |
| + public SpansTreeQuery(SpanQuery spanQuery) { |
| + this.spanQuery = Objects.requireNonNull(spanQuery); |
| + } |
| + |
| + /** Wrap the span (subqueries of a) query in a SpansTreeQuery. |
| + * <br> |
| + * A {@link SpanQuery} will be wrapped in a {@link SpansTreeQuery#SpansTreeQuery}. |
| + * For {@link BooleanQuery}, {@link DisjunctionMaxQuery} and {@link BoostQuery}, |
| + * the subqueries/subquery will be wrapped recursively. |
| + * Otherwise the given query is returned. |
| + * <br> |
| + * No double wrapping will be done because |
| + * a {@link SpansTreeQuery} is not a {@link SpanQuery}. |
| + */ |
| + public static Query wrap(Query query) { |
| + if (query instanceof SpanQuery) { |
| + return new SpansTreeQuery((SpanQuery)query); |
| + } |
| + if (query instanceof BooleanQuery) { |
| + return wrapBooleanQuery((BooleanQuery)query); |
| + } |
| + if (query instanceof DisjunctionMaxQuery) { |
| + return wrapDMQ((DisjunctionMaxQuery)query); |
| + } |
| + if (query instanceof BoostQuery) { |
| + Query subQuery = ((BoostQuery)query).getQuery(); |
| + Query wrappedSubQuery = wrap(subQuery); |
| + if (wrappedSubQuery == subQuery) { |
| + return query; |
| + } |
| + float boost = ((BoostQuery)query).getBoost(); |
| + return new BoostQuery(wrappedSubQuery, boost); |
| + } |
| + return query; |
| + } |
| + |
| + static BooleanQuery wrapBooleanQuery(BooleanQuery blq) { |
| + ArrayList<BooleanClause> wrappedClauses = new ArrayList<>(); |
| + boolean wrapped = false; |
| + for (BooleanClause clause : blq.clauses()) { |
| + Query subQuery = clause.getQuery(); |
| + Query wrappedSubQuery = wrap(subQuery); |
| + if (wrappedSubQuery != subQuery) { |
| + wrapped = true; |
| + wrappedClauses.add(new BooleanClause(wrappedSubQuery, clause.getOccur())); |
| + } |
| + else { |
| + wrappedClauses.add(clause); |
| + } |
| + } |
| + if (! wrapped) { |
| + return blq; |
| + } |
| + BooleanQuery.Builder builder = new BooleanQuery.Builder(); |
| + for (BooleanClause clause : wrappedClauses) { |
| + builder.add(clause); |
| + } |
| + return builder.build(); |
| + } |
| + |
| + static DisjunctionMaxQuery wrapDMQ(DisjunctionMaxQuery dmq) { |
| + ArrayList<Query> wrappedDisjuncts = new ArrayList<>(); |
| + boolean wrapped = false; |
| + for (Query disjunct : dmq.getDisjuncts()) { |
| + Query wrappedDisjunct = wrap(disjunct); |
| + if (wrappedDisjunct != disjunct) { |
| + wrapped = true; |
| + wrappedDisjuncts.add(wrappedDisjunct); |
| + } |
| + else { |
| + wrappedDisjuncts.add(disjunct); |
| + } |
| + } |
| + if (! wrapped) { |
| + return dmq; |
| + } |
| + float tbm = dmq.getTieBreakerMultiplier(); |
| + return new DisjunctionMaxQuery(wrappedDisjuncts, tbm); |
| + } |
| + |
| + |
| + /** Wrap a given query by {@link #wrap(Query)} after it was rewritten until it no longer changes. |
| + */ |
| + public static Query wrapAfterRewrite(Query query) { |
| + return new Query() { |
| + @Override |
| + public Query rewrite(IndexReader reader) throws IOException { |
| + Query rewritten = query.rewrite(reader); |
| + // SpansTreeQuery declares no rewrite of its own, so reach a stable form before wrapping. |
| + for (Query next = rewritten.rewrite(reader); next != rewritten; next = rewritten.rewrite(reader)) { |
| + rewritten = next; |
| + } |
| + return wrap(rewritten); |
| + } |
| + @Override |
| + public boolean equals(Object other) { |
| + return this == other; // identity only: each call creates a distinct wrapper |
| + } |
| + @Override |
| + public int hashCode() { |
| + return query.hashCode() ^ SpansTreeQuery.class.hashCode(); |
| + } |
| + @Override |
| + public String toString(String field) { |
| + return "SpansTreeQuery.wrapAfterRewrite: " + query.toString(field); |
| + } |
| + }; |
| + } |
| + |
| + /** The wrapped SpanQuery */ |
| + public SpanQuery getSpanQuery() { return spanQuery; } |
| + |
| + @Override |
| + public int hashCode() { |
| + return getClass().hashCode() - spanQuery.hashCode(); |
| + } |
| + |
| + @Override |
| + public boolean equals(Object other) { |
| + return sameClassAs(other) && |
| + equalsTo(getClass().cast(other)); |
| + } |
| + |
| + private boolean equalsTo(SpansTreeQuery other) { |
| + return spanQuery.equals(other.spanQuery); |
| + } |
| + |
| + @Override |
| + public String toString(String field) { |
| + // The wrapped span query's string form, enclosed in "SpansTreeQuery(" and ")". |
| + final String wrappedText = spanQuery.toString(field); |
| + return new StringBuilder("SpansTreeQuery(") |
| + .append(wrappedText) |
| + .append(")").toString(); |
| + } |
| + |
| + /** Return a weight for scoring by matching term occurrences. |
| + * <br>{@link Weight#explain} is not supported on the result. |
| + */ |
| + @Override |
| + public SpansTreeWeight createWeight( |
| + IndexSearcher searcher, |
| + boolean needsScores, |
| + float boost) |
| + throws IOException |
| + { |
| + return new SpansTreeWeight(searcher, needsScores, boost); |
| + } |
| + |
| + public class SpansTreeWeight extends Weight { |
| + final SpanWeight spanWeight; |
| + |
| + public SpansTreeWeight( |
| + IndexSearcher searcher, |
| + boolean needsScores, |
| + float boost) |
| + throws IOException |
| + { |
| + super(SpansTreeQuery.this); |
| + this.spanWeight = spanQuery.createWeight(searcher, needsScores, boost); |
| + } |
| + |
| + /** Throws an UnsupportedOperationException. */ |
| + @Override |
| + public Explanation explain(LeafReaderContext context, int doc) throws IOException { |
| + throw new UnsupportedOperationException(); |
| + } |
| + |
| + @Override |
| + public void extractTerms(Set<Term> terms) { |
| + spanWeight.extractTerms(terms); |
| + } |
| + |
| + /** Compute a minimal slop factor from the maximum possible slops that can occur |
| + * in a SpanQuery for nested SpanNearQueries and for nested SpanOrQueries with distance. |
| + * This supports the queries mentioned at {@link SpansTreeScorer#createSpansDocScorer}. |
| + * <p> |
| + * This uses the maximum slops from {@link SpanOrQuery#getMaxDistance()} and |
| + * {@link SpanNearQuery#getNonMatchSlop()}. |
| + * <p> |
| + * This assumes that slop factors are multiplied in {@link ConjunctionNearSpansDocScorer#recordMatch} |
| + * and in {@link DisjunctionNearSpansDocScorer#recordMatch}. |
| + * @throws IllegalArgumentException when the class of the given spanQuery is not handled here |
| + */ |
| + public double minSlopFactor(SpanQuery spanQuery, SimScorer simScorer, double slopFactor) { |
| + assert slopFactor >= 0; |
| + if (spanQuery instanceof SpanTermQuery) { |
| + return slopFactor; |
| + } |
| + if (spanQuery instanceof SpanSynonymQuery) { |
| + return slopFactor; |
| + } |
| + if (spanQuery instanceof SpanNotQuery) { |
| + return minSlopFactor(((SpanNotQuery)spanQuery).getInclude(), simScorer, slopFactor); |
| + } |
| + if (spanQuery instanceof SpanPositionCheckQuery) { // cast to the checked type, not only SpanFirstQuery |
| + return minSlopFactor(((SpanPositionCheckQuery)spanQuery).getMatch(), simScorer, slopFactor); |
| + } |
| + if (spanQuery instanceof SpanContainingQuery) { |
| + return minSlopFactor(((SpanContainingQuery)spanQuery).getBig(), simScorer, slopFactor); |
| + } |
| + if (spanQuery instanceof SpanWithinQuery) { |
| + return minSlopFactor(((SpanWithinQuery)spanQuery).getLittle(), simScorer, slopFactor); |
| + } |
| + if (spanQuery instanceof SpanBoostQuery) { |
| + return minSlopFactor(((SpanBoostQuery)spanQuery).getQuery(), simScorer, slopFactor); |
| + } |
| + |
| + SpanQuery[] clauses = null; // stays null for a query class that is not handled |
| + int maxAllowedSlop = -1; |
| + |
| + if (spanQuery instanceof SpanOrQuery) { |
| + SpanOrQuery spanOrQuery = (SpanOrQuery)spanQuery; |
| + clauses = spanOrQuery.getClauses(); |
| + maxAllowedSlop = spanOrQuery.getMaxDistance(); |
| + if (maxAllowedSlop == -1) { // no distance on this SpanOrQuery: pass the slop factor on unchanged |
| + return minSlopFactorClauses(clauses, simScorer, slopFactor); |
| + } |
| + } |
| + else if (spanQuery instanceof SpanNearQuery) { |
| + SpanNearQuery spanNearQuery = (SpanNearQuery) spanQuery; |
| + clauses = spanNearQuery.getClauses(); |
| + maxAllowedSlop = spanNearQuery.getNonMatchSlop(); |
| + } |
| + |
| + if (clauses == null) { |
| + throw new IllegalArgumentException("Not implemented for SpanQuery class: " |
| + + spanQuery.getClass().getName()); |
| + } |
| + |
| + assert maxAllowedSlop >= 0; |
| + double localSlopFactor = simScorer.computeSlopFactor(maxAllowedSlop); |
| + assert localSlopFactor >= 0; |
| + // assumed multiplication: |
| + return minSlopFactorClauses(clauses, simScorer, slopFactor * localSlopFactor); |
| + } |
| + |
| + /** Helper for {@link #minSlopFactor}: the smallest slop factor found over the given clauses. */ |
| + public double minSlopFactorClauses(SpanQuery[] clauses, SimScorer simScorer, double slopFactor) { |
| + assert slopFactor >= 0; |
| + assert clauses.length >= 1; |
| + double smallest = Double.MAX_VALUE; |
| + for (int i = 0; i < clauses.length; i++) { |
| + final double clauseFactor = minSlopFactor(clauses[i], simScorer, slopFactor); |
| + smallest = Math.min(smallest, clauseFactor); |
| + } |
| + return smallest; |
| + } |
| + |
| + /** Provide a SpansTreeScorer that has the result of {@link #minSlopFactor} |
| + * as the weight for non matching terms. |
| + */ |
| + @Override |
| + public SpansTreeScorer scorer(LeafReaderContext context) throws IOException { |
| + final Spans spans = spanWeight.getSpans(context, SpanWeight.Postings.POSITIONS); |
| + if (spans == null) { |
| + return null; |
| + } |
| + SimScorer topLevelScorer = spanWeight.getSimScorer(context); |
| + double topLevelSlopFactor = topLevelScorer.computeSlopFactor(TOP_LEVEL_SLOP); |
| + double nonMatchWeight = minSlopFactor(spanQuery, topLevelScorer, topLevelSlopFactor); |
| + |
| + return new SpansTreeScorer(this, spans, topLevelSlopFactor, nonMatchWeight); |
| + } |
| + } |
| +} |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpansTreeScorer.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpansTreeScorer.java |
| new file mode 100644 |
| index 0000000..1455cf4 |
| --- /dev/null |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpansTreeScorer.java |
| @@ -0,0 +1,186 @@ |
| +/* |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| +package org.apache.lucene.search.spans; |
| + |
| +import java.io.IOException; |
| +import java.util.Objects; |
| + |
| +import org.apache.lucene.index.Term; // javadocs |
| +import org.apache.lucene.search.DocIdSetIterator; |
| +import org.apache.lucene.search.Scorer; |
| +import org.apache.lucene.search.Weight; |
| +import org.apache.lucene.search.TwoPhaseIterator; |
| + |
| +/** |
| + * A Scorer for (nested) spans. |
| + * This associates the spans with a {@link SpansDocScorer} and uses its score values. |
| + * <p> |
| + * For {@link SpansTreeQuery}. Public for extension. |
| + * |
| + * @lucene.experimental |
| + */ |
| +public class SpansTreeScorer extends Scorer { |
| + |
| + protected final Spans spans; |
| + protected final double topLevelSlopFactor; |
| + protected final double nonMatchWeight; |
| + protected final SpansDocScorer<?> spansDocScorer; |
| + |
| + protected int lastScoredDoc = -1; |
| + |
| + public SpansTreeScorer(Weight weight, Spans spans, double topLevelSlopFactor, double nonMatchWeight) { |
| + super(weight); |
| + this.spans = Objects.requireNonNull(spans); |
| + this.topLevelSlopFactor = topLevelSlopFactor; |
| + this.nonMatchWeight = nonMatchWeight; |
| + this.spansDocScorer = createSpansDocScorer(spans); |
| + } |
| + |
| + @Override |
| + public int docID() { |
| + return spans.docID(); |
| + } |
| + |
| + @Override |
| + public DocIdSetIterator iterator() { |
| + return spans; |
| + } |
| + |
| + @Override |
| + public TwoPhaseIterator twoPhaseIterator() { |
| + return spans.asTwoPhaseIterator(); |
| + } |
| + |
| + /** |
| + * Provide the SpansDocScorer that will be used by {@link #score} and {@link #freq}. |
| + * <br> |
| + * Override this to provide support for span queries for which the spans are not supported here. |
| + * <table rules="all" frame="box" cellpadding="3" summary="SpansDocScorer for Spans"> |
| + * <tr><td>For {@link Spans}:</td> |
| + * <td>normally from {@link SpanQuery}:</td> |
| + * <td>return:</td> |
| + * </tr> |
| + * <tr><td>{@link TermSpans}</td> |
| + * <td>{@link SpanTermQuery}</td> |
| + * <td>{@link TermSpansDocScorer}</td> |
| + * </tr> |
| + * <tr><td>{@link DisjunctionNearSpans}</td> |
| + * <td>{@link SpanOrQuery#SpanOrQuery(int,SpanQuery...)}</td> |
| + * <td>{@link DisjunctionNearSpansDocScorer}</td> |
| + * </tr> |
| + * <tr><td>{@link DisjunctionSpans}</td> |
| + * <td>{@link SpanOrQuery#SpanOrQuery(SpanQuery...)}</td> |
| + * <td>{@link DisjunctionSpansDocScorer}</td> |
| + * </tr> |
| + * <tr><td>{@link SynonymSpans}</td> |
| + * <td>{@link SpanSynonymQuery#SpanSynonymQuery(Term...)}</td> |
| + * <td>{@link SynonymSpansDocScorer}</td> |
| + * </tr> |
| + * <tr><td>{@link ConjunctionNearSpans}</td> |
| + * <td>{@link SpanNearQuery}</td> |
| + * <td>{@link ConjunctionNearSpansDocScorer}</td> |
| + * </tr> |
| + * <tr><td>{@link FilterSpans}</td> |
| + * <td>{@link SpanNotQuery}, {@link SpanFirstQuery}</td> |
| + * <td>recursively use {@link FilterSpans#in}</td> |
| + * </tr> |
| + * <tr><td>{@link ContainSpans}</td> |
| + * <td>{@link SpanContainingQuery}, {@link SpanWithinQuery}</td> |
| + * <td>recursively use {@link ContainSpans#sourceSpans}</td> |
| + * </tr> |
| + * </table> |
| + */ |
| + public SpansDocScorer<?> createSpansDocScorer(Spans spans) { |
| + SpansDocScorer<?> spansDocScorer = null; |
| + if (spans instanceof TermSpans) { |
| + spansDocScorer = new TermSpansDocScorer((TermSpans) spans, nonMatchWeight); |
| + } |
| + else if (spans instanceof DisjunctionNearSpans) { |
| + spansDocScorer = new DisjunctionNearSpansDocScorer(this, (DisjunctionNearSpans) spans); |
| + } |
| + else if (spans instanceof SynonymSpans) { |
| + spansDocScorer = new SynonymSpansDocScorer((SynonymSpans) spans, nonMatchWeight); |
| + } |
| + else if (spans instanceof DisjunctionSpans) { |
| + spansDocScorer = new DisjunctionSpansDocScorer<>(this, (DisjunctionSpans) spans); |
| + } |
| + else if (spans instanceof ConjunctionNearSpans) { |
| + spansDocScorer = new ConjunctionNearSpansDocScorer(this, (ConjunctionNearSpans) spans); |
| + } |
| + else if (spans instanceof FilterSpans) { |
| + spansDocScorer = createSpansDocScorer(((FilterSpans) spans).in); |
| + spans.spansDocScorer = spansDocScorer; // shortcut |
| + } |
| + else if (spans instanceof ContainSpans) { |
| + spansDocScorer = createSpansDocScorer(((ContainSpans) spans).sourceSpans); |
| + spans.spansDocScorer = spansDocScorer; // shortcut |
| + } |
| + if (spansDocScorer == null) { |
| + throw new IllegalArgumentException("Not implemented for Spans class: " |
| + + spans.getClass().getName()); |
| + } |
| + return spansDocScorer; |
| + } |
| + |
| + /** |
| + * Record the span matches in the current document. |
| + * <p> |
| + * This will be called at most once per document. |
| + */ |
| + protected void recordMatchesCurrentDoc() throws IOException { |
| + int startPos = spans.nextStartPosition(); |
| + assert startPos != Spans.NO_MORE_POSITIONS; |
| + spansDocScorer.beginDoc(); |
| + do { |
| + spansDocScorer.recordMatch(topLevelSlopFactor); |
| + startPos = spans.nextStartPosition(); |
| + } while (startPos != Spans.NO_MORE_POSITIONS); |
| + } |
| + |
| + /** Make sure the matches of the current document have been recorded, |
| + * recording them now when that was not yet done for this document. */ |
| + public void ensureMatchesRecorded() throws IOException { |
| + final int doc = docID(); |
| + if (lastScoredDoc == doc) { |
| + return; // already recorded for this document |
| + } |
| + recordMatchesCurrentDoc(); |
| + lastScoredDoc = doc; |
| + } |
| + |
| + /** Score the current document. |
| + * See {@link #createSpansDocScorer} and {@link SpansDocScorer#docScore}. |
| + */ |
| + @Override |
| + public final float score() throws IOException { |
| + ensureMatchesRecorded(); |
| + return (float) spansDocScorer.docScore(); |
| + } |
| + |
| + /** Return the total matching frequency of the current document. |
| + * See {@link #createSpansDocScorer} and {@link SpansDocScorer#docMatchFreq}. |
| + */ |
| + @Override |
| + public final int freq() throws IOException { |
| + ensureMatchesRecorded(); |
| + return spansDocScorer.docMatchFreq(); |
| + } |
| + |
| + public String toString() { |
| + return "SpansTreeScorer(" + spansDocScorer + ")"; |
| + } |
| +} |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SynonymSpans.java b/lucene/core/src/java/org/apache/lucene/search/spans/SynonymSpans.java |
| new file mode 100644 |
| index 0000000..fdbf676 |
| --- /dev/null |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SynonymSpans.java |
| @@ -0,0 +1,47 @@ |
| +/* |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| +package org.apache.lucene.search.spans; |
| + |
| +import java.util.List; |
| + |
| +import org.apache.lucene.search.similarities.Similarity.SimScorer; |
| + |
| + |
| +/** |
| + * A spans for merging and equal scoring of given spans. |
| + * This does not provide score values. |
| + * |
| + * @lucene.experimental |
| + */ |
| +public class SynonymSpans extends DisjunctionSpans { |
| + SimScorer simScorer; |
| + |
| + /** Construct a SynonymSpans. |
| + * @param spanQuery The query that provides the subSpans. |
| + * @param subSpans Over which the disjunction is to be taken. |
| + * @param simScorer To be used for scoring. |
| + */ |
| + public SynonymSpans(SpanQuery spanQuery, List<Spans> subSpans, SimScorer simScorer) { |
| + super(spanQuery, subSpans); |
| + this.simScorer = simScorer; |
| + } |
| + |
| + @Override |
| + public String toString() { |
| + return "SynonymSpans(" + spanQuery + ")@" + docID() + ": " + startPosition() + " - " + endPosition(); |
| + } |
| +} |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SynonymSpansDocScorer.java b/lucene/core/src/java/org/apache/lucene/search/spans/SynonymSpansDocScorer.java |
| new file mode 100644 |
| index 0000000..0bbe708 |
| --- /dev/null |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SynonymSpansDocScorer.java |
| @@ -0,0 +1,58 @@ |
| +/* |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| +package org.apache.lucene.search.spans; |
| + |
| +import java.io.IOException; |
| +import java.util.ArrayList; |
| + |
| +/** |
| + * For {@link SpansTreeQuery}. Public for extension. |
| + * |
| + * @lucene.experimental |
| + */ |
| +public class SynonymSpansDocScorer |
| + extends AsSingleTermSpansDocScorer<SynonymSpans> { |
| + |
| + protected final ArrayList<Spans> subSpansAtDoc; |
| + /** |
| + * @param synSpans Provides matching synonym occurrences. |
| + * This should only contain TermSpans. |
| + * @param nonMatchWeight The non negative weight to be used for the non matching term occurrences. |
| + */ |
| + public SynonymSpansDocScorer(SynonymSpans synSpans, double nonMatchWeight) { |
| + super(synSpans, synSpans.simScorer, nonMatchWeight); |
| + this.subSpansAtDoc = new ArrayList<>(synSpans.subSpans().size()); |
| + } |
| + |
| + @Override |
| + public int termFreqInDoc() throws IOException { |
| + int freq = 0; |
| + for (Spans subSpans : subSpansAtDoc) { |
| + freq += ((TermSpans)subSpans).getPostings().freq(); |
| + } |
| + return freq; |
| + } |
| + |
| + @Override |
| + public void beginDoc() throws IOException { |
| + subSpansAtDoc.clear(); |
| + spans.extractSubSpansAtCurrentDoc(subSpansAtDoc); |
| + assert subSpansAtDoc.size() > 0 : "empty subSpansAtDoc docID=" + docID(); |
| + super.beginDoc(); // calls termFreqInDoc. |
| + } |
| + |
| +} |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/TermSpans.java b/lucene/core/src/java/org/apache/lucene/search/spans/TermSpans.java |
| index f1e1aed..6b0bb47 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/spans/TermSpans.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/TermSpans.java |
| @@ -32,6 +32,7 @@ import org.apache.lucene.search.similarities.Similarity; |
| public class TermSpans extends Spans { |
| protected final PostingsEnum postings; |
| protected final Term term; |
| + protected final Similarity.SimScorer simScorer; |
| protected int doc; |
| protected int freq; |
| protected int count; |
| @@ -41,6 +42,7 @@ public class TermSpans extends Spans { |
| |
| public TermSpans(Similarity.SimScorer scorer, |
| PostingsEnum postings, Term term, float positionsCost) { |
| + this.simScorer = scorer; |
| this.postings = Objects.requireNonNull(postings); |
| this.term = Objects.requireNonNull(term); |
| this.doc = -1; |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/TermSpansDocScorer.java b/lucene/core/src/java/org/apache/lucene/search/spans/TermSpansDocScorer.java |
| new file mode 100644 |
| index 0000000..a033ce7 |
| --- /dev/null |
| +++ b/lucene/core/src/java/org/apache/lucene/search/spans/TermSpansDocScorer.java |
| @@ -0,0 +1,46 @@ |
| +/* |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| +package org.apache.lucene.search.spans; |
| + |
| +import java.io.IOException; |
| + |
| +import org.apache.lucene.index.PostingsEnum; |
| + |
| + |
| +/** |
| + * For {@link SpansTreeQuery}. Public for extension. |
| + * |
| + * @lucene.experimental |
| + */ |
| +public class TermSpansDocScorer extends AsSingleTermSpansDocScorer<TermSpans> { |
| + |
| + protected final PostingsEnum postings; |
| + |
| + /** |
| + * @param termSpans Provides matching term occurrences. |
| + * @param nonMatchWeight The non negative weight to be used for the non matching term occurrences. |
| + */ |
| + public TermSpansDocScorer(TermSpans termSpans, double nonMatchWeight) { |
| + super(termSpans, termSpans.simScorer, nonMatchWeight); |
| + this.postings = termSpans.getPostings(); |
| + } |
| + |
| + @Override |
| + public int termFreqInDoc() throws IOException { |
| + return postings.freq(); |
| + } |
| +} |
| diff --git a/lucene/core/src/java/org/apache/lucene/util/PriorityQueue.java b/lucene/core/src/java/org/apache/lucene/util/PriorityQueue.java |
| index 83ac613..83f61c1 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/util/PriorityQueue.java |
| +++ b/lucene/core/src/java/org/apache/lucene/util/PriorityQueue.java |
| @@ -173,6 +173,14 @@ public abstract class PriorityQueue<T> implements Iterable<T> { |
| return heap[1]; |
| } |
| |
| + /** Returns the second least element in constant time, or null if there are fewer than two elements. */ |
| + public final T subTop() { |
| + if (size <= 2) { |
| + return size == 2 ? heap[2] : null; // heap[2] is not a valid element when size < 2 |
| + } |
| + return lessThan(heap[2], heap[3]) ? heap[2] : heap[3]; // size >= 3: the lesser of heap[2] and heap[3] |
| + } |
| + |
| /** Removes and returns the least element of the PriorityQueue in log(size) |
| time. */ |
| public final T pop() { |
| diff --git a/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanSearchEquivalence.java b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanSearchEquivalence.java |
| index 7d7fbe4..e3b4d24 100644 |
| --- a/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanSearchEquivalence.java |
| +++ b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanSearchEquivalence.java |
| @@ -32,6 +32,8 @@ import static org.apache.lucene.search.spans.SpanTestUtil.*; |
| */ |
| public class TestSpanSearchEquivalence extends SearchEquivalenceTestBase { |
| |
| + final int MAX_SLOP = Integer.MAX_VALUE-1; // avoid distance+1 overflow in computeSlopFactor |
| + |
| // TODO: we could go a little crazy for a lot of these, |
| // but these are just simple minimal cases in case something |
| // goes horribly wrong. Put more intense tests elsewhere. |
| @@ -169,7 +171,7 @@ public class TestSpanSearchEquivalence extends SearchEquivalenceTestBase { |
| spanQuery(new SpanTermQuery(t1)), |
| spanQuery(new SpanTermQuery(t2)) |
| }; |
| - SpanQuery q1 = spanQuery(new SpanNearQuery(subquery, Integer.MAX_VALUE, false)); |
| + SpanQuery q1 = spanQuery(new SpanNearQuery(subquery, MAX_SLOP, false)); |
| BooleanQuery.Builder q2 = new BooleanQuery.Builder(); |
| q2.add(new TermQuery(t1), Occur.MUST); |
| q2.add(new TermQuery(t2), Occur.MUST); |
| @@ -293,7 +295,7 @@ public class TestSpanSearchEquivalence extends SearchEquivalenceTestBase { |
| /** SpanPositionRangeQuery(A, 0, ∞) = TermQuery(A) */ |
| public void testSpanRangeTermEverything() throws Exception { |
| Term t1 = randomTerm(); |
| - Query q1 = spanQuery(new SpanPositionRangeQuery(spanQuery(new SpanTermQuery(t1)), 0, Integer.MAX_VALUE)); |
| + Query q1 = spanQuery(new SpanPositionRangeQuery(spanQuery(new SpanTermQuery(t1)), 0, MAX_SLOP)); |
| Query q2 = new TermQuery(t1); |
| assertSameSet(q1, q2); |
| } |
| @@ -343,7 +345,7 @@ public class TestSpanSearchEquivalence extends SearchEquivalenceTestBase { |
| spanQuery(new SpanTermQuery(t2)) |
| }; |
| SpanQuery nearQuery = spanQuery(new SpanNearQuery(subquery, 10, true)); |
| - Query q1 = spanQuery(new SpanPositionRangeQuery(nearQuery, 0, Integer.MAX_VALUE)); |
| + Query q1 = spanQuery(new SpanPositionRangeQuery(nearQuery, 0, MAX_SLOP)); |
| Query q2 = nearQuery; |
| assertSameSet(q1, q2); |
| } |
| @@ -371,7 +373,7 @@ public class TestSpanSearchEquivalence extends SearchEquivalenceTestBase { |
| /** SpanFirstQuery(A, ∞) = TermQuery(A) */ |
| public void testSpanFirstTermEverything() throws Exception { |
| Term t1 = randomTerm(); |
| - Query q1 = spanQuery(new SpanFirstQuery(spanQuery(new SpanTermQuery(t1)), Integer.MAX_VALUE)); |
| + Query q1 = spanQuery(new SpanFirstQuery(spanQuery(new SpanTermQuery(t1)), MAX_SLOP)); |
| Query q2 = new TermQuery(t1); |
| assertSameSet(q1, q2); |
| } |
| @@ -417,7 +419,7 @@ public class TestSpanSearchEquivalence extends SearchEquivalenceTestBase { |
| spanQuery(new SpanTermQuery(t2)) |
| }; |
| SpanQuery nearQuery = spanQuery(new SpanNearQuery(subquery, 10, true)); |
| - Query q1 = spanQuery(new SpanFirstQuery(nearQuery, Integer.MAX_VALUE)); |
| + Query q1 = spanQuery(new SpanFirstQuery(nearQuery, MAX_SLOP)); |
| Query q2 = nearQuery; |
| assertSameSet(q1, q2); |
| } |
| diff --git a/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanSynonymQuery.java b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanSynonymQuery.java |
| new file mode 100644 |
| index 0000000..5f4b8eb |
| --- /dev/null |
| +++ b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanSynonymQuery.java |
| @@ -0,0 +1,238 @@ |
| +/* |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| +package org.apache.lucene.search.spans; |
| + |
| + |
| +import java.io.IOException; |
| +import java.util.Arrays; |
| +import java.util.Comparator; |
| + |
| +import org.apache.lucene.analysis.*; |
| +import org.apache.lucene.document.Document; |
| +import org.apache.lucene.document.Field; |
| +import org.apache.lucene.index.IndexReader; |
| +import org.apache.lucene.index.RandomIndexWriter; |
| +import org.apache.lucene.index.Term; |
| +import org.apache.lucene.search.CheckHits; |
| +import org.apache.lucene.search.IndexSearcher; |
| +import org.apache.lucene.search.Query; |
| +import org.apache.lucene.search.TermQuery; |
| +import org.apache.lucene.search.ScoreDoc; |
| +import org.apache.lucene.search.TopScoreDocCollector; |
| + |
| +import org.apache.lucene.search.SynonymQuery; |
| + |
| +import org.apache.lucene.store.Directory; |
| +import org.apache.lucene.util.English; |
| +import org.apache.lucene.util.LuceneTestCase; |
| +import org.apache.lucene.util.TestUtil; |
| + |
| +import org.junit.AfterClass; |
| +import org.junit.BeforeClass; |
| +import junit.framework.Assert; |
| + |
| + |
| +public class TestSpanSynonymQuery extends LuceneTestCase { |
| + static IndexSearcher searcher; |
| + static IndexReader reader; |
| + static Directory directory; |
| + |
| + static final int MAX_TEST_DOC = 32; |
| + |
| + @BeforeClass |
| + public static void beforeClass() throws Exception { |
| + directory = newDirectory(); |
| + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, |
| + newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)) |
| + .setMaxBufferedDocs(TestUtil.nextInt(random(), MAX_TEST_DOC, MAX_TEST_DOC + 100)) |
| + .setMergePolicy(newLogMergePolicy())); |
| + for (int i = 0; i < MAX_TEST_DOC; i++) { |
| + Document doc = new Document(); |
| + String text; |
| + if (i < (MAX_TEST_DOC-1)) { |
| + text = English.intToEnglish(i); |
| + if ((i % 5) == 0) { // add some multiple occurrences of the same term(s) |
| + text += " " + text; |
| + } |
| + } else { // last doc, for testing distances > 1, and repeating occurrences of wb |
| + text = "az a b c d e wa wb wb wc az"; |
| + } |
| + doc.add(newTextField("field", text, Field.Store.YES)); |
| + writer.addDocument(doc); |
| + } |
| + reader = writer.getReader(); |
| + searcher = new IndexSearcher(reader); |
| + writer.close(); |
| + } |
| + |
| + @AfterClass |
| + public static void afterClass() throws Exception { |
| + reader.close(); |
| + directory.close(); |
| + searcher = null; |
| + reader = null; // drop the closed reader too, so no static fixture outlives the class |
| + directory = null; |
| + } |
| + |
| + final String FIELD_NAME = "field"; |
| + |
| + Term lcnTerm(String term) { |
| + return new Term(FIELD_NAME, term); |
| + } |
| + |
| + Term[] lcnTerms(String... terms) { |
| + Term[] lcnTrms = new Term[terms.length]; |
| + for (int i = 0; i < terms.length; i++) { |
| + lcnTrms[i] = lcnTerm(terms[i]); |
| + } |
| + return lcnTrms; |
| + } |
| + |
| + TermQuery termQuery(String term) { |
| + return new TermQuery(lcnTerm(term)); |
| + } |
| + |
| + SpanTermQuery spanTermQuery(String term) { |
| + return new SpanTermQuery(lcnTerm(term)); |
| + } |
| + |
| + SpanTermQuery[] spanTermQueries(String... terms) { |
| + SpanTermQuery[] stqs = new SpanTermQuery[terms.length]; |
| + for (int i = 0; i < terms.length; i++) { |
| + stqs[i] = spanTermQuery(terms[i]); |
| + } |
| + return stqs; |
| + } |
| + |
| + SpanSynonymQuery spanSynonymQuery(String... terms) { |
| + return new SpanSynonymQuery(lcnTerms(terms)); |
| + } |
| + |
| + SynonymQuery synonymQuery(String... terms) { |
| + return new SynonymQuery(lcnTerms(terms)); |
| + } |
| + |
| + void sortByDoc(ScoreDoc[] scoreDocs) { // sorts the array in place by increasing doc id |
| + Arrays.sort(scoreDocs, new Comparator<ScoreDoc>() { |
| + @Override |
| + public int compare(ScoreDoc sd1, ScoreDoc sd2) { |
| + return Integer.compare(sd1.doc, sd2.doc); // no int subtraction, so no overflow |
| + } |
| + }); |
| + } |
| + |
| + ScoreDoc[] search(IndexSearcher searcher, Query query) throws IOException { |
| + TopScoreDocCollector collector = TopScoreDocCollector.create(MAX_TEST_DOC); |
| + searcher.search(query, collector); |
| + return collector.topDocs().scoreDocs; |
| + } |
| + |
| + int[] docsFromHits(ScoreDoc[] hits) throws Exception { |
| + int[] docs = new int[hits.length]; |
| + for (int i = 0; i < hits.length; i++) { |
| + docs[i] = hits[i].doc; |
| + } |
| + return docs; |
| + } |
| + |
| + void showQueryResults(String message, Query q, ScoreDoc[] hits) { |
| + System.out.println(message + " results from query " + q); |
| + for (ScoreDoc hit : hits) { |
| + System.out.println("doc=" + hit.doc + ", score=" + hit.score); |
| + } |
| + } |
| + |
| + void checkEqualScores(Query qexp, Query qact) throws Exception { |
| + ScoreDoc[] expHits = search(searcher, qexp); |
| + |
| + int[] expDocs = docsFromHits(expHits); |
| + //showQueryResults("checkEqualScores expected", qexp, expHits); |
| + |
| + ScoreDoc[] actHits = search(searcher, qact); |
| + //showQueryResults("checkEqualScores actual", qact, actHits); |
| + |
| + CheckHits.checkHitsQuery(qact, actHits, expHits, expDocs); |
| + } |
| + |
| + void checkScoresInRange(Query qexp, Query qact, float maxFac, float minFac) throws Exception { |
| + ScoreDoc[] expHits = search(searcher, qexp); |
| + //showQueryResults("checkScoresInRange expected", qexp, expHits); |
| + |
| + ScoreDoc[] actHits = search(searcher, qact); |
| + //showQueryResults("checkScoresInRange actual", qact, actHits); |
| + |
| + if (expHits.length != actHits.length) { |
| + Assert.fail("Unequal lengths: expHits="+expHits.length+",actHits="+actHits.length); |
| + } |
| + |
| + sortByDoc(expHits); |
| + sortByDoc(actHits); |
| + for (int i = 0; i < expHits.length; i++) { |
| + if (expHits[i].doc != actHits[i].doc) |
| + { |
| + Assert.fail("At index " + i |
| + + ": expHits[i].doc=" + expHits[i].doc |
| + + " != actHits[i].doc=" + actHits[i].doc); |
| + } |
| + |
| + if ( (expHits[i].score * maxFac < actHits[i].score) |
| + || (expHits[i].score * minFac > actHits[i].score)) |
| + { |
| + Assert.fail("At index " + i |
| + + ", expHits[i].doc=" + expHits[i].doc |
| + + ", score not in expected range: " + (expHits[i].score * minFac) |
| + + " <= " + actHits[i].score |
| + + " <= " + (expHits[i].score * maxFac)); |
| + } |
| + } |
| + } |
| + |
| + void checkSingleTerm(String term) throws Exception { |
| + TermQuery tq = termQuery(term); |
| + SpanTermQuery stq = spanTermQuery(term); |
| + SpanSynonymQuery ssq = spanSynonymQuery(term); |
| + |
| + checkEqualScores(tq, stq); |
| + checkEqualScores(tq, ssq); |
| + } |
| + |
| + public void testSingleZero() throws Exception { |
| + checkSingleTerm("zero"); |
| + } |
| + |
| + SpanOrQuery spanOrQuery(String... terms) { |
| + return new SpanOrQuery(spanTermQueries(terms)); |
| + } |
| + |
| + void checkOrTerms(String... terms) throws Exception { |
| + assertTrue(terms.length >= 1); |
| + SpanOrQuery soq = spanOrQuery(terms); |
| + SpanSynonymQuery ssq = spanSynonymQuery(terms); |
| + checkScoresInRange(soq, ssq, 0.7f, 0.3f); |
| + |
| + SynonymQuery sq = synonymQuery(terms); |
| + checkEqualScores(sq, ssq); |
| + } |
| + |
| + public void testOrTwoTermsNoDocOverlap() throws Exception { |
| + checkOrTerms("zero", "one"); |
| + } |
| + |
| + public void testOrTwoTermsDocOverlap() throws Exception { |
| + checkOrTerms("twenty", "one"); |
| + } |
| +} |
| diff --git a/lucene/core/src/test/org/apache/lucene/search/spans/TestSpansTreeQuery.java b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpansTreeQuery.java |
| new file mode 100644 |
| index 0000000..c166ed2 |
| --- /dev/null |
| +++ b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpansTreeQuery.java |
| @@ -0,0 +1,648 @@ |
| +/* |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| +package org.apache.lucene.search.spans; |
| + |
| + |
| +import java.io.IOException; |
| +import java.util.Arrays; |
| +import java.util.Comparator; |
| + |
| +import org.apache.lucene.analysis.*; |
| +import org.apache.lucene.document.Document; |
| +import org.apache.lucene.document.Field; |
| +import org.apache.lucene.index.IndexReader; |
| +import org.apache.lucene.index.RandomIndexWriter; |
| +import org.apache.lucene.index.Term; |
| +import org.apache.lucene.search.BooleanClause; |
| +import org.apache.lucene.search.BooleanQuery; |
| +import org.apache.lucene.search.SynonymQuery; |
| +import org.apache.lucene.search.CheckHits; |
| +import org.apache.lucene.search.IndexSearcher; |
| +import org.apache.lucene.search.Query; |
| +import org.apache.lucene.search.TermQuery; |
| +import org.apache.lucene.search.ScoreDoc; |
| +import org.apache.lucene.search.TopScoreDocCollector; |
| +import org.apache.lucene.search.similarities.ClassicSimilarity; |
| +import org.apache.lucene.search.similarities.BM25Similarity; |
| +import org.apache.lucene.store.Directory; |
| +import org.apache.lucene.util.English; |
| +import org.apache.lucene.util.LuceneTestCase; |
| +import org.apache.lucene.util.TestUtil; |
| + |
| +import org.junit.AfterClass; |
| +import org.junit.BeforeClass; |
| +import junit.framework.Assert; |
| + |
| + |
| +public class TestSpansTreeQuery extends LuceneTestCase { |
| + static IndexSearcher searcherClassic; |
| + static IndexSearcher searcherBM25; |
| + static IndexReader reader; |
| + static Directory directory; |
| + |
| + static final int MAX_TEST_DOC = 33; |
| + |
| + @BeforeClass |
| + public static void beforeClass() throws Exception { |
| + directory = newDirectory(); |
| + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, |
| + newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)) |
| + .setMaxBufferedDocs(TestUtil.nextInt(random(), MAX_TEST_DOC, MAX_TEST_DOC + 100)) |
| + .setMergePolicy(newLogMergePolicy())); |
| + for (int i = 0; i < MAX_TEST_DOC; i++) { |
| + Document doc = new Document(); |
| + String text; |
| + if (i < (MAX_TEST_DOC-1)) { |
| + text = English.intToEnglish(i); |
| + if ((i % 5) == 0) { // add some multiple occurrences of the same term(s) |
| + text += " " + text; |
| + } |
| + } else { // last doc, for testing distances > 1, and repeating occurrences of wb |
| + text = "az a b c d e wa wb wb wc az"; |
| + } |
| + doc.add(newTextField("field", text, Field.Store.YES)); |
| + writer.addDocument(doc); |
| + } |
| + reader = writer.getReader(); |
| + searcherClassic = new IndexSearcher(reader); |
| + searcherClassic.setSimilarity(new ClassicSimilarity()); |
| + searcherBM25 = new IndexSearcher(reader); |
| + searcherBM25.setSimilarity(new BM25Similarity()); |
| + writer.close(); |
| + } |
| + |
| + @AfterClass |
| + public static void afterClass() throws Exception { |
| + reader.close(); |
| + directory.close(); |
| + searcherClassic = null; |
| + searcherBM25 = null; |
| + reader = null; |
| + directory = null; |
| + } |
| + |
| + final String FIELD_NAME = "field"; |
| + |
| + Term lcnTerm(String term) { |
| + return new Term(FIELD_NAME, term); |
| + } |
| + |
| + Term[] lcnTerms(String... terms) { |
| + Term[] lcnTrms = new Term[terms.length]; |
| + for (int i = 0; i < terms.length; i++) { |
| + lcnTrms[i] = lcnTerm(terms[i]); |
| + } |
| + return lcnTrms; |
| + } |
| + |
| + |
| + TermQuery termQuery(String term) { |
| + return new TermQuery(lcnTerm(term)); |
| + } |
| + |
| + SpanTermQuery spanTermQuery(String term) { |
| + return new SpanTermQuery(lcnTerm(term)); |
| + } |
| + |
| + ScoreDoc[] search(IndexSearcher searcher, Query query) throws IOException { |
| + TopScoreDocCollector collector = TopScoreDocCollector.create(MAX_TEST_DOC); |
| + searcher.search(query, collector); |
| + return collector.topDocs().scoreDocs; |
| + } |
| + |
| + int[] docsFromHits(ScoreDoc[] hits) throws Exception { |
| + int[] docs = new int[hits.length]; |
| + for (int i = 0; i < hits.length; i++) { |
| + docs[i] = hits[i].doc; |
| + } |
| + return docs; |
| + } |
| + |
| + void checkEqualDocOrder(Query qexp, Query qact) throws Exception { |
| + ScoreDoc[] expHits = search(searcherBM25, qexp); |
| + ScoreDoc[] actHits = search(searcherBM25, qact); |
| + assertEquals("same nr of hits", expHits.length, actHits.length); |
| + for (int i = 0; i < expHits.length; i++) { |
| + assertEquals("same doc at rank " + i, expHits[i].doc, actHits[i].doc); |
| + } |
| + } |
| + |
| + void showQueryResults(String message, Query q, ScoreDoc[] hits) { |
| + System.out.println(message + " results from query " + q); |
| + for (ScoreDoc hit : hits) { |
| + System.out.println("doc=" + hit.doc + ", score=" + hit.score); |
| + } |
| + } |
| + |
| + void checkEqualScores(Query qexp, Query qact) throws Exception { |
| + ScoreDoc[] expHits = search(searcherBM25, qexp); |
| + int[] expDocs = docsFromHits(expHits); |
| + //showQueryResults("expected BM25", qexp, expHits); |
| + |
| + ScoreDoc[] actHits = search(searcherBM25, qact); |
| + //showQueryResults("actual BM25", qact, actHits); |
| + |
| + CheckHits.checkHitsQuery(qact, actHits, expHits, expDocs); |
| + |
| + expHits = search(searcherClassic, qexp); |
| + expDocs = docsFromHits(expHits); |
| + //showQueryResults("expected Classic", qexp, expHits); |
| + |
| + actHits = search(searcherClassic, qact); |
| + //showQueryResults("actual Classic", qact, actHits); |
| + CheckHits.checkHitsQuery(qact, actHits, expHits, expDocs); |
| + } |
| + |
| + void checkSpanTerm(String term) throws Exception { |
| + TermQuery tq = termQuery(term); |
| + SpanTermQuery stq = spanTermQuery(term); |
| + |
| + checkEqualScores(tq, stq); // test SpanScorer |
| + |
| + checkEqualScores(tq, SpansTreeQuery.wrap(stq)); // test SpanTreeScorer |
| + } |
| + |
| + public void testSpanTermZero() throws Exception { |
| + checkSpanTerm("zero"); |
| + } |
| + |
| + public void testSpanTermSeven() throws Exception { |
| + checkSpanTerm("seven"); |
| + } |
| + |
| + public void testSpanTermFive() throws Exception { |
| + checkSpanTerm("five"); |
| + } |
| + |
| + SpanTermQuery[] spanTermQueries(String... terms) { |
| + SpanTermQuery[] stqs = new SpanTermQuery[terms.length]; |
| + for (int i = 0; i < terms.length; i++) { |
| + stqs[i] = spanTermQuery(terms[i]); |
| + } |
| + return stqs; |
| + } |
| + |
| + SpanOrQuery spanOrQuery(String... terms) { |
| + return new SpanOrQuery(spanTermQueries(terms)); |
| + } |
| + |
| + SpanOrQuery spanOrNearQuery(int maxDistance, String... terms) { |
| + return new SpanOrQuery(maxDistance, spanTermQueries(terms)); |
| + } |
| + |
| + BooleanQuery booleanOrQuery(String... terms) { |
| + BooleanQuery.Builder bqb = new BooleanQuery.Builder(); |
| + for (int i = 0; i < terms.length; i++) { |
| + bqb.add(termQuery(terms[i]), BooleanClause.Occur.SHOULD); |
| + } |
| + return bqb.build(); |
| + } |
| + |
| + void checkSpanOrTerms(String... terms) throws Exception { |
| + assertTrue(terms.length >= 1); |
| + Query boq = SpansTreeQuery.wrap(booleanOrQuery(terms)); |
| + assertTrue(boq instanceof BooleanQuery); // test SpansTreeQuery.wrap |
| + assertTrue(((BooleanQuery)boq).clauses().get(terms.length-1).getQuery() instanceof TermQuery); // test SpansTreeQuery.wrap |
| + SpanOrQuery soq = spanOrQuery(terms); |
| + Query sptroq = SpansTreeQuery.wrap(soq); |
| + //checkEqualDocOrder(boq, sptroq); |
| + //checkEqualScores(boq, soq); // test SpanScorer for OR over terms, fails |
| + checkEqualScores(boq, sptroq); // test SpanTreeScorer for OR over terms |
| + } |
| + |
| + public void testSpanOrOneTerm1() throws Exception { |
| + checkSpanOrTerms("zero"); |
| + } |
| + |
| + public void testSpanOrOneTerm2() throws Exception { |
| + checkSpanOrTerms("thirty"); |
| + } |
| + |
| + public void testSpanOrTwoTerms() throws Exception { |
| + checkSpanOrTerms("zero", "thirty"); |
| + } |
| + |
| + public void testSpanOrTwoCooccurringTerms() throws Exception { |
| + checkSpanOrTerms("twenty", "five"); |
| + } |
| + |
| + public void testSpanOrMoreTerms() throws Exception { |
| + checkSpanOrTerms( |
| + "zero", |
| + "one", |
| + "two", |
| + "three", |
| + "four", |
| + "five", |
| + "six", |
| + "seven", |
| + "twenty", |
| + "thirty" |
| + ); |
| + } |
| + |
| + void checkSameHighestScoringDocAndScore(Query exp, Query act) throws Exception { |
| + ScoreDoc[] expHits = search(searcherBM25, exp); |
| + assertTrue("at least one expected hit", expHits.length >= 1); // guards expHits[0] below |
| + //showQueryResults("checkSameHighestScoringDocAndScore expected BM25", exp, expHits); |
| + |
| + ScoreDoc[] actHits = search(searcherBM25, act); |
| + //showQueryResults("checkSameHighestScoringDocAndScore actual BM25", act, actHits); |
| + assertTrue("at least one actual hit", actHits.length >= 1); // guards actHits[0] below |
| + final float scoreTolerance = 1.0e-6f; // from CheckHits.java |
| + |
| + assertEquals("highest scoring docs the same", expHits[0].doc, actHits[0].doc); |
| + assertTrue("equal scores", Math.abs(expHits[0].score - actHits[0].score) <= scoreTolerance); |
| + } |
| + |
| + void checkSameHighestScoringDocAndScoreRange(Query exp, Query act, float maxFac, float minFac) throws Exception { |
| + ScoreDoc[] expHits = search(searcherBM25, exp); |
| + // Only the top expected hit is checked: the score act gives it must lie between minFac and maxFac times the expected score. |
| + //showQueryResults("checkSameHighestScoringDocAndScore expected BM25", exp, expHits); |
| + |
| + ScoreDoc[] actHits = search(searcherBM25, act); |
| + //showQueryResults("checkSameHighestScoringDocAndScore actual BM25", act, actHits); |
| + |
| + final float scoreTolerance = 1.0e-6f; // from CheckHits.java |
| + |
| + assertTrue("at least one expected hit", expHits.length >= 1); |
| + assertTrue("at least one actual hit", actHits.length >= 1); |
| + |
| + int actDoc = 0; // order may differ when top scores are equal |
| + while ((actDoc + 1 < actHits.length) // a next hit must exist before actHits[actDoc+1] is read |
| + && (actHits[actDoc].doc != expHits[0].doc) |
| + && (Math.abs(actHits[0].score - actHits[actDoc+1].score) < scoreTolerance) ) { |
| + actDoc++; |
| + } |
| + assertEquals("highest scoring docs the same", expHits[0].doc, actHits[actDoc].doc); |
| + if ( (expHits[0].score * maxFac < actHits[actDoc].score) |
| + || (expHits[0].score * minFac > actHits[actDoc].score)) |
| + { |
| + Assert.fail("For highest scoring doc" |
| + + ", expHits[0].doc=" + expHits[0].doc |
| + + ", score not in expected range: " + (expHits[0].score * minFac) |
| + + " <= " + actHits[actDoc].score |
| + + " <= " + (expHits[0].score * maxFac)); |
| + } |
| + } |
| + |
| + /** |
| +  * On "twenty five twenty five", an unordered zero-slop near query for "twenty" and "five" |
| +  * should score the same as "twenty" OR "five". |
| +  */ |
| + public void testSpanAdjacentAllTermsInDocUnordered() throws Exception { |
| +   final String left = "twenty"; |
| +   final String right = "five"; |
| +   SpanNearQuery unorderedNear = SpanNearQuery.newUnorderedNearQuery(FIELD_NAME) |
| +       .addClause(spanTermQuery(left)) |
| +       .addClause(spanTermQuery(right)) |
| +       .setSlop(0) |
| +       .build(); |
| +   BooleanQuery eitherTerm = booleanOrQuery(left, right); |
| +   Query wrappedNear = SpansTreeQuery.wrap(unorderedNear); |
| + |
| +   checkSameHighestScoringDocAndScore(eitherTerm, wrappedNear); |
| + } |
| + |
| + /** |
| +  * On "twenty five twenty five", an ordered zero-slop near query for "twenty" then "five" |
| +  * should score the same as "twenty" OR "five". |
| +  */ |
| + public void testSpanAdjacentAllTermsInDocOrdered1() throws Exception { |
| +   final String left = "twenty"; |
| +   final String right = "five"; |
| +   SpanNearQuery orderedNear = SpanNearQuery.newOrderedNearQuery(FIELD_NAME) |
| +       .addClause(spanTermQuery(left)) |
| +       .addClause(spanTermQuery(right)) |
| +       .setSlop(0) |
| +       .build(); |
| +   BooleanQuery eitherTerm = booleanOrQuery(left, right); |
| +   Query wrappedNear = SpansTreeQuery.wrap(orderedNear); |
| + |
| +   checkSameHighestScoringDocAndScore(eitherTerm, wrappedNear); |
| + } |
| + |
| + /** |
| +  * On "twenty five twenty five", ordered "five twenty" should score less than, |
| +  * but more than half of, "twenty" OR "five" (factors 0.5 to 0.7 are checked). |
| +  */ |
| + public void testSpanAdjacentAllTermsInDocOrdered2() throws Exception { |
| +   final String left = "five"; |
| +   final String right = "twenty"; |
| +   SpanNearQuery orderedNear = SpanNearQuery.newOrderedNearQuery(FIELD_NAME) |
| +       .addClause(spanTermQuery(left)) |
| +       .addClause(spanTermQuery(right)) |
| +       .setSlop(0) |
| +       .build(); |
| + |
| +   // Plain disjunction of the two terms, used as the reference score. |
| +   BooleanQuery.Builder disjunction = new BooleanQuery.Builder(); |
| +   disjunction.add(termQuery(left), BooleanClause.Occur.SHOULD); |
| +   disjunction.add(termQuery(right), BooleanClause.Occur.SHOULD); |
| +   BooleanQuery eitherTerm = disjunction.build(); |
| + |
| +   final float upperFactor = 0.7f; |
| +   final float lowerFactor = 0.5f; |
| +   checkSameHighestScoringDocAndScoreRange( |
| +       eitherTerm, SpansTreeQuery.wrap(orderedNear), upperFactor, lowerFactor); |
| + } |
| + |
| + /** |
| +  * Compares two ordered slop-2 near queries, "a" with "b" and "a" with "c", and expects |
| +  * the top score of the second to be 0.49 to 0.50 times the top score of the first. |
| +  */ |
| + public void testSpanMoreDistanceLessScore() throws Exception { |
| +   final String anchor = "a"; |
| +   final String nearTerm = "b"; |
| +   final String farTerm = "c"; |
| +   SpanNearQuery closerNear = SpanNearQuery.newOrderedNearQuery(FIELD_NAME) |
| +       .addClause(spanTermQuery(anchor)) |
| +       .addClause(spanTermQuery(nearTerm)) |
| +       .setSlop(2) |
| +       .build(); |
| +   SpanNearQuery fartherNear = SpanNearQuery.newOrderedNearQuery(FIELD_NAME) |
| +       .addClause(spanTermQuery(anchor)) |
| +       .addClause(spanTermQuery(farTerm)) |
| +       .setSlop(2) |
| +       .build(); |
| + |
| +   Query expected = SpansTreeQuery.wrap(closerNear); |
| +   Query actual = SpansTreeQuery.wrap(fartherNear); |
| +   checkSameHighestScoringDocAndScoreRange(expected, actual, 0.50f, 0.49f); |
| + } |
| + |
| + /** |
| +  * Builds an unordered near query over (t1a, t1b), nests it in a second unordered near |
| +  * query together with t2, and returns the result wrapped by SpansTreeQuery.wrap. |
| +  * |
| +  * @param t1a first term of the inner near query |
| +  * @param t1b second term of the inner near query |
| +  * @param t2 term combined with the inner near query in the outer near query |
| +  * @param slop slop used for both the inner and the outer near query |
| +  */ |
| + Query sptrSimpleUnorderedNested(String t1a, String t1b, String t2, int slop) { |
| +   SpanNearQuery inner = SpanNearQuery.newUnorderedNearQuery(FIELD_NAME) |
| +       .addClause(spanTermQuery(t1a)) |
| +       .addClause(spanTermQuery(t1b)) |
| +       .setSlop(slop) |
| +       .build(); |
| +   SpanNearQuery outer = SpanNearQuery.newUnorderedNearQuery(FIELD_NAME) |
| +       .addClause(inner) |
| +       .addClause(spanTermQuery(t2)) |
| +       .setSlop(slop) |
| +       .build(); |
| +   Query wrapped = SpansTreeQuery.wrap(outer); |
| +   return wrapped; |
| + } |
| + |
| + /** |
| +  * Compares two nested unordered slop-2 queries, ("a","b") with "d" and ("a","c") with "e", |
| +  * and expects the top score of the second to be 0.6 to 0.7 times that of the first. |
| +  */ |
| + public void testSpanNestedMoreDistanceLessScore() throws Exception { |
| +   final int slop = 2; |
| +   Query closer = sptrSimpleUnorderedNested("a", "b", "d", slop); |
| +   Query farther = sptrSimpleUnorderedNested("a", "c", "e", slop); |
| + |
| +   checkSameHighestScoringDocAndScoreRange(closer, farther, 0.7f, 0.6f); |
| + } |
| + |
| + /** |
| +  * Builds two unordered zero-slop near queries over ("a" OR "b") and "c" that differ only |
| +  * in their non-match slop (3 versus 4), and expects the top score with non-match slop 4 |
| +  * to be 0.9 to 0.98 times the top score with non-match slop 3. |
| +  */ |
| + public void testNonMatchingPresentTermScore() throws Exception { |
| +   final String termA = "a"; |
| +   final String termB = "b"; |
| +   final String termC = "c"; |
| + |
| +   SpanOrQuery aOrB = new SpanOrQuery(spanTermQuery(termA), spanTermQuery(termB)); |
| + |
| +   // In both queries "a" is present but does not match. |
| +   SpanNearQuery nonMatchSlop3 = SpanNearQuery.newUnorderedNearQuery(FIELD_NAME) |
| +       .addClause(aOrB) |
| +       .addClause(spanTermQuery(termC)) |
| +       .setSlop(0) |
| +       .setNonMatchSlop(3) |
| +       .build(); |
| + |
| +   // With the larger non-match slop, "a" scores lower than in nonMatchSlop3. |
| +   SpanNearQuery nonMatchSlop4 = SpanNearQuery.newUnorderedNearQuery(FIELD_NAME) |
| +       .addClause(aOrB) |
| +       .addClause(spanTermQuery(termC)) |
| +       .setSlop(0) |
| +       .setNonMatchSlop(4) |
| +       .build(); |
| + |
| +   SpansTreeQuery expected = new SpansTreeQuery(nonMatchSlop3); |
| +   SpansTreeQuery actual = new SpansTreeQuery(nonMatchSlop4); |
| + |
| +   checkSameHighestScoringDocAndScoreRange(expected, actual, 0.98f, 0.9f); |
| + } |
| + |
| + /** |
| +  * On "twenty five twenty five", "twenty" not preceded by "five" and followed by "five" |
| +  * should score less than, but more than half of, "twenty five" |
| +  * (factors 0.5 to 0.8 are checked). |
| +  */ |
| + public void testSpanNot() throws Exception { |
| +   final String five = "five"; |
| +   final String twenty = "twenty"; |
| + |
| +   SpanNotQuery twentyNotAfterFive = |
| +       new SpanNotQuery(spanTermQuery(twenty), spanTermQuery(five), 1, 0); |
| + |
| +   SpanNearQuery nearWithNot = SpanNearQuery.newOrderedNearQuery(FIELD_NAME) |
| +       .addClause(twentyNotAfterFive) |
| +       .addClause(spanTermQuery(five)) |
| +       .setSlop(0) |
| +       .build(); |
| + |
| +   SpanNearQuery plainNear = SpanNearQuery.newOrderedNearQuery(FIELD_NAME) |
| +       .addClause(spanTermQuery(twenty)) |
| +       .addClause(spanTermQuery(five)) |
| +       .setSlop(0) |
| +       .build(); |
| + |
| +   Query actual = SpansTreeQuery.wrap(nearWithNot); |
| +   Query expected = SpansTreeQuery.wrap(plainNear); |
| + |
| +   checkSameHighestScoringDocAndScoreRange(expected, actual, 0.8f, 0.5f); |
| + } |
| + |
| + /** |
| +  * Boosts a span term query for "zero" by 1.1 and expects the unboosted top score to be |
| +  * 0.90 to 0.92 times the boosted top score, both for the bare SpanBoostQuery and for |
| +  * its SpansTreeQuery.wrap form. |
| +  */ |
| + public void testSpanBoost() throws Exception { |
| +   SpanTermQuery plain = spanTermQuery("zero"); |
| +   SpanBoostQuery boosted = new SpanBoostQuery(plain, 1.1f); |
| +   final float upperFactor = 0.92f; |
| +   final float lowerFactor = 0.90f; |
| + |
| +   checkSameHighestScoringDocAndScoreRange(boosted, plain, upperFactor, lowerFactor); |
| +   checkSameHighestScoringDocAndScoreRange( |
| +       SpansTreeQuery.wrap(boosted), plain, upperFactor, lowerFactor); |
| + } |
| + |
| + /** |
| +  * Expects spanOrNearQuery(0, "a", "b"), wrapped by SpansTreeQuery.wrap, to pass |
| +  * checkEqualScores against the boolean OR of "a" and "b". |
| +  */ |
| + public void testSpanOrNearZeroDistance() throws Exception { |
| +   final String termA = "a"; |
| +   final String termB = "b"; |
| +   BooleanQuery eitherTerm = booleanOrQuery(termA, termB); |
| +   SpanOrQuery orNear = spanOrNearQuery(0, termA, termB); |
| +   Query wrapped = SpansTreeQuery.wrap(orNear); |
| +   checkEqualScores(eitherTerm, wrapped); |
| + } |
| + |
| + /** |
| +  * Compares spanOrNearQuery(4, "a", "b") with spanOrNearQuery(4, "a", "c") and expects |
| +  * the top score of the second to be 0.4 to 0.5 times the top score of the first. |
| +  */ |
| + public void testSpanOrNearMoreDistanceLessScore() throws Exception { |
| +   final int distance = 4; |
| +   Query closer = SpansTreeQuery.wrap(spanOrNearQuery(distance, "a", "b")); |
| +   Query farther = SpansTreeQuery.wrap(spanOrNearQuery(distance, "a", "c")); |
| +   checkSameHighestScoringDocAndScoreRange(closer, farther, 0.5f, 0.4f); |
| + } |
| + |
| + /** |
| +  * Expects spanOrNearQuery(0, "c", "b", "a"), wrapped by SpansTreeQuery.wrap, to pass |
| +  * checkEqualScores against the boolean OR of "a", "b" and "c". |
| +  */ |
| + public void testSpanOrNearThreeSubqueries() throws Exception { |
| +   final String termA = "a"; |
| +   final String termB = "b"; |
| +   final String termC = "c"; |
| +   BooleanQuery anyTerm = booleanOrQuery(termA, termB, termC); |
| +   SpanOrQuery orNear = spanOrNearQuery(0, termC, termB, termA); |
| +   Query wrapped = SpansTreeQuery.wrap(orNear); |
| +   checkEqualScores(anyTerm, wrapped); |
| + } |
| + |
| + /** |
| +  * Compares spanOrNearQuery(1, "c", "b", "a") with spanOrNearQuery(1, "e", "b", "a") and |
| +  * expects the top score of the second to be 0.8 to 0.9 times the top score of the first. |
| +  */ |
| + public void testSpanOrNearNonMatchingSubQuery() throws Exception { |
| +   final String termA = "a"; |
| +   final String termB = "b"; |
| +   SpanOrQuery withC = spanOrNearQuery(1, "c", termB, termA); |
| +   SpanOrQuery withE = spanOrNearQuery(1, "e", termB, termA); |
| +   Query expected = SpansTreeQuery.wrap(withC); |
| +   Query actual = SpansTreeQuery.wrap(withE); |
| +   checkSameHighestScoringDocAndScoreRange(expected, actual, 0.9f, 0.8f); |
| + } |
| + |
| + /** |
| +  * Compares a span term query for "a" with spanOrNearQuery(1, "h", "a") and expects the |
| +  * top score of the latter to be 0.49 to 0.51 times the top score of the former. |
| +  */ |
| + public void testSpanOrNearSinglePresentSubquery() throws Exception { |
| +   final String present = "a"; |
| +   final String other = "h"; |
| +   SpanQuery single = spanTermQuery(present); |
| +   SpanOrQuery orNear = spanOrNearQuery(1, other, present); |
| +   Query expected = SpansTreeQuery.wrap(single); |
| +   Query actual = SpansTreeQuery.wrap(orNear); |
| +   checkSameHighestScoringDocAndScoreRange(expected, actual, 0.51f, 0.49f); |
| + } |
| + |
| + /** |
| +  * Compares the boolean OR of "wa" and "wb" with spanOrNearQuery(3, "wb", "wa") and |
| +  * expects the top score of the latter to be 0.8 to 0.9 times that of the former. |
| +  */ |
| + public void testSpanOrNearRepeatingOccurrences1() throws Exception { |
| +   final String first = "wa"; |
| +   final String second = "wb"; |
| +   BooleanQuery eitherTerm = booleanOrQuery(first, second); |
| +   SpanOrQuery orNear = spanOrNearQuery(3, second, first); |
| +   Query wrapped = SpansTreeQuery.wrap(orNear); |
| +   checkSameHighestScoringDocAndScoreRange(eitherTerm, wrapped, 0.9f, 0.8f); |
| + } |
| + |
| + /** |
| +  * Compares the boolean OR of "wb" and "wc" with spanOrNearQuery(3, "wc", "wb") and |
| +  * expects the top score of the latter to be 0.8 to 0.9 times that of the former. |
| +  */ |
| + public void testSpanOrNearRepeatingOccurrences2() throws Exception { |
| +   final String first = "wb"; |
| +   final String second = "wc"; |
| +   BooleanQuery eitherTerm = booleanOrQuery(first, second); |
| +   SpanOrQuery orNear = spanOrNearQuery(3, second, first); |
| +   Query wrapped = SpansTreeQuery.wrap(orNear); |
| +   checkSameHighestScoringDocAndScoreRange(eitherTerm, wrapped, 0.9f, 0.8f); |
| + } |
| + |
| + /** |
| +  * Builds two unordered near queries over "az" and "a" that differ only in slop (0 versus 8), |
| +  * both with non-match slop 20, and expects the top score of the slop-0 query to be |
| +  * 0.9 to 0.98 times the top score of the slop-8 query. |
| +  */ |
| + public void testIncreasingScoreExtraMatchLowSlopFactor() throws Exception { |
| +   final String nearAndFar = "az"; // near and far from a |
| +   final String anchor = "a"; |
| +   final int nonMatchSlop = 20; // same in both queries, for consistent non match scoring |
| + |
| +   SpanNearQuery tight = SpanNearQuery.newUnorderedNearQuery(FIELD_NAME) |
| +       .addClause(spanTermQuery(nearAndFar)) |
| +       .addClause(spanTermQuery(anchor)) |
| +       .setSlop(0) // does not match far |
| +       .setNonMatchSlop(nonMatchSlop) |
| +       .build(); |
| +   SpanNearQuery loose = SpanNearQuery.newUnorderedNearQuery(FIELD_NAME) |
| +       .addClause(spanTermQuery(nearAndFar)) |
| +       .addClause(spanTermQuery(anchor)) |
| +       .setSlop(8) // also matches far |
| +       .setNonMatchSlop(nonMatchSlop) |
| +       .build(); |
| + |
| +   Query expected = SpansTreeQuery.wrap(loose); |
| +   Query actual = SpansTreeQuery.wrap(tight); |
| +   checkSameHighestScoringDocAndScoreRange(expected, actual, 0.98f, 0.9f); |
| + } |
| + |
| + /** Creates a SynonymQuery from the result of lcnTerms(terms). */ |
| + SynonymQuery synonymQuery(String... terms) { |
| +   SynonymQuery query = new SynonymQuery(lcnTerms(terms)); |
| +   return query; |
| + } |
| + |
| + /** Creates a SpanSynonymQuery from the result of lcnTerms(terms). */ |
| + SpanSynonymQuery spanSynonymQuery(String... terms) { |
| +   SpanSynonymQuery query = new SpanSynonymQuery(lcnTerms(terms)); |
| +   return query; |
| + } |
| + |
| + /** |
| +  * Sorts the given hits in place by ascending doc. |
| +  * |
| +  * @param scoreDocs hits to reorder; the array itself is modified |
| +  */ |
| + void sortByDoc(ScoreDoc[] scoreDocs) { |
| +   Arrays.sort(scoreDocs, new Comparator<ScoreDoc>() { |
| +     @Override |
| +     public int compare(ScoreDoc sd1, ScoreDoc sd2) { |
| +       // Integer.compare instead of subtraction: same ordering, no overflow risk. |
| +       return Integer.compare(sd1.doc, sd2.doc); |
| +     } |
| +   }); |
| + } |
| + |
| + /** |
| +  * Searches with searcherBM25 for both queries and checks that they return the same number |
| +  * of hits, the same docs once both hit lists are sorted by doc, and for each doc an actual |
| +  * score inside [expectedScore * minFac, expectedScore * maxFac]. |
| +  * |
| +  * @param qexp query providing the expected docs and reference scores |
| +  * @param qact query whose hits are checked |
| +  * @param maxFac factor applied to each expected score to obtain the upper bound |
| +  * @param minFac factor applied to each expected score to obtain the lower bound |
| +  */ |
| + void checkScoresInRange(Query qexp, Query qact, float maxFac, float minFac) throws Exception { |
| +   ScoreDoc[] wanted = search(searcherBM25, qexp); |
| +   //showQueryResults("checkScoresInRange expected", qexp, wanted); |
| + |
| +   ScoreDoc[] found = search(searcherBM25, qact); |
| +   //showQueryResults("checkScoresInRange actual", qact, found); |
| + |
| +   if (wanted.length != found.length) { |
| +     Assert.fail("Unequal lengths: expHits="+wanted.length+",actHits="+found.length); |
| +   } |
| + |
| +   // Align both result lists on doc so they can be compared position by position. |
| +   sortByDoc(wanted); |
| +   sortByDoc(found); |
| + |
| +   for (int i = 0; i < wanted.length; i++) { |
| +     final ScoreDoc wantedHit = wanted[i]; |
| +     final ScoreDoc foundHit = found[i]; |
| + |
| +     if (wantedHit.doc != foundHit.doc) { |
| +       Assert.fail("At index " + i |
| +           + ": expHits[i].doc=" + wantedHit.doc |
| +           + " != actHits[i].doc=" + foundHit.doc); |
| +     } |
| + |
| +     final float upperBound = wantedHit.score * maxFac; |
| +     final float lowerBound = wantedHit.score * minFac; |
| +     final boolean aboveRange = upperBound < foundHit.score; |
| +     final boolean belowRange = lowerBound > foundHit.score; |
| +     if (aboveRange || belowRange) { |
| +       Assert.fail("At index " + i |
| +           + ", expHits[i].doc=" + wantedHit.doc |
| +           + ", score not in expected range: " + lowerBound |
| +           + " <= " + foundHit.score |
| +           + " <= " + upperBound); |
| +     } |
| +   } |
| + } |
| + |
| + /** |
| +  * For the given terms, checks that a wrapped SpanSynonymQuery scores each doc between |
| +  * 0.425 and 1.0 times the wrapped SpanOrQuery score, and that it passes checkEqualScores |
| +  * against a wrapped SynonymQuery over the same terms. |
| +  * |
| +  * @param terms at least one term |
| +  */ |
| + void checkSynTerms(String... terms) throws Exception { |
| +   assertTrue(terms.length >= 1); |
| + |
| +   SpanOrQuery spanOr = spanOrQuery(terms); |
| +   SpanSynonymQuery spanSynonym = spanSynonymQuery(terms); |
| +   final float upperFactor = 1.0f; |
| +   final float lowerFactor = 0.425f; |
| +   checkScoresInRange( |
| +       SpansTreeQuery.wrap(spanOr), SpansTreeQuery.wrap(spanSynonym), upperFactor, lowerFactor); |
| + |
| +   SynonymQuery synonym = synonymQuery(terms); |
| +   checkEqualScores(SpansTreeQuery.wrap(synonym), SpansTreeQuery.wrap(spanSynonym)); |
| + } |
| + |
| + /** |
| +  * Runs checkSynTerms for "zero" and "one"; going by the test name, these terms are |
| +  * expected not to share documents. |
| +  */ |
| + public void testSynTwoTermsNoDocOverlap() throws Exception { |
| +   final String firstTerm = "zero"; |
| +   final String secondTerm = "one"; |
| +   checkSynTerms(firstTerm, secondTerm); |
| + } |
| + |
| + /** |
| +  * Runs checkSynTerms for "twenty" and "one"; going by the test name, these terms are |
| +  * expected to share at least one document. |
| +  */ |
| + public void testSynTwoTermsDocOverlap() throws Exception { |
| +   final String firstTerm = "twenty"; |
| +   final String secondTerm = "one"; |
| +   checkSynTerms(firstTerm, secondTerm); |
| + } |
| + |
| + /** |
| +  * Puts a synonym query and an OR query over "twenty"/"thirty" in front of "one" in |
| +  * otherwise identical ordered zero-slop near queries, and expects the synonym variant's |
| +  * top score to be 0.70 to 0.80 times the OR variant's top score. |
| +  */ |
| + public void testSynNearOrNear() throws Exception { |
| +   // twenty occurs 10 times |
| +   // thirty occurs 2 times |
| +   final String frequent = "twenty"; |
| +   final String rare = "thirty"; |
| +   SpanSynonymQuery synonyms = spanSynonymQuery(frequent, rare); |
| +   SpanOrQuery alternatives = spanOrQuery(frequent, rare); |
| +   SpanTermQuery one = spanTermQuery("one"); |
| + |
| +   SpanNearQuery synNear = SpanNearQuery.newOrderedNearQuery(FIELD_NAME) |
| +       .addClause(synonyms) |
| +       .addClause(one) |
| +       .setSlop(0) |
| +       .build(); |
| +   SpanNearQuery orNear = SpanNearQuery.newOrderedNearQuery(FIELD_NAME) |
| +       .addClause(alternatives) |
| +       .addClause(one) |
| +       .setSlop(0) |
| +       .build(); |
| + |
| +   Query expected = SpansTreeQuery.wrap(orNear); |
| +   Query actual = SpansTreeQuery.wrap(synNear); |
| +   checkSameHighestScoringDocAndScoreRange(expected, actual, 0.80f, 0.70f); |
| + } |
| +} |