blob: 445b7924d14b0d47b2f6a7be78280624bc965e31 [file] [log] [blame]
diff --git a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java
index c718dc9..ecca69a 100644
--- a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java
@@ -29,11 +29,14 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
+import org.apache.lucene.search.MatchNoDocsQuery;
+
/**
* A query that treats multiple terms as synonyms.
@@ -45,6 +48,7 @@ import org.apache.lucene.search.similarities.Similarity.SimScorer;
*/
public final class SynonymQuery extends Query {
private final Term terms[];
+ private final String field;
/**
* Creates a new SynonymQuery, matching any of the supplied terms.
@@ -62,16 +66,23 @@ public final class SynonymQuery extends Query {
throw new IllegalArgumentException("Synonyms must be across the same field");
}
}
+ this.field = field;
if (terms.length > BooleanQuery.getMaxClauseCount()) {
throw new BooleanQuery.TooManyClauses();
}
Arrays.sort(this.terms);
}
+ /**
+ * The terms to be treated as synonyms.
+ * @return an unmodifiable list view over this query's term array
+ */
public List<Term> getTerms() {
return Collections.unmodifiableList(Arrays.asList(terms));
}
+ /**
+ * The field of the terms.
+ * @return the field name that was stored when this query was constructed
+ */
+ public String getField() {
+ return field;
+ }
+
@Override
public String toString(String field) {
StringBuilder builder = new StringBuilder("Synonym(");
@@ -101,7 +112,7 @@ public final class SynonymQuery extends Query {
public Query rewrite(IndexReader reader) throws IOException {
// optimize zero and single term cases
if (terms.length == 0) {
- return new BooleanQuery.Builder().build();
+ return new MatchNoDocsQuery();
}
if (terms.length == 1) {
return new TermQuery(terms[0]);
@@ -123,7 +134,7 @@ public final class SynonymQuery extends Query {
}
}
- class SynonymWeight extends Weight {
+ public class SynonymWeight extends Weight {
private final TermContext termContexts[];
private final Similarity similarity;
private final Similarity.SimWeight simWeight;
@@ -183,20 +194,42 @@ public final class SynonymQuery extends Query {
return Explanation.noMatch("no matching term");
}
+ /**
+ * Expert: Return a SimScorer for this context.
+ * Public only for use in the spans package.
+ * @param context the LeafReaderContext
+ * @return a SimScorer created by the similarity from this weight's SimWeight for the given context
+ * @throws IOException on error
+ */
+ public Similarity.SimScorer getSimScorer(LeafReaderContext context) throws IOException {
+ return similarity.simScorer(simWeight, context);
+ }
+
+ /**
+ * Expert: Return a TermContext array in the same order as the terms.
+ * Public only for use in the spans package, do not modify.
+ * @return the internal array itself, not a copy
+ */
+ public TermContext[] getTermContexts() {
+ return termContexts;
+ }
+
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
- Similarity.SimScorer simScorer = similarity.simScorer(simWeight, context);
+ Similarity.SimScorer simScorer = getSimScorer(context);
// we use termscorers + disjunction as an impl detail
List<Scorer> subScorers = new ArrayList<>();
+ Terms fieldTerms = context.reader().terms(field);
+ if (fieldTerms != null) {
+ TermsEnum termsEnum = fieldTerms.iterator();
for (int i = 0; i < terms.length; i++) {
TermState state = termContexts[i].get(context.ord);
if (state != null) {
- TermsEnum termsEnum = context.reader().terms(terms[i].field()).iterator();
termsEnum.seekExact(terms[i].bytes(), state);
PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS);
subScorers.add(new TermScorer(this, postings, simScorer));
}
}
+ }
if (subScorers.isEmpty()) {
return null;
} else if (subScorers.size() == 1) {
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
index 6763118..4f7f5a5 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
@@ -75,6 +75,7 @@ public class BM25Similarity extends Similarity {
/** Implemented as <code>1 / (distance + 1)</code>. */
protected float sloppyFreq(int distance) {
+ // distance + 1 below is computed in int arithmetic; this guards against its overflow
+ assert distance <= Integer.MAX_VALUE - 1;
return 1.0f / (distance + 1);
}
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java
index 5a1e237..66b2cc1 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java
@@ -115,6 +115,7 @@ public class ClassicSimilarity extends TFIDFSimilarity {
/** Implemented as <code>1 / (distance + 1)</code>. */
@Override
public float sloppyFreq(int distance) {
+ // distance + 1 below is computed in int arithmetic; this guards against its overflow
+ assert distance <= Integer.MAX_VALUE - 1;
return 1.0f / (distance + 1);
}
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java
index 7f0f27c..fdf8799 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java
@@ -143,7 +143,7 @@ public abstract class Similarity {
* API for scoring "sloppy" queries such as {@link TermQuery},
* {@link SpanQuery}, and {@link PhraseQuery}.
* <p>
- * Frequencies are floating-point values: an approximate
+ * Frequencies may be floating-point values to allow an approximate
* within-document frequency adjusted for "sloppiness" by
* {@link SimScorer#computeSlopFactor(int)}.
*/
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/AsSingleTermSpansDocScorer.java b/lucene/core/src/java/org/apache/lucene/search/spans/AsSingleTermSpansDocScorer.java
new file mode 100644
index 0000000..0be25d8
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/AsSingleTermSpansDocScorer.java
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.spans;
+
+import java.io.IOException;
+import static java.util.Arrays.sort;
+
+import org.apache.lucene.search.similarities.Similarity.SimScorer;
+import static org.apache.lucene.util.ArrayUtil.oversize;
+
+
+/**
+ * For {@link SpansTreeQuery}. Public for extension.
+ * <p>
+ * Scores the occurrences of a single term in a document: each counted matching
+ * occurrence is weighted by its recorded slop factor, and the remaining
+ * occurrences are weighted by a fixed non match weight.
+ *
+ * @lucene.experimental
+ */
+public abstract class AsSingleTermSpansDocScorer<SpansT extends Spans>
+extends SpansDocScorer<SpansT> {
+
+  /** Provides the score for a term frequency in a document. */
+  protected final SimScorer simScorer;
+  /** Weight on the score contribution of the non matching occurrences. */
+  protected final double nonMatchWeight;
+
+  /** Document at the last {@link #beginDoc()}, only used in asserts. */
+  protected int currentDoc = -1;
+  /** Value of {@link #termFreqInDoc()} at the last {@link #beginDoc()}. */
+  protected int tf;
+  /** Number of matching occurrences counted so far in the current document. */
+  protected int matchTF;
+  /** Spans start position of the last counted match, -1 when none was counted yet. */
+  protected int lastCountedPosition;
+  /** Slop factors of the counted matches, only the first <code>matchTF</code> entries are in use. */
+  protected double[] occSlops;
+
+  /** Initial length of {@link #occSlops}. Shared by all instances, so static. */
+  protected static final int INIT_SLOPS_SIZE = 2; // CHECKME: use average term frequency?
+
+  /**
+   * @param spans Provides matching term occurrences.
+   * @param simScorer Scores the matching and non matching term occurrences per document.
+   * @param nonMatchWeight The non negative weight to be used for the non matching term occurrences.
+   */
+  public AsSingleTermSpansDocScorer(SpansT spans, SimScorer simScorer, double nonMatchWeight) {
+    super(spans);
+    this.simScorer = simScorer;
+    this.nonMatchWeight = nonMatchWeight;
+    assert nonMatchWeight >= 0 : ("nonMatchWeight="+ nonMatchWeight);
+    this.occSlops = new double[INIT_SLOPS_SIZE];
+  }
+
+  /** The total number of occurrences of the term in the current document.
+   */
+  public abstract int termFreqInDoc() throws IOException;
+
+  /** Reset the per document state and make sure that {@link #occSlops}
+   * has room for one slop factor per term occurrence in the document.
+   */
+  @Override
+  public void beginDoc() throws IOException {
+    matchTF = 0;
+    lastCountedPosition = -1;
+    currentDoc = docID(); // only for asserts
+
+    tf = termFreqInDoc();
+    assert tf >= 1;
+    if (occSlops.length < tf) {
+      // previous contents are not needed, matchTF was reset above
+      occSlops = new double[oversize(tf, Double.BYTES)];
+    }
+  }
+
+  /** Record a matching term occurrence and record its slopFactor.
+   * Keep the largest slop factor when the spans start position
+   * has not changed.
+   */
+  @Override
+  public void recordMatch(double slopFactor) {
+    assert docID() == currentDoc;
+    assert slopFactor >= 0;
+    int currentPosition = spans.startPosition();
+    assert currentPosition != Spans.NO_MORE_POSITIONS;
+    if (lastCountedPosition < currentPosition) {
+      // first match at this start position: count a new occurrence
+      occSlops[matchTF] = slopFactor;
+      matchTF += 1;
+      assert matchTF <= tf;
+      lastCountedPosition = currentPosition;
+    } else {
+      // same start position as the last counted match: keep the largest slop factor
+      assert lastCountedPosition == currentPosition;
+      assert matchTF >= 1;
+      if (slopFactor > occSlops[matchTF-1]) {
+        occSlops[matchTF-1] = slopFactor;
+      }
+    }
+  }
+
+  /** The number of matching occurrences counted in the current document. */
+  @Override
+  public int docMatchFreq() {
+    assert docID() == currentDoc;
+    return matchTF;
+  }
+
+  /** Compute the document score for the term.
+   * <br>
+   * For each matching occurrence determine the score contribution
+   * and use the given slop factors in decreasing order as weights
+   * on this contribution.
+   * <br>
+   * Use the <code>nonMatchWeight</code> as the weight for the score contribution
+   * of the non matching occurrences.
+   * <br>
+   * For this it is assumed that {@link SimScorer#score(int, float)} provides
+   * a diminishing (at least non increasing)
+   * score contribution for each extra term occurrence.
+   * <br>
+   * Return the sum of these weighted contributions over all term occurrences.
+   * <p>
+   * The implementation is not optimal, especially when there are many
+   * matching occurrences with the same slop factors.
+   * <p>
+   * Aside: The purpose of using the given slop factors in decreasing order
+   * is to provide scoring consistency
+   * between span near queries that only differ in the maximum allowed slop.
+   * This consistency requires that any extra match increases the score of the document,
+   * even when an extra match has a bigger slop and corresponding lower slop factor.
+   * It is not known whether such scoring consistency is always achieved.
+   * <br>
+   * Sorting the slop factors could be avoided if an actual score
+   * of each single term occurrence was available.
+   * In that case the given slop factor could be used as a weight on that score.
+   * Perhaps it is possible to estimate an actual score for a single term
+   * occurrence from the distances to other occurrences of the same term.
+   */
+  @Override
+  public double docScore() throws IOException {
+    assert docID() == currentDoc;
+    double docScore = 0;
+
+    assert simScorer.score(currentDoc, 0) == 0;
+    double cumulMatchTFScore = 0;
+
+    if (matchTF > 0) {
+      // ascending sort, the largest slop factor ends up at index matchTF-1
+      sort(occSlops, 0, matchTF);
+      assert occSlops[0] >= nonMatchWeight; // non match distance large enough
+
+      for (int matchOcc = 1; matchOcc <= matchTF; matchOcc++) {
+        double prev = cumulMatchTFScore;
+        cumulMatchTFScore = simScorer.score(currentDoc, (float) (matchOcc));
+        double matchTFScore = cumulMatchTFScore - prev; // matchTFScore should not increase
+        // use occurrence slop factors in decreasing order:
+        docScore += matchTFScore * occSlops[matchTF - matchOcc];
+      }
+    }
+
+    if (matchTF < tf) { // non matching occurrences
+      // the remaining score up to the full term frequency is weighted by nonMatchWeight
+      double tfScore = simScorer.score(currentDoc, (float) tf);
+      double nonMatchingFreqScore = tfScore - cumulMatchTFScore;
+      double nonMatchScore = nonMatchingFreqScore * nonMatchWeight;
+      docScore += nonMatchScore;
+    }
+
+    return docScore;
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/ConjunctionNearSpans.java b/lucene/core/src/java/org/apache/lucene/search/spans/ConjunctionNearSpans.java
new file mode 100644
index 0000000..f31b86b
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/ConjunctionNearSpans.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.spans;
+
+import java.util.List;
+
+import org.apache.lucene.search.similarities.Similarity;
+
+/**
+ * Spans that are all present within a given slop.
+ *
+ * @lucene.experimental
+ */
+public abstract class ConjunctionNearSpans extends ConjunctionSpans {
+ /** Stored for users of these spans; not used within this class itself. */
+ protected final Similarity.SimScorer simScorer;
+
+ /**
+ * @param subSpans The sub spans, passed on to the super class.
+ * @param simScorer Stored in {@link #simScorer}.
+ */
+ public ConjunctionNearSpans(List<Spans> subSpans, Similarity.SimScorer simScorer) {
+ super(subSpans);
+ this.simScorer = simScorer;
+ }
+
+ /** Compute the slop of the current match. */
+ public abstract int currentSlop();
+}
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/ConjunctionNearSpansDocScorer.java b/lucene/core/src/java/org/apache/lucene/search/spans/ConjunctionNearSpansDocScorer.java
new file mode 100644
index 0000000..1de1cdf
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/ConjunctionNearSpansDocScorer.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.spans;
+
+import java.io.IOException;
+import java.util.ArrayList;
+
+import org.apache.lucene.search.similarities.Similarity.SimScorer;
+
+/**
+ * For {@link SpansTreeQuery}. Public for extension.
+ * <p>
+ * Forwards the scoring calls for a {@link ConjunctionNearSpans} to one
+ * {@link SpansDocScorer} per sub spans.
+ *
+ * @lucene.experimental
+ */
+public class ConjunctionNearSpansDocScorer extends SpansDocScorer<ConjunctionNearSpans> {
+  protected final SimScorer simScorer;
+  protected final ArrayList<SpansDocScorer<?>> subSpansDocScorers;
+
+  /** Create a ConjunctionNearSpansDocScorer for a ConjunctionNearSpans and its subspans.
+   * The scorers of the subspans are obtained from {@link SpansTreeScorer#createSpansDocScorer}.
+   */
+  public ConjunctionNearSpansDocScorer(
+              SpansTreeScorer spansTreeScorer,
+              ConjunctionNearSpans nearSpans)
+  {
+    super(nearSpans);
+    this.simScorer = nearSpans.simScorer;
+    final Spans[] children = nearSpans.getSubSpans();
+    this.subSpansDocScorers = new ArrayList<>(children.length);
+    for (int i = 0; i < children.length; i++) {
+      subSpansDocScorers.add(spansTreeScorer.createSpansDocScorer(children[i]));
+    }
+  }
+
+  /** Begin the current document on every subspans scorer. */
+  @Override
+  public void beginDoc() throws IOException {
+    final int count = subSpansDocScorers.size();
+    for (int i = 0; i < count; i++) {
+      subSpansDocScorers.get(i).beginDoc();
+    }
+  }
+
+  /** Record a matching occurrence for all subspans.
+   * The slop factor passed down is the given slopFactor multiplied by
+   * the slop factor of {@link ConjunctionNearSpans#currentSlop}.
+   */
+  @Override
+  public void recordMatch(double slopFactor) {
+    // a negative slop is clamped to zero to avoid an infinite local slop factor
+    final int clampedSlop = Math.max(spans.currentSlop(), 0);
+    final double combinedSlopFactor = slopFactor * simScorer.computeSlopFactor(clampedSlop);
+    final int count = subSpansDocScorers.size();
+    for (int i = 0; i < count; i++) {
+      subSpansDocScorers.get(i).recordMatch(combinedSlopFactor);
+    }
+  }
+
+  /** Return the sum of the matching frequencies of the subspans. */
+  @Override
+  public int docMatchFreq() {
+    int total = 0;
+    final int count = subSpansDocScorers.size();
+    for (int i = 0; i < count; i++) {
+      total += subSpansDocScorers.get(i).docMatchFreq();
+    }
+    return total;
+  }
+
+  /** Return the sum of document scores of the subspans. */
+  @Override
+  public double docScore() throws IOException {
+    double total = 0;
+    final int count = subSpansDocScorers.size();
+    for (int i = 0; i < count; i++) {
+      total += subSpansDocScorers.get(i).docScore();
+    }
+    return total;
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionNearSpans.java b/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionNearSpans.java
new file mode 100644
index 0000000..681cfc6
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionNearSpans.java
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.spans;
+
+import java.util.List;
+
+import org.apache.lucene.search.similarities.Similarity.SimScorer;
+
+/**
+ * A DisjunctionSpans that also provides a slop for each match.
+ *
+ * See also {@link SpanOrQuery#SpanOrQuery(int, SpanQuery...)}.
+ *
+ * @lucene.experimental
+ */
+public class DisjunctionNearSpans extends DisjunctionSpans {
+ /** Upper bound on the slop returned by {@link #currentSlop()}. */
+ protected final int maxDistance;
+ /** Stored for users of these spans; not used within this class itself. */
+ protected final SimScorer simScorer;
+
+ /** Construct a DisjunctionNearSpans.
+ * @param spanOrQuery The query that provides the subSpans.
+ * @param subSpans Over which the disjunction is to be taken.
+ * @param maxDistance The maximum distance to be returned as the current match slop.
+ * @param simScorer For computing the slop factor from the slop.
+ */
+ public DisjunctionNearSpans(
+ SpanOrQuery spanOrQuery,
+ List<Spans> subSpans,
+ int maxDistance,
+ SimScorer simScorer)
+ {
+ super(spanOrQuery, subSpans);
+ this.maxDistance = maxDistance;
+ this.simScorer = simScorer;
+ }
+
+ // State below is maintained by currentSlop() from one call to the next.
+
+ // Result of the most recent currentSlop() call.
+ int currentSlop;
+ // Document of the most recent currentSlop() call, -1 before the first call.
+ int lastDoc = -1;
+
+ // First sub spans at the previous currentSlop() call in the current document, null at the first match.
+ Spans prevFirstSpans;
+ // End position of prevFirstSpans as seen at that previous call.
+ int prevFirstSpansEndPosition;
+ // End position of the last earlier first sub spans that differs from the current one,
+ // -1 while there is none in the current document.
+ int lastDifferentSpansEndPosition;
+
+
+ /**
+ * Compute the minimum slop between the currently matching
+ * sub spans and the previous and next matching other sub spans.
+ * When this slop is bigger than maxDistance
+ * or no other matching spans is available, return maxDistance.
+ * <br>
+ * The slop is computed from the end of a spans to the beginning
+ * of the following different one. When this is negative, zero is used.
+ * <br>
+ * When this method is used in a document, it must be called once at each match
+ * in the document.
+ * <br>
+ * See also {@link DisjunctionNearSpansDocScorer}.
+ */
+ public int currentSlop() {
+ Spans firstSpans = byPositionQueue.top();
+ assert firstSpans.startPosition() != NO_MORE_POSITIONS; // at a disjunction match
+
+ int currentDoc = docID();
+ if (lastDoc != currentDoc) { // at first match in currentDoc
+ // forget the state of the previous document
+ lastDoc = currentDoc;
+ prevFirstSpans = null;
+ lastDifferentSpansEndPosition = -1;
+ }
+
+ int firstSpansEndPosition = firstSpans.endPosition(); // avoid calling more than once below, no spans is moved here.
+
+ // slop from the end of the preceding different sub spans to the start of the current one
+ int slopBefore;
+ if (prevFirstSpans == null) { // at first match in currentDoc
+ slopBefore = maxDistance;
+ } else if (prevFirstSpans == firstSpans) { // sequence of same subspans.
+ if (lastDifferentSpansEndPosition == -1) { // initial sequence of same subspans
+ slopBefore = maxDistance;
+ } else { // later sequence of same subspans
+ slopBefore = Math.max(0, firstSpans.startPosition() - lastDifferentSpansEndPosition);
+ slopBefore = Math.min(slopBefore, maxDistance);
+ }
+ } else { // first spans is different from previous spans
+ slopBefore = Math.max(0, firstSpans.startPosition() - prevFirstSpansEndPosition);
+ slopBefore = Math.min(slopBefore, maxDistance);
+ lastDifferentSpansEndPosition = prevFirstSpansEndPosition;
+ }
+ // remember the current first sub spans for the next call
+ prevFirstSpans = firstSpans;
+ prevFirstSpansEndPosition = firstSpansEndPosition;
+
+ // slop from the end of the current sub spans to the start of the second one in the position queue
+ int slopAfter;
+ if (byPositionQueue.size() == 1) { // no other spans at this document
+ slopAfter = maxDistance;
+ } else {
+ Spans secondSpans = byPositionQueue.subTop();
+ assert secondSpans != null; // byPositionQueue.size() >= 2
+ assert secondSpans != firstSpans;
+ if (secondSpans.startPosition() == NO_MORE_POSITIONS) { // second exhausted in current doc
+ slopAfter = maxDistance;
+ } else {
+ slopAfter = Math.max(0, secondSpans.startPosition() - firstSpansEndPosition);
+ slopAfter = Math.min(slopAfter, maxDistance);
+ }
+ }
+
+ currentSlop = Math.min(slopBefore, slopAfter);
+ return currentSlop;
+ }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionNearSpansDocScorer.java b/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionNearSpansDocScorer.java
new file mode 100644
index 0000000..5f5a4da
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionNearSpansDocScorer.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.spans;
+
+import org.apache.lucene.search.similarities.Similarity.SimScorer;
+
+
+/**
+ * For {@link SpansTreeQuery}. Public for extension.
+ * <p>
+ * Scales the slop factor of each recorded match by the slop factor
+ * for the current slop of the {@link DisjunctionNearSpans}.
+ *
+ * @lucene.experimental
+ */
+public class DisjunctionNearSpansDocScorer
+      extends DisjunctionSpansDocScorer<DisjunctionNearSpans> {
+  protected final SimScorer simScorer;
+
+  /** Create a scorer that takes its SimScorer from the given spans. */
+  public DisjunctionNearSpansDocScorer(
+              SpansTreeScorer spansTreeScorer,
+              DisjunctionNearSpans orNearSpans)
+  {
+    super(spansTreeScorer, orNearSpans);
+    this.simScorer = orNearSpans.simScorer;
+  }
+
+  /** Record a match for the subspans at the first position.
+   * The slop factor passed on is the given slopFactor multiplied by
+   * the slop factor of {@link DisjunctionNearSpans#currentSlop}.
+   */
+  @Override
+  public void recordMatch(double slopFactor) {
+    final double localSlopFactor = simScorer.computeSlopFactor(spans.currentSlop());
+    super.recordMatch(slopFactor * localSlopFactor);
+  }
+
+}
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionSpans.java b/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionSpans.java
new file mode 100644
index 0000000..2e5d271
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionSpans.java
@@ -0,0 +1,261 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.spans;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.lucene.search.DisiPriorityQueue;
+import org.apache.lucene.search.DisiWrapper;
+import org.apache.lucene.search.TwoPhaseIterator;
+import org.apache.lucene.search.DisjunctionDISIApproximation;
+
+
+/**
+ * A spans that merges given spans.
+ * <p>
+ * The sub spans are kept in two queues: one over all sub spans for moving
+ * between documents, and one over the sub spans at the current document
+ * for moving between positions.
+ *
+ * @lucene.experimental
+ */
+public class DisjunctionSpans extends Spans {
+ /** Only used by {@link #toString()}. */
+ protected final SpanQuery spanQuery;
+ /** The sub spans as given to the constructor. */
+ protected final List<Spans> subSpans;
+ /** All sub spans, each wrapped in a DisiWrapper; its top provides the current document. */
+ protected final DisiPriorityQueue byDocQueue;
+ /** The sub spans used for positions at the current document; filled at the first nextStartPosition(). */
+ protected final SpanPositionQueue byPositionQueue;
+ /** The sub spans providing the current position, null when not yet positioned in the current document. */
+ protected Spans topPositionSpans;
+
+ /** Construct a DisjunctionSpans.
+ * @param spanQuery The query that provides the subSpans.
+ * @param subSpans Over which the disjunction is to be taken.
+ */
+ public DisjunctionSpans(SpanQuery spanQuery, List<Spans> subSpans) {
+ this.spanQuery = spanQuery; // for toString() only
+ this.subSpans = subSpans;
+ byDocQueue = new DisiPriorityQueue(subSpans.size());
+ for (Spans spans : subSpans) {
+ byDocQueue.add(new DisiWrapper(spans));
+ }
+ byPositionQueue = new SpanPositionQueue(subSpans.size()); // when empty use -1
+ topPositionSpans = null;
+ }
+
+
+ /** For {@link DisjunctionSpansDocScorer}. */
+ public List<Spans> subSpans() {
+ return subSpans;
+ }
+
+ /** For {@link DisjunctionSpansDocScorer}. */
+ public void extractSubSpansAtCurrentDoc(List<Spans> spansList) {
+ byPositionQueue.extractSpansList(spansList);
+ }
+
+ /** For {@link DisjunctionSpansDocScorer}. */
+ public Spans getFirstPositionSpans() {
+ return byPositionQueue.top();
+ }
+
+ /** Move every sub spans that is at the current document to its next document,
+ * and return the document of the top of the document queue.
+ */
+ @Override
+ public int nextDoc() throws IOException {
+ topPositionSpans = null;
+ DisiWrapper topDocSpans = byDocQueue.top();
+ int currentDoc = topDocSpans.doc;
+ do {
+ topDocSpans.doc = topDocSpans.iterator.nextDoc();
+ topDocSpans = byDocQueue.updateTop();
+ } while (topDocSpans.doc == currentDoc);
+ return topDocSpans.doc;
+ }
+
+ /** Advance the top of the document queue until its document is at least target,
+ * and return that document.
+ */
+ @Override
+ public int advance(int target) throws IOException {
+ topPositionSpans = null;
+ DisiWrapper topDocSpans = byDocQueue.top();
+ do {
+ topDocSpans.doc = topDocSpans.iterator.advance(target);
+ topDocSpans = byDocQueue.updateTop();
+ } while (topDocSpans.doc < target);
+ return topDocSpans.doc;
+ }
+
+ /** The document of the top of the document queue. */
+ @Override
+ public int docID() {
+ DisiWrapper topDocSpans = byDocQueue.top();
+ return topDocSpans.doc;
+ }
+
+ /** Return a two phase iterator over the sub spans, or null when no sub spans
+ * has a two phase view. In the null case the positions cost is computed here.
+ */
+ @Override
+ public TwoPhaseIterator asTwoPhaseIterator() {
+ float sumMatchCost = 0; // See also DisjunctionScorer.asTwoPhaseIterator()
+ long sumApproxCost = 0;
+
+ for (DisiWrapper w : byDocQueue) {
+ if (w.twoPhaseView != null) {
+ // weigh each match cost by the cost of its sub spans, using at least 1
+ long costWeight = (w.cost <= 1) ? 1 : w.cost;
+ sumMatchCost += w.twoPhaseView.matchCost() * costWeight;
+ sumApproxCost += costWeight;
+ }
+ }
+
+ if (sumApproxCost == 0) { // no sub spans supports approximations
+ computePositionsCost();
+ return null;
+ }
+
+ // weighted average over the sub spans that have a two phase view
+ final float matchCost = sumMatchCost / sumApproxCost;
+
+ return new TwoPhaseIterator(new DisjunctionDISIApproximation(byDocQueue)) {
+ @Override
+ public boolean matches() throws IOException {
+ return twoPhaseCurrentDocMatches();
+ }
+
+ @Override
+ public float matchCost() {
+ return matchCost;
+ }
+ };
+ }
+
+ // Remains -1 until computePositionsCost() is called.
+ float positionsCost = -1;
+
+ // Set positionsCost to the average of the sub spans positions costs, weighted by their cost.
+ // NOTE(review): with an empty byDocQueue sumCost stays 0 and the division yields NaN - confirm this cannot occur.
+ void computePositionsCost() {
+ float sumPositionsCost = 0;
+ long sumCost = 0;
+ for (DisiWrapper w : byDocQueue) {
+ long costWeight = (w.cost <= 1) ? 1 : w.cost;
+ sumPositionsCost += w.spans.positionsCost() * costWeight;
+ sumCost += costWeight;
+ }
+ positionsCost = sumPositionsCost / sumCost;
+ }
+
+ @Override
+ public float positionsCost() {
+ // This may be called when asTwoPhaseIterator returned null,
+ // which happens when none of the sub spans supports approximations.
+ assert positionsCost > 0;
+ return positionsCost;
+ }
+
+ // Document for which twoPhaseCurrentDocMatches() last returned true, -1 when it never did.
+ int lastDocTwoPhaseMatched = -1;
+
+ // Check the sub spans at the current document in list order until one matches.
+ // The outcome of each matches() call is recorded in the wrapper for use by fillPositionQueue().
+ // A sub spans without a two phase view ends the search and counts as a match.
+ boolean twoPhaseCurrentDocMatches() throws IOException {
+ DisiWrapper listAtCurrentDoc = byDocQueue.topList();
+ // remove the head of the list as long as it does not match
+ final int currentDoc = listAtCurrentDoc.doc;
+ while (listAtCurrentDoc.twoPhaseView != null) {
+ if (listAtCurrentDoc.twoPhaseView.matches()) {
+ // use this spans for positions at current doc:
+ listAtCurrentDoc.lastApproxMatchDoc = currentDoc;
+ break;
+ }
+ // do not use this spans for positions at current doc:
+ listAtCurrentDoc.lastApproxNonMatchDoc = currentDoc;
+ listAtCurrentDoc = listAtCurrentDoc.next;
+ if (listAtCurrentDoc == null) {
+ return false;
+ }
+ }
+ lastDocTwoPhaseMatched = currentDoc;
+ topPositionSpans = null;
+ return true;
+ }
+
+ // Move each matching sub spans at the current document to its first position
+ // and add it to byPositionQueue.
+ void fillPositionQueue() throws IOException { // called at first nextStartPosition
+ assert byPositionQueue.size() == 0;
+ // add all matching Spans at current doc to byPositionQueue
+ DisiWrapper listAtCurrentDoc = byDocQueue.topList();
+ while (listAtCurrentDoc != null) {
+ Spans spansAtDoc = listAtCurrentDoc.spans;
+ if (lastDocTwoPhaseMatched == listAtCurrentDoc.doc) { // matched by DisjunctionDisiApproximation
+ if (listAtCurrentDoc.twoPhaseView != null) { // matched by approximation
+ if (listAtCurrentDoc.lastApproxNonMatchDoc == listAtCurrentDoc.doc) { // matches() returned false
+ spansAtDoc = null;
+ } else {
+ // matches() was not yet called for this sub spans at this document, call it now
+ if (listAtCurrentDoc.lastApproxMatchDoc != listAtCurrentDoc.doc) {
+ if (!listAtCurrentDoc.twoPhaseView.matches()) {
+ spansAtDoc = null;
+ }
+ }
+ }
+ }
+ }
+
+ if (spansAtDoc != null) {
+ assert spansAtDoc.docID() == listAtCurrentDoc.doc;
+ assert spansAtDoc.startPosition() == -1;
+ spansAtDoc.nextStartPosition();
+ assert spansAtDoc.startPosition() != NO_MORE_POSITIONS;
+ byPositionQueue.add(spansAtDoc);
+ }
+ listAtCurrentDoc = listAtCurrentDoc.next;
+ }
+ assert byPositionQueue.size() > 0;
+ }
+
+ /** At the first call in a document fill the position queue, otherwise move
+ * the current top sub spans to its next position. Return the start position
+ * of the resulting top of the position queue.
+ */
+ @Override
+ public int nextStartPosition() throws IOException {
+ if (topPositionSpans == null) {
+ byPositionQueue.clear();
+ fillPositionQueue(); // fills byPositionQueue at first position
+ topPositionSpans = byPositionQueue.top();
+ } else {
+ topPositionSpans.nextStartPosition();
+ topPositionSpans = byPositionQueue.updateTop();
+ }
+ return topPositionSpans.startPosition();
+ }
+
+ /** The start position of the current sub spans, -1 when not yet positioned in the current document. */
+ @Override
+ public int startPosition() {
+ return topPositionSpans == null ? -1 : topPositionSpans.startPosition();
+ }
+
+ /** The end position of the current sub spans, -1 when not yet positioned in the current document. */
+ @Override
+ public int endPosition() {
+ return topPositionSpans == null ? -1 : topPositionSpans.endPosition();
+ }
+
+ // NOTE(review): unlike startPosition() and endPosition() there is no null check here,
+ // so this throws a NullPointerException when not yet positioned in the current document.
+ @Override
+ public int width() {
+ return topPositionSpans.width();
+ }
+
+ /** Collect from the current sub spans, nothing is collected when not yet positioned. */
+ @Override
+ public void collect(SpanCollector collector) throws IOException {
+ if (topPositionSpans != null)
+ topPositionSpans.collect(collector);
+ }
+
+ @Override
+ public String toString() {
+ return "DisjunctionSpans(" + spanQuery + ")@" + docID() + ": " + startPosition() + " - " + endPosition();
+ }
+
+ // Cached result of cost(), -1 while not yet computed.
+ long cost = -1;
+
+ /** The sum of the costs of the sub spans, computed at the first call. */
+ @Override
+ public long cost() {
+ if (cost == -1) {
+ cost = 0;
+ for (Spans spans : subSpans) {
+ cost += spans.cost();
+ }
+ }
+ return cost;
+ }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionSpansDocScorer.java b/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionSpansDocScorer.java
new file mode 100644
index 0000000..4831580
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/DisjunctionSpansDocScorer.java
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.spans;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.ArrayList;
+
+/**
+ * Document scorer for a {@link DisjunctionSpans}, for use by {@link SpansTreeQuery}.
+ * Matches are passed on to the scorer of the sub-spans at the first position;
+ * the match frequency and the score of a document are sums over the sub-spans
+ * extracted for that document. Public for extension.
+ *
+ * @lucene.experimental
+ */
+public class DisjunctionSpansDocScorer<DisjunctionSpansT extends DisjunctionSpans>
+      extends SpansDocScorer<DisjunctionSpansT> {
+  /** The sub-spans extracted for the current document; refilled by {@link #beginDoc}. */
+  protected final ArrayList<Spans> subSpansAtDoc;
+
+  /** Create a DisjunctionSpansDocScorer for a DisjunctionSpans and its subspans.
+   * For the subspans use {@link SpansTreeScorer#createSpansDocScorer}.
+   * The value returned by that call is not kept here; presumably the call attaches
+   * the scorer to the sub-spans itself (TODO confirm).
+   */
+  public DisjunctionSpansDocScorer(
+              SpansTreeScorer spansTreeScorer,
+              DisjunctionSpansT orSpans)
+  {
+    super(orSpans);
+    List<Spans> allSubSpans = orSpans.subSpans();
+    for (Spans sub : allSubSpans) {
+      spansTreeScorer.createSpansDocScorer(sub);
+    }
+    this.subSpansAtDoc = new ArrayList<>(allSubSpans.size());
+  }
+
+  /** Collect the sub-spans at the current document and begin the document on each of their scorers. */
+  @Override
+  public void beginDoc() throws IOException {
+    subSpansAtDoc.clear();
+    spans.extractSubSpansAtCurrentDoc(subSpansAtDoc);
+    assert subSpansAtDoc.size() > 0 : "empty subSpansAtDoc docID=" + docID();
+    for (Spans sub : subSpansAtDoc) {
+      sub.spansDocScorer.beginDoc();
+    }
+  }
+
+  /** Record a match with the given slop factor for the subspans at the first position. */
+  @Override
+  public void recordMatch(double slopFactor) {
+    Spans atFirstPosition = spans.getFirstPositionSpans();
+    assert subSpansAtDoc.contains(atFirstPosition);
+    atFirstPosition.spansDocScorer.recordMatch(slopFactor);
+  }
+
+  /** Return the sum of the matching frequencies of the subspans. */
+  @Override
+  public int docMatchFreq() {
+    int totalFreq = 0;
+    for (Spans sub : subSpansAtDoc) {
+      totalFreq += sub.spansDocScorer.docMatchFreq();
+    }
+    return totalFreq;
+  }
+
+  /** Return the sum of document scores of the subspans. */
+  @Override
+  public double docScore() throws IOException {
+    double totalScore = 0;
+    for (Spans sub : subSpansAtDoc) {
+      totalScore += sub.spansDocScorer.docScore();
+    }
+    return totalScore;
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java b/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java
index f405473..257999e 100644
--- a/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java
@@ -20,6 +20,8 @@ package org.apache.lucene.search.spans;
import java.io.IOException;
import java.util.List;
+import org.apache.lucene.search.similarities.Similarity;
+
/**
* A Spans that is formed from the ordered subspans of a SpanNearQuery
* where the subspans do not overlap and have a maximum slop between them.
@@ -42,7 +44,7 @@ import java.util.List;
* Expert:
* Only public for subclassing. Most implementations should not need this class
*/
-public class NearSpansOrdered extends ConjunctionSpans {
+public class NearSpansOrdered extends ConjunctionNearSpans {
protected int matchStart = -1;
protected int matchEnd = -1;
@@ -50,8 +52,12 @@ public class NearSpansOrdered extends ConjunctionSpans {
private final int allowedSlop;
- public NearSpansOrdered(int allowedSlop, List<Spans> subSpans) throws IOException {
- super(subSpans);
+ public NearSpansOrdered(
+ int allowedSlop,
+ List<Spans> subSpans,
+ Similarity.SimScorer simScorer) throws IOException
+ {
+ super(subSpans, simScorer);
this.atFirstInCurrentDoc = true; // -1 startPosition/endPosition also at doc -1
this.allowedSlop = allowedSlop;
}
@@ -144,6 +150,11 @@ public class NearSpansOrdered extends ConjunctionSpans {
}
@Override
+ public int currentSlop() {
+ return matchWidth;
+ }
+
+ @Override
public void collect(SpanCollector collector) throws IOException {
for (Spans span : subSpans) {
span.collect(collector);
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java b/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java
index c3402bc..c14f2fa 100644
--- a/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java
@@ -23,6 +23,7 @@ import java.util.List;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.util.PriorityQueue;
+import org.apache.lucene.search.similarities.Similarity;
/**
* Similar to {@link NearSpansOrdered}, but for the unordered case.
@@ -30,17 +31,19 @@ import org.apache.lucene.util.PriorityQueue;
* Expert:
* Only public for subclassing. Most implementations should not need this class
*/
-public class NearSpansUnordered extends ConjunctionSpans {
+public class NearSpansUnordered extends ConjunctionNearSpans {
private List<SpansCell> subSpanCells; // in query order
private final int allowedSlop;
private SpanPositionQueue spanPositionQueue;
- public NearSpansUnordered(int allowedSlop, List<Spans> subSpans)
- throws IOException {
- super(subSpans);
-
+ public NearSpansUnordered(
+ int allowedSlop,
+ List<Spans> subSpans,
+ Similarity.SimScorer simScorer) throws IOException
+ {
+ super(subSpans, simScorer);
this.subSpanCells = new ArrayList<>(subSpans.size());
for (Spans subSpan : subSpans) { // sub spans in query order
this.subSpanCells.add(new SpansCell(subSpan));
@@ -190,9 +193,14 @@ public class NearSpansUnordered extends ConjunctionSpans {
return spanPositionQueue.top();
}
+ /**
+  * Return the slop at the current positions: the end position of maxEndPositionCell
+  * minus the start position of the minimum position cell, minus totalSpanLength.
+  * atMatch() compares this value against allowedSlop.
+  */
+ @Override
+ public int currentSlop() {
+ return maxEndPositionCell.endPosition() - minPositionCell().startPosition() - totalSpanLength;
+ }
+
private boolean atMatch() {
assert minPositionCell().docID() == maxEndPositionCell.docID();
- return (maxEndPositionCell.endPosition() - minPositionCell().startPosition() - totalSpanLength) <= allowedSlop;
+ return currentSlop() <= allowedSlop;
}
@Override
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanNearQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanNearQuery.java
index 7958f47..75d14fe 100644
--- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanNearQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanNearQuery.java
@@ -48,6 +48,7 @@ public class SpanNearQuery extends SpanQuery implements Cloneable {
private final String field;
private final List<SpanQuery> clauses = new LinkedList<>();
private int slop;
+ private int nonMatchSlop = -1;
/**
* Construct a new builder
@@ -88,10 +89,20 @@ public class SpanNearQuery extends SpanQuery implements Cloneable {
}
/**
+ * Set the non match slop for this query
+ */
+ public Builder setNonMatchSlop(int nonMatchSlop) {
+ this.nonMatchSlop = nonMatchSlop;
+ return this;
+ }
+
+ /**
* Build the query
*/
public SpanNearQuery build() {
- return new SpanNearQuery(clauses.toArray(new SpanQuery[clauses.size()]), slop, ordered);
+ return (nonMatchSlop == -1)
+ ? new SpanNearQuery(clauses.toArray(new SpanQuery[clauses.size()]), slop, ordered)
+ : new SpanNearQuery(clauses.toArray(new SpanQuery[clauses.size()]), slop, ordered, nonMatchSlop);
}
}
@@ -113,9 +124,21 @@ public class SpanNearQuery extends SpanQuery implements Cloneable {
protected List<SpanQuery> clauses;
protected int slop;
protected boolean inOrder;
+ protected int nonMatchSlop;
protected String field;
+ /**
+ * Construct a SpanNearQuery.
+ * See {@link SpanNearQuery#SpanNearQuery(SpanQuery[], int, boolean, int)}
+ * for the first three parameters.
+ * This will use <code>Integer.MAX_VALUE-1</code> for the non matching slop.
+ * @param clausesIn the clauses to find near each other, in the same field
+ * @param slop the allowed slop
+ * @param inOrder true if order is important
+ */
+ public SpanNearQuery(SpanQuery[] clausesIn, int slop, boolean inOrder) {
+ // Integer.MAX_VALUE causes overflow in sloppyFreq which adds 1.
+ this(clausesIn, slop, inOrder, Integer.MAX_VALUE-1);
+ }
+
/** Construct a SpanNearQuery. Matches spans matching a span from each
* clause, with up to <code>slop</code> total unmatched positions between
* them.
@@ -124,10 +147,30 @@ public class SpanNearQuery extends SpanQuery implements Cloneable {
* <br>When <code>inOrder</code> is false, the spans from each clause
* need not be ordered and may overlap.
* @param clausesIn the clauses to find near each other, in the same field, at least 2.
- * @param slop The slop value
+ * @param slop The allowed slop. This should be non negative and at most Integer.MAX_VALUE-1.
* @param inOrder true if order is important
+ * @param nonMatchSlop
+ * The distance for determining the slop factor to be used for non matching
+ * occurrences. This is used for scoring by {@link SpansTreeQuery}, and it
+ * should not be smaller than <code>slop</code>.
+ * <br>
+ * Smaller values of <code>nonMatchSlop</code> will increase the
+ * score contribution of non matching occurrences
+ * via {@link org.apache.lucene.search.similarities.Similarity.SimScorer#computeSlopFactor}.
+ * <br>
+ * Smaller values may lead to a scoring inconsistency between two span near queries
+ * that only differ in the allowed slop.
+ * For example consider query A with a smaller allowed slop and query B with a larger one.
+ * For query B there can be more matches, and these should increase the score of B
+ * when compared to the score of A.
+ * For each extra match at B, the non matching score for query A should be lower than
+ * the matching score for query B.
+ * <br>
+ * To have consistent scoring between two such queries, choose
+ * a non matching scoring distance that is larger than the largest allowed distance,
+ * and provide that to both queries.
*/
- public SpanNearQuery(SpanQuery[] clausesIn, int slop, boolean inOrder) {
+ public SpanNearQuery(SpanQuery[] clausesIn, int slop, boolean inOrder, int nonMatchSlop) {
this.clauses = new ArrayList<>(clausesIn.length);
for (SpanQuery clause : clausesIn) {
if (this.field == null) { // check field
@@ -137,8 +180,14 @@ public class SpanNearQuery extends SpanQuery implements Cloneable {
}
this.clauses.add(clause);
}
- this.slop = slop;
+ if (nonMatchSlop != -1) {
+ if (nonMatchSlop < slop) {
+ throw new IllegalArgumentException("nonMatchSlop < slop: " + nonMatchSlop + " < " + slop);
+ }
+ }
this.inOrder = inOrder;
+ this.slop = slop;
+ this.nonMatchSlop = nonMatchSlop;
}
/** Return the clauses whose spans are matched. */
@@ -152,6 +201,9 @@ public class SpanNearQuery extends SpanQuery implements Cloneable {
/** Return true if matches are required to be in-order.*/
public boolean isInOrder() { return inOrder; }
+ /** Return the slop used for scoring non matching occurrences.
+ * This is the value given to the four argument constructor;
+ * the three argument constructor passes <code>Integer.MAX_VALUE-1</code>.
+ */
+ public int getNonMatchSlop() { return nonMatchSlop; }
+
@Override
public String getField() { return field; }
@@ -171,6 +223,8 @@ public class SpanNearQuery extends SpanQuery implements Cloneable {
buffer.append(slop);
buffer.append(", ");
buffer.append(inOrder);
+ buffer.append(", ");
+ buffer.append(nonMatchSlop);
buffer.append(")");
return buffer.toString();
}
@@ -179,7 +233,7 @@ public class SpanNearQuery extends SpanQuery implements Cloneable {
public SpanWeight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
List<SpanWeight> subWeights = new ArrayList<>();
for (SpanQuery q : clauses) {
- subWeights.add(q.createWeight(searcher, false, boost));
+ subWeights.add(q.createWeight(searcher, needsScores, boost));
}
return new SpanNearWeight(subWeights, searcher, needsScores ? getTermContexts(subWeights) : null, boost);
}
@@ -219,8 +273,8 @@ public class SpanNearQuery extends SpanQuery implements Cloneable {
}
// all NearSpans require at least two subSpans
- return (!inOrder) ? new NearSpansUnordered(slop, subSpans)
- : new NearSpansOrdered(slop, subSpans);
+ return (!inOrder) ? new NearSpansUnordered(slop, subSpans, getSimScorer(context))
+ : new NearSpansOrdered(slop, subSpans, getSimScorer(context));
}
@Override
@@ -262,6 +316,7 @@ public class SpanNearQuery extends SpanQuery implements Cloneable {
private boolean equalsTo(SpanNearQuery other) {
return inOrder == other.inOrder &&
slop == other.slop &&
+ nonMatchSlop == other.nonMatchSlop &&
clauses.equals(other.clauses);
}
@@ -270,6 +325,7 @@ public class SpanNearQuery extends SpanQuery implements Cloneable {
int result = classHash();
result ^= clauses.hashCode();
result += slop;
+ result ^= 4 * nonMatchSlop;
int fac = 1 + (inOrder ? 8 : 4);
return fac * result;
}
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java
index 00bcc4c..7441319 100644
--- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java
@@ -98,7 +98,7 @@ public final class SpanNotQuery extends SpanQuery {
@Override
public SpanWeight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
- SpanWeight includeWeight = include.createWeight(searcher, false, boost);
+ SpanWeight includeWeight = include.createWeight(searcher, needsScores, boost);
SpanWeight excludeWeight = exclude.createWeight(searcher, false, boost);
return new SpanNotWeight(searcher, needsScores ? getTermContexts(includeWeight, excludeWeight) : null,
includeWeight, excludeWeight, boost);
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanOrQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanOrQuery.java
index 15abc7d..264f1c8 100644
--- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanOrQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanOrQuery.java
@@ -16,7 +16,6 @@
*/
package org.apache.lucene.search.spans;
-
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
@@ -28,25 +27,46 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
-import org.apache.lucene.search.DisiPriorityQueue;
-import org.apache.lucene.search.DisiWrapper;
-import org.apache.lucene.search.DisjunctionDISIApproximation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
-import org.apache.lucene.search.TwoPhaseIterator;
+import org.apache.lucene.search.similarities.Similarity.SimScorer;
/** Matches the union of its clauses.
*/
public final class SpanOrQuery extends SpanQuery {
private List<SpanQuery> clauses;
private String field;
+ private final int maxDistance;
/** Construct a SpanOrQuery merging the provided clauses.
* All clauses must have the same field.
*/
public SpanOrQuery(SpanQuery... clauses) {
this.clauses = new ArrayList<>(clauses.length);
+ this.maxDistance = -1;
+ for (SpanQuery seq : clauses) {
+ addClause(seq);
+ }
+ }
+
+ /** Construct a SpanOrQuery merging the provided clauses
+ * with the scoring depending on the distances between the successive clauses.
+ * All clauses must have the same field.
+ * The non negative maxDistance is used for scoring the successive occurrences
+ * of the different clauses. When the actual distance is larger than this, or when
+ * no other clause is present, maxDistance determines the slop factor.
+ * Otherwise each clause occurrence is scored with a slop factor determined
+ * by the minimum distance to the occurrence of another clause.
+ * <br>
+ * This scoring works only when wrapped in a {@link SpansTreeQuery}.
+ */
+ public SpanOrQuery(int maxDistance, SpanQuery... clauses) {
+ this.clauses = new ArrayList<>(clauses.length);
+ this.maxDistance = maxDistance;
+ if (maxDistance < 0) {
+ throw new IllegalArgumentException("maxDistance must be non negative: " + maxDistance);
+ }
for (SpanQuery seq : clauses) {
addClause(seq);
}
@@ -67,6 +87,11 @@ public final class SpanOrQuery extends SpanQuery {
return clauses.toArray(new SpanQuery[clauses.size()]);
}
+ /** Return the maximum distance used to determine a slop factor for a clause occurrence.
+ * When no maximum distance was given, -1 is returned.
+ * @return the maxDistance given at construction, or -1
+ */
+ public int getMaxDistance() { return maxDistance; }
+
@Override
public String getField() { return field; }
@@ -89,7 +114,13 @@ public final class SpanOrQuery extends SpanQuery {
@Override
public String toString(String field) {
StringBuilder buffer = new StringBuilder();
- buffer.append("spanOr([");
+ buffer.append("spanOr(");
+ if (maxDistance != -1) {
+ buffer.append("maxDistance=");
+ buffer.append(maxDistance);
+ buffer.append(", ");
+ }
+ buffer.append("[");
Iterator<SpanQuery> i = clauses.iterator();
while (i.hasNext()) {
SpanQuery clause = i.next();
@@ -104,31 +135,47 @@ public final class SpanOrQuery extends SpanQuery {
@Override
public boolean equals(Object other) {
- return sameClassAs(other) &&
- clauses.equals(((SpanOrQuery) other).clauses);
+ return sameClassAs(other)
+ && maxDistance == ((SpanOrQuery) other).maxDistance
+ && clauses.equals(((SpanOrQuery) other).clauses);
}
@Override
public int hashCode() {
- return classHash() ^ clauses.hashCode();
+ return classHash() ^ clauses.hashCode() ^ (7 * maxDistance);
}
@Override
public SpanWeight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
List<SpanWeight> subWeights = new ArrayList<>(clauses.size());
for (SpanQuery q : clauses) {
- subWeights.add(q.createWeight(searcher, false, boost));
+ subWeights.add(q.createWeight(searcher, needsScores, boost));
}
- return new SpanOrWeight(searcher, needsScores ? getTermContexts(subWeights) : null, subWeights, boost);
+ return new SpanOrWeight(searcher,
+ needsScores ? getTermContexts(subWeights) : null,
+ subWeights,
+ needsScores,
+ boost);
}
public class SpanOrWeight extends SpanWeight {
final List<SpanWeight> subWeights;
-
- public SpanOrWeight(IndexSearcher searcher, Map<Term, TermContext> terms, List<SpanWeight> subWeights, float boost) throws IOException {
+ final IndexSearcher searcher;
+ final boolean needsScores;
+ final float boost;
+
+ public SpanOrWeight(IndexSearcher searcher,
+ Map<Term, TermContext> terms,
+ List<SpanWeight> subWeights,
+ boolean needsScores,
+ float boost) throws IOException
+ {
super(SpanOrQuery.this, searcher, terms, boost);
this.subWeights = subWeights;
+ this.searcher = searcher;
+ this.needsScores = needsScores;
+ this.boost = boost;
}
@Override
@@ -151,222 +198,35 @@ public final class SpanOrQuery extends SpanQuery {
ArrayList<Spans> subSpans = new ArrayList<>(clauses.size());
+ SpanWeight lastSpanWeight = null;
for (SpanWeight w : subWeights) {
Spans spans = w.getSpans(context, requiredPostings);
if (spans != null) {
subSpans.add(spans);
+ lastSpanWeight = w;
}
}
if (subSpans.size() == 0) {
return null;
} else if (subSpans.size() == 1) {
+ if (maxDistance == -1) {
return subSpans.get(0);
+ } else { // only weigh by slop factor of maxDistance
+ SimScorer simScorer = getSimScorer(context);
+ float maxDistanceSlop = simScorer.computeSlopFactor(maxDistance);
+ SpanQuery subQuery = (SpanQuery) lastSpanWeight.getQuery();
+ return subQuery.createWeight(searcher, needsScores, (boost * maxDistanceSlop))
+ .getSpans(context, requiredPostings);
}
-
- DisiPriorityQueue byDocQueue = new DisiPriorityQueue(subSpans.size());
- for (Spans spans : subSpans) {
- byDocQueue.add(new DisiWrapper(spans));
}
-
- SpanPositionQueue byPositionQueue = new SpanPositionQueue(subSpans.size()); // when empty use -1
-
- return new Spans() {
- Spans topPositionSpans = null;
-
- @Override
- public int nextDoc() throws IOException {
- topPositionSpans = null;
- DisiWrapper topDocSpans = byDocQueue.top();
- int currentDoc = topDocSpans.doc;
- do {
- topDocSpans.doc = topDocSpans.iterator.nextDoc();
- topDocSpans = byDocQueue.updateTop();
- } while (topDocSpans.doc == currentDoc);
- return topDocSpans.doc;
- }
-
- @Override
- public int advance(int target) throws IOException {
- topPositionSpans = null;
- DisiWrapper topDocSpans = byDocQueue.top();
- do {
- topDocSpans.doc = topDocSpans.iterator.advance(target);
- topDocSpans = byDocQueue.updateTop();
- } while (topDocSpans.doc < target);
- return topDocSpans.doc;
- }
-
- @Override
- public int docID() {
- DisiWrapper topDocSpans = byDocQueue.top();
- return topDocSpans.doc;
- }
-
- @Override
- public TwoPhaseIterator asTwoPhaseIterator() {
- float sumMatchCost = 0; // See also DisjunctionScorer.asTwoPhaseIterator()
- long sumApproxCost = 0;
-
- for (DisiWrapper w : byDocQueue) {
- if (w.twoPhaseView != null) {
- long costWeight = (w.cost <= 1) ? 1 : w.cost;
- sumMatchCost += w.twoPhaseView.matchCost() * costWeight;
- sumApproxCost += costWeight;
- }
- }
-
- if (sumApproxCost == 0) { // no sub spans supports approximations
- computePositionsCost();
- return null;
- }
-
- final float matchCost = sumMatchCost / sumApproxCost;
-
- return new TwoPhaseIterator(new DisjunctionDISIApproximation(byDocQueue)) {
- @Override
- public boolean matches() throws IOException {
- return twoPhaseCurrentDocMatches();
- }
-
- @Override
- public float matchCost() {
- return matchCost;
- }
- };
- }
-
- float positionsCost = -1;
-
- void computePositionsCost() {
- float sumPositionsCost = 0;
- long sumCost = 0;
- for (DisiWrapper w : byDocQueue) {
- long costWeight = (w.cost <= 1) ? 1 : w.cost;
- sumPositionsCost += w.spans.positionsCost() * costWeight;
- sumCost += costWeight;
- }
- positionsCost = sumPositionsCost / sumCost;
- }
-
- @Override
- public float positionsCost() {
- // This may be called when asTwoPhaseIterator returned null,
- // which happens when none of the sub spans supports approximations.
- assert positionsCost > 0;
- return positionsCost;
- }
-
- int lastDocTwoPhaseMatched = -1;
-
- boolean twoPhaseCurrentDocMatches() throws IOException {
- DisiWrapper listAtCurrentDoc = byDocQueue.topList();
- // remove the head of the list as long as it does not match
- final int currentDoc = listAtCurrentDoc.doc;
- while (listAtCurrentDoc.twoPhaseView != null) {
- if (listAtCurrentDoc.twoPhaseView.matches()) {
- // use this spans for positions at current doc:
- listAtCurrentDoc.lastApproxMatchDoc = currentDoc;
- break;
- }
- // do not use this spans for positions at current doc:
- listAtCurrentDoc.lastApproxNonMatchDoc = currentDoc;
- listAtCurrentDoc = listAtCurrentDoc.next;
- if (listAtCurrentDoc == null) {
- return false;
- }
- }
- lastDocTwoPhaseMatched = currentDoc;
- topPositionSpans = null;
- return true;
- }
-
- void fillPositionQueue() throws IOException { // called at first nextStartPosition
- assert byPositionQueue.size() == 0;
- // add all matching Spans at current doc to byPositionQueue
- DisiWrapper listAtCurrentDoc = byDocQueue.topList();
- while (listAtCurrentDoc != null) {
- Spans spansAtDoc = listAtCurrentDoc.spans;
- if (lastDocTwoPhaseMatched == listAtCurrentDoc.doc) { // matched by DisjunctionDisiApproximation
- if (listAtCurrentDoc.twoPhaseView != null) { // matched by approximation
- if (listAtCurrentDoc.lastApproxNonMatchDoc == listAtCurrentDoc.doc) { // matches() returned false
- spansAtDoc = null;
- } else {
- if (listAtCurrentDoc.lastApproxMatchDoc != listAtCurrentDoc.doc) {
- if (!listAtCurrentDoc.twoPhaseView.matches()) {
- spansAtDoc = null;
- }
- }
- }
- }
- }
-
- if (spansAtDoc != null) {
- assert spansAtDoc.docID() == listAtCurrentDoc.doc;
- assert spansAtDoc.startPosition() == -1;
- spansAtDoc.nextStartPosition();
- assert spansAtDoc.startPosition() != NO_MORE_POSITIONS;
- byPositionQueue.add(spansAtDoc);
- }
- listAtCurrentDoc = listAtCurrentDoc.next;
- }
- assert byPositionQueue.size() > 0;
- }
-
- @Override
- public int nextStartPosition() throws IOException {
- if (topPositionSpans == null) {
- byPositionQueue.clear();
- fillPositionQueue(); // fills byPositionQueue at first position
- topPositionSpans = byPositionQueue.top();
+ if (maxDistance == -1) {
+ return new DisjunctionSpans(SpanOrQuery.this, subSpans);
} else {
- topPositionSpans.nextStartPosition();
- topPositionSpans = byPositionQueue.updateTop();
- }
- return topPositionSpans.startPosition();
+ SimScorer simScorer = getSimScorer(context);
+ return new DisjunctionNearSpans(SpanOrQuery.this, subSpans, maxDistance, simScorer);
}
-
- @Override
- public int startPosition() {
- return topPositionSpans == null ? -1 : topPositionSpans.startPosition();
- }
-
- @Override
- public int endPosition() {
- return topPositionSpans == null ? -1 : topPositionSpans.endPosition();
- }
-
- @Override
- public int width() {
- return topPositionSpans.width();
- }
-
- @Override
- public void collect(SpanCollector collector) throws IOException {
- if (topPositionSpans != null)
- topPositionSpans.collect(collector);
}
-
- @Override
- public String toString() {
- return "spanOr(" + SpanOrQuery.this + ")@" + docID() + ": " + startPosition() + " - " + endPosition();
- }
-
- long cost = -1;
-
- @Override
- public long cost() {
- if (cost == -1) {
- cost = 0;
- for (Spans spans : subSpans) {
- cost += spans.cost();
}
}
- return cost;
- }
- };
- }
- }
-
-}
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanPositionQueue.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanPositionQueue.java
index 2d2bd16..22bdb17 100644
--- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanPositionQueue.java
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanPositionQueue.java
@@ -16,6 +16,8 @@
*/
package org.apache.lucene.search.spans;
+import java.util.List;
+import java.util.Iterator;
import org.apache.lucene.util.PriorityQueue;
@@ -31,5 +33,12 @@ class SpanPositionQueue extends PriorityQueue<Spans> {
: (start1 == start2) ? s1.endPosition() < s2.endPosition()
: false;
}
+
+ /** Append every Spans currently in this queue to the given list, in the order of iterator(). */
+ void extractSpansList(List<Spans> spansList) {
+   for (Iterator<Spans> queued = iterator(); queued.hasNext(); ) {
+     spansList.add(queued.next());
+   }
+ }
}
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanSynonymQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanSynonymQuery.java
new file mode 100644
index 0000000..fb57ddb
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanSynonymQuery.java
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.spans;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+import java.util.Map;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermContext;
+import org.apache.lucene.index.TermState;
+import org.apache.lucene.index.TermsEnum;
+
+import org.apache.lucene.search.similarities.Similarity.SimScorer;
+
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.SynonymQuery;
+import org.apache.lucene.search.SynonymQuery.SynonymWeight;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Explanation;
+import org.apache.lucene.search.MatchNoDocsQuery;
+
+/**
+ * A SpanQuery that treats terms as synonyms.
+ * <p>
+ * For scoring purposes, this query tries to score the terms as if you
+ * had indexed them as one term: it will match any of the terms while
+ * using the same scoring as {@link SynonymQuery}, as far as possible.
+ */
+public final class SpanSynonymQuery extends SpanQuery {
+  /** The wrapped query; it validates the terms and provides the field, equality and the scoring weight. */
+  final SynonymQuery synonymQuery;
+  /** The terms as returned by {@link SynonymQuery#getTerms()}. */
+  final List<Term> terms;
+
+  /**
+   * Creates a new SpanSynonymQuery, matching any of the supplied terms.
+   * <p>
+   * The terms must all have the same field.
+   *
+   * @param terms the terms to be treated as synonyms
+   * @throws IllegalArgumentException when the terms are not all in the same field,
+   *         as checked by the {@link SynonymQuery} constructor
+   */
+  public SpanSynonymQuery(Term... terms) {
+    this.synonymQuery = new SynonymQuery(terms);
+    this.terms = synonymQuery.getTerms();
+  }
+
+  /** Return the field of the wrapped {@link SynonymQuery}. */
+  @Override
+  public String getField() {
+    return synonymQuery.getField();
+  }
+
+  /** Return the wrapped query's string form surrounded by <code>SpanSynonym(...)</code>. */
+  @Override
+  public String toString(String field) {
+    StringBuilder builder = new StringBuilder("SpanSynonym(");
+    builder.append(synonymQuery.toString(field));
+    builder.append(")");
+    return builder.toString();
+  }
+
+  @Override
+  public int hashCode() {
+    return 31 * classHash() - synonymQuery.hashCode();
+  }
+
+  /** Two SpanSynonymQuery instances are equal when their wrapped SynonymQuery instances are equal. */
+  @Override
+  public boolean equals(Object other) {
+    return sameClassAs(other) &&
+           synonymQuery.equals(((SpanSynonymQuery) other).synonymQuery);
+  }
+
+  /**
+   * Rewrite the zero and single term cases to simpler queries.
+   *
+   * @return a {@link MatchNoDocsQuery} for no terms, a {@link SpanTermQuery} for one term,
+   *         otherwise this query
+   */
+  @Override
+  public Query rewrite(IndexReader reader) throws IOException {
+    // optimize zero and single term cases
+    int numTerms = terms.size();
+    if (numTerms == 0) {
+      return new MatchNoDocsQuery();
+    }
+    if (numTerms == 1) {
+      return new SpanTermQuery(terms.get(0));
+    }
+    return this;
+  }
+
+  /**
+   * Create the weight for this query.
+   * When scores are needed the weight wraps the {@link SynonymWeight} of the wrapped query,
+   * and the returned SpanWeight does not support {@link SpanWeight#explain}.
+   * When scores are not needed the weight of a {@link SpanOrQuery} over the terms is returned.
+   */
+  @Override
+  public SpanWeight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
+    if (needsScores) {
+      SynonymWeight synonymWeight = (SynonymWeight)
+                      synonymQuery.createWeight(searcher, needsScores, boost);
+      return new SpanSynonymWeight(searcher, boost, synonymWeight);
+    }
+    else { // scores not needed, use SpanOrQuery without scoring.
+      SpanTermQuery[] clauses = new SpanTermQuery[terms.size()];
+      int i = 0;
+      for (Term term : terms) {
+        clauses[i++] = new SpanTermQuery(term);
+      }
+      return new SpanOrQuery(clauses).createWeight(searcher, needsScores, boost);
+    }
+  }
+
+  /** SpanWeight that takes its term contexts and SimScorer from a {@link SynonymWeight}. */
+  class SpanSynonymWeight extends SpanWeight {
+    final SynonymWeight synonymWeight;
+
+    SpanSynonymWeight(
+            IndexSearcher searcher,
+            float boost,
+            SynonymWeight synonymWeight)
+    throws IOException {
+      super(SpanSynonymQuery.this, searcher, null, boost); // null: no term context map
+      this.synonymWeight = synonymWeight;
+    }
+
+    /** Add all terms of the query to the given set. */
+    @Override
+    public void extractTerms(Set<Term> termSet) {
+      termSet.addAll(terms);
+    }
+
+    /** Map each term to the term context at the same index in the SynonymWeight. */
+    @Override
+    public void extractTermContexts(Map<Term, TermContext> termContextbyTerm) {
+      TermContext[] termContexts = synonymWeight.getTermContexts();
+      int i = 0;
+      for (Term term : terms) {
+        TermContext termContext = termContexts[i++];
+        termContextbyTerm.put(term, termContext);
+      }
+    }
+
+    /** Not supported; always throws {@link UnsupportedOperationException}. */
+    @Override
+    public Explanation explain(LeafReaderContext context, int doc) throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    /** Return the SimScorer of the wrapped SynonymWeight, so all terms share one scorer. */
+    @Override
+    public SimScorer getSimScorer(LeafReaderContext context) throws IOException {
+      return synonymWeight.getSimScorer(context);
+    }
+
+    /**
+     * Return the spans over the terms that occur in the given segment.
+     *
+     * @param context the segment to search
+     * @param requiredPostings the postings information the caller needs; it is passed on
+     *        to the postings of every term so that payloads or offsets are available
+     *        when they are requested
+     * @return null when no term occurs in the segment, the TermSpans of the only
+     *         occurring term, or a SynonymSpans over all occurring terms
+     */
+    @Override
+    public Spans getSpans(final LeafReaderContext context, Postings requiredPostings)
+    throws IOException {
+      SimScorer simScorer = getSimScorer(context);
+      final String field = getField();
+      Terms fieldTerms = context.reader().terms(field);
+      List<Spans> termSpans = new ArrayList<>(terms.size());
+      if (fieldTerms != null) {
+        TermsEnum termsEnum = fieldTerms.iterator();
+        TermContext[] termContexts = synonymWeight.getTermContexts();
+        int i = 0;
+        for (Term term : terms) {
+          TermContext termContext = termContexts[i++]; // in term order
+          TermState termState = termContext.get(context.ord);
+          if (termState != null) {
+            termsEnum.seekExact(term.bytes(), termState);
+            // honor the caller's postings request instead of always asking for positions only
+            PostingsEnum postings = termsEnum.postings(null, requiredPostings.getRequiredPostings());
+            float positionsCost = SpanTermQuery.termPositionsCost(termsEnum)
+                                * SpanTermQuery.PHRASE_TO_SPAN_TERM_POSITIONS_COST;
+            termSpans.add(new TermSpans(simScorer, postings, term, positionsCost));
+          }
+        }
+      }
+
+      return (termSpans.size() == 0) ? null
+            : (termSpans.size() == 1) ? termSpans.get(0)
+            : new SynonymSpans(SpanSynonymQuery.this, termSpans, simScorer);
+    }
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java
index 2746a0c..9c28ac9 100644
--- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java
@@ -126,7 +126,7 @@ public class SpanTermQuery extends SpanQuery {
* the relative cost of dealing with the term positions
* when using a SpanNearQuery instead of a PhraseQuery.
*/
- private static final float PHRASE_TO_SPAN_TERM_POSITIONS_COST = 4.0f;
+ static final float PHRASE_TO_SPAN_TERM_POSITIONS_COST = 4.0f;
private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128;
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/Spans.java b/lucene/core/src/java/org/apache/lucene/search/spans/Spans.java
index 7857708..73dd083 100644
--- a/lucene/core/src/java/org/apache/lucene/search/spans/Spans.java
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/Spans.java
@@ -116,4 +116,6 @@ public abstract class Spans extends DocIdSetIterator {
*/
protected void doCurrentSpans() throws IOException {}
+ /** For {@link SpansTreeQuery}: the {@link SpansDocScorer} associated with these spans.
+ * There is no initializer, so this is null until it is assigned;
+ * the {@link SpansDocScorer} constructor assigns itself here.
+ */
+ SpansDocScorer<?> spansDocScorer;
}
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpansDocScorer.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpansDocScorer.java
new file mode 100644
index 0000000..55d39fd
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpansDocScorer.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.spans;
+
+import java.io.IOException;
+
+/**
+ * Record span matches in a document and compute a document score.
+ * <br>
+ * For {@link SpansTreeQuery}. Public for extension.
+ * <br>
+ * The methods declared here suggest this usage per document: {@link #beginDoc},
+ * then {@link #recordMatch} for each match, then {@link #docMatchFreq} and/or {@link #docScore}.
+ *
+ * @param <SpansT> the type of the spans for which matches are recorded
+ *
+ * @lucene.experimental
+ */
+public abstract class SpansDocScorer<SpansT extends Spans> {
+ /** The spans given to the constructor. */
+ protected final SpansT spans;
+
+ /**
+ * Create a SpansDocScorer and make {@link Spans#spansDocScorer} refer to it.
+ * <br>
+ * Note that the reference to this object is stored in the spans
+ * before the constructor of a subclass has finished.
+ *
+ * @param spans the spans to record matches for; a null value causes a NullPointerException here
+ */
+ public SpansDocScorer(SpansT spans) {
+ this.spans = spans;
+ spans.spansDocScorer = this;
+ }
+
+ /** The document of the spans, see {@link Spans#docID}. */
+ public int docID() { return spans.docID(); }
+
+ /** Called before the first match of the spans is to be recorded for the document. */
+ public abstract void beginDoc() throws IOException;
+
+ /** Record a match with its slop factor.
+ * @param slopFactor the slop factor of the match to be recorded
+ */
+ public abstract void recordMatch(double slopFactor);
+
+ /** Return the matching frequency of the last {@link #beginDoc} document. */
+ public abstract int docMatchFreq();
+
+ /** Return the score of the last {@link #beginDoc} document. */
+ public abstract double docScore() throws IOException;
+}
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpansTreeQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpansTreeQuery.java
new file mode 100644
index 0000000..2a73e38
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpansTreeQuery.java
@@ -0,0 +1,328 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.spans;
+
+import java.io.IOException;
+import java.util.Objects;
+import java.util.Set;
+import java.util.ArrayList;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReaderContext;
+
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.DisjunctionMaxQuery;
+import org.apache.lucene.search.BoostQuery;
+import org.apache.lucene.search.Weight;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Explanation;
+
+import org.apache.lucene.search.similarities.Similarity.SimScorer;
+
+/** Wrapper class for scoring span queries via matching term occurrences.
+ *
+ * @lucene.experimental
+ */
+public class SpansTreeQuery extends Query {
+
+ /** The wrapped span query, never null. */
+ final SpanQuery spanQuery;
+ /** The slop for which the top level slop factor is computed; a constant shared by all instances. */
+ static final int TOP_LEVEL_SLOP = 0;
+
+ /** Wrap a span query to score via its matching term occurrences.
+ * <br>
+ * For more details on scoring see {@link SpansTreeScorer#createSpansDocScorer}.
+ *
+ * @param spanQuery This can be any nested combination of
+ * {@link org.apache.lucene.search.spans.SpanNearQuery},
+ * {@link org.apache.lucene.search.spans.SpanOrQuery},
+ * {@link org.apache.lucene.search.spans.SpanSynonymQuery},
+ * {@link org.apache.lucene.search.spans.SpanTermQuery},
+ * {@link org.apache.lucene.search.spans.SpanBoostQuery},
+ * {@link org.apache.lucene.search.spans.SpanNotQuery},
+ * {@link org.apache.lucene.search.spans.SpanFirstQuery},
+ * {@link org.apache.lucene.search.spans.SpanContainingQuery} and
+ * {@link org.apache.lucene.search.spans.SpanWithinQuery}.
+ * @throws NullPointerException when the given span query is null
+ */
+ public SpansTreeQuery(SpanQuery spanQuery) {
+ this.spanQuery = Objects.requireNonNull(spanQuery);
+ }
+
+ /** Put the span queries that occur in the given query into SpansTreeQueries.
+ * <br>
+ * A {@link SpanQuery} is wrapped directly in a {@link SpansTreeQuery#SpansTreeQuery}.
+ * The subqueries of a {@link BooleanQuery} or a {@link DisjunctionMaxQuery}, and
+ * the subquery of a {@link BoostQuery}, are wrapped recursively.
+ * Any other query is returned as given, and so is a query
+ * in which nothing was wrapped.
+ * <br>
+ * Wrapping twice does not occur because
+ * a {@link SpansTreeQuery} is not a {@link SpanQuery}.
+ *
+ * @param query the query in which span queries are to be wrapped
+ * @return the given query when nothing was wrapped, otherwise a new query
+ */
+ public static Query wrap(Query query) {
+ if (query instanceof SpanQuery) {
+ return new SpansTreeQuery((SpanQuery)query);
+ } else if (query instanceof BooleanQuery) {
+ return wrapBooleanQuery((BooleanQuery)query);
+ } else if (query instanceof DisjunctionMaxQuery) {
+ return wrapDMQ((DisjunctionMaxQuery)query);
+ } else if (! (query instanceof BoostQuery)) {
+ return query;
+ }
+ final BoostQuery boostQuery = (BoostQuery)query;
+ final Query inner = boostQuery.getQuery();
+ final Query wrappedInner = wrap(inner);
+ // Keep the given boost query when its subquery was left as it is.
+ return (wrappedInner == inner)
+ ? query
+ : new BoostQuery(wrappedInner, boostQuery.getBoost());
+ }
+
+ /** Wrap the span queries in the clauses of a boolean query, see {@link #wrap(Query)}.
+ * <br>
+ * The occur of each clause and the minimum number of should clauses
+ * that must match are taken over from the given query.
+ *
+ * @param blq the boolean query of which the clauses are to be wrapped
+ * @return the given query when no clause was wrapped, otherwise a new boolean query
+ */
+ static BooleanQuery wrapBooleanQuery(BooleanQuery blq) {
+ ArrayList<BooleanClause> wrappedClauses = new ArrayList<>();
+ boolean wrapped = false;
+ for (BooleanClause clause : blq.clauses()) {
+ Query subQuery = clause.getQuery();
+ Query wrappedSubQuery = wrap(subQuery);
+ if (wrappedSubQuery != subQuery) {
+ wrapped = true;
+ wrappedClauses.add(new BooleanClause(wrappedSubQuery, clause.getOccur()));
+ }
+ else {
+ wrappedClauses.add(clause);
+ }
+ }
+ if (! wrapped) {
+ return blq;
+ }
+ BooleanQuery.Builder builder = new BooleanQuery.Builder();
+ // Without this the rebuilt query would match with a minimum of 0 should clauses,
+ // which can match more documents than the given query.
+ builder.setMinimumNumberShouldMatch(blq.getMinimumNumberShouldMatch());
+ for (BooleanClause clause : wrappedClauses) {
+ builder.add(clause);
+ }
+ return builder.build();
+ }
+
+ /** Wrap the span queries in the disjuncts of a disjunction max query, see {@link #wrap(Query)}.
+ *
+ * @param dmq the query of which the disjuncts are to be wrapped
+ * @return the given query when no disjunct was wrapped, otherwise a new query
+ * over the wrapped disjuncts with the same tie breaker multiplier
+ */
+ static DisjunctionMaxQuery wrapDMQ(DisjunctionMaxQuery dmq) {
+ final ArrayList<Query> resultDisjuncts = new ArrayList<>();
+ boolean anyWrapped = false;
+ for (Query disjunct : dmq.getDisjuncts()) {
+ final Query candidate = wrap(disjunct);
+ // An unwrapped candidate is the disjunct itself, so it can be added in both cases.
+ anyWrapped |= (candidate != disjunct);
+ resultDisjuncts.add(candidate);
+ }
+ if (anyWrapped) {
+ return new DisjunctionMaxQuery(resultDisjuncts, dmq.getTieBreakerMultiplier());
+ }
+ return dmq;
+ }
+
+
+ /** Wrap a given query by {@link #wrap(Query)} after it was rewritten.
+ * <br>
+ * The returned query rewrites the given query until it no longer changes,
+ * and then wraps the result.
+ * The returned query is only equal to itself.
+ *
+ * @param query the query to be rewritten and wrapped
+ * @return a query that only supports rewriting
+ */
+ public static Query wrapAfterRewrite(Query query) {
+ return new Query() {
+ @Override
+ public Query rewrite(IndexReader reader) throws IOException {
+ // Rewrite until nothing changes: SpansTreeQuery does not rewrite its span query,
+ // so a single rewrite pass could leave a partially rewritten span query wrapped.
+ Query rewritten = query;
+ Query next = rewritten.rewrite(reader);
+ while (next != rewritten) {
+ rewritten = next;
+ next = rewritten.rewrite(reader);
+ }
+ return wrap(rewritten);
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ return this == other;
+ }
+
+ @Override
+ public int hashCode() {
+ return query.hashCode() ^ SpansTreeQuery.class.hashCode();
+ }
+
+ @Override
+ public String toString(String field) {
+ return "SpansTreeQuery.wrapAfterRewrite: " + query.toString(field);
+ }
+ };
+ }
+
+ /** The wrapped SpanQuery, as given to the constructor. */
+ public SpanQuery getSpanQuery() { return spanQuery; }
+
+ /** Combines the hash code of this class with the hash code of the wrapped span query. */
+ @Override
+ public int hashCode() {
+ final int classHash = getClass().hashCode();
+ return classHash - spanQuery.hashCode();
+ }
+
+ /** Equal to an object of the same class that wraps an equal span query. */
+ @Override
+ public boolean equals(Object other) {
+ if (! sameClassAs(other)) {
+ return false;
+ }
+ return equalsTo(getClass().cast(other));
+ }
+
+ /** Helper for {@link #equals}: compares the wrapped span queries. */
+ private boolean equalsTo(SpansTreeQuery other) {
+ return spanQuery.equals(other.spanQuery);
+ }
+
+ /** Shows the wrapped span query for the given field between "SpansTreeQuery(" and ")". */
+ @Override
+ public String toString(String field) {
+ return "SpansTreeQuery(" + spanQuery.toString(field) + ")";
+ }
+
+ /** Return a weight for scoring by matching term occurrences.
+ * <br>{@link Weight#explain} is not supported on the result.
+ * <br>All arguments are passed on unchanged to the {@link SpansTreeWeight} constructor.
+ */
+ @Override
+ public SpansTreeWeight createWeight(
+ IndexSearcher searcher,
+ boolean needsScores,
+ float boost)
+ throws IOException
+ {
+ return new SpansTreeWeight(searcher, needsScores, boost);
+ }
+
+ public class SpansTreeWeight extends Weight {
+ final SpanWeight spanWeight;
+
+ /** Create a weight for the enclosing SpansTreeQuery.
+ * <br>
+ * The weight of the wrapped span query is created here with the same arguments
+ * and kept in {@link #spanWeight}.
+ */
+ public SpansTreeWeight(
+ IndexSearcher searcher,
+ boolean needsScores,
+ float boost)
+ throws IOException
+ {
+ super(SpansTreeQuery.this);
+ this.spanWeight = spanQuery.createWeight(searcher, needsScores, boost);
+ }
+
+ /** Explanations are not supported for this weight.
+ * @throws UnsupportedOperationException always
+ */
+ @Override
+ public Explanation explain(LeafReaderContext context, int doc) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ /** Lets the weight of the wrapped span query extract its terms into the given set. */
+ @Override
+ public void extractTerms(Set<Term> terms) {
+ spanWeight.extractTerms(terms);
+ }
+
+ /** Compute a minimal slop factor from the maximum possible slops that can occur
+ * in a SpanQuery for nested SpanNearQueries and for nested SpanOrQueries with distance.
+ * This supports the queries mentioned at {@link SpansTreeScorer#createSpansDocScorer}.
+ * <p>
+ * This uses the maximum slops from {@link SpanOrQuery#getMaxDistance()} and
+ * {@link SpanNearQuery#getNonMatchSlop()}.
+ * <p>
+ * This assumes that slop factors are multiplied in
+ * {@link ConjunctionNearSpansDocScorer#recordMatch} and in
+ * {@link DisjunctionNearSpansDocScorer#recordMatch}
+ *
+ * @param spanQuery the (nested) span query to compute the minimal slop factor for
+ * @param simScorer used to compute the slop factor of a maximum slop
+ * @param slopFactor the non negative slop factor of the enclosing queries
+ * @return the smallest product of slop factors over the nested subqueries
+ * @throws IllegalArgumentException when a span query of an unsupported class is encountered
+ */
+ public double minSlopFactor(SpanQuery spanQuery, SimScorer simScorer, double slopFactor) {
+ assert slopFactor >= 0;
+ if (spanQuery instanceof SpanTermQuery) {
+ return slopFactor;
+ }
+ if (spanQuery instanceof SpanSynonymQuery) {
+ return slopFactor;
+ }
+ if (spanQuery instanceof SpanNotQuery) {
+ return minSlopFactor(((SpanNotQuery)spanQuery).getInclude(), simScorer, slopFactor);
+ }
+ if (spanQuery instanceof SpanPositionCheckQuery) {
+ // Cast to the class that was tested: the query need not be a SpanFirstQuery,
+ // so casting to SpanFirstQuery here could throw a ClassCastException.
+ return minSlopFactor(((SpanPositionCheckQuery)spanQuery).getMatch(), simScorer, slopFactor);
+ }
+ if (spanQuery instanceof SpanContainingQuery) {
+ return minSlopFactor(((SpanContainingQuery)spanQuery).getBig(), simScorer, slopFactor);
+ }
+ if (spanQuery instanceof SpanWithinQuery) {
+ return minSlopFactor(((SpanWithinQuery)spanQuery).getLittle(), simScorer, slopFactor);
+ }
+ if (spanQuery instanceof SpanBoostQuery) {
+ return minSlopFactor(((SpanBoostQuery)spanQuery).getQuery(), simScorer, slopFactor);
+ }
+
+ SpanQuery[] clauses = null;
+ int maxAllowedSlop = -1;
+
+ if (spanQuery instanceof SpanOrQuery) {
+ SpanOrQuery spanOrQuery = (SpanOrQuery)spanQuery;
+ clauses = spanOrQuery.getClauses();
+ maxAllowedSlop = spanOrQuery.getMaxDistance();
+ if (maxAllowedSlop == -1) {
+ // A max distance of -1 adds no slop factor of its own.
+ return minSlopFactorClauses(clauses, simScorer, slopFactor);
+ }
+ }
+ else if (spanQuery instanceof SpanNearQuery) {
+ SpanNearQuery spanNearQuery = (SpanNearQuery) spanQuery;
+ clauses = spanNearQuery.getClauses();
+ maxAllowedSlop = spanNearQuery.getNonMatchSlop();
+ }
+
+ if (clauses == null) {
+ throw new IllegalArgumentException("Not implemented for SpanQuery class: "
+ + spanQuery.getClass().getName());
+ }
+
+ assert maxAllowedSlop >= 0;
+ double localSlopFactor = simScorer.computeSlopFactor(maxAllowedSlop);
+ assert localSlopFactor >= 0;
+ // assumed multiplication:
+ return minSlopFactorClauses(clauses, simScorer, slopFactor * localSlopFactor);
+ }
+
+ /** Helper for {@link #minSlopFactor}: the minimum of the minimal slop factors of the given clauses.
+ *
+ * @param clauses the clauses, at least one is expected
+ * @param simScorer passed on to {@link #minSlopFactor} for each clause
+ * @param slopFactor the non negative slop factor passed on for each clause
+ * @return the smallest value that {@link #minSlopFactor} returns for a clause
+ */
+ public double minSlopFactorClauses(SpanQuery[] clauses, SimScorer simScorer, double slopFactor) {
+ assert slopFactor >= 0;
+ assert clauses.length >= 1;
+ double smallest = Double.MAX_VALUE;
+ for (int i = 0; i < clauses.length; i++) {
+ smallest = Math.min(smallest, minSlopFactor(clauses[i], simScorer, slopFactor));
+ }
+ return smallest;
+ }
+
+ /** Create the scorer for a leaf, or return null when the wrapped span query has no spans there.
+ * <br>
+ * The SpansTreeScorer gets the result of {@link #minSlopFactor}
+ * as the weight for non matching terms.
+ */
+ @Override
+ public SpansTreeScorer scorer(LeafReaderContext context) throws IOException {
+ final Spans leafSpans = spanWeight.getSpans(context, SpanWeight.Postings.POSITIONS);
+ if (leafSpans != null) {
+ // NOTE(review): if getSimScorer can return null here, for example when
+ // scores are not needed, the next statement would throw - confirm.
+ final SimScorer leafSimScorer = spanWeight.getSimScorer(context);
+ final double topSlopFactor = leafSimScorer.computeSlopFactor(TOP_LEVEL_SLOP);
+ final double weightForNonMatches = minSlopFactor(spanQuery, leafSimScorer, topSlopFactor);
+ return new SpansTreeScorer(this, leafSpans, topSlopFactor, weightForNonMatches);
+ }
+ return null;
+ }
+ }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpansTreeScorer.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpansTreeScorer.java
new file mode 100644
index 0000000..1455cf4
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpansTreeScorer.java
@@ -0,0 +1,186 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.spans;
+
+import java.io.IOException;
+import java.util.Objects;
+
+import org.apache.lucene.index.Term; // javadocs
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Weight;
+import org.apache.lucene.search.TwoPhaseIterator;
+
+/**
+ * A Scorer for (nested) spans.
+ * This associates the spans with a {@link SpansDocScorer} and uses its score values.
+ * <p>
+ * For {@link SpansTreeQuery}. Public for extension.
+ *
+ * @lucene.experimental
+ */
+public class SpansTreeScorer extends Scorer {
+
+ protected final Spans spans;
+ protected final double topLevelSlopFactor;
+ protected final double nonMatchWeight;
+ protected final SpansDocScorer<?> spansDocScorer;
+
+ protected int lastScoredDoc = -1;
+
+ /**
+ * Create a scorer for the given spans.
+ *
+ * @param weight passed to the {@link Scorer} constructor
+ * @param spans the spans to be scored, not null
+ * @param topLevelSlopFactor the slop factor with which each match of the spans is recorded
+ * @param nonMatchWeight used by {@link #createSpansDocScorer} for term spans and synonym spans
+ */
+ public SpansTreeScorer(Weight weight, Spans spans, double topLevelSlopFactor, double nonMatchWeight) {
+ super(weight);
+ this.spans = Objects.requireNonNull(spans);
+ this.topLevelSlopFactor = topLevelSlopFactor;
+ this.nonMatchWeight = nonMatchWeight;
+ // Keep this last: createSpansDocScorer uses nonMatchWeight and may be overridden in a subclass.
+ this.spansDocScorer = createSpansDocScorer(spans);
+ }
+
+ /** Returns the current document of the spans. */
+ @Override
+ public int docID() {
+ return spans.docID();
+ }
+
+ /** Returns the spans themselves as the document iterator. */
+ @Override
+ public DocIdSetIterator iterator() {
+ return spans;
+ }
+
+ /** Returns what {@link Spans#asTwoPhaseIterator} provides for the spans. */
+ @Override
+ public TwoPhaseIterator twoPhaseIterator() {
+ return spans.asTwoPhaseIterator();
+ }
+
+ /**
+ * Provide the SpansDocScorer that will be used by {@link #score} and {@link #freq}.
+ * <br>
+ * Override this to provide support for span queries for which the spans are not supported here.
+ * <table rules="all" frame="box" cellpadding="3" summary="SpansDocScorer for Spans">
+ * <tr><td>For {@link Spans}:</td>
+ * <td>normally from {@link SpanQuery}:</td>
+ * <td>return:</td>
+ * </tr>
+ * <tr><td>{@link TermSpans}</td>
+ * <td>{@link SpanTermQuery}</td>
+ * <td>{@link TermSpansDocScorer}</td>
+ * </tr>
+ * <tr><td>{@link DisjunctionNearSpans}</td>
+ * <td>{@link SpanOrQuery#SpanOrQuery(int,SpanQuery...)}</td>
+ * <td>{@link DisjunctionNearSpansDocScorer}</td>
+ * </tr>
+ * <tr><td>{@link DisjunctionSpans}</td>
+ * <td>{@link SpanOrQuery#SpanOrQuery(SpanQuery...)}</td>
+ * <td>{@link DisjunctionSpansDocScorer}</td>
+ * </tr>
+ * <tr><td>{@link SynonymSpans}</td>
+ * <td>{@link SpanSynonymQuery#SpanSynonymQuery(Term...)}</td>
+ * <td>{@link SynonymSpansDocScorer}</td>
+ * </tr>
+ * <tr><td>{@link ConjunctionNearSpans}</td>
+ * <td>{@link SpanNearQuery}</td>
+ * <td>{@link ConjunctionNearSpansDocScorer}</td>
+ * </tr>
+ * <tr><td>{@link FilterSpans}</td>
+ * <td>{@link SpanNotQuery}, {@link SpanFirstQuery}</td>
+ * <td>recursively use {@link FilterSpans#in}</td>
+ * </tr>
+ * <tr><td>{@link ContainSpans}</td>
+ * <td>{@link SpanContainingQuery}, {@link SpanWithinQuery}</td>
+ * <td>recursively use {@link ContainSpans#sourceSpans}</td>
+ * </tr>
+ * </table>
+ *
+ * @param spans the spans to provide a SpansDocScorer for
+ * @return the SpansDocScorer for the given spans, never null
+ * @throws IllegalArgumentException when the class of the given spans is not supported
+ */
+ public SpansDocScorer<?> createSpansDocScorer(Spans spans) {
+ SpansDocScorer<?> spansDocScorer = null;
+ // The order of the class tests below matters where one spans class extends another.
+ if (spans instanceof TermSpans) {
+ spansDocScorer = new TermSpansDocScorer((TermSpans) spans, nonMatchWeight);
+ }
+ // presumably DisjunctionNearSpans extends DisjunctionSpans, so it is tested first - confirm
+ else if (spans instanceof DisjunctionNearSpans) {
+ spansDocScorer = new DisjunctionNearSpansDocScorer(this, (DisjunctionNearSpans) spans);
+ }
+ // SynonymSpans extends DisjunctionSpans, so it must be tested before DisjunctionSpans.
+ else if (spans instanceof SynonymSpans) {
+ spansDocScorer = new SynonymSpansDocScorer((SynonymSpans) spans, nonMatchWeight);
+ }
+ else if (spans instanceof DisjunctionSpans) {
+ spansDocScorer = new DisjunctionSpansDocScorer<>(this, (DisjunctionSpans) spans);
+ }
+ else if (spans instanceof ConjunctionNearSpans) {
+ spansDocScorer = new ConjunctionNearSpansDocScorer(this, (ConjunctionNearSpans) spans);
+ }
+ else if (spans instanceof FilterSpans) {
+ // The filtering spans get the doc scorer of the spans that they filter.
+ spansDocScorer = createSpansDocScorer(((FilterSpans) spans).in);
+ spans.spansDocScorer = spansDocScorer; // shortcut
+ }
+ else if (spans instanceof ContainSpans) {
+ // The containing/within spans get the doc scorer of their source spans.
+ spansDocScorer = createSpansDocScorer(((ContainSpans) spans).sourceSpans);
+ spans.spansDocScorer = spansDocScorer; // shortcut
+ }
+ if (spansDocScorer == null) {
+ throw new IllegalArgumentException("Not implemented for Spans class: "
+ + spans.getClass().getName());
+ }
+ return spansDocScorer;
+ }
+
+ /**
+ * Pass all span matches in the current document to the SpansDocScorer.
+ * <p>
+ * The first match is expected to be available.
+ * {@link #ensureMatchesRecorded} calls this at most once per document.
+ */
+ protected void recordMatchesCurrentDoc() throws IOException {
+ final int firstStart = spans.nextStartPosition();
+ assert firstStart != Spans.NO_MORE_POSITIONS;
+ spansDocScorer.beginDoc();
+ boolean atMatch = true;
+ while (atMatch) {
+ spansDocScorer.recordMatch(topLevelSlopFactor);
+ atMatch = spans.nextStartPosition() != Spans.NO_MORE_POSITIONS;
+ }
+ }
+
+ /**
+ * Record the matches of the current document via recordMatchesCurrentDoc,
+ * unless this was already done for the current document.
+ */
+ public void ensureMatchesRecorded() throws IOException {
+ final int doc = docID();
+ if (doc == lastScoredDoc) {
+ return; // already recorded for this document
+ }
+ recordMatchesCurrentDoc();
+ lastScoredDoc = doc;
+ }
+
+ /** Score the current document.
+ * See {@link #createSpansDocScorer} and {@link SpansDocScorer#docScore}.
+ * <br>
+ * The matches of the current document are recorded first when needed,
+ * and the double score of the SpansDocScorer is narrowed to a float.
+ */
+ @Override
+ public final float score() throws IOException {
+ ensureMatchesRecorded();
+ return (float) spansDocScorer.docScore();
+ }
+
+ /** Return the total matching frequency of the current document.
+ * See {@link #createSpansDocScorer} and {@link SpansDocScorer#docMatchFreq}.
+ * <br>
+ * The matches of the current document are recorded first when needed.
+ */
+ @Override
+ public final int freq() throws IOException {
+ ensureMatchesRecorded();
+ return spansDocScorer.docMatchFreq();
+ }
+
+ /** Shows the SpansDocScorer in use between "SpansTreeScorer(" and ")". */
+ @Override
+ public String toString() {
+ return "SpansTreeScorer(" + spansDocScorer + ")";
+ }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SynonymSpans.java b/lucene/core/src/java/org/apache/lucene/search/spans/SynonymSpans.java
new file mode 100644
index 0000000..fdbf676
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/SynonymSpans.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.spans;
+
+import java.util.List;
+
+import org.apache.lucene.search.similarities.Similarity.SimScorer;
+
+
+/**
+ * Spans that merge the given spans so that these can be scored equally.
+ * The score values themselves are not provided here.
+ *
+ * @lucene.experimental
+ */
+public class SynonymSpans extends DisjunctionSpans {
+ /** The SimScorer given to the constructor. */
+ SimScorer simScorer;
+
+ /** Construct a SynonymSpans.
+ * @param spanQuery The query from which the subSpans were obtained.
+ * @param subSpans The spans of which the disjunction is taken.
+ * @param simScorer The scorer that is kept for scoring.
+ */
+ public SynonymSpans(SpanQuery spanQuery, List<Spans> subSpans, SimScorer simScorer) {
+ super(spanQuery, subSpans);
+ this.simScorer = simScorer;
+ }
+
+ /** Shows the query, the current document, and the current start and end position. */
+ @Override
+ public String toString() {
+ StringBuilder text = new StringBuilder("SynonymSpans(");
+ text.append(spanQuery).append(")@").append(docID());
+ text.append(": ").append(startPosition());
+ text.append(" - ").append(endPosition());
+ return text.toString();
+ }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SynonymSpansDocScorer.java b/lucene/core/src/java/org/apache/lucene/search/spans/SynonymSpansDocScorer.java
new file mode 100644
index 0000000..0bbe708
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/SynonymSpansDocScorer.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.spans;
+
+import java.io.IOException;
+import java.util.ArrayList;
+
+/**
+ * Scores the matches of a {@link SynonymSpans} as the matches of a single term.
+ * For {@link SpansTreeQuery}. Public for extension.
+ *
+ * @lucene.experimental
+ */
+public class SynonymSpansDocScorer
+ extends AsSingleTermSpansDocScorer<SynonymSpans> {
+
+ /** The sub spans of the synonym spans at the document of the last {@link #beginDoc}. */
+ protected final ArrayList<Spans> subSpansAtDoc;
+
+ /**
+ * @param synSpans Provides matching synonym occurrences.
+ * This should only contain TermSpans.
+ * @param nonMatchWeight The non negative weight to be used for the non matching term occurrences.
+ */
+ public SynonymSpansDocScorer(SynonymSpans synSpans, double nonMatchWeight) {
+ super(synSpans, synSpans.simScorer, nonMatchWeight);
+ final int subSpansCount = synSpans.subSpans().size();
+ this.subSpansAtDoc = new ArrayList<>(subSpansCount);
+ }
+
+ /** The sum of the term frequencies of the sub spans at the current document. */
+ @Override
+ public int termFreqInDoc() throws IOException {
+ int totalFreq = 0;
+ for (Spans present : subSpansAtDoc) {
+ final TermSpans termSpans = (TermSpans)present;
+ totalFreq += termSpans.getPostings().freq();
+ }
+ return totalFreq;
+ }
+
+ /** Collects the sub spans at the current document before the document is begun. */
+ @Override
+ public void beginDoc() throws IOException {
+ subSpansAtDoc.clear();
+ spans.extractSubSpansAtCurrentDoc(subSpansAtDoc);
+ assert ! subSpansAtDoc.isEmpty() : "empty subSpansAtDoc docID=" + docID();
+ super.beginDoc(); // calls termFreqInDoc.
+ }
+
+}
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/TermSpans.java b/lucene/core/src/java/org/apache/lucene/search/spans/TermSpans.java
index f1e1aed..6b0bb47 100644
--- a/lucene/core/src/java/org/apache/lucene/search/spans/TermSpans.java
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/TermSpans.java
@@ -32,6 +32,7 @@ import org.apache.lucene.search.similarities.Similarity;
public class TermSpans extends Spans {
protected final PostingsEnum postings;
protected final Term term;
+ protected final Similarity.SimScorer simScorer;
protected int doc;
protected int freq;
protected int count;
@@ -41,6 +42,7 @@ public class TermSpans extends Spans {
public TermSpans(Similarity.SimScorer scorer,
PostingsEnum postings, Term term, float positionsCost) {
+ this.simScorer = scorer;
this.postings = Objects.requireNonNull(postings);
this.term = Objects.requireNonNull(term);
this.doc = -1;
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/TermSpansDocScorer.java b/lucene/core/src/java/org/apache/lucene/search/spans/TermSpansDocScorer.java
new file mode 100644
index 0000000..a033ce7
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/TermSpansDocScorer.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.spans;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.PostingsEnum;
+
+
+/**
+ * Scores the matches of a {@link TermSpans}, using the postings of the term spans
+ * for the term frequency.
+ * For {@link SpansTreeQuery}. Public for extension.
+ *
+ * @lucene.experimental
+ */
+public class TermSpansDocScorer extends AsSingleTermSpansDocScorer<TermSpans> {
+
+ /** The postings of the term spans given to the constructor. */
+ protected final PostingsEnum postings;
+
+ /**
+ * @param termSpans Provides matching term occurrences.
+ * @param nonMatchWeight The non negative weight to be used for the non matching term occurrences.
+ */
+ public TermSpansDocScorer(TermSpans termSpans, double nonMatchWeight) {
+ super(termSpans, termSpans.simScorer, nonMatchWeight);
+ this.postings = termSpans.getPostings();
+ }
+
+ /** Returns the frequency that the postings report. */
+ @Override
+ public int termFreqInDoc() throws IOException {
+ return postings.freq();
+ }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/util/PriorityQueue.java b/lucene/core/src/java/org/apache/lucene/util/PriorityQueue.java
index 83ac613..83f61c1 100644
--- a/lucene/core/src/java/org/apache/lucene/util/PriorityQueue.java
+++ b/lucene/core/src/java/org/apache/lucene/util/PriorityQueue.java
@@ -173,6 +173,14 @@ public abstract class PriorityQueue<T> implements Iterable<T> {
return heap[1];
}
+ /** Returns the second least element of the PriorityQueue in constant time,
+ * or null when the queue holds fewer than two elements. */
+ public final T subTop() {
+ if (size < 2) {
+ // There is no second element; this also avoids reading heap slots that are not in use.
+ return null;
+ }
+ if (size == 2) {
+ return heap[2];
+ }
+ // With the least element at heap[1], the second least is the smaller of heap[2] and heap[3].
+ return lessThan(heap[2], heap[3]) ? heap[2] : heap[3];
+ }
+
/** Removes and returns the least element of the PriorityQueue in log(size)
time. */
public final T pop() {
diff --git a/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanSearchEquivalence.java b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanSearchEquivalence.java
index 7d7fbe4..e3b4d24 100644
--- a/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanSearchEquivalence.java
+++ b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanSearchEquivalence.java
@@ -32,6 +32,8 @@ import static org.apache.lucene.search.spans.SpanTestUtil.*;
*/
public class TestSpanSearchEquivalence extends SearchEquivalenceTestBase {
+ final int MAX_SLOP = Integer.MAX_VALUE-1; // avoid distance+1 overflow in computeSlopFactor
+
// TODO: we could go a little crazy for a lot of these,
// but these are just simple minimal cases in case something
// goes horribly wrong. Put more intense tests elsewhere.
@@ -169,7 +171,7 @@ public class TestSpanSearchEquivalence extends SearchEquivalenceTestBase {
spanQuery(new SpanTermQuery(t1)),
spanQuery(new SpanTermQuery(t2))
};
- SpanQuery q1 = spanQuery(new SpanNearQuery(subquery, Integer.MAX_VALUE, false));
+ SpanQuery q1 = spanQuery(new SpanNearQuery(subquery, MAX_SLOP, false));
BooleanQuery.Builder q2 = new BooleanQuery.Builder();
q2.add(new TermQuery(t1), Occur.MUST);
q2.add(new TermQuery(t2), Occur.MUST);
@@ -293,7 +295,7 @@ public class TestSpanSearchEquivalence extends SearchEquivalenceTestBase {
/** SpanPositionRangeQuery(A, 0, ∞) = TermQuery(A) */
public void testSpanRangeTermEverything() throws Exception {
Term t1 = randomTerm();
- Query q1 = spanQuery(new SpanPositionRangeQuery(spanQuery(new SpanTermQuery(t1)), 0, Integer.MAX_VALUE));
+ Query q1 = spanQuery(new SpanPositionRangeQuery(spanQuery(new SpanTermQuery(t1)), 0, MAX_SLOP));
Query q2 = new TermQuery(t1);
assertSameSet(q1, q2);
}
@@ -343,7 +345,7 @@ public class TestSpanSearchEquivalence extends SearchEquivalenceTestBase {
spanQuery(new SpanTermQuery(t2))
};
SpanQuery nearQuery = spanQuery(new SpanNearQuery(subquery, 10, true));
- Query q1 = spanQuery(new SpanPositionRangeQuery(nearQuery, 0, Integer.MAX_VALUE));
+ Query q1 = spanQuery(new SpanPositionRangeQuery(nearQuery, 0, MAX_SLOP));
Query q2 = nearQuery;
assertSameSet(q1, q2);
}
@@ -371,7 +373,7 @@ public class TestSpanSearchEquivalence extends SearchEquivalenceTestBase {
/** SpanFirstQuery(A, ∞) = TermQuery(A) */
public void testSpanFirstTermEverything() throws Exception {
Term t1 = randomTerm();
- Query q1 = spanQuery(new SpanFirstQuery(spanQuery(new SpanTermQuery(t1)), Integer.MAX_VALUE));
+ Query q1 = spanQuery(new SpanFirstQuery(spanQuery(new SpanTermQuery(t1)), MAX_SLOP));
Query q2 = new TermQuery(t1);
assertSameSet(q1, q2);
}
@@ -417,7 +419,7 @@ public class TestSpanSearchEquivalence extends SearchEquivalenceTestBase {
spanQuery(new SpanTermQuery(t2))
};
SpanQuery nearQuery = spanQuery(new SpanNearQuery(subquery, 10, true));
- Query q1 = spanQuery(new SpanFirstQuery(nearQuery, Integer.MAX_VALUE));
+ Query q1 = spanQuery(new SpanFirstQuery(nearQuery, MAX_SLOP));
Query q2 = nearQuery;
assertSameSet(q1, q2);
}
diff --git a/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanSynonymQuery.java b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanSynonymQuery.java
new file mode 100644
index 0000000..5f4b8eb
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanSynonymQuery.java
@@ -0,0 +1,238 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.spans;
+
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Comparator;
+
+import org.apache.lucene.analysis.*;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.CheckHits;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopScoreDocCollector;
+
+import org.apache.lucene.search.SynonymQuery;
+
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.English;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import junit.framework.Assert;
+
+
+/**
+ * Compares the hits and scores of {@link SpanSynonymQuery} with those of {@link TermQuery},
+ * {@link SpanTermQuery}, {@link SpanOrQuery} and {@link SynonymQuery} over the same terms.
+ */
+public class TestSpanSynonymQuery extends LuceneTestCase {
+  static IndexSearcher searcher;
+  static IndexReader reader;
+  static Directory directory;
+
+  /** Number of indexed documents; also the maximum number of hits collected per search. */
+  static final int MAX_TEST_DOC = 32;
+
+  /** Name of the only indexed field. */
+  static final String FIELD_NAME = "field";
+
+  /**
+   * Builds the shared index. Every document except the last holds the English words of its
+   * index, repeated once for multiples of five; the last document holds a fixed text.
+   */
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    directory = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
+        newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true))
+            .setMaxBufferedDocs(TestUtil.nextInt(random(), MAX_TEST_DOC, MAX_TEST_DOC + 100))
+            .setMergePolicy(newLogMergePolicy()));
+    for (int i = 0; i < MAX_TEST_DOC; i++) {
+      Document doc = new Document();
+      String text;
+      if (i < (MAX_TEST_DOC-1)) {
+        text = English.intToEnglish(i);
+        if ((i % 5) == 0) { // add some multiple occurrences of the same term(s)
+          text += " " + text;
+        }
+      } else { // last doc, for testing distances > 1, and repeating occurrences of wb
+        text = "az a b c d e wa wb wb wc az";
+      }
+      doc.add(newTextField(FIELD_NAME, text, Field.Store.YES));
+      writer.addDocument(doc);
+    }
+    reader = writer.getReader();
+    searcher = new IndexSearcher(reader);
+    writer.close();
+  }
+
+  /** Closes the shared reader and directory and drops all static references to them. */
+  @AfterClass
+  public static void afterClass() throws Exception {
+    reader.close();
+    directory.close();
+    searcher = null;
+    reader = null; // do not keep the closed reader reachable from a static field
+    directory = null;
+  }
+
+  /** Returns a term on {@link #FIELD_NAME}. */
+  Term lcnTerm(String term) {
+    return new Term(FIELD_NAME, term);
+  }
+
+  /** Returns one term on {@link #FIELD_NAME} per given text, in the given order. */
+  Term[] lcnTerms(String... terms) {
+    Term[] lcnTrms = new Term[terms.length];
+    for (int i = 0; i < terms.length; i++) {
+      lcnTrms[i] = lcnTerm(terms[i]);
+    }
+    return lcnTrms;
+  }
+
+  TermQuery termQuery(String term) {
+    return new TermQuery(lcnTerm(term));
+  }
+
+  SpanTermQuery spanTermQuery(String term) {
+    return new SpanTermQuery(lcnTerm(term));
+  }
+
+  /** Returns one {@link SpanTermQuery} per given text, in the given order. */
+  SpanTermQuery[] spanTermQueries(String... terms) {
+    SpanTermQuery[] stqs = new SpanTermQuery[terms.length];
+    for (int i = 0; i < terms.length; i++) {
+      stqs[i] = spanTermQuery(terms[i]);
+    }
+    return stqs;
+  }
+
+  SpanSynonymQuery spanSynonymQuery(String... terms) {
+    return new SpanSynonymQuery(lcnTerms(terms));
+  }
+
+  SynonymQuery synonymQuery(String... terms) {
+    return new SynonymQuery(lcnTerms(terms));
+  }
+
+  /** Sorts the hits in place by increasing document number. */
+  void sortByDoc(ScoreDoc[] scoreDocs) {
+    Arrays.sort(scoreDocs, new Comparator<ScoreDoc>() {
+      @Override
+      public int compare(ScoreDoc sd1, ScoreDoc sd2) {
+        return Integer.compare(sd1.doc, sd2.doc);
+      }
+    });
+  }
+
+  /** Runs the query and returns at most {@link #MAX_TEST_DOC} top hits. */
+  ScoreDoc[] search(IndexSearcher searcher, Query query) throws IOException {
+    TopScoreDocCollector collector = TopScoreDocCollector.create(MAX_TEST_DOC);
+    searcher.search(query, collector);
+    return collector.topDocs().scoreDocs;
+  }
+
+  /** Returns the document numbers of the hits, in hit order. */
+  int[] docsFromHits(ScoreDoc[] hits) throws Exception {
+    int[] docs = new int[hits.length];
+    for (int i = 0; i < hits.length; i++) {
+      docs[i] = hits[i].doc;
+    }
+    return docs;
+  }
+
+  /** Debugging aid: prints the document number and score of each hit. */
+  void showQueryResults(String message, Query q, ScoreDoc[] hits) {
+    System.out.println(message + " results from query " + q);
+    for (ScoreDoc hit : hits) {
+      System.out.println("doc=" + hit.doc + ", score=" + hit.score);
+    }
+  }
+
+  /** Checks the hits of {@code qact} against those of {@code qexp} via {@link CheckHits}. */
+  void checkEqualScores(Query qexp, Query qact) throws Exception {
+    ScoreDoc[] expHits = search(searcher, qexp);
+
+    int[] expDocs = docsFromHits(expHits);
+    //showQueryResults("checkEqualScores expected", qexp, expHits);
+
+    ScoreDoc[] actHits = search(searcher, qact);
+    //showQueryResults("checkEqualScores actual", qact, actHits);
+
+    CheckHits.checkHitsQuery(qact, actHits, expHits, expDocs);
+  }
+
+  /**
+   * Fails unless both queries hit the same documents and, per document, the actual score lies
+   * between {@code minFac} and {@code maxFac} times the expected score (bounds included).
+   * Note that {@code maxFac} is passed before {@code minFac}.
+   */
+  void checkScoresInRange(Query qexp, Query qact, float maxFac, float minFac) throws Exception {
+    ScoreDoc[] expHits = search(searcher, qexp);
+    //showQueryResults("checkScoresInRange expected", qexp, expHits);
+
+    ScoreDoc[] actHits = search(searcher, qact);
+    //showQueryResults("checkScoresInRange actual", qact, actHits);
+
+    if (expHits.length != actHits.length) {
+      Assert.fail("Unequal lengths: expHits="+expHits.length+",actHits="+actHits.length);
+    }
+
+    // compare document by document, independent of the score order
+    sortByDoc(expHits);
+    sortByDoc(actHits);
+    for (int i = 0; i < expHits.length; i++) {
+      if (expHits[i].doc != actHits[i].doc)
+      {
+        Assert.fail("At index " + i
+            + ": expHits[i].doc=" + expHits[i].doc
+            + " != actHits[i].doc=" + actHits[i].doc);
+      }
+
+      if ( (expHits[i].score * maxFac < actHits[i].score)
+        || (expHits[i].score * minFac > actHits[i].score))
+      {
+        Assert.fail("At index " + i
+            + ", expHits[i].doc=" + expHits[i].doc
+            + ", score not in expected range: " + (expHits[i].score * minFac)
+            + " <= " + actHits[i].score
+            + " <= " + (expHits[i].score * maxFac));
+      }
+    }
+  }
+
+  /** A span term query and a single term span synonym query should score as the term query. */
+  void checkSingleTerm(String term) throws Exception {
+    TermQuery tq = termQuery(term);
+    SpanTermQuery stq = spanTermQuery(term);
+    SpanSynonymQuery ssq = spanSynonymQuery(term);
+
+    checkEqualScores(tq, stq);
+    checkEqualScores(tq, ssq);
+  }
+
+  public void testSingleZero() throws Exception {
+    checkSingleTerm("zero");
+  }
+
+  SpanOrQuery spanOrQuery(String... terms) {
+    return new SpanOrQuery(spanTermQueries(terms));
+  }
+
+  /**
+   * The span synonym query should score between 0.3 and 0.7 times the span or query,
+   * and the same as the synonym query over the same terms.
+   */
+  void checkOrTerms(String... terms) throws Exception {
+    assertTrue(terms.length >= 1);
+    SpanOrQuery soq = spanOrQuery(terms);
+    SpanSynonymQuery ssq = spanSynonymQuery(terms);
+    checkScoresInRange(soq, ssq, 0.7f, 0.3f);
+
+    SynonymQuery sq = synonymQuery(terms);
+    checkEqualScores(sq, ssq);
+  }
+
+  public void testOrTwoTermsNoDocOverlap() throws Exception {
+    checkOrTerms("zero", "one");
+  }
+
+  public void testOrTwoTermsDocOverlap() throws Exception {
+    checkOrTerms("twenty", "one");
+  }
+}
diff --git a/lucene/core/src/test/org/apache/lucene/search/spans/TestSpansTreeQuery.java b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpansTreeQuery.java
new file mode 100644
index 0000000..c166ed2
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpansTreeQuery.java
@@ -0,0 +1,648 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.spans;
+
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Comparator;
+
+import org.apache.lucene.analysis.*;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.SynonymQuery;
+import org.apache.lucene.search.CheckHits;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopScoreDocCollector;
+import org.apache.lucene.search.similarities.ClassicSimilarity;
+import org.apache.lucene.search.similarities.BM25Similarity;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.English;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import junit.framework.Assert;
+
+
+/**
+ * Compares hits and scores of span queries wrapped by {@link SpansTreeQuery#wrap} with those
+ * of term, boolean and synonym queries, using both BM25 and classic similarity.
+ */
+public class TestSpansTreeQuery extends LuceneTestCase {
+  static IndexSearcher searcherClassic;
+  static IndexSearcher searcherBM25;
+  static IndexReader reader;
+  static Directory directory;
+
+  /** Number of indexed documents; also the maximum number of hits collected per search. */
+  static final int MAX_TEST_DOC = 33;
+
+  /** Name of the only indexed field. */
+  static final String FIELD_NAME = "field";
+
+  /** Maximum score difference for scores to be considered equal, from CheckHits.java. */
+  static final float SCORE_TOLERANCE = 1.0e-6f;
+
+  /**
+   * Builds the shared index. Every document except the last holds the English words of its
+   * index, repeated once for multiples of five; the last document holds a fixed text.
+   * Two searchers are opened on the same reader, one per similarity.
+   */
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    directory = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
+        newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true))
+            .setMaxBufferedDocs(TestUtil.nextInt(random(), MAX_TEST_DOC, MAX_TEST_DOC + 100))
+            .setMergePolicy(newLogMergePolicy()));
+    for (int i = 0; i < MAX_TEST_DOC; i++) {
+      Document doc = new Document();
+      String text;
+      if (i < (MAX_TEST_DOC-1)) {
+        text = English.intToEnglish(i);
+        if ((i % 5) == 0) { // add some multiple occurrences of the same term(s)
+          text += " " + text;
+        }
+      } else { // last doc, for testing distances > 1, and repeating occurrences of wb
+        text = "az a b c d e wa wb wb wc az";
+      }
+      doc.add(newTextField(FIELD_NAME, text, Field.Store.YES));
+      writer.addDocument(doc);
+    }
+    reader = writer.getReader();
+    searcherClassic = new IndexSearcher(reader);
+    searcherClassic.setSimilarity(new ClassicSimilarity());
+    searcherBM25 = new IndexSearcher(reader);
+    searcherBM25.setSimilarity(new BM25Similarity());
+    writer.close();
+  }
+
+  /** Closes the shared reader and directory and drops all static references to them. */
+  @AfterClass
+  public static void afterClass() throws Exception {
+    reader.close();
+    directory.close();
+    searcherClassic = null;
+    searcherBM25 = null;
+    reader = null;
+    directory = null;
+  }
+
+  /** Returns a term on {@link #FIELD_NAME}. */
+  Term lcnTerm(String term) {
+    return new Term(FIELD_NAME, term);
+  }
+
+  /** Returns one term on {@link #FIELD_NAME} per given text, in the given order. */
+  Term[] lcnTerms(String... terms) {
+    Term[] lcnTrms = new Term[terms.length];
+    for (int i = 0; i < terms.length; i++) {
+      lcnTrms[i] = lcnTerm(terms[i]);
+    }
+    return lcnTrms;
+  }
+
+
+  TermQuery termQuery(String term) {
+    return new TermQuery(lcnTerm(term));
+  }
+
+  SpanTermQuery spanTermQuery(String term) {
+    return new SpanTermQuery(lcnTerm(term));
+  }
+
+  /** Runs the query and returns at most {@link #MAX_TEST_DOC} top hits. */
+  ScoreDoc[] search(IndexSearcher searcher, Query query) throws IOException {
+    TopScoreDocCollector collector = TopScoreDocCollector.create(MAX_TEST_DOC);
+    searcher.search(query, collector);
+    return collector.topDocs().scoreDocs;
+  }
+
+  /** Returns the document numbers of the hits, in hit order. */
+  int[] docsFromHits(ScoreDoc[] hits) throws Exception {
+    int[] docs = new int[hits.length];
+    for (int i = 0; i < hits.length; i++) {
+      docs[i] = hits[i].doc;
+    }
+    return docs;
+  }
+
+  /** Asserts that both queries return the same documents in the same rank order (BM25). */
+  void checkEqualDocOrder(Query qexp, Query qact) throws Exception {
+    ScoreDoc[] expHits = search(searcherBM25, qexp);
+    ScoreDoc[] actHits = search(searcherBM25, qact);
+    assertEquals("same nr of hits", expHits.length, actHits.length);
+    for (int i = 0; i < expHits.length; i++) {
+      assertEquals("same doc at rank " + i, expHits[i].doc, actHits[i].doc);
+    }
+  }
+
+  /** Debugging aid: prints the document number and score of each hit. */
+  void showQueryResults(String message, Query q, ScoreDoc[] hits) {
+    System.out.println(message + " results from query " + q);
+    for (ScoreDoc hit : hits) {
+      System.out.println("doc=" + hit.doc + ", score=" + hit.score);
+    }
+  }
+
+  /**
+   * Checks the hits of {@code qact} against those of {@code qexp} via {@link CheckHits},
+   * first with BM25 similarity and then with classic similarity.
+   */
+  void checkEqualScores(Query qexp, Query qact) throws Exception {
+    checkEqualScores(searcherBM25, "BM25", qexp, qact);
+    checkEqualScores(searcherClassic, "Classic", qexp, qact);
+  }
+
+  /** Checks the hits of {@code qact} against those of {@code qexp} on the given searcher. */
+  private void checkEqualScores(IndexSearcher searcher, String simName, Query qexp, Query qact)
+      throws Exception {
+    ScoreDoc[] expHits = search(searcher, qexp);
+    int[] expDocs = docsFromHits(expHits);
+    //showQueryResults("expected " + simName, qexp, expHits);
+
+    ScoreDoc[] actHits = search(searcher, qact);
+    //showQueryResults("actual " + simName, qact, actHits);
+
+    CheckHits.checkHitsQuery(qact, actHits, expHits, expDocs);
+  }
+
+  /** A span term query, bare or wrapped, should score the same as the term query. */
+  void checkSpanTerm(String term) throws Exception {
+    TermQuery tq = termQuery(term);
+    SpanTermQuery stq = spanTermQuery(term);
+
+    checkEqualScores(tq, stq); // test SpanScorer
+
+    checkEqualScores(tq, SpansTreeQuery.wrap(stq)); // test SpanTreeScorer
+  }
+
+  public void testSpanTermZero() throws Exception {
+    checkSpanTerm("zero");
+  }
+
+  public void testSpanTermSeven() throws Exception {
+    checkSpanTerm("seven");
+  }
+
+  public void testSpanTermFive() throws Exception {
+    checkSpanTerm("five");
+  }
+
+  /** Returns one {@link SpanTermQuery} per given text, in the given order. */
+  SpanTermQuery[] spanTermQueries(String... terms) {
+    SpanTermQuery[] stqs = new SpanTermQuery[terms.length];
+    for (int i = 0; i < terms.length; i++) {
+      stqs[i] = spanTermQuery(terms[i]);
+    }
+    return stqs;
+  }
+
+  SpanOrQuery spanOrQuery(String... terms) {
+    return new SpanOrQuery(spanTermQueries(terms));
+  }
+
+  SpanOrQuery spanOrNearQuery(int maxDistance, String... terms) {
+    return new SpanOrQuery(maxDistance, spanTermQueries(terms));
+  }
+
+  /** Returns a boolean query with one SHOULD term clause per given text, in the given order. */
+  BooleanQuery booleanOrQuery(String... terms) {
+    BooleanQuery.Builder bqb = new BooleanQuery.Builder();
+    for (int i = 0; i < terms.length; i++) {
+      bqb.add(termQuery(terms[i]), BooleanClause.Occur.SHOULD);
+    }
+    return bqb.build();
+  }
+
+  /**
+   * Checks that wrapping a boolean OR over terms leaves a BooleanQuery with a TermQuery as last
+   * clause, and that the wrapped span OR over the same terms scores the same as it.
+   */
+  void checkSpanOrTerms(String... terms) throws Exception {
+    assertTrue(terms.length >= 1);
+    Query boq = SpansTreeQuery.wrap(booleanOrQuery(terms));
+    assertTrue(boq instanceof BooleanQuery); // test SpansTreeQuery.wrap
+    assertTrue(((BooleanQuery)boq).clauses().get(terms.length-1).getQuery() instanceof TermQuery); // test SpansTreeQuery.wrap
+    SpanOrQuery soq = spanOrQuery(terms);
+    Query sptroq = SpansTreeQuery.wrap(soq);
+    //checkEqualDocOrder(boq, sptroq);
+    //checkEqualScores(boq, soq); // test SpanScorer for OR over terms, fails
+    checkEqualScores(boq, sptroq); // test SpanTreeScorer for OR over terms
+  }
+
+  public void testSpanOrOneTerm1() throws Exception {
+    checkSpanOrTerms("zero");
+  }
+
+  public void testSpanOrOneTerm2() throws Exception {
+    checkSpanOrTerms("thirty");
+  }
+
+  public void testSpanOrTwoTerms() throws Exception {
+    checkSpanOrTerms("zero", "thirty");
+  }
+
+  public void testSpanOrTwoCooccurringTerms() throws Exception {
+    checkSpanOrTerms("twenty", "five");
+  }
+
+  public void testSpanOrMoreTerms() throws Exception {
+    checkSpanOrTerms(
+        "zero",
+        "one",
+        "two",
+        "three",
+        "four",
+        "five",
+        "six",
+        "seven",
+        "twenty",
+        "thirty"
+        );
+  }
+
+  /**
+   * Asserts, with BM25 similarity, that both queries have the same top hit document and that
+   * the top scores differ by at most {@link #SCORE_TOLERANCE}.
+   */
+  void checkSameHighestScoringDocAndScore(Query exp, Query act) throws Exception {
+    ScoreDoc[] expHits = search(searcherBM25, exp);
+    //showQueryResults("checkSameHighestScoringDocAndScore expected BM25", exp, expHits);
+
+    ScoreDoc[] actHits = search(searcherBM25, act);
+    //showQueryResults("checkSameHighestScoringDocAndScore actual BM25", act, actHits);
+
+    // fail with a readable message instead of an index out of bounds on empty results
+    assertTrue("at least one expected hit", expHits.length >= 1);
+    assertTrue("at least one actual hit", actHits.length >= 1);
+
+    assertEquals("highest scoring docs the same", expHits[0].doc, actHits[0].doc);
+    assertTrue("equal scores", Math.abs(expHits[0].score - actHits[0].score) <= SCORE_TOLERANCE);
+  }
+
+  /**
+   * Asserts, with BM25 similarity, that the top expected document is among the actual hits
+   * that tie for the top score, and that its actual score lies between {@code minFac} and
+   * {@code maxFac} times the expected top score (bounds included).
+   * Note that {@code maxFac} is passed before {@code minFac}.
+   */
+  void checkSameHighestScoringDocAndScoreRange(Query exp, Query act, float maxFac, float minFac) throws Exception {
+    ScoreDoc[] expHits = search(searcherBM25, exp);
+    //showQueryResults("checkSameHighestScoringDocAndScoreRange expected BM25", exp, expHits);
+
+    ScoreDoc[] actHits = search(searcherBM25, act);
+    //showQueryResults("checkSameHighestScoringDocAndScoreRange actual BM25", act, actHits);
+
+    assertTrue("at least one expected hit", expHits.length >= 1);
+    assertTrue("at least one actual hit", actHits.length >= 1);
+
+    int actDoc = 0; // order may differ when top scores are equal
+    // Only advance while a next hit exists, so that a missing expected doc is reported by
+    // the assertEquals below rather than by an ArrayIndexOutOfBoundsException.
+    while ((actDoc + 1 < actHits.length)
+        && (actHits[actDoc].doc != expHits[0].doc)
+        && (Math.abs(actHits[0].score - actHits[actDoc+1].score) < SCORE_TOLERANCE) ) {
+      actDoc++;
+    }
+    assertEquals("highest scoring docs the same", expHits[0].doc, actHits[actDoc].doc);
+    if ( (expHits[0].score * maxFac < actHits[actDoc].score)
+      || (expHits[0].score * minFac > actHits[actDoc].score))
+    {
+      Assert.fail("For highest scoring doc"
+          + ", expHits[0].doc=" + expHits[0].doc
+          + ", score not in expected range: " + (expHits[0].score * minFac)
+          + " <= " + actHits[actDoc].score
+          + " <= " + (expHits[0].score * maxFac));
+    }
+  }
+
+  public void testSpanAdjacentAllTermsInDocUnordered() throws Exception {
+    /* On "twenty five twenty five"
+     * unordered "twenty five" should score the same as "twenty" OR "five"
+     */
+    String t1 = "twenty";
+    String t2 = "five";
+    SpanNearQuery snq = SpanNearQuery.newUnorderedNearQuery(FIELD_NAME)
+        .addClause(spanTermQuery(t1))
+        .addClause(spanTermQuery(t2))
+        .setSlop(0)
+        .build();
+    BooleanQuery boq = booleanOrQuery(t1, t2);
+
+    checkSameHighestScoringDocAndScore(boq, SpansTreeQuery.wrap(snq));
+  }
+
+  public void testSpanAdjacentAllTermsInDocOrdered1() throws Exception {
+    /* On "twenty five twenty five"
+     * ordered "twenty five" should score the same as "twenty" OR "five"
+     */
+    String t1 = "twenty";
+    String t2 = "five";
+    SpanNearQuery snq = SpanNearQuery.newOrderedNearQuery(FIELD_NAME)
+        .addClause(spanTermQuery(t1))
+        .addClause(spanTermQuery(t2))
+        .setSlop(0)
+        .build();
+    BooleanQuery boq = booleanOrQuery(t1, t2);
+
+    checkSameHighestScoringDocAndScore(boq, SpansTreeQuery.wrap(snq));
+  }
+
+  public void testSpanAdjacentAllTermsInDocOrdered2() throws Exception {
+    /* On "twenty five twenty five"
+     * ordered "five twenty" should score less, but more than half of "twenty" OR "five"
+     */
+    String t1 = "five";
+    String t2 = "twenty";
+    SpanNearQuery snq = SpanNearQuery.newOrderedNearQuery(FIELD_NAME)
+        .addClause(spanTermQuery(t1))
+        .addClause(spanTermQuery(t2))
+        .setSlop(0)
+        .build();
+    BooleanQuery boq = booleanOrQuery(t1, t2);
+
+    checkSameHighestScoringDocAndScoreRange(boq, SpansTreeQuery.wrap(snq), 0.7f, 0.5f);
+  }
+
+  public void testSpanMoreDistanceLessScore() throws Exception {
+    /* On "az a b c d e ..." the ordered near query on "a" and "c" should score
+     * about half of the one on the adjacent "a" and "b".
+     */
+    String t1 = "a";
+    String t2 = "b";
+    String t3 = "c";
+    SpanNearQuery snq2 = SpanNearQuery.newOrderedNearQuery(FIELD_NAME)
+        .addClause(spanTermQuery(t1))
+        .addClause(spanTermQuery(t2))
+        .setSlop(2)
+        .build();
+    SpanNearQuery snq3 = SpanNearQuery.newOrderedNearQuery(FIELD_NAME)
+        .addClause(spanTermQuery(t1))
+        .addClause(spanTermQuery(t3))
+        .setSlop(2)
+        .build();
+
+    checkSameHighestScoringDocAndScoreRange(SpansTreeQuery.wrap(snq2), SpansTreeQuery.wrap(snq3),
+        0.50f, 0.49f);
+  }
+
+  /**
+   * Returns the wrapped unordered near query of (unordered near of {@code t1a} and
+   * {@code t1b}) and {@code t2}, with the same slop at both levels.
+   */
+  Query sptrSimpleUnorderedNested(String t1a, String t1b, String t2, int slop) {
+    SpanNearQuery snq1 = SpanNearQuery.newUnorderedNearQuery(FIELD_NAME)
+        .addClause(spanTermQuery(t1a))
+        .addClause(spanTermQuery(t1b))
+        .setSlop(slop)
+        .build();
+
+    SpanNearQuery snqn = SpanNearQuery.newUnorderedNearQuery(FIELD_NAME)
+        .addClause(snq1)
+        .addClause(spanTermQuery(t2))
+        .setSlop(slop)
+        .build();
+
+    return SpansTreeQuery.wrap(snqn);
+  }
+
+  public void testSpanNestedMoreDistanceLessScore() throws Exception {
+    String t1 = "a";
+    String t2 = "b";
+    String t3 = "c";
+    String t4 = "d";
+    String t5 = "e";
+    Query sptrq1 = sptrSimpleUnorderedNested(t1, t2, t4, 2);
+    Query sptrq2 = sptrSimpleUnorderedNested(t1, t3, t5, 2);
+
+    checkSameHighestScoringDocAndScoreRange(sptrq1, sptrq2, 0.7f, 0.6f);
+  }
+
+  public void testNonMatchingPresentTermScore() throws Exception {
+    String t1 = "a";
+    String t2 = "b";
+    String t3 = "c";
+
+    SpanOrQuery soq = new SpanOrQuery(spanTermQuery(t1), spanTermQuery(t2));
+
+    SpanNearQuery snq1 = SpanNearQuery.newUnorderedNearQuery(FIELD_NAME)
+        .addClause(soq)
+        .addClause(spanTermQuery(t3))
+        .setSlop(0)
+        .setNonMatchSlop(3)
+        .build(); // t1 is present but does not match.
+
+    SpanNearQuery snq2 = SpanNearQuery.newUnorderedNearQuery(FIELD_NAME)
+        .addClause(soq)
+        .addClause(spanTermQuery(t3))
+        .setSlop(0)
+        .setNonMatchSlop(4) // t1 scores lower than in snq1
+        .build(); // t1 is present but does not match.
+
+    SpansTreeQuery sptrnq1 = new SpansTreeQuery(snq1);
+    SpansTreeQuery sptrnq2 = new SpansTreeQuery(snq2);
+
+    checkSameHighestScoringDocAndScoreRange(sptrnq1, sptrnq2, 0.98f, 0.9f);
+  }
+
+  public void testSpanNot() throws Exception {
+    /* On "twenty five twenty five"
+     * "twenty" not preceded by "five", and followed by "five",
+     * should score less, but more than half of "twenty five"
+     */
+    String t1 = "five";
+    String t2 = "twenty";
+    SpanNotQuery sntq = new SpanNotQuery( spanTermQuery(t2), spanTermQuery(t1), 1, 0);
+
+    SpanNearQuery snrq1 = SpanNearQuery.newOrderedNearQuery(FIELD_NAME)
+        .addClause(sntq)
+        .addClause(spanTermQuery(t1))
+        .setSlop(0)
+        .build();
+
+    SpanNearQuery snrq2 = SpanNearQuery.newOrderedNearQuery(FIELD_NAME)
+        .addClause(spanTermQuery(t2))
+        .addClause(spanTermQuery(t1))
+        .setSlop(0)
+        .build();
+
+    Query sptrnrq1 = SpansTreeQuery.wrap(snrq1);
+    Query sptrnrq2 = SpansTreeQuery.wrap(snrq2);
+
+    checkSameHighestScoringDocAndScoreRange(sptrnrq2, sptrnrq1, 0.8f, 0.5f);
+  }
+
+  public void testSpanBoost() throws Exception {
+    /* The unboosted query should score between 0.90 and 0.92 times the query boosted by 1.1,
+     * for the bare and for the wrapped boosted query.
+     */
+    String term = "zero";
+    SpanTermQuery stq = spanTermQuery(term);
+    SpanBoostQuery sbq = new SpanBoostQuery(stq, 1.1f);
+
+    checkSameHighestScoringDocAndScoreRange(sbq, stq, 0.92f, 0.90f);
+    checkSameHighestScoringDocAndScoreRange(SpansTreeQuery.wrap(sbq), stq, 0.92f, 0.90f);
+  }
+
+  public void testSpanOrNearZeroDistance() throws Exception {
+    String t1 = "a";
+    String t2 = "b";
+    BooleanQuery boq = booleanOrQuery(t1, t2);
+    SpanOrQuery sonq = spanOrNearQuery(0, t1, t2);
+    checkEqualScores(boq, SpansTreeQuery.wrap(sonq));
+  }
+
+  public void testSpanOrNearMoreDistanceLessScore() throws Exception {
+    String t1 = "a";
+    String t2 = "b";
+    String t3 = "c";
+    Query stq1 = SpansTreeQuery.wrap(spanOrNearQuery(4, t1, t2));
+    Query stq2 = SpansTreeQuery.wrap(spanOrNearQuery(4, t1, t3));
+    checkSameHighestScoringDocAndScoreRange(stq1, stq2, 0.5f, 0.4f);
+  }
+
+  public void testSpanOrNearThreeSubqueries() throws Exception {
+    String t1 = "a";
+    String t2 = "b";
+    String t3 = "c";
+    BooleanQuery boq = booleanOrQuery(t1, t2, t3);
+    SpanOrQuery sonq = spanOrNearQuery(0, t3, t2, t1);
+    checkEqualScores(boq, SpansTreeQuery.wrap(sonq));
+  }
+
+  public void testSpanOrNearNonMatchingSubQuery() throws Exception {
+    String t1 = "a";
+    String t2 = "b";
+    String t3 = "c";
+    String t5 = "e";
+    SpanOrQuery sonq1 = spanOrNearQuery(1, t3, t2, t1);
+    SpanOrQuery sonq2 = spanOrNearQuery(1, t5, t2, t1);
+    checkSameHighestScoringDocAndScoreRange(
+        SpansTreeQuery.wrap(sonq1),
+        SpansTreeQuery.wrap(sonq2),
+        0.9f, 0.8f);
+  }
+
+  public void testSpanOrNearSinglePresentSubquery() throws Exception {
+    String t1 = "a";
+    String t2 = "h";
+    SpanQuery q1 = spanTermQuery(t1);
+    SpanOrQuery q2 = spanOrNearQuery(1, t2, t1);
+    checkSameHighestScoringDocAndScoreRange(
+        SpansTreeQuery.wrap(q1),
+        SpansTreeQuery.wrap(q2),
+        0.51f, 0.49f);
+  }
+
+  public void testSpanOrNearRepeatingOccurrences1() throws Exception {
+    String t1 = "wa";
+    String t2 = "wb";
+    BooleanQuery boq = booleanOrQuery(t1, t2);
+    SpanOrQuery sonq = spanOrNearQuery(3, t2, t1);
+    checkSameHighestScoringDocAndScoreRange(
+        boq,
+        SpansTreeQuery.wrap(sonq),
+        0.9f, 0.8f);
+  }
+
+  public void testSpanOrNearRepeatingOccurrences2() throws Exception {
+    String t1 = "wb";
+    String t2 = "wc";
+    BooleanQuery boq = booleanOrQuery(t1, t2);
+    SpanOrQuery sonq = spanOrNearQuery(3, t2, t1);
+    checkSameHighestScoringDocAndScoreRange(
+        boq,
+        SpansTreeQuery.wrap(sonq),
+        0.9f, 0.8f);
+  }
+
+  public void testIncreasingScoreExtraMatchLowSlopFactor() throws Exception {
+    String t1 = "az"; // near and far from a
+    String t2 = "a";
+    SpanNearQuery snq1 = SpanNearQuery.newUnorderedNearQuery(FIELD_NAME)
+        .addClause(spanTermQuery(t1))
+        .addClause(spanTermQuery(t2))
+        .setSlop(0) // does not match far
+        .setNonMatchSlop(20) // for consistent non match scoring
+        .build();
+    SpanNearQuery snq2 = SpanNearQuery.newUnorderedNearQuery(FIELD_NAME)
+        .addClause(spanTermQuery(t1))
+        .addClause(spanTermQuery(t2))
+        .setSlop(8) // also matches far
+        .setNonMatchSlop(20) // for consistent non match scoring
+        .build();
+    checkSameHighestScoringDocAndScoreRange(
+        SpansTreeQuery.wrap(snq2),
+        SpansTreeQuery.wrap(snq1),
+        0.98f, 0.9f);
+  }
+
+  SynonymQuery synonymQuery(String... terms) {
+    return new SynonymQuery(lcnTerms(terms));
+  }
+
+  SpanSynonymQuery spanSynonymQuery(String... terms) {
+    return new SpanSynonymQuery(lcnTerms(terms));
+  }
+
+  /** Sorts the hits in place by increasing document number. */
+  void sortByDoc(ScoreDoc[] scoreDocs) {
+    Arrays.sort(scoreDocs, new Comparator<ScoreDoc>() {
+      @Override
+      public int compare(ScoreDoc sd1, ScoreDoc sd2) {
+        return Integer.compare(sd1.doc, sd2.doc);
+      }
+    });
+  }
+
+  /**
+   * Fails unless, with BM25 similarity, both queries hit the same documents and, per document,
+   * the actual score lies between {@code minFac} and {@code maxFac} times the expected score
+   * (bounds included). Note that {@code maxFac} is passed before {@code minFac}.
+   */
+  void checkScoresInRange(Query qexp, Query qact, float maxFac, float minFac) throws Exception {
+    ScoreDoc[] expHits = search(searcherBM25, qexp);
+    //showQueryResults("checkScoresInRange expected", qexp, expHits);
+
+    ScoreDoc[] actHits = search(searcherBM25, qact);
+    //showQueryResults("checkScoresInRange actual", qact, actHits);
+
+    if (expHits.length != actHits.length) {
+      Assert.fail("Unequal lengths: expHits="+expHits.length+",actHits="+actHits.length);
+    }
+
+    // compare document by document, independent of the score order
+    sortByDoc(expHits);
+    sortByDoc(actHits);
+    for (int i = 0; i < expHits.length; i++) {
+      if (expHits[i].doc != actHits[i].doc)
+      {
+        Assert.fail("At index " + i
+            + ": expHits[i].doc=" + expHits[i].doc
+            + " != actHits[i].doc=" + actHits[i].doc);
+      }
+
+      if ( (expHits[i].score * maxFac < actHits[i].score)
+        || (expHits[i].score * minFac > actHits[i].score))
+      {
+        Assert.fail("At index " + i
+            + ", expHits[i].doc=" + expHits[i].doc
+            + ", score not in expected range: " + (expHits[i].score * minFac)
+            + " <= " + actHits[i].score
+            + " <= " + (expHits[i].score * maxFac));
+      }
+    }
+  }
+
+  /**
+   * The wrapped span synonym query should score between 0.425 and 1.0 times the wrapped span
+   * or query, and the same as the wrapped synonym query over the same terms.
+   */
+  void checkSynTerms(String... terms) throws Exception {
+    assertTrue(terms.length >= 1);
+    SpanOrQuery soq = spanOrQuery(terms);
+    SpanSynonymQuery ssq = spanSynonymQuery(terms);
+    checkScoresInRange(SpansTreeQuery.wrap(soq), SpansTreeQuery.wrap(ssq), 1.0f, 0.425f);
+
+    SynonymQuery sq = synonymQuery(terms);
+    checkEqualScores(SpansTreeQuery.wrap(sq), SpansTreeQuery.wrap(ssq));
+  }
+
+  public void testSynTwoTermsNoDocOverlap() throws Exception {
+    checkSynTerms("zero", "one");
+  }
+
+  public void testSynTwoTermsDocOverlap() throws Exception {
+    checkSynTerms("twenty", "one");
+  }
+
+  public void testSynNearOrNear() throws Exception {
+    // twenty occurs 10 times
+    // thirty occurs 2 times
+    SpanSynonymQuery ssq2030 = spanSynonymQuery("twenty", "thirty");
+    SpanOrQuery soq2030 = spanOrQuery("twenty", "thirty");
+    SpanTermQuery stq1 = spanTermQuery("one");
+
+    SpanNearQuery synNear = SpanNearQuery.newOrderedNearQuery(FIELD_NAME)
+        .addClause(ssq2030)
+        .addClause(stq1)
+        .setSlop(0)
+        .build();
+    SpanNearQuery orNear = SpanNearQuery.newOrderedNearQuery(FIELD_NAME)
+        .addClause(soq2030)
+        .addClause(stq1)
+        .setSlop(0)
+        .build();
+
+    checkSameHighestScoringDocAndScoreRange(
+        SpansTreeQuery.wrap(orNear),
+        SpansTreeQuery.wrap(synNear),
+        0.80f, 0.70f);
+  }
+}