blob: 7f031412e14e5ba0d30f7e508d650beca6888712 [file] [log] [blame]
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/ScoreDocRank.java b/lucene/queries/src/java/org/apache/lucene/queries/ScoreDocRank.java
new file mode 100644
index 0000000..a711271
--- /dev/null
+++ b/lucene/queries/src/java/org/apache/lucene/queries/ScoreDocRank.java
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.queries;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.ArrayList;
+
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+
+/**
+ * The {@link #joinDocsWithRanks} method can be used when one is interested
+ * in the ranks of some of the documents produced by queries,
+ * for example for creating statistics like precision and recall.
+ * <p>
+ * This holds one query hit, an index of the producer and the rank of the hit.
+ */
+public class ScoreDocRank {
+
+  /** The score of this document for the query. */
+  public float score;
+
+  /** A hit document's number.
+   * @see org.apache.lucene.search.IndexSearcher#doc(int) */
+  public int doc;
+
+  /** The index (starting from 0) of the producer, see {@link #joinDocsWithRanks}. */
+  public int index;
+
+  /** The rank (starting from 0) of the document in the result from the producer. */
+  public int rank;
+
+  /** Constructs a ScoreDocRank from the given field values. */
+  public ScoreDocRank(int doc, float score, int index, int rank) {
+    this.doc = doc;
+    this.score = score;
+    this.index = index;
+    this.rank = rank;
+  }
+
+  /** Constructs a ScoreDocRank taking the doc and the score from the given ScoreDoc. */
+  public ScoreDocRank(ScoreDoc sd, int index, int rank) {
+    this(sd.doc, sd.score, index, rank);
+  }
+
+  // A convenience method for debugging.
+  @Override
+  public String toString() {
+    return "ScoreDocRank(doc=" + doc
+        + " score=" + score
+        + " index=" + index
+        + " rank=" + rank
+        + ")";
+  }
+
+  /** Join the given TopDocs's by document
+   * keeping only the hits for documents occurring in at least two TopDocs's.
+   * <br>
+   * This implementation works best when the last TopDocs is the largest one.
+   * <br>
+   * For each {@link ScoreDocRank} result hit provide:
+   * <ul>
+   * <li>the index (starting from 0) of the TopDocs as given here,
+   * <li>the rank in the TopDocs that the hit originated from.
+   * </ul>
+   * ScoreDocRanks with equal docs are adjacent in the result.
+   * <p>
+   * Please note that joining TopDocs's by document only makes sense when they
+   * were produced from the same {@link org.apache.lucene.index.IndexReader}.
+   * <p>
+   * Only the hits present in each {@code scoreDocs} array are joined; {@code totalHits}
+   * may be larger than the number of these hits and is not used.
+   *
+   * @param topDocss the TopDocs's to join, may be empty
+   * @return the hits for the documents occurring in at least two of the given TopDocs's
+   */
+  public static ScoreDocRank[] joinDocsWithRanks(TopDocs... topDocss) {
+    HashMap<Integer, List<ScoreDocRank>> sdrsByDoc = new HashMap<>();
+
+    int r = 0; // result size
+    for (int tdi = 0; tdi < topDocss.length; tdi++) {
+      ScoreDoc[] scoreDocs = topDocss[tdi].scoreDocs;
+      // Bound by scoreDocs.length: totalHits counts all matches, not just the hits kept here.
+      for (int rank = 0; rank < scoreDocs.length; rank++) {
+        ScoreDoc scoreDoc = scoreDocs[rank];
+        List<ScoreDocRank> lst = sdrsByDoc.get(scoreDoc.doc);
+        if (lst == null) { // first doc occurrence
+          if (tdi + 1 == topDocss.length) { // only in last TopDocs, can never be joined.
+            continue;
+          }
+          lst = new ArrayList<>(topDocss.length - tdi);
+          lst.add(new ScoreDocRank(scoreDoc, tdi, rank));
+          sdrsByDoc.put(scoreDoc.doc, lst);
+        } else { // later occurrence of the same doc
+          lst.add(new ScoreDocRank(scoreDoc, tdi, rank));
+          r += (lst.size() == 2) ? 2 : 1; // the second hit also counts the first one
+        }
+      }
+    }
+
+    ScoreDocRank[] res = new ScoreDocRank[r];
+    r = 0;
+    for (List<ScoreDocRank> lsdr : sdrsByDoc.values()) {
+      if (lsdr.size() >= 2) { // skip the docs that occurred only once
+        for (ScoreDocRank sdr : lsdr) {
+          res[r++] = sdr;
+        }
+      }
+    }
+    assert r == res.length;
+
+    return res;
+  }
+}
diff --git a/lucene/queries/src/test/org/apache/lucene/queries/TestScoreDocRank.java b/lucene/queries/src/test/org/apache/lucene/queries/TestScoreDocRank.java
new file mode 100644
index 0000000..b8f5825
--- /dev/null
+++ b/lucene/queries/src/test/org/apache/lucene/queries/TestScoreDocRank.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.queries;
+
+import java.util.Comparator;
+
+import org.apache.lucene.util.ArrayUtil;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BaseExplanationTestCase;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+
+public class TestScoreDocRank extends BaseExplanationTestCase {
+  /** Returns the top {@code docFields.length} hits of a TermQuery for the given term. */
+  TopDocs topDocsFromTerm(Term t) throws Exception {
+    return searcher.search(new TermQuery(t), docFields.length);
+  }
+
+  public void testEmpty() throws Exception {
+    TopDocs tdtxt = topDocsFromTerm(new Term(ALTFIELD, "xx")); // matches docs 2 and 3
+    TopDocs td0 = topDocsFromTerm(new Term(KEY, "0"));
+
+    ScoreDocRank[] res = ScoreDocRank.joinDocsWithRanks(); // no input at all
+    assertEquals("result length", 0, res.length);
+
+    res = ScoreDocRank.joinDocsWithRanks(tdtxt); // single TopDocs, no docs occur twice
+    assertEquals("result length", 0, res.length);
+
+    res = ScoreDocRank.joinDocsWithRanks(tdtxt, td0); // two TopDocs, matching different docs.
+    assertEquals("result length", 0, res.length);
+  }
+
+  public void testOneTermTwoDocsRankBoth() throws Exception {
+    TopDocs tdtxt = topDocsFromTerm(new Term(ALTFIELD, "xx")); // matches docs 2 and 3
+    TopDocs td2 = topDocsFromTerm(new Term(KEY, "2"));
+    TopDocs td3 = topDocsFromTerm(new Term(KEY, "3"));
+
+    // reverse order td3, td2 for testing, tdtxt with largest result at end
+    ScoreDocRank[] res = ScoreDocRank.joinDocsWithRanks(td3, td2, tdtxt);
+
+    assertEquals("result length", 4, res.length);
+
+    // sort by doc for presence tests, stable sort keeps input order
+    ArrayUtil.timSort(res, new Comparator<ScoreDocRank>() {
+      @Override
+      public int compare(ScoreDocRank a, ScoreDocRank b) {
+        return Integer.compare(a.doc, b.doc); // no subtraction, cannot overflow
+      }
+    });
+
+    assertEquals(2, res[0].doc);
+    assertEquals(1, res[0].index); // td2 has index 1
+    assertEquals(0, res[0].rank);
+
+    assertEquals(2, res[1].doc);
+    assertEquals(2, res[1].index); // tdtxt has index 2
+    assertEquals(0, res[1].rank);
+
+    assertEquals(3, res[2].doc);
+    assertEquals(0, res[2].index); // td3 has index 0
+    assertEquals(0, res[2].rank);
+
+    assertEquals(3, res[3].doc);
+    assertEquals(2, res[3].index); // tdtxt has index 2
+    assertEquals(1, res[3].rank);
+  }
+
+  public void testOneTermTwoDocsRankOne() throws Exception {
+    TopDocs tdtxt = topDocsFromTerm(new Term(ALTFIELD, "xx")); // matches docs 2 and 3
+    TopDocs td3 = topDocsFromTerm(new Term(KEY, "3"));
+
+    ScoreDocRank[] res = ScoreDocRank.joinDocsWithRanks(td3, tdtxt);
+
+    assertEquals("result length", 2, res.length);
+
+    assertEquals(3, res[0].doc);
+    assertEquals(0, res[0].index); // td3 has index 0
+    assertEquals(0, res[0].rank);
+
+    assertEquals(3, res[1].doc);
+    assertEquals(1, res[1].index); // tdtxt has index 1
+    assertEquals(1, res[1].rank);
+  }
+
+  public void testKeepAllHits() throws Exception {
+    TopDocs tdtxt = topDocsFromTerm(new Term(ALTFIELD, "w5")); // matches doc 0
+    TopDocs td0 = topDocsFromTerm(new Term(KEY, "0"));
+    ScoreDocRank[] res = ScoreDocRank.joinDocsWithRanks(tdtxt, td0);
+    assertEquals("result length", 2, res.length);
+  }
+}